Mercurial > repos > dlalgroup > simtext_app
annotate pmids_to_pubtator_matrix.R @ 2:d7b190591e63 draft default tip
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
| author | dlalgroup | 
|---|---|
| date | Thu, 24 Sep 2020 05:44:58 +0000 | 
| parents | 34ed44f3f85c | 
| children | 
| rev | line source | 
|---|---|
| 0 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 1 #!/usr/bin/env Rscript | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 2 #tool: pmids_to_pubtator_matrix | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 3 # | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 4 #The tool uses all PMIDs per row and extracts "Gene", "Disease", "Mutation", "Chemical" and "Species" terms of the | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 5 #corresponding abstracts, using PubTator annotations. The user can choose from which categories terms should be extracted. | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 6 #The extracted terms are united in one large binary matrix, with 0= term not present in abstracts of that row and 1= term | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 7 #present in abstracts of that row. The user can decide if the extracted scientific terms should be used as | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 8 #they are or if they should be grouped by their geneIDs/ meshIDs (several terms can often be grouped into one ID). | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 9 #Also, by default all terms are extracted, otherwise the user can specify a number of most frequent words to be extracted per row. | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 10 # | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 11 #Input: Output of abstracts_by_pmids or tab-delimited table with columns containing PMIDs. | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 12 #The names of the PMID columns should start with "PMID", e.g. "PMID_1", "PMID_2" etc. | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 13 # | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 14 #Output: Binary matrix in which each column represents one of the extracted terms. | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 15 # | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 16 # usage: $ pmids_to_pubtator_matrix.R [-h] [-i INPUT] [-o OUTPUT] [-n NUMBER] [-b] [--install_packages] | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 17 # [-c {Gene,Disease,Mutation,Chemical,Species} [{Gene,Disease,Mutation,Chemical,Species} ...]] | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 18 # | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 19 # optional arguments: | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 20 # -h, --help show help message | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 21 # -i INPUT, --input INPUT input file name. add path if file is not in working directory | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 22 # -n NUMBER, --number NUMBER Number of most frequent terms/IDs to extract. By default all terms/IDs are extracted. | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 23 # -o OUTPUT, --output OUTPUT output file name. [default "pmids_to_pubtator_matrix_output"] | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 24 # -c {Gene,Disease,Mutation,Chemical,Species} [{Gene,Disease,Mutation,Chemical,Species} ...], --categories {Gene,Disease,Mutation,Chemical,Species} [{Gene,Disease,Mutation,Chemical,Species} ...] | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 25 # Pubtator categories that should be considered. [default "('Gene', 'Disease', 'Mutation','Chemical')"] | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 26 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 27 if ( '--install_packages' %in% commandArgs()) { | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 28 print('Installing packages') | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 29 if (!require('argparse')) install.packages('argparse',repo="http://cran.rstudio.com/"); | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 30 if (!require('stringr')) install.packages('stringr',repo="http://cran.rstudio.com/"); | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 31 if (!require('RCurl')) install.packages('RCurl',repo="http://cran.rstudio.com/"); | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 32 if (!require('stringi')) install.packages('stringi',repo="http://cran.rstudio.com/"); | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 33 } | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 34 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 35 suppressPackageStartupMessages(library("argparse")) | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 36 library('stringr') | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 37 library('stringi') | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 38 library('RCurl') | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 39 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 40 parser <- ArgumentParser() | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 41 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 42 parser$add_argument("-i", "--input", | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 43 help = "input fie name. add path if file is not in workind directory") | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 44 parser$add_argument("-o", "--output", default="pmids_to_pubtator_matrix_output", | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 45 help = "output file name. [default \"%(default)s\"]") | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 46 parser$add_argument("-c", "--categories", choices=c("Gene", "Disease", "Mutation", "Chemical", "Species"), nargs="+", | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 47 default= c("Gene", "Disease", "Mutation", "Chemical"), | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 48 help = "Pubtator categories that should be considered. [default \"%(default)s\"]") | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 49 parser$add_argument("-b", "--byid", action="store_true", default=FALSE, | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 50 help="If you want to find common gene IDs / mesh IDs instead of scientific terms.") | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 51 parser$add_argument("-n", "--number", default=NULL, type="integer", | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 52 help="Number of most frequent terms/IDs to extract. By default all terms/IDs are extracted.") | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 53 parser$add_argument("--install_packages", action="store_true", default=FALSE, | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 54 help="If you want to auto install missing required packages.") | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 55 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 56 args <- parser$parse_args() | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 57 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 58 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 59 data = read.delim(args$input, stringsAsFactors=FALSE, header = TRUE, sep='\t') | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 60 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 61 pmid_cols_index <- grep(c("PMID"), names(data)) | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 62 word_matrix = data.frame() | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 63 dict.table = data.frame() | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 64 pmids_count <- 0 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 65 pubtator_max_ids = 100 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 66 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 67 get_pubtator_terms = function(pmids, categories){ | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 68 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 69 table = NULL | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 70 for (pmid_split in split(pmids, ceiling(seq_along(pmids)/pubtator_max_ids))){ | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 71 out.data = NULL | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 72 try_num <- 1 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 73 t_0 <- Sys.time() | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 74 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 75 while(TRUE) { | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 76 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 77 # Timing check: kill at 3 min | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 78 if (try_num > 1){ | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 79 cat("Connection problem. Please wait. Try number:",try_num,"\n") | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 80 Sys.sleep(time = 2*try_num) | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 81 } | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 82 try_num <- try_num + 1 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 83 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 84 t_1 <- Sys.time() | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 85 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 86 if(as.numeric(difftime(t_1, t_0, units = "mins")) > 3){ | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 87 message("Killing the request! Something is not working. Please, try again later","\n") | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 88 return(table) | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 89 } | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 90 out.data <- tryCatch({ | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 91 getURL(paste("https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/pubtator?pmids=", | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 92 paste(pmid_split, collapse=","), sep = "")) | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 93 }, error = function(e) { | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 94 print(e) | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 95 next | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 96 }, finally = { | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 97 Sys.sleep(0) | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 98 }) | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 99 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 100 if(!is.null(out.data)){ | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 101 out.data = unlist(strsplit(out.data, "\n", fixed = T)) | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 102 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 103 # skip first few lines, is this needed? | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 104 for (i in 3:length(out.data)) { | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 105 temps = unlist(strsplit(out.data[i], "\t", fixed = T)) | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 106 if (length(temps) == 5) { | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 107 # make 5 be 6 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 108 temps = c(temps, NA) | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 109 } | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 110 if (length(temps) == 6) { | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 111 # keep only 6 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 112 table = rbind(table, temps) | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 113 } | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 114 } | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 115 break | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 116 } | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 117 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 118 } #end while loop | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 119 } | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 120 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 121 index.categories = c() | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 122 categories = as.character(unlist(categories)) | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 123 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 124 if(ncol(table) == 6){ | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 125 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 126 for(i in categories){ | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 127 tmp.index = grep(TRUE, i == as.character(table[,5])) | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 128 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 129 if(length(tmp.index) > 0){ | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 130 index.categories = c(index.categories,tmp.index) | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 131 } | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 132 } | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 133 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 134 table = as.data.frame(table, stringsAsFactors=FALSE) | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 135 table = table[index.categories,c(4,6)] | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 136 table = table[!is.na(table[,2]),] | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 137 table = table[!(table[,2] == "NA"),] | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 138 table = table[!(table[,1] == "NA"),] | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 139 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 140 if(args$byid){ | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 141 if(!is.null(args$number)){ | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 142 #retrieve top X mesh.ids | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 143 table.mesh = as.data.frame(table(table[,2])) | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 144 colnames(table.mesh)[1] = "mesh.id" | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 145 table = table[order(table.mesh$Freq, decreasing = TRUE),] | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 146 table = table[1:min(args$number, nrow(table.mesh)),] | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 147 table.mesh$mesh.id = as.character(table.mesh$mesh.id) | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 148 #subset table for top X mesh.ids | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 149 table = table[which(as.character(table$V6) %in% as.character(table.mesh$mesh.id)),] | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 150 table = table[!duplicated(table[,2]),] | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 151 }else{ | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 152 table = table[!duplicated(table[,2]),] | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 153 } | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 154 } else { | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 155 if(!is.null(args$number)){ | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 156 table[,1] = tolower(as.character(table[,1])) | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 157 table = as.data.frame(table(table[,1])) | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 158 colnames(table)[1] = "term" | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 159 table = table[order(table$Freq, decreasing = TRUE),] | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 160 table = table[1:min(args$number, nrow(table)),] | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 161 table$term = as.character(table$term) | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 162 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 163 }else{ | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 164 table[,1] = tolower(as.character(table[,1])) | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 165 table = table[!duplicated(table[,1]),] | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 166 } | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 167 } | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 168 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 169 return(table) | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 170 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 171 } else { | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 172 return(NULL) | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 173 } | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 174 } | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 175 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 176 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 177 #for each row of data: collect its PMIDs, fetch their PubTator terms and flag them with 1 in row i of word_matrix | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 178 for (i in 1:nrow(data)){ | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 179 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 180 pmids = as.character(data[i,pmid_cols_index]) | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 181 pmids = pmids[!pmids == "NA"] | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 182 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 183 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 184 if ( (pmids_count > 10000)){ | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 185 cat("Break (10s) to avoid killing of requests. Please wait.",'\n') | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 186 Sys.sleep(10) | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 187 pmids_count = 0 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 188 } | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 189 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 190 pmids_count = pmids_count + length(pmids) | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 191 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 192 #get PubTator terms with the get_pubtator_terms function (the result may be NULL, which is checked below) | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 193 if (length(pmids) >0){ | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 194 table = get_pubtator_terms(pmids, args$categories) | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 195 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 196 if(!is.null(table)){ | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 197 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 198 colnames(table)= c("term","mesh.id") | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 199 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 200 # mark the retrieved IDs (byid) or terms with 1 in row i of the binary matrix | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 201 if (args$byid){ | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 202 mesh.ids = as.character(table$mesh.id) | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 203 if (length(mesh.ids) > 0 ){ | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 204 word_matrix[i,mesh.ids] <- 1 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 205 cat(length(mesh.ids), " IDs for PMIDs of row", i," were added",'\n') | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 206 # append the term/mesh.id pairs to the dictionary, keeping only the first entry per mesh.id | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 207 dict.table = rbind(dict.table, table) | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 208 dict.table = dict.table[!duplicated(as.character(dict.table[,2])),] | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 209 } | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 210 } else { | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 211 terms = as.character(table[,1]) | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 212 if (length(terms) > 0 ){ | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 213 word_matrix[i,terms] <- 1 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 214 cat(length(terms), " terms for PMIDs of row", i," were added.",'\n') | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 215 } | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 216 } | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 217 } | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 218 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 219 } else { | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 220 cat("No terms for PMIDs of row", i," were found.",'\n') | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 221 } | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 222 } | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 223 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 224 if (args$byid){ | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 225 #rename the matrix columns: replace each mesh ID/ID with the term stored for it in dict.table | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 226 index_names = match(names(word_matrix), as.character(dict.table[[2]])) | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 227 names(word_matrix) = dict.table[index_names,1] | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 228 } | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 229 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 230 colnames(word_matrix) = gsub("[^[:print:]]","",colnames(word_matrix)) | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 231 colnames(word_matrix) = gsub('\"', "", colnames(word_matrix), fixed = TRUE) | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 232 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 233 #merge columns that share a name by summing them (NOTE(review): sums can exceed 1 - confirm the matrix is still meant to be binary) | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 234 word_matrix = as.data.frame(do.call(cbind, by(t(word_matrix),INDICES=names(word_matrix),FUN=colSums))) | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 235 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 236 #convert to a matrix, set NA cells to 0, report the dimensions and write the matrix tab-separated to args$output | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 237 word_matrix <- as.matrix(word_matrix) | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 238 word_matrix[is.na(word_matrix)] <- 0 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 239 cat("Matrix with ",nrow(word_matrix)," rows and ",ncol(word_matrix)," columns generated.","\n") | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 240 #write.table(word_matrix, args$output) | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 241 write.table(word_matrix, args$output, row.names = FALSE, sep = '\t') | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 242 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 243 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 244 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 245 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 246 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 247 | 
