Mercurial > repos > dlalgroup > text_to_wordmatrix
annotate text_to_wordmatrix.R @ 0:dd696b179eb7 draft
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
| author | dlalgroup | 
|---|---|
| date | Thu, 24 Sep 2020 02:58:53 +0000 | 
| parents | |
| children | 
| rev | line source | 
|---|---|
| 
0
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
1 #!/usr/bin/env Rscript | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
2 # tool: text_to_wordmatrix | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
3 # | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
4 #The tool extracts the most frequent words per entity (per row). Text in columns starting with "ABSTRACT" or "TEXT" is considered. | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
5 #All extracted terms are used to generate a word matrix with rows = entities and columns = extracted words. | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
6 #The resulting matrix is binary with 0= word not present in abstracts of entity and 1= word present in abstracts of entity. | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
7 # | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
8 #Input: Output of 'pubmed_by_queries' or 'abstracts_by_pmids', or tab-delimited table with entities in column called “ID_<name>”, | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
9 #e.g. “ID_genes” and text in columns starting with "ABSTRACT" or "TEXT". | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
10 # | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
11 #Output: Binary matrix with rows = entities and columns = extracted words. | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
12 # | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
13 #usage: text_to_wordmatrix.R [-h] [-i INPUT] [-o OUTPUT] [-n NUMBER] [-r] [-l] [-w] [-s] [-p] | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
14 # | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
15 # optional arguments: | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
16 # -h, --help show help message | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
17 # -i INPUT, --input INPUT input file name. add path if file is not in working directory | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
18 # -o OUTPUT, --output OUTPUT output file name. [default "text_to_wordmatrix_output"] | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
19 # -n NUMBER, --number NUMBER number of most frequent words that should be extracted [default "50"] | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
20 # -r, --remove_num remove any numbers in text | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
21 # -l, --lower_case by default all characters are translated to lower case. otherwise use -l | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
22 # -w, --remove_stopwords by default a set of english stopwords (e.g., 'the' or 'not') are removed. otherwise use -w | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
23 # -s, --stemDoc apply Porter's stemming algorithm: collapsing words to a common root to aid comparison of vocabulary | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
24 # -p, --plurals by default words in plural and singular are merged to the singular form. otherwise use -p | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
25 | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
26 if ( '--install_packages' %in% commandArgs()) { | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
27 print('Installing packages') | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
28 if (!require('argparse')) install.packages('argparse', repo="http://cran.rstudio.com/"); | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
29 if (!require("PubMedWordcloud")) install.packages("PubMedWordcloud", repo="http://cran.rstudio.com/"); | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
30 if (!require('SnowballC')) install.packages('SnowballC', repo="http://cran.rstudio.com/"); | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
31 if (!require('textclean')) install.packages('textclean', repo="http://cran.rstudio.com/"); | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
32 if (!require('SemNetCleaner')) install.packages('SemNetCleaner',repo="http://cran.rstudio.com/"); | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
33 if (!require('stringi')) install.packages('stringi',repo="http://cran.rstudio.com/"); | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
34 if (!require('stringr')) install.packages('stringr',repo="http://cran.rstudio.com/"); | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
35 } | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
36 | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
37 suppressPackageStartupMessages(library("argparse")) | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
38 suppressPackageStartupMessages(library("PubMedWordcloud")) | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
39 suppressPackageStartupMessages(library("SnowballC")) | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
40 suppressPackageStartupMessages(library("SemNetCleaner")) | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
41 suppressPackageStartupMessages(library("textclean")) | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
42 suppressPackageStartupMessages(library("stringi")) | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
43 suppressPackageStartupMessages(library("stringr")) | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
44 | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
45 parser <- ArgumentParser() | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
46 parser$add_argument("-i", "--input", | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
47 help = "input file name. add path if file is not in working directory") | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
48 parser$add_argument("-o", "--output", default="text_to_wordmatrix_output", | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
49 help = "output file name. [default \"%(default)s\"]") | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
50 parser$add_argument("-n", "--number", type="integer", default=50, choices=seq(1, 500), metavar="{1..500}", | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
51 help="number of most frequent words used per ID in word matrix [default \"%(default)s\"]") | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
52 parser$add_argument("-r", "--remove_num", action="store_true", default=FALSE, | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
53 help= "remove any numbers in text") | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
54 parser$add_argument("-l", "--lower_case", action="store_false", default=TRUE, | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
55 help="by default all characters are translated to lower case. otherwise use -l") | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
56 parser$add_argument("-w", "--remove_stopwords", action="store_false", default=TRUE, | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
57 help="by default a set of English stopwords (e.g., 'the' or 'not') are removed. otherwise use -w") | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
58 parser$add_argument("-s", "--stemDoc", action="store_true", default=FALSE, | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
59 help="apply Porter's stemming algorithm: collapsing words to a common root to aid comparison of vocabulary") | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
60 parser$add_argument("-p", "--plurals", action="store_false", default=TRUE, | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
61 help="by default words in plural and singular are merged to the singular form. otherwise use -p") | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
62 parser$add_argument("--install_packages", action="store_true", default=FALSE, | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
63 help="If you want to auto install missing required packages.") | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
64 | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
65 args <- parser$parse_args() | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
66 | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
67 | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
68 data = read.delim(args$input, stringsAsFactors=FALSE, header = TRUE, sep='\t') | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
69 word_matrix = data.frame() | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
70 | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
71 text_cols_index <- grep(c("ABSTRACT|TEXT"), names(data)) | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
72 | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
73 for(row in 1:nrow(data)){ | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
74 top_words = cleanAbstracts(abstracts= data[row,text_cols_index], | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
75 rmNum = args$remove_num, | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
76 tolw= args$lower_case, | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
77 rmWords= args$remove_stopwords, | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
78 stemDoc= args$stemDoc) | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
79 | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
80 top_words$word <- as.character(top_words$word) | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
81 | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
82 # δ γ ε | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
83 | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
84 cat("Most frequent words for row", row, " are extracted.", "\n") | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
85 | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
86 if(args$plurals == TRUE){ | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
87 top_words$word <- sapply(top_words$word, function(x){singularize(x)}) | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
88 top_words = aggregate(freq~word,top_words,sum) | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
89 } | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
90 | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
91 top_words = top_words[order(top_words$freq, decreasing = TRUE), ] | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
92 top_words$word = as.character(top_words$word) | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
93 | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
94 number_extract = min(args$number, nrow(top_words)) | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
95 word_matrix[row,sapply(1:number_extract, function(x){paste0(top_words$word[x])})] <- top_words$freq[1:number_extract] | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
96 } | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
97 | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
98 word_matrix <- as.matrix(word_matrix) | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
99 word_matrix[is.na(word_matrix)] <- 0 | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
100 word_matrix <- (word_matrix>0) *1 #binary matrix | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
101 | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
102 cat("A matrix with ", nrow(word_matrix), " rows and ", ncol(word_matrix), "columns is generated.", "\n") | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
103 | 
| 
 
dd696b179eb7
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
104 write.table(word_matrix, args$output, row.names = FALSE, sep = '\t') | 
