Mercurial > repos > dlalgroup > text_to_wordmatrix
view text_to_wordmatrix.xml @ 0:dd696b179eb7 draft
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
author | dlalgroup |
---|---|
date | Thu, 24 Sep 2020 02:58:53 +0000 |
parents | |
children |
line wrap: on
line source
<!--
    Galaxy tool wrapper around text_to_wordmatrix.R.

    For each row of a tabular input the script extracts the most frequent
    words and writes a binary word matrix (see the help section below).

    NOTE(review): the @VERSION@ token and the "requirements", "stdio" and
    "citations" macros are not defined in this file; they are presumably
    provided by the imported macros.xml. Confirm there before changing them.
-->
<tool id="text_to_wordmatrix" name="text_to_wordmatrix" version="@VERSION@">
    <description>Extract most frequent words from each row in a table and generate binary word matrix</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <!-- Tool-specific packages, passed as children of the shared "requirements" macro. -->
    <expand macro="requirements">
        <requirement type="package" version="2.0.1">r-argparse</requirement>
        <requirement type="package" version="0.7.0">r-snowballc</requirement>
        <requirement type="package" version="0.3.6">r-pubmedwordcloud</requirement>
        <requirement type="package" version="1.2.0">r-semnetcleaner</requirement>
        <requirement type="package" version="0.9.3">r-textclean</requirement>
        <requirement type="package" version="1.4.3">r-stringi</requirement>
        <requirement type="package" version="1.4.0">r-stringr</requirement>
    </expand>
    <expand macro="stdio"/>
    <!--
        Each boolean parameter expands to either its command-line flag or an
        empty string, so the five trailing variables are intentionally unquoted.
    -->
    <command><![CDATA[
Rscript '${__tool_directory__}/text_to_wordmatrix.R'
    --input '$input'
    --output '$output'
    --number '$number'
    $remove_num
    $lower_case
    $remove_stopwords
    $stemDoc
    $plurals
    ]]></command>
    <inputs>
        <param argument="--input"
               name="input"
               type="data"
               format="tabular"
               optional="false"
               label="Input file"
               help="Table with text in columns starting with ABSTRACT or TEXT, e.g. the output of the pubmed_by_queries or abstracts_by_pmids tools."/>
        <param argument="--number"
               name="number"
               type="integer"
               optional="false"
               value="50"
               min="1"
               max="500"
               label="Number of most frequent words that should be extracted per row"
               help="Integer between 1 and 500."/>
        <!-- Flag is emitted when the box is checked. -->
        <param argument="--remove_num"
               name="remove_num"
               type="boolean"
               truevalue="--remove_num"
               falsevalue=""
               checked="false"
               label="Remove any numbers in text"
               help="Disabled by default."/>
        <!--
            Inverted flag: nothing is emitted when checked, the flag is emitted
            when UNCHECKED. The same pattern is used for remove_stopwords and
            plurals. NOTE(review): this suggests the R script performs these
            steps by default and the flag switches them off; confirm against
            text_to_wordmatrix.R before "fixing" the inversion.
        -->
        <param argument="--lower_case"
               name="lower_case"
               type="boolean"
               truevalue=""
               falsevalue="--lower_case"
               checked="true"
               label="Translate all characters to lower case."
               help="Enabled by default."/>
        <!-- Inverted flag, see the note on lower_case. -->
        <param argument="--remove_stopwords"
               name="remove_stopwords"
               type="boolean"
               truevalue=""
               falsevalue="--remove_stopwords"
               checked="true"
               label="Remove English stopwords (e.g., 'the' or 'not')."
               help="Enabled by default."/>
        <!-- Flag is emitted when the box is checked. -->
        <param argument="--stemDoc"
               name="stemDoc"
               type="boolean"
               truevalue="--stemDoc"
               falsevalue=""
               checked="false"
               label="Apply Porter's stemming algorithm: collapsing words to a common root to aid comparison of vocabulary."
               help="Disabled by default."/>
        <!-- Inverted flag, see the note on lower_case. -->
        <param argument="--plurals"
               name="plurals"
               type="boolean"
               truevalue=""
               falsevalue="--plurals"
               checked="true"
               label="Transform words in plural to their singular form."
               help="Enabled by default."/>
    </inputs>
    <outputs>
        <data format="tabular" name="output"/>
    </outputs>
    <tests>
        <!-- Runs the tool with all parameters at their defaults. -->
        <test>
            <param name="input" value="pubmed_by_queries_output_abstracts" ftype="tabular"/>
            <output name="output" value="text_to_wordmatrix_output"/>
        </test>
    </tests>
    <help><![CDATA[
The tool extracts for each row the most frequent words from the text in columns starting with "ABSTRACT" or "TEXT". The extracted words from each row are united in one large binary matrix, with 0 = word not frequently occurring in the text of that row and 1 = word frequently present in the text of that row.

Input:

The output of the "pubmed_by_queries" or "abstracts_by_pmids" tools, or a table with text in columns starting with "ABSTRACT" or "TEXT".

Output:

A binary matrix in which each column represents one of the extracted words.
    ]]></help>
    <expand macro="citations"/>
</tool>