view text_to_wordmatrix.xml @ 0:0692d11af909 draft default tip

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
author iuc
date Wed, 24 Mar 2021 08:33:25 +0000
parents
children
line wrap: on
line source

<tool id="text_to_wordmatrix" name="Text to wordmatrix" version="@VERSION@" license="MIT">
    <!-- Short tagline that completes the tool name ("Text to wordmatrix by extracting most frequent words"). -->
    <description>by extracting most frequent words</description>
    <!-- NOTE(review): macros.xml is not visible here; it is presumably where the @VERSION@ token
         (used in the tool's version attribute) and the "citations" macro (expanded at the end
         of this file) are defined. Confirm before renaming either. -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <!-- Version-pinned package dependencies for the Rscript invocation in the command section.
         NOTE(review): no explicit R interpreter package is listed; presumably it is pulled in
         as a dependency of the packages below. Confirm when updating the pins. -->
    <requirements>
        <requirement type="package" version="2.0.3">r-argparse</requirement>
        <requirement type="package" version="0.7.0">r-snowballc</requirement>
        <requirement type="package" version="0.3.6">r-pubmedwordcloud</requirement>
        <requirement type="package" version="1.2.0">r-semnetcleaner</requirement>
        <requirement type="package" version="0.9.3">r-textclean</requirement>
        <requirement type="package" version="1.5.3">r-stringi</requirement>
        <requirement type="package" version="1.4.0">r-stringr</requirement>
    </requirements>
    <!-- Runs text_to_wordmatrix.R from the tool directory on the selected input and writes
         the result to the output dataset. The five trailing placeholders are the boolean
         params declared in the inputs section; each one expands either to its command-line
         flag or to an empty string. A non-zero exit code marks the job as failed. -->
    <command detect_errors="exit_code"><![CDATA[
        Rscript '${__tool_directory__}/text_to_wordmatrix.R'
            --input '$input'
            --output '$output'
            --number '$number'
            $remove_num $lower_case $remove_stopwords $stemDoc $plurals
    ]]></command>
    <!-- Tool parameters; each "argument" names the option passed to text_to_wordmatrix.R.

         The boolean params come in two flavours:
           * remove_num and stemDoc are opt-in: unchecked by default, and the flag is put on
             the command line only when the box is checked.
           * lower_case, remove_stopwords and plurals are checked by default and put their
             flag on the command line only when the box is UNchecked.
         NOTE(review): the second group presumes that the R script treats those three flags
         as switches that turn the feature off. Confirm against text_to_wordmatrix.R before
         "fixing" the apparently swapped truevalue/falsevalue attributes. -->
    <inputs>
        <param argument="--input" type="data" format="tabular" label="Input file" />
        <param argument="--number" type="integer" value="50" min="1" max="500" label="Number of most frequent words that should be extracted per row."/>
        <param argument="--remove_num" type="boolean" truevalue="--remove_num" falsevalue="" checked="false" label="Remove any numbers in text." />
        <param argument="--lower_case" type="boolean" truevalue="" falsevalue="--lower_case" checked="true" label="Translate all characters to lower case." />
        <param argument="--remove_stopwords" type="boolean" truevalue="" falsevalue="--remove_stopwords" checked="true" label="Remove English stopwords" help="e.g. 'the' or 'not'" />
        <param argument="--stemDoc" type="boolean" truevalue="--stemDoc" falsevalue="" checked="false" label="Apply Porter's stemming algorithm: collapsing words to a common root to aid comparison of vocabulary." />
        <param argument="--plurals" type="boolean" truevalue="" falsevalue="--plurals" checked="true" label="Transform words in plural to their singular form." />
    </inputs>
    <!-- The single result dataset (tabular); its path is handed to the script through the
         output option in the command section. -->
    <outputs>
        <data name="output" format="tabular"/>
    </outputs>
    <!-- Both tests run on the same test-data table and check the size of the resulting
         matrix. expect_num_outputs pins the tool to exactly one output dataset, so an
         accidental extra output makes the test fail instead of passing unnoticed. -->
    <tests>
        <!-- Scenario 1: every parameter left at its default (number stays at 50). -->
        <test expect_num_outputs="1">
            <param name="input" value="pubmed_by_queries_output_abstracts" ftype="tabular"/>
            <output name="output">
                <assert_contents>
                    <has_n_lines n="7"/>
                </assert_contents>
            </output>
        </test>
        <!-- Scenario 2: remove_num is checked, and remove_stopwords and plurals are
             unchecked, so all three of their flags end up on the command line. -->
        <test expect_num_outputs="1">
            <param name="input" value="pubmed_by_queries_output_abstracts" ftype="tabular"/>
            <param name="remove_num" value="True"/>
            <param name="remove_stopwords" value="False"/>
            <param name="plurals" value="False"/>
            <output name="output">
                <assert_contents>
                    <has_n_lines n="7"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[

**What it does**

For each row, the tool extracts the most frequent words from the text in columns whose names start with "ABSTRACT" or "TEXT". The words extracted from all rows are combined into one large binary matrix, where 0 = word not frequently occurring in the text of that row and 1 = word frequently present in the text of that row.

- Input table:

    The output of "pubmed_by_queries" or "abstracts_by_pmids" tools, or a table with text in columns starting with "ABSTRACT" or "TEXT".

- Output table: 

    A binary matrix in which each column represents one of the extracted words.

-----

**Example**

- Input table:

    | ABSTRACT_1  | ABSTRACT_2  | TEXT_1
    | abcd def...  | abcd def... |   abcd def...
    | abcd def...  |  abcd def... |   abcd def...

- Extract of output table:

    | chronic    | seizure   |  child    | channel  |  signaling |   grin2a
    |   1       |   1       |    1      |    1     |      1     |     1
    |   0       |   1       |    0      |    1     |      0     |     1

        ]]></help>
    <!-- NOTE(review): the "citations" macro is not defined in this file; presumably it is
         provided by the imported macros.xml. -->
    <expand macro="citations"/>
</tool>