<!-- Mercurial > repos > dlalgroup > text_to_wordmatrix -->

<tool id="text_to_wordmatrix" name="text_to_wordmatrix" version="@VERSION@">
    <!--
        Galaxy tool wrapper around text_to_wordmatrix.R: builds the Rscript
        command line from the form parameters below and declares one tabular
        output. The @VERSION@ token and the "requirements", "stdio" and
        "citations" macros are not defined here; they are expected to come
        from the imported macros.xml.
    -->
    <description>Extract most frequent words from each row in a table and generate binary word matrix</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <!-- Tool-specific packages handed to the shared "requirements" macro. -->
    <expand macro="requirements">
        <requirement type="package" version="2.0.1">r-argparse</requirement>
        <requirement type="package" version="0.7.0">r-snowballc</requirement>
        <requirement type="package" version="0.3.6">r-pubmedwordcloud</requirement>
        <requirement type="package" version="1.2.0">r-semnetcleaner</requirement>
        <requirement type="package" version="0.9.3">r-textclean</requirement>
        <requirement type="package" version="1.4.3">r-stringi</requirement>
        <requirement type="package" version="1.4.0">r-stringr</requirement>
    </expand>
    <expand macro="stdio"/>
    <!--
        Each boolean parameter expands to either its command-line flag or an
        empty string, depending on the truevalue/falsevalue declared in the
        inputs section.
    -->
    <command><![CDATA[
        Rscript '${__tool_directory__}/text_to_wordmatrix.R'
            --input '$input'
            --output '$output'
            --number '$number'
            $remove_num
            $lower_case
            $remove_stopwords
            $stemDoc
            $plurals
    ]]></command>
    <inputs>
        <param argument="--input"
               name="input"
               type="data"
               format="tabular"
               optional="false"
               label="Input file"
               help="Table with text in columns starting with 'ABSTRACT' or 'TEXT', e.g. the output of the pubmed_by_queries or abstracts_by_pmids tools."/>
        <param argument="--number"
               name="number"
               type="integer"
               value="50"
               min="1"
               max="500"
               optional="false"
               label="Number of most frequent words that should be extracted per row"
               help="Integer between 1 and 500 (default: 50)."/>
        <param argument="--remove_num"
               name="remove_num"
               type="boolean"
               truevalue="--remove_num"
               falsevalue=""
               checked="false"
               label="Remove any numbers in text"
               help="Off by default."/>
        <!--
            For lower_case, remove_stopwords and plurals the command-line flag
            is emitted when the box is UNCHECKED (truevalue is empty and
            falsevalue carries the flag).
            NOTE(review): presumably the R script treats these flags as
            switches that turn a default-on step off; confirm against
            text_to_wordmatrix.R before changing the mapping.
        -->
        <param argument="--lower_case"
               name="lower_case"
               type="boolean"
               truevalue=""
               falsevalue="--lower_case"
               checked="true"
               label="Translate all characters to lower case."
               help="On by default."/>
        <param argument="--remove_stopwords"
               name="remove_stopwords"
               type="boolean"
               truevalue=""
               falsevalue="--remove_stopwords"
               checked="true"
               label="Remove english stopwords (e.g., 'the' or 'not')."
               help="On by default."/>
        <param argument="--stemDoc"
               name="stemDoc"
               type="boolean"
               truevalue="--stemDoc"
               falsevalue=""
               checked="false"
               label="Apply Porter's stemming algorithm: collapsing words to a common root to aid comparison of vocabulary."
               help="Off by default."/>
        <param argument="--plurals"
               name="plurals"
               type="boolean"
               truevalue=""
               falsevalue="--plurals"
               checked="true"
               label="Transform words in plural to their singular form."
               help="On by default."/>
    </inputs>
    <outputs>
        <data format="tabular" name="output"/>
    </outputs>
    <tests>
        <!-- Runs the tool with every parameter other than the input left at its default. -->
        <test>
            <param name="input" value="pubmed_by_queries_output_abstracts" ftype="tabular"/>
            <output name="output" value="text_to_wordmatrix_output"/>
        </test>
    </tests>
    <help><![CDATA[

The tool extracts for each row the most frequent words from the text in columns starting with "ABSTRACT" or "TEXT". The extracted words from each row are united in one large binary matrix, with 0 = word not frequently occurring in the text of that row and 1 = word frequently present in the text of that row.

**Input**

The output of the "pubmed_by_queries" or "abstracts_by_pmids" tools, or a table with text in columns starting with "ABSTRACT" or "TEXT".

**Output**

A binary matrix in which each column represents one of the extracted words.

    ]]></help>
    <expand macro="citations"/>
</tool>
<!--
author	dlalgroup
date	Thu, 24 Sep 2020 02:58:53 +0000
parents
children
-->