view text_to_wordmatrix.xml @ 1:949e46ca234b draft default tip

"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
author dlalgroup
date Thu, 24 Sep 2020 04:22:41 +0000
parents dd696b179eb7
children
line wrap: on
line source

<tool id="text_to_wordmatrix" name="text_to_wordmatrix" version="@VERSION@">
    <description>Extract most frequent words from each row in a table and generate binary word matrix</description>
    <!-- Shared definitions come from macros.xml; the "requirements", "stdio" and "citations"
         macros expanded in this file are presumably defined there. -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <!-- R packages this tool depends on, passed to the "requirements" macro as nested content. -->
    <expand macro="requirements">
        <requirement version="2.0.1" type="package">r-argparse</requirement>
        <requirement version="0.7.0" type="package">r-snowballc</requirement>
        <requirement version="0.3.6" type="package">r-pubmedwordcloud</requirement>
        <requirement version="1.2.0" type="package">r-semnetcleaner</requirement>
        <requirement version="0.9.3" type="package">r-textclean</requirement>
        <requirement version="1.4.3" type="package">r-stringi</requirement>
        <requirement version="1.4.0" type="package">r-stringr</requirement>
    </expand>
    <expand macro="stdio" />
    <!-- Runs the R script shipped in the tool directory, forwarding the input and output paths,
         the requested word count and the values of the five boolean switches from the form. -->
    <command><![CDATA[
        Rscript
            '$__tool_directory__/text_to_wordmatrix.R'
            --input '${input}'
            --output '${output}'
            --number '${number}'
            ${remove_num}
            ${lower_case}
            ${remove_stopwords}
            ${stemDoc}
            ${plurals}
    ]]></command>
    <inputs>
      <!-- Table to process and the number of words to extract per row (1 to 500, default 50). -->
      <param argument="--input" name="input" type="data" format="tabular" optional="false"
             label="Input file"
             help="Table with text in columns starting with 'ABSTRACT' or 'TEXT', e.g. the output of the pubmed_by_queries or abstracts_by_pmids tools."/>
      <param argument="--number" name="number" type="integer" value="50" min="1" max="500" optional="false"
             label="Number of most frequent words that should be extracted per row"
             help="Integer between 1 and 500."/>
      <!-- Boolean switches.
           remove_num and stemDoc are unchecked by default and emit their flag when checked.
           lower_case, remove_stopwords and plurals are checked by default and emit their flag only
           when UNCHECKED. NOTE(review): presumably the R script treats these three flags as
           opt-out switches; confirm against text_to_wordmatrix.R before touching
           truevalue/falsevalue. -->
      <param argument="--remove_num" name="remove_num" type="boolean" checked="false"
             truevalue="--remove_num" falsevalue=""
             label="Remove any numbers in text"
             help="Disabled by default."/>
      <param argument="--lower_case" name="lower_case" type="boolean" checked="true"
             truevalue="" falsevalue="--lower_case"
             label="Translate all characters to lower case."
             help="Enabled by default; uncheck to keep the original letter case."/>
      <param argument="--remove_stopwords" name="remove_stopwords" type="boolean" checked="true"
             truevalue="" falsevalue="--remove_stopwords"
             label="Remove English stopwords (e.g., 'the' or 'not')."
             help="Enabled by default; uncheck to keep stopwords."/>
      <param argument="--stemDoc" name="stemDoc" type="boolean" checked="false"
             truevalue="--stemDoc" falsevalue=""
             label="Apply Porter's stemming algorithm: collapsing words to a common root to aid comparison of vocabulary."
             help="Disabled by default."/>
      <param argument="--plurals" name="plurals" type="boolean" checked="true"
             truevalue="" falsevalue="--plurals"
             label="Transform words in plural to their singular form."
             help="Enabled by default; uncheck to keep plural forms."/>
    </inputs>
    <outputs>
      <!-- Single tabular dataset; its path is passed to the script as '$output' in the command. -->
      <data name="output" format="tabular"/>
    </outputs>
    <tests>
      <!-- Single test: runs on the test-data file pubmed_by_queries_output_abstracts with all other
           parameters at their defaults and compares the result with text_to_wordmatrix_output. -->
      <test>
        <param ftype="tabular" name="input" value="pubmed_by_queries_output_abstracts"/>
        <output value="text_to_wordmatrix_output" name="output"/>
      </test>
    </tests>
    <help><![CDATA[

The tool extracts, for each row, the most frequent words from the text in columns starting with "ABSTRACT" or "TEXT". The extracted words from all rows are combined into one large binary matrix, where 0 = word not frequently occurring in the text of that row and 1 = word frequently present in the text of that row.

Input: 
The output of "pubmed_by_queries" or "abstracts_by_pmids" tools, or a table with text in columns starting with "ABSTRACT" or "TEXT".
Output: 
A binary matrix in which each column represents one of the extracted words.

        ]]></help>
    <!-- Expands the "citations" macro here (presumably defined in the imported macros.xml). -->
    <expand macro="citations"/>
</tool>