Mercurial > repos > iuc > pmids_to_pubtator_matrix

diff pmids_to_pubtator_matrix.xml @ 0:69714f06f18b draft default tip
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
author: iuc
date: Wed, 24 Mar 2021 08:33:56 +0000
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pmids_to_pubtator_matrix.xml	Wed Mar 24 08:33:56 2021 +0000
@@ -0,0 +1,109 @@
+  <tool id="pmids_to_pubtator_matrix" name="PMIDs to PubTator" version="@VERSION@" license="MIT">
+    <description>binary matrix</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <requirements>
+        <requirement type="package" version="2.0.3">r-argparse</requirement>
+        <requirement type="package" version="1.4.0">r-stringr</requirement>
+        <requirement type="package" version="1.98_1.2">r-rcurl</requirement>
+        <requirement type="package" version="1.5.3">r-stringi</requirement>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+    Rscript 
+      '${__tool_directory__}/pmids_to_pubtator_matrix.R'
+      --input '$input'
+      --output '$output'
+      --number '$number'
+      $byid
+      --categories 
+      #for $category in $categories:
+        '$category'
+      #end for
+      ]]>
+    </command>
+    <inputs>
+        <param argument="--input" type="data" format="tabular" label="Input file with PMID IDs" />
+        <param argument="--categories" type="select" label="categories" multiple="true" display="checkboxes">
+            <option value="Gene">Genes</option>
+            <option value="Disease">Diseases</option>
+            <option value="Mutation">Mutations</option>
+            <option value="Chemical">Chemicals</option>
+            <option value="Species">Species</option>
+        </param>
+        <param argument="--byid" label="If you want to find common gene IDs / mesh IDs instead of specific scientific terms." name="byid" type="boolean" truevalue="--byid" falsevalue="" help="byid" checked="false"/>
+        <param argument="--number" label="Number of most frequent terms/IDs to extract." name="number" optional="true" type="integer" help="number" value="50"/>
+    </inputs>
+    <outputs>
+        <data format="tabular" name="output" />
+    </outputs>
+    <tests>
+        <test>
+            <param name="input" value="pubmed_by_queries_output" ftype="tabular"/>
+            <param name="categories" value="Gene,Mutation"/>
+            <output name="output">
+                <assert_contents>
+                    <has_n_lines n="7"/>
+                </assert_contents>
+            </output>
+        </test>
+        <test>
+            <param name="input" value="pubmed_by_queries_output" ftype="tabular"/>
+            <param name="categories" value="Gene,Disease"/>
+            <param name="byid" value="True"/>
+            <output name="output">
+                <assert_contents>
+                    <has_n_lines n="7"/>
+                </assert_contents>
+            </output>
+        </test>
+        <test>
+            <param name="input" value="pubmed_by_queries_output" ftype="tabular"/>
+            <param name="categories" value="Gene,Disease"/>
+            <param name="number" value="5"/>
+            <output name="output">
+                <assert_contents>
+                    <has_n_lines n="7"/>
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help><![CDATA[
+
+**What it does**
+
+The tool uses all PMIDs per row and extracts "Gene", "Disease", "Mutation", "Chemical" and "Species" terms of the corresponding abstracts, 
+using PubTator annotations. The user can choose from which categories terms should be extracted. The extracted terms are united in one
+large binary matrix, with 0= term not present in abstracts of that row and 1= term present in abstracts of that row.
+The user can decide if the scientific terms should be extracted and used as they are or if they should be grouped by their
+geneIDs/ meshIDs (several terms are often grouped into one ID). The the user can specify a number of most frequent words to extract per row.
+
+- Input file:
+
+    Output of 'abstracts_by_pmids' tool, or tab-delimited table with columns containing PMIDs. 
+    The names of the PMID columns should start with "PMID", e.g. "PMID_1", "PMID_2" etc.
+
+- Output file: 
+
+    Binary matrix in that each column represents one of the extracted terms.
+
+-----
+
+**Example**
+
+- Input table:
+
+    | PMID_1      | PMID_2      | PMID_2    
+    | 33565071    | 33531663    | 33528079  
+    | 33377604    | 33334860    | 33277917
+
+- Extract of output table:
+
+    | egfr        | hormone     | tp53        | scn8a       | cacna1a     | grin2a      
+    | 1           | 0           | 1           | 0           | 1           | 0           
+    | 1           | 1           | 1           | 1           | 0           | 1           
+
+
+        ]]></help>
+    <expand macro="citations"/>
+</tool>
\ No newline at end of file