Repository 'cpat'
hg clone https://toolshed.g2.bx.psu.edu/repos/bgruening/cpat

Changeset 0:8e9e228c54c4 (2023-02-01)
Next changeset 1:1ac12c0cc7a0 (2024-01-29)
Commit message:
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/cpat commit 1903b0d45ff57d20d6e3a9b95fa55ad6d4a0f345
added:
cpat.xml
test-data/sequences.fasta.gz
test-data/test01_no_orgs.txt
test-data/test02_no_orgs.txt
b
diff -r 000000000000 -r 8e9e228c54c4 cpat.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpat.xml Wed Feb 01 19:22:06 2023 +0000
[
b'@@ -0,0 +1,169 @@\n+<tool id="cpat" name="CPAT" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@">\n+    <description>coding potential assessment</description>\n+    <macros>\n+        <token name="@TOOL_VERSION@">3.0.4</token>\n+        <token name="@VERSION_SUFFIX@">0</token>\n+    </macros>\n+    <requirements>\n+        <requirement type="package" version="@TOOL_VERSION@">cpat</requirement>\n+    </requirements>\n+    <version_command>cpat --version</version_command>\n+    <command detect_errors="exit_code"><![CDATA[\n+        #set $gen_ext = $gene.ext\n+        ln -s \'${gene}\' \'./gene_sequences.${gen_ext}\' &&\n+        #set $cod_ext = $c.ext\n+        ln -s \'${c}\' \'./conding_sequences.${cod_ext}\' &&\n+        #set $ncod_ext = $n.ext\n+        ln -s \'${n}\' \'./nonconding_sequences.${ncod_ext}\' &&\n+        #set $ref_ext = $r.ext\n+        ln -s \'${r}\' \'./referece.${ref_ext}\' &&\n+        make_hexamer_tab.py \n+            -c \'./conding_sequences.${cod_ext}\'\n+            -n \'./nonconding_sequences.${ncod_ext}\' > \'./hexamer.tsv\' &&\n+        make_logitModel.py  \n+            -x \'./hexamer.tsv\' \n+            -c \'./conding_sequences.${cod_ext}\'\n+            -n \'./nonconding_sequences.${ncod_ext}\'\n+            -r \'./referece.${ref_ext}\'\n+            --start=\'${start}\'\n+            --stop=\'${stop}\'\n+            --min-orf=$min_orf_model\n+            -o \'./logit_model\' &&\n+        cpat.py --verbose\n+            -x \'./hexamer.tsv\'\n+            -d \'./logit_model.logit.RData\'\n+            -g \'./gene_sequences.${gen_ext}\'\n+            --top-orf=$top_orf\n+            --antisense\n+            --start=\'${start}\'\n+            --stop=\'${stop}\'\n+            --min-orf=$min_orf_cpat\n+            $antisense\n+            --top-orf=$top_orf\n+            --best-orf=$best_orf\n+            -o \'output\' \n+        ]]>\n+    </command>\n+    <inputs>\n+        <param argument="--gene" type="data" format="fasta,fasta.gz" label="Query nucletide sequences" help="It is recommended to use short and unique sequence identifiers"/>\n+        <param argument="-r" type="data" format="fasta,fasta.gz" label="Reference genome" help="Reference genome sequences in FASTA format" />\n+        <param argument="-c" type="data" format="fasta,fasta.gz" label="Coding sequences file" help="Coding sequence (must be CDS without UTR, i.e. from start coden to stop coden) in FASTA format" />\n+        <param argument="-n" type="data" format="fasta,fasta.gz" label="Non coding sequeces file" help="Noncoding sequences in FASTA format" />\n+        <param argument="--start" type="text" value="ATG" label="Start codon">\n+            <sanitizer invalid_char="">\n+                <valid initial="string.letters"/>\n+            </sanitizer>\n+            <validator type="regex">[a-zA-Z]+</validator>\n+        </param>\n+        <param argument="--stop" type="text" value="TAG,TAA,TGA" label="Stop codons">\n+            <sanitizer invalid_char="">\n+                <valid initial="string.letters">\n+                    <add value="," />\n+                </valid>\n+            </sanitizer>\n+            <validator type="regex">[a-zA-Z,]+</validator>\n+        </param>\n+        <param argument="--min-orf" name="min_orf_model" type="integer" min="0" value="30" label="Minimum ORF length" help="Minimum ORF length in nucleotides" />\n+        <param argument="--min-orf" name="min_orf_cpat" type="integer" min="0" value="75" label="Minimum ORF length" help="Minimum ORF length in nucleotides" />\n+        <param argument="--antisense" type="boolean" truevalue="--antisense" falsevalue="" checked="false" label="Search for ORFs from the anti-sense strand"/>\n+        <param argument="--top-orf" type="integer" min="0" value="5" label="Number of ORF candidates reported" help="RNAs may have dozens of putative ORFs, in most cases, the real ORF \n+            is ranked (by size) in the top several" />\n+        <param argument="--best-orf" type="select" label="Criteria to select the best ORF">\n+            <option value="l">ORF length (l)</option>\n+    '..b'    <data name="no_orf_seqs" format="txt" from_work_dir="output.no_ORF.txt" label="${tool.name} on ${on_string}: no ORFs (TXT)"/>\n+    </outputs>\n+    <tests>\n+        <test expect_num_outputs="4">\n+            <param name="gene" value="sequences.fasta.gz"/>\n+            <param name="r" value="sequences.fasta.gz"/>\n+            <param name="c" value="sequences.fasta.gz"/>\n+            <param name="n" value="sequences.fasta.gz"/>\n+            <param name="start" value="ATG"/>\n+            <param name="stop" value="TAG,TAA,TGA"/>\n+            <param name="min_orf_model" value="30"/>\n+            <param name="min_orf_cpat" value="75"/>\n+            <param name="antisense" value="false"/>\n+            <param name="top_orf" value="5"/>\n+            <param name="best_orf" value="l"/>\n+            <output name="orf_seqs" ftype="fasta">\n+                <assert_contents>\n+                    <has_text text=">ENST00000616016.5_ORF_1"/>\n+                    <has_n_lines n="41009"/>\n+                </assert_contents>\n+            </output>\n+            <output name="orf_seqs_prob"  ftype="tsv">\n+                <assert_contents>\n+                    <has_text text="ENST00000616016.5_ORF_1"/>\n+                    <has_n_lines n="6237"/>\n+                </assert_contents>\n+            </output>\n+            <output name="orf_seqs_prob_best" ftype="tsv">\n+                <assert_contents>\n+                    <has_text text="ENST00000683977.1"/>\n+                    <has_n_lines n="1301" delta="5"/>\n+                </assert_contents>\n+            </output>\n+            <output name="no_orf_seqs" file="test01_no_orgs.txt" ftype="txt"/>\n+        </test>\n+        <test expect_num_outputs="4">\n+            <param name="gene" value="sequences.fasta.gz"/>\n+            <param name="r" value="sequences.fasta.gz"/>\n+            <param name="c" value="sequences.fasta.gz"/>\n+            <param name="n" value="sequences.fasta.gz"/>\n+            <param name="start" value="ATG"/>\n+            <param name="stop" value="TAG,TAA,TGA"/>\n+            <param name="min_orf_model" value="15"/>\n+            <param name="min_orf_cpat" value="60"/>\n+            <param name="antisense" value="true"/>\n+            <param name="top_orf" value="10"/>\n+            <param name="best_orf" value="p"/>\n+            <output name="orf_seqs" ftype="fasta">\n+                <assert_contents>\n+                    <has_text text=">ENST00000616016.5_ORF_1"/>\n+                    <has_n_lines n="57357"/>\n+                </assert_contents>\n+            </output>            \n+            <output name="orf_seqs_prob" ftype="tsv">\n+                <assert_contents>\n+                    <has_text text="ENST00000616016.5_ORF_1"/>\n+                    <has_n_lines n="11667"/>\n+                </assert_contents>\n+            </output>\n+            <output name="orf_seqs_prob_best" ftype="tsv">\n+                <assert_contents>\n+                    <has_text text="ENST00000683977.1"/>\n+                    <has_n_lines n="1301"/>\n+                </assert_contents>\n+            </output>\n+            <output name="no_orf_seqs" file="test02_no_orgs.txt" ftype="txt"/>\n+        </test>\n+    </tests>\n+    <help><![CDATA[\n+\n+.. class:: infomark\n+\n+**Purpose**\n+\n+CPAT is a bioinformatics tool to predict RNAs coding probability based on the RNA sequence characteristics. To achieve this goal, CPAT calculates scores of these 4 linguistic features \n+from a set of known protein-coding genes and another set of non-coding genes.\n+\n+- ORF size\n+- ORF coverage\n+- Fickett TESTCODE\n+- Hexamer usage bias\n+\n+CPAT will then builds a logistic regression model using these 4 features as predictor variables and the \xe2\x80\x9cprotein-coding status\xe2\x80\x9d as the response variable. After evaluating the performance \n+and determining the probability cutoff, the model can be used to predict new RNA sequences.\n+\n+]]></help>\n+    <citations>\n+        <citation type="doi">10.1093/nar/gkt006</citation>\n+    </citations>\n+</tool>\n'
b
diff -r 000000000000 -r 8e9e228c54c4 test-data/sequences.fasta.gz
b
Binary file test-data/sequences.fasta.gz has changed
b
diff -r 000000000000 -r 8e9e228c54c4 test-data/test01_no_orgs.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test01_no_orgs.txt Wed Feb 01 19:22:06 2023 +0000
b
@@ -0,0 +1,3 @@
+ENST00000637839.1
+ENST00000636635.1
+ENST00000502273.8
b
diff -r 000000000000 -r 8e9e228c54c4 test-data/test02_no_orgs.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test02_no_orgs.txt Wed Feb 01 19:22:06 2023 +0000
b
@@ -0,0 +1,1 @@
+ENST00000637839.1