Mercurial > repos > brinkmanlab > biopython_convert

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bioperl_compat.py	Fri Jan 24 18:52:04 2020 -0500
@@ -0,0 +1,11 @@
+#!/usr/bin/env python
+import sys
+from biopython_convert import get_args, convert
+from Bio.SeqIO.InsdcIO import _InsdcWriter
+
+# Quote anticodon qualifiers
+_InsdcWriter.FTQUAL_NO_QUOTE = tuple(v for v in _InsdcWriter.FTQUAL_NO_QUOTE if v not in ['anticodon', 'transl_except'])
+
+if __name__ == '__main__':
+    convert(*get_args(sys.argv[1:]))
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/biopython-convert.xml	Fri Jan 24 18:52:04 2020 -0500
@@ -0,0 +1,174 @@
+<tool id="biopython-convert" name="BioPython SeqIO Converter" version="1.0" profile="16.04">
+    <description>Interconvert between the various sequence file formats that BioPython supports</description>
+    <edam_topics>
+        <edam_topic>topic_0091</edam_topic>
+    </edam_topics>
+    <edam_operations>
+        <edam_operation>operation_3434</edam_operation>
+        <edam_operation>operation_0335</edam_operation>
+        <edam_operation>operation_3359</edam_operation>
+        <edam_operation>operation_0224</edam_operation>
+        <edam_operation>operation_3695</edam_operation>
+    </edam_operations>
+    <requirements>
+        <requirement type="package" version="3.7">python</requirement>
+        <requirement type="package" version="1.0">biopython.convert</requirement>
+    </requirements>
+    <version_command><![CDATA[ biopython.convert -v ]]></version_command>
+    <command detect_errors="aggressive"><![CDATA[
+        #if $bioperl
+            #set $script = 'python ' + $__tool_directory__ + '/bioperl_compat.py'
+        #else
+            #set $script = 'biopython.convert'
+        #end if
+        #if $split
+            mkdir -p output &&
+            $script $split $info
+            #if $query
+                -q '$query'
+            #end if
+            $input $input.ext output/record $output_type
+        #else
+            $script $split $info
+            #if $query
+                -q '$query'
+            #end if
+            $input $input.ext $output $output_type
+        #end if
+        #if $info
+            > $info_output
+        #end if
+    ]]></command>
+    <inputs>
+        <param name="input" type="data" format="abi,abi-trim,ace,cif-atom,cif-seqres,clustal,embl,fasta,fasta-2line,fastq-sanger,fastq,fastq-solexa,fastq-illumina,genbank,gb,ig,imgt,nexus,pdb-seqres,pdb-atom,phd,phylip,pir,seqxml,sff,sff-trim,stockholm,swiss,tab,qual,uniprot-xml,gff3" label="Input" />
+        <param name="output_type" type="select" label="Output Format">
+            <option value="clustal">clustal</option>
+            <option value="embl">embl</option>
+            <option value="fasta">fasta</option>
+            <option value="fasta-2line">fasta-2line</option>
+            <option value="fastq-sanger">fastq-sanger</option>
+            <option value="fastq">fastq</option>
+            <option value="fastq-solexa">fastq-solexa</option>
+            <option value="fastq-illumina">fastq-illumina</option>
+            <option value="genbank">genbank</option>
+            <option value="gb">gb</option>
+            <option value="imgt">imgt</option>
+            <option value="nexus">nexus</option>
+            <option value="phd">phd</option>
+            <option value="phylip">phylip</option>
+            <option value="pir">pir</option>
+            <option value="seqxml">seqxml</option>
+            <option value="sff">sff</option>
+            <option value="stockholm">stockholm</option>
+            <option value="tab">tab</option>
+            <option value="qual">qual</option>
+            <option value="gff3">gff3</option>
+        </param>
+        <param name="split" type="boolean" truevalue="-s" falsevalue="" checked="false" label="Split each record into a different file" />
+        <param name="info" type="boolean" truevalue="-i" falsevalue="" checked="false" label="Output record information while converting" />
+        <param name="query" type="text" label="Query records, keeping only what matches path" help="Provide a JMESPath selecting records to keep. The root is the list of records. The path must return a list of records.">
+            <sanitizer>
+                <valid initial="string.printable">
+                    <remove value="&apos;" />
+                </valid>
+            </sanitizer>
+        </param>
+        <param name="bioperl" type="boolean" checked="false" label="Modify biopython to generate files similar to bioperl" />
+    </inputs>
+    <outputs>
+        <data name="info_output" format="gff3" label="Record Info from ${tool.name} on ${on_string}">
+            <filter>info</filter>
+        </data>
+        <data name="output" format="txt" label="Output from ${tool.name} on ${on_string}">
+            <filter>not split</filter>
+            <change_format>
+                <when input="output_type" value="clustal" format="clustal" />
+                <when input="output_type" value="embl" format="embl" />
+                <when input="output_type" value="fasta" format="fasta" />
+                <when input="output_type" value="fasta-2line" format="fasta-2line" />
+                <when input="output_type" value="fastq-sanger" format="fastq-sanger" />
+                <when input="output_type" value="fastq" format="fastq" />
+                <when input="output_type" value="fastq-solexa" format="fastq-solexa" />
+                <when input="output_type" value="fastq-illumina" format="fastq-illumina" />
+                <when input="output_type" value="genbank" format="genbank" />
+                <when input="output_type" value="gb" format="genbank" />
+                <when input="output_type" value="imgt" format="imgt" />
+                <when input="output_type" value="nexus" format="nexus" />
+                <when input="output_type" value="phd" format="phd" />
+                <when input="output_type" value="phylip" format="phylip" />
+                <when input="output_type" value="pir" format="pir" />
+                <when input="output_type" value="seqxml" format="seqxml" />
+                <when input="output_type" value="sff" format="sff" />
+                <when input="output_type" value="stockholm" format="stockholm" />
+                <when input="output_type" value="tab" format="tabular" />
+                <when input="output_type" value="qual" format="qual" />
+                <when input="output_type" value="gff3" format="gff3" />
+            </change_format>
+        </data>
+        <collection name="split_output" type="list" label="Output split per record from ${tool.name} on ${on_string}">
+            <filter>split</filter>
+            <discover_datasets pattern=".*" directory="output" format="$output_type" />
+        </collection>
+    </outputs>
+    <tests>
+        <test expect_num_outputs="1">
+            <!-- Test basic conversion to same format -->
+            <param name="input" value="BioPython-Convert/test-data/has_plasmids.gbff" ftype="genbank" />
+            <param name="output_type" value="genbank" />
+            <output name="output" checksum="sha256:2808187bb8e2231545e4d2d7a27dc802df4d1f7c0e953a8399300b2df6b0c737" ftype="genbank" />
+        </test>
+        <test expect_num_outputs="1">
+            <!-- Test basic conversion to same format with info -->
+            <param name="input" value="BioPython-Convert/test-data/has_plasmids.gbff" ftype="genbank" />
+            <param name="info" value="-i" />
+            <param name="output_type" value="genbank" />
+            <output name="output" checksum="sha256:2808187bb8e2231545e4d2d7a27dc802df4d1f7c0e953a8399300b2df6b0c737" ftype="genbank" />
+            <output name="info_output" checksum="sha256:a611656c5a7e7f719c3d64f6b348b67c1abcb8ed56fa82f51fc90cbe2125e5f0" ftype="gff3" />
+        </test>
+        <test expect_num_outputs="1">
+            <!-- Test basic conversion to different format -->
+            <param name="input" value="BioPython-Convert/test-data/has_plasmids.gbff" ftype="genbank" />
+            <param name="output_type" value="embl" />
+            <output name="output" checksum="sha256:5598cb679f5f6c31349968ddde3646fe97296da42ee528ed3f46dec3f5490cbd" ftype="embl" />
+        </test>
+        <test expect_num_outputs="1">
+            <!-- Test basic conversion to same format with filter -->
+            <param name="input" value="BioPython-Convert/test-data/has_plasmids.gbff" ftype="genbank" />
+            <param name="query" value="[?!(features[?type==`source`].qualifiers.plasmid)]" />
+            <param name="output_type" value="genbank" />
+            <output name="output" checksum="sha256:e142d7e1fbd103c96e3b728e3b75f7af6955c97cdbddb87c3202f2c1e2f133d4" ftype="genbank" />
+        </test>
+        <test expect_num_outputs="1">
+            <!-- Test split -->
+            <param name="input" value="BioPython-Convert/test-data/has_plasmids.gbff" ftype="genbank" />
+            <param name="output_type" value="genbank" />
+            <output_collection name="split_output" type="list" count="3">
+                <element name="record.0" ftype="genbank" checksum="sha256:8d02b2087c4cea42da7c5f0a69b7a40d544d953c1a9d611b97bd116cc1f8cd7f" />
+                <element name="record.1" ftype="genbank" checksum="sha256:e37ecc4288ae8b2c3bea25484326a69ced9679fa791162ed593064fdf535944d" />
+                <element name="record.2" ftype="genbank" checksum="sha256:e142d7e1fbd103c96e3b728e3b75f7af6955c97cdbddb87c3202f2c1e2f133d4" />
+            </output_collection>
+        </test>
+    </tests>
+    <help><![CDATA[
+        Interconvert between different file formats that BioPython SeqIO supports. Support for GFF3 has also been included.
+
+        Included features are:
+
+        - Split: Output a collection of datasets, one for each record in the input.
+                This is useful when a tool only accepts single records.
+                The resulting output collection can then be mapped over the receiving tool.
+        - Info: Output an additional GFF3 dataset that contains a summary record for each record in the output dataset.
+                This is useful for extracting sequence IDs, counting how many records are in the output dataset, and various
+                diagnostic processes.
+        - Query: Datasets can be queried or filtered using JMESPath query language.
+                For example ``[?!(features[?type==`source`].qualifiers.plasmid)]`` will remove any plasmid records sometimes
+                found in prokaryotic Genbank or EMBL reference genomes.
+                See http://jmespath.org/ for documentation, and https://biopython.org/DIST/docs/api/Bio.SeqFeature.SeqFeature-class.html
+                for the data structure. Single quotes `'` are not permitted in the query.
+
+    ]]></help>
+    <citations>
+        <citation type="doi">10.5281/zenodo.3364782</citation>
+        <citation type="doi">10.5281/zenodo.3364789</citation>
+    </citations>
+</tool>