Mercurial > repos > peterjc > seq_length

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/SRR639755_sample_strict.fastq	Tue May 08 09:35:45 2018 -0400
@@ -0,0 +1,8 @@
+@SRR639755.6451003/1
+ATATCTGCAGTTAACATAAAAATATAGCACGAAAGTAACTTTAATATCTCCGACCACACGATAGCTAAGACCCAAACTGGGATTAGATACCCCGCTATGCT
++
+<?7A4ADDFHHDHIIBH<HGDHEIG>HBHGGEFH@?D<GDGGHDGGG>DDDFGFHGHGIGB;CEH>A>DEEC?B;;=@CC9;;?CCCCCCC@<9>5<<@A4
+@SRR639755.6451003/2
+CTCATGGGCTACACCTTGACCTAACTTTTTTGTGTTAAGGCACTTGTGCTTACTTTTCTTCCTTTTTAGGGTTTGCTGAAGATGGCGGTATGTAGGCTGAA
++
+@@@F=DDDHGFHHBHEH>HDHIEH8CECCFAGDBHH@DFBHGG@DHBFCHFH@FHIIIGDD@CHIJJF>EECBFFFEEE>AC>CC@@B89?C:B3::AB?>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/SRR639755_sample_strict.length.tabular	Tue May 08 09:35:45 2018 -0400
@@ -0,0 +1,3 @@
+#Identifier	Length
+SRR639755.6451003/1	101
+SRR639755.6451003/2	101
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/four_human_proteins.fasta	Tue May 08 09:35:45 2018 -0400
@@ -0,0 +1,61 @@
+>sp|Q9BS26|ERP44_HUMAN Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1
+MHPAVFLSLPDLRCSLLLLVTWVFTPVTTEITSLDTENIDEILNNADVALVNFYADWCRF
+SQMLHPIFEEASDVIKEEFPNENQVVFARVDCDQHSDIAQRYRISKYPTLKLFRNGMMMK
+REYRGQRSVKALADYIRQQKSDPIQEIRDLAEITTLDRSKRNIIGYFEQKDSDNYRVFER
+VANILHDDCAFLSAFGDVSKPERYSGDNIIYKPPGHSAPDMVYLGAMTNFDVTYNWIQDK
+CVPLVREITFENGEELTEEGLPFLILFHMKEDTESLEIFQNEVARQLISEKGTINFLHAD
+CDKFRHPLLHIQKTPADCPVIAIDSFRHMYVFGDFKDVLIPGKLKQFVFDLHSGKLHREF
+HHGPDPTDTAPGEQAQDVASSPPESSFQKLAPSEYRYTLLRDRDEL
+>sp|Q9NSY1|BMP2K_HUMAN BMP-2-inducible protein kinase OS=Homo sapiens GN=BMP2K PE=1 SV=2
+MKKFSRMPKSEGGSGGGAAGGGAGGAGAGAGCGSGGSSVGVRVFAVGRHQVTLEESLAEG
+GFSTVFLVRTHGGIRCALKRMYVNNMPDLNVCKREITIMKELSGHKNIVGYLDCAVNSIS
+DNVWEVLILMEYCRAGQVVNQMNKKLQTGFTEPEVLQIFCDTCEAVARLHQCKTPIIHRD
+LKVENILLNDGGNYVLCDFGSATNKFLNPQKDGVNVVEEEIKKYTTLSYRAPEMINLYGG
+KPITTKADIWALGCLLYKLCFFTLPFGESQVAICDGNFTIPDNSRYSRNIHCLIRFMLEP
+DPEHRPDIFQVSYFAFKFAKKDCPVSNINNSSIPSALPEPMTASEAAARKSQIKARITDT
+IGPTETSIAPRQRPKANSATTATPSVLTIQSSATPVKVLAPGEFGNHRPKGALRPGNGPE
+ILLGQGPPQQPPQQHRVLQQLQQGDWRLQQLHLQHRHPHQQQQQQQQQQQQQQQQQQQQQ
+QQQQQQHHHHHHHHLLQDAYMQQYQHATQQQQMLQQQFLMHSVYQPQPSASQYPTMMPQY
+QQAFFQQQMLAQHQPSQQQASPEYLTSPQEFSPALVSYTSSLPAQVGTIMDSSYSANRSV
+ADKEAIANFTNQKNISNPPDMSGWNPFGEDNFSKLTEEELLDREFDLLRSNRLEERASSD
+KNVDSLSAPHNHPPEDPFGSVPFISHSGSPEKKAEHSSINQENGTANPIKNGKTSPASKD
+QRTGKKTSVQGQVQKGNDESESDFESDPPSPKSSEEEEQDDEEVLQGEQGDFNDDDTEPE
+NLGHRPLLMDSEDEEEEEKHSSDSDYEQAKAKYSDMSSVYRDRSGSGPTQDLNTILLTSA
+QLSSDVAVETPKQEFDVFGAVPFFAVRAQQPQQEKNEKNLPQHRFPAAGLEQEEFDVFTK
+APFSKKVNVQECHAVGPEAHTIPGYPKSVDVFGSTPFQPFLTSTSKSESNEDLFGLVPFD
+EITGSQQQKVKQRSLQKLSSRQRRTKQDMSKSNGKRHHGTPTSTKKTLKPTYRTPERARR
+HKKVGRRDSQSSNEFLTISDSKENISVALTDGKDRGNVLQPEESLLDPFGAKPFHSPDLS
+WHPPHQGLSDIRADHNTVLPGRPRQNSLHGSFHSADVLKMDDFGAVPFTELVVQSITPHQ
+SQQSQPVELDPFGAAPFPSKQ
+>sp|P06213|INSR_HUMAN Insulin receptor OS=Homo sapiens GN=INSR PE=1 SV=4
+MATGGRRGAAAAPLLVAVAALLLGAAGHLYPGEVCPGMDIRNNLTRLHELENCSVIEGHL
+QILLMFKTRPEDFRDLSFPKLIMITDYLLLFRVYGLESLKDLFPNLTVIRGSRLFFNYAL
+VIFEMVHLKELGLYNLMNITRGSVRIEKNNELCYLATIDWSRILDSVEDNYIVLNKDDNE
+ECGDICPGTAKGKTNCPATVINGQFVERCWTHSHCQKVCPTICKSHGCTAEGLCCHSECL
+GNCSQPDDPTKCVACRNFYLDGRCVETCPPPYYHFQDWRCVNFSFCQDLHHKCKNSRRQG
+CHQYVIHNNKCIPECPSGYTMNSSNLLCTPCLGPCPKVCHLLEGEKTIDSVTSAQELRGC
+TVINGSLIINIRGGNNLAAELEANLGLIEEISGYLKIRRSYALVSLSFFRKLRLIRGETL
+EIGNYSFYALDNQNLRQLWDWSKHNLTITQGKLFFHYNPKLCLSEIHKMEEVSGTKGRQE
+RNDIALKTNGDQASCENELLKFSYIRTSFDKILLRWEPYWPPDFRDLLGFMLFYKEAPYQ
+NVTEFDGQDACGSNSWTVVDIDPPLRSNDPKSQNHPGWLMRGLKPWTQYAIFVKTLVTFS
+DERRTYGAKSDIIYVQTDATNPSVPLDPISVSNSSSQIILKWKPPSDPNGNITHYLVFWE
+RQAEDSELFELDYCLKGLKLPSRTWSPPFESEDSQKHNQSEYEDSAGECCSCPKTDSQIL
+KELEESSFRKTFEDYLHNVVFVPRKTSSGTGAEDPRPSRKRRSLGDVGNVTVAVPTVAAF
+PNTSSTSVPTSPEEHRPFEKVVNKESLVISGLRHFTGYRIELQACNQDTPEERCSVAAYV
+SARTMPEAKADDIVGPVTHEIFENNVVHLMWQEPKEPNGLIVLYEVSYRRYGDEELHLCV
+SRKHFALERGCRLRGLSPGNYSVRIRATSLAGNGSWTEPTYFYVTDYLDVPSNIAKIIIG
+PLIFVFLFSVVIGSIYLFLRKRQPDGPLGPLYASSNPEYLSASDVFPCSVYVPDEWEVSR
+EKITLLRELGQGSFGMVYEGNARDIIKGEAETRVAVKTVNESASLRERIEFLNEASVMKG
+FTCHHVVRLLGVVSKGQPTLVVMELMAHGDLKSYLRSLRPEAENNPGRPPPTLQEMIQMA
+AEIADGMAYLNAKKFVHRDLAARNCMVAHDFTVKIGDFGMTRDIYETDYYRKGGKGLLPV
+RWMAPESLKDGVFTTSSDMWSFGVVLWEITSLAEQPYQGLSNEQVLKFVMDGGYLDQPDN
+CPERVTDLMRMCWQFNPKMRPTFLEIVNLLKDDLHPSFPEVSFFHSEENKAPESEELEME
+FEDMENVPLDRSSHCQREEAGGRDGGSSLGFKRSYEEHIPYTHMNGGKKNGRILTLPRSN
+PS
+>sp|P08100|OPSD_HUMAN Rhodopsin OS=Homo sapiens GN=RHO PE=1 SV=1
+MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLY
+VTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLG
+GEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIP
+EGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQES
+ATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAI
+YNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/four_human_proteins.length.tabular	Tue May 08 09:35:45 2018 -0400
@@ -0,0 +1,5 @@
+#Identifier	Length
+sp|Q9BS26|ERP44_HUMAN	406
+sp|Q9NSY1|BMP2K_HUMAN	1161
+sp|P06213|INSR_HUMAN	1382
+sp|P08100|OPSD_HUMAN	348
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/seq_length/README.rst	Tue May 08 09:35:45 2018 -0400
@@ -0,0 +1,119 @@
+Galaxy tool to report FASTA, QUAL, FASTQ or SFF sequence lengths
+================================================================
+
+This tool is copyright 2011-2017 by Peter Cock, The James Hutton Institute
+(formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved.
+See the licence text below.
+
+This tool is a short Python script (using Biopython library functions) to report
+the length of each sequence in a FASTA, QUAL, FASTQ, or SFF file, given as the
+two columns (identifier, length) of a tabular file. The output order follows that
+of the sequence file, and if there are duplicate identifiers in the input sequence
+file, there will be duplicates in the output tabular file.
+
+This tool is available from the Galaxy Tool Shed,
+
+* http://toolshed.g2.bx.psu.edu/view/peterjc/seq_length
+
+See also the sister tools to filter or select sequence files according to IDs
+from column(s) of tabular file:
+
+* http://toolshed.g2.bx.psu.edu/view/peterjc/seq_filter_by_id
+* http://toolshed.g2.bx.psu.edu/view/peterjc/seq_select_by_id
+
+
+Automated Installation
+======================
+
+This should be straightforward using the Galaxy Tool Shed, which should be
+able to automatically install the dependency on Biopython, and then install
+this tool and run its unit tests.
+
+
+Manual Installation
+===================
+
+There are just two files to install to use this tool from within Galaxy:
+
+* ``seq_length.py`` (the Python script)
+* ``seq_length.xml`` (the Galaxy tool definition)
+
+The suggested location is in a dedicated ``tools/seq_length`` folder.
+
+You will also need to modify the ``tool_conf.xml`` file to tell Galaxy to offer the
+tool. One suggested location is in the filters section. Simply add the line::
+
+    <tool file="seq_length/seq_length.xml" />
+
+If you wish to run the unit tests, also move/copy the ``test-data/`` files
+under Galaxy's ``test-data/`` folder. Then::
+
+    $ ./run_tests.sh -id seq_length
+
+You will also need to install Biopython 1.54 or later. That's it.
+
+
+History
+=======
+
+======= ======================================================================
+Version Changes
+------- ----------------------------------------------------------------------
+v0.0.1  - Initial version.
+======= ======================================================================
+
+
+Developers
+==========
+
+Development is here:
+
+https://github.com/peterjc/pico_galaxy/tree/master/tools/seq_length
+
+For pushing a release to the test or main "Galaxy Tool Shed", use the following
+Planemo commands (which requires you have set your Tool Shed access details in
+``~/.planemo.yml`` and that you have access rights on the Tool Shed)::
+
+    $ planemo shed_update -t testtoolshed --check_diff tools/seq_length/
+    ...
+
+or::
+
+    $ planemo shed_update -t toolshed --check_diff tools/seq_length/
+    ...
+
+To just build and check the tar ball, use::
+
+    $ planemo shed_upload --tar_only tools/seq_length/
+    ...
+    $ tar -tzf shed_upload.tar.gz
+    test-data/SRR639755_sample_strict.fastq
+    test-data/SRR639755_sample_strict.length.tabular
+    test-data/four_human_proteins.fasta
+    test-data/four_human_proteins.length.tabular
+    tools/seq_length/README.rst
+    tools/seq_length/seq_length.py
+    tools/seq_length/seq_length.xml
+    tools/seq_length/tool_dependencies.xml
+
+
+Licence (MIT)
+=============
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/seq_length/seq_length.py	Tue May 08 09:35:45 2018 -0400
@@ -0,0 +1,69 @@
+#!/usr/bin/env python
+"""Compute length of FASTA, QUAL, FASTQ or SFF sequences.
+
+Takes three command line options: input sequence filename, input type
+(e.g. FASTA or SFF) and the output filename (tabular).
+
+This tool is a short Python script which requires Biopython 1.54 or later
+for SFF file support. If you use this tool in scientific work leading to a
+publication, please cite the Biopython application note:
+
+Cock et al 2009. Biopython: freely available Python tools for computational
+molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
+http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
+
+This script is copyright 2018 by Peter Cock, The James Hutton Institute UK.
+All rights reserved. See accompanying text file for licence details (MIT
+license).
+"""
+
+from __future__ import print_function
+
+import sys
+
+if "-v" in sys.argv or "--version" in sys.argv:
+    print("v0.0.1")
+    sys.exit(0)
+
+try:
+    from Bio import SeqIO
+except ImportError:
+    sys.exit("Missing required Python library Biopython.")
+
+
+# Parse Command Line
+try:
+    in_file, seq_format, out_file = sys.argv[1:]
+except ValueError:
+    sys.exit("Expected three arguments (input file, format, output file), "
+             "got %i:\n%s" % (len(sys.argv) - 1, " ".join(sys.argv)))
+
+
+if seq_format.startswith("fastq"):
+    # We don't care about the quality score encoding, just
+    # need to translate Galaxy format name into something
+    # Biopython will accept:
+    format = "fastq"
+elif seq_format.lower() == "csfasta":
+    # I have not tested with colour space FASTA
+    format = "fasta"
+elif seq_format.lower == "sff":
+    # The masked/trimmed numbers are more interesting
+    format = "sff-trim"
+elif seq_format.lower() in ["fasta", "qual"]:
+    format = seq_format.lower()
+else:
+    # TODO: Does Galaxy understand GenBank, EMBL, etc yet?
+    sys.exit("Unexpected format argument: %r" % seq_format)
+
+
+count = 0
+total = 0
+with open(out_file, "w") as out_handle:
+    out_handle.write("#Identifier\tLength\n")
+    for record in SeqIO.parse(in_file, format):
+        count += 1
+        length = len(record)
+        total += length
+        out_handle.write("%s\t%i\n" % (record.id, length))
+print("%i sequences, total length %i" % (count, total))
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/seq_length/seq_length.xml	Tue May 08 09:35:45 2018 -0400
@@ -0,0 +1,54 @@
+<tool id="seq_length" name="Sequence lengths" version="0.0.1">
+    <description>from FASTA, QUAL, FASTQ, or SFF file</description>
+    <requirements>
+        <!-- This is currently the last release of Biopython which is available via Galaxy's legacy XML packaging system -->
+        <requirement type="package" version="1.67">biopython</requirement>
+    </requirements>
+    <version_command>
+python $__tool_directory__/seq_length.py --version
+</version_command>
+    <command detect_errors="aggressive">
+python $__tool_directory__/seq_length.py '$input_file' '$input_file.ext' '$output_file'
+    </command>
+    <inputs>
+        <param name="input_file" type="data" format="fasta,qual,fastq,sff" label="Sequence file" help="FASTA, QUAL, FASTQ, or SFF format." />
+    </inputs>
+    <outputs>
+        <data name="output_file" format="tabular" label="${on_string} length"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="input_file" value="four_human_proteins.fasta" ftype="fasta" />
+            <output name="output_file" file="four_human_proteins.length.tabular" ftype="tabular" />
+        </test>
+        <test>
+            <param name="input_file" value="SRR639755_sample_strict.fastq" ftype="fastq" />
+            <output name="output_file" file="SRR639755_sample_strict.length.tabular" ftype="tabular" />
+        </test>
+    </tests>
+    <help>
+**What it does**
+
+Takes a FASTA, QUAL, FASTQ or Standard Flowgram Format (SFF) file and produces a
+two-column tabular file containing one line per sequence giving the sequence
+identifier and the associated sequence's length.
+
+WARNING: If there are any duplicate sequence identifiers, these will all appear
+in the tabular output.
+
+**References**
+
+This tool uses Biopython's ``SeqIO`` library to read sequences, so please cite
+the Biopython application note (and Galaxy too of course):
+
+Cock et al (2009). Biopython: freely available Python tools for computational
+molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
+http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
+
+This tool is available to install into other Galaxy Instances via the Galaxy
+Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/seq_length
+    </help>
+    <citations>
+        <citation type="doi">10.1093/bioinformatics/btp163</citation>
+    </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/seq_length/tool_dependencies.xml	Tue May 08 09:35:45 2018 -0400
@@ -0,0 +1,6 @@
+<?xml version="1.0"?>
+<tool_dependency>
+    <package name="biopython" version="1.67">
+        <repository changeset_revision="a12f73c3b116" name="package_biopython_1_67" owner="biopython" toolshed="https://toolshed.g2.bx.psu.edu" />
+    </package>
+</tool_dependency>