Mercurial > repos > peterjc > seq_length
changeset 0:c323e29a8248 draft
Initial release v0.0.1
author | peterjc |
---|---|
date | Tue, 08 May 2018 09:35:45 -0400 |
parents | |
children | 458f987918a6 |
files | test-data/SRR639755_sample_strict.fastq test-data/SRR639755_sample_strict.length.tabular test-data/four_human_proteins.fasta test-data/four_human_proteins.length.tabular tools/seq_length/README.rst tools/seq_length/seq_length.py tools/seq_length/seq_length.xml tools/seq_length/tool_dependencies.xml |
diffstat | 8 files changed, 325 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/SRR639755_sample_strict.fastq Tue May 08 09:35:45 2018 -0400 @@ -0,0 +1,8 @@ +@SRR639755.6451003/1 +ATATCTGCAGTTAACATAAAAATATAGCACGAAAGTAACTTTAATATCTCCGACCACACGATAGCTAAGACCCAAACTGGGATTAGATACCCCGCTATGCT ++ +<?7A4ADDFHHDHIIBH<HGDHEIG>HBHGGEFH@?D<GDGGHDGGG>DDDFGFHGHGIGB;CEH>A>DEEC?B;;=@CC9;;?CCCCCCC@<9>5<<@A4 +@SRR639755.6451003/2 +CTCATGGGCTACACCTTGACCTAACTTTTTTGTGTTAAGGCACTTGTGCTTACTTTTCTTCCTTTTTAGGGTTTGCTGAAGATGGCGGTATGTAGGCTGAA ++ +@@@F=DDDHGFHHBHEH>HDHIEH8CECCFAGDBHH@DFBHGG@DHBFCHFH@FHIIIGDD@CHIJJF>EECBFFFEEE>AC>CC@@B89?C:B3::AB?>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/SRR639755_sample_strict.length.tabular Tue May 08 09:35:45 2018 -0400 @@ -0,0 +1,3 @@ +#Identifier Length +SRR639755.6451003/1 101 +SRR639755.6451003/2 101
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/four_human_proteins.fasta Tue May 08 09:35:45 2018 -0400 @@ -0,0 +1,61 @@ +>sp|Q9BS26|ERP44_HUMAN Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1 +MHPAVFLSLPDLRCSLLLLVTWVFTPVTTEITSLDTENIDEILNNADVALVNFYADWCRF +SQMLHPIFEEASDVIKEEFPNENQVVFARVDCDQHSDIAQRYRISKYPTLKLFRNGMMMK +REYRGQRSVKALADYIRQQKSDPIQEIRDLAEITTLDRSKRNIIGYFEQKDSDNYRVFER +VANILHDDCAFLSAFGDVSKPERYSGDNIIYKPPGHSAPDMVYLGAMTNFDVTYNWIQDK +CVPLVREITFENGEELTEEGLPFLILFHMKEDTESLEIFQNEVARQLISEKGTINFLHAD +CDKFRHPLLHIQKTPADCPVIAIDSFRHMYVFGDFKDVLIPGKLKQFVFDLHSGKLHREF +HHGPDPTDTAPGEQAQDVASSPPESSFQKLAPSEYRYTLLRDRDEL +>sp|Q9NSY1|BMP2K_HUMAN BMP-2-inducible protein kinase OS=Homo sapiens GN=BMP2K PE=1 SV=2 +MKKFSRMPKSEGGSGGGAAGGGAGGAGAGAGCGSGGSSVGVRVFAVGRHQVTLEESLAEG +GFSTVFLVRTHGGIRCALKRMYVNNMPDLNVCKREITIMKELSGHKNIVGYLDCAVNSIS +DNVWEVLILMEYCRAGQVVNQMNKKLQTGFTEPEVLQIFCDTCEAVARLHQCKTPIIHRD +LKVENILLNDGGNYVLCDFGSATNKFLNPQKDGVNVVEEEIKKYTTLSYRAPEMINLYGG +KPITTKADIWALGCLLYKLCFFTLPFGESQVAICDGNFTIPDNSRYSRNIHCLIRFMLEP +DPEHRPDIFQVSYFAFKFAKKDCPVSNINNSSIPSALPEPMTASEAAARKSQIKARITDT +IGPTETSIAPRQRPKANSATTATPSVLTIQSSATPVKVLAPGEFGNHRPKGALRPGNGPE +ILLGQGPPQQPPQQHRVLQQLQQGDWRLQQLHLQHRHPHQQQQQQQQQQQQQQQQQQQQQ +QQQQQQHHHHHHHHLLQDAYMQQYQHATQQQQMLQQQFLMHSVYQPQPSASQYPTMMPQY +QQAFFQQQMLAQHQPSQQQASPEYLTSPQEFSPALVSYTSSLPAQVGTIMDSSYSANRSV +ADKEAIANFTNQKNISNPPDMSGWNPFGEDNFSKLTEEELLDREFDLLRSNRLEERASSD +KNVDSLSAPHNHPPEDPFGSVPFISHSGSPEKKAEHSSINQENGTANPIKNGKTSPASKD +QRTGKKTSVQGQVQKGNDESESDFESDPPSPKSSEEEEQDDEEVLQGEQGDFNDDDTEPE +NLGHRPLLMDSEDEEEEEKHSSDSDYEQAKAKYSDMSSVYRDRSGSGPTQDLNTILLTSA +QLSSDVAVETPKQEFDVFGAVPFFAVRAQQPQQEKNEKNLPQHRFPAAGLEQEEFDVFTK +APFSKKVNVQECHAVGPEAHTIPGYPKSVDVFGSTPFQPFLTSTSKSESNEDLFGLVPFD +EITGSQQQKVKQRSLQKLSSRQRRTKQDMSKSNGKRHHGTPTSTKKTLKPTYRTPERARR +HKKVGRRDSQSSNEFLTISDSKENISVALTDGKDRGNVLQPEESLLDPFGAKPFHSPDLS +WHPPHQGLSDIRADHNTVLPGRPRQNSLHGSFHSADVLKMDDFGAVPFTELVVQSITPHQ +SQQSQPVELDPFGAAPFPSKQ +>sp|P06213|INSR_HUMAN Insulin receptor OS=Homo 
sapiens GN=INSR PE=1 SV=4 +MATGGRRGAAAAPLLVAVAALLLGAAGHLYPGEVCPGMDIRNNLTRLHELENCSVIEGHL +QILLMFKTRPEDFRDLSFPKLIMITDYLLLFRVYGLESLKDLFPNLTVIRGSRLFFNYAL +VIFEMVHLKELGLYNLMNITRGSVRIEKNNELCYLATIDWSRILDSVEDNYIVLNKDDNE +ECGDICPGTAKGKTNCPATVINGQFVERCWTHSHCQKVCPTICKSHGCTAEGLCCHSECL +GNCSQPDDPTKCVACRNFYLDGRCVETCPPPYYHFQDWRCVNFSFCQDLHHKCKNSRRQG +CHQYVIHNNKCIPECPSGYTMNSSNLLCTPCLGPCPKVCHLLEGEKTIDSVTSAQELRGC +TVINGSLIINIRGGNNLAAELEANLGLIEEISGYLKIRRSYALVSLSFFRKLRLIRGETL +EIGNYSFYALDNQNLRQLWDWSKHNLTITQGKLFFHYNPKLCLSEIHKMEEVSGTKGRQE +RNDIALKTNGDQASCENELLKFSYIRTSFDKILLRWEPYWPPDFRDLLGFMLFYKEAPYQ +NVTEFDGQDACGSNSWTVVDIDPPLRSNDPKSQNHPGWLMRGLKPWTQYAIFVKTLVTFS +DERRTYGAKSDIIYVQTDATNPSVPLDPISVSNSSSQIILKWKPPSDPNGNITHYLVFWE +RQAEDSELFELDYCLKGLKLPSRTWSPPFESEDSQKHNQSEYEDSAGECCSCPKTDSQIL +KELEESSFRKTFEDYLHNVVFVPRKTSSGTGAEDPRPSRKRRSLGDVGNVTVAVPTVAAF +PNTSSTSVPTSPEEHRPFEKVVNKESLVISGLRHFTGYRIELQACNQDTPEERCSVAAYV +SARTMPEAKADDIVGPVTHEIFENNVVHLMWQEPKEPNGLIVLYEVSYRRYGDEELHLCV +SRKHFALERGCRLRGLSPGNYSVRIRATSLAGNGSWTEPTYFYVTDYLDVPSNIAKIIIG +PLIFVFLFSVVIGSIYLFLRKRQPDGPLGPLYASSNPEYLSASDVFPCSVYVPDEWEVSR +EKITLLRELGQGSFGMVYEGNARDIIKGEAETRVAVKTVNESASLRERIEFLNEASVMKG +FTCHHVVRLLGVVSKGQPTLVVMELMAHGDLKSYLRSLRPEAENNPGRPPPTLQEMIQMA +AEIADGMAYLNAKKFVHRDLAARNCMVAHDFTVKIGDFGMTRDIYETDYYRKGGKGLLPV +RWMAPESLKDGVFTTSSDMWSFGVVLWEITSLAEQPYQGLSNEQVLKFVMDGGYLDQPDN +CPERVTDLMRMCWQFNPKMRPTFLEIVNLLKDDLHPSFPEVSFFHSEENKAPESEELEME +FEDMENVPLDRSSHCQREEAGGRDGGSSLGFKRSYEEHIPYTHMNGGKKNGRILTLPRSN +PS +>sp|P08100|OPSD_HUMAN Rhodopsin OS=Homo sapiens GN=RHO PE=1 SV=1 +MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLY +VTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLG +GEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIP +EGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQES +ATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAI +YNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/four_human_proteins.length.tabular Tue May 08 09:35:45 2018 -0400 @@ -0,0 +1,5 @@ +#Identifier Length +sp|Q9BS26|ERP44_HUMAN 406 +sp|Q9NSY1|BMP2K_HUMAN 1161 +sp|P06213|INSR_HUMAN 1382 +sp|P08100|OPSD_HUMAN 348
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/seq_length/README.rst Tue May 08 09:35:45 2018 -0400 @@ -0,0 +1,119 @@ +Galaxy tool to report lengths of FASTA, QUAL, FASTQ or SFF sequences +==================================================================== + +This tool is copyright 2011-2017 by Peter Cock, The James Hutton Institute +(formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved. +See the licence text below. + +This tool is a short Python script (using Biopython library functions) to report +the length of each sequence in a FASTA, QUAL, FASTQ, or SFF file as a two-column +tabular file (identifier and length). The output order follows that of the sequence file, +and if there are duplicates in the input sequence file, there will be duplicates +in the output tabular file. + +This tool is available from the Galaxy Tool Shed, + +* http://toolshed.g2.bx.psu.edu/view/peterjc/seq_length + +See also the sister tools to filter or select sequence files according to IDs +from column(s) of tabular file: + +* http://toolshed.g2.bx.psu.edu/view/peterjc/seq_filter_by_id +* http://toolshed.g2.bx.psu.edu/view/peterjc/seq_select_by_id + + +Automated Installation +====================== + +This should be straightforward using the Galaxy Tool Shed, which should be +able to automatically install the dependency on Biopython, and then install +this tool and run its unit tests. + + +Manual Installation +=================== + +There are just two files to install to use this tool from within Galaxy: + +* ``seq_length.py`` (the Python script) +* ``seq_length.xml`` (the Galaxy tool definition) + +The suggested location is in a dedicated ``tools/seq_length`` folder. + +You will also need to modify the ``tool_conf.xml`` file to tell Galaxy to offer the +tool. One suggested location is in the filters section. 
Simply add the line:: + + <tool file="seq_length/seq_length.xml" /> + +If you wish to run the unit tests, also move/copy the ``test-data/`` files +under Galaxy's ``test-data/`` folder. Then:: + + $ ./run_tests.sh -id seq_length + +You will also need to install Biopython 1.54 or later. That's it. + + +History +======= + +======= ====================================================================== +Version Changes +------- ---------------------------------------------------------------------- +v0.0.1 - Initial version. +======= ====================================================================== + + +Developers +========== + +Development is here: + +https://github.com/peterjc/pico_galaxy/tree/master/tools/seq_length + +For pushing a release to the test or main "Galaxy Tool Shed", use the following +Planemo commands (which requires you have set your Tool Shed access details in +``~/.planemo.yml`` and that you have access rights on the Tool Shed):: + + $ planemo shed_update -t testtoolshed --check_diff tools/seq_length/ + ... + +or:: + + $ planemo shed_update -t toolshed --check_diff tools/seq_length/ + ... + +To just build and check the tar ball, use:: + + $ planemo shed_upload --tar_only tools/seq_length/ + ... 
+ $ tar -tzf shed_upload.tar.gz + test-data/SRR639755_sample_strict.fastq + test-data/SRR639755_sample_strict.length.tabular + test-data/four_human_proteins.fasta + test-data/four_human_proteins.length.tabular + tools/seq_length/README.rst + tools/seq_length/seq_length.py + tools/seq_length/seq_length.xml + tools/seq_length/tool_dependencies.xml + + +Licence (MIT) +============= + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE.
#!/usr/bin/env python
# File: tools/seq_length/seq_length.py
"""Compute length of FASTA, QUAL, FASTQ or SFF sequences.

Takes three command line options: input sequence filename, input type
(e.g. FASTA or SFF) and the output filename (tabular).

This tool is a short Python script which requires Biopython 1.54 or later
for SFF file support. If you use this tool in scientific work leading to a
publication, please cite the Biopython application note:

Cock et al 2009. Biopython: freely available Python tools for computational
molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.

This script is copyright 2018 by Peter Cock, The James Hutton Institute UK.
All rights reserved. See accompanying text file for licence details (MIT
license).
"""

from __future__ import print_function

import sys


def get_biopython_format(seq_format):
    """Translate a Galaxy datatype name into a Biopython SeqIO format name.

    Args:
        seq_format: The format name given on the command line (the tool
            wrapper passes the input dataset's extension), e.g. ``fasta``,
            ``fastqsanger``, ``qual``, ``csfasta`` or ``sff``.

    Returns:
        The format string to hand to ``Bio.SeqIO.parse``.

    Raises:
        ValueError: If ``seq_format`` is not one of the supported formats.
            The message is the text the script reports to the user.
    """
    lowered = seq_format.lower()
    if seq_format.startswith("fastq"):
        # We don't care about the quality score encoding, just
        # need to translate Galaxy format name into something
        # Biopython will accept:
        return "fastq"
    if lowered == "csfasta":
        # I have not tested with colour space FASTA
        return "fasta"
    if lowered == "sff":
        # The masked/trimmed numbers are more interesting.
        # (This branch used to compare the bound method ``seq_format.lower``
        # to "sff", which is never equal, so SFF input was always rejected.)
        return "sff-trim"
    if lowered in ("fasta", "qual"):
        return lowered
    # TODO: Does Galaxy understand GenBank, EMBL, etc yet?
    raise ValueError("Unexpected format argument: %r" % seq_format)


def main(argv=None):
    """Run the command line tool.

    Args:
        argv: Full argument vector including the program name; defaults to
            ``sys.argv``. Expected form is
            ``[prog, input_file, format, output_file]``, or any vector
            containing ``-v`` / ``--version``.

    Raises:
        SystemExit: With code 0 after printing the version, or with an error
            message if Biopython is missing, the argument count is wrong, or
            the format is not supported.
    """
    if argv is None:
        argv = sys.argv

    # Answer the version query before importing Biopython, so that it works
    # even when the library is not installed.
    if "-v" in argv or "--version" in argv:
        print("v0.0.1")
        sys.exit(0)

    try:
        from Bio import SeqIO
    except ImportError:
        sys.exit("Missing required Python library Biopython.")

    # Parse Command Line
    try:
        in_file, seq_format, out_file = argv[1:]
    except ValueError:
        sys.exit("Expected three arguments (input file, format, output file), "
                 "got %i:\n%s" % (len(argv) - 1, " ".join(argv)))

    try:
        biopython_format = get_biopython_format(seq_format)
    except ValueError as err:
        sys.exit(str(err))

    count = 0
    total = 0
    with open(out_file, "w") as out_handle:
        out_handle.write("#Identifier\tLength\n")
        for record in SeqIO.parse(in_file, biopython_format):
            count += 1
            length = len(record)
            total += length
            out_handle.write("%s\t%i\n" % (record.id, length))
    print("%i sequences, total length %i" % (count, total))


if __name__ == "__main__":
    main()
<!-- File: tools/seq_length/seq_length.xml
     Galaxy tool definition wrapping seq_length.py: takes one sequence file
     and writes a two-column tabular file of identifiers and lengths. -->
<tool id="seq_length" name="Sequence lengths" version="0.0.1">
    <!-- Shown after the tool name in the Galaxy tool panel. -->
    <description>from FASTA, QUAL, FASTQ, or SFF file</description>
    <requirements>
        <!-- This is currently the last release of Biopython which is available via Galaxy's legacy XML packaging system -->
        <requirement type="package" version="1.67">biopython</requirement>
    </requirements>
    <version_command>
python $__tool_directory__/seq_length.py --version
</version_command>
    <!-- Arguments: input filename, Galaxy datatype extension of the input, output filename. -->
    <command detect_errors="aggressive">
python $__tool_directory__/seq_length.py '$input_file' '$input_file.ext' '$output_file'
    </command>
    <inputs>
        <param name="input_file" type="data" format="fasta,qual,fastq,sff" label="Sequence file" help="FASTA, QUAL, FASTQ, or SFF format." />
    </inputs>
    <outputs>
        <data name="output_file" format="tabular" label="${on_string} length"/>
    </outputs>
    <tests>
        <test>
            <param name="input_file" value="four_human_proteins.fasta" ftype="fasta" />
            <output name="output_file" file="four_human_proteins.length.tabular" ftype="tabular" />
        </test>
        <test>
            <param name="input_file" value="SRR639755_sample_strict.fastq" ftype="fastq" />
            <output name="output_file" file="SRR639755_sample_strict.length.tabular" ftype="tabular" />
        </test>
    </tests>
    <help>
**What it does**

Takes a FASTA, QUAL, FASTQ or Standard Flowgram Format (SFF) file and produces a
two-column tabular file containing one line per sequence giving the sequence
identifier and the associated sequence's length.

WARNING: If there are any duplicate sequence identifiers, these will all appear
in the tabular output.

**References**

This tool uses Biopython's ``SeqIO`` library to read sequences, so please cite
the Biopython application note (and Galaxy too of course):

Cock et al (2009). Biopython: freely available Python tools for computational
molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.

This tool is available to install into other Galaxy Instances via the Galaxy
Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/seq_length
    </help>
    <citations>
        <citation type="doi">10.1093/bioinformatics/btp163</citation>
    </citations>
</tool>
<?xml version="1.0"?>
<!-- File: tools/seq_length/tool_dependencies.xml
     Declares the single package dependency of this tool: Biopython 1.67,
     taken from the "package_biopython_1_67" repository (owner "biopython")
     on the Tool Shed named in the toolshed attribute, pinned to changeset
     revision a12f73c3b116. The version here is the same 1.67 requested by
     the requirement element in seq_length.xml.
     NOTE(review): presumably these two version strings must be kept in step
     for Galaxy to resolve the dependency; confirm before bumping either. -->
<tool_dependency>
    <package name="biopython" version="1.67">
        <repository changeset_revision="a12f73c3b116" name="package_biopython_1_67" owner="biopython" toolshed="https://toolshed.g2.bx.psu.edu" />
    </package>
</tool_dependency>