# HG changeset patch
# User peterjc
# Date 1381480756 14400
# Node ID 7c0642fc57ad33b45f168baf926e789b0711405a
# Parent 9c8c5079c8afdd57045504b06cf8ce2d0284c5f1
Uploaded v0.0.4, automatic dependency on Biopython 1.62, new README file, citation information, MIT licence.
Includes additional tested added in v0.0.3
diff -r 9c8c5079c8af -r 7c0642fc57ad tools/filters/seq_rename.py
--- a/tools/filters/seq_rename.py Mon Apr 29 13:14:23 2013 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,145 +0,0 @@
-#!/usr/bin/env python
-"""Rename FASTA, QUAL, FASTQ or SSF sequences with ID mapping from tabular file.
-
-Takes six command line options, tabular filename, current (old) ID column
-number (using one based counting), new ID column number (also using one based
-counting), input sequence filename, input type (e.g. FASTA or SFF) and the
-output filename (same format as input sequence file).
-
-When selecting from an SFF file, any Roche XML manifest in the input file is
-preserved in both output files.
-
-This tool is a short Python script which requires Biopython 1.54 or later
-for SFF file support. If you use this tool in scientific work leading to a
-publication, please cite the Biopython application note:
-
-Cock et al 2009. Biopython: freely available Python tools for computational
-molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
-http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
-
-This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute UK.
-All rights reserved. See accompanying text file for licence details (MIT/BSD
-style).
-
-This is version 0.0.2 of the script.
-"""
-import sys
-
-if "-v" in sys.argv or "--version" in sys.argv:
- print "v0.0.2"
- sys.exit(0)
-
-def stop_err(msg, err=1):
- sys.stderr.write(msg.rstrip() + "\n")
- sys.exit(err)
-
-#Parse Command Line
-try:
- tabular_file, old_col_arg, new_col_arg, in_file, seq_format, out_file = sys.argv[1:]
-except ValueError:
- stop_err("Expected six arguments (tabular file, old col, new col, input file, format, output file), got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv)))
-
-try:
- if old_col_arg.startswith("c"):
- old_column = int(old_col_arg[1:])-1
- else:
- old_column = int(old_col_arg)-1
-except ValueError:
- stop_err("Expected column number, got %s" % old_col_arg)
-try:
- if old_col_arg.startswith("c"):
- new_column = int(new_col_arg[1:])-1
- else:
- new_column = int(new_col_arg)-1
-except ValueError:
- stop_err("Expected column number, got %s" % new_col_arg)
-if old_column == new_column:
- stop_err("Old and new column arguments are the same!")
-
-def parse_ids(tabular_file, old_col, new_col):
- """Read tabular file and record all specified ID mappings."""
- handle = open(tabular_file, "rU")
- for line in handle:
- if not line.startswith("#"):
- parts = line.rstrip("\n").split("\t")
- yield parts[old_col].strip(), parts[new_col].strip()
- handle.close()
-
-#Load the rename mappings
-rename = dict(parse_ids(tabular_file, old_column, new_column))
-print "Loaded %i ID mappings" % len(rename)
-
-#Rewrite the sequence file
-if seq_format.lower()=="sff":
- #Use Biopython for this format
- renamed = 0
- def rename_seqrecords(records, mapping):
- global renamed #nasty, but practical!
- for record in records:
- try:
- record.id = mapping[record.id]
- renamed += 1
- except KeyError:
- pass
- yield record
-
- try:
- from Bio.SeqIO.SffIO import SffIterator, SffWriter
- except ImportError:
- stop_err("Requires Biopython 1.54 or later")
-
- try:
- from Bio.SeqIO.SffIO import ReadRocheXmlManifest
- except ImportError:
- #Prior to Biopython 1.56 this was a private function
- from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest
-
- in_handle = open(in_file, "rb") #must be binary mode!
- try:
- manifest = ReadRocheXmlManifest(in_handle)
- except ValueError:
- manifest = None
- out_handle = open(out_file, "wb")
- writer = SffWriter(out_handle, xml=manifest)
- in_handle.seek(0) #start again after getting manifest
- count = writer.write_file(rename_seqrecords(SffIterator(in_handle), rename))
- out_handle.close()
- in_handle.close()
-else:
- #Use Galaxy for FASTA, QUAL or FASTQ
- if seq_format.lower() in ["fasta", "csfasta"] \
- or seq_format.lower().startswith("qual"):
- from galaxy_utils.sequence.fasta import fastaReader, fastaWriter
- reader = fastaReader(open(in_file, "rU"))
- writer = fastaWriter(open(out_file, "w"))
- marker = ">"
- elif seq_format.lower().startswith("fastq"):
- from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
- reader = fastqReader(open(in_file, "rU"))
- writer = fastqWriter(open(out_file, "w"))
- marker = "@"
- else:
- stop_err("Unsupported file type %r" % seq_format)
- #Now do the renaming
- count = 0
- renamed = 0
- for record in reader:
- #The [1:] is because the fastaReader leaves the > on the identifier,
- #likewise the fastqReader leaves the @ on the identifier
- try:
- idn, descr = record.identifier[1:].split(None, 1)
- except ValueError:
- idn = record.identifier[1:]
- descr = None
- if idn in rename:
- if descr:
- record.identifier = "%s%s %s" % (marker, rename[idn], descr)
- else:
- record.identifier = "%s%s" % (marker, rename[idn])
- renamed += 1
- writer.write(record)
- count += 1
- writer.close()
- reader.close()
-
-print "Renamed %i out of %i records" % (renamed, count)
diff -r 9c8c5079c8af -r 7c0642fc57ad tools/filters/seq_rename.txt
--- a/tools/filters/seq_rename.txt Mon Apr 29 13:14:23 2013 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,87 +0,0 @@
-Galaxy tool to renamed FASTA, QUAL, FASTQ or SFF sequences
-==========================================================
-
-This tool is copyright 2011 by Peter Cock, The James Hutton Institute
-(formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved.
-See the licence text below.
-
-This tool is a short Python script (using Biopython library functions) to rename
-sequences from a FASTA, QUAL, FASTQ, or SFF file based on an ID mapping gives as
-two columns of a tabular file. The output order follows that of the sequence file,
-and if there are duplicates in the input sequence file, there will be duplicates
-in the output sequence file.
-
-See also the sister tools to filter or select sequence files according to IDs
-from column(s) of tabular file.
-
-
-Manual Installation
-===================
-
-There are just two files to install:
-
-* seq_rename.py (the Python script)
-* seq_rename.xml (the Galaxy tool definition)
-
-The suggested location is in the Galaxy folder tools/filters next to the tool
-for calling sff_extract.py for converting SFF to FASTQ or FASTA + QUAL.
-
-You will also need to modify the tools_conf.xml file to tell Galaxy to offer the
-tool. One suggested location is in the filters section. Simply add the line:
-
-
-
-You will also need to install Biopython 1.54 or later. That's it.
-
-
-History
-=======
-
-v0.0.1 - Initial version.
-v0.0.2 - Record script version when run from Galaxy.
- - Add unit test.
- - Check for errors using Python script's return code.
-
-
-Developers
-==========
-
-This script and related tools are being developed on the following hg branch:
-http://bitbucket.org/peterjc/galaxy-central/src/tools
-
-For making the "Galaxy Tool Shed" http://toolshed.g2.bx.psu.edu/ tarball use
-the following command from the Galaxy root folder:
-
-$ tar -czf seq_rename.tar.gz tools/filters/seq_rename.* test-data/four_human_proteins.fasta test-data/four_human_proteins.rename.tabular test-data/four_human_proteins.rename.fasta
-
-Check this worked:
-
-$ tar -tzf seq_rename.tar.gz
-tools/filter/seq_rename.py
-tools/filter/seq_rename.txt
-tools/filter/seq_rename.xml
-test-data/four_human_proteins.fasta
-test-data/four_human_proteins.rename.tabular
-test-data/four_human_proteins.rename.fasta
-
-
-Licence (MIT/BSD style)
-=======================
-
-Permission to use, copy, modify, and distribute this software and its
-documentation with or without modifications and for any purpose and
-without fee is hereby granted, provided that any copyright notices
-appear in all copies and that both those copyright notices and this
-permission notice appear in supporting documentation, and that the
-names of the contributors or copyright holders not be used in
-advertising or publicity pertaining to distribution of the software
-without specific prior permission.
-
-THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL
-WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE
-CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT
-OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
-OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
-OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
-OR PERFORMANCE OF THIS SOFTWARE.
diff -r 9c8c5079c8af -r 7c0642fc57ad tools/filters/seq_rename.xml
--- a/tools/filters/seq_rename.xml Mon Apr 29 13:14:23 2013 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,68 +0,0 @@
-
- with ID mapping from a tabular file
- seq_rename.py --version
-
-seq_rename.py $input_tabular $old_column $new_column $input_file $input_file.ext $output_file
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- Bio
-
-
-
-**What it does**
-
-Takes a FASTA, QUAL, FASTQ or Standard Flowgram Format (SFF) file and produces a
-new sequence file (of the same format) where the sequence identifiers have been
-renamed according two the specified columns a the tabular file.
-
-WARNING: If you have any duplicates in the intput sequence file, you will still
-have duplicate sequences in the output.
-
-WARNING: If the tabular file has more than one new name for any old ID, the
-last one is used.
-
-**Citation**
-
-This tool uses Biopython to read and write SFF files. If you use this tool in
-scientific work leading to a publication, please cite the Biopython application
-note (and Galaxy too of course):
-
-Cock et al 2009. Biopython: freely available Python tools for computational
-molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
-http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
-
-
-
diff -r 9c8c5079c8af -r 7c0642fc57ad tools/seq_rename/README.rst
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/seq_rename/README.rst Fri Oct 11 04:39:16 2013 -0400
@@ -0,0 +1,121 @@
+Galaxy tool to rename FASTA, QUAL, FASTQ or SFF sequences
+=========================================================
+
+This tool is copyright 2011-2013 by Peter Cock, The James Hutton Institute
+(formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved.
+See the licence text below.
+
+This tool is a short Python script (using Biopython library functions) to rename
+sequences from a FASTA, QUAL, FASTQ, or SFF file based on an ID mapping given as
+two columns of a tabular file. The output order follows that of the sequence file,
+and if there are duplicates in the input sequence file, there will be duplicates
+in the output sequence file.
+
+This tool is available from the Galaxy Tool Shed,
+
+* http://toolshed.g2.bx.psu.edu/view/peterjc/seq_rename
+
+See also the sister tools to filter or select sequence files according to IDs
+from column(s) of tabular file:
+
+* http://toolshed.g2.bx.psu.edu/view/peterjc/seq_filter_by_id
+* http://toolshed.g2.bx.psu.edu/view/peterjc/seq_select_by_id
+
+
+Automated Installation
+======================
+
+This should be straightforward using the Galaxy Tool Shed, which should be
+able to automatically install the dependency on Biopython, and then install
+this tool and run its unit tests.
+
+
+Manual Installation
+===================
+
+There are just two files to install to use this tool from within Galaxy:
+
+* seq_rename.py (the Python script)
+* seq_rename.xml (the Galaxy tool definition)
+
+The suggested location is in a dedicated tools/seq_rename folder.
+
+You will also need to modify the tool_conf.xml file to tell Galaxy to offer the
+tool. One suggested location is in the filters section. Simply add the line::
+
+  <tool file="seq_rename/seq_rename.xml" />
+
+If you wish to run the unit tests, also add this to tool_conf.xml.sample
+and move/copy the test-data files under Galaxy's test-data folder. Then::
+
+ $ ./run_functional_tests.sh -id seq_rename
+
+You will also need to install Biopython 1.54 or later. That's it.
+
+
+History
+=======
+
+======= ======================================================================
+Version Changes
+------- ----------------------------------------------------------------------
+v0.0.1 - Initial version.
+v0.0.2 - Record script version when run from Galaxy.
+ - Add unit test.
+ - Check for errors using Python script's return code.
+v0.0.3 - Link to Tool Shed added to help text and this documentation.
+v0.0.4 - Automated installation of Biopython dependency.
+ - Use reStructuredText for this README file.
+ - Adopt standard MIT License.
+ - Updated citation information (Cock et al. 2013).
+ - Development moved to GitHub, https://github.com/peterjc/pico_galaxy
+ - Renamed folder and adopted README.rst naming.
+======= ======================================================================
+
+
+Developers
+==========
+
+This script and related tools were initially developed on the following hg branch:
+http://bitbucket.org/peterjc/galaxy-central/src/tools
+
+Development has now moved to a dedicated GitHub repository:
+https://github.com/peterjc/pico_galaxy/tree/master/tools
+
+For making the "Galaxy Tool Shed" http://toolshed.g2.bx.psu.edu/ tarball use
+the following command from the Galaxy root folder::
+
+ $ tar -czf seq_rename.tar.gz tools/seq_rename/README.rst tools/seq_rename/seq_rename.* tools/seq_rename/repository_dependencies.xml test-data/four_human_proteins.fasta test-data/four_human_proteins.rename.tabular test-data/four_human_proteins.rename.fasta
+
+Check this worked::
+
+ $ tar -tzf seq_rename.tar.gz
+ tools/seq_rename/README.rst
+ tools/seq_rename/seq_rename.py
+ tools/seq_rename/seq_rename.xml
+ tools/seq_rename/repository_dependencies.xml
+ test-data/four_human_proteins.fasta
+ test-data/four_human_proteins.rename.tabular
+ test-data/four_human_proteins.rename.fasta
+
+
+Licence (MIT)
+=============
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff -r 9c8c5079c8af -r 7c0642fc57ad tools/seq_rename/repository_dependencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/seq_rename/repository_dependencies.xml Fri Oct 11 04:39:16 2013 -0400
@@ -0,0 +1,6 @@
+
+
+
+
+
diff -r 9c8c5079c8af -r 7c0642fc57ad tools/seq_rename/seq_rename.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/seq_rename/seq_rename.py Fri Oct 11 04:39:16 2013 -0400
@@ -0,0 +1,145 @@
+#!/usr/bin/env python
+"""Rename FASTA, QUAL, FASTQ or SFF sequences with ID mapping from tabular file.
+
+Takes six command line options, tabular filename, current (old) ID column
+number (using one based counting), new ID column number (also using one based
+counting), input sequence filename, input type (e.g. FASTA or SFF) and the
+output filename (same format as input sequence file).
+
+When renaming an SFF file, any Roche XML manifest in the input file is
+preserved in the output file.
+
+This tool is a short Python script which requires Biopython 1.54 or later
+for SFF file support. If you use this tool in scientific work leading to a
+publication, please cite the Biopython application note:
+
+Cock et al 2009. Biopython: freely available Python tools for computational
+molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
+http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
+
+This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute UK.
+All rights reserved. See accompanying text file for licence details (MIT
+license).
+
+This is version 0.0.4 of the script.
+"""
+import sys
+
+if "-v" in sys.argv or "--version" in sys.argv:
+ print "v0.0.4"
+ sys.exit(0)
+
+def stop_err(msg, err=1):
+ sys.stderr.write(msg.rstrip() + "\n")
+ sys.exit(err)
+
+#Parse Command Line
+try:
+ tabular_file, old_col_arg, new_col_arg, in_file, seq_format, out_file = sys.argv[1:]
+except ValueError:
+ stop_err("Expected six arguments (tabular file, old col, new col, input file, format, output file), got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv)))
+
+try:
+    if old_col_arg.startswith("c"): #accept "c3" style as well as plain "3"
+        old_column = int(old_col_arg[1:])-1 #one based argument to zero based index
+    else:
+        old_column = int(old_col_arg)-1
+except ValueError:
+    stop_err("Expected column number, got %s" % old_col_arg)
+try:
+    if new_col_arg.startswith("c"): #must test the new column argument itself
+        new_column = int(new_col_arg[1:])-1 #one based argument to zero based index
+    else:
+        new_column = int(new_col_arg)-1
+except ValueError:
+    stop_err("Expected column number, got %s" % new_col_arg)
+if old_column == new_column:
+    stop_err("Old and new column arguments are the same!") #nothing to rename
+
+def parse_ids(tabular_file, old_col, new_col):
+ """Read tabular file and record all specified ID mappings."""
+ handle = open(tabular_file, "rU")
+ for line in handle:
+ if not line.startswith("#"):
+ parts = line.rstrip("\n").split("\t")
+ yield parts[old_col].strip(), parts[new_col].strip()
+ handle.close()
+
+#Load the rename mappings
+rename = dict(parse_ids(tabular_file, old_column, new_column))
+print "Loaded %i ID mappings" % len(rename)
+
+#Rewrite the sequence file
+if seq_format.lower()=="sff":
+ #Use Biopython for this format
+ renamed = 0
+ def rename_seqrecords(records, mapping):
+ global renamed #nasty, but practical!
+ for record in records:
+ try:
+ record.id = mapping[record.id]
+ renamed += 1
+ except KeyError:
+ pass
+ yield record
+
+ try:
+ from Bio.SeqIO.SffIO import SffIterator, SffWriter
+ except ImportError:
+ stop_err("Requires Biopython 1.54 or later")
+
+ try:
+ from Bio.SeqIO.SffIO import ReadRocheXmlManifest
+ except ImportError:
+ #Prior to Biopython 1.56 this was a private function
+ from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest
+
+ in_handle = open(in_file, "rb") #must be binary mode!
+ try:
+ manifest = ReadRocheXmlManifest(in_handle)
+ except ValueError:
+ manifest = None
+ out_handle = open(out_file, "wb")
+ writer = SffWriter(out_handle, xml=manifest)
+ in_handle.seek(0) #start again after getting manifest
+ count = writer.write_file(rename_seqrecords(SffIterator(in_handle), rename))
+ out_handle.close()
+ in_handle.close()
+else:
+ #Use Galaxy for FASTA, QUAL or FASTQ
+ if seq_format.lower() in ["fasta", "csfasta"] \
+ or seq_format.lower().startswith("qual"):
+ from galaxy_utils.sequence.fasta import fastaReader, fastaWriter
+ reader = fastaReader(open(in_file, "rU"))
+ writer = fastaWriter(open(out_file, "w"))
+ marker = ">"
+ elif seq_format.lower().startswith("fastq"):
+ from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
+ reader = fastqReader(open(in_file, "rU"))
+ writer = fastqWriter(open(out_file, "w"))
+ marker = "@"
+ else:
+ stop_err("Unsupported file type %r" % seq_format)
+ #Now do the renaming
+ count = 0
+ renamed = 0
+ for record in reader:
+ #The [1:] is because the fastaReader leaves the > on the identifier,
+ #likewise the fastqReader leaves the @ on the identifier
+ try:
+ idn, descr = record.identifier[1:].split(None, 1)
+ except ValueError:
+ idn = record.identifier[1:]
+ descr = None
+ if idn in rename:
+ if descr:
+ record.identifier = "%s%s %s" % (marker, rename[idn], descr)
+ else:
+ record.identifier = "%s%s" % (marker, rename[idn])
+ renamed += 1
+ writer.write(record)
+ count += 1
+ writer.close()
+ reader.close()
+
+print "Renamed %i out of %i records" % (renamed, count)
diff -r 9c8c5079c8af -r 7c0642fc57ad tools/seq_rename/seq_rename.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/seq_rename/seq_rename.xml Fri Oct 11 04:39:16 2013 -0400
@@ -0,0 +1,84 @@
+
+ with ID mapping from a tabular file
+
+ biopython
+ Bio
+
+ seq_rename.py --version
+
+seq_rename.py $input_tabular $old_column $new_column $input_file $input_file.ext $output_file
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+Takes a FASTA, QUAL, FASTQ or Standard Flowgram Format (SFF) file and produces a
+new sequence file (of the same format) where the sequence identifiers have been
+renamed according to the specified columns in your tabular file.
+
+WARNING: If you have any duplicates in the input sequence file, you will still
+have duplicate sequences in the output.
+
+WARNING: If the tabular file has more than one new name for any old ID, the
+last one is used.
+
+**References**
+
+If you use this Galaxy tool in work leading to a scientific publication please
+cite the following papers:
+
+Peter J.A. Cock, Björn A. Grüning, Konrad Paszkiewicz and Leighton Pritchard (2013).
+Galaxy tools and workflows for sequence analysis with applications
+in molecular plant pathology. PeerJ 1:e167
+http://dx.doi.org/10.7717/peerj.167
+
+This tool uses Biopython to read and write SFF files, so you may also wish to
+cite the Biopython application note (and Galaxy too of course):
+
+Cock et al (2009). Biopython: freely available Python tools for computational
+molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
+http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
+
+This tool is available to install into other Galaxy Instances via the Galaxy
+Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/seq_rename
+
+