comparison tools/seq_rename/seq_rename.py @ 2:7c0642fc57ad draft

Uploaded v0.0.4, automatic dependency on Biopython 1.62, new README file, citation information, MIT licence. Includes additional tested added in v0.0.3
author peterjc
date Fri, 11 Oct 2013 04:39:16 -0400
parents
children e1398f2ba9fe
comparison
equal deleted inserted replaced
1:9c8c5079c8af 2:7c0642fc57ad
1 #!/usr/bin/env python
2 """Rename FASTA, QUAL, FASTQ or SSF sequences with ID mapping from tabular file.
3
4 Takes six command line options, tabular filename, current (old) ID column
5 number (using one based counting), new ID column number (also using one based
6 counting), input sequence filename, input type (e.g. FASTA or SFF) and the
7 output filename (same format as input sequence file).
8
9 When selecting from an SFF file, any Roche XML manifest in the input file is
10 preserved in both output files.
11
12 This tool is a short Python script which requires Biopython 1.54 or later
13 for SFF file support. If you use this tool in scientific work leading to a
14 publication, please cite the Biopython application note:
15
16 Cock et al 2009. Biopython: freely available Python tools for computational
17 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
18 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
19
20 This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute UK.
21 All rights reserved. See accompanying text file for licence details (MIT
22 license).
23
24 This is version 0.0.4 of the script.
25 """
26 import sys
27
28 if "-v" in sys.argv or "--version" in sys.argv:
29 print "v0.0.4"
30 sys.exit(0)
31
32 def stop_err(msg, err=1):
33 sys.stderr.write(msg.rstrip() + "\n")
34 sys.exit(err)
35
36 #Parse Command Line
37 try:
38 tabular_file, old_col_arg, new_col_arg, in_file, seq_format, out_file = sys.argv[1:]
39 except ValueError:
40 stop_err("Expected six arguments (tabular file, old col, new col, input file, format, output file), got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv)))
41
42 try:
43 if old_col_arg.startswith("c"):
44 old_column = int(old_col_arg[1:])-1
45 else:
46 old_column = int(old_col_arg)-1
47 except ValueError:
48 stop_err("Expected column number, got %s" % old_col_arg)
49 try:
50 if old_col_arg.startswith("c"):
51 new_column = int(new_col_arg[1:])-1
52 else:
53 new_column = int(new_col_arg)-1
54 except ValueError:
55 stop_err("Expected column number, got %s" % new_col_arg)
56 if old_column == new_column:
57 stop_err("Old and new column arguments are the same!")
58
59 def parse_ids(tabular_file, old_col, new_col):
60 """Read tabular file and record all specified ID mappings."""
61 handle = open(tabular_file, "rU")
62 for line in handle:
63 if not line.startswith("#"):
64 parts = line.rstrip("\n").split("\t")
65 yield parts[old_col].strip(), parts[new_col].strip()
66 handle.close()
67
68 #Load the rename mappings
69 rename = dict(parse_ids(tabular_file, old_column, new_column))
70 print "Loaded %i ID mappings" % len(rename)
71
72 #Rewrite the sequence file
73 if seq_format.lower()=="sff":
74 #Use Biopython for this format
75 renamed = 0
76 def rename_seqrecords(records, mapping):
77 global renamed #nasty, but practical!
78 for record in records:
79 try:
80 record.id = mapping[record.id]
81 renamed += 1
82 except KeyError:
83 pass
84 yield record
85
86 try:
87 from Bio.SeqIO.SffIO import SffIterator, SffWriter
88 except ImportError:
89 stop_err("Requires Biopython 1.54 or later")
90
91 try:
92 from Bio.SeqIO.SffIO import ReadRocheXmlManifest
93 except ImportError:
94 #Prior to Biopython 1.56 this was a private function
95 from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest
96
97 in_handle = open(in_file, "rb") #must be binary mode!
98 try:
99 manifest = ReadRocheXmlManifest(in_handle)
100 except ValueError:
101 manifest = None
102 out_handle = open(out_file, "wb")
103 writer = SffWriter(out_handle, xml=manifest)
104 in_handle.seek(0) #start again after getting manifest
105 count = writer.write_file(rename_seqrecords(SffIterator(in_handle), rename))
106 out_handle.close()
107 in_handle.close()
108 else:
109 #Use Galaxy for FASTA, QUAL or FASTQ
110 if seq_format.lower() in ["fasta", "csfasta"] \
111 or seq_format.lower().startswith("qual"):
112 from galaxy_utils.sequence.fasta import fastaReader, fastaWriter
113 reader = fastaReader(open(in_file, "rU"))
114 writer = fastaWriter(open(out_file, "w"))
115 marker = ">"
116 elif seq_format.lower().startswith("fastq"):
117 from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
118 reader = fastqReader(open(in_file, "rU"))
119 writer = fastqWriter(open(out_file, "w"))
120 marker = "@"
121 else:
122 stop_err("Unsupported file type %r" % seq_format)
123 #Now do the renaming
124 count = 0
125 renamed = 0
126 for record in reader:
127 #The [1:] is because the fastaReader leaves the > on the identifier,
128 #likewise the fastqReader leaves the @ on the identifier
129 try:
130 idn, descr = record.identifier[1:].split(None, 1)
131 except ValueError:
132 idn = record.identifier[1:]
133 descr = None
134 if idn in rename:
135 if descr:
136 record.identifier = "%s%s %s" % (marker, rename[idn], descr)
137 else:
138 record.identifier = "%s%s" % (marker, rename[idn])
139 renamed += 1
140 writer.write(record)
141 count += 1
142 writer.close()
143 reader.close()
144
145 print "Renamed %i out of %i records" % (renamed, count)