seq_filter_by_id: tools/filters/seq_filter_by

comparison tools/filters/seq_filter_by_id.py @ 1:262f08104540 draft

Uploaded v0.0.4 which includes a unit test and is faster at filtering FASTA files with large records (e.g. whole chromosomes)

author	peterjc
date	Mon, 15 Apr 2013 12:27:30 -0400
parents	5844f6a450ed
children	abdd608c869b

comparison

equal deleted inserted replaced

-:5844f6a450ed
+:262f08104540
 Cock et al 2009. Biopython: freely available Python tools for computational
 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
-This script is copyright 2010 by Peter Cock, SCRI, UK. All rights reserved.
+This script is copyright 2010-2013 by Peter Cock, The James Hutton Institute
+(formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved.
 See accompanying text file for licence details (MIT/BSD style).
-This is version 0.0.1 of the script.
+This is version 0.0.4 of the script, use -v or --version to get the version.
 """
 import sys
 def stop_err(msg, err=1):
 sys.stderr.write(msg.rstrip() + "\n")
 sys.exit(err)
+if "-v" in sys.argv or "--version" in sys.argv:
+print "v0.0.4"
+sys.exit(0)
 #Parse Command Line
 try:
 tabular_file, cols_arg, in_file, seq_format, out_positive_file, out_negative_file = sys.argv[1:]
 except ValueError:
-stop_err("Expected six arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv)))
+stop_err("Expected six arguments (tab file, columns, in seq, seq format, out pos, out neg), got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv)))
 try:
 columns = [int(arg)-1 for arg in cols_arg.split(",")]
 except ValueError:
 stop_err("Expected list of columns (comma separated integers), got %s" % cols_arg)
+if min(columns) < 0:
+stop_err("Expect one-based column numbers (not zero-based counting), got %s" % cols_arg)
 if out_positive_file == "-" and out_negative_file == "-":
 stop_err("Neither output file requested")
 ids.add(line.rstrip("\n").split("\t")[col])
 print "Using %i IDs from tabular file" % (len(ids))
 handle.close()
+def crude_fasta_iterator(handle):
+"""Yields tuples, record ID and the full record as a string."""
+while True:
+line = handle.readline()
+if line == "":
+return # Premature end of file, or just empty?
+if line[0] == ">":
+break
+while True:
+if line[0] != ">":
+raise ValueError(
+"Records in Fasta files should start with '>' character")
+id = line[1:].split(None, 1)[0]
+lines = [line]
+line = handle.readline()
+while True:
+if not line:
+break
+if line[0] == ">":
+break
+lines.append(line)
+line = handle.readline()
+yield id, "".join(lines)
+if not line:
+return # StopIteration
+def fasta_filter(in_file, pos_file, neg_file, wanted):
+"""FASTA filter producing 60 character line wrapped outout."""
+pos_count = neg_count = 0
+#Galaxy now requires Python 2.5+ so can use with statements,
+with open(in_file) as in_handle:
+#Doing the if statement outside the loop for speed
+#(with the downside of three very similar loops).
+if pos_file != "-" and neg_file != "-":
+print "Generating two FASTA files"
+with open(pos_file, "w") as pos_handle:
+with open(neg_file, "w") as neg_handle:
+for identifier, record in crude_fasta_iterator(in_handle):
+if identifier in wanted:
+pos_handle.write(record)
+pos_count += 1
+else:
+neg_handle.write(record)
+neg_count += 1
+elif pos_file != "-":
+print "Generating matching FASTA file"
+with open(pos_file, "w") as pos_handle:
+for identifier, record in crude_fasta_iterator(in_handle):
+if identifier in wanted:
+pos_handle.write(record)
+pos_count += 1
+else:
+neg_count += 1
+else:
+print "Generating non-matching FASTA file"
+assert neg_file != "-"
+with open(neg_file, "w") as neg_handle:
+for identifier, record in crude_fasta_iterator(in_handle):
+if identifier in wanted:
+pos_count += 1
+else:
+neg_handle.write(record)
+neg_count += 1
+return pos_count, neg_count
 if seq_format.lower()=="sff":
 #Now write filtered SFF file based on IDs from BLAST file
 try:
 from Bio.SeqIO.SffIO import SffIterator, SffWriter
 except ImportError:
-stop_err("Requires Biopython 1.54 or later")
+stop_err("SFF filtering requires Biopython 1.54 or later")
 try:
 from Bio.SeqIO.SffIO import ReadRocheXmlManifest
 except ImportError:
 #Prior to Biopython 1.56 this was a private function
 manifest = ReadRocheXmlManifest(in_handle)
 except ValueError:
 manifest = None
 #This makes two passes though the SFF file with isn't so efficient,
 #but this makes the code simple.
+pos_count = neg_count = 0
 if out_positive_file != "-":
 out_handle = open(out_positive_file, "wb")
 writer = SffWriter(out_handle, xml=manifest)
 in_handle.seek(0) #start again after getting manifest
 pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if rec.id in ids)
 out_handle.close()
 #And we're done
 in_handle.close()
 #At the time of writing, Galaxy doesn't show SFF file read counts,
 #so it is useful to put them in stdout and thus shown in job info.
-if out_positive_file != "-" and out_negative_file != "-":
+print "%i with and %i without specified IDs" % (pos_count, neg_count)
-print "%i with and %i without specified IDs" % (pos_count, neg_count)
-elif out_positive_file != "-":
-print "%i with specified IDs" % pos_count
-elif out_negative_file != "-":
-print "%i without specified IDs" % neg_count
 elif seq_format.lower()=="fasta":
 #Write filtered FASTA file based on IDs from tabular file
-from galaxy_utils.sequence.fasta import fastaReader, fastaWriter
+pos_count, neg_count = fasta_filter(in_file, out_positive_file, out_negative_file, ids)
-reader = fastaReader(open(in_file, "rU"))
+print "%i with and %i without specified IDs" % (pos_count, neg_count)
-if out_positive_file != "-" and out_negative_file != "-":
-print "Generating two FASTA files"
-positive_writer = fastaWriter(open(out_positive_file, "w"))
-negative_writer = fastaWriter(open(out_negative_file, "w"))
-for record in reader:
-#The [1:] is because the fastaReader leaves the > on the identifer.
-if record.identifier and record.identifier.split()[0][1:] in ids:
-positive_writer.write(record)
-else:
-negative_writer.write(record)
-positive_writer.close()
-negative_writer.close()
-elif out_positive_file != "-":
-print "Generating matching FASTA file"
-positive_writer = fastaWriter(open(out_positive_file, "w"))
-for record in reader:
-#The [1:] is because the fastaReader leaves the > on the identifer.
-if record.identifier and record.identifier.split()[0][1:] in ids:
-positive_writer.write(record)
-positive_writer.close()
-elif out_negative_file != "-":
-print "Generating non-matching FASTA file"
-negative_writer = fastaWriter(open(out_negative_file, "w"))
-for record in reader:
-#The [1:] is because the fastaReader leaves the > on the identifer.
-if not record.identifier or record.identifier.split()[0][1:] not in ids:
-negative_writer.write(record)
-negative_writer.close()
 elif seq_format.lower().startswith("fastq"):
 #Write filtered FASTQ file based on IDs from tabular file
 from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
 reader = fastqReader(open(in_file, "rU"))
 if out_positive_file != "-" and out_negative_file != "-":
 print "Generating two FASTQ files"
 positive_writer = fastqWriter(open(out_positive_file, "w"))
 negative_writer = fastqWriter(open(out_negative_file, "w"))
 for record in reader:
-#The [1:] is because the fastaReader leaves the @ on the identifer.
+#The [1:] is because the fastaReader leaves the > on the identifier.
 if record.identifier and record.identifier.split()[0][1:] in ids:
 positive_writer.write(record)
 else:
 negative_writer.write(record)
 positive_writer.close()
 negative_writer.close()
 elif out_positive_file != "-":
 print "Generating matching FASTQ file"
 positive_writer = fastqWriter(open(out_positive_file, "w"))
 for record in reader:
-#The [1:] is because the fastaReader leaves the @ on the identifer.
+#The [1:] is because the fastaReader leaves the > on the identifier.
 if record.identifier and record.identifier.split()[0][1:] in ids:
 positive_writer.write(record)
 positive_writer.close()
 elif out_negative_file != "-":
 print "Generating non-matching FASTQ file"
 negative_writer = fastqWriter(open(out_negative_file, "w"))
 for record in reader:
-#The [1:] is because the fastaReader leaves the @ on the identifer.
+#The [1:] is because the fastaReader leaves the > on the identifier.
 if not record.identifier or record.identifier.split()[0][1:] not in ids:
 negative_writer.write(record)
 negative_writer.close()
+reader.close()
 else:
 stop_err("Unsupported file type %r" % seq_format)

Mercurial > repos > peterjc > seq_filter_by_id

comparison tools/filters/seq_filter_by_id.py @ 1:262f08104540 draft