seq_filter_by_id: tools/seq_filter_by_id/seq_filter_by_id.py

comparison tools/seq_filter_by_id/seq_filter_by_id.py @ 6:03e134cae41a draft

v0.2.3, ignore blank lines in ID file (contributed by Gildas Le Corguille)

author	peterjc
date	Tue, 17 May 2016 05:59:24 -0400
parents	832c1fd57852
children	fb1313d79396

comparison

equal deleted inserted replaced

-:832c1fd57852
+:03e134cae41a
 import os
 import sys
 import re
 from optparse import OptionParser
-def sys_exit(msg, err=1):
+# Parse Command Line
-sys.stderr.write(msg.rstrip() + "\n")
-sys.exit(err)
-#Parse Command Line
 usage = """Use as follows:
 $ python seq_filter_by_id.py [options] tab1 cols1 [, tab2 cols2, ...]
 e.g. Positive matches using column one from tabular file:
 help="Show version and quit")
 options, args = parser.parse_args()
 if options.version:
-print "v0.2.1"
+print "v0.2.3"
 sys.exit(0)
 in_file = options.input
 seq_format = options.format
 out_positive_file = options.output_positive
 out_negative_file = options.output_negative
 logic = options.logic
 drop_suffices = bool(options.suffix)
 if in_file is None or not os.path.isfile(in_file):
-sys_exit("Missing input file: %r" % in_file)
+sys.exit("Missing input file: %r" % in_file)
 if out_positive_file is None and out_negative_file is None:
-sys_exit("Neither output file requested")
+sys.exit("Neither output file requested")
 if seq_format is None:
-sys_exit("Missing sequence format")
+sys.exit("Missing sequence format")
 if logic not in ["UNION", "INTERSECTION"]:
-sys_exit("Logic agrument should be 'UNION' or 'INTERSECTION', not %r" % logic)
+sys.exit("Logic agrument should be 'UNION' or 'INTERSECTION', not %r" % logic)
 if options.id_list and args:
-sys_exit("Cannot accepted IDs via both -t and as tabular files")
+sys.exit("Cannot accepted IDs via both -t and as tabular files")
 elif not options.id_list and not args:
-sys_exit("Expected matched pairs of tabular files and columns (or -t given)")
+sys.exit("Expected matched pairs of tabular files and columns (or -t given)")
 if len(args) % 2:
-sys_exit("Expected matched pairs of tabular files and columns, not: %r" % args)
+sys.exit("Expected matched pairs of tabular files and columns, not: %r" % args)
-#Cope with three widely used suffix naming convensions,
+# Cope with three widely used suffix naming convensions,
-#Illumina: /1 or /2
+# Illumina: /1 or /2
-#Forward/revered: .f or .r
+# Forward/revered: .f or .r
-#Sanger, e.g. .p1k and .q1k
+# Sanger, e.g. .p1k and .q1k
-#See http://staden.sourceforge.net/manual/pregap4_unix_50.html
+# See http://staden.sourceforge.net/manual/pregap4_unix_50.html
-#re_f = re.compile(r"(/1|\.f|\.[sfp]\d\w*)$")
+# re_f = re.compile(r"(/1|\.f|\.[sfp]\d\w*)$")
-#re_r = re.compile(r"(/2|\.r|\.[rq]\d\w*)$")
+# re_r = re.compile(r"(/2|\.r|\.[rq]\d\w*)$")
 re_suffix = re.compile(r"(/1|\.f|\.[sfp]\d\w*|/2|\.r|\.[rq]\d\w*)$")
 assert re_suffix.search("demo.f")
 assert re_suffix.search("demo.s1")
 assert re_suffix.search("demo.f1k")
 assert re_suffix.search("demo.p1")
 assert re_suffix.search("demo.q1")
 assert re_suffix.search("demo.q1lk")
 identifiers = []
 for i in range(len(args) // 2):
-tabular_file = args[2*i]
+tabular_file = args[2 * i]
-cols_arg = args[2*i + 1]
+cols_arg = args[2 * i + 1]
 if not os.path.isfile(tabular_file):
-sys_exit("Missing tabular identifier file %r" % tabular_file)
+sys.exit("Missing tabular identifier file %r" % tabular_file)
 try:
-columns = [int(arg)-1 for arg in cols_arg.split(",")]
+columns = [int(arg) - 1 for arg in cols_arg.split(",")]
 except ValueError:
-sys_exit("Expected list of columns (comma separated integers), got %r" % cols_arg)
+sys.exit("Expected list of columns (comma separated integers), got %r" % cols_arg)
 if min(columns) < 0:
-sys_exit("Expect one-based column numbers (not zero-based counting), got %r" % cols_arg)
+sys.exit("Expect one-based column numbers (not zero-based counting), got %r" % cols_arg)
 identifiers.append((tabular_file, columns))
 name_warn = False
 def check_white_space(name):
 parts = name.split(None, 1)
 global name_warn
 if not name_warn and len(parts) > 1:
 name_warn = "WARNING: Some of your identifiers had white space in them, " + \
 else:
 # Just check the white space
 clean_name = check_white_space
-mapped_chars = { '>' :'__gt__',
+mapped_chars = {
-'<' :'__lt__',
+'>': '__gt__',
-"'" :'__sq__',
+'<': '__lt__',
-'"' :'__dq__',
+"'": '__sq__',
-'[' :'__ob__',
+'"': '__dq__',
-']' :'__cb__',
+'[': '__ob__',
-'{' :'__oc__',
+']': '__cb__',
-'}' :'__cc__',
+'{': '__oc__',
-'@' : '__at__',
+'}': '__cc__',
-'\n' : '__cn__',
+'@': '__at__',
-'\r' : '__cr__',
+'\n': '__cn__',
-'\t' : '__tc__',
+'\r': '__cr__',
-'#' : '__pd__'
+'\t': '__tc__',
-}
+'#': '__pd__',
+}
-#Read tabular file(s) and record all specified identifiers
-ids = None #Will be a set
+# Read tabular file(s) and record all specified identifiers
+ids = None  # Will be a set
 if options.id_list:
 assert not identifiers
 ids = set()
 id_list = options.id_list
-#Galaxy turns \r into __cr__ (CR) etc
+# Galaxy turns \r into __cr__ (CR) etc
 for k in mapped_chars:
 id_list = id_list.replace(mapped_chars[k], k)
 for x in options.id_list.split():
 ids.add(clean_name(x.strip()))
 print("Have %i unique identifiers from list" % len(ids))
 for tabular_file, columns in identifiers:
 file_ids = set()
 handle = open(tabular_file, "rU")
-if len(columns)>1:
+if len(columns) > 1:
-#General case of many columns
+# General case of many columns
 for line in handle:
 if line.startswith("#"):
-#Ignore comments
+# Ignore comments
 continue
 parts = line.rstrip("\n").split("\t")
 for col in columns:
 file_ids.add(clean_name(parts[col]))
 else:
-#Single column, special case speed up
+# Single column, special case speed up
 col = columns[0]
 for line in handle:
+if not line.strip(): #skip empty lines
+continue
 if not line.startswith("#"):
 file_ids.add(clean_name(line.rstrip("\n").split("\t")[col]))
-print "Using %i IDs from column %s in tabular file" % (len(file_ids), ", ".join(str(col+1) for col in columns))
+print "Using %i IDs from column %s in tabular file" % (len(file_ids), ", ".join(str(col + 1) for col in columns))
 if ids is None:
 ids = file_ids
 if logic == "UNION":
 ids.update(file_ids)
 else:
 else:
 print "Have %i IDs in common from %i tabular files" % (len(ids), len(identifiers))
 if name_warn:
 sys.stderr.write(name_warn)
 def crude_fasta_iterator(handle):
 """Yields tuples, record ID and the full record as a string."""
 while True:
 line = handle.readline()
 if line == "":
-return # Premature end of file, or just empty?
+return  # Premature end of file, or just empty?
 if line[0] == ">":
 break
 no_id_warned = False
 while True:
 break
 lines.append(line)
 line = handle.readline()
 yield id, "".join(lines)
 if not line:
-return # StopIteration
+return  # StopIteration
 def fasta_filter(in_file, pos_file, neg_file, wanted):
 """FASTA filter producing 60 character line wrapped outout."""
 pos_count = neg_count = 0
-#Galaxy now requires Python 2.5+ so can use with statements,
+# Galaxy now requires Python 2.5+ so can use with statements,
 with open(in_file) as in_handle:
-#Doing the if statement outside the loop for speed
+# Doing the if statement outside the loop for speed
-#(with the downside of three very similar loops).
+# (with the downside of three very similar loops).
 if pos_file is not None and neg_file is not None:
 print "Generating two FASTA files"
 with open(pos_file, "w") as pos_handle:
 with open(neg_file, "w") as neg_handle:
 for identifier, record in crude_fasta_iterator(in_handle):
 def fastq_filter(in_file, pos_file, neg_file, wanted):
 """FASTQ filter."""
 from Bio.SeqIO.QualityIO import FastqGeneralIterator
 handle = open(in_file, "r")
-if out_positive_file is not None and out_negative_file is not None:
+if pos_file is not None and neg_file is not None:
 print "Generating two FASTQ files"
-positive_handle = open(out_positive_file, "w")
+positive_handle = open(pos_file, "w")
-negative_handle = open(out_negative_file, "w")
+negative_handle = open(neg_file, "w")
 print in_file
 for title, seq, qual in FastqGeneralIterator(handle):
 print("%s --> %s" % (title, clean_name(title.split(None, 1)[0])))
-if clean_name(title.split(None, 1)[0]) in ids:
+if clean_name(title.split(None, 1)[0]) in wanted:
 positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
 else:
 negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
 positive_handle.close()
 negative_handle.close()
-elif out_positive_file is not None:
+elif pos_file is not None:
 print "Generating matching FASTQ file"
-positive_handle = open(out_positive_file, "w")
+positive_handle = open(pos_file, "w")
 for title, seq, qual in FastqGeneralIterator(handle):
-if clean_name(title.split(None, 1)[0]) in ids:
+if clean_name(title.split(None, 1)[0]) in wanted:
 positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
 positive_handle.close()
-elif out_negative_file is not None:
+elif neg_file is not None:
 print "Generating non-matching FASTQ file"
-negative_handle = open(out_negative_file, "w")
+negative_handle = open(neg_file, "w")
 for title, seq, qual in FastqGeneralIterator(handle):
-if clean_name(title.split(None, 1)[0]) not in ids:
+if clean_name(title.split(None, 1)[0]) not in wanted:
 negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
 negative_handle.close()
 handle.close()
 # This does not currently bother to record record counts (faster)
 def sff_filter(in_file, pos_file, neg_file, wanted):
 """SFF filter."""
 try:
 from Bio.SeqIO.SffIO import SffIterator, SffWriter
 except ImportError:
-sys_exit("SFF filtering requires Biopython 1.54 or later")
+sys.exit("SFF filtering requires Biopython 1.54 or later")
 try:
 from Bio.SeqIO.SffIO import ReadRocheXmlManifest
 except ImportError:
-#Prior to Biopython 1.56 this was a private function
+# Prior to Biopython 1.56 this was a private function
 from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest
-in_handle = open(in_file, "rb") #must be binary mode!
+in_handle = open(in_file, "rb")  # must be binary mode!
 try:
 manifest = ReadRocheXmlManifest(in_handle)
 except ValueError:
 manifest = None
-#This makes two passes though the SFF file with isn't so efficient,
+# This makes two passes though the SFF file with isn't so efficient,
-#but this makes the code simple.
+# but this makes the code simple.
 pos_count = neg_count = 0
-if out_positive_file is not None:
+if pos_file is not None:
-out_handle = open(out_positive_file, "wb")
+out_handle = open(pos_file, "wb")
 writer = SffWriter(out_handle, xml=manifest)
-in_handle.seek(0) #start again after getting manifest
+in_handle.seek(0)  # start again after getting manifest
-pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) in ids)
+pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) in wanted)
 out_handle.close()
-if out_negative_file is not None:
+if neg_file is not None:
-out_handle = open(out_negative_file, "wb")
+out_handle = open(neg_file, "wb")
 writer = SffWriter(out_handle, xml=manifest)
-in_handle.seek(0) #start again
+in_handle.seek(0)  # start again
-neg_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) not in ids)
+neg_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) not in wanted)
 out_handle.close()
-#And we're done
+# And we're done
 in_handle.close()
-#At the time of writing, Galaxy doesn't show SFF file read counts,
+# At the time of writing, Galaxy doesn't show SFF file read counts,
-#so it is useful to put them in stdout and thus shown in job info.
+# so it is useful to put them in stdout and thus shown in job info.
 return pos_count, neg_count
-if seq_format.lower()=="sff":
+if seq_format.lower() == "sff":
 # Now write filtered SFF file based on IDs wanted
 pos_count, neg_count = sff_filter(in_file, out_positive_file, out_negative_file, ids)
 # At the time of writing, Galaxy doesn't show SFF file read counts,
 # so it is useful to put them in stdout and thus shown in job info.
-elif seq_format.lower()=="fasta":
+elif seq_format.lower() == "fasta":
 # Write filtered FASTA file based on IDs from tabular file
 pos_count, neg_count = fasta_filter(in_file, out_positive_file, out_negative_file, ids)
 print "%i with and %i without specified IDs" % (pos_count, neg_count)
 elif seq_format.lower().startswith("fastq"):
 # Write filtered FASTQ file based on IDs from tabular file
 fastq_filter(in_file, out_positive_file, out_negative_file, ids)
 # This does not currently track the counts
 else:
-sys_exit("Unsupported file type %r" % seq_format)
+sys.exit("Unsupported file type %r" % seq_format)

Mercurial > repos > peterjc > seq_filter_by_id

comparison tools/seq_filter_by_id/seq_filter_by_id.py @ 6:03e134cae41a draft