Mercurial > repos > peterjc > seq_filter_by_id

diff tools/seq_filter_by_id/seq_filter_by_id.py @ 5:832c1fd57852 draft
v0.2.2; New options for IDs via text parameter, ignore paired read suffix; misc changes
author: peterjc
date: Wed, 13 May 2015 11:03:57 -0400
parents: 44ab4c0f7683
children: 03e134cae41a
--- a/tools/seq_filter_by_id/seq_filter_by_id.py	Wed Jul 30 06:39:53 2014 -0400
+++ b/tools/seq_filter_by_id/seq_filter_by_id.py	Wed May 13 11:03:57 2015 -0400
@@ -3,12 +3,8 @@
 
 Takes six command line options, tabular filename, ID column numbers (comma
 separated list using one based counting), input filename, input type (e.g.
-FASTA or SFF) and two output filenames (for records with and without the
-given IDs, same format as input sequence file).
-
-If either output filename is just a minus sign, that file is not created.
-This is intended to allow output for just the matched (or just the non-matched)
-records.
+FASTA or SFF) and up to two output filenames (for records with and without
+the given IDs, same format as input sequence file).
 
 When filtering an SFF file, any Roche XML manifest in the input file is
 preserved in both output files.
@@ -17,9 +13,9 @@
 in column one, and the ID of the match from the database is in column two.
 Here sensible values for the column numbers would therefore be "1" or "2".
 
-This tool is a short Python script which requires Biopython 1.54 or later
-for SFF file support. If you use this tool in scientific work leading to a
-publication, please cite the Biopython application note:
+This tool is a short Python script which requires Biopython 1.54 or later.
+If you use this tool in scientific work leading to a publication, please
+cite the Biopython application note:
 
 Cock et al 2009. Biopython: freely available Python tools for computational
 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
@@ -29,51 +25,177 @@
 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved.
 See accompanying text file for licence details (MIT license).
 
-This is version 0.1.0 of the script, use -v or --version to get the version.
+Use -v or --version to get the version, -h or --help for help.
 """
 import os
 import sys
+import re
+from optparse import OptionParser
 
-def stop_err(msg, err=1):
+def sys_exit(msg, err=1):
     sys.stderr.write(msg.rstrip() + "\n")
     sys.exit(err)
 
-if "-v" in sys.argv or "--version" in sys.argv:
-    print "v0.1.0"
+#Parse Command Line
+usage = """Use as follows:
+
+$ python seq_filter_by_id.py [options] tab1 cols1 [, tab2 cols2, ...]
+
+e.g. Positive matches using column one from tabular file:
+
+$ seq_filter_by_id.py -i my_seqs.fastq -f fastq -p matches.fastq ids.tabular 1
+
+Multiple tabular files and column numbers may be given, or replaced with
+the -t or --text option.
+"""
+parser = OptionParser(usage=usage)
+parser.add_option('-i', '--input', dest='input',
+                  default=None, help='Input sequences filename',
+                  metavar="FILE")
+parser.add_option('-f', '--format', dest='format',
+                  default=None,
+                  help='Input sequence format (e.g. fasta, fastq, sff)')
+parser.add_option('-t', '--text', dest='id_list',
+                  default=None, help="Lists of white space separated IDs (instead of a tabular file)")
+parser.add_option('-p', '--positive', dest='output_positive',
+                  default=None,
+                  help='Output filename for matches',
+                  metavar="FILE")
+parser.add_option('-n', '--negative', dest='output_negative',
+                  default=None,
+                  help='Output filename for non-matches',
+                  metavar="FILE")
+parser.add_option("-l", "--logic", dest="logic",
+                  default="UNION",
+                  help="How to combined multiple ID columns (UNION or INTERSECTION)")
+parser.add_option("-s", "--suffix", dest="suffix",
+                  action="store_true",
+                  help="Ignore pair-read suffices for matching names")
+parser.add_option("-v", "--version", dest="version",
+                  default=False, action="store_true",
+                  help="Show version and quit")
+
+options, args = parser.parse_args()
+
+if options.version:
+    print "v0.2.1"
     sys.exit(0)
 
-#Parse Command Line
-if len(sys.argv) - 1 < 7 or len(sys.argv) % 2 == 1:
-    stop_err("Expected 7 or more arguments, 5 required "
-             "(in seq, seq format, out pos, out neg, logic) "
-             "then one or more pairs (tab file, columns), "
-             "got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv)))
+in_file = options.input
+seq_format = options.format
+out_positive_file = options.output_positive
+out_negative_file = options.output_negative
+logic = options.logic
+drop_suffices = bool(options.suffix)
 
-in_file, seq_format, out_positive_file, out_negative_file, logic = sys.argv[1:6]
+if in_file is None or not os.path.isfile(in_file):
+    sys_exit("Missing input file: %r" % in_file)
+if out_positive_file is None and out_negative_file is None:
+    sys_exit("Neither output file requested")
+if seq_format is None:
+    sys_exit("Missing sequence format")
+if logic not in ["UNION", "INTERSECTION"]:
+    sys_exit("Logic agrument should be 'UNION' or 'INTERSECTION', not %r" % logic)
+if options.id_list and args:
+    sys_exit("Cannot accepted IDs via both -t and as tabular files")
+elif not options.id_list and not args:
+    sys_exit("Expected matched pairs of tabular files and columns (or -t given)")
+if len(args) % 2:
+    sys_exit("Expected matched pairs of tabular files and columns, not: %r" % args)
+
 
-if not os.path.isfile(in_file):
-    stop_err("Missing input file %r" % in_file)
-if out_positive_file == "-" and out_negative_file == "-":
-    stop_err("Neither output file requested")
-if logic not in ["UNION", "INTERSECTION"]:
-    stop_err("Fifth agrument should be 'UNION' or 'INTERSECTION', not %r" % logic)
+#Cope with three widely used suffix naming convensions,
+#Illumina: /1 or /2
+#Forward/revered: .f or .r
+#Sanger, e.g. .p1k and .q1k
+#See http://staden.sourceforge.net/manual/pregap4_unix_50.html
+#re_f = re.compile(r"(/1|\.f|\.[sfp]\d\w*)$")
+#re_r = re.compile(r"(/2|\.r|\.[rq]\d\w*)$")
+re_suffix = re.compile(r"(/1|\.f|\.[sfp]\d\w*|/2|\.r|\.[rq]\d\w*)$")
+assert re_suffix.search("demo.f")
+assert re_suffix.search("demo.s1")
+assert re_suffix.search("demo.f1k")
+assert re_suffix.search("demo.p1")
+assert re_suffix.search("demo.p1k")
+assert re_suffix.search("demo.p1lk")
+assert re_suffix.search("demo/2")
+assert re_suffix.search("demo.r")
+assert re_suffix.search("demo.q1")
+assert re_suffix.search("demo.q1lk")
 
 identifiers = []
-for i in range((len(sys.argv) - 6) // 2):
-    tabular_file = sys.argv[6+2*i]
-    cols_arg = sys.argv[7+2*i]
+for i in range(len(args) // 2):
+    tabular_file = args[2*i]
+    cols_arg = args[2*i + 1]
     if not os.path.isfile(tabular_file):
-        stop_err("Missing tabular identifier file %r" % tabular_file)
+        sys_exit("Missing tabular identifier file %r" % tabular_file)
     try:
         columns = [int(arg)-1 for arg in cols_arg.split(",")]
     except ValueError:
-        stop_err("Expected list of columns (comma separated integers), got %r" % cols_arg)
+        sys_exit("Expected list of columns (comma separated integers), got %r" % cols_arg)
     if min(columns) < 0:
-        stop_err("Expect one-based column numbers (not zero-based counting), got %r" % cols_arg)
+        sys_exit("Expect one-based column numbers (not zero-based counting), got %r" % cols_arg)
     identifiers.append((tabular_file, columns))
 
+name_warn = False
+def check_white_space(name):
+    parts = name.split(None, 1)
+    global name_warn
+    if not name_warn and len(parts) > 1:
+        name_warn = "WARNING: Some of your identifiers had white space in them, " + \
+                    "using first word only. e.g.:\n%s\n" % name
+    return parts[0]
+
+if drop_suffices:
+    def clean_name(name):
+        """Remove suffix."""
+        name = check_white_space(name)
+        match = re_suffix.search(name)
+        if match:
+            # Use the fact this is a suffix, and regular expression will be
+            # anchored to the end of the name:
+            return name[:match.start()]
+        else:
+            # Nothing to do
+            return name
+    assert clean_name("foo/1") == "foo"
+    assert clean_name("foo/2") == "foo"
+    assert clean_name("bar.f") == "bar"
+    assert clean_name("bar.r") == "bar"
+    assert clean_name("baz.p1") == "baz"
+    assert clean_name("baz.q2") == "baz"
+else:
+    # Just check the white space
+    clean_name = check_white_space
+
+
+mapped_chars = { '>' :'__gt__',
+                 '<' :'__lt__',
+                 "'" :'__sq__',
+                 '"' :'__dq__',
+                 '[' :'__ob__',
+                 ']' :'__cb__',
+                 '{' :'__oc__',
+                 '}' :'__cc__',
+                 '@' : '__at__',
+                 '\n' : '__cn__',
+                 '\r' : '__cr__',
+                 '\t' : '__tc__',
+                 '#' : '__pd__'
+                 }
+
 #Read tabular file(s) and record all specified identifiers
 ids = None #Will be a set
+if options.id_list:
+    assert not identifiers
+    ids = set()
+    id_list = options.id_list
+    #Galaxy turns \r into __cr__ (CR) etc
+    for k in mapped_chars:
+        id_list = id_list.replace(mapped_chars[k], k)
+    for x in options.id_list.split():
+        ids.add(clean_name(x.strip()))
+    print("Have %i unique identifiers from list" % len(ids))
 for tabular_file, columns in identifiers:
     file_ids = set()
     handle = open(tabular_file, "rU")
@@ -85,13 +207,13 @@
                 continue
             parts = line.rstrip("\n").split("\t")
             for col in columns:
-                file_ids.add(parts[col])
+                file_ids.add(clean_name(parts[col]))
     else:
         #Single column, special case speed up
         col = columns[0]
         for line in handle:
             if not line.startswith("#"):
-                file_ids.add(line.rstrip("\n").split("\t")[col])
+                file_ids.add(clean_name(line.rstrip("\n").split("\t")[col]))
     print "Using %i IDs from column %s in tabular file" % (len(file_ids), ", ".join(str(col+1) for col in columns))
     if ids is None:
         ids = file_ids
@@ -105,7 +227,8 @@
         print "Have %i IDs combined from %i tabular files" % (len(ids), len(identifiers))
     else:
         print "Have %i IDs in common from %i tabular files" % (len(ids), len(identifiers))
-
+if name_warn:
+    sys.stderr.write(name_warn)
 
 def crude_fasta_iterator(handle):
     """Yields tuples, record ID and the full record as a string."""
@@ -149,32 +272,32 @@
     with open(in_file) as in_handle:
         #Doing the if statement outside the loop for speed
         #(with the downside of three very similar loops).
-        if pos_file != "-" and neg_file != "-":
+        if pos_file is not None and neg_file is not None:
             print "Generating two FASTA files"
             with open(pos_file, "w") as pos_handle:
                 with open(neg_file, "w") as neg_handle:
                     for identifier, record in crude_fasta_iterator(in_handle):
-                        if identifier in wanted:
+                        if clean_name(identifier) in wanted:
                             pos_handle.write(record)
                             pos_count += 1
                         else:
                             neg_handle.write(record)
                             neg_count += 1
-        elif pos_file != "-":
+        elif pos_file is not None:
             print "Generating matching FASTA file"
             with open(pos_file, "w") as pos_handle:
                 for identifier, record in crude_fasta_iterator(in_handle):
-                    if identifier in wanted:
+                    if clean_name(identifier) in wanted:
                         pos_handle.write(record)
                         pos_count += 1
                     else:
                         neg_count += 1
         else:
             print "Generating non-matching FASTA file"
-            assert neg_file != "-"
+            assert neg_file is not None
             with open(neg_file, "w") as neg_handle:
                 for identifier, record in crude_fasta_iterator(in_handle):
-                    if identifier in wanted:
+                    if clean_name(identifier) in wanted:
                         pos_count += 1
                     else:
                         neg_handle.write(record)
@@ -182,79 +305,94 @@
     return pos_count, neg_count
 
 
-if seq_format.lower()=="sff":
-    #Now write filtered SFF file based on IDs from BLAST file
+def fastq_filter(in_file, pos_file, neg_file, wanted):
+    """FASTQ filter."""
+    from Bio.SeqIO.QualityIO import FastqGeneralIterator
+    handle = open(in_file, "r")
+    if out_positive_file is not None and out_negative_file is not None:
+        print "Generating two FASTQ files"
+        positive_handle = open(out_positive_file, "w")
+        negative_handle = open(out_negative_file, "w")
+        print in_file
+        for title, seq, qual in FastqGeneralIterator(handle):
+            print("%s --> %s" % (title, clean_name(title.split(None, 1)[0])))
+            if clean_name(title.split(None, 1)[0]) in ids:
+                positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
+            else:
+                negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
+        positive_handle.close()
+        negative_handle.close()
+    elif out_positive_file is not None:
+        print "Generating matching FASTQ file"
+        positive_handle = open(out_positive_file, "w")
+        for title, seq, qual in FastqGeneralIterator(handle):
+            if clean_name(title.split(None, 1)[0]) in ids:
+                positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
+        positive_handle.close()
+    elif out_negative_file is not None:
+        print "Generating non-matching FASTQ file"
+        negative_handle = open(out_negative_file, "w")
+        for title, seq, qual in FastqGeneralIterator(handle):
+            if clean_name(title.split(None, 1)[0]) not in ids:
+                negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
+        negative_handle.close()
+    handle.close()
+    # This does not currently bother to record record counts (faster)
+
+
+def sff_filter(in_file, pos_file, neg_file, wanted):
+    """SFF filter."""
     try:
         from Bio.SeqIO.SffIO import SffIterator, SffWriter
     except ImportError:
-        stop_err("SFF filtering requires Biopython 1.54 or later")
+        sys_exit("SFF filtering requires Biopython 1.54 or later")
 
     try:
         from Bio.SeqIO.SffIO import ReadRocheXmlManifest
     except ImportError:
         #Prior to Biopython 1.56 this was a private function
         from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest
+
     in_handle = open(in_file, "rb") #must be binary mode!
     try:
         manifest = ReadRocheXmlManifest(in_handle)
     except ValueError:
         manifest = None
+
     #This makes two passes though the SFF file with isn't so efficient,
     #but this makes the code simple.
     pos_count = neg_count = 0
-    if out_positive_file != "-":
+    if out_positive_file is not None:
         out_handle = open(out_positive_file, "wb")
         writer = SffWriter(out_handle, xml=manifest)
         in_handle.seek(0) #start again after getting manifest
-        pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if rec.id in ids)
+        pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) in ids)
         out_handle.close()
-    if out_negative_file != "-":
+    if out_negative_file is not None:
         out_handle = open(out_negative_file, "wb")
         writer = SffWriter(out_handle, xml=manifest)
         in_handle.seek(0) #start again
-        neg_count = writer.write_file(rec for rec in SffIterator(in_handle) if rec.id not in ids)
+        neg_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) not in ids)
         out_handle.close()
     #And we're done
     in_handle.close()
     #At the time of writing, Galaxy doesn't show SFF file read counts,
     #so it is useful to put them in stdout and thus shown in job info.
-    print "%i with and %i without specified IDs" % (pos_count, neg_count)
+    return pos_count, neg_count
+
+
+if seq_format.lower()=="sff":
+    # Now write filtered SFF file based on IDs wanted
+    pos_count, neg_count = sff_filter(in_file, out_positive_file, out_negative_file, ids)
+    # At the time of writing, Galaxy doesn't show SFF file read counts,
+    # so it is useful to put them in stdout and thus shown in job info.
 elif seq_format.lower()=="fasta":
-    #Write filtered FASTA file based on IDs from tabular file
+    # Write filtered FASTA file based on IDs from tabular file
     pos_count, neg_count = fasta_filter(in_file, out_positive_file, out_negative_file, ids)
     print "%i with and %i without specified IDs" % (pos_count, neg_count)
 elif seq_format.lower().startswith("fastq"):
-    #Write filtered FASTQ file based on IDs from tabular file
-    from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
-    reader = fastqReader(open(in_file, "rU"))
-    if out_positive_file != "-" and out_negative_file != "-":
-        print "Generating two FASTQ files"
-        positive_writer = fastqWriter(open(out_positive_file, "w"))
-        negative_writer = fastqWriter(open(out_negative_file, "w"))
-        for record in reader:
-            #The [1:] is because the fastaReader leaves the > on the identifier.
-            if record.identifier and record.identifier.split()[0][1:] in ids:
-                positive_writer.write(record)
-            else:
-                negative_writer.write(record)
-        positive_writer.close()
-        negative_writer.close()
-    elif out_positive_file != "-":
-        print "Generating matching FASTQ file"
-        positive_writer = fastqWriter(open(out_positive_file, "w"))
-        for record in reader:
-            #The [1:] is because the fastaReader leaves the > on the identifier.
-            if record.identifier and record.identifier.split()[0][1:] in ids:
-                positive_writer.write(record)
-        positive_writer.close()
-    elif out_negative_file != "-":
-        print "Generating non-matching FASTQ file"
-        negative_writer = fastqWriter(open(out_negative_file, "w"))
-        for record in reader:
-            #The [1:] is because the fastaReader leaves the > on the identifier.
-            if not record.identifier or record.identifier.split()[0][1:] not in ids:
-                negative_writer.write(record)
-        negative_writer.close()
-    reader.close()
+    # Write filtered FASTQ file based on IDs from tabular file
+    fastq_filter(in_file, out_positive_file, out_negative_file, ids)
+    # This does not currently track the counts
 else:
-    stop_err("Unsupported file type %r" % seq_format)
+    sys_exit("Unsupported file type %r" % seq_format)
author	peterjc
date	Wed, 13 May 2015 11:03:57 -0400
parents	44ab4c0f7683
children	03e134cae41a