diff fastq_subset.py @ 9:52dbe2089d14 draft default tip

Version 0.02.04.8 (update fastq subsetting).
author pjbriggs
date Wed, 04 Jul 2018 06:05:52 -0400
parents 4e625d3672ba
children
--- a/fastq_subset.py	Wed May 16 07:39:16 2018 -0400
+++ b/fastq_subset.py	Wed Jul 04 06:05:52 2018 -0400
@@ -2,7 +2,63 @@
 
 import argparse
 import random
-from Bio.SeqIO.QualityIO import FastqGeneralIterator
+import gzip
+
+CHUNKSIZE = 102400
+
+def getlines(filen):
+    """
+    Efficiently fetch lines from a file one by one
+
+    Generator function implementing an efficient
+    method of reading lines sequentially from a file,
+    attempting to minimise the number of read operations
+    and performing the line splitting in memory:
+
+    >>> for line in getlines(filen):
+    ...     # ...do something with line...
+
+    The input file can be gzipped; this is handled
+    transparently provided the file name ends
+    with '.gz'.
+
+    Arguments:
+      filen (str): path of the file to read lines from
+
+    Yields:
+      String: next line of text from the file, with any
+        newline character removed.
+    """
+    if filen.split('.')[-1] == 'gz':
+        fp = gzip.open(filen,'rb')
+    else:
+        fp = open(filen,'rb')
+    # Read in data in chunks
+    buf = ''
+    lines = []
+    while True:
+        # Grab a chunk of data
+        data = fp.read(CHUNKSIZE)
+        # Check for EOF: yield any trailing partial line first
+        if not data:
+            if buf:
+                yield buf
+            break
+        # Add to buffer and split into lines
+        buf = buf + data
+        if buf[0] == '\n':
+            buf = buf[1:]
+        if buf[-1] != '\n':
+            i = buf.rfind('\n')
+            if i == -1:
+                continue
+            else:
+                lines = buf[:i].split('\n')
+                buf = buf[i+1:]
+        else:
+            lines = buf[:-1].split('\n')
+            buf = ''
+        # Return the lines one at a time
+        for line in lines:
+            yield line
 
 def count_reads(fastq):
     """
@@ -25,16 +81,34 @@
     positions in the input file will be written
     to the output.
     """
-    with open(fastq_in,'r') as fq_in:
-        fq_out = open(fastq_out,'w')
+    with open(fastq_out,'w') as fq_out:
+        # Current index
         i = 0
-        for title,seq,qual in FastqGeneralIterator(fq_in):
-            if i in indices:
-                fq_out.write("@%s\n%s\n+\n%s\n" % (title,
-                                                   seq,
-                                                   qual))
-            i += 1
-        fq_out.close()
+        # Read count
+        n = 0
+        # Read contents
+        rd = []
+        # Iterate through the file
+        for ii,line in enumerate(getlines(fastq_in),start=1):
+            rd.append(line)
+            if ii%4 == 0:
+                # Got a complete read
+                try:
+                    # If read index matches the current index
+                    # then output the read
+                    if n == indices[i]:
+                        fq_out.write("%s\n" % '\n'.join(rd))
+                        i += 1
+                    # Update for next read
+                    n += 1
+                    rd = []
+                except IndexError:
+                    # Subset complete
+                    return
+    # End of file: check nothing was left over
+    if rd:
+        raise Exception("Incomplete read at file end: %s"
+                        % rd)
 
 if __name__ == "__main__":
 
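
The rewritten fastq_subset() makes a single pass over the input FASTQ, assembling four lines at a time into a read and comparing the running read count n against the next entry of a sorted index list, stopping early via the IndexError handler once the list is exhausted. A toy illustration of that matching logic, with made-up record names standing in for complete four-line reads:

    # Illustrative only: 'records' stands in for complete FASTQ reads
    records = ["read%d" % k for k in range(10)]
    indices = [1, 4, 7]       # must be sorted in ascending order
    i = 0
    kept = []
    for n, rec in enumerate(records):
        try:
            if n == indices[i]:
                kept.append(rec)
                i += 1
        except IndexError:
            # All requested reads have been found
            break
    assert kept == ["read1", "read4", "read7"]
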
@@ -69,6 +143,6 @@
         if args.seed is not None:
             print "Random number generator seed: %d" % args.seed
             random.seed(args.seed)
-        subset = random.sample(xrange(nreads),subset_size)
+        subset = sorted(random.sample(xrange(nreads),subset_size))
         fastq_subset(args.fastq_r1,"subset_r1.fq",subset)
         fastq_subset(args.fastq_r2,"subset_r2.fq",subset)
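
Sorting the sampled indices is what makes the single sequential pass in fastq_subset() possible, since the function only ever compares the current read number against the next expected index. A small illustration of building such a subset (the seed and read count here are arbitrary):

    import random
    random.seed(1628)                              # illustrative seed
    nreads = 1000
    subset = sorted(random.sample(xrange(nreads), 5))
    # 'subset' now holds five distinct read indices in ascending order,
    # suitable for passing to fastq_subset() for both the R1 and R2 files.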