comparison fastq_subset.py @ 9:52dbe2089d14 draft default tip

Version 0.02.04.8 (update fastq subsetting).
author pjbriggs
date Wed, 04 Jul 2018 06:05:52 -0400
parents 4e625d3672ba
children
comparison
equal deleted inserted replaced
8:4e625d3672ba 9:52dbe2089d14
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 2
3 import argparse 3 import argparse
4 import random 4 import random
5 from Bio.SeqIO.QualityIO import FastqGeneralIterator 5 import gzip
6
7 CHUNKSIZE = 102400
8
9 def getlines(filen):
10 """
11 Efficiently fetch lines from a file one by one
12
13 Generator function implementing an efficient
14 method of reading lines sequentially from a file,
15 attempting to minimise the number of read operations
16 and performing the line splitting in memory:
17
18 >>> for line in getlines(filen):
19 >>> ...do something...
20
21 Input file can be gzipped; this function should
22 handle this invisibly provided the file names ends
23 with '.gz'.
24
25 Arguments:
26 filen (str): path of the file to read lines from
27
28 Yields:
29 String: next line of text from the file, with any
30 newline character removed.
31 """
32 if filen.split('.')[-1] == 'gz':
33 fp = gzip.open(filen,'rb')
34 else:
35 fp = open(filen,'rb')
36 # Read in data in chunks
37 buf = ''
38 lines = []
39 while True:
40 # Grab a chunk of data
41 data = fp.read(CHUNKSIZE)
42 # Check for EOF
43 if not data:
44 break
45 # Add to buffer and split into lines
46 buf = buf + data
47 if buf[0] == '\n':
48 buf = buf[1:]
49 if buf[-1] != '\n':
50 i = buf.rfind('\n')
51 if i == -1:
52 continue
53 else:
54 lines = buf[:i].split('\n')
55 buf = buf[i+1:]
56 else:
57 lines = buf[:-1].split('\n')
58 buf = ''
59 # Return the lines one at a time
60 for line in lines:
61 yield line
6 62
7 def count_reads(fastq): 63 def count_reads(fastq):
8 """ 64 """
9 Count number of reads in a Fastq file 65 Count number of reads in a Fastq file
10 """ 66 """
23 The reads to output are specifed by a list 79 The reads to output are specifed by a list
24 of integer indices; only reads at those 80 of integer indices; only reads at those
25 positions in the input file will be written 81 positions in the input file will be written
26 to the output. 82 to the output.
27 """ 83 """
28 with open(fastq_in,'r') as fq_in: 84 with open(fastq_out,'w') as fq_out:
29 fq_out = open(fastq_out,'w') 85 # Current index
30 i = 0 86 i = 0
31 for title,seq,qual in FastqGeneralIterator(fq_in): 87 # Read count
32 if i in indices: 88 n = 0
33 fq_out.write("@%s\n%s\n+\n%s\n" % (title, 89 # Read contents
34 seq, 90 rd = []
35 qual)) 91 # Iterate through the file
36 i += 1 92 for ii,line in enumerate(getlines(fastq_in),start=1):
37 fq_out.close() 93 rd.append(line)
94 if ii%4 == 0:
95 # Got a complete read
96 try:
97 # If read index matches the current index
98 # then output the read
99 if n == indices[i]:
100 fq_out.write("%s\n" % '\n'.join(rd))
101 i += 1
102 # Update for next read
103 n += 1
104 rd = []
105 except IndexError:
106 # Subset complete
107 return
108 # End of file: check nothing was left over
109 if rd:
110 raise Exception("Incomplete read at file end: %s"
111 % rd)
38 112
39 if __name__ == "__main__": 113 if __name__ == "__main__":
40 114
41 p = argparse.ArgumentParser() 115 p = argparse.ArgumentParser()
42 p.add_argument("fastq_r1") 116 p.add_argument("fastq_r1")
67 subset_size = int(subset_size) 141 subset_size = int(subset_size)
68 print "Extracting subset of reads: %s" % subset_size 142 print "Extracting subset of reads: %s" % subset_size
69 if args.seed is not None: 143 if args.seed is not None:
70 print "Random number generator seed: %d" % args.seed 144 print "Random number generator seed: %d" % args.seed
71 random.seed(args.seed) 145 random.seed(args.seed)
72 subset = random.sample(xrange(nreads),subset_size) 146 subset = sorted(random.sample(xrange(nreads),subset_size))
73 fastq_subset(args.fastq_r1,"subset_r1.fq",subset) 147 fastq_subset(args.fastq_r1,"subset_r1.fq",subset)
74 fastq_subset(args.fastq_r2,"subset_r2.fq",subset) 148 fastq_subset(args.fastq_r2,"subset_r2.fq",subset)