comparison fastq_subset.py @ 8:4e625d3672ba draft

Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
author pjbriggs
date Wed, 16 May 2018 07:39:16 -0400
parents
children 52dbe2089d14
comparison
equal deleted inserted replaced
7:5e133b7b79a6 8:4e625d3672ba
1 #!/usr/bin/env python
2
3 import argparse
4 import random
5 from Bio.SeqIO.QualityIO import FastqGeneralIterator
6
7 def count_reads(fastq):
8 """
9 Count number of reads in a Fastq file
10 """
11 n = 0
12 with open(fastq,'r') as fq:
13 while True:
14 buf = fq.read()
15 n += buf.count('\n')
16 if buf == "": break
17 return n/4
18
19 def fastq_subset(fastq_in,fastq_out,indices):
20 """
21 Output a subset of reads from a Fastq file
22
23 The reads to output are specifed by a list
24 of integer indices; only reads at those
25 positions in the input file will be written
26 to the output.
27 """
28 with open(fastq_in,'r') as fq_in:
29 fq_out = open(fastq_out,'w')
30 i = 0
31 for title,seq,qual in FastqGeneralIterator(fq_in):
32 if i in indices:
33 fq_out.write("@%s\n%s\n+\n%s\n" % (title,
34 seq,
35 qual))
36 i += 1
37 fq_out.close()
38
39 if __name__ == "__main__":
40
41 p = argparse.ArgumentParser()
42 p.add_argument("fastq_r1")
43 p.add_argument("fastq_r2")
44 p.add_argument("-n",
45 dest="subset_size",
46 default=None,
47 help="subset size")
48 p.add_argument("-s",
49 dest="seed",
50 type=int,
51 default=None,
52 help="seed for random number generator")
53 args = p.parse_args()
54
55 print "Processing fastq pair:"
56 print "\t%s" % args.fastq_r1
57 print "\t%s" % args.fastq_r2
58
59 nreads = count_reads(args.fastq_r1)
60 print "Counted %d reads in %s" % (nreads,args.fastq_r1)
61
62 if args.subset_size is not None:
63 subset_size = float(args.subset_size)
64 if subset_size < 1.0:
65 subset_size = int(nreads*subset_size)
66 else:
67 subset_size = int(subset_size)
68 print "Extracting subset of reads: %s" % subset_size
69 if args.seed is not None:
70 print "Random number generator seed: %d" % args.seed
71 random.seed(args.seed)
72 subset = random.sample(xrange(nreads),subset_size)
73 fastq_subset(args.fastq_r1,"subset_r1.fq",subset)
74 fastq_subset(args.fastq_r2,"subset_r2.fq",subset)