Mercurial > repos > bvalot > fasta_filter
comparison fastq_subsampling.py @ 0:c74e633b40e9 draft default tip
planemo upload for repository https://github.com/bvalot/galaxy commit d57c24d4b2c0c741d572af9ca0d09f8b82689640
author | bvalot |
---|---|
date | Tue, 14 Jun 2022 08:51:44 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:c74e633b40e9 |
---|---|
1 #!/usr/bin/env python3 | |
2 # -*- coding: utf-8 -*- | |
3 | |
4 """Return a subsampling fastq file""" | |
5 | |
6 import argparse | |
7 import gzip | |
8 import io | |
9 import os | |
10 import random | |
11 import sys | |
12 | |
13 import pysam | |
14 | |
15 desc = "Subsampling a single or a paired of fastq(.gz) file" | |
16 command = argparse.ArgumentParser( | |
17 prog="fastq_subsampling.py", description=desc, usage="%(prog)s [options] genomeSize" | |
18 ) | |
19 command.add_argument("--galaxy", action="store_true", help=argparse.SUPPRESS) | |
20 command.add_argument( | |
21 "-d", | |
22 "--outdir", | |
23 default="./", | |
24 type=str, | |
25 nargs="?", | |
26 help="Directory to store subsampling fastq file, default: current directory", | |
27 ) | |
28 command.add_argument( | |
29 "-p", | |
30 "--prefix", | |
31 default="_sub", | |
32 type=str, | |
33 nargs="?", | |
34 help="Output fastq file with prefix, default:_sub", | |
35 ) | |
36 command.add_argument( | |
37 "-c", | |
38 "--coverage", | |
39 type=int, | |
40 nargs="?", | |
41 default=60, | |
42 help="Mean coverage to sampling (default=60)", | |
43 ) | |
44 command.add_argument( | |
45 "--copy", | |
46 action="store_true", | |
47 help="Perform a copy of sample without enough coverage (default=no subsampling)", | |
48 ) | |
49 command.add_argument( | |
50 "-s", | |
51 "--sfastq", | |
52 nargs="?", | |
53 type=argparse.FileType("r"), | |
54 help="Fastq(.gz) file to subsampling (single)", | |
55 ) | |
56 command.add_argument( | |
57 "-r", | |
58 "--rfastq", | |
59 nargs="?", | |
60 type=argparse.FileType("r"), | |
61 help="Fastq(.gz) file to subsampling in paired (right)", | |
62 ) | |
63 command.add_argument( | |
64 "-l", | |
65 "--lfastq", | |
66 nargs="?", | |
67 type=argparse.FileType("r"), | |
68 help="Fastq(.gz) file to subsampling in paired (left)", | |
69 ) | |
70 command.add_argument( | |
71 "genomesize", | |
72 type=int, | |
73 help="Size of the genome (bp) for calculation of subsampling", | |
74 ) | |
75 command.add_argument("-v", "--version", action="version", version="%(prog)s 0.2.0") | |
76 | |
77 | |
78 def outname(inname, outdir, append): | |
79 inname = outdir + inname.split("/")[-1] | |
80 if isgzip(inname): | |
81 if inname[-5:] == "fq.gz": | |
82 return inname.rstrip(".fq.gz") + append + ".fastq.gz" | |
83 elif inname[-8:] == "fastq.gz": | |
84 return inname.rstrip(".fastq.gz") + append + ".fastq.gz" | |
85 else: | |
86 return inname + append + ".fastq.gz" | |
87 elif inname[-2:] == "fq": | |
88 return inname.rstrip(".fq") + append + ".fastq.gz" | |
89 elif inname[-5:] == "fastq": | |
90 return inname.rstrip(".fastq") + append + ".fastq.gz" | |
91 else: | |
92 return inname + append + ".fastq.gz" | |
93 | |
94 | |
95 def isgzip(filename): | |
96 if filename[-2:] == "gz": | |
97 return True | |
98 else: | |
99 return False | |
100 | |
101 | |
102 def complet_count(fastq): | |
103 count = 0 | |
104 total = 0.0 | |
105 for read in pysam.FastxFile(fastq): | |
106 count += 1 | |
107 total += len(read.sequence) | |
108 return count, int(total / count) | |
109 | |
110 | |
111 def make_copy(infile, outdir, prefix): | |
112 output = outname(infile, outdir, prefix) | |
113 os.popen(" ".join(["cp", infile, output])) | |
114 | |
115 | |
116 def make_subsampling(infile, outdir, prefix, select): | |
117 output = io.BufferedWriter(gzip.open(outname(infile, outdir, prefix), "wb", 5)) | |
118 for i, read in enumerate(pysam.FastxFile(infile)): | |
119 if i in select: | |
120 output.write((str(read) + "\n").encode()) | |
121 output.flush() | |
122 output.close() | |
123 | |
124 | |
125 def make_subsampling_galaxy(infile, number, select): | |
126 output = io.BufferedWriter(gzip.open("./" + str(number) + ".fastq.gz", "wb", 5)) | |
127 for i, read in enumerate(pysam.FastxFile(infile)): | |
128 if i in select: | |
129 output.write((str(read) + "\n").encode()) | |
130 output.flush() | |
131 output.close() | |
132 | |
133 | |
134 if __name__ == "__main__": | |
135 """Performed job on execution script""" | |
136 args = command.parse_args() | |
137 | |
138 # verify input | |
139 if args.sfastq is None and args.lfastq is None and args.rfastq is None: | |
140 raise Exception("At least single or paired reads must be defined") | |
141 outdir = args.outdir | |
142 if not os.path.isdir(outdir): | |
143 raise Exception(outdir + " is not a valid directory") | |
144 else: | |
145 outdir = outdir.rstrip("/") + "/" | |
146 | |
147 # compute count and length | |
148 fastqname = None | |
149 if args.sfastq is not None: | |
150 fastqname = args.sfastq.name | |
151 elif args.rfastq is not None and args.lfastq is not None: | |
152 fastqname = args.rfastq.name | |
153 else: | |
154 raise Exception("You must defined right and left fastq file for paired data") | |
155 | |
156 count = None | |
157 length = None | |
158 count, length = complet_count(fastqname) | |
159 | |
160 # Calculate coverage and report | |
161 coverage = 0 | |
162 if args.sfastq is not None: | |
163 coverage = int(float(count) * length / args.genomesize) | |
164 print( | |
165 " : ".join([fastqname, str(length) + "bp", str(count), str(coverage) + "X"]) | |
166 ) | |
167 else: | |
168 coverage = int(float(count) * length * 2 / args.genomesize) | |
169 print( | |
170 " : ".join( | |
171 [fastqname, str(length) + "bp*2", str(count), str(coverage) + "X"] | |
172 ) | |
173 ) | |
174 | |
175 # check coverage | |
176 if coverage <= args.coverage: | |
177 if args.copy and args.galaxy is False: | |
178 if args.sfastq is not None: | |
179 make_copy(args.sfastq.name, outdir, args.prefix) | |
180 else: | |
181 make_copy(args.rfastq.name, outdir, args.prefix) | |
182 make_copy(args.lfastq.name, outdir, args.prefix) | |
183 print("Coverage is less than threshold, no subsampling need") | |
184 sys.exit(0) | |
185 | |
186 # performed subsampling | |
187 if args.sfastq is not None: | |
188 subread = int((float(args.genomesize) * args.coverage) / length) | |
189 select = set(random.sample(range(0, count), subread)) | |
190 if args.galaxy: | |
191 make_subsampling_galaxy(args.sfastq.name, 1, select) | |
192 else: | |
193 make_subsampling(args.sfastq.name, outdir, args.prefix, select) | |
194 print("Subsampling to " + str(subread)) | |
195 else: | |
196 subread = int((float(args.genomesize) * args.coverage) / (length * 2)) | |
197 select = set(random.sample(range(0, count), subread)) | |
198 if args.galaxy: | |
199 make_subsampling_galaxy(args.rfastq.name, 1, select) | |
200 make_subsampling_galaxy(args.lfastq.name, 2, select) | |
201 else: | |
202 make_subsampling(args.rfastq.name, outdir, args.prefix, select) | |
203 make_subsampling(args.lfastq.name, outdir, args.prefix, select) | |
204 print("Subsampling to " + str(subread)) |