comparison fastq_subsampling.py @ 0:bc23da9d464c draft default tip

planemo upload for repository http://172.20.124.12:3000/bvalot3/PythonScript commit 9676573ee48ce5343600ab45cd3ac1a6adddabe4
author bvalot
date Tue, 14 Jun 2022 08:15:55 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:bc23da9d464c
1 #!/usr/bin/env python3
2 # -*- coding: utf-8 -*-
3
4 """Return a subsampling fastq file"""
5
6 import argparse
7 import gzip
8 import io
9 import os
10 import random
11 import sys
12
13 import pysam
14
15 desc = "Subsampling a single or a paired of fastq(.gz) file"
16 command = argparse.ArgumentParser(
17 prog="fastq_subsampling.py", description=desc, usage="%(prog)s [options] genomeSize"
18 )
19 command.add_argument("--galaxy", action="store_true", help=argparse.SUPPRESS)
20 command.add_argument(
21 "-d",
22 "--outdir",
23 default="./",
24 type=str,
25 nargs="?",
26 help="Directory to store subsampling fastq file, default: current directory",
27 )
28 command.add_argument(
29 "-p",
30 "--prefix",
31 default="_sub",
32 type=str,
33 nargs="?",
34 help="Output fastq file with prefix, default:_sub",
35 )
36 command.add_argument(
37 "-c",
38 "--coverage",
39 type=int,
40 nargs="?",
41 default=60,
42 help="Mean coverage to sampling (default=60)",
43 )
44 command.add_argument(
45 "--copy",
46 action="store_true",
47 help="Perform a copy of sample without enough coverage (default=no subsampling)",
48 )
49 command.add_argument(
50 "-s",
51 "--sfastq",
52 nargs="?",
53 type=argparse.FileType("r"),
54 help="Fastq(.gz) file to subsampling (single)",
55 )
56 command.add_argument(
57 "-r",
58 "--rfastq",
59 nargs="?",
60 type=argparse.FileType("r"),
61 help="Fastq(.gz) file to subsampling in paired (right)",
62 )
63 command.add_argument(
64 "-l",
65 "--lfastq",
66 nargs="?",
67 type=argparse.FileType("r"),
68 help="Fastq(.gz) file to subsampling in paired (left)",
69 )
70 command.add_argument(
71 "genomesize",
72 type=int,
73 help="Size of the genome (bp) for calculation of subsampling",
74 )
75 command.add_argument("-v", "--version", action="version", version="%(prog)s 0.2.0")
76
77
78 def outname(inname, outdir, append):
79 inname = outdir + inname.split("/")[-1]
80 if isgzip(inname):
81 if inname[-5:] == "fq.gz":
82 return inname.rstrip(".fq.gz") + append + ".fastq.gz"
83 elif inname[-8:] == "fastq.gz":
84 return inname.rstrip(".fastq.gz") + append + ".fastq.gz"
85 else:
86 return inname + append + ".fastq.gz"
87 elif inname[-2:] == "fq":
88 return inname.rstrip(".fq") + append + ".fastq.gz"
89 elif inname[-5:] == "fastq":
90 return inname.rstrip(".fastq") + append + ".fastq.gz"
91 else:
92 return inname + append + ".fastq.gz"
93
94
95 def isgzip(filename):
96 if filename[-2:] == "gz":
97 return True
98 else:
99 return False
100
101
102 def complet_count(fastq):
103 count = 0
104 total = 0.0
105 for read in pysam.FastxFile(fastq):
106 count += 1
107 total += len(read.sequence)
108 return count, int(total / count)
109
110
111 def make_copy(infile, outdir, prefix):
112 output = outname(infile, outdir, prefix)
113 os.popen(" ".join(["cp", infile, output]))
114
115
116 def make_subsampling(infile, outdir, prefix, select):
117 output = io.BufferedWriter(gzip.open(outname(infile, outdir, prefix), "wb", 5))
118 for i, read in enumerate(pysam.FastxFile(infile)):
119 if i in select:
120 output.write((str(read) + "\n").encode())
121 output.flush()
122 output.close()
123
124
125 def make_subsampling_galaxy(infile, number, select):
126 output = io.BufferedWriter(gzip.open("./" + str(number) + ".fastq.gz", "wb", 5))
127 for i, read in enumerate(pysam.FastxFile(infile)):
128 if i in select:
129 output.write((str(read) + "\n").encode())
130 output.flush()
131 output.close()
132
133
134 if __name__ == "__main__":
135 """Performed job on execution script"""
136 args = command.parse_args()
137
138 # verify input
139 if args.sfastq is None and args.lfastq is None and args.rfastq is None:
140 raise Exception("At least single or paired reads must be defined")
141 outdir = args.outdir
142 if not os.path.isdir(outdir):
143 raise Exception(outdir + " is not a valid directory")
144 else:
145 outdir = outdir.rstrip("/") + "/"
146
147 # compute count and length
148 fastqname = None
149 if args.sfastq is not None:
150 fastqname = args.sfastq.name
151 elif args.rfastq is not None and args.lfastq is not None:
152 fastqname = args.rfastq.name
153 else:
154 raise Exception("You must defined right and left fastq file for paired data")
155
156 count = None
157 length = None
158 count, length = complet_count(fastqname)
159
160 # Calculate coverage and report
161 coverage = 0
162 if args.sfastq is not None:
163 coverage = int(float(count) * length / args.genomesize)
164 print(
165 " : ".join([fastqname, str(length) + "bp", str(count), str(coverage) + "X"])
166 )
167 else:
168 coverage = int(float(count) * length * 2 / args.genomesize)
169 print(
170 " : ".join(
171 [fastqname, str(length) + "bp*2", str(count), str(coverage) + "X"]
172 )
173 )
174
175 # check coverage
176 if coverage <= args.coverage:
177 if args.copy and args.galaxy is False:
178 if args.sfastq is not None:
179 make_copy(args.sfastq.name, outdir, args.prefix)
180 else:
181 make_copy(args.rfastq.name, outdir, args.prefix)
182 make_copy(args.lfastq.name, outdir, args.prefix)
183 print("Coverage is less than threshold, no subsampling need")
184 sys.exit(0)
185
186 # performed subsampling
187 if args.sfastq is not None:
188 subread = int((float(args.genomesize) * args.coverage) / length)
189 select = set(random.sample(range(0, count), subread))
190 if args.galaxy:
191 make_subsampling_galaxy(args.sfastq.name, 1, select)
192 else:
193 make_subsampling(args.sfastq.name, outdir, args.prefix, select)
194 print("Subsampling to " + str(subread))
195 else:
196 subread = int((float(args.genomesize) * args.coverage) / (length * 2))
197 select = set(random.sample(range(0, count), subread))
198 if args.galaxy:
199 make_subsampling_galaxy(args.rfastq.name, 1, select)
200 make_subsampling_galaxy(args.lfastq.name, 2, select)
201 else:
202 make_subsampling(args.rfastq.name, outdir, args.prefix, select)
203 make_subsampling(args.lfastq.name, outdir, args.prefix, select)
204 print("Subsampling to " + str(subread))