Mercurial > repos > bvalot > fasta_filter
diff fastq_subsampling.py @ 0:c74e633b40e9 draft default tip
planemo upload for repository https://github.com/bvalot/galaxy commit d57c24d4b2c0c741d572af9ca0d09f8b82689640
author | bvalot |
---|---|
date | Tue, 14 Jun 2022 08:51:44 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fastq_subsampling.py Tue Jun 14 08:51:44 2022 +0000 @@ -0,0 +1,204 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +"""Return a subsampling fastq file""" + +import argparse +import gzip +import io +import os +import random +import sys + +import pysam + +desc = "Subsampling a single or a paired of fastq(.gz) file" +command = argparse.ArgumentParser( + prog="fastq_subsampling.py", description=desc, usage="%(prog)s [options] genomeSize" +) +command.add_argument("--galaxy", action="store_true", help=argparse.SUPPRESS) +command.add_argument( + "-d", + "--outdir", + default="./", + type=str, + nargs="?", + help="Directory to store subsampling fastq file, default: current directory", +) +command.add_argument( + "-p", + "--prefix", + default="_sub", + type=str, + nargs="?", + help="Output fastq file with prefix, default:_sub", +) +command.add_argument( + "-c", + "--coverage", + type=int, + nargs="?", + default=60, + help="Mean coverage to sampling (default=60)", +) +command.add_argument( + "--copy", + action="store_true", + help="Perform a copy of sample without enough coverage (default=no subsampling)", +) +command.add_argument( + "-s", + "--sfastq", + nargs="?", + type=argparse.FileType("r"), + help="Fastq(.gz) file to subsampling (single)", +) +command.add_argument( + "-r", + "--rfastq", + nargs="?", + type=argparse.FileType("r"), + help="Fastq(.gz) file to subsampling in paired (right)", +) +command.add_argument( + "-l", + "--lfastq", + nargs="?", + type=argparse.FileType("r"), + help="Fastq(.gz) file to subsampling in paired (left)", +) +command.add_argument( + "genomesize", + type=int, + help="Size of the genome (bp) for calculation of subsampling", +) +command.add_argument("-v", "--version", action="version", version="%(prog)s 0.2.0") + + +def outname(inname, outdir, append): + inname = outdir + inname.split("/")[-1] + if isgzip(inname): + if inname[-5:] == "fq.gz": + return inname.rstrip(".fq.gz") + append + ".fastq.gz" + elif inname[-8:] == "fastq.gz": + return inname.rstrip(".fastq.gz") + append + ".fastq.gz" + else: + return inname + append + ".fastq.gz" + elif inname[-2:] == "fq": + return inname.rstrip(".fq") + append + ".fastq.gz" + elif inname[-5:] == "fastq": + return inname.rstrip(".fastq") + append + ".fastq.gz" + else: + return inname + append + ".fastq.gz" + + +def isgzip(filename): + if filename[-2:] == "gz": + return True + else: + return False + + +def complet_count(fastq): + count = 0 + total = 0.0 + for read in pysam.FastxFile(fastq): + count += 1 + total += len(read.sequence) + return count, int(total / count) + + +def make_copy(infile, outdir, prefix): + output = outname(infile, outdir, prefix) + os.popen(" ".join(["cp", infile, output])) + + +def make_subsampling(infile, outdir, prefix, select): + output = io.BufferedWriter(gzip.open(outname(infile, outdir, prefix), "wb", 5)) + for i, read in enumerate(pysam.FastxFile(infile)): + if i in select: + output.write((str(read) + "\n").encode()) + output.flush() + output.close() + + +def make_subsampling_galaxy(infile, number, select): + output = io.BufferedWriter(gzip.open("./" + str(number) + ".fastq.gz", "wb", 5)) + for i, read in enumerate(pysam.FastxFile(infile)): + if i in select: + output.write((str(read) + "\n").encode()) + output.flush() + output.close() + + +if __name__ == "__main__": + """Performed job on execution script""" + args = command.parse_args() + + # verify input + if args.sfastq is None and args.lfastq is None and args.rfastq is None: + raise Exception("At least single or paired reads must be defined") + outdir = args.outdir + if not os.path.isdir(outdir): + raise Exception(outdir + " is not a valid directory") + else: + outdir = outdir.rstrip("/") + "/" + + # compute count and length + fastqname = None + if args.sfastq is not None: + fastqname = args.sfastq.name + elif args.rfastq is not None and args.lfastq is not None: + fastqname = args.rfastq.name + else: + raise Exception("You must defined right and left fastq file for paired data") + + count = None + length = None + count, length = complet_count(fastqname) + + # Calculate coverage and report + coverage = 0 + if args.sfastq is not None: + coverage = int(float(count) * length / args.genomesize) + print( + " : ".join([fastqname, str(length) + "bp", str(count), str(coverage) + "X"]) + ) + else: + coverage = int(float(count) * length * 2 / args.genomesize) + print( + " : ".join( + [fastqname, str(length) + "bp*2", str(count), str(coverage) + "X"] + ) + ) + + # check coverage + if coverage <= args.coverage: + if args.copy and args.galaxy is False: + if args.sfastq is not None: + make_copy(args.sfastq.name, outdir, args.prefix) + else: + make_copy(args.rfastq.name, outdir, args.prefix) + make_copy(args.lfastq.name, outdir, args.prefix) + print("Coverage is less than threshold, no subsampling need") + sys.exit(0) + + # performed subsampling + if args.sfastq is not None: + subread = int((float(args.genomesize) * args.coverage) / length) + select = set(random.sample(range(0, count), subread)) + if args.galaxy: + make_subsampling_galaxy(args.sfastq.name, 1, select) + else: + make_subsampling(args.sfastq.name, outdir, args.prefix, select) + print("Subsampling to " + str(subread)) + else: + subread = int((float(args.genomesize) * args.coverage) / (length * 2)) + select = set(random.sample(range(0, count), subread)) + if args.galaxy: + make_subsampling_galaxy(args.rfastq.name, 1, select) + make_subsampling_galaxy(args.lfastq.name, 2, select) + else: + make_subsampling(args.rfastq.name, outdir, args.prefix, select) + make_subsampling(args.lfastq.name, outdir, args.prefix, select) + print("Subsampling to " + str(subread))