Mercurial > repos > bgruening > bismark
diff bismark_wrapper.py @ 21:120b7b35e442 draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/bismark commit 8fdc76a99a9dcf34549898a208317607afd18798"
author | bgruening |
---|---|
date | Thu, 22 Apr 2021 17:05:46 +0000 |
parents | bb74e17e47d7 |
children |
line wrap: on
line diff
--- a/bismark_wrapper.py Fri Oct 04 11:33:27 2019 -0400 +++ b/bismark_wrapper.py Thu Apr 22 17:05:46 2021 +0000 @@ -18,79 +18,137 @@ def log_subprocess_output(logger, pipe): - for line in iter(pipe.readline, b''): + for line in iter(pipe.readline, b""): logger.debug(line.decode().rstrip()) def __main__(): # Parse Command Line - parser = argparse.ArgumentParser(description='Wrapper for the bismark bisulfite mapper.') - parser.add_argument('-p', '--num-threads', dest='num_threads', - type=int, default=4, help='Use this many threads to align reads. The default is 4.') + parser = argparse.ArgumentParser( + description="Wrapper for the bismark bisulfite mapper." + ) + parser.add_argument( + "-p", + "--num-threads", + dest="num_threads", + type=int, + default=4, + help="Use this many threads to align reads. The default is 4.", + ) # input options - parser.add_argument('--own-file', dest='own_file', help='') - parser.add_argument('-D', '--indexes-path', dest='index_path', - help='Indexes directory; location of .ebwt and .fa files.') - parser.add_argument('-O', '--output', dest='output') + parser.add_argument("--own-file", dest="own_file", help="") + parser.add_argument( + "-D", + "--indexes-path", + dest="index_path", + help="Indexes directory; location of .ebwt and .fa files.", + ) + parser.add_argument("-O", "--output", dest="output") - parser.add_argument('--output-report-file', dest='output_report_file') - parser.add_argument('--suppress-header', dest='suppress_header', action="store_true") + parser.add_argument("--output-report-file", dest="output_report_file") + parser.add_argument( + "--suppress-header", dest="suppress_header", action="store_true" + ) - parser.add_argument('--mate-paired', dest='mate_paired', action='store_true', help='Reads are mate-paired', - default=False) + parser.add_argument( + "--mate-paired", + dest="mate_paired", + action="store_true", + help="Reads are mate-paired", + default=False, + ) - parser.add_argument('-1', '--mate1', dest='mate1', - help='The forward reads file in Sanger FASTQ or FASTA format.') - parser.add_argument('-2', '--mate2', dest='mate2', - help='The reverse reads file in Sanger FASTQ or FASTA format.') - parser.add_argument('--sort-bam', dest='sort_bam', action="store_true") + parser.add_argument( + "-1", + "--mate1", + dest="mate1", + help="The forward reads file in Sanger FASTQ or FASTA format.", + ) + parser.add_argument( + "-2", + "--mate2", + dest="mate2", + help="The reverse reads file in Sanger FASTQ or FASTA format.", + ) + parser.add_argument("--sort-bam", dest="sort_bam", action="store_true") - parser.add_argument('--output-unmapped-reads', dest='output_unmapped_reads', - help='Additional output file with unmapped reads (single-end).') - parser.add_argument('--output-unmapped-reads-l', dest='output_unmapped_reads_l', - help='File name for unmapped reads (left, paired-end).') - parser.add_argument('--output-unmapped-reads-r', dest='output_unmapped_reads_r', - help='File name for unmapped reads (right, paired-end).') + parser.add_argument( + "--output-unmapped-reads", + dest="output_unmapped_reads", + help="Additional output file with unmapped reads (single-end).", + ) + parser.add_argument( + "--output-unmapped-reads-l", + dest="output_unmapped_reads_l", + help="File name for unmapped reads (left, paired-end).", + ) + parser.add_argument( + "--output-unmapped-reads-r", + dest="output_unmapped_reads_r", + help="File name for unmapped reads (right, paired-end).", + ) - parser.add_argument('--output-suppressed-reads', dest='output_suppressed_reads', - help='Additional output file with suppressed reads (single-end).') - parser.add_argument('--output-suppressed-reads-l', dest='output_suppressed_reads_l', - help='File name for suppressed reads (left, paired-end).') - parser.add_argument('--output-suppressed-reads-r', dest='output_suppressed_reads_r', - help='File name for suppressed reads (right, paired-end).') - parser.add_argument('--stdout', dest='output_stdout', - help='File name for the standard output of bismark.') + parser.add_argument( + "--output-suppressed-reads", + dest="output_suppressed_reads", + help="Additional output file with suppressed reads (single-end).", + ) + parser.add_argument( + "--output-suppressed-reads-l", + dest="output_suppressed_reads_l", + help="File name for suppressed reads (left, paired-end).", + ) + parser.add_argument( + "--output-suppressed-reads-r", + dest="output_suppressed_reads_r", + help="File name for suppressed reads (right, paired-end).", + ) + parser.add_argument( + "--stdout", + dest="output_stdout", + help="File name for the standard output of bismark.", + ) - parser.add_argument('--single-paired', dest='single_paired', - help='The single-end reads file in Sanger FASTQ or FASTA format.') + parser.add_argument( + "--single-paired", + dest="single_paired", + help="The single-end reads file in Sanger FASTQ or FASTA format.", + ) - parser.add_argument('--fastq', action='store_true', help='Query filetype is in FASTQ format') - parser.add_argument('--fasta', action='store_true', help='Query filetype is in FASTA format') - parser.add_argument('--phred64-quals', dest='phred64', action="store_true") - parser.add_argument('--non-directional', dest='non_directional', action="store_true") - parser.add_argument('--pbat', dest='pbat', action="store_true") + parser.add_argument( + "--fastq", action="store_true", help="Query filetype is in FASTQ format" + ) + parser.add_argument( + "--fasta", action="store_true", help="Query filetype is in FASTA format" + ) + parser.add_argument("--phred64-quals", dest="phred64", action="store_true") + parser.add_argument( + "--non_directional", dest="non_directional", action="store_true" + ) + parser.add_argument("--pbat", dest="pbat", action="store_true") - parser.add_argument('--skip-reads', dest='skip_reads', type=int) - parser.add_argument('--score-min', dest='score_min', type=str) - parser.add_argument('--qupto', type=int) + parser.add_argument("--skip-reads", dest="skip_reads", type=int) + parser.add_argument("--score-min", dest="score_min", type=str) + parser.add_argument("--qupto", type=int) # paired end options - parser.add_argument('-I', '--minins', dest='min_insert') - parser.add_argument('-X', '--maxins', dest='max_insert') - parser.add_argument('--no-mixed', dest='no_mixed', action="store_true") - parser.add_argument('--no-discordant', dest='no_discordant', action="store_true") + parser.add_argument("-I", "--minins", dest="min_insert") + parser.add_argument("-X", "--maxins", dest="max_insert") + parser.add_argument("--no-mixed", dest="no_mixed", action="store_true") + parser.add_argument("--no-discordant", dest="no_discordant", action="store_true") # parse general options # default 20 - parser.add_argument('--seed-len', dest='seed_len', type=int) + parser.add_argument("--seed-len", dest="seed_len", type=int) # default 15 - parser.add_argument('--seed-extention-attempts', dest='seed_extention_attempts', type=int) + parser.add_argument( + "--seed-extention-attempts", dest="seed_extention_attempts", type=int + ) # default 0 - parser.add_argument('--seed-mismatches', dest='seed_mismatches', type=int) + parser.add_argument("--seed-mismatches", dest="seed_mismatches", type=int) # default 2 - parser.add_argument('--max-reseed', dest='max_reseed', type=int) - + parser.add_argument("--max-reseed", dest="max_reseed", type=int) """ The number of megabytes of memory a given thread is given to store path @@ -101,11 +159,11 @@ saying that chunk memory has been exhausted in --best mode, try adjusting this parameter up to dedicate more memory to the descriptors. Default: 512. """ - parser.add_argument('--chunkmbs', type=int, default=512) + parser.add_argument("--chunkmbs", type=int, default=512) args = parser.parse_args() - logger = logging.getLogger('bismark_wrapper') + logger = logging.getLogger("bismark_wrapper") logger.setLevel(logging.DEBUG) ch = logging.StreamHandler(sys.stdout) if args.output_stdout: @@ -121,22 +179,28 @@ index_dir = "" tmp_index_dir = None if args.own_file: - logger.info("Create a temporary index with the offered files from the user. " - "Utilizing the script: bismark_genome_preparation") + logger.info( + "Create a temporary index with the offered files from the user. " + "Utilizing the script: bismark_genome_preparation" + ) tmp_index_dir = tempfile.mkdtemp() - index_path = os.path.join(tmp_index_dir, '.'.join(os.path.split(args.own_file)[1].split('.')[:-1])) + index_path = os.path.join( + tmp_index_dir, ".".join(os.path.split(args.own_file)[1].split(".")[:-1]) + ) try: # Create a hard link pointing to args.own_file named 'index_path'.fa. - os.symlink(args.own_file, index_path + '.fa') + os.symlink(args.own_file, index_path + ".fa") except Exception as e: if os.path.exists(tmp_index_dir): shutil.rmtree(tmp_index_dir) - stop_err(logger, 'Error in linking the reference database!\n%s' % e) + stop_err(logger, "Error in linking the reference database!\n%s" % e) # bismark_genome_preparation needs the complete path to the folder in which the database is stored - cmd_index = ['bismark_genome_preparation', '--bowtie2', tmp_index_dir] + cmd_index = ["bismark_genome_preparation", "--bowtie2", tmp_index_dir] try: logger.info("Generating index with: '%s'", " ".join(cmd_index)) - process = subprocess.Popen(cmd_index, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + process = subprocess.Popen( + cmd_index, stdout=subprocess.PIPE, stderr=subprocess.STDOUT + ) with process.stdout: log_subprocess_output(logger, process.stdout) exitcode = process.wait() @@ -145,7 +209,7 @@ except Exception as e: if os.path.exists(tmp_index_dir): shutil.rmtree(tmp_index_dir) - stop_err(logger, 'Error indexing reference sequence!\n%s' % e) + stop_err(logger, "Error indexing reference sequence!\n%s" % e) index_dir = tmp_index_dir else: # bowtie path is the path to the index directory and the first path of the index file name @@ -153,57 +217,80 @@ # Build bismark command tmp_bismark_dir = tempfile.mkdtemp() - output_dir = os.path.join(tmp_bismark_dir, 'results') - cmd = ['bismark', '--bam', '--temp_dir', tmp_bismark_dir, '-o', output_dir, '--quiet'] + output_dir = os.path.join(tmp_bismark_dir, "results") + cmd = [ + "bismark", + "--bam", + "--temp_dir", + tmp_bismark_dir, + "-o", + output_dir, + "--quiet", + ] if not args.pbat: - cmd.append('--gzip') + cmd.append("--gzip") if args.fasta: # the query input files (specified as mate1,mate2 or singles) are FastA - cmd.append('--fasta') + cmd.append("--fasta") elif args.fastq: - cmd.append('--fastq') + cmd.append("--fastq") # alignment options if args.num_threads > 2: # divide num_threads by 2 here since bismark will spawn 2 jobs with -p threads each - cmd.extend(['-p', str(int(math.floor(args.num_threads / 2)))]) + cmd.extend(["-p", str(int(math.floor(args.num_threads / 2)))]) if args.seed_mismatches: - cmd.extend(['-N', str(args.seed_mismatches)]) + cmd.extend(["-N", str(args.seed_mismatches)]) if args.seed_len: - cmd.extend(['-L', str(args.seed_len)]) + cmd.extend(["-L", str(args.seed_len)]) if args.seed_extention_attempts: - cmd.extend(['-D', str(args.seed_extention_attempts)]) + cmd.extend(["-D", str(args.seed_extention_attempts)]) if args.max_reseed: - cmd.extend(['-R', str(args.max_reseed)]) + cmd.extend(["-R", str(args.max_reseed)]) if args.no_discordant: - cmd.append('--no-discordant') + cmd.append("--no-discordant") if args.no_mixed: - cmd.append('--no-mixed') + cmd.append("--no-mixed") if args.skip_reads: - cmd.extend(['--skip', str(args.skip_reads)]) + cmd.extend(["--skip", str(args.skip_reads)]) if args.score_min: - cmd.extend(['--score_min', str(args.score_min)]) + cmd.extend(["--score_min", str(args.score_min)]) if args.qupto: - cmd.extend(['--upto', 'args.qupto']) + cmd.extend(["--upto", "args.qupto"]) if args.phred64: - cmd.append('--phred64-quals') + cmd.append("--phred64-quals") if args.non_directional: - cmd.append('--non-directional') + cmd.append("--non_directional") if args.pbat: - cmd.append('--pbat') + cmd.append("--pbat") if args.suppress_header: - cmd.append('--sam-no-hd') - if args.output_unmapped_reads or (args.output_unmapped_reads_l and args.output_unmapped_reads_r): - cmd.append('--un') - if args.output_suppressed_reads or (args.output_suppressed_reads_l and args.output_suppressed_reads_r): - cmd.append('--ambiguous') + cmd.append("--sam-no-hd") + if args.output_unmapped_reads or ( + args.output_unmapped_reads_l and args.output_unmapped_reads_r + ): + cmd.append("--un") + if args.output_suppressed_reads or ( + args.output_suppressed_reads_l and args.output_suppressed_reads_r + ): + cmd.append("--ambiguous") cmd.append(index_dir) # Set up the reads if args.mate_paired: # paired-end reads library - cmd.extend(['-1', args.mate1, '-2', args.mate2, '-I', str(args.min_insert), '-X', str(args.max_insert)]) + cmd.extend( + [ + "-1", + args.mate1, + "-2", + args.mate2, + "-I", + str(args.min_insert), + "-X", + str(args.max_insert), + ] + ) else: # single paired reads library cmd.append(args.single_paired) @@ -211,51 +298,81 @@ # Run try: logger.info("Running bismark with: '%s'", " ".join(cmd)) - process = subprocess.Popen(args=cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + process = subprocess.Popen( + args=cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT + ) with process.stdout: log_subprocess_output(logger, process.stdout) exitcode = process.wait() if exitcode != 0: raise Exception(process.stderr) except Exception as e: - stop_err(logger, 'Error in running bismark!\n%s' % e) + stop_err(logger, "Error in running bismark!\n%s" % e) # collect and copy output files if args.output_report_file: - output_report_file = open(args.output_report_file, 'w+') - for line in fileinput.input(glob(os.path.join(output_dir, '*report.txt'))): + output_report_file = open(args.output_report_file, "w+") + for line in fileinput.input(glob(os.path.join(output_dir, "*report.txt"))): output_report_file.write(line) output_report_file.close() if args.output_suppressed_reads: - if glob(os.path.join(output_dir, '*ambiguous_reads.*')): - shutil.move(glob(os.path.join(output_dir, '*ambiguous_reads.*'))[0], args.output_suppressed_reads) + if glob(os.path.join(output_dir, "*ambiguous_reads.*")): + shutil.move( + glob(os.path.join(output_dir, "*ambiguous_reads.*"))[0], + args.output_suppressed_reads, + ) if args.output_suppressed_reads_l: - if glob(os.path.join(output_dir, '*ambiguous_reads_1.*')): - shutil.move(glob(os.path.join(output_dir, '*ambiguous_reads_1.*'))[0], args.output_suppressed_reads_l) + if glob(os.path.join(output_dir, "*ambiguous_reads_1.*")): + shutil.move( + glob(os.path.join(output_dir, "*ambiguous_reads_1.*"))[0], + args.output_suppressed_reads_l, + ) if args.output_suppressed_reads_r: - if glob(os.path.join(output_dir, '*ambiguous_reads_2.*')): - shutil.move(glob(os.path.join(output_dir, '*ambiguous_reads_2.*'))[0], args.output_suppressed_reads_r) + if glob(os.path.join(output_dir, "*ambiguous_reads_2.*")): + shutil.move( + glob(os.path.join(output_dir, "*ambiguous_reads_2.*"))[0], + args.output_suppressed_reads_r, + ) if args.output_unmapped_reads: - if glob(os.path.join(output_dir, '*unmapped_reads.*')): - shutil.move(glob(os.path.join(output_dir, '*unmapped_reads.*'))[0], args.output_unmapped_reads) + if glob(os.path.join(output_dir, "*unmapped_reads.*")): + shutil.move( + glob(os.path.join(output_dir, "*unmapped_reads.*"))[0], + args.output_unmapped_reads, + ) if args.output_unmapped_reads_l: - if glob(os.path.join(output_dir, '*unmapped_reads_1.*')): - shutil.move(glob(os.path.join(output_dir, '*unmapped_reads_1.*'))[0], args.output_unmapped_reads_l) + if glob(os.path.join(output_dir, "*unmapped_reads_1.*")): + shutil.move( + glob(os.path.join(output_dir, "*unmapped_reads_1.*"))[0], + args.output_unmapped_reads_l, + ) if args.output_unmapped_reads_r: - if glob(os.path.join(output_dir, '*unmapped_reads_2.*')): - shutil.move(glob(os.path.join(output_dir, '*unmapped_reads_2.*'))[0], args.output_unmapped_reads_r) + if glob(os.path.join(output_dir, "*unmapped_reads_2.*")): + shutil.move( + glob(os.path.join(output_dir, "*unmapped_reads_2.*"))[0], + args.output_unmapped_reads_r, + ) try: # merge all bam files tmp_res = tempfile.NamedTemporaryFile(dir=output_dir).name - bam_files = glob(os.path.join(output_dir, '*.bam')) + bam_files = glob(os.path.join(output_dir, "*.bam")) if len(bam_files) > 1: - cmd = ['samtools', 'merge', '-@', str(args.num_threads), '-f', tmp_res, ' '.join(bam_files)] + cmd = [ + "samtools", + "merge", + "-@", + str(args.num_threads), + "-f", + tmp_res + ] + [cmd.append(x) for x in bam_files] logger.info("Merging bams with: '%s'", cmd) - process = subprocess.Popen(args=cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + process = subprocess.Popen( + args=cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT + ) with process.stdout: log_subprocess_output(logger, process.stdout) exitcode = process.wait() @@ -268,22 +385,31 @@ if os.path.exists(bam_path): if args.sort_bam: - cmd = ['samtools', 'sort', '-@', str(args.num_threads), bam_path, 'sorted_bam'] + cmd = [ + "samtools", + "sort", + "-@", + str(args.num_threads), + bam_path, + "sorted_bam", + ] logger.info("Sorting bam with: '%s'", cmd) - process = subprocess.Popen(args=cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + process = subprocess.Popen( + args=cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT + ) with process.stdout: log_subprocess_output(logger, process.stdout) exitcode = process.wait() if exitcode != 0: raise Exception(process.stderr) - shutil.move('sorted_bam.bam', args.output) + shutil.move("sorted_bam.bam", args.output) else: shutil.move(bam_path, args.output) else: - stop_err(logger, 'BAM file no found:\n%s' % bam_path) + stop_err(logger, "BAM file no found:\n%s" % bam_path) except Exception as e: - stop_err(logger, 'Error in merging bam files!\n%s' % e) + stop_err(logger, "Error in merging bam files!\n%s" % e) # Clean up temp dirs if tmp_index_dir and os.path.exists(tmp_index_dir): @@ -294,4 +420,5 @@ shutil.rmtree(output_dir) -if __name__ == "__main__": __main__() +if __name__ == "__main__": + __main__()