Mercurial > repos > pjbriggs > amplicon_analysis_pipeline
view amplicon_analysis_pipeline.py @ 4:86a12d75ebe4 draft default tip
planemo upload for repository https://github.com/pjbriggs/Amplicon_analysis-galaxy commit 7be61b7ed35ca3deaad68d2eae384c8cd365bcb8
author | pjbriggs |
---|---|
date | Fri, 20 Dec 2019 06:59:49 -0500 |
parents | 3ab198df8f3f |
children |
line wrap: on
line source
#!/usr/bin/env python # # Wrapper script to run Amplicon_analysis_pipeline.sh # from Galaxy tool import sys import os import argparse import subprocess import glob class PipelineCmd(object): def __init__(self,cmd): self.cmd = [str(cmd)] def add_args(self,*args): for arg in args: self.cmd.append(str(arg)) def __repr__(self): return ' '.join([str(arg) for arg in self.cmd]) def ahref(target,name=None,type=None): if name is None: name = os.path.basename(target) ahref = "<a href='%s'" % target if type is not None: ahref += " type='%s'" % type ahref += ">%s</a>" % name return ahref def check_errors(): # Errors in Amplicon_analysis_pipeline.log with open('Amplicon_analysis_pipeline.log','r') as pipeline_log: log = pipeline_log.read() if "Names in the first column of Metatable.txt and in the second column of Final_name.txt do not match" in log: print_error("""*** Sample IDs don't match dataset names *** The sample IDs (first column of the Metatable file) don't match the supplied sample names for the input Fastq pairs. """) # Errors in pipeline output with open('pipeline.log','r') as pipeline_log: log = pipeline_log.read() if "Errors and/or warnings detected in mapping file" in log: with open("Metatable_log/Metatable.log","r") as metatable_log: # Echo the Metatable log file to the tool log print_error("""*** Error in Metatable mapping file *** %s""" % metatable_log.read()) elif "No header line was found in mapping file" in log: # Report error to the tool log print_error("""*** No header in Metatable mapping file *** Check you've specified the correct file as the input Metatable""") def print_error(message): width = max([len(line) for line in message.split('\n')]) + 4 sys.stderr.write("\n%s\n" % ('*'*width)) for line in message.split('\n'): sys.stderr.write("* %s%s *\n" % (line,' '*(width-len(line)-4))) sys.stderr.write("%s\n\n" % ('*'*width)) def clean_up_name(sample): # Remove extensions and trailing "_L[0-9]+_001" from # Fastq pair names sample_name = '.'.join(sample.split('.')[:1]) split_name = sample_name.split('_') if split_name[-1] == "001": split_name = split_name[:-1] if split_name[-1].startswith('L'): try: int(split_name[-1][1:]) split_name = split_name[:-1] except ValueError: pass return '_'.join(split_name) def list_outputs(filen=None): # List the output directory contents # If filen is specified then will be the filename to # write to, otherwise write to stdout if filen is not None: fp = open(filen,'w') else: fp = sys.stdout results_dir = os.path.abspath("RESULTS") fp.write("Listing contents of output dir %s:\n" % results_dir) ix = 0 for d,dirs,files in os.walk(results_dir): ix += 1 fp.write("-- %d: %s\n" % (ix, os.path.relpath(d,results_dir))) for f in files: ix += 1 fp.write("---- %d: %s\n" % (ix, os.path.relpath(f,results_dir))) # Close output file if filen is not None: fp.close() if __name__ == "__main__": # Command line print "Amplicon analysis: starting" p = argparse.ArgumentParser() p.add_argument("metatable", metavar="METATABLE_FILE", help="Metatable.txt file") p.add_argument("fastq_pairs", metavar="SAMPLE_NAME FQ_R1 FQ_R2", nargs="+", default=list(), help="Triplets of SAMPLE_NAME followed by " "a R1/R2 FASTQ file pair") p.add_argument("-g",dest="forward_pcr_primer") p.add_argument("-G",dest="reverse_pcr_primer") p.add_argument("-q",dest="trimming_threshold") p.add_argument("-O",dest="minimum_overlap") p.add_argument("-L",dest="minimum_length") p.add_argument("-l",dest="sliding_window_length") p.add_argument("-P",dest="pipeline", choices=["Vsearch","DADA2"], type=str, default="Vsearch") p.add_argument("-S",dest="use_silva",action="store_true") p.add_argument("-H",dest="use_homd",action="store_true") p.add_argument("-r",dest="reference_data_path") p.add_argument("-c",dest="categories_file") args = p.parse_args() # Build the environment for running the pipeline print "Amplicon analysis: building the environment" metatable_file = os.path.abspath(args.metatable) os.symlink(metatable_file,"Metatable.txt") print "-- made symlink to Metatable.txt" # Link to Categories.txt file (if provided) if args.categories_file is not None: categories_file = os.path.abspath(args.categories_file) os.symlink(categories_file,"Categories.txt") print "-- made symlink to Categories.txt" # Link to FASTQs and construct Final_name.txt file sample_names = [] print "-- making Final_name.txt" with open("Final_name.txt",'w') as final_name: fastqs = iter(args.fastq_pairs) for sample_name,fqr1,fqr2 in zip(fastqs,fastqs,fastqs): sample_name = clean_up_name(sample_name) print " %s" % sample_name r1 = "%s_R1_.fastq" % sample_name r2 = "%s_R2_.fastq" % sample_name os.symlink(fqr1,r1) os.symlink(fqr2,r2) final_name.write("%s\n" % '\t'.join((r1,sample_name))) final_name.write("%s\n" % '\t'.join((r2,sample_name))) sample_names.append(sample_name) # Reference database if args.pipeline == "Vsearch": if args.use_silva: ref_database = "silva" elif args.use_homd: ref_database = "homd" else: ref_database = "gg" elif args.pipeline == "DADA2": ref_database = "silva" # Construct the pipeline command print "Amplicon analysis: constructing pipeline command" pipeline = PipelineCmd("Amplicon_analysis_pipeline.sh") if args.forward_pcr_primer: pipeline.add_args("-g",args.forward_pcr_primer) if args.reverse_pcr_primer: pipeline.add_args("-G",args.reverse_pcr_primer) if args.trimming_threshold: pipeline.add_args("-q",args.trimming_threshold) if args.minimum_overlap: pipeline.add_args("-O",args.minimum_overlap) if args.minimum_length: pipeline.add_args("-L",args.minimum_length) if args.sliding_window_length: pipeline.add_args("-l",args.sliding_window_length) if args.reference_data_path: pipeline.add_args("-r",args.reference_data_path) pipeline.add_args("-P",args.pipeline) if args.pipeline == "Vsearch": if ref_database == "silva": pipeline.add_args("-S") elif ref_database == "homd": pipeline.add_args("-H") # Echo the pipeline command to stdout print "Running %s" % pipeline # Run the pipeline with open("pipeline.log","w") as pipeline_out: try: subprocess.check_call(pipeline.cmd, stdout=pipeline_out, stderr=subprocess.STDOUT) exit_code = 0 print "Pipeline completed ok" except subprocess.CalledProcessError as ex: # Non-zero exit status sys.stderr.write("Pipeline failed: exit code %s\n" % ex.returncode) exit_code = ex.returncode except Exception as ex: # Some other problem sys.stderr.write("Unexpected error: %s\n" % str(ex)) exit_code = 1 # Write out the list of outputs outputs_file = "Pipeline_outputs.txt" list_outputs(outputs_file) # Check for log file log_file = "Amplicon_analysis_pipeline.log" if os.path.exists(log_file): print "Found log file: %s" % log_file if exit_code == 0: # Create an HTML file to link to log files etc # NB the paths to the files should be correct once # copied by Galaxy on job completion with open("pipeline_outputs.html","w") as html_out: html_out.write("""<html> <head> <title>Amplicon analysis pipeline: log files</title> <head> <body> <h1>Amplicon analysis pipeline: log files</h1> <ul> """) html_out.write( "<li>%s</li>\n" % ahref("Amplicon_analysis_pipeline.log", type="text/plain")) html_out.write( "<li>%s</li>\n" % ahref("pipeline.log",type="text/plain")) html_out.write( "<li>%s</li>\n" % ahref("Pipeline_outputs.txt", type="text/plain")) html_out.write( "<li>%s</li>\n" % ahref("Metatable.html")) html_out.write("""<ul> </body> </html> """) else: # Check for known error messages check_errors() # Write pipeline stdout to tool stderr sys.stderr.write("\nOutput from pipeline:\n") with open("pipeline.log",'r') as log: sys.stderr.write("%s" % log.read()) # Write log file contents to tool log print "\nAmplicon_analysis_pipeline.log:" with open(log_file,'r') as log: print "%s" % log.read() else: sys.stderr.write("ERROR missing log file \"%s\"\n" % log_file) # Handle FastQC boxplots print "Amplicon analysis: collating per base quality boxplots" with open("fastqc_quality_boxplots.html","w") as quality_boxplots: # PHRED value for trimming phred_score = 20 if args.trimming_threshold is not None: phred_score = args.trimming_threshold # Write header for HTML output file quality_boxplots.write("""<html> <head> <title>Amplicon analysis pipeline: Per-base Quality Boxplots (FastQC)</title> <head> <body> <h1>Amplicon analysis pipeline: Per-base Quality Boxplots (FastQC)</h1> """) # Look for raw and trimmed FastQC output for each sample for sample_name in sample_names: # Replace underscores with hyphens in sample names sample_name = sample_name.replace('_','-') # Write HTML file with links to the FastQC boxplots fastqc_dir = os.path.join(sample_name,"FastQC") quality_boxplots.write("<h2>%s</h2>" % sample_name) for d in ("Raw","cutdapt_sickle/Q%s" % phred_score): quality_boxplots.write("<h3>%s</h3>" % d) fastqc_html_files = glob.glob( os.path.join(fastqc_dir,d,"*_fastqc.html")) if not fastqc_html_files: quality_boxplots.write("<p>No FastQC outputs found</p>") continue # Pull out the per-base quality boxplots for f in fastqc_html_files: boxplot = None with open(f) as fp: for line in fp.read().split(">"): try: line.index("alt=\"Per base quality graph\"") boxplot = line + ">" break except ValueError: pass if boxplot is None: boxplot = "Missing plot" quality_boxplots.write("<h4>%s</h4><p>%s</p>" % (os.path.basename(f), boxplot)) quality_boxplots.write("""</body> </html> """) # Handle DADA2 error rate plot PDFs if args.pipeline == "DADA2": print("Amplicon analysis: collecting error rate plots") error_rate_plots_dir = os.path.abspath( os.path.join("DADA2_OTU_tables", "Error_rate_plots")) error_rate_plot_pdfs = [os.path.basename(pdf) for pdf in sorted(glob.glob( os.path.join(error_rate_plots_dir,"*.pdf")))] error_rate_plots_html = os.path.join(error_rate_plots_dir, "error_rate_plots.html") with open(error_rate_plots_html,"w") as error_rate_plots_out: error_rate_plots_out.write("""<html> <head> <title>Amplicon analysis pipeline: DADA2 Error Rate Plots</title> <head> <body> <h1>Amplicon analysis pipeline: DADA2 Error Rate Plots</h1> """) error_rate_plots_out.write("<ul>\n") for pdf in error_rate_plot_pdfs: error_rate_plots_out.write("<li>%s</li>\n" % ahref(pdf)) error_rate_plots_out.write("<ul>\n") error_rate_plots_out.write("""</body> </html> """) # Handle additional output when categories file was supplied if args.categories_file is not None: # Alpha diversity boxplots print "Amplicon analysis: indexing alpha diversity boxplots" boxplots_dir = os.path.abspath( os.path.join("RESULTS", "%s_%s" % (args.pipeline, ref_database), "Alpha_diversity", "Alpha_diversity_boxplot", "Categories_shannon")) print "Amplicon analysis: gathering PDFs from %s" % boxplots_dir boxplot_pdfs = [os.path.basename(pdf) for pdf in sorted(glob.glob( os.path.join(boxplots_dir,"*.pdf")))] with open("alpha_diversity_boxplots.html","w") as boxplots_out: boxplots_out.write("""<html> <head> <title>Amplicon analysis pipeline: Alpha Diversity Boxplots (Shannon)</title> <head> <body> <h1>Amplicon analysis pipeline: Alpha Diversity Boxplots (Shannon)</h1> """) boxplots_out.write("<ul>\n") for pdf in boxplot_pdfs: boxplots_out.write("<li>%s</li>\n" % ahref(pdf)) boxplots_out.write("<ul>\n") boxplots_out.write("""</body> </html> """) # Finish print "Amplicon analysis: finishing, exit code: %s" % exit_code sys.exit(exit_code)