Mercurial > repos > pjbriggs > amplicon_analysis_pipeline

diff amplicon_analysis_pipeline.py @ 0:47ec9c6f44b8 draft
planemo upload for repository https://github.com/pjbriggs/Amplicon_analysis-galaxy commit b63924933a03255872077beb4d0fde49d77afa92
author: pjbriggs
date: Thu, 09 Nov 2017 10:13:29 -0500
children: 1c1902e12caf
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/amplicon_analysis_pipeline.py	Thu Nov 09 10:13:29 2017 -0500
@@ -0,0 +1,329 @@
+#!/usr/bin/env python
+#
+# Wrapper script to run Amplicon_analysis_pipeline.sh
+# from Galaxy tool
+
+import sys
+import os
+import argparse
+import subprocess
+import glob
+
+class PipelineCmd(object):
+    def __init__(self,cmd):
+        self.cmd = [str(cmd)]
+    def add_args(self,*args):
+        for arg in args:
+            self.cmd.append(str(arg))
+    def __repr__(self):
+        return ' '.join([str(arg) for arg in self.cmd])
+
+def ahref(target,name=None,type=None):
+    if name is None:
+        name = os.path.basename(target)
+    ahref = "<a href='%s'" % target
+    if type is not None:
+        ahref += " type='%s'" % type
+    ahref += ">%s</a>" % name
+    return ahref
+
+def check_errors():
+    # Errors in Amplicon_analysis_pipeline.log
+    with open('Amplicon_analysis_pipeline.log','r') as pipeline_log:
+        log = pipeline_log.read()
+        if "Names in the first column of Metatable.txt and in the second column of Final_name.txt do not match" in log:
+            print_error("""*** Sample IDs don't match dataset names ***
+
+The sample IDs (first column of the Metatable file) don't match the
+supplied sample names for the input Fastq pairs.
+""")
+    # Errors in pipeline output
+    with open('pipeline.log','r') as pipeline_log:
+        log = pipeline_log.read()
+        if "Errors and/or warnings detected in mapping file" in log:
+            with open("Metatable_log/Metatable.log","r") as metatable_log:
+                # Echo the Metatable log file to the tool log
+                print_error("""*** Error in Metatable mapping file ***
+
+%s""" % metatable_log.read())
+        elif "No header line was found in mapping file" in log:
+            # Report error to the tool log
+            print_error("""*** No header in Metatable mapping file ***
+
+Check you've specified the correct file as the input Metatable""")
+
+def print_error(message):
+    width = max([len(line) for line in message.split('\n')]) + 4
+    sys.stderr.write("\n%s\n" % ('*'*width))
+    for line in message.split('\n'):
+        sys.stderr.write("* %s%s *\n" % (line,' '*(width-len(line)-4)))
+    sys.stderr.write("%s\n\n" % ('*'*width))
+
+def clean_up_name(sample):
+    # Remove trailing "_L[0-9]+_001" from Fastq
+    # pair names
+    split_name = sample.split('_')
+    if split_name[-1] == "001":
+        split_name = split_name[:-1]
+    if split_name[-1].startswith('L'):
+        try:
+            int(split_name[-1][1:])
+            split_name = split_name[:-1]
+        except ValueError:
+            pass
+    return '_'.join(split_name)
+
+def list_outputs(filen=None):
+    # List the output directory contents
+    # If filen is specified then will be the filename to
+    # write to, otherwise write to stdout
+    if filen is not None:
+        fp = open(filen,'w')
+    else:
+        fp = sys.stdout
+    results_dir = os.path.abspath("RESULTS")
+    fp.write("Listing contents of output dir %s:\n" % results_dir)
+    ix = 0
+    for d,dirs,files in os.walk(results_dir):
+        ix += 1
+        fp.write("-- %d: %s\n" % (ix,
+                                  os.path.relpath(d,results_dir)))
+        for f in files:
+            ix += 1
+            fp.write("---- %d: %s\n" % (ix,
+                                        os.path.relpath(f,results_dir)))
+    # Close output file
+    if filen is not None:
+        fp.close()
+
+if __name__ == "__main__":
+    # Command line
+    print "Amplicon analysis: starting"
+    p = argparse.ArgumentParser()
+    p.add_argument("metatable",
+                   metavar="METATABLE_FILE",
+                   help="Metatable.txt file")
+    p.add_argument("fastq_pairs",
+                   metavar="SAMPLE_NAME FQ_R1 FQ_R2",
+                   nargs="+",
+                   default=list(),
+                   help="Triplets of SAMPLE_NAME followed by "
+                   "a R1/R2 FASTQ file pair")
+    p.add_argument("-g",dest="forward_pcr_primer")
+    p.add_argument("-G",dest="reverse_pcr_primer")
+    p.add_argument("-q",dest="trimming_threshold")
+    p.add_argument("-O",dest="minimum_overlap")
+    p.add_argument("-L",dest="minimum_length")
+    p.add_argument("-l",dest="sliding_window_length")
+    p.add_argument("-P",dest="pipeline",
+                   choices=["vsearch","uparse","qiime"],
+                   type=str.lower,
+                   default="vsearch")
+    p.add_argument("-S",dest="use_silva",action="store_true")
+    p.add_argument("-r",dest="reference_data_path")
+    p.add_argument("-c",dest="categories_file")
+    args = p.parse_args()
+
+    # Build the environment for running the pipeline
+    print "Amplicon analysis: building the environment"
+    metatable_file = os.path.abspath(args.metatable)
+    os.symlink(metatable_file,"Metatable.txt")
+    print "-- made symlink to Metatable.txt"
+
+    # Link to Categories.txt file (if provided)
+    if args.categories_file is not None:
+        categories_file = os.path.abspath(args.categories_file)
+        os.symlink(categories_file,"Categories.txt")
+        print "-- made symlink to Categories.txt"
+
+    # Link to FASTQs and construct Final_name.txt file
+    sample_names = []
+    with open("Final_name.txt",'w') as final_name:
+        fastqs = iter(args.fastq_pairs)
+        for sample_name,fqr1,fqr2 in zip(fastqs,fastqs,fastqs):
+            sample_name = clean_up_name(sample_name)
+            r1 = "%s_R1_.fastq" % sample_name
+            r2 = "%s_R2_.fastq" % sample_name
+            os.symlink(fqr1,r1)
+            os.symlink(fqr2,r2)
+            final_name.write("%s\n" % '\t'.join((r1,sample_name)))
+            final_name.write("%s\n" % '\t'.join((r2,sample_name)))
+            sample_names.append(sample_name)
+
+    # Construct the pipeline command
+    print "Amplicon analysis: constructing pipeline command"
+    pipeline = PipelineCmd("Amplicon_analysis_pipeline.sh")
+    if args.forward_pcr_primer:
+        pipeline.add_args("-g",args.forward_pcr_primer)
+    if args.reverse_pcr_primer:
+        pipeline.add_args("-G",args.reverse_pcr_primer)
+    if args.trimming_threshold:
+        pipeline.add_args("-q",args.trimming_threshold)
+    if args.minimum_overlap:
+        pipeline.add_args("-O",args.minimum_overlap)
+    if args.minimum_length:
+        pipeline.add_args("-L",args.minimum_length)
+    if args.sliding_window_length:
+        pipeline.add_args("-l",args.sliding_window_length)
+    if args.reference_data_path:
+        pipeline.add_args("-r",args.reference_data_path)
+    pipeline.add_args("-P",args.pipeline)
+    if args.use_silva:
+        pipeline.add_args("-S")
+
+    # Echo the pipeline command to stdout
+    print "Running %s" % pipeline
+
+    # Run the pipeline
+    with open("pipeline.log","w") as pipeline_out:
+        try:
+            subprocess.check_call(pipeline.cmd,
+                                  stdout=pipeline_out,
+                                  stderr=subprocess.STDOUT)
+            exit_code = 0
+            print "Pipeline completed ok"
+        except subprocess.CalledProcessError as ex:
+            # Non-zero exit status
+            sys.stderr.write("Pipeline failed: exit code %s\n" %
+                             ex.returncode)
+            exit_code = ex.returncode
+        except Exception as ex:
+            # Some other problem
+            sys.stderr.write("Unexpected error: %s\n" % str(ex))
+
+    # Write out the list of outputs
+    outputs_file = "Pipeline_outputs.txt"
+    list_outputs(outputs_file)
+
+    # Check for log file
+    log_file = "Amplicon_analysis_pipeline.log"
+    if os.path.exists(log_file):
+        print "Found log file: %s" % log_file
+        if exit_code == 0:
+            # Create an HTML file to link to log files etc
+            # NB the paths to the files should be correct once
+            # copied by Galaxy on job completion
+            with open("pipeline_outputs.html","w") as html_out:
+                html_out.write("""<html>
+<head>
+<title>Amplicon analysis pipeline: log files</title>
+<head>
+<body>
+<h1>Amplicon analysis pipeline: log files</h1>
+<ul>
+""")
+                html_out.write(
+                    "<li>%s</li>\n" %
+                    ahref("Amplicon_analysis_pipeline.log",
+                          type="text/plain"))
+                html_out.write(
+                    "<li>%s</li>\n" %
+                    ahref("pipeline.log",type="text/plain"))
+                html_out.write(
+                    "<li>%s</li>\n" %
+                    ahref("Pipeline_outputs.txt",
+                          type="text/plain"))
+                html_out.write(
+                    "<li>%s</li>\n" %
+                    ahref("Metatable.html"))
+                html_out.write("""<ul>
+</body>
+</html>
+""")
+        else:
+            # Check for known error messages
+            check_errors()
+            # Write pipeline stdout to tool stderr
+            sys.stderr.write("\nOutput from pipeline:\n")
+            with open("pipeline.log",'r') as log:
+                sys.stderr.write("%s" % log.read())
+            # Write log file contents to tool log
+            print "\nAmplicon_analysis_pipeline.log:"
+            with open(log_file,'r') as log:
+                print "%s" % log.read()
+    else:
+        sys.stderr.write("ERROR missing log file \"%s\"\n" %
+                         log_file)
+
+    # Handle FastQC boxplots
+    print "Amplicon analysis: collating per base quality boxplots"
+    with open("fastqc_quality_boxplots.html","w") as quality_boxplots:
+        # PHRED value for trimming
+        phred_score = 20
+        if args.trimming_threshold is not None:
+            phred_score = args.trimming_threshold
+        # Write header for HTML output file
+        quality_boxplots.write("""<html>
+<head>
+<title>Amplicon analysis pipeline: Per-base Quality Boxplots (FastQC)</title>
+<head>
+<body>
+<h1>Amplicon analysis pipeline: Per-base Quality Boxplots (FastQC)</h1>
+""")
+        # Look for raw and trimmed FastQC output for each sample
+        for sample_name in sample_names:
+            fastqc_dir = os.path.join(sample_name,"FastQC")
+            quality_boxplots.write("<h2>%s</h2>" % sample_name)
+            for d in ("Raw","cutdapt_sickle/Q%s" % phred_score):
+                quality_boxplots.write("<h3>%s</h3>" % d)
+                fastqc_html_files = glob.glob(
+                    os.path.join(fastqc_dir,d,"*_fastqc.html"))
+                if not fastqc_html_files:
+                    quality_boxplots.write("<p>No FastQC outputs found</p>")
+                    continue
+                # Pull out the per-base quality boxplots
+                for f in fastqc_html_files:
+                    boxplot = None
+                    with open(f) as fp:
+                        for line in fp.read().split(">"):
+                            try:
+                                line.index("alt=\"Per base quality graph\"")
+                                boxplot = line + ">"
+                                break
+                            except ValueError:
+                                pass
+                    if boxplot is None:
+                        boxplot = "Missing plot"
+                    quality_boxplots.write("<h4>%s</h4><p>%s</p>" %
+                                           (os.path.basename(f),
+                                            boxplot))
+            quality_boxplots.write("""</body>
+</html>
+""")
+
+    # Handle additional output when categories file was supplied
+    if args.categories_file is not None:
+        # Alpha diversity boxplots
+        print "Amplicon analysis: indexing alpha diversity boxplots"
+        boxplots_dir = os.path.abspath(
+            os.path.join("RESULTS",
+                         "%s_%s" % (args.pipeline.title(),
+                                    ("gg" if not args.use_silva
+                                     else "silva")),
+                         "Alpha_diversity",
+                         "Alpha_diversity_boxplot",
+                         "Categories_shannon"))
+        print "Amplicon analysis: gathering PDFs from %s" % boxplots_dir
+        boxplot_pdfs = [os.path.basename(pdf)
+                        for pdf in
+                        sorted(glob.glob(
+                            os.path.join(boxplots_dir,"*.pdf")))]
+        with open("alpha_diversity_boxplots.html","w") as boxplots_out:
+            boxplots_out.write("""<html>
+<head>
+<title>Amplicon analysis pipeline: Alpha Diversity Boxplots (Shannon)</title>
+<head>
+<body>
+<h1>Amplicon analysis pipeline: Alpha Diversity Boxplots (Shannon)</h1>
+""")
+            boxplots_out.write("<ul>\n")
+            for pdf in boxplot_pdfs:
+                boxplots_out.write("<li>%s</li>\n" % ahref(pdf))
+            boxplots_out.write("<ul>\n")
+            boxplots_out.write("""</body>
+</html>
+""")
+
+    # Finish
+    print "Amplicon analysis: finishing, exit code: %s" % exit_code
+    sys.exit(exit_code)
author	pjbriggs
date	Thu, 09 Nov 2017 10:13:29 -0500
parents
children	1c1902e12caf