comparison amplicon_analysis_pipeline.py @ 0:47ec9c6f44b8 draft

planemo upload for repository https://github.com/pjbriggs/Amplicon_analysis-galaxy commit b63924933a03255872077beb4d0fde49d77afa92
author pjbriggs
date Thu, 09 Nov 2017 10:13:29 -0500
parents
children 1c1902e12caf
comparison
equal deleted inserted replaced
-1:000000000000 0:47ec9c6f44b8
1 #!/usr/bin/env python
2 #
3 # Wrapper script to run Amplicon_analysis_pipeline.sh
4 # from Galaxy tool
5
6 import sys
7 import os
8 import argparse
9 import subprocess
10 import glob
11
class PipelineCmd(object):
    """Builder for the pipeline command line.

    Accumulates the executable name plus arguments as a list of
    strings (``self.cmd``) suitable for passing to subprocess.
    """
    def __init__(self, cmd):
        # Seed the command list with the executable name
        self.cmd = [str(cmd)]

    def add_args(self, *args):
        """Append one or more arguments (stringified) to the command."""
        self.cmd.extend(str(a) for a in args)

    def __repr__(self):
        # Space-separated command line, as it would appear in a shell
        return ' '.join(self.cmd)
20
def ahref(target,name=None,type=None):
    """Return an HTML <a> anchor element linking to 'target'.

    Arguments:
      target: URL/path to link to
      name: optional link text (defaults to the basename of 'target')
      type: optional MIME type to set as the anchor's 'type' attribute

    Returns the anchor element as a string.
    """
    if name is None:
        name = os.path.basename(target)
    # Use a distinct local name: the original reused 'ahref', shadowing
    # the function itself inside its own body
    link = "<a href='%s'" % target
    if type is not None:
        link += " type='%s'" % type
    link += ">%s</a>" % name
    return link
29
def check_errors():
    """Scan the pipeline log files for known error conditions.

    Reads 'Amplicon_analysis_pipeline.log' and 'pipeline.log' from the
    current directory and reports any recognised failure via
    print_error(); unrecognised content is ignored.
    """
    # Known failure messages in Amplicon_analysis_pipeline.log
    with open('Amplicon_analysis_pipeline.log','r') as fp:
        pipeline_log_text = fp.read()
    if "Names in the first column of Metatable.txt and in the second column of Final_name.txt do not match" in pipeline_log_text:
        print_error("""*** Sample IDs don't match dataset names ***

The sample IDs (first column of the Metatable file) don't match the
supplied sample names for the input Fastq pairs.
""")
    # Known failure messages in the captured pipeline stdout/stderr
    with open('pipeline.log','r') as fp:
        captured = fp.read()
    if "Errors and/or warnings detected in mapping file" in captured:
        # Echo the Metatable log file to the tool log
        with open("Metatable_log/Metatable.log","r") as metatable_fp:
            print_error("""*** Error in Metatable mapping file ***

%s""" % metatable_fp.read())
    elif "No header line was found in mapping file" in captured:
        # Report error to the tool log
        print_error("""*** No header in Metatable mapping file ***

Check you've specified the correct file as the input Metatable""")
54
def print_error(message):
    """Write 'message' to stderr framed inside a box of asterisks."""
    lines = message.split('\n')
    # Box width: longest line plus the "* " / " *" frame characters
    width = max(len(line) for line in lines) + 4
    border = '*' * width
    sys.stderr.write("\n%s\n" % border)
    for line in lines:
        padding = ' ' * (width - len(line) - 4)
        sys.stderr.write("* %s%s *\n" % (line, padding))
    sys.stderr.write("%s\n\n" % border)
61
def clean_up_name(sample):
    """Strip a trailing Illumina-style "_L[0-9]+_001" suffix from a
    Fastq pair name.

    e.g. "SAMPLE_L001_001" -> "SAMPLE"; names without the suffix are
    returned unchanged.
    """
    split_name = sample.split('_')
    # Drop a trailing "001" read-set field
    # (guard against the list emptying: the original raised IndexError
    # on inputs like "001")
    if split_name and split_name[-1] == "001":
        split_name = split_name[:-1]
    # Drop a trailing lane field of the form "L<number>" (e.g. "L001")
    if split_name and split_name[-1].startswith('L'):
        try:
            int(split_name[-1][1:])
            split_name = split_name[:-1]
        except ValueError:
            # Not a lane number (e.g. "Library") - keep it
            pass
    return '_'.join(split_name)
75
def list_outputs(filen=None,results_dir="RESULTS"):
    """Write a numbered listing of the pipeline output directory.

    Arguments:
      filen: optional path of a file to write the listing to;
        if not supplied the listing is written to stdout
      results_dir: directory to list (defaults to "RESULTS" in the
        current directory, matching the original hard-coded behaviour)
    """
    if filen is not None:
        fp = open(filen,'w')
    else:
        fp = sys.stdout
    try:
        results_dir = os.path.abspath(results_dir)
        fp.write("Listing contents of output dir %s:\n" % results_dir)
        ix = 0
        for d,dirs,files in os.walk(results_dir):
            ix += 1
            fp.write("-- %d: %s\n" % (ix,
                                      os.path.relpath(d,results_dir)))
            for f in files:
                ix += 1
                # NB relpath needs the full path to the file: the
                # original passed the bare filename, which produced a
                # bogus relative path
                fp.write("---- %d: %s\n" %
                         (ix,
                          os.path.relpath(os.path.join(d,f),
                                          results_dir)))
    finally:
        # Only close a file we opened ourselves (never stdout)
        if filen is not None:
            fp.close()
98
if __name__ == "__main__":
    # Parse the command line
    print("Amplicon analysis: starting")
    p = argparse.ArgumentParser()
    p.add_argument("metatable",
                   metavar="METATABLE_FILE",
                   help="Metatable.txt file")
    p.add_argument("fastq_pairs",
                   metavar="SAMPLE_NAME FQ_R1 FQ_R2",
                   nargs="+",
                   default=list(),
                   help="Triplets of SAMPLE_NAME followed by "
                   "a R1/R2 FASTQ file pair")
    p.add_argument("-g",dest="forward_pcr_primer")
    p.add_argument("-G",dest="reverse_pcr_primer")
    p.add_argument("-q",dest="trimming_threshold")
    p.add_argument("-O",dest="minimum_overlap")
    p.add_argument("-L",dest="minimum_length")
    p.add_argument("-l",dest="sliding_window_length")
    p.add_argument("-P",dest="pipeline",
                   choices=["vsearch","uparse","qiime"],
                   type=str.lower,
                   default="vsearch")
    p.add_argument("-S",dest="use_silva",action="store_true")
    p.add_argument("-r",dest="reference_data_path")
    p.add_argument("-c",dest="categories_file")
    args = p.parse_args()

    # Build the environment for running the pipeline
    print("Amplicon analysis: building the environment")
    metatable_file = os.path.abspath(args.metatable)
    os.symlink(metatable_file,"Metatable.txt")
    print("-- made symlink to Metatable.txt")

    # Link to Categories.txt file (if provided)
    if args.categories_file is not None:
        categories_file = os.path.abspath(args.categories_file)
        os.symlink(categories_file,"Categories.txt")
        print("-- made symlink to Categories.txt")

    # Link to FASTQs and construct Final_name.txt file
    sample_names = []
    with open("Final_name.txt",'w') as final_name:
        fastqs = iter(args.fastq_pairs)
        for sample_name,fqr1,fqr2 in zip(fastqs,fastqs,fastqs):
            sample_name = clean_up_name(sample_name)
            r1 = "%s_R1_.fastq" % sample_name
            r2 = "%s_R2_.fastq" % sample_name
            # Absolutize the targets (as already done for the
            # Metatable link) so the symlinks stay valid regardless
            # of how the FASTQ paths were supplied
            os.symlink(os.path.abspath(fqr1),r1)
            os.symlink(os.path.abspath(fqr2),r2)
            final_name.write("%s\n" % '\t'.join((r1,sample_name)))
            final_name.write("%s\n" % '\t'.join((r2,sample_name)))
            sample_names.append(sample_name)

    # Construct the pipeline command
    print("Amplicon analysis: constructing pipeline command")
    pipeline = PipelineCmd("Amplicon_analysis_pipeline.sh")
    if args.forward_pcr_primer:
        pipeline.add_args("-g",args.forward_pcr_primer)
    if args.reverse_pcr_primer:
        pipeline.add_args("-G",args.reverse_pcr_primer)
    if args.trimming_threshold:
        pipeline.add_args("-q",args.trimming_threshold)
    if args.minimum_overlap:
        pipeline.add_args("-O",args.minimum_overlap)
    if args.minimum_length:
        pipeline.add_args("-L",args.minimum_length)
    if args.sliding_window_length:
        pipeline.add_args("-l",args.sliding_window_length)
    if args.reference_data_path:
        pipeline.add_args("-r",args.reference_data_path)
    pipeline.add_args("-P",args.pipeline)
    if args.use_silva:
        pipeline.add_args("-S")

    # Echo the pipeline command to stdout
    print("Running %s" % pipeline)

    # Run the pipeline, capturing stdout/stderr to pipeline.log
    # NB initialise exit_code up front so it is always defined: the
    # original left it unset when the generic 'except Exception'
    # branch fired, causing a NameError at the end of the script
    exit_code = 1
    with open("pipeline.log","w") as pipeline_out:
        try:
            subprocess.check_call(pipeline.cmd,
                                  stdout=pipeline_out,
                                  stderr=subprocess.STDOUT)
            exit_code = 0
            print("Pipeline completed ok")
        except subprocess.CalledProcessError as ex:
            # Non-zero exit status
            sys.stderr.write("Pipeline failed: exit code %s\n" %
                             ex.returncode)
            exit_code = ex.returncode
        except Exception as ex:
            # Some other problem; exit_code keeps its default of 1
            sys.stderr.write("Unexpected error: %s\n" % str(ex))

    # Write out the list of outputs
    outputs_file = "Pipeline_outputs.txt"
    list_outputs(outputs_file)

    # Check for log file
    log_file = "Amplicon_analysis_pipeline.log"
    if os.path.exists(log_file):
        print("Found log file: %s" % log_file)
        if exit_code == 0:
            # Create an HTML file to link to log files etc
            # NB the paths to the files should be correct once
            # copied by Galaxy on job completion
            # (closing tags fixed: the original emitted '<head>' and
            # '<ul>' where '</head>' and '</ul>' were intended)
            with open("pipeline_outputs.html","w") as html_out:
                html_out.write("""<html>
<head>
<title>Amplicon analysis pipeline: log files</title>
</head>
<body>
<h1>Amplicon analysis pipeline: log files</h1>
<ul>
""")
                html_out.write(
                    "<li>%s</li>\n" %
                    ahref("Amplicon_analysis_pipeline.log",
                          type="text/plain"))
                html_out.write(
                    "<li>%s</li>\n" %
                    ahref("pipeline.log",type="text/plain"))
                html_out.write(
                    "<li>%s</li>\n" %
                    ahref("Pipeline_outputs.txt",
                          type="text/plain"))
                html_out.write(
                    "<li>%s</li>\n" %
                    ahref("Metatable.html"))
                html_out.write("""</ul>
</body>
</html>
""")
        else:
            # Check for known error messages
            check_errors()
            # Write pipeline stdout to tool stderr
            sys.stderr.write("\nOutput from pipeline:\n")
            with open("pipeline.log",'r') as log:
                sys.stderr.write("%s" % log.read())
            # Write log file contents to tool log
            print("\nAmplicon_analysis_pipeline.log:")
            with open(log_file,'r') as log:
                print("%s" % log.read())
    else:
        sys.stderr.write("ERROR missing log file \"%s\"\n" %
                         log_file)

    # Handle FastQC boxplots
    print("Amplicon analysis: collating per base quality boxplots")
    with open("fastqc_quality_boxplots.html","w") as quality_boxplots:
        # PHRED value for trimming (defaults to 20 when -q not given)
        phred_score = 20
        if args.trimming_threshold is not None:
            phred_score = args.trimming_threshold
        # Write header for HTML output file
        quality_boxplots.write("""<html>
<head>
<title>Amplicon analysis pipeline: Per-base Quality Boxplots (FastQC)</title>
</head>
<body>
<h1>Amplicon analysis pipeline: Per-base Quality Boxplots (FastQC)</h1>
""")
        # Look for raw and trimmed FastQC output for each sample
        for sample_name in sample_names:
            fastqc_dir = os.path.join(sample_name,"FastQC")
            quality_boxplots.write("<h2>%s</h2>" % sample_name)
            # NOTE(review): "cutdapt_sickle" looks like a typo for
            # "cutadapt_sickle" but presumably matches the directory
            # name the pipeline script actually creates - confirm
            # before changing
            for d in ("Raw","cutdapt_sickle/Q%s" % phred_score):
                quality_boxplots.write("<h3>%s</h3>" % d)
                fastqc_html_files = glob.glob(
                    os.path.join(fastqc_dir,d,"*_fastqc.html"))
                if not fastqc_html_files:
                    quality_boxplots.write("<p>No FastQC outputs found</p>")
                    continue
                # Pull out the per-base quality boxplots
                for f in fastqc_html_files:
                    boxplot = None
                    with open(f) as fp:
                        for line in fp.read().split(">"):
                            try:
                                line.index("alt=\"Per base quality graph\"")
                                boxplot = line + ">"
                                break
                            except ValueError:
                                pass
                    if boxplot is None:
                        boxplot = "Missing plot"
                    quality_boxplots.write("<h4>%s</h4><p>%s</p>" %
                                           (os.path.basename(f),
                                            boxplot))
        quality_boxplots.write("""</body>
</html>
""")

    # Handle additional output when categories file was supplied
    if args.categories_file is not None:
        # Alpha diversity boxplots
        print("Amplicon analysis: indexing alpha diversity boxplots")
        boxplots_dir = os.path.abspath(
            os.path.join("RESULTS",
                         "%s_%s" % (args.pipeline.title(),
                                    ("gg" if not args.use_silva
                                     else "silva")),
                         "Alpha_diversity",
                         "Alpha_diversity_boxplot",
                         "Categories_shannon"))
        print("Amplicon analysis: gathering PDFs from %s" % boxplots_dir)
        boxplot_pdfs = [os.path.basename(pdf)
                        for pdf in
                        sorted(glob.glob(
                            os.path.join(boxplots_dir,"*.pdf")))]
        with open("alpha_diversity_boxplots.html","w") as boxplots_out:
            boxplots_out.write("""<html>
<head>
<title>Amplicon analysis pipeline: Alpha Diversity Boxplots (Shannon)</title>
</head>
<body>
<h1>Amplicon analysis pipeline: Alpha Diversity Boxplots (Shannon)</h1>
""")
            boxplots_out.write("<ul>\n")
            for pdf in boxplot_pdfs:
                boxplots_out.write("<li>%s</li>\n" % ahref(pdf))
            # Closing tag fixed (original wrote '<ul>' again)
            boxplots_out.write("</ul>\n")
            boxplots_out.write("""</body>
</html>
""")

    # Finish
    print("Amplicon analysis: finishing, exit code: %s" % exit_code)
    sys.exit(exit_code)