comparison scripts/rgFastQC.py @ 0:e37910d2c794 draft

planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
author cstrittmatter
date Mon, 20 Jan 2020 15:11:03 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e37910d2c794
1 """
2 Rewrite of rgFastQC.py for Version 0.11.2 of FastQC.
3
4 Changes implemented from tmcgowan at
5 https://testtoolshed.g2.bx.psu.edu/view/tmcgowan/fastqc
6 and iuc at https://toolshed.g2.bx.psu.edu/view/iuc/fastqc
7 with minor changes and bug fixes
8
9 SYNOPSIS
10
11 rgFastQC.py -i input_file -j input_file.name -o output_html_file -t output_text_file [-d output_directory]
12 [-f fastq|bam|sam] [-n job_name] [-c contaminant_file] [-l limits_file] [-e fastqc_executable]
13
14 EXAMPLE (generated by Galaxy)
15
16 rgFastQC.py -i path/dataset_1.dat -j 1000gsample.fastq -o path/dataset_3.dat -d path/job_working_directory/subfolder
17 -f fastq -n FastQC -c path/dataset_2.dat -e fastqc
18
19 """
20
21 import re
22 import os
23 import shutil
24 import subprocess
25 import optparse
26 import tempfile
27 import glob
28 import gzip
29 import bz2
30 import zipfile
31
class FastQCRunner(object):
    '''
    Runs FastQC on a Galaxy dataset and copies the resulting reports to the
    Galaxy output files.

    The options object passed to the constructor is expected to provide the
    attributes input, inputfilename, htmloutput, textoutput, outputdir,
    informat, contaminants, limits and executable (the options defined by
    the script's command-line parser).
    '''

    def __init__(self, opts=None):
        '''
        Initializes an object to run FastQC in Galaxy. To start the process, use the function run_fastqc()
        '''

        # Check whether the options are specified and save them into the object
        assert opts is not None
        self.opts = opts

    @staticmethod
    def _can_read_line(handle):
        '''
        Returns True when a first line can be read from the open file object
        `handle`, False when reading raises. The handle is always closed.
        '''
        try:
            handle.readline()
        except Exception:
            # Any read/decompression error means the content does not match
            # the opener that produced `handle`.
            return False
        finally:
            handle.close()
        return True

    @staticmethod
    def _copy_first_match(pattern, destination):
        '''
        Copies the first file matching the glob `pattern` to `destination`.
        Raises IndexError when nothing matches the pattern.
        '''
        matches = glob.glob(pattern)
        if not matches:
            raise IndexError('no FastQC output file matches %s' % pattern)
        with open(matches[0], 'rb') as fsrc:
            with open(destination, 'wb') as fdest:
                shutil.copyfileobj(fsrc, fdest)

    def prepare_command_line(self):
        '''
        Develops the Commandline to run FastQC in Galaxy

        Sets self.fastqinfilename (the sanitized name used for the symlink to
        the input) and self.command_line (the shell command string). When the
        input format contains 'fastq', opts.informat is rewritten to 'fastq'.

        Raises Exception when a file with a compression extension is neither
        validly compressed nor readable as text.
        '''
        opts = self.opts

        # Check whether a given file compression format is valid
        # This prevents uncompression of already uncompressed files
        infname = opts.inputfilename
        linf = infname.lower()
        trimext = False
        # decompression at upload currently does NOT remove this now bogus ending - fastqc will barf
        # patched may 29 2013 until this is fixed properly
        if linf.endswith(('.gz', '.gzip')):
            trimext = not self._can_read_line(gzip.open(opts.input))
        elif linf.endswith('bz2'):
            trimext = not self._can_read_line(bz2.open(opts.input, 'rb'))
        elif linf.endswith('.zip'):
            trimext = not zipfile.is_zipfile(opts.input)
        if trimext:
            # The extension claims compression but the content is not
            # compressed: verify it is at least readable, then drop the
            # misleading extension.
            if not self._can_read_line(open(opts.input)):
                raise Exception("Input file corruption, could not identify the filetype")
            infname = os.path.splitext(infname)[0]

        # Replace unwanted or problematic characters in the input file name
        self.fastqinfilename = re.sub(r'[^a-zA-Z0-9_\-\.]', '_', os.path.basename(infname))
        # check that the symbolic link gets a proper ending, fastqc seems to ignore the given format otherwise
        if 'fastq' in opts.informat:
            # with fastq the .ext is ignored, but when a format is actually passed it must comply with fastqc's
            # accepted formats..
            opts.informat = 'fastq'
        elif not self.fastqinfilename.endswith(opts.informat):
            self.fastqinfilename += '.%s' % opts.informat

        # Build the Commandline from the given parameters
        # NOTE(review): the command is a shell string and run_fastqc() executes
        # it with shell=True; option values containing spaces or shell
        # metacharacters are not quoted here.
        command_line = [opts.executable, '--outdir %s' % opts.outputdir]
        if opts.contaminants is not None:
            command_line.append('--contaminants %s' % opts.contaminants)
        if opts.limits is not None:
            command_line.append('--limits %s' % opts.limits)
        command_line.append('--quiet')
        command_line.append('--extract')  # to access the output text file
        command_line.append(self.fastqinfilename)
        command_line.append('-f %s' % opts.informat)
        self.command_line = ' '.join(command_line)

    def copy_output_file_to_dataset(self):
        '''
        Retrieves the output html and text files from the output directory and copies them to the Galaxy output files

        Raises IndexError when an expected FastQC output file is missing.
        '''

        # retrieve html file
        self._copy_first_match(self.opts.outputdir + '/*html', self.opts.htmloutput)

        # retrieve text file
        self._copy_first_match(self.opts.outputdir + '/*/fastqc_data.txt', self.opts.textoutput)

    def run_fastqc(self):
        '''
        Executes FastQC. Make sure the mandatory import parameters input, inputfilename, outputdir, htmloutput
        and textoutput have been specified in the options (opts)

        Writes a progress log into the output directory, symlinks the input
        under its sanitized name in the current working directory, runs the
        prepared command and copies the reports to the Galaxy outputs.
        Raises subprocess.CalledProcessError when FastQC exits non-zero.
        '''

        # Create a log file; wrap the descriptor returned by mkstemp so that it
        # is closed together with the log, even when a step below fails
        log_fd, tlog = tempfile.mkstemp(prefix='rgFastQC', suffix=".log", dir=self.opts.outputdir)
        with os.fdopen(log_fd, 'w') as sout:
            self.prepare_command_line()
            sout.write(self.command_line)
            sout.write('\n')
            sout.write("Creating symlink\n")  # between the input (.dat) file and the given input file name
            os.symlink(self.opts.input, self.fastqinfilename)
            sout.write("check_call\n")
            subprocess.check_call(self.command_line, shell=True)
            sout.write("Copying working %s file to %s \n" % (self.fastqinfilename, self.opts.htmloutput))
            self.copy_output_file_to_dataset()
            sout.write("Finished")
139
if __name__ == '__main__':
    # Command-line entry point: parse the options, make sure the output
    # directory exists and run FastQC through FastQCRunner.
    op = optparse.OptionParser()
    op.add_option('-i', '--input', default=None)
    op.add_option('-j', '--inputfilename', default=None)
    op.add_option('-o', '--htmloutput', default=None)
    op.add_option('-t', '--textoutput', default=None)
    op.add_option('-d', '--outputdir', default="/tmp/shortread")
    op.add_option('-f', '--informat', default='fastq')
    op.add_option('-n', '--namejob', default='rgFastQC')
    op.add_option('-c', '--contaminants', default=None)
    op.add_option('-l', '--limits', default=None)
    op.add_option('-e', '--executable', default='fastqc')
    opts, args = op.parse_args()

    # Report missing mandatory options as a usage error rather than with
    # `assert`, which is skipped when Python runs with -O. textoutput is
    # included because the runner writes the text report unconditionally.
    for required in ('input', 'inputfilename', 'htmloutput', 'textoutput'):
        if getattr(opts, required) is None:
            op.error('missing required option --%s' % required)

    # The executable is deliberately not checked with os.path.isfile: the
    # default 'fastqc' is a bare command name, not a file path.
    if not os.path.exists(opts.outputdir):
        os.makedirs(opts.outputdir)

    fastqc_runner = FastQCRunner(opts)
    fastqc_runner.run_fastqc()