diff scripts/rgFastQC.py @ 0:e37910d2c794 draft

planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
author cstrittmatter
date Mon, 20 Jan 2020 15:11:03 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/rgFastQC.py	Mon Jan 20 15:11:03 2020 -0500
@@ -0,0 +1,162 @@
+"""
+Rewrite of rgFastQC.py for Version 0.11.2 of FastQC.
+
+Changes implemented from tmcgowan at
+https://testtoolshed.g2.bx.psu.edu/view/tmcgowan/fastqc
+and iuc at https://toolshed.g2.bx.psu.edu/view/iuc/fastqc
+with minor changes and bug fixes
+
+SYNOPSIS
+
+    rgFastQC.py -i input_file -j input_file.name -o output_html_file [-t output_text_file]
+        [-d output_directory] [-f fastq|bam|sam] [-n job_name] [-c contaminant_file]
+        [-l limits_file] [-e fastqc_executable]
+
+EXAMPLE (generated by Galaxy)
+
+    rgFastQC.py -i path/dataset_1.dat -j 1000gsample.fastq -o path/dataset_3.dat -d path/job_working_directory/subfolder
+        -f fastq -n FastQC -c path/dataset_2.dat -e fastqc
+
+"""
+
+import re
+import os
+import shutil
+import subprocess
+import optparse
+import tempfile
+import glob
+import gzip
+import bz2
+import zipfile
+
+class FastQCRunner(object):
+    """Run the FastQC executable on one input file and collect its reports.
+
+    All settings are read from the ``opts`` object given to the constructor
+    (the values parsed by the option parser at the bottom of this script).
+    Use :meth:`run_fastqc` to start the whole process.
+    """
+
+    def __init__(self, opts=None):
+        """Store the options object; ``opts`` must not be ``None``."""
+        assert opts is not None
+        self.opts = opts
+
+    @staticmethod
+    def _first_line_readable(opener, path):
+        """Return True if one line of ``path`` can be read through ``opener``.
+
+        ``opener`` is a callable such as ``gzip.open`` or ``bz2.BZ2File``.
+        Failure to open ``path`` at all is not caught here and propagates.
+        """
+        handle = opener(path)
+        try:
+            handle.readline()
+        except Exception:
+            # Any decoding failure means the content is not in that format.
+            return False
+        finally:
+            handle.close()
+        return True
+
+    @staticmethod
+    def _copy_first_match(pattern, destination):
+        """Copy the first file matching glob ``pattern`` to ``destination``.
+
+        Raises IndexError (as the unguarded ``[0]`` lookup used to) when
+        nothing matches, but with a message naming the pattern.
+        """
+        matches = glob.glob(pattern)
+        if not matches:
+            raise IndexError("FastQC produced no file matching %s" % pattern)
+        with open(matches[0], 'rb') as fsrc:
+            with open(destination, 'wb') as fdest:
+                shutil.copyfileobj(fsrc, fdest)
+
+    def prepare_command_line(self):
+        """Build the FastQC command line from the stored options.
+
+        Sets ``self.fastqinfilename`` (the sanitised name under which the
+        input is handed to FastQC) and ``self.command_line`` (the command as
+        a single string). ``opts.informat`` is rewritten to ``'fastq'`` when
+        it contains ``'fastq'``.
+
+        Raises Exception when the input name claims a compression format,
+        the content is not in that format, and it cannot be read as plain
+        text either.
+        """
+        opts = self.opts
+
+        # Check whether a given file compression format is valid
+        # This prevents uncompression of already uncompressed files
+        infname = opts.inputfilename
+        linf = infname.lower()
+        trimext = False
+        # decompression at upload currently does NOT remove this now bogus ending - fastqc will barf
+        # patched may 29 2013 until this is fixed properly
+        if linf.endswith(('.gz', '.gzip')):
+            trimext = not self._first_line_readable(gzip.open, opts.input)
+        elif linf.endswith('.bz2'):
+            # bz2.BZ2File is available on both Python 2 and 3; bz2.open is Python 3 only
+            trimext = not self._first_line_readable(bz2.BZ2File, opts.input)
+        elif linf.endswith('.zip'):
+            trimext = not zipfile.is_zipfile(opts.input)
+        if trimext:
+            # The extension lies about the content: make sure the file is at
+            # least readable as plain text, then drop the bogus extension.
+            with open(opts.input) as handle:
+                try:
+                    handle.readline()
+                except Exception:
+                    raise Exception("Input file corruption, could not identify the filetype")
+            infname = os.path.splitext(infname)[0]
+
+        # Replace unwanted or problematic characters in the input file name
+        self.fastqinfilename = re.sub(r'[^a-zA-Z0-9_\-\.]', '_', os.path.basename(infname))
+        # check that the symbolic link gets a proper ending, fastqc seems to ignore the given format otherwise
+        if 'fastq' in opts.informat:
+            # with fastq the .ext is ignored, but when a format is actually passed it must comply with fastqc's
+            # accepted formats..
+            opts.informat = 'fastq'
+        elif not self.fastqinfilename.endswith(opts.informat):
+            self.fastqinfilename += '.%s' % opts.informat
+
+        # Build the Commandline from the given parameters
+        command_line = [opts.executable, '--outdir %s' % opts.outputdir]
+        if opts.contaminants is not None:
+            command_line.append('--contaminants %s' % opts.contaminants)
+        if opts.limits is not None:
+            command_line.append('--limits %s' % opts.limits)
+        command_line.append('--quiet')
+        command_line.append('--extract')  # to access the output text file
+        command_line.append(self.fastqinfilename)
+        command_line.append('-f %s' % opts.informat)
+        self.command_line = ' '.join(command_line)
+
+    def copy_output_file_to_dataset(self):
+        """Copy the FastQC reports from the output directory to the output files.
+
+        The first ``*html`` file in ``opts.outputdir`` is copied to
+        ``opts.htmloutput``. The first ``*/fastqc_data.txt`` is copied to
+        ``opts.textoutput``, unless no text output was requested (the option
+        is ``None`` or absent), in which case that step is skipped.
+
+        Raises IndexError when an expected report file is missing.
+        """
+        # retrieve html file
+        self._copy_first_match(os.path.join(self.opts.outputdir, '*html'),
+                               self.opts.htmloutput)
+
+        # retrieve text file
+        textoutput = getattr(self.opts, 'textoutput', None)
+        if textoutput is not None:
+            self._copy_first_match(os.path.join(self.opts.outputdir, '*', 'fastqc_data.txt'),
+                                   textoutput)
+
+    def run_fastqc(self):
+        """Execute FastQC and copy its reports to the output files.
+
+        Make sure the mandatory parameters input, inputfilename, outputdir and
+        htmloutput have been specified in the options (opts). A log file
+        ``rgFastQC*.log`` is written into ``opts.outputdir``.
+
+        Raises subprocess.CalledProcessError when FastQC exits non-zero.
+        """
+        # Create a log file; reuse the descriptor returned by mkstemp instead
+        # of opening the path a second time and leaking the descriptor.
+        log_fd, log_path = tempfile.mkstemp(prefix='rgFastQC', suffix=".log", dir=self.opts.outputdir)
+        with os.fdopen(log_fd, 'w') as sout:
+            self.prepare_command_line()
+            sout.write(self.command_line)
+            sout.write('\n')
+            sout.write("Creating symlink\n")  # between the input (.dat) file and the given input file name
+            os.symlink(self.opts.input, self.fastqinfilename)
+            sout.write("check_call\n")
+            # NOTE(review): the command is a shell string; the input file name
+            # is sanitised in prepare_command_line, but executable, outputdir,
+            # contaminants and limits are interpolated unquoted.
+            subprocess.check_call(self.command_line, shell=True)
+            sout.write("Copying working %s file to %s \n" % (self.fastqinfilename, self.opts.htmloutput))
+            self.copy_output_file_to_dataset()
+            sout.write("Finished")
+
+if __name__ == '__main__':
+    # Command-line entry point: parse the options, validate the mandatory
+    # ones, make sure the output directory exists and run FastQC.
+    op = optparse.OptionParser()
+    op.add_option('-i', '--input', default=None)
+    op.add_option('-j', '--inputfilename', default=None)
+    op.add_option('-o', '--htmloutput', default=None)
+    op.add_option('-t', '--textoutput', default=None)
+    op.add_option('-d', '--outputdir', default="/tmp/shortread")
+    op.add_option('-f', '--informat', default='fastq')
+    op.add_option('-n', '--namejob', default='rgFastQC')
+    op.add_option('-c', '--contaminants', default=None)
+    op.add_option('-l', '--limits', default=None)
+    op.add_option('-e', '--executable', default='fastqc')
+    opts, args = op.parse_args()
+
+    # Report missing mandatory options through the parser (usage message and
+    # non-zero exit) rather than with assert, which is stripped under -O.
+    for required in ('input', 'inputfilename', 'htmloutput'):
+        if getattr(opts, required) is None:
+            op.error('missing required option --%s' % required)
+    #assert os.path.isfile(opts.executable),'##rgFastQC.py error - cannot find executable %s' % opts.executable
+    # Create the output directory; tolerate it already existing (also when
+    # another process creates it between a check and the call).
+    try:
+        os.makedirs(opts.outputdir)
+    except OSError:
+        if not os.path.isdir(opts.outputdir):
+            raise
+
+    fastqc_runner = FastQCRunner(opts)
+    fastqc_runner.run_fastqc()