diff rgFastQC.py @ 1:8fae48caaf06 draft

Uploaded from GH
author devteam
date Tue, 11 Nov 2014 12:46:27 -0500
parents e28c965eeed4
children 0b201de108b9
line wrap: on
line diff
--- a/rgFastQC.py	Mon Jan 27 09:29:14 2014 -0500
+++ b/rgFastQC.py	Tue Nov 11 12:46:27 2014 -0500
@@ -1,83 +1,52 @@
 """
-# May 2013 ross added check for bogus gz extension - fastqc gets confused
-# added sanitizer for user supplied name
-# removed shell and make cl a sequence for Popen call
-# ross lazarus August 10 2012 in response to anon insecurity report
-wrapper for fastqc
+Rewrite of rgFastQC.py for Version 0.11.2 of FastQC.
+
+Changes implemented from tmcgowan at
+https://testtoolshed.g2.bx.psu.edu/view/tmcgowan/fastqc
+and iuc at https://toolshed.g2.bx.psu.edu/view/iuc/fastqc
+with minor changes and bug fixes
 
-called as
-  <command interpreter="python">
-    rgFastqc.py -i $input_file -d $html_file.files_path -o $html_file -n "$out_prefix"
-  </command>
+SYNOPSIS
+
+    rgFastQC.py -i input_file -j input_file.name -o output_html_file [-d output_directory]
+        [-f fastq|bam|sam] [-n job_name] [-c contaminant_file] [-e fastqc_executable]
 
-
+EXAMPLE (generated by Galaxy)
 
-Current release seems overly intolerant of sam/bam header strangeness
-Author notified...
-
+    rgFastQC.py -i path/dataset_1.dat -j 1000gsample.fastq -o path/dataset_3.dat -d path/job_working_directory/subfolder
+        -f fastq -n FastQC -c path/dataset_2.dat -e fastqc
 
 """
+
 import re
 import os
-import sys
+import shutil
 import subprocess
 import optparse
-import shutil
 import tempfile
+import glob
+import gzip
+import bz2
 import zipfile
-import gzip
-
 
-def getFileString(fpath, outpath):
-    """
-    format a nice file size string
-    """
-    size = ''
-    fp = os.path.join(outpath, fpath)
-    s = '? ?'
-    if os.path.isfile(fp):
-        n = float(os.path.getsize(fp))
-        if n > 2**20:
-            size = ' (%1.1f MB)' % (n/2**20)
-        elif n > 2**10:
-            size = ' (%1.1f KB)' % (n/2**10)
-        elif n > 0:
-            size = ' (%d B)' % (int(n))
-        s = '%s %s' % (fpath, size) 
-    return s
-
-
-class FastQC():
-    """wrapper
-    """
-    
+class FastQCRunner(object):
     
     def __init__(self,opts=None):
-        assert opts <> None
-        self.opts = opts
-        
+        '''
+        Initializes an object to run FastQC in Galaxy. To start the process, use the function run_fastqc()
+        '''
         
-    def run_fastqc(self):
-        """
-        In batch mode fastqc behaves not very nicely - will write to a new folder in
-        the same place as the infile called [infilebasename]_fastqc
-    rlazarus@omics:/data/galaxy/test$ ls FC041_1_sequence_fastqc
-    duplication_levels.png  fastqc_icon.png          per_base_n_content.png         per_sequence_gc_content.png       summary.txt
-    error.png               fastqc_report.html       per_base_quality.png           per_sequence_quality.png          tick.png
-    fastqc_data.txt         per_base_gc_content.png  per_base_sequence_content.png  sequence_length_distribution.png  warning.png
+        # Check that the options were supplied, then store them on the object
+        assert opts != None
+        self.opts = opts
 
-        """
-        serr = ''
-        dummy,tlog = tempfile.mkstemp(prefix='rgFastQC',suffix=".log",dir=self.opts.outputdir)
-        sout = open(tlog, 'w')
-        fastq = os.path.basename(self.opts.input)
-        cl = [self.opts.executable,'--outdir=%s' % self.opts.outputdir]
-        if self.opts.informat in ['sam','bam']:
-            cl.append('--f=%s' % self.opts.informat)
-        if self.opts.contaminants <> None :
-            cl.append('--contaminants=%s' % self.opts.contaminants)
-        # patch suggested by bwlang https://bitbucket.org/galaxy/galaxy-central/pull-request/30
-        # use a symlink in a temporary directory so that the FastQC report reflects the history input file name
+    def prepare_command_line(self):
+        '''
+        Builds the command line used to run FastQC in Galaxy
+        '''
+        
+        # Check whether a given file compression format is valid
+        # This prevents decompression of files that are already uncompressed
         infname = self.opts.inputfilename
         linf = infname.lower()
         trimext = False
@@ -86,7 +55,7 @@
         if ( linf.endswith('.gz') or linf.endswith('.gzip') ): 
             f = gzip.open(self.opts.input)
             try:
-                testrow = f.readline()
+                f.readline()
             except:
                 trimext = True
             f.close()
@@ -101,116 +70,85 @@
             if not zipfile.is_zipfile(self.opts.input):
                 trimext = True
         if trimext:
-            infname = os.path.splitext(infname)[0]
-        fastqinfilename = re.sub(ur'[^a-zA-Z0-9_\-\.]', '_', os.path.basename(infname))
-        link_name = os.path.join(self.opts.outputdir, fastqinfilename)
-        os.symlink(self.opts.input, link_name)
-        cl.append(link_name)        
-        sout.write('# FastQC cl = %s\n' % ' '.join(cl))
-        sout.flush()
-        p = subprocess.Popen(cl, shell=False, stderr=sout, stdout=sout, cwd=self.opts.outputdir)
-        retval = p.wait()
+	   f = open(self.opts.input)
+	   try:
+	       f.readline()
+	   except:
+	       raise Exception("Input file corruption, could not identify the filetype")
+           infname = os.path.splitext(infname)[0]
+        
+        # Replace unwanted or problematic characters in the input file name
+        self.fastqinfilename = re.sub(ur'[^a-zA-Z0-9_\-\.]', '_', os.path.basename(infname))
+        
+        # Build the command line from the given parameters
+        command_line = [opts.executable, '--outdir %s' % opts.outputdir]
+        if opts.contaminants != None:
+            command_line.append('--contaminants %s' % opts.contaminants)
+        if opts.limits != None:
+	    command_line.append('--limits %s' % opts.limits)
+        command_line.append('--quiet')
+        command_line.append('--extract') # to access the output text file
+        command_line.append(self.fastqinfilename)
+        self.command_line = ' '.join(command_line)
+
+    def copy_output_file_to_dataset(self):
+        '''
+        Retrieves the output html and text files from the output directory and copies them to the Galaxy output files
+        '''
+        
+        # retrieve html file
+        result_file = glob.glob(opts.outputdir + '/*html')
+        with open(result_file[0], 'rb') as fsrc:
+            with open(self.opts.htmloutput, 'wb') as fdest:
+                shutil.copyfileobj(fsrc, fdest)
+        
+        # retrieve text file
+        text_file = glob.glob(opts.outputdir + '/*/fastqc_data.txt')
+        with open(text_file[0], 'rb') as fsrc:
+            with open(self.opts.textoutput, 'wb') as fdest:
+                shutil.copyfileobj(fsrc, fdest)
+
+    def run_fastqc(self):
+        '''
+        Executes FastQC. Make sure the mandatory parameters input, inputfilename, outputdir and htmloutput have been specified in the options (opts)
+        '''
+        
+        # Create a log file
+        dummy,tlog = tempfile.mkstemp(prefix='rgFastQC',suffix=".log",dir=self.opts.outputdir)
+        sout = open(tlog, 'w')
+        
+        self.prepare_command_line()
+        sout.write(self.command_line)
+        sout.write('\n')
+        sout.write("Creating symlink\n") # between the input (.dat) file and the given input file name
+        os.symlink(self.opts.input, self.fastqinfilename)
+        sout.write("check_call\n")
+        subprocess.check_call(self.command_line, shell=True)
+        sout.write("Copying working %s file to %s \n" % (self.fastqinfilename, self.opts.htmloutput))
+        self.copy_output_file_to_dataset()
+        sout.write("Finished")
         sout.close()
-        runlog = open(tlog,'r').readlines()
-        os.unlink(link_name)
-        flist = os.listdir(self.opts.outputdir) # fastqc plays games with its output directory name. eesh
-        odpath = None
-        for f in flist:
-            d = os.path.join(self.opts.outputdir,f)
-            if os.path.isdir(d):
-                if d.endswith('_fastqc'):
-                    odpath = d 
-        hpath = None
-        if odpath <> None:
-            try: 
-                hpath = os.path.join(odpath,'fastqc_report.html')
-                rep = open(hpath,'r').readlines() # for our new html file but we need to insert our stuff after the <body> tag
-            except:
-                pass
-        if hpath == None:
-            serr = '\n'.join(runlog)       
-            res =  ['## odpath=%s: No output found in %s. Output for the run was:<pre>\n' % (odpath,hpath),]
-            res += runlog
-            res += ['</pre>\n',
-                   'Please read the above for clues<br/>\n',
-                   'If you selected a sam/bam format file, it might not have headers or they may not start with @HD?<br/>\n',
-                   'It is also possible that the log shows that fastqc is not installed?<br/>\n',
-                   'If that is the case, please tell the relevant Galaxy administrator that it can be snarfed from<br/>\n',
-                   'http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/<br/>\n',]
-            return res,1,serr
-        self.fix_fastqcimages(odpath)
-        flist = os.listdir(self.opts.outputdir) # these have now been fixed
-        excludefiles = ['tick.png','warning.png','fastqc_icon.png','error.png']
-        flist = [x for x in flist if not x in excludefiles]
-        for i in range(len(rep)): # need to fix links to Icons and Image subdirectories in lastest fastqc code - ugh
-            rep[i] = rep[i].replace('Icons/','')
-            rep[i] = rep[i].replace('Images/','')
-
-        html = self.fix_fastqc(rep,flist,runlog)
-        return html,retval,serr
-        
-
-        
-    def fix_fastqc(self,rep=[],flist=[],runlog=[]):
-        """ add some of our stuff to the html
-        """
-        bodyindex = len(rep) -1  # hope they don't change this
-        footrow = bodyindex - 1 
-        footer = rep[footrow]
-        rep = rep[:footrow] + rep[footrow+1:]
-        res = ['<div class="module"><h2>Files created by FastQC</h2><table cellspacing="2" cellpadding="2">\n']
-        flist.sort()
-        for i,f in enumerate(flist):
-            if not(os.path.isdir(f)):
-                fn = os.path.split(f)[-1]
-                res.append('<tr><td><a href="%s">%s</a></td></tr>\n' % (fn,getFileString(fn, self.opts.outputdir)))
-        res.append('</table>\n') 
-        res.append('<a href="http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/">FastQC documentation and full attribution is here</a><br/><hr/>\n')
-        res.append('FastQC was run by Galaxy using the rgenetics rgFastQC wrapper - see http://bitbucket.org/rgenetics for details and licensing\n</div>')
-        res.append(footer)
-        fixed = rep[:bodyindex] + res + rep[bodyindex:]
-        return fixed # with our additions
-
-
-    def fix_fastqcimages(self,odpath):
-        """ Galaxy wants everything in the same files_dir
-        """
-        icpath = os.path.join(odpath,'Icons')
-        impath = os.path.join(odpath,'Images')
-        for adir in [icpath,impath,odpath]:
-            if os.path.exists(adir):
-                flist = os.listdir(adir) # get all files created
-                for f in flist:
-                    if not os.path.isdir(os.path.join(adir,f)):
-                        sauce = os.path.join(adir,f)
-                        dest = os.path.join(self.opts.outputdir,f)
-                        shutil.move(sauce,dest)
-                os.rmdir(adir)
-
-    
 
 if __name__ == '__main__':
     op = optparse.OptionParser()
     op.add_option('-i', '--input', default=None)
-    op.add_option('-j', '--inputfilename', default=None)    
+    op.add_option('-j', '--inputfilename', default=None)
     op.add_option('-o', '--htmloutput', default=None)
+    op.add_option('-t', '--textoutput', default=None)
     op.add_option('-d', '--outputdir', default="/tmp/shortread")
     op.add_option('-f', '--informat', default='fastq')
     op.add_option('-n', '--namejob', default='rgFastQC')
     op.add_option('-c', '--contaminants', default=None)
+    op.add_option('-l', '--limits', default=None)
     op.add_option('-e', '--executable', default='fastqc')
     opts, args = op.parse_args()
-    assert opts.input <> None
+    
+    assert opts.input != None
+    assert opts.inputfilename != None
+    assert opts.htmloutput != None
     assert os.path.isfile(opts.executable),'##rgFastQC.py error - cannot find executable %s' % opts.executable
     if not os.path.exists(opts.outputdir): 
         os.makedirs(opts.outputdir)
-    f = FastQC(opts)
-    html,retval,serr = f.run_fastqc()
-    f = open(opts.htmloutput, 'w')
-    f.write(''.join(html))
-    f.close()
-    if retval <> 0:
-        print >> sys.stderr, serr # indicate failure
-         
     
-
+    fastqc_runner = FastQCRunner(opts)
+    fastqc_runner.run_fastqc()
\ No newline at end of file