comparison rgFastQC.py @ 0:d4ac6e05c96c default tip

initial commit
author Yusuf Ali <ali@yusuf.email>
date Wed, 25 Mar 2015 13:43:47 -0600
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:d4ac6e05c96c
1 """
2 # May 2013 ross added check for bogus gz extension - fastqc gets confused
3 # added sanitizer for user supplied name
4 # removed shell and make cl a sequence for Popen call
5 # ross lazarus August 10 2012 in response to anon insecurity report
6 wrapper for fastqc
7
8 called as
9 <command interpreter="python">
10 rgFastqc.py -i $input_file -d $html_file.files_path -o $html_file -n "$out_prefix"
11 </command>
12
13
14
15 Current release seems overly intolerant of sam/bam header strangeness
16 Author notified...
17
18
19 """
20 import re
21 import os
22 import sys
23 import subprocess
24 import optparse
25 import shutil
26 import tempfile
27 import zipfile
28 import gzip
29 import magic
30
31
def getFileString(fpath, outpath):
    """
    Build the "<name> <size>" label used for one entry of a file listing.

    fpath is joined onto outpath to locate the file on disk.  Sizes above
    2**20 bytes are shown in MB, above 2**10 bytes in KB, any other
    non-empty file in bytes; an empty file gets no size suffix.
    Returns '? ?' when the joined path is not a regular file.
    """
    full_path = os.path.join(outpath, fpath)
    if not os.path.isfile(full_path):
        return '? ?'
    nbytes = float(os.path.getsize(full_path))
    if nbytes > 2**20:
        label = ' (%1.1f MB)' % (nbytes/2**20)
    elif nbytes > 2**10:
        label = ' (%1.1f KB)' % (nbytes/2**10)
    elif nbytes > 0:
        label = ' (%d B)' % (int(nbytes))
    else:
        label = ''
    return '%s %s' % (fpath, label)
49
50
class FastQC(object):
    """Run the FastQC executable and repackage its report into one directory.

    ``opts`` is the parsed option object.  The methods read its ``input``,
    ``inputfilename``, ``outputdir``, ``informat``, ``contaminants`` and
    ``executable`` attributes.
    """

    def __init__(self, opts=None):
        assert opts is not None, 'FastQC needs a parsed options object'
        self.opts = opts

    @staticmethod
    def _first_line_unreadable(handle):
        """Return True if reading one line from ``handle`` fails; always closes it."""
        try:
            handle.readline()
        except Exception:
            # Deliberately broad: any decode failure means the payload is not
            # really in the format its name (or magic string) claims.
            return True
        finally:
            handle.close()
        return False

    def _has_bogus_extension(self, lowered_name, input_magic):
        """Return True when the name claims a compression the input does not have."""
        source = self.opts.input
        if lowered_name.endswith(('.gz', '.gzip')) or 'gzip' in input_magic:
            return self._first_line_unreadable(gzip.open(source))
        if lowered_name.endswith('bz2'):
            # local import: bz2 is not among the file's top-level imports
            import bz2
            return self._first_line_unreadable(bz2.BZ2File(source))
        if lowered_name.endswith('.zip'):
            return not zipfile.is_zipfile(source)
        return False

    def run_fastqc(self):
        """
        Run FastQC on ``opts.input`` and return ``(html_rows, retval, serr)``.

        In batch mode fastqc behaves not very nicely - it writes to a new folder
        called [infilebasename]_fastqc, e.g.
        rlazarus@omics:/data/galaxy/test$ ls FC041_1_sequence_fastqc
        duplication_levels.png  fastqc_icon.png          per_base_n_content.png         per_sequence_gc_content.png       summary.txt
        error.png               fastqc_report.html       per_base_quality.png           per_sequence_quality.png          tick.png
        fastqc_data.txt         per_base_gc_content.png  per_base_sequence_content.png  sequence_length_distribution.png  warning.png

        On success ``html_rows`` is the report with our file listing spliced in
        and ``retval`` is the FastQC exit status.  When no readable report is
        found, ``html_rows`` is an explanatory message containing the run log,
        ``retval`` is 1 and ``serr`` holds the run log text.
        """
        opts = self.opts
        outdir = opts.outputdir
        serr = ''
        logfd, tlog = tempfile.mkstemp(prefix='rgFastQC', suffix=".log", dir=outdir)
        # wrap the descriptor mkstemp already opened instead of leaking it
        sout = os.fdopen(logfd, 'w')
        try:
            cl = [opts.executable, '--outdir=%s' % outdir]
            if opts.informat in ['sam', 'bam']:
                cl.append('--format=%s' % opts.informat)
            if opts.contaminants is not None:
                cl.append('--contaminants=%s' % opts.contaminants)
            # patch suggested by bwlang https://bitbucket.org/galaxy/galaxy-central/pull-request/30
            # use a symlink so that the FastQC report reflects the history input file name
            # (falls back to the input's basename when no display name was given)
            infname = opts.inputfilename or os.path.basename(opts.input)
            # decompression at upload currently does NOT remove a now bogus
            # compression ending - fastqc will barf, so trim it here
            input_magic = magic.from_file(opts.input)
            if self._has_bogus_extension(infname.lower(), input_magic):
                infname = os.path.splitext(infname)[0]
            fastqinfilename = re.sub(r'[^a-zA-Z0-9_\-\.]', '_', os.path.basename(infname))
            link_name = os.path.join(outdir, fastqinfilename)
            os.symlink(opts.input, link_name)
            try:
                cl.append(link_name)
                if 'gzip' in input_magic:
                    sout.write('# File magic = %s\n' % input_magic)
                sout.write('# FastQC cl = %s\n' % ' '.join(cl))
                sout.flush()
                p = subprocess.Popen(cl, shell=False, stderr=sout, stdout=sout, cwd=outdir)
                retval = p.wait()
            finally:
                # never leave the temporary link among the outputs
                os.unlink(link_name)
        finally:
            sout.close()
        with open(tlog, 'r') as handle:
            runlog = handle.readlines()
        # fastqc plays games with its output directory name. eesh
        odpath = None
        for entry in os.listdir(outdir):
            candidate = os.path.join(outdir, entry)
            if os.path.isdir(candidate) and candidate.endswith('_fastqc'):
                odpath = candidate
        hpath = None
        rep = None
        if odpath is not None:
            hpath = os.path.join(odpath, 'fastqc_report.html')
            try:
                with open(hpath, 'r') as handle:
                    rep = handle.readlines()
            except (IOError, OSError):
                rep = None
        if rep is None:
            # covers both "no *_fastqc directory" and "report missing/unreadable"
            serr = '\n'.join(runlog)
            res = ['## odpath=%s: No output found in %s. Output for the run was:<pre>\n' % (odpath, hpath)]
            res += runlog
            res += ['</pre>\n',
                    'Please read the above for clues<br/>\n',
                    'If you selected a sam/bam format file, it might not have headers or they may not start with @HD?<br/>\n',
                    'It is also possible that the log shows that fastqc is not installed?<br/>\n',
                    'If that is the case, please tell the relevant Galaxy administrator that it can be snarfed from<br/>\n',
                    'http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/<br/>\n']
            return res, 1, serr
        self.fix_fastqcimages(odpath)
        excludefiles = ['tick.png', 'warning.png', 'fastqc_icon.png', 'error.png']
        # list again: the files have now been moved up into the output directory
        flist = [x for x in os.listdir(outdir) if x not in excludefiles]
        # links to the Icons and Images subdirectories must point at the flattened files
        rep = [row.replace('Icons/', '').replace('Images/', '') for row in rep]
        html = self.fix_fastqc(rep, flist, runlog)
        return html, retval, serr

    def fix_fastqc(self, rep=None, flist=None, runlog=None):
        """Add a listing of the created files to the report rows.

        ``rep`` is the report as a list of rows and ``flist`` the file names
        found in ``opts.outputdir``; neither argument is modified.  ``runlog``
        is accepted for interface compatibility and is not used.  The listing
        is spliced in before the last two rows (taken to be the FastQC footer
        and the closing tags - hope they don't change this); a report with
        fewer than two rows just gets the listing appended.
        """
        rep = list(rep) if rep else []
        names = sorted(flist) if flist else []
        res = ['<div class="module"><h2>Files created by FastQC</h2><table cellspacing="2" cellpadding="2">\n']
        for f in names:
            fn = os.path.split(f)[-1]
            # names are relative to the output directory, not the process cwd
            if os.path.isdir(os.path.join(self.opts.outputdir, fn)):
                continue
            res.append('<tr><td><a href="%s">%s</a></td></tr>\n' % (fn, getFileString(fn, self.opts.outputdir)))
        res.append('</table>\n')
        res.append('<a href="http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/">FastQC documentation and full attribution is here</a><br/><hr/>\n')
        res.append('FastQC was run by Galaxy using the rgenetics rgFastQC wrapper - see http://bitbucket.org/rgenetics for details and licensing\n</div>')
        if len(rep) < 2:
            return rep + res
        # keep the footer and closing rows last so our block stays inside the body
        return rep[:-2] + res + rep[-2:]

    def fix_fastqcimages(self, odpath):
        """Galaxy wants everything in the same files_dir.

        Moves every non-directory entry found in ``odpath/Icons``,
        ``odpath/Images`` and ``odpath`` itself into ``opts.outputdir``, then
        removes each of those directories (subdirectories first).
        """
        icpath = os.path.join(odpath, 'Icons')
        impath = os.path.join(odpath, 'Images')
        for adir in [icpath, impath, odpath]:
            if not os.path.exists(adir):
                continue
            for f in os.listdir(adir):
                sauce = os.path.join(adir, f)
                if not os.path.isdir(sauce):
                    shutil.move(sauce, os.path.join(self.opts.outputdir, f))
            os.rmdir(adir)
193
194
195
if __name__ == '__main__':
    # Command-line entry point: parse options, run FastQC through the wrapper
    # class and write the resulting HTML rows to the -o file.
    op = optparse.OptionParser()
    op.add_option('-i', '--input', default=None)
    op.add_option('-j', '--inputfilename', default=None)
    op.add_option('-o', '--htmloutput', default=None)
    op.add_option('-d', '--outputdir', default="/tmp/shortread")
    op.add_option('-f', '--informat', default='fastq')
    op.add_option('-n', '--namejob', default='rgFastQC')
    op.add_option('-c', '--contaminants', default=None)
    op.add_option('-e', '--executable', default='fastqc')
    opts, args = op.parse_args()
    # explicit checks rather than assert, which is stripped under python -O
    if opts.input is None:
        sys.exit('##rgFastQC.py error - no input file given (-i)')
    if opts.htmloutput is None:
        sys.exit('##rgFastQC.py error - no html output path given (-o)')
    if not os.path.isfile(opts.executable):
        sys.exit('##rgFastQC.py error - cannot find executable %s' % opts.executable)
    if not os.path.exists(opts.outputdir):
        os.makedirs(opts.outputdir)
    runner = FastQC(opts)
    html, retval, serr = runner.run_fastqc()
    with open(opts.htmloutput, 'w') as report:
        report.write(''.join(html))
    if retval != 0:
        sys.stderr.write('%s\n' % serr)  # indicate failure
218
219
220