comparison rgFastQC.py @ 0:e28c965eeed4 draft

Imported from capsule None
author devteam
date Mon, 27 Jan 2014 09:29:14 -0500
parents
children 8fae48caaf06
comparison
equal deleted inserted replaced
-1:000000000000 0:e28c965eeed4
1 """
2 # May 2013 ross added check for bogus gz extension - fastqc gets confused
3 # added sanitizer for user supplied name
4 # removed shell and make cl a sequence for Popen call
5 # ross lazarus August 10 2012 in response to anon insecurity report
6 wrapper for fastqc
7
8 called as
9 <command interpreter="python">
10 rgFastqc.py -i $input_file -d $html_file.files_path -o $html_file -n "$out_prefix"
11 </command>
12
13
14
15 Current release seems overly intolerant of sam/bam header strangeness
16 Author notified...
17
18
19 """
20 import re
21 import os
22 import sys
23 import subprocess
24 import optparse
25 import shutil
26 import tempfile
27 import zipfile
28 import gzip
29
30
def getFileString(fpath, outpath):
    """
    Format a file name followed by a human readable size.

    fpath   -- file name, relative to outpath (this is the text shown)
    outpath -- directory the file is expected to live in

    Returns '<fpath>  (<size>)' using MB / KB / B units (1024 based),
    '<fpath> ' with no size for an empty file, and '? ?' when
    outpath/fpath is not a regular file.
    """
    fp = os.path.join(outpath, fpath)
    if not os.path.isfile(fp):
        return '? ?'
    n = float(os.path.getsize(fp))
    # >= so that a file of exactly 1 MiB / 1 KiB is reported in the larger
    # unit ("1.0 MB") rather than as "1024.0 KB" / "1024 B"
    if n >= 2**20:
        size = ' (%1.1f MB)' % (n / 2**20)
    elif n >= 2**10:
        size = ' (%1.1f KB)' % (n / 2**10)
    elif n > 0:
        size = ' (%d B)' % (int(n))
    else:
        size = ''  # empty files are listed without a size
    return '%s %s' % (fpath, size)
48
49
class FastQC(object):
    """Wrapper that runs the FastQC executable and rewrites its HTML report.

    opts is the parsed command line option object; the methods read its
    input, inputfilename, outputdir, informat, contaminants and executable
    attributes.
    """

    def __init__(self, opts=None):
        assert opts is not None
        self.opts = opts

    @staticmethod
    def _reads_cleanly(handle):
        """Return True if one line can be read from handle; always closes it."""
        try:
            handle.readline()
            return True
        except Exception:
            # any read failure means the data is not in the format the
            # opener expected - that is all the caller needs to know
            return False
        finally:
            handle.close()

    def _has_bogus_compression_ext(self, linf):
        """Return True if the lower-cased name linf ends in a compression
        extension that the content of self.opts.input does not match.
        """
        if linf.endswith(('.gz', '.gzip')):
            return not self._reads_cleanly(gzip.open(self.opts.input))
        if linf.endswith('bz2'):
            # bz2 is not imported at module level in this file
            import bz2
            return not self._reads_cleanly(bz2.BZ2File(self.opts.input, 'rb'))
        if linf.endswith('.zip'):
            return not zipfile.is_zipfile(self.opts.input)
        return False

    def run_fastqc(self):
        """
        Run FastQC on self.opts.input and build the html for the output file.

        In batch mode fastqc behaves not very nicely - will write to a new folder in
        the same place as the infile called [infilebasename]_fastqc
        rlazarus@omics:/data/galaxy/test$ ls FC041_1_sequence_fastqc
        duplication_levels.png fastqc_icon.png per_base_n_content.png per_sequence_gc_content.png summary.txt
        error.png fastqc_report.html per_base_quality.png per_sequence_quality.png tick.png
        fastqc_data.txt per_base_gc_content.png per_base_sequence_content.png sequence_length_distribution.png warning.png

        Returns a tuple (html, retval, serr):
          html   -- list of html lines to write to the output file
          retval -- exit code of the fastqc process, or 1 when no report
                    could be read
          serr   -- the run log as one string when no report could be read,
                    otherwise ''
        """
        serr = ''
        fd, tlog = tempfile.mkstemp(prefix='rgFastQC', suffix=".log", dir=self.opts.outputdir)
        cl = [self.opts.executable, '--outdir=%s' % self.opts.outputdir]
        if self.opts.informat in ['sam', 'bam']:
            cl.append('--f=%s' % self.opts.informat)
        if self.opts.contaminants is not None:
            cl.append('--contaminants=%s' % self.opts.contaminants)
        # mkstemp hands back an already open descriptor - wrap it instead of
        # opening the path a second time and leaking the descriptor
        with os.fdopen(fd, 'w') as sout:
            # patch suggested by bwlang https://bitbucket.org/galaxy/galaxy-central/pull-request/30
            # use a symlink in a temporary directory so that the FastQC report reflects the history input file name
            # fall back to the real path when no display name was supplied
            infname = self.opts.inputfilename or self.opts.input
            # decompression at upload currently does NOT remove this now bogus ending - fastqc will barf
            # patched may 29 2013 until this is fixed properly
            if self._has_bogus_compression_ext(infname.lower()):
                infname = os.path.splitext(infname)[0]
            fastqinfilename = re.sub(r'[^a-zA-Z0-9_\-\.]', '_', os.path.basename(infname))
            link_name = os.path.join(self.opts.outputdir, fastqinfilename)
            os.symlink(self.opts.input, link_name)
            try:
                cl.append(link_name)
                sout.write('# FastQC cl = %s\n' % ' '.join(cl))
                sout.flush()
                p = subprocess.Popen(cl, shell=False, stderr=sout, stdout=sout, cwd=self.opts.outputdir)
                retval = p.wait()
            finally:
                # never leave the symlink behind, even if fastqc could not be started
                os.unlink(link_name)
        with open(tlog, 'r') as log:
            runlog = log.readlines()
        # fastqc plays games with its output directory name. eesh
        odpath = None
        for f in os.listdir(self.opts.outputdir):
            d = os.path.join(self.opts.outputdir, f)
            if os.path.isdir(d) and d.endswith('_fastqc'):
                odpath = d
        hpath = None
        rep = None  # stays None unless the report was actually read
        if odpath is not None:
            hpath = os.path.join(odpath, 'fastqc_report.html')
            try:
                with open(hpath, 'r') as report:
                    rep = report.readlines()
            except (IOError, OSError):
                rep = None
        if rep is None:
            # covers both "no *_fastqc directory" and "directory without a readable report"
            serr = '\n'.join(runlog)
            res = ['## odpath=%s: No output found in %s. Output for the run was:<pre>\n' % (odpath, hpath), ]
            res += runlog
            res += ['</pre>\n',
                    'Please read the above for clues<br/>\n',
                    'If you selected a sam/bam format file, it might not have headers or they may not start with @HD?<br/>\n',
                    'It is also possible that the log shows that fastqc is not installed?<br/>\n',
                    'If that is the case, please tell the relevant Galaxy administrator that it can be snarfed from<br/>\n',
                    'http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/<br/>\n', ]
            return res, 1, serr
        self.fix_fastqcimages(odpath)
        # list again - the files have now been moved into outputdir
        excludefiles = ['tick.png', 'warning.png', 'fastqc_icon.png', 'error.png']
        flist = [x for x in os.listdir(self.opts.outputdir) if x not in excludefiles]
        # need to fix links to Icons and Image subdirectories in latest fastqc code - ugh
        rep = [row.replace('Icons/', '').replace('Images/', '') for row in rep]
        html = self.fix_fastqc(rep, flist, runlog)
        return html, retval, serr

    def fix_fastqc(self, rep=None, flist=None, runlog=None):
        """ add some of our stuff to the html

        rep    -- lines of the FastQC report; the second to last line is
                  treated as the footer and the last line as the closing
                  line of the document (hope they don't change this)
        flist  -- names of files in self.opts.outputdir to list as links
        runlog -- accepted for interface compatibility; not used here

        Returns a new list of lines with a "Files created by FastQC" block
        followed by the footer inserted before the last line. The
        arguments are not modified.
        """
        rep = list(rep) if rep else []
        if len(rep) >= 2:
            head, footer, tail = rep[:-2], [rep[-2]], rep[-1:]
        else:
            # too short to contain footer + closing line: just append
            head, footer, tail = rep, [], []
        res = ['<div class="module"><h2>Files created by FastQC</h2><table cellspacing="2" cellpadding="2">\n']
        for f in sorted(flist or []):
            # the names are relative to outputdir, not to the current directory
            if not os.path.isdir(os.path.join(self.opts.outputdir, f)):
                fn = os.path.basename(f)
                res.append('<tr><td><a href="%s">%s</a></td></tr>\n' % (fn, getFileString(fn, self.opts.outputdir)))
        res.append('</table>\n')
        res.append('<a href="http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/">FastQC documentation and full attribution is here</a><br/><hr/>\n')
        res.append('FastQC was run by Galaxy using the rgenetics rgFastQC wrapper - see http://bitbucket.org/rgenetics for details and licensing\n</div>')
        # the previous code computed the insertion index before removing the
        # footer row, so the additions ended up after the final line
        return head + res + footer + tail  # with our additions

    def fix_fastqcimages(self, odpath):
        """ Galaxy wants everything in the same files_dir

        Moves every non-directory entry of odpath/Icons, odpath/Images and
        odpath itself into self.opts.outputdir, removing each directory
        once it has been processed.
        """
        icpath = os.path.join(odpath, 'Icons')
        impath = os.path.join(odpath, 'Images')
        # odpath goes last so that its subdirectories are already gone
        for adir in [icpath, impath, odpath]:
            if os.path.exists(adir):
                for f in os.listdir(adir):  # get all files created
                    sauce = os.path.join(adir, f)
                    if not os.path.isdir(sauce):
                        shutil.move(sauce, os.path.join(self.opts.outputdir, f))
                os.rmdir(adir)
189
190
191
if __name__ == '__main__':
    # Command line entry point: parse options, run FastQC through the
    # wrapper class and write the resulting html to the -o file.
    op = optparse.OptionParser()
    op.add_option('-i', '--input', default=None)
    op.add_option('-j', '--inputfilename', default=None)
    op.add_option('-o', '--htmloutput', default=None)
    op.add_option('-d', '--outputdir', default="/tmp/shortread")
    op.add_option('-f', '--informat', default='fastq')
    op.add_option('-n', '--namejob', default='rgFastQC')
    op.add_option('-c', '--contaminants', default=None)
    op.add_option('-e', '--executable', default='fastqc')
    opts, args = op.parse_args()
    # explicit checks instead of assert, which is stripped under python -O
    if opts.input is None:
        sys.exit('##rgFastQC.py error - no input file supplied (-i)')
    if opts.htmloutput is None:
        sys.exit('##rgFastQC.py error - no html output file supplied (-o)')
    if not os.path.isfile(opts.executable):
        sys.exit('##rgFastQC.py error - cannot find executable %s' % opts.executable)
    if not os.path.exists(opts.outputdir):
        os.makedirs(opts.outputdir)
    runner = FastQC(opts)
    html, retval, serr = runner.run_fastqc()
    with open(opts.htmloutput, 'w') as outf:
        outf.write(''.join(html))
    if retval != 0:
        sys.stderr.write('%s\n' % serr)  # indicate failure
214
215
216