comparison cuffdiff_wrapper.py @ 0:da7241f92ecf

Uploaded
author jjohnson
date Mon, 04 Feb 2013 19:50:25 -0500
parents
children ebb9a992508d
comparison
equal deleted inserted replaced
-1:000000000000 0:da7241f92ecf
1 #!/usr/bin/env python
2
3 # Wrapper supports Cuffdiff versions v1.3.0-v2.0
4
5 import optparse, os, shutil, subprocess, sys, tempfile
6
7 def group_callback( option, op_str, value, parser ):
8 groups = []
9 flist = []
10 for arg in parser.rargs:
11 arg = arg.strip()
12 if arg[0] is "-":
13 break
14 elif arg[0] is ",":
15 groups.append(flist)
16 flist = []
17 else:
18 flist.append(arg)
19 groups.append(flist)
20
21 setattr(parser.values, option.dest, groups)
22
23 def label_callback( option, op_str, value, parser ):
24 labels = []
25 for arg in parser.rargs:
26 arg = arg.strip()
27 if arg[0] is "-":
28 break
29 else:
30 labels.append(arg)
31
32 setattr(parser.values, option.dest, labels)
33
34 def stop_err( msg ):
35 sys.stderr.write( "%s\n" % msg )
36 sys.exit()
37
38 # Copied from sam_to_bam.py:
39 def check_seq_file( dbkey, cached_seqs_pointer_file ):
40 seq_path = ''
41 for line in open( cached_seqs_pointer_file ):
42 line = line.rstrip( '\r\n' )
43 if line and not line.startswith( '#' ) and line.startswith( 'index' ):
44 fields = line.split( '\t' )
45 if len( fields ) < 3:
46 continue
47 if fields[1] == dbkey:
48 seq_path = fields[2].strip()
49 break
50 return seq_path
51
52 def __main__():
53 #Parse Command Line
54 parser = optparse.OptionParser()
55
56 # Cuffdiff options.
57 parser.add_option( '-s', '--inner-dist-std-dev', dest='inner_dist_std_dev', help='The standard deviation for the distribution on inner distances between mate pairs. The default is 20bp.' )
58 parser.add_option( '-p', '--num-threads', dest='num_threads', help='Use this many threads to align reads. The default is 1.' )
59 parser.add_option( '-m', '--inner-mean-dist', dest='inner_mean_dist', help='This is the expected (mean) inner distance between mate pairs. \
60 For, example, for paired end runs with fragments selected at 300bp, \
61 where each end is 50bp, you should set -r to be 200. The default is 45bp.')
62 parser.add_option( '-c', '--min-alignment-count', dest='min_alignment_count', help='The minimum number of alignments in a locus for needed to conduct significance testing on changes in that locus observed between samples. If no testing is performed, changes in the locus are deemed not signficant, and the locus\' observed changes don\'t contribute to correction for multiple testing. The default is 1,000 fragment alignments (up to 2,000 paired reads).' )
63 parser.add_option( '--FDR', dest='FDR', help='The allowed false discovery rate. The default is 0.05.' )
64 parser.add_option( '-u', '--multi-read-correct', dest='multi_read_correct', action="store_true", help='Tells Cufflinks to do an initial estimation procedure to more accurately weight reads mapping to multiple locations in the genome')
65
66 # Advanced Options:
67 parser.add_option( '--num-importance-samples', dest='num_importance_samples', help='Sets the number of importance samples generated for each locus during abundance estimation. Default: 1000' )
68 parser.add_option( '--max-mle-iterations', dest='max_mle_iterations', help='Sets the number of iterations allowed during maximum likelihood estimation of abundances. Default: 5000' )
69
70 # Wrapper / Galaxy options.
71 parser.add_option( '-f', '--files', dest='groups', action="callback", callback=group_callback, help="Groups to be processed, groups are separated by spaces, replicates in a group comma separated. group1_rep1,group1_rep2 group2_rep1,group2_rep2, ..., groupN_rep1, groupN_rep2" )
72 parser.add_option( '-A', '--inputA', dest='inputA', help='A transcript GTF file produced by cufflinks, cuffcompare, or other source.')
73 parser.add_option( '-1', '--input1', dest='input1', help='File of RNA-Seq read alignments in the SAM format. SAM is a standard short read alignment, that allows aligners to attach custom tags to individual alignments, and Cufflinks requires that the alignments you supply have some of these tags. Please see Input formats for more details.' )
74 parser.add_option( '-2', '--input2', dest='input2', help='File of RNA-Seq read alignments in the SAM format. SAM is a standard short read alignment, that allows aligners to attach custom tags to individual alignments, and Cufflinks requires that the alignments you supply have some of these tags. Please see Input formats for more details.' )
75
76 # Label options
77 parser.add_option('-L', '--labels', dest='labels', action="callback", callback=label_callback, help="Labels for the groups the replicates are in.")
78
79 # Normalization options.
80 parser.add_option( "-N", "--quartile-normalization", dest="do_normalization", action="store_true" )
81
82 # Bias correction options.
83 parser.add_option( '-b', dest='do_bias_correction', action="store_true", help='Providing Cufflinks with a multifasta file via this option instructs it to run our new bias detection and correction algorithm which can significantly improve accuracy of transcript abundance estimates.')
84 parser.add_option( '', '--dbkey', dest='dbkey', help='The build of the reference dataset' )
85 parser.add_option( '', '--index_dir', dest='index_dir', help='GALAXY_DATA_INDEX_DIR' )
86 parser.add_option( '', '--ref_file', dest='ref_file', help='The reference dataset from the history' )
87
88 # Outputs.
89 parser.add_option( "--isoforms_fpkm_tracking_output", dest="isoforms_fpkm_tracking_output" )
90 parser.add_option( "--genes_fpkm_tracking_output", dest="genes_fpkm_tracking_output" )
91 parser.add_option( "--cds_fpkm_tracking_output", dest="cds_fpkm_tracking_output" )
92 parser.add_option( "--tss_groups_fpkm_tracking_output", dest="tss_groups_fpkm_tracking_output" )
93 parser.add_option( "--isoforms_read_group_tracking_output", dest="isoforms_read_group_tracking_output", default=None)
94 parser.add_option( "--genes_read_group_tracking_output", dest="genes_read_group_tracking_output", default=None )
95 parser.add_option( "--cds_read_group_tracking_output", dest="cds_read_group_tracking_output", default=None )
96 parser.add_option( "--tss_groups_read_group_tracking_output", dest="tss_groups_read_group_tracking_output", default=None )
97 parser.add_option( "--isoforms_exp_output", dest="isoforms_exp_output" )
98 parser.add_option( "--genes_exp_output", dest="genes_exp_output" )
99 parser.add_option( "--tss_groups_exp_output", dest="tss_groups_exp_output" )
100 parser.add_option( "--cds_exp_fpkm_tracking_output", dest="cds_exp_fpkm_tracking_output" )
101 parser.add_option( "--cds_diff_output", dest="cds_diff_output" )
102 parser.add_option( "--isoforms_count_tracking_output", dest="isoforms_count_tracking_output" )
103 parser.add_option( "--genes_count_tracking_output", dest="genes_count_tracking_output" )
104 parser.add_option( "--cds_count_tracking_output", dest="cds_count_tracking_output" )
105 parser.add_option( "--tss_groups_count_tracking_output", dest="tss_groups_count_tracking_output" )
106 parser.add_option( "--splicing_diff_output", dest="splicing_diff_output" )
107 parser.add_option( "--cds_diff_output", dest="cds_diff_output" )
108 parser.add_option( "--promoters_diff_output", dest="promoters_diff_output" )
109 parser.add_option( "--run_info_output", dest="run_info_output" )
110 parser.add_option( "--read_groups_info_output", dest="read_groups_info_output" )
111 parser.add_option( "--cuffdatadir", dest="cuffdatadir", default=None)
112 parser.add_option( "--cummeRbund_db_output", dest="cummeRbund_db_output", default=None)
113
114 (options, args) = parser.parse_args()
115
116 # output version # of tool
117 try:
118 tmp = tempfile.NamedTemporaryFile().name
119 tmp_stdout = open( tmp, 'wb' )
120 proc = subprocess.Popen( args='cuffdiff --no-update-check 2>&1', shell=True, stdout=tmp_stdout )
121 tmp_stdout.close()
122 returncode = proc.wait()
123 stdout = None
124 for line in open( tmp_stdout.name, 'rb' ):
125 if line.lower().find( 'cuffdiff v' ) >= 0:
126 stdout = line.strip()
127 break
128 if stdout:
129 sys.stdout.write( '%s\n' % stdout )
130 else:
131 raise Exception
132 except:
133 sys.stdout.write( 'Could not determine Cuffdiff version\n' )
134
135 # Make temp directory for output.
136 tmp_output_dir = tempfile.mkdtemp()
137 cuffdatadir = options.cuffdatadir if options.cuffdatadir else tmp_output_dir
138 if not os.path.exists( cuffdatadir ):
139 os.makedirs( cuffdatadir )
140
141
142 # If doing bias correction, set/link to sequence file.
143 if options.do_bias_correction:
144 if options.ref_file != 'None':
145 # Sequence data from history.
146 # Create symbolic link to ref_file so that index will be created in working directory.
147 seq_path = os.path.join( cuffdatadir, "ref.fa" )
148 os.symlink( options.ref_file, seq_path )
149 else:
150 # Sequence data from loc file.
151 cached_seqs_pointer_file = os.path.join( options.index_dir, 'sam_fa_indices.loc' )
152 if not os.path.exists( cached_seqs_pointer_file ):
153 stop_err( 'The required file (%s) does not exist.' % cached_seqs_pointer_file )
154 # If found for the dbkey, seq_path will look something like /galaxy/data/equCab2/sam_index/equCab2.fa,
155 # and the equCab2.fa file will contain fasta sequences.
156 seq_path = check_seq_file( options.dbkey, cached_seqs_pointer_file )
157 if seq_path == '':
158 stop_err( 'No sequence data found for dbkey %s, so bias correction cannot be used.' % options.dbkey )
159
160 # Build command.
161
162 # Base; always use quiet mode to avoid problems with storing log output.
163 cmd = "cuffdiff --no-update-check -q"
164
165 # Add options.
166 if options.inner_dist_std_dev:
167 cmd += ( " -s %i" % int ( options.inner_dist_std_dev ) )
168 if options.num_threads:
169 cmd += ( " -p %i" % int ( options.num_threads ) )
170 if options.inner_mean_dist:
171 cmd += ( " -m %i" % int ( options.inner_mean_dist ) )
172 if options.min_alignment_count:
173 cmd += ( " -c %i" % int ( options.min_alignment_count ) )
174 if options.FDR:
175 cmd += ( " --FDR %f" % float( options.FDR ) )
176 if options.multi_read_correct:
177 cmd += ( " -u" )
178 if options.num_importance_samples:
179 cmd += ( " --num-importance-samples %i" % int ( options.num_importance_samples ) )
180 if options.max_mle_iterations:
181 cmd += ( " --max-mle-iterations %i" % int ( options.max_mle_iterations ) )
182 if options.do_normalization:
183 cmd += ( " -N" )
184 if options.do_bias_correction:
185 cmd += ( " -b %s" % seq_path )
186
187 # Add inputs.
188 # For replicate analysis: group1_rep1,group1_rep2 groupN_rep1,groupN_rep2
189 if options.groups:
190 cmd += " --labels "
191 for label in options.labels:
192 cmd += label + ","
193 cmd = cmd[:-1]
194
195 cmd += " " + options.inputA + " "
196
197 for group in options.groups:
198 for filename in group:
199 cmd += filename + ","
200 cmd = cmd[:-1] + " "
201 else:
202 cmd += " " + options.inputA + " " + options.input1 + " " + options.input2
203
204 # Debugging.
205 print cmd
206
207 # Run command.
208 try:
209 tmp_name = tempfile.NamedTemporaryFile( dir=tmp_output_dir ).name
210 tmp_stderr = open( tmp_name, 'wb' )
211 proc = subprocess.Popen( args=cmd, shell=True, cwd=cuffdatadir, stderr=tmp_stderr.fileno() )
212 returncode = proc.wait()
213 tmp_stderr.close()
214
215 # Get stderr, allowing for case where it's very large.
216 tmp_stderr = open( tmp_name, 'rb' )
217 stderr = ''
218 buffsize = 1048576
219 try:
220 while True:
221 stderr += tmp_stderr.read( buffsize )
222 if not stderr or len( stderr ) % buffsize != 0:
223 break
224 except OverflowError:
225 pass
226 tmp_stderr.close()
227
228 # Error checking.
229 if returncode != 0:
230 raise Exception, stderr
231
232 # check that there are results in the output file
233 if len( open( os.path.join( cuffdatadir, "isoforms.fpkm_tracking" ), 'rb' ).read().strip() ) == 0:
234 raise Exception, 'The main output file is empty, there may be an error with your input file or settings.'
235 except Exception, e:
236 stop_err( 'Error running cuffdiff. ' + str( e ) )
237
238
239 # Copy output files from tmp directory to specified files.
240 try:
241 try:
242 if options.isoforms_fpkm_tracking_output and os.path.exists(os.path.join( cuffdatadir, "isoforms.fpkm_tracking" )):
243 shutil.copyfile( os.path.join( cuffdatadir, "isoforms.fpkm_tracking" ), options.isoforms_fpkm_tracking_output )
244 if options.genes_fpkm_tracking_output and os.path.exists(os.path.join( cuffdatadir, "genes.fpkm_tracking" )):
245 shutil.copyfile( os.path.join( cuffdatadir, "genes.fpkm_tracking" ), options.genes_fpkm_tracking_output )
246 if options.cds_fpkm_tracking_output and os.path.exists(os.path.join( cuffdatadir, "cds.fpkm_tracking" )):
247 shutil.copyfile( os.path.join( cuffdatadir, "cds.fpkm_tracking" ), options.cds_fpkm_tracking_output )
248 if options.tss_groups_fpkm_tracking_output and os.path.exists(os.path.join( cuffdatadir, "tss_groups.fpkm_tracking" )):
249 shutil.copyfile( os.path.join( cuffdatadir, "tss_groups.fpkm_tracking" ), options.tss_groups_fpkm_tracking_output )
250
251 if options.isoforms_read_group_tracking_output and os.path.exists(os.path.join( cuffdatadir, "isoforms.read_group_tracking" )):
252 shutil.copyfile( os.path.join( cuffdatadir, "isoforms.read_group_tracking" ), options.isoforms_read_group_tracking_output )
253 if options.genes_read_group_tracking_output and os.path.exists(os.path.join( cuffdatadir, "genes.read_group_tracking" )):
254 shutil.copyfile( os.path.join( cuffdatadir, "genes.read_group_tracking" ), options.genes_read_group_tracking_output )
255 if options.cds_read_group_tracking_output and os.path.exists(os.path.join( cuffdatadir, "cds.read_group_tracking" )):
256 shutil.copyfile( os.path.join( cuffdatadir, "cds.read_group_tracking" ), options.cds_read_group_tracking_output )
257 if options.tss_groups_read_group_tracking_output and os.path.exists(os.path.join( cuffdatadir, "tss_groups.read_group_tracking" )):
258 shutil.copyfile( os.path.join( cuffdatadir, "tss_groups.read_group_tracking" ), options.tss_groups_read_group_tracking_output )
259
260 if options.isoforms_exp_output and os.path.exists(os.path.join( cuffdatadir, "isoform_exp.diff" )):
261 shutil.copyfile( os.path.join( cuffdatadir, "isoform_exp.diff" ), options.isoforms_exp_output )
262 if options.genes_exp_output and os.path.exists(os.path.join( cuffdatadir, "gene_exp.diff" )):
263 shutil.copyfile( os.path.join( cuffdatadir, "gene_exp.diff" ), options.genes_exp_output )
264 if options.cds_exp_fpkm_tracking_output and os.path.exists(os.path.join( cuffdatadir, "cds_exp.diff" )):
265 shutil.copyfile( os.path.join( cuffdatadir, "cds_exp.diff" ), options.cds_exp_fpkm_tracking_output )
266 if options.tss_groups_exp_output and os.path.exists(os.path.join( cuffdatadir, "tss_group_exp.diff" )):
267 shutil.copyfile( os.path.join( cuffdatadir, "tss_group_exp.diff" ), options.tss_groups_exp_output )
268
269 if options.isoforms_count_tracking_output and os.path.exists(os.path.join( cuffdatadir, "isoforms.count_tracking" )):
270 shutil.copyfile( os.path.join( cuffdatadir, "isoforms.count_tracking" ), options.isoforms_count_tracking_output )
271 if options.genes_count_tracking_output and os.path.exists(os.path.join( cuffdatadir, "genes.count_tracking" )):
272 shutil.copyfile( os.path.join( cuffdatadir, "genes.count_tracking" ), options.genes_count_tracking_output )
273 if options.cds_count_tracking_output and os.path.exists(os.path.join( cuffdatadir, "cds.count_tracking" )):
274 shutil.copyfile( os.path.join( cuffdatadir, "cds.count_tracking" ), options.cds_count_tracking_output )
275 if options.tss_groups_count_tracking_output and os.path.exists(os.path.join( cuffdatadir, "tss_groups.count_tracking" )):
276 shutil.copyfile( os.path.join( cuffdatadir, "tss_groups.count_tracking" ), options.tss_groups_count_tracking_output )
277
278 if options.cds_diff_output and os.path.exists(os.path.join( cuffdatadir, "cds.diff" )):
279 shutil.copyfile( os.path.join( cuffdatadir, "cds.diff" ), options.cds_diff_output )
280
281 if options.splicing_diff_output and os.path.exists(os.path.join( cuffdatadir, "splicing.diff" )):
282 shutil.copyfile( os.path.join( cuffdatadir, "splicing.diff" ), options.splicing_diff_output )
283 if options.promoters_diff_output and os.path.exists(os.path.join( cuffdatadir, "promoters.diff" )):
284 shutil.copyfile( os.path.join( cuffdatadir, "promoters.diff" ), options.promoters_diff_output )
285
286 if options.run_info_output and os.path.exists(os.path.join( cuffdatadir, "run.info" )):
287 shutil.copyfile( os.path.join( cuffdatadir, "run.info" ), options.run_info_output )
288 if options.read_groups_info_output and os.path.exists(os.path.join( cuffdatadir, "read_groups.info" )):
289 shutil.copyfile( os.path.join( cuffdatadir, "read_groups.info" ), options.read_groups_info_output )
290
291 except Exception, e:
292 stop_err( 'Error in cuffdiff:\n' + str( e ) )
293 if options.cummeRbund_db_output:
294 try:
295 dbFile = 'cuffData.db'
296 rscript = tempfile.NamedTemporaryFile( dir=tmp_output_dir,suffix='.r' ).name
297 rscript_fh = open( rscript, 'wb' )
298 rscript_fh.write('library(cummeRbund)\n')
299 if options.inputA and options.ref_file:
300 rscript_fh.write('cuff<-readCufflinks(dir = "%s", dbFile = "%s", gtfFile = "%s", genome = "%s", rebuild = T)\n' % (cuffdatadir,dbFile,options.inputA,options.ref_file))
301 else:
302 rscript_fh.write('cuff<-readCufflinks(dir = "%s", dbFile = "%s", rebuild = T)\n' % (cuffdatadir,dbFile))
303 rscript_fh.close()
304 cmd = ( "Rscript --vanilla %s" % rscript )
305 tmp_name = tempfile.NamedTemporaryFile( dir=tmp_output_dir ).name
306 tmp_stderr = open( tmp_name, 'wb' )
307 proc = subprocess.Popen( args=cmd, shell=True, stderr=tmp_stderr.fileno() )
308 #proc = subprocess.Popen( args=cmd, shell=True)
309 returncode = proc.wait()
310 tmp_stderr.close()
311 if os.path.exists(os.path.join( cuffdatadir, dbFile )):
312 shutil.copyfile( os.path.join( cuffdatadir, dbFile ), options.cummeRbund_db_output )
313 shutil.rmtree(os.path.join( cuffdatadir, dbFile ))
314 except Exception, e:
315 stop_err( 'Error generating cummeRbund cuffData.db:\n' + str( e ) )
316 finally:
317 # Clean up temp dirs
318 if os.path.exists( tmp_output_dir ):
319 shutil.rmtree( tmp_output_dir )
320
321 if __name__=="__main__": __main__()