Mercurial > repos > jjohnson > cummerbund
annotate cuffdiff_wrapper.py @ 2:fdf01b3c1841
Update to new cuffdiff wrapper, add cuffdb_info.txt to cummerbund html output
author | Jim Johnson <jj@umn.edu> |
---|---|
date | Fri, 08 Nov 2013 14:54:01 -0600 |
parents | ebb9a992508d |
children |
rev | line source |
---|---|
0 | 1 #!/usr/bin/env python |
2 | |
3 # Wrapper supports Cuffdiff versions v1.3.0-v2.0 | |
4 | |
5 import optparse, os, shutil, subprocess, sys, tempfile | |
6 | |
7 def group_callback( option, op_str, value, parser ): | |
8 groups = [] | |
9 flist = [] | |
10 for arg in parser.rargs: | |
11 arg = arg.strip() | |
12 if arg[0] is "-": | |
13 break | |
14 elif arg[0] is ",": | |
15 groups.append(flist) | |
16 flist = [] | |
17 else: | |
18 flist.append(arg) | |
19 groups.append(flist) | |
20 | |
21 setattr(parser.values, option.dest, groups) | |
22 | |
23 def label_callback( option, op_str, value, parser ): | |
24 labels = [] | |
25 for arg in parser.rargs: | |
26 arg = arg.strip() | |
27 if arg[0] is "-": | |
28 break | |
29 else: | |
30 labels.append(arg) | |
31 | |
32 setattr(parser.values, option.dest, labels) | |
33 | |
34 def stop_err( msg ): | |
35 sys.stderr.write( "%s\n" % msg ) | |
2
fdf01b3c1841
Update to new cuffdiff wrapper, add cuffdb_info.txt to cummerbund html output
Jim Johnson <jj@umn.edu>
parents:
1
diff
changeset
|
36 sys.exit(1) |
0 | 37 |
38 # Copied from sam_to_bam.py: | |
39 def check_seq_file( dbkey, cached_seqs_pointer_file ): | |
40 seq_path = '' | |
41 for line in open( cached_seqs_pointer_file ): | |
42 line = line.rstrip( '\r\n' ) | |
43 if line and not line.startswith( '#' ) and line.startswith( 'index' ): | |
44 fields = line.split( '\t' ) | |
45 if len( fields ) < 3: | |
46 continue | |
47 if fields[1] == dbkey: | |
48 seq_path = fields[2].strip() | |
49 break | |
50 return seq_path | |
51 | |
52 def __main__(): | |
53 #Parse Command Line | |
54 parser = optparse.OptionParser() | |
55 | |
56 # Cuffdiff options. | |
57 parser.add_option( '-s', '--inner-dist-std-dev', dest='inner_dist_std_dev', help='The standard deviation for the distribution on inner distances between mate pairs. The default is 20bp.' ) | |
58 parser.add_option( '-p', '--num-threads', dest='num_threads', help='Use this many threads to align reads. The default is 1.' ) | |
59 parser.add_option( '-m', '--inner-mean-dist', dest='inner_mean_dist', help='This is the expected (mean) inner distance between mate pairs. \ | |
60 For, example, for paired end runs with fragments selected at 300bp, \ | |
61 where each end is 50bp, you should set -r to be 200. The default is 45bp.') | |
62 parser.add_option( '-c', '--min-alignment-count', dest='min_alignment_count', help='The minimum number of alignments in a locus for needed to conduct significance testing on changes in that locus observed between samples. If no testing is performed, changes in the locus are deemed not signficant, and the locus\' observed changes don\'t contribute to correction for multiple testing. The default is 1,000 fragment alignments (up to 2,000 paired reads).' ) | |
63 parser.add_option( '--FDR', dest='FDR', help='The allowed false discovery rate. The default is 0.05.' ) | |
64 parser.add_option( '-u', '--multi-read-correct', dest='multi_read_correct', action="store_true", help='Tells Cufflinks to do an initial estimation procedure to more accurately weight reads mapping to multiple locations in the genome') | |
2
fdf01b3c1841
Update to new cuffdiff wrapper, add cuffdb_info.txt to cummerbund html output
Jim Johnson <jj@umn.edu>
parents:
1
diff
changeset
|
65 parser.add_option( '--library-norm-method', dest='library_norm_method' ) |
fdf01b3c1841
Update to new cuffdiff wrapper, add cuffdb_info.txt to cummerbund html output
Jim Johnson <jj@umn.edu>
parents:
1
diff
changeset
|
66 parser.add_option( '--dispersion-method', dest='dispersion_method' ) |
0 | 67 |
68 # Advanced Options: | |
69 parser.add_option( '--num-importance-samples', dest='num_importance_samples', help='Sets the number of importance samples generated for each locus during abundance estimation. Default: 1000' ) | |
70 parser.add_option( '--max-mle-iterations', dest='max_mle_iterations', help='Sets the number of iterations allowed during maximum likelihood estimation of abundances. Default: 5000' ) | |
71 | |
72 # Wrapper / Galaxy options. | |
73 parser.add_option( '-f', '--files', dest='groups', action="callback", callback=group_callback, help="Groups to be processed, groups are separated by spaces, replicates in a group comma separated. group1_rep1,group1_rep2 group2_rep1,group2_rep2, ..., groupN_rep1, groupN_rep2" ) | |
74 parser.add_option( '-A', '--inputA', dest='inputA', help='A transcript GTF file produced by cufflinks, cuffcompare, or other source.') | |
75 parser.add_option( '-1', '--input1', dest='input1', help='File of RNA-Seq read alignments in the SAM format. SAM is a standard short read alignment, that allows aligners to attach custom tags to individual alignments, and Cufflinks requires that the alignments you supply have some of these tags. Please see Input formats for more details.' ) | |
76 parser.add_option( '-2', '--input2', dest='input2', help='File of RNA-Seq read alignments in the SAM format. SAM is a standard short read alignment, that allows aligners to attach custom tags to individual alignments, and Cufflinks requires that the alignments you supply have some of these tags. Please see Input formats for more details.' ) | |
77 | |
78 # Label options | |
79 parser.add_option('-L', '--labels', dest='labels', action="callback", callback=label_callback, help="Labels for the groups the replicates are in.") | |
80 | |
81 # Normalization options. | |
82 parser.add_option( "-N", "--quartile-normalization", dest="do_normalization", action="store_true" ) | |
83 | |
84 # Bias correction options. | |
85 parser.add_option( '-b', dest='do_bias_correction', action="store_true", help='Providing Cufflinks with a multifasta file via this option instructs it to run our new bias detection and correction algorithm which can significantly improve accuracy of transcript abundance estimates.') | |
86 parser.add_option( '', '--dbkey', dest='dbkey', help='The build of the reference dataset' ) | |
87 parser.add_option( '', '--index_dir', dest='index_dir', help='GALAXY_DATA_INDEX_DIR' ) | |
88 parser.add_option( '', '--ref_file', dest='ref_file', help='The reference dataset from the history' ) | |
89 | |
90 # Outputs. | |
91 parser.add_option( "--isoforms_fpkm_tracking_output", dest="isoforms_fpkm_tracking_output" ) | |
92 parser.add_option( "--genes_fpkm_tracking_output", dest="genes_fpkm_tracking_output" ) | |
93 parser.add_option( "--cds_fpkm_tracking_output", dest="cds_fpkm_tracking_output" ) | |
94 parser.add_option( "--tss_groups_fpkm_tracking_output", dest="tss_groups_fpkm_tracking_output" ) | |
95 parser.add_option( "--isoforms_read_group_tracking_output", dest="isoforms_read_group_tracking_output", default=None) | |
96 parser.add_option( "--genes_read_group_tracking_output", dest="genes_read_group_tracking_output", default=None ) | |
97 parser.add_option( "--cds_read_group_tracking_output", dest="cds_read_group_tracking_output", default=None ) | |
98 parser.add_option( "--tss_groups_read_group_tracking_output", dest="tss_groups_read_group_tracking_output", default=None ) | |
99 parser.add_option( "--isoforms_exp_output", dest="isoforms_exp_output" ) | |
100 parser.add_option( "--genes_exp_output", dest="genes_exp_output" ) | |
101 parser.add_option( "--tss_groups_exp_output", dest="tss_groups_exp_output" ) | |
102 parser.add_option( "--cds_exp_fpkm_tracking_output", dest="cds_exp_fpkm_tracking_output" ) | |
2
fdf01b3c1841
Update to new cuffdiff wrapper, add cuffdb_info.txt to cummerbund html output
Jim Johnson <jj@umn.edu>
parents:
1
diff
changeset
|
103 parser.add_option( "--cds_diff_output", dest="cds_diff_output" ) |
0 | 104 parser.add_option( "--isoforms_count_tracking_output", dest="isoforms_count_tracking_output" ) |
105 parser.add_option( "--genes_count_tracking_output", dest="genes_count_tracking_output" ) | |
106 parser.add_option( "--cds_count_tracking_output", dest="cds_count_tracking_output" ) | |
107 parser.add_option( "--tss_groups_count_tracking_output", dest="tss_groups_count_tracking_output" ) | |
108 parser.add_option( "--splicing_diff_output", dest="splicing_diff_output" ) | |
109 parser.add_option( "--promoters_diff_output", dest="promoters_diff_output" ) | |
110 parser.add_option( "--run_info_output", dest="run_info_output" ) | |
111 parser.add_option( "--read_groups_info_output", dest="read_groups_info_output" ) | |
112 parser.add_option( "--cuffdatadir", dest="cuffdatadir", default=None) | |
113 parser.add_option( "--cummeRbund_db_output", dest="cummeRbund_db_output", default=None) | |
114 | |
115 (options, args) = parser.parse_args() | |
116 | |
117 # output version # of tool | |
118 try: | |
119 tmp = tempfile.NamedTemporaryFile().name | |
120 tmp_stdout = open( tmp, 'wb' ) | |
121 proc = subprocess.Popen( args='cuffdiff --no-update-check 2>&1', shell=True, stdout=tmp_stdout ) | |
122 tmp_stdout.close() | |
123 returncode = proc.wait() | |
124 stdout = None | |
125 for line in open( tmp_stdout.name, 'rb' ): | |
126 if line.lower().find( 'cuffdiff v' ) >= 0: | |
127 stdout = line.strip() | |
128 break | |
129 if stdout: | |
130 sys.stdout.write( '%s\n' % stdout ) | |
131 else: | |
132 raise Exception | |
133 except: | |
134 sys.stdout.write( 'Could not determine Cuffdiff version\n' ) | |
135 | |
136 # Make temp directory for output. | |
137 tmp_output_dir = tempfile.mkdtemp() | |
138 cuffdatadir = options.cuffdatadir if options.cuffdatadir else tmp_output_dir | |
139 if not os.path.exists( cuffdatadir ): | |
140 os.makedirs( cuffdatadir ) | |
141 | |
142 # If doing bias correction, set/link to sequence file. | |
143 if options.do_bias_correction: | |
144 if options.ref_file != 'None': | |
145 # Sequence data from history. | |
146 # Create symbolic link to ref_file so that index will be created in working directory. | |
147 seq_path = os.path.join( cuffdatadir, "ref.fa" ) | |
148 os.symlink( options.ref_file, seq_path ) | |
149 else: | |
150 # Sequence data from loc file. | |
151 cached_seqs_pointer_file = os.path.join( options.index_dir, 'sam_fa_indices.loc' ) | |
152 if not os.path.exists( cached_seqs_pointer_file ): | |
153 stop_err( 'The required file (%s) does not exist.' % cached_seqs_pointer_file ) | |
154 # If found for the dbkey, seq_path will look something like /galaxy/data/equCab2/sam_index/equCab2.fa, | |
155 # and the equCab2.fa file will contain fasta sequences. | |
156 seq_path = check_seq_file( options.dbkey, cached_seqs_pointer_file ) | |
157 if seq_path == '': | |
158 stop_err( 'No sequence data found for dbkey %s, so bias correction cannot be used.' % options.dbkey ) | |
159 | |
160 # Build command. | |
161 | |
162 # Base; always use quiet mode to avoid problems with storing log output. | |
163 cmd = "cuffdiff --no-update-check -q" | |
164 | |
165 # Add options. | |
2
fdf01b3c1841
Update to new cuffdiff wrapper, add cuffdb_info.txt to cummerbund html output
Jim Johnson <jj@umn.edu>
parents:
1
diff
changeset
|
166 if options.library_norm_method: |
fdf01b3c1841
Update to new cuffdiff wrapper, add cuffdb_info.txt to cummerbund html output
Jim Johnson <jj@umn.edu>
parents:
1
diff
changeset
|
167 cmd += ( " --library-norm-method %s" % options.library_norm_method ) |
fdf01b3c1841
Update to new cuffdiff wrapper, add cuffdb_info.txt to cummerbund html output
Jim Johnson <jj@umn.edu>
parents:
1
diff
changeset
|
168 if options.dispersion_method: |
fdf01b3c1841
Update to new cuffdiff wrapper, add cuffdb_info.txt to cummerbund html output
Jim Johnson <jj@umn.edu>
parents:
1
diff
changeset
|
169 cmd += ( " --dispersion-method %s" % options.dispersion_method ) |
0 | 170 if options.inner_dist_std_dev: |
171 cmd += ( " -s %i" % int ( options.inner_dist_std_dev ) ) | |
172 if options.num_threads: | |
173 cmd += ( " -p %i" % int ( options.num_threads ) ) | |
174 if options.inner_mean_dist: | |
175 cmd += ( " -m %i" % int ( options.inner_mean_dist ) ) | |
176 if options.min_alignment_count: | |
177 cmd += ( " -c %i" % int ( options.min_alignment_count ) ) | |
178 if options.FDR: | |
179 cmd += ( " --FDR %f" % float( options.FDR ) ) | |
180 if options.multi_read_correct: | |
181 cmd += ( " -u" ) | |
182 if options.num_importance_samples: | |
183 cmd += ( " --num-importance-samples %i" % int ( options.num_importance_samples ) ) | |
184 if options.max_mle_iterations: | |
185 cmd += ( " --max-mle-iterations %i" % int ( options.max_mle_iterations ) ) | |
186 if options.do_normalization: | |
187 cmd += ( " -N" ) | |
188 if options.do_bias_correction: | |
189 cmd += ( " -b %s" % seq_path ) | |
190 | |
191 # Add inputs. | |
192 # For replicate analysis: group1_rep1,group1_rep2 groupN_rep1,groupN_rep2 | |
193 if options.groups: | |
194 cmd += " --labels " | |
195 for label in options.labels: | |
2
fdf01b3c1841
Update to new cuffdiff wrapper, add cuffdb_info.txt to cummerbund html output
Jim Johnson <jj@umn.edu>
parents:
1
diff
changeset
|
196 cmd += '"%s",' % label |
0 | 197 cmd = cmd[:-1] |
198 | |
199 cmd += " " + options.inputA + " " | |
200 | |
201 for group in options.groups: | |
202 for filename in group: | |
203 cmd += filename + "," | |
204 cmd = cmd[:-1] + " " | |
205 else: | |
206 cmd += " " + options.inputA + " " + options.input1 + " " + options.input2 | |
207 | |
208 # Debugging. | |
209 print cmd | |
210 | |
211 # Run command. | |
212 try: | |
213 tmp_name = tempfile.NamedTemporaryFile( dir=tmp_output_dir ).name | |
214 tmp_stderr = open( tmp_name, 'wb' ) | |
215 proc = subprocess.Popen( args=cmd, shell=True, cwd=cuffdatadir, stderr=tmp_stderr.fileno() ) | |
216 returncode = proc.wait() | |
217 tmp_stderr.close() | |
218 | |
219 # Get stderr, allowing for case where it's very large. | |
220 tmp_stderr = open( tmp_name, 'rb' ) | |
221 stderr = '' | |
222 buffsize = 1048576 | |
223 try: | |
224 while True: | |
225 stderr += tmp_stderr.read( buffsize ) | |
226 if not stderr or len( stderr ) % buffsize != 0: | |
227 break | |
228 except OverflowError: | |
229 pass | |
230 tmp_stderr.close() | |
231 | |
232 # Error checking. | |
233 if returncode != 0: | |
234 raise Exception, stderr | |
235 | |
236 # check that there are results in the output file | |
237 if len( open( os.path.join( cuffdatadir, "isoforms.fpkm_tracking" ), 'rb' ).read().strip() ) == 0: | |
238 raise Exception, 'The main output file is empty, there may be an error with your input file or settings.' | |
239 except Exception, e: | |
240 stop_err( 'Error running cuffdiff. ' + str( e ) ) | |
241 | |
242 | |
243 # Copy output files from tmp directory to specified files. | |
244 try: | |
245 try: | |
246 if options.isoforms_fpkm_tracking_output and os.path.exists(os.path.join( cuffdatadir, "isoforms.fpkm_tracking" )): | |
247 shutil.copyfile( os.path.join( cuffdatadir, "isoforms.fpkm_tracking" ), options.isoforms_fpkm_tracking_output ) | |
248 if options.genes_fpkm_tracking_output and os.path.exists(os.path.join( cuffdatadir, "genes.fpkm_tracking" )): | |
249 shutil.copyfile( os.path.join( cuffdatadir, "genes.fpkm_tracking" ), options.genes_fpkm_tracking_output ) | |
250 if options.cds_fpkm_tracking_output and os.path.exists(os.path.join( cuffdatadir, "cds.fpkm_tracking" )): | |
251 shutil.copyfile( os.path.join( cuffdatadir, "cds.fpkm_tracking" ), options.cds_fpkm_tracking_output ) | |
252 if options.tss_groups_fpkm_tracking_output and os.path.exists(os.path.join( cuffdatadir, "tss_groups.fpkm_tracking" )): | |
253 shutil.copyfile( os.path.join( cuffdatadir, "tss_groups.fpkm_tracking" ), options.tss_groups_fpkm_tracking_output ) | |
254 | |
255 if options.isoforms_read_group_tracking_output and os.path.exists(os.path.join( cuffdatadir, "isoforms.read_group_tracking" )): | |
256 shutil.copyfile( os.path.join( cuffdatadir, "isoforms.read_group_tracking" ), options.isoforms_read_group_tracking_output ) | |
257 if options.genes_read_group_tracking_output and os.path.exists(os.path.join( cuffdatadir, "genes.read_group_tracking" )): | |
258 shutil.copyfile( os.path.join( cuffdatadir, "genes.read_group_tracking" ), options.genes_read_group_tracking_output ) | |
259 if options.cds_read_group_tracking_output and os.path.exists(os.path.join( cuffdatadir, "cds.read_group_tracking" )): | |
260 shutil.copyfile( os.path.join( cuffdatadir, "cds.read_group_tracking" ), options.cds_read_group_tracking_output ) | |
261 if options.tss_groups_read_group_tracking_output and os.path.exists(os.path.join( cuffdatadir, "tss_groups.read_group_tracking" )): | |
262 shutil.copyfile( os.path.join( cuffdatadir, "tss_groups.read_group_tracking" ), options.tss_groups_read_group_tracking_output ) | |
263 | |
264 if options.isoforms_exp_output and os.path.exists(os.path.join( cuffdatadir, "isoform_exp.diff" )): | |
265 shutil.copyfile( os.path.join( cuffdatadir, "isoform_exp.diff" ), options.isoforms_exp_output ) | |
266 if options.genes_exp_output and os.path.exists(os.path.join( cuffdatadir, "gene_exp.diff" )): | |
267 shutil.copyfile( os.path.join( cuffdatadir, "gene_exp.diff" ), options.genes_exp_output ) | |
268 if options.cds_exp_fpkm_tracking_output and os.path.exists(os.path.join( cuffdatadir, "cds_exp.diff" )): | |
269 shutil.copyfile( os.path.join( cuffdatadir, "cds_exp.diff" ), options.cds_exp_fpkm_tracking_output ) | |
270 if options.tss_groups_exp_output and os.path.exists(os.path.join( cuffdatadir, "tss_group_exp.diff" )): | |
271 shutil.copyfile( os.path.join( cuffdatadir, "tss_group_exp.diff" ), options.tss_groups_exp_output ) | |
272 | |
273 if options.isoforms_count_tracking_output and os.path.exists(os.path.join( cuffdatadir, "isoforms.count_tracking" )): | |
274 shutil.copyfile( os.path.join( cuffdatadir, "isoforms.count_tracking" ), options.isoforms_count_tracking_output ) | |
275 if options.genes_count_tracking_output and os.path.exists(os.path.join( cuffdatadir, "genes.count_tracking" )): | |
276 shutil.copyfile( os.path.join( cuffdatadir, "genes.count_tracking" ), options.genes_count_tracking_output ) | |
277 if options.cds_count_tracking_output and os.path.exists(os.path.join( cuffdatadir, "cds.count_tracking" )): | |
278 shutil.copyfile( os.path.join( cuffdatadir, "cds.count_tracking" ), options.cds_count_tracking_output ) | |
279 if options.tss_groups_count_tracking_output and os.path.exists(os.path.join( cuffdatadir, "tss_groups.count_tracking" )): | |
280 shutil.copyfile( os.path.join( cuffdatadir, "tss_groups.count_tracking" ), options.tss_groups_count_tracking_output ) | |
281 | |
282 if options.cds_diff_output and os.path.exists(os.path.join( cuffdatadir, "cds.diff" )): | |
283 shutil.copyfile( os.path.join( cuffdatadir, "cds.diff" ), options.cds_diff_output ) | |
284 | |
285 if options.splicing_diff_output and os.path.exists(os.path.join( cuffdatadir, "splicing.diff" )): | |
286 shutil.copyfile( os.path.join( cuffdatadir, "splicing.diff" ), options.splicing_diff_output ) | |
287 if options.promoters_diff_output and os.path.exists(os.path.join( cuffdatadir, "promoters.diff" )): | |
288 shutil.copyfile( os.path.join( cuffdatadir, "promoters.diff" ), options.promoters_diff_output ) | |
289 | |
290 if options.run_info_output and os.path.exists(os.path.join( cuffdatadir, "run.info" )): | |
291 shutil.copyfile( os.path.join( cuffdatadir, "run.info" ), options.run_info_output ) | |
292 if options.read_groups_info_output and os.path.exists(os.path.join( cuffdatadir, "read_groups.info" )): | |
293 shutil.copyfile( os.path.join( cuffdatadir, "read_groups.info" ), options.read_groups_info_output ) | |
294 | |
295 except Exception, e: | |
296 stop_err( 'Error in cuffdiff:\n' + str( e ) ) | |
297 if options.cummeRbund_db_output: | |
298 try: | |
299 dbFile = 'cuffData.db' | |
300 rscript = tempfile.NamedTemporaryFile( dir=tmp_output_dir,suffix='.r' ).name | |
301 rscript_fh = open( rscript, 'wb' ) | |
302 rscript_fh.write('library(cummeRbund)\n') | |
303 if options.inputA and options.ref_file: | |
304 rscript_fh.write('cuff<-readCufflinks(dir = "%s", dbFile = "%s", gtfFile = "%s", genome = "%s", rebuild = T)\n' % (cuffdatadir,dbFile,options.inputA,options.ref_file)) | |
305 else: | |
306 rscript_fh.write('cuff<-readCufflinks(dir = "%s", dbFile = "%s", rebuild = T)\n' % (cuffdatadir,dbFile)) | |
307 rscript_fh.close() | |
308 cmd = ( "Rscript --vanilla %s" % rscript ) | |
309 tmp_name = tempfile.NamedTemporaryFile( dir=tmp_output_dir ).name | |
310 tmp_stderr = open( tmp_name, 'wb' ) | |
311 proc = subprocess.Popen( args=cmd, shell=True, stderr=tmp_stderr.fileno() ) | |
312 #proc = subprocess.Popen( args=cmd, shell=True) | |
313 returncode = proc.wait() | |
314 tmp_stderr.close() | |
315 if os.path.exists(os.path.join( cuffdatadir, dbFile )): | |
316 shutil.copyfile( os.path.join( cuffdatadir, dbFile ), options.cummeRbund_db_output ) | |
317 shutil.rmtree(os.path.join( cuffdatadir, dbFile )) | |
318 except Exception, e: | |
319 stop_err( 'Error generating cummeRbund cuffData.db:\n' + str( e ) ) | |
320 finally: | |
321 # Clean up temp dirs | |
322 if os.path.exists( tmp_output_dir ): | |
323 shutil.rmtree( tmp_output_dir ) | |
324 | |
325 if __name__=="__main__": __main__() |