comparison cufflinks_wrapper.py @ 12:d080005cffe1 draft default tip

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tool_collections/cufflinks/cufflinks commit a0b0845a9d1b3e7ecdeacd1e606133617e3918bd"
author iuc
date Tue, 16 Jun 2020 13:00:32 -0400
parents e04dbae2abe0
children
comparison
equal deleted inserted replaced
11:e04dbae2abe0 12:d080005cffe1
1 #!/usr/bin/env python
2
3 import optparse
4 import os
5 import shutil
6 import subprocess
7 import sys
8 import tempfile
9
10
11 def parse_gff_attributes( attr_str ):
12 """
13 Parses a GFF/GTF attribute string and returns a dictionary of name-value
14 pairs. The general format for a GFF3 attributes string is
15
16 name1=value1;name2=value2
17
18 The general format for a GTF attribute string is
19
20 name1 "value1" ; name2 "value2"
21
22 The general format for a GFF attribute string is a single string that
23 denotes the interval's group; in this case, method returns a dictionary
24 with a single key-value pair, and key name is 'group'
25 """
26 attributes_list = attr_str.split(";")
27 attributes = {}
28 for name_value_pair in attributes_list:
29 # Try splitting by '=' (GFF3) first because spaces are allowed in GFF3
30 # attribute; next, try double quotes for GTF.
31 pair = name_value_pair.strip().split("=")
32 if len( pair ) == 1:
33 pair = name_value_pair.strip().split("\"")
34 if len( pair ) == 1:
35 # Could not split for some reason -- raise exception?
36 continue
37 if pair == '':
38 continue
39 name = pair[0].strip()
40 if name == '':
41 continue
42 # Need to strip double quote from values
43 value = pair[1].strip(" \"")
44 attributes[ name ] = value
45
46 if len( attributes ) == 0:
47 # Could not split attributes string, so entire string must be
48 # 'group' attribute. This is the case for strictly GFF files.
49 attributes['group'] = attr_str
50 return attributes
51
52
53 def gff_attributes_to_str( attrs, gff_format ):
54 """
55 Convert GFF attributes to string. Supported formats are GFF3, GTF.
56 """
57 if gff_format == 'GTF':
58 format_string = '%s "%s"'
59 # Convert group (GFF) and ID, parent (GFF3) attributes to transcript_id, gene_id
60 id_attr = None
61 if 'group' in attrs:
62 id_attr = 'group'
63 elif 'ID' in attrs:
64 id_attr = 'ID'
65 elif 'Parent' in attrs:
66 id_attr = 'Parent'
67 if id_attr:
68 attrs['transcript_id'] = attrs['gene_id'] = attrs[id_attr]
69 elif gff_format == 'GFF3':
70 format_string = '%s=%s'
71 attrs_strs = []
72 for name, value in attrs.items():
73 attrs_strs.append( format_string % ( name, value ) )
74 return " ; ".join( attrs_strs )
75
76
77 def stop_err(msg):
78 sys.exit("%s\n" % msg)
79
80
81 def __main__():
82 parser = optparse.OptionParser()
83 parser.add_option('-1', '--input', dest='input', help=' file of RNA-Seq read alignments in the SAM format. SAM is a standard short read alignment, that allows aligners to attach custom tags to individual alignments, and Cufflinks requires that the alignments you supply have some of these tags. Please see Input formats for more details.')
84 parser.add_option('-I', '--max-intron-length', dest='max_intron_len', help='The minimum intron length. Cufflinks will not report transcripts with introns longer than this, and will ignore SAM alignments with REF_SKIP CIGAR operations longer than this. The default is 300,000.')
85 parser.add_option('-F', '--min-isoform-fraction', dest='min_isoform_fraction', help='After calculating isoform abundance for a gene, Cufflinks filters out transcripts that it believes are very low abundance, because isoforms expressed at extremely low levels often cannot reliably be assembled, and may even be artifacts of incompletely spliced precursors of processed transcripts. This parameter is also used to filter out introns that have far fewer spliced alignments supporting them. The default is 0.05, or 5% of the most abundant isoform (the major isoform) of the gene.')
86 parser.add_option('-j', '--pre-mrna-fraction', dest='pre_mrna_fraction', help='Some RNA-Seq protocols produce a significant amount of reads that originate from incompletely spliced transcripts, and these reads can confound the assembly of fully spliced mRNAs. Cufflinks uses this parameter to filter out alignments that lie within the intronic intervals implied by the spliced alignments. The minimum depth of coverage in the intronic region covered by the alignment is divided by the number of spliced reads, and if the result is lower than this parameter value, the intronic alignments are ignored. The default is 5%.')
87 parser.add_option('-p', '--num-threads', dest='num_threads', help='Use this many threads to align reads. The default is 1.')
88 parser.add_option('-G', '--GTF', dest='GTF', help='Tells Cufflinks to use the supplied reference annotation to estimate isoform expression. It will not assemble novel transcripts, and the program will ignore alignments not structurally compatible with any reference transcript.')
89 parser.add_option("--compatible-hits-norm", dest='compatible_hits_norm', action="store_true", help='Count hits compatible with reference RNAs only')
90 parser.add_option('-g', '--GTF-guide', dest='GTFguide', help='use reference transcript annotation to guide assembly')
91 parser.add_option("--3-overhang-tolerance", dest='three_overhang_tolerance', help='The number of bp allowed to overhang the 3prime end of a reference transcript when determining if an assembled transcript should be merged with it (ie, the assembled transcript is not novel). The default is 600 bp.')
92 parser.add_option("--intron-overhang-tolerance", dest='intron_overhang_tolerance', help='The number of bp allowed to enter the intron of a reference transcript when determining if an assembled transcript should be merged with it (ie, the assembled transcript is not novel). The default is 50 bp.')
93 parser.add_option("--no-faux-reads", dest='no_faux_reads', help='This option disables tiling of the reference transcripts with faux reads. Use this if you only want to use sequencing reads in assembly but do not want to output assembled transcripts that lay within reference transcripts. All reference transcripts in the input annotation will also be included in the output.')
94 parser.add_option('-u', '--multi-read-correct', dest='multi_read_correct', action="store_true", help='Tells Cufflinks to do an initial estimation procedure to more accurately weight reads mapping to multiple locations in the genome')
95
96 # Normalization options.
97 parser.add_option("--no-effective-length-correction", dest="no_effective_length_correction", action="store_true")
98 parser.add_option("--no-length-correction", dest="no_length_correction", action="store_true")
99
100 # Wrapper / Galaxy options.
101 parser.add_option('-A', '--assembled-isoforms-output', dest='assembled_isoforms_output_file', help='Assembled isoforms output file; formate is GTF.')
102
103 # Advanced Options:
104 parser.add_option("--library-type", dest="library_type", help=' library prep used for input reads, default fr-unstranded')
105 parser.add_option('-M', '--mask-file', dest='mask_file', help='Tells Cufflinks to ignore all reads that could have come from transcripts in this GTF file. \
106 We recommend including any annotated rRNA, mitochondrial transcripts other abundant transcripts \
107 you wish to ignore in your analysis in this file. Due to variable efficiency of mRNA enrichment \
108 methods and rRNA depletion kits, masking these transcripts often improves the overall robustness \
109 of transcript abundance estimates.')
110 parser.add_option('-m', '--inner-mean-dist', dest='inner_mean_dist', help='This is the expected (mean) inner distance between mate pairs. \
111 For, example, for paired end runs with fragments selected at 300bp, \
112 where each end is 50bp, you should set -r to be 200. The default is 45bp.') # cufflinks: --frag-len-mean
113
114 parser.add_option('-s', '--inner-dist-std-dev', dest='inner_dist_std_dev', help='The standard deviation for the distribution on inner distances between mate pairs. The default is 20bp.') # cufflinks: --frag-len-std-dev
115 parser.add_option('--max-mle-iterations', dest='max_mle_iterations', help='Sets the number of iterations allowed during maximum likelihood estimation of abundances. Default: 5000')
116 parser.add_option('--junc-alpha', dest='junc_alpha', help='Alpha value for the binomial test used during false positive spliced alignment filtration. Default: 0.001')
117 parser.add_option('--small-anchor-fraction', dest='small_anchor_fraction', help='Spliced reads with less than this percent of their length on each side of\
118 the junction are considered suspicious and are candidates for filtering prior to assembly. Default: 0.09.')
119 parser.add_option('--overhang-tolerance', dest='overhang_tolerance', help='The number of bp allowed to enter the intron of a transcript when determining if a \
120 read or another transcript is mappable to/compatible with it. The default is 8 bp based on the default bowtie/TopHat parameters.')
121 parser.add_option('--max-bundle-length', dest='max_bundle_length', help='Maximum genomic length of a given bundle" help="Default: 3,500,000bp')
122 parser.add_option('--max-bundle-frags', dest='max_bundle_frags', help='Sets the maximum number of fragments a locus may have before being skipped. Skipped loci are listed in skipped.gtf. Default: 1,000,000')
123 parser.add_option('--min-intron-length', dest='min_intron_length', help='Minimal allowed intron size. Default: 50')
124 parser.add_option('--trim-3-avgcov-thresh', dest='trim_three_avgcov_thresh', help='Minimum average coverage required to attempt 3prime trimming. Default: 10')
125 parser.add_option('--trim-3-dropoff-frac', dest='trim_three_dropoff_frac', help='The fraction of average coverage below which to trim the 3prime end of an assembled transcript. Default: 0.1')
126
127 # Bias correction options.
128 parser.add_option('-b', dest='do_bias_correction', action="store_true", help='Providing Cufflinks with a multifasta file via this option instructs it to run our new bias detection and correction algorithm which can significantly improve accuracy of transcript abundance estimates.')
129 parser.add_option('', '--index', dest='index', help='The path of the reference genome')
130 parser.add_option('', '--ref_file', dest='ref_file', help='The reference dataset from the history')
131
132 # Global model (for trackster).
133 parser.add_option('', '--global_model', dest='global_model_file', help='Global model used for computing on local data')
134
135 (options, args) = parser.parse_args()
136
137 # If doing bias correction, set/link to sequence file.
138 if options.do_bias_correction:
139 if options.ref_file:
140 # Sequence data from history.
141 # Create symbolic link to ref_file so that index will be created in working directory.
142 seq_path = "ref.fa"
143 os.symlink(options.ref_file, seq_path)
144 else:
145 if not os.path.exists(options.index):
146 stop_err('Reference genome %s not present, request it by reporting this error.' % options.index)
147 seq_path = options.index
148
149 # Build command.
150
151 # Base; always use quiet mode to avoid problems with storing log output.
152 cmd = "cufflinks -q --no-update-check"
153
154 # Add options.
155 if options.max_intron_len:
156 cmd += (" -I %i" % int(options.max_intron_len))
157 if options.min_isoform_fraction:
158 cmd += (" -F %f" % float(options.min_isoform_fraction))
159 if options.pre_mrna_fraction:
160 cmd += (" -j %f" % float(options.pre_mrna_fraction))
161 if options.num_threads:
162 cmd += (" -p %i" % int(options.num_threads))
163 if options.GTF:
164 cmd += (" -G %s" % options.GTF)
165 if options.compatible_hits_norm:
166 cmd += (" --compatible-hits-norm")
167 if options.GTFguide:
168 cmd += (" -g %s" % options.GTFguide)
169 cmd += (" --3-overhang-tolerance %i" % int(options.three_overhang_tolerance))
170 cmd += (" --intron-overhang-tolerance %i" % int(options.intron_overhang_tolerance))
171 if options.no_faux_reads:
172 cmd += (" --no-faux-reads")
173 if options.multi_read_correct:
174 cmd += (" -u")
175
176 if options.library_type and options.library_type != 'auto':
177 cmd += (" --library-type %s" % options.library_type)
178 if options.mask_file:
179 cmd += (" --mask-file %s" % options.mask_file)
180 if options.inner_mean_dist:
181 cmd += (" -m %i" % int(options.inner_mean_dist))
182 if options.inner_dist_std_dev:
183 cmd += (" -s %i" % int(options.inner_dist_std_dev))
184 if options.max_mle_iterations:
185 cmd += (" --max-mle-iterations %i" % int(options.max_mle_iterations))
186 if options.junc_alpha:
187 cmd += (" --junc-alpha %f" % float(options.junc_alpha))
188 if options.small_anchor_fraction:
189 cmd += (" --small-anchor-fraction %f" % float(options.small_anchor_fraction))
190 if options.overhang_tolerance:
191 cmd += (" --overhang-tolerance %i" % int(options.overhang_tolerance))
192 if options.max_bundle_length:
193 cmd += (" --max-bundle-length %i" % int(options.max_bundle_length))
194 if options.max_bundle_frags:
195 cmd += (" --max-bundle-frags %i" % int(options.max_bundle_frags))
196 if options.min_intron_length:
197 cmd += (" --min-intron-length %i" % int(options.min_intron_length))
198 if options.trim_three_avgcov_thresh:
199 cmd += (" --trim-3-avgcov-thresh %i" % int(options.trim_three_avgcov_thresh))
200 if options.trim_three_dropoff_frac:
201 cmd += (" --trim-3-dropoff-frac %f" % float(options.trim_three_dropoff_frac))
202
203 if options.do_bias_correction:
204 cmd += (" -b %s" % seq_path)
205 if options.no_effective_length_correction:
206 cmd += (" --no-effective-length-correction")
207 if options.no_length_correction:
208 cmd += (" --no-length-correction")
209
210 # Add input files.
211 cmd += " " + options.input
212
213 # Run command and handle output.
214 try:
215 # Run command.
216 with tempfile.NamedTemporaryFile(dir=".") as tmp_stderr:
217 returncode = subprocess.call(args=cmd, stderr=tmp_stderr, shell=True)
218
219 # Error checking.
220 if returncode != 0:
221 # Get stderr, allowing for case where it's very large.
222 buffsize = 1048576
223 stderr = ''
224 with open(tmp_stderr.name) as tmp_stderr2:
225 try:
226 while True:
227 stderr += tmp_stderr2.read(buffsize)
228 if not stderr or len(stderr) % buffsize != 0:
229 break
230 except OverflowError:
231 pass
232 raise Exception(stderr)
233
234 # Read standard error to get total map/upper quartile mass.
235 total_map_mass = -1
236 with open(tmp_stderr.name, 'r') as tmp_stderr2:
237 for line in tmp_stderr2:
238 if line.lower().find("map mass") >= 0 or line.lower().find("upper quartile") >= 0:
239 total_map_mass = float(line.split(":")[1].strip())
240 break
241
242 # If there's a global model provided, use model's total map mass
243 # to adjust FPKM + confidence intervals.
244 if options.global_model_file:
245 # Global model is simply total map mass from original run.
246 with open(options.global_model_file, 'r') as global_model_file:
247 global_model_total_map_mass = float(global_model_file.readline())
248
249 # Ratio of global model's total map mass to original run's map mass is
250 # factor used to adjust FPKM.
251 fpkm_map_mass_ratio = total_map_mass / global_model_total_map_mass
252
253 # Update FPKM values in transcripts.gtf file.
254 with open("transcripts.gtf", 'r') as transcripts_file:
255 with tempfile.NamedTemporaryFile(dir=".", delete=False) as new_transcripts_file:
256 for line in transcripts_file:
257 fields = line.split('\t')
258 attrs = parse_gff_attributes(fields[8])
259 attrs["FPKM"] = str(float(attrs["FPKM"]) * fpkm_map_mass_ratio)
260 attrs["conf_lo"] = str(float(attrs["conf_lo"]) * fpkm_map_mass_ratio)
261 attrs["conf_hi"] = str(float(attrs["conf_hi"]) * fpkm_map_mass_ratio)
262 fields[8] = gff_attributes_to_str(attrs, "GTF")
263 new_transcripts_file.write("%s\n" % '\t'.join(fields))
264 shutil.move(new_transcripts_file.name, "transcripts.gtf")
265
266 # TODO: update expression files as well.
267
268 # Set outputs. Transcript and gene expression handled by wrapper directives.
269 shutil.move("transcripts.gtf", options.assembled_isoforms_output_file)
270 if total_map_mass > -1:
271 with open("global_model.txt", 'w') as f:
272 f.write("%f\n" % total_map_mass)
273 except Exception as e:
274 stop_err('Error running cufflinks: %s' % e)
275
276
277 if __name__ == "__main__":
278 __main__()