Mercurial > repos > daumsoft > test11
comparison ICGC_STAR_ALIGNMENT_PIPELINE/star_align.py @ 0:0ea9868b554c draft default tip
Uploaded
author | daumsoft |
---|---|
date | Mon, 10 Sep 2018 01:15:58 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:0ea9868b554c |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 import os | |
4 import sys | |
5 import re | |
6 import string | |
7 import tempfile | |
8 import subprocess | |
9 import argparse | |
10 import shutil | |
11 import lxml.etree as etree | |
12 import fnmatch | |
13 | |
14 def walk_dir(base, pattern): | |
15 files = [] | |
16 for root, dirnames, filenames in os.walk(base): | |
17 for fname in fnmatch.filter(filenames, pattern): | |
18 files.append(os.path.join(root, fname)) | |
19 return files | |
20 | |
21 def scan_workdir(base): | |
22 | |
23 ### scan for paired-end files | |
24 ############################# | |
25 | |
26 ### unzipped fastq input | |
27 fastq_files = walk_dir(base, "*_read[12]_*fastq") | |
28 if len(fastq_files): | |
29 o = {} | |
30 for i in sorted(fastq_files): | |
31 basename = re.sub(r'_read[12]', '', i) | |
32 try: | |
33 o[basename].append(i) | |
34 except KeyError: | |
35 o[basename] = [i] | |
36 if not all( (len(i) == 2 for i in o.values())): | |
37 raise Exception("Missing Pair") | |
38 return ( 'cat', list( (os.path.basename(i), o[i][0], o[i][1]) for i in o.keys()), 'PE') | |
39 | |
40 ### unzipped fastq input | |
41 fastq_files = walk_dir(base, "*_R[12]_001.fastq") | |
42 if len(fastq_files): | |
43 o = {} | |
44 for i in fastq_files: | |
45 basename = re.sub(r'_R[12]_001.fastq$', '', i) | |
46 o[basename] = o.get(basename, 0) + 1 | |
47 if not all( (i == 2 for i in o.values())): | |
48 raise Exception("Missing Pair") | |
49 return ( 'cat', list( (os.path.basename(i), "%s_R1_001.fastq" % i,"%s_R2_001.fastq" % i) for i in o.keys()), 'PE') | |
50 | |
51 ### unzipped fastq input | |
52 fastq_files = walk_dir(base, "*_[12].fastq") | |
53 if len(fastq_files): | |
54 o = {} | |
55 for i in fastq_files: | |
56 basename = re.sub(r'_[12].fastq$', '', i) | |
57 o[basename] = o.get(basename, 0) + 1 | |
58 if not all( (i == 2 for i in o.values())): | |
59 raise Exception("Missing Pair") | |
60 return ( 'cat', list( (os.path.basename(i), "%s_1.fastq" % i,"%s_2.fastq" % i) for i in o.keys()), 'PE') | |
61 | |
62 ### unzipped fastq input | |
63 fastq_files = walk_dir(base, "*[.][12].fastq") | |
64 if len(fastq_files): | |
65 o = {} | |
66 for i in fastq_files: | |
67 basename = re.sub(r'[.][12].fastq$', '', i) | |
68 o[basename] = o.get(basename, 0) + 1 | |
69 if not all( (i == 2 for i in o.values())): | |
70 raise Exception("Missing Pair") | |
71 return ( 'cat', list( (os.path.basename(i), "%s.1.fastq" % i,"%s.2.fastq" % i) for i in o.keys()), 'PE') | |
72 | |
73 ### unzipped fastq input | |
74 fastq_files = walk_dir(base, "*.fastq[12]") | |
75 if len(fastq_files): | |
76 o = {} | |
77 for i in fastq_files: | |
78 basename = re.sub(r'.fastq[12]$', '', i) | |
79 o[basename] = o.get(basename, 0) + 1 | |
80 if not all( (i == 2 for i in o.values())): | |
81 raise Exception("Missing Pair") | |
82 return ( 'cat', list( (os.path.basename(i), "%s.fastq1" % i,"%s.fastq2" % i) for i in o.keys()), 'PE') | |
83 | |
84 ### unzipped txt input | |
85 fastq_files = walk_dir(base, "*_[12]_sequence.txt") | |
86 if len(fastq_files): | |
87 o = {} | |
88 for i in fastq_files: | |
89 basename = re.sub(r'_[12]_sequence.txt$', '', i) | |
90 o[basename] = o.get(basename, 0) + 1 | |
91 if not all( (i == 2 for i in o.values())): | |
92 raise Exception("Missing Pair") | |
93 return ( 'cat', list( (os.path.basename(i), "%s_1_sequence.txt" % i,"%s_2_sequence.txt" % i) for i in o.keys()), 'PE') | |
94 | |
95 ### gzipped input | |
96 fastq_gz_files = walk_dir(base, "*_[12].fastq.gz") | |
97 if len(fastq_gz_files): | |
98 o = {} | |
99 for i in fastq_gz_files: | |
100 basename = re.sub(r'_[12].fastq.gz$', '', i) | |
101 o[basename] = o.get(basename, 0) + 1 | |
102 if not all( (i == 2 for i in o.values())): | |
103 raise Exception("Missing Pair") | |
104 return ( 'zcat', list( (os.path.basename(i), "%s_1.fastq.gz" % i,"%s_2.fastq.gz" % i) for i in o.keys()), 'PE') | |
105 | |
106 ### bzipped input | |
107 fastq_bz_files = walk_dir(base, "*_[12].fastq.bz") | |
108 if len(fastq_gz_files): | |
109 o = {} | |
110 for i in fastq_gz_files: | |
111 basename = re.sub(r'_[12].fastq.bz$', '', i) | |
112 o[basename] = o.get(basename, 0) + 1 | |
113 if not all( (i == 2 for i in o.values())): | |
114 raise Exception("Missing Pair") | |
115 return ( 'bzcat', list( (os.path.basename(i), "%s_1.fastq.bz" % i,"%s_2.fastq.bz" % i) for i in o.keys()), 'PE') | |
116 | |
117 ### scan for single-end files | |
118 ############################# | |
119 | |
120 ### unzipped input | |
121 fastq_files = walk_dir(base, "*.fastq") | |
122 if len(fastq_files): | |
123 return ( 'cat', list( (os.path.basename(re.sub(r'.fastq$', '', i)), i) for i in fastq_files), 'SE') | |
124 | |
125 ### unzipped input | |
126 fastq_files = walk_dir(base, "*.fq") | |
127 if len(fastq_files): | |
128 return ( 'cat', list( (os.path.basename(re.sub(r'.fq$', '', i)), i) for i in fastq_files), 'SE') | |
129 | |
130 ### gzipped input | |
131 fastq_files = walk_dir(base, "*.fastq.gz") | |
132 if len(fastq_files): | |
133 return ( 'zcat', list( (os.path.basename(re.sub(r'.fastq.gz$', '', i)), i) for i in fastq_files), 'SE') | |
134 | |
135 ### bzipped input | |
136 fastq_files = walk_dir(base, "*.fastq.bz") | |
137 if len(fastq_files): | |
138 return ( 'bzcat', list( (os.path.basename(re.sub(r'.fastq.bz$', '', i)), i) for i in fastq_files), 'SE') | |
139 | |
140 raise Exception("Unable to determine input type") | |
141 | |
142 | |
143 def spreadsheet2dict(spreadFile): | |
144 """ | |
145 Takes the filename of the spreadsheet, loads the data and organizes | |
146 it into a dictionary""" | |
147 | |
148 spreadDict = {} | |
149 key2field = {} | |
150 for l, line in enumerate(open(spreadFile)): | |
151 sl = line.strip().split('\t') | |
152 if l == 0: | |
153 for k, key in enumerate(sl): | |
154 key2field[key] = k | |
155 else: | |
156 spreadDict[sl[key2field['analysis_id']]] = sl | |
157 | |
158 return (spreadDict, key2field) | |
159 | |
160 | |
161 def spreadsheet2RGdict(spreadFile, analysisID): | |
162 """Compiles a read group dictionary from the information | |
163 in the spreadFile for the given analysisID.""" | |
164 | |
165 sD, k2f = spreadsheet2dict(spreadFile) | |
166 | |
167 try: | |
168 rec = sD[analysisID] | |
169 except KeyError: | |
170 raise Exception('Information for analysis ID %s could not be found in %s' % (analysisID, spreadFile)) | |
171 | |
172 ### build dictionary | |
173 RG_dict = { 'ID' : '%s:%s' % (rec[k2f['center_name']], analysisID), | |
174 'CN' : rec[k2f['center_name']], | |
175 'LB' : 'RNA-Seq:%s:%s' % (rec[k2f['center_name']], rec[k2f['lib_id']]), | |
176 'PL' : rec[k2f['platform']], | |
177 'PM' : rec[k2f['platform_model']], | |
178 'SM' : rec[k2f['specimen_id']], | |
179 'SI' : rec[k2f['submitted_sample_id']]} | |
180 | |
181 files = [] | |
182 if 'fastq_files' in k2f: | |
183 if not rec[k2f['fastq_files']].strip(' ') in ['N/A', 'NA', 'no', '']: | |
184 files = rec[k2f['fastq_files']].strip(' ').split(' ') | |
185 | |
186 return (RG_dict, files) | |
187 | |
188 | |
189 def xml2RGdict(xmlfile): | |
190 | |
191 ### read xml in | |
192 root = etree.parse(xmlfile) | |
193 rtree = root.find('Result') | |
194 | |
195 ### analysis_id | |
196 analysis_id = rtree.find('analysis_id').text | |
197 center = rtree.find('center_name').text | |
198 try: | |
199 date_string = rtree.find('analysis_xml/ANALYSIS_SET/ANALYSIS').attrib['analysis_date'] | |
200 except KeyError: | |
201 date_string = rtree.find('run_xml/RUN_SET/RUN').attrib['run_date'] | |
202 sample_id = rtree.find('sample_id').text | |
203 submitter_id = rtree.find('legacy_sample_id').text | |
204 library_id = rtree.find('experiment_xml/EXPERIMENT_SET/EXPERIMENT').attrib['alias'] | |
205 platform = rtree.find('experiment_xml/EXPERIMENT_SET/EXPERIMENT/PLATFORM').getchildren()[0].tag | |
206 instrument = rtree.find('experiment_xml/EXPERIMENT_SET/EXPERIMENT/PLATFORM/*/INSTRUMENT_MODEL').text | |
207 | |
208 ### build dictionary | |
209 RG_dict = { 'ID' : '%s:%s' % (center, analysis_id), | |
210 'CN' : center, | |
211 'DT' : date_string, | |
212 'LB' : 'RNA-Seq:%s:%s' % (center, library_id), | |
213 'PL' : platform, | |
214 'PM' : instrument, | |
215 'SM' : sample_id, | |
216 'SI' : submitter_id} | |
217 | |
218 return RG_dict | |
219 | |
220 | |
221 if __name__ == "__main__": | |
222 | |
223 parser = argparse.ArgumentParser(description="ICGC RNA-Seq alignment wrapper for STAR alignments.", formatter_class=argparse.ArgumentDefaultsHelpFormatter, usage='%(prog)s [options]', add_help=False) | |
224 required = parser.add_argument_group("Required input parameters") | |
225 required.add_argument("--genomeDir", default=None, help="Directory containing the reference genome index", required=True) | |
226 required.add_argument("--tarFileIn", default=None, help="Input file containing the sequence information", required=True) | |
227 optional = parser.add_argument_group("optional input parameters") | |
228 optional.add_argument("--out", default="out.bam", help="Name of the output BAM file") | |
229 optional.add_argument("--workDir", default="./", help="Work directory") | |
230 optional.add_argument("--metaDataTab", default=None, help="File containing metadata for the alignment header") | |
231 optional.add_argument("--analysisID", default=None, help="Analysis ID to be considered in the metadata file") | |
232 optional.add_argument("--keepJunctions", default=False, action='store_true', help="keeps the junction file as {--out}.junctions") | |
233 optional.add_argument("--useTMP", default=None, help="environment variable that is used as prefix for temprary data") | |
234 optional.add_argument("-h", "--help", action='store_true', help="show this help message and exit") | |
235 star = parser.add_argument_group("STAR input parameters") | |
236 star.add_argument("--runThreadN", type=int, default=4, help="Number of threads") | |
237 star.add_argument("--outFilterMultimapScoreRange", type=int, default=1, help="outFilterMultimapScoreRange") | |
238 star.add_argument("--outFilterMultimapNmax", type=int, default=20, help="outFilterMultimapNmax") | |
239 star.add_argument("--outFilterMismatchNmax", type=int, default=10, help="outFilterMismatchNmax") | |
240 star.add_argument("--alignIntronMax", type=int, default=500000, help="alignIntronMax") | |
241 star.add_argument("--alignMatesGapMax", type=int, default=1000000, help="alignMatesGapMax") | |
242 star.add_argument("--sjdbScore", type=int, default=2, help="sjdbScore") | |
243 star.add_argument("--limitBAMsortRAM", type=int, default=0, help="limitBAMsortRAM") | |
244 star.add_argument("--alignSJDBoverhangMin", type=int, default=1, help="alignSJDBoverhangMin") | |
245 star.add_argument("--genomeLoad", default="NoSharedMemory", help="genomeLoad") | |
246 star.add_argument("--genomeFastaFiles", default=None, help="genome sequence in fasta format to rebuild index") | |
247 star.add_argument("--outFilterMatchNminOverLread", type=float, default=0.33, help="outFilterMatchNminOverLread") | |
248 star.add_argument("--outFilterScoreMinOverLread", type=float, default=0.33, help="outFilterScoreMinOverLread") | |
249 star.add_argument("--twopass1readsN", type=int, default=-1, help="twopass1readsN (-1 means all reads are used for remapping)") | |
250 star.add_argument("--sjdbOverhang", type=int, default=100, help="sjdbOverhang (only necessary for two-pass mode)") | |
251 star.add_argument("--outSAMstrandField", default="intronMotif", help="outSAMstrandField") | |
252 star.add_argument("--outSAMattributes", default=["NH", "HI", "NM", "MD", "AS", "XS"], help="outSAMattributes") | |
253 star.add_argument("--outSAMunmapped", default="Within", help="outSAMunmapped") | |
254 star.add_argument("--outSAMtype", default=["BAM", "SortedByCoordinate"], help="outSAMtype") | |
255 star.add_argument("--outSAMheaderHD", default=["@HD", "VN:1.4"], help="outSAMheaderHD") | |
256 star.add_argument("--outSAMattrRGline", default=None, help="RG attribute line submitted to outSAMattrRGline") | |
257 star.add_argument("--outSAMattrRGfile", default=None, help="File containing the RG attribute line submitted to outSAMattrRGline") | |
258 star.add_argument("--outSAMattrRGxml", default=None, help="XML-File in TCGA format to compile RG attribute line") | |
259 | |
260 ### check completeness of command line inputs | |
261 if len(sys.argv) < 2: | |
262 parser.print_help() | |
263 sys.exit(0) | |
264 | |
265 args = parser.parse_args() | |
266 | |
267 ### some sanity checks on command line parameters | |
268 if args.metaDataTab is not None: | |
269 if not os.path.exists(args.metaDataTab): | |
270 raise Exception("File provided via --metaDataTab does not exist\nFile: %s" % args.metaDataTab) | |
271 if args.analysisID is None: | |
272 raise Exception("When providing information in a metadata file, a value for --analysisID is required") | |
273 if args.outSAMattrRGxml is not None and not os.path.exists(args.outSAMattrRGxml): | |
274 raise Exception("File provided via --outSAMattrRGxml does not exist\nFile: %s" % args.outSAMattrRGxml) | |
275 if args.outSAMattrRGfile is not None and not os.path.exists(args.outSAMattrRGfile): | |
276 raise Exception("File provided via --outSAMattrRGfile does not exist\nFile: %s" % args.outSAMattrRGfile) | |
277 | |
278 ### handling of input file (unpacking, etc. ) | |
279 if args.useTMP is not None: | |
280 workdir = tempfile.mkdtemp(dir=os.environ[args.useTMP], prefix="star_inputdir_") | |
281 else: | |
282 workdir = tempfile.mkdtemp(dir=args.workDir, prefix="star_inputdir_") | |
283 if args.tarFileIn.endswith(".gz"): | |
284 tarcmd = "tar xvzf %s -C %s" % (args.tarFileIn, workdir) | |
285 elif args.tarFileIn.endswith(".bz"): | |
286 tarcmd = "tar xvjf %s -C %s" % (args.tarFileIn, workdir) | |
287 elif args.tarFileIn.endswith(".tar"): | |
288 tarcmd = "tar xvf %s -C %s" % (args.tarFileIn, workdir) | |
289 elif args.tarFileIn.endswith(".sra"): | |
290 tarcmd = "fastq-dump --gzip --split-3 --outdir %s %s" % (workdir, args.tarFileIn) | |
291 else: | |
292 raise Exception("Unknown input file extension for file %s" % (args.tarFileIn)) | |
293 subprocess.check_call(tarcmd, shell=True) | |
294 | |
295 ### collect fastq information from extraction dir | |
296 align_sets = scan_workdir(os.path.abspath(workdir)) | |
297 | |
298 ### process read group information | |
299 files = [] | |
300 if args.metaDataTab is not None: | |
301 (RG_dict, files_tmp) = spreadsheet2RGdict(args.metaDataTab, args.analysisID) | |
302 files.extend(files_tmp) | |
303 elif args.outSAMattrRGxml is not None: | |
304 RG_dict = xml2RGdict(args.outSAMattrRGxml) | |
305 elif args.outSAMattrRGline is not None: | |
306 RG_dict = dict([(x.split(':', 1)[0], x.split(':', 1)[1]) for x in args.outSAMattrRGline.split()]) | |
307 elif args.outSAMattrRGfile is not None: | |
308 _fh = open(args.outSAMattrRGfile, 'r') | |
309 RG_dict = dict([(x.split(':', 1)[0], x.split(':', 1)[1]) for x in _fh.next().strip().split()]) | |
310 _fh.close() | |
311 else: | |
312 RG_dict = {'ID' : '', 'SM' : ''} | |
313 | |
314 ### post-process RG-dict to comply with STAR conventions | |
315 for key in RG_dict: | |
316 sl = RG_dict[key].split(' ') | |
317 if len(sl) > 1: | |
318 RG_dict[key] = '"%s"' % RG_dict[key] | |
319 | |
320 ### use list of fastq input files for whitelisting | |
321 if len(files) > 0: | |
322 align_sets = (align_sets[0], [x for x in align_sets[1] if (re.sub('(_[12]){,1}.fastq(.(gz|bz2|bz))*', '', os.path.basename(x[1])) in files)], align_sets[2]) | |
323 if len(align_sets[1]) == 0: | |
324 print >> sys.stderr, 'All input files have been filtered out - no input remaining. Terminating.' | |
325 sys.exit() | |
326 | |
327 ### use filename stub as read group label | |
328 RG_dict['RG'] = [] | |
329 for fn in [x[1] for x in align_sets[1]]: | |
330 fn_stub = re.sub('(_[12]){,1}.fastq(.(gz|bz2|bz))*', '', os.path.basename(fn)) | |
331 fn_stub = re.sub('(_[12]){,1}_sequence.txt(.(gz|bz2|bz))*', '', fn_stub) | |
332 fn_stub = re.sub('_read[12]', '', fn_stub) | |
333 fn_stub = re.sub('_R[12]_001$', '', fn_stub) | |
334 RG_dict['RG'].append(fn_stub) | |
335 | |
336 ### prepare template string | |
337 if align_sets[2] == 'PE': | |
338 read_str = '${fastq_left} ${fastq_right}' | |
339 else: | |
340 read_str = '${fastq_left}' | |
341 | |
342 ### simulate two pass alignment until STAR fully implements this | |
343 if args.twopass1readsN != 0: | |
344 | |
345 ### run first round of alignments and only record junctions | |
346 align_template_str_1st = """STAR \ | |
347 --genomeDir ${genomeDir} --readFilesIn %s \ | |
348 --runThreadN ${runThreadN} \ | |
349 --outFilterMultimapScoreRange ${outFilterMultimapScoreRange} \ | |
350 --outFilterMultimapNmax ${outFilterMultimapNmax} \ | |
351 --outFilterMismatchNmax ${outFilterMismatchNmax} \ | |
352 --alignIntronMax ${alignIntronMax} \ | |
353 --alignMatesGapMax ${alignMatesGapMax} \ | |
354 --sjdbScore ${sjdbScore} \ | |
355 --alignSJDBoverhangMin ${alignSJDBoverhangMin} \ | |
356 --genomeLoad ${genomeLoad} \ | |
357 --readFilesCommand ${readFilesCommand} \ | |
358 --outFilterMatchNminOverLread ${outFilterMatchNminOverLread} \ | |
359 --outFilterScoreMinOverLread ${outFilterScoreMinOverLread} \ | |
360 --sjdbOverhang ${sjdbOverhang} \ | |
361 --outSAMstrandField ${outSAMstrandField} \ | |
362 --outSAMtype None \ | |
363 --outSAMmode None""" % read_str | |
364 | |
365 if args.twopass1readsN > 0: | |
366 align_template_str_1st += " --readMapNumber %i" % args.twopass1readsN | |
367 | |
368 cmd = string.Template(align_template_str_1st).safe_substitute({ | |
369 'genomeDir' : os.path.abspath(args.genomeDir), | |
370 'runThreadN' : args.runThreadN, | |
371 'outFilterMultimapScoreRange' : args.outFilterMultimapScoreRange, | |
372 'outFilterMultimapNmax' : args.outFilterMultimapNmax, | |
373 'outFilterMismatchNmax' : args.outFilterMismatchNmax, | |
374 'fastq_left' : ','.join([os.path.join(x[0], x[1]) for x in align_sets[1]]), | |
375 'alignIntronMax' : args.alignIntronMax, | |
376 'alignMatesGapMax': args.alignMatesGapMax, | |
377 'sjdbScore': args.sjdbScore, | |
378 'alignSJDBoverhangMin' : args.alignSJDBoverhangMin, | |
379 'genomeLoad' : args.genomeLoad, | |
380 'readFilesCommand' : align_sets[0], | |
381 'outFilterMatchNminOverLread' : args.outFilterMatchNminOverLread, | |
382 'outFilterScoreMinOverLread' : args.outFilterScoreMinOverLread, | |
383 'sjdbOverhang' : args.sjdbOverhang, | |
384 'outSAMstrandField' : args.outSAMstrandField | |
385 }) | |
386 if align_sets[2] == 'PE': | |
387 cmd = string.Template(cmd).substitute({ | |
388 'fastq_right' : ','.join([os.path.join(x[0], x[2]) for x in align_sets[1]]) | |
389 }) | |
390 | |
391 ### take temp directory from environment variable | |
392 if args.useTMP is not None: | |
393 align_dir_1st = os.path.abspath( tempfile.mkdtemp(dir=os.environ[args.useTMP], prefix="star_aligndir_1st_") ) | |
394 genome_dir_1st = os.path.abspath( tempfile.mkdtemp(dir=os.environ[args.useTMP], prefix="star_genomedir_1st_") ) | |
395 else: | |
396 align_dir_1st = os.path.abspath( tempfile.mkdtemp(dir=args.workDir, prefix="star_aligndir_1st_") ) | |
397 genome_dir_1st = os.path.abspath( tempfile.mkdtemp(dir=args.workDir, prefix="star_genomedir_1st_") ) | |
398 print "Running", cmd | |
399 subprocess.check_call(cmd, shell=True, cwd=align_dir_1st) | |
400 | |
401 ### build index using provided genome fasta as well as junctions from first run | |
402 cmd = """STAR --runMode genomeGenerate --genomeDir %s \ | |
403 --genomeFastaFiles %s \ | |
404 --sjdbOverhang %i \ | |
405 --runThreadN %i \ | |
406 --sjdbFileChrStartEnd %s""" % (genome_dir_1st, args.genomeFastaFiles, args.sjdbOverhang, args.runThreadN, os.path.join(align_dir_1st, 'SJ.out.tab')) | |
407 print "Running", cmd | |
408 subprocess.check_call(cmd, shell=True, cwd=align_dir_1st) | |
409 | |
410 ### replace index for the second run with the one currently built | |
411 genome_dir = genome_dir_1st | |
412 else: | |
413 genome_dir = os.path.abspath(args.genomeDir) | |
414 | |
415 | |
416 align_template_str = """STAR \ | |
417 --genomeDir ${genomeDir} --readFilesIn %s \ | |
418 --runThreadN ${runThreadN} \ | |
419 --outFilterMultimapScoreRange ${outFilterMultimapScoreRange} \ | |
420 --outFilterMultimapNmax ${outFilterMultimapNmax} \ | |
421 --outFilterMismatchNmax ${outFilterMismatchNmax} \ | |
422 --alignIntronMax ${alignIntronMax} \ | |
423 --alignMatesGapMax ${alignMatesGapMax} \ | |
424 --sjdbScore ${sjdbScore} \ | |
425 --alignSJDBoverhangMin ${alignSJDBoverhangMin} \ | |
426 --genomeLoad ${genomeLoad} \ | |
427 --limitBAMsortRAM ${limitBAMsortRAM} \ | |
428 --readFilesCommand ${readFilesCommand} \ | |
429 --outFilterMatchNminOverLread ${outFilterMatchNminOverLread} \ | |
430 --outFilterScoreMinOverLread ${outFilterScoreMinOverLread} \ | |
431 --sjdbOverhang ${sjdbOverhang} \ | |
432 --outSAMstrandField ${outSAMstrandField} \ | |
433 --outSAMattributes ${outSAMattributes} \ | |
434 --outSAMunmapped ${outSAMunmapped} \ | |
435 --outSAMtype ${outSAMtype} \ | |
436 --outSAMheaderHD ${outSAMheaderHD}""" % read_str | |
437 | |
438 #--twopass1readsN ${twopass1readsN} \ | |
439 | |
440 cmd = string.Template(align_template_str).safe_substitute({ | |
441 'genomeDir' : genome_dir, | |
442 'runThreadN' : args.runThreadN, | |
443 'fastq_left' : ','.join([os.path.join(x[0], x[1]) for x in align_sets[1]]), #os.path.abspath(pair[1]), | |
444 'outFilterMultimapScoreRange' : args.outFilterMultimapScoreRange, | |
445 'outFilterMultimapNmax' : args.outFilterMultimapNmax, | |
446 'outFilterMismatchNmax' : args.outFilterMismatchNmax, | |
447 'alignIntronMax' : args.alignIntronMax, | |
448 'alignMatesGapMax': args.alignMatesGapMax, | |
449 'sjdbScore': args.sjdbScore, | |
450 'alignSJDBoverhangMin' : args.alignSJDBoverhangMin, | |
451 'genomeLoad' : args.genomeLoad, | |
452 'limitBAMsortRAM' : args.limitBAMsortRAM, | |
453 'readFilesCommand' : align_sets[0], | |
454 'outFilterMatchNminOverLread' : args.outFilterMatchNminOverLread, | |
455 'outFilterScoreMinOverLread' : args.outFilterScoreMinOverLread, | |
456 'sjdbOverhang' : args.sjdbOverhang, | |
457 'outSAMstrandField' : args.outSAMstrandField, | |
458 'outSAMattributes' : " ".join(args.outSAMattributes), | |
459 'outSAMunmapped' : args.outSAMunmapped, | |
460 'outSAMtype' : " ".join(args.outSAMtype), | |
461 'outSAMheaderHD' : " ".join(args.outSAMheaderHD) | |
462 }) | |
463 # 'twopass1readsN' : args.twopass1readsN, | |
464 if align_sets[2] == 'PE': | |
465 cmd = string.Template(cmd).substitute({ | |
466 'fastq_right' : ','.join([os.path.join(x[0], x[2]) for x in align_sets[1]]) # os.path.abspath(pair[2]), | |
467 }) | |
468 | |
469 ### convert RG_dict into formatted RG line | |
470 RG_line = [] | |
471 for r, readgroup in enumerate(align_sets[1]): | |
472 if 'RG' in RG_dict: | |
473 tmp = 'ID:%s:%s' % (RG_dict['ID'], RG_dict['RG'][r]) | |
474 else: | |
475 tmp = 'ID:%s:%s' % (RG_dict['ID'], readgroup[0]) | |
476 if len(RG_dict) > 1: | |
477 tmp += '\t' | |
478 tmp += '\t'.join(['%s:%s' % (key, RG_dict[key]) for key in RG_dict if key not in ['ID', 'RG', 'SI']]) | |
479 ### add read group label | |
480 if 'RG' in RG_dict and 'CN' in RG_dict: | |
481 tmp += '\tPU:%s:%s' % (RG_dict['CN'], RG_dict['RG'][r]) | |
482 RG_line.append('%s' % tmp) | |
483 cmd += ' --outSAMattrRGline %s' % ' , '.join(RG_line) | |
484 | |
485 ### handle comment lines | |
486 comment_file = None | |
487 if 'SI' in RG_dict: | |
488 if args.useTMP is not None: | |
489 comment_file = os.path.abspath( tempfile.mkstemp(dir=os.environ[args.useTMP], prefix="star_comments_")[1] ) | |
490 else: | |
491 comment_file = os.path.abspath( tempfile.mkstemp(dir=args.workDir, prefix="star_comments_")[1] ) | |
492 | |
493 fd_com = open(comment_file, 'w') | |
494 fd_com.write('@CO\tsubmitter_sample_id:%s\n' % RG_dict['SI']) | |
495 | |
496 fd_com.flush() | |
497 fd_com.close() | |
498 | |
499 cmd += ' --outSAMheaderCommentFile %s' % comment_file | |
500 | |
501 | |
502 ### take temp directory from environment variable | |
503 if args.useTMP is not None: | |
504 align_dir = os.path.abspath( tempfile.mkdtemp(dir=os.environ[args.useTMP], prefix="star_aligndir_") ) | |
505 else: | |
506 align_dir = os.path.abspath( tempfile.mkdtemp(dir=args.workDir, prefix="star_aligndir_") ) | |
507 print "Running", cmd | |
508 subprocess.check_call(cmd, shell=True, cwd=align_dir) | |
509 | |
510 ### move output file | |
511 if 'BAM' in args.outSAMtype and 'SortedByCoordinate' in args.outSAMtype: | |
512 shutil.move(os.path.join(align_dir, 'Aligned.sortedByCoord.out.bam'), args.out) | |
513 elif 'BAM' in args.outSAMtype and 'Unsorted' in args.outSAMtype: | |
514 shutil.move(os.path.join(align_dir, 'Aligned.out.bam'), args.out) | |
515 else: | |
516 raise Exception('STAR output file could not be determined') | |
517 | |
518 ### move junctions if to be kept | |
519 if args.keepJunctions: | |
520 shutil.move(os.path.join(align_dir, 'SJ.out.tab'), args.out + '.junctions') | |
521 | |
522 ### clean up working directory | |
523 shutil.rmtree(workdir) | |
524 shutil.rmtree(align_dir) | |
525 if args.twopass1readsN != 0: | |
526 shutil.rmtree(align_dir_1st) | |
527 shutil.rmtree(genome_dir_1st) |