annotate ICGC_STAR_ALIGNMENT_PIPELINE/star_align.py @ 0:be0f5f54462d draft default tip

Uploaded
author daumsoft
date Mon, 10 Sep 2018 01:20:03 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
1 #!/usr/bin/env python
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
2
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
3 import os
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
4 import sys
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
5 import re
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
6 import string
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
7 import tempfile
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
8 import subprocess
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
9 import argparse
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
10 import shutil
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
11 import lxml.etree as etree
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
12 import fnmatch
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
13
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
14 def walk_dir(base, pattern):
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
15 files = []
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
16 for root, dirnames, filenames in os.walk(base):
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
17 for fname in fnmatch.filter(filenames, pattern):
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
18 files.append(os.path.join(root, fname))
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
19 return files
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
20
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
21 def scan_workdir(base):
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
22
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
23 ### scan for paired-end files
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
24 #############################
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
25
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
26 ### unzipped fastq input
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
27 fastq_files = walk_dir(base, "*_read[12]_*fastq")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
28 if len(fastq_files):
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
29 o = {}
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
30 for i in sorted(fastq_files):
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
31 basename = re.sub(r'_read[12]', '', i)
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
32 try:
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
33 o[basename].append(i)
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
34 except KeyError:
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
35 o[basename] = [i]
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
36 if not all( (len(i) == 2 for i in o.values())):
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
37 raise Exception("Missing Pair")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
38 return ( 'cat', list( (os.path.basename(i), o[i][0], o[i][1]) for i in o.keys()), 'PE')
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
39
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
40 ### unzipped fastq input
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
41 fastq_files = walk_dir(base, "*_R[12]_001.fastq")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
42 if len(fastq_files):
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
43 o = {}
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
44 for i in fastq_files:
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
45 basename = re.sub(r'_R[12]_001.fastq$', '', i)
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
46 o[basename] = o.get(basename, 0) + 1
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
47 if not all( (i == 2 for i in o.values())):
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
48 raise Exception("Missing Pair")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
49 return ( 'cat', list( (os.path.basename(i), "%s_R1_001.fastq" % i,"%s_R2_001.fastq" % i) for i in o.keys()), 'PE')
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
50
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
51 ### unzipped fastq input
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
52 fastq_files = walk_dir(base, "*_[12].fastq")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
53 if len(fastq_files):
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
54 o = {}
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
55 for i in fastq_files:
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
56 basename = re.sub(r'_[12].fastq$', '', i)
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
57 o[basename] = o.get(basename, 0) + 1
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
58 if not all( (i == 2 for i in o.values())):
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
59 raise Exception("Missing Pair")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
60 return ( 'cat', list( (os.path.basename(i), "%s_1.fastq" % i,"%s_2.fastq" % i) for i in o.keys()), 'PE')
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
61
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
62 ### unzipped fastq input
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
63 fastq_files = walk_dir(base, "*[.][12].fastq")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
64 if len(fastq_files):
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
65 o = {}
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
66 for i in fastq_files:
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
67 basename = re.sub(r'[.][12].fastq$', '', i)
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
68 o[basename] = o.get(basename, 0) + 1
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
69 if not all( (i == 2 for i in o.values())):
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
70 raise Exception("Missing Pair")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
71 return ( 'cat', list( (os.path.basename(i), "%s.1.fastq" % i,"%s.2.fastq" % i) for i in o.keys()), 'PE')
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
72
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
73 ### unzipped fastq input
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
74 fastq_files = walk_dir(base, "*.fastq[12]")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
75 if len(fastq_files):
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
76 o = {}
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
77 for i in fastq_files:
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
78 basename = re.sub(r'.fastq[12]$', '', i)
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
79 o[basename] = o.get(basename, 0) + 1
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
80 if not all( (i == 2 for i in o.values())):
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
81 raise Exception("Missing Pair")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
82 return ( 'cat', list( (os.path.basename(i), "%s.fastq1" % i,"%s.fastq2" % i) for i in o.keys()), 'PE')
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
83
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
84 ### unzipped txt input
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
85 fastq_files = walk_dir(base, "*_[12]_sequence.txt")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
86 if len(fastq_files):
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
87 o = {}
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
88 for i in fastq_files:
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
89 basename = re.sub(r'_[12]_sequence.txt$', '', i)
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
90 o[basename] = o.get(basename, 0) + 1
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
91 if not all( (i == 2 for i in o.values())):
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
92 raise Exception("Missing Pair")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
93 return ( 'cat', list( (os.path.basename(i), "%s_1_sequence.txt" % i,"%s_2_sequence.txt" % i) for i in o.keys()), 'PE')
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
94
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
95 ### gzipped input
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
96 fastq_gz_files = walk_dir(base, "*_[12].fastq.gz")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
97 if len(fastq_gz_files):
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
98 o = {}
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
99 for i in fastq_gz_files:
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
100 basename = re.sub(r'_[12].fastq.gz$', '', i)
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
101 o[basename] = o.get(basename, 0) + 1
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
102 if not all( (i == 2 for i in o.values())):
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
103 raise Exception("Missing Pair")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
104 return ( 'zcat', list( (os.path.basename(i), "%s_1.fastq.gz" % i,"%s_2.fastq.gz" % i) for i in o.keys()), 'PE')
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
105
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
106 ### bzipped input
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
107 fastq_bz_files = walk_dir(base, "*_[12].fastq.bz")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
108 if len(fastq_gz_files):
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
109 o = {}
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
110 for i in fastq_gz_files:
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
111 basename = re.sub(r'_[12].fastq.bz$', '', i)
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
112 o[basename] = o.get(basename, 0) + 1
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
113 if not all( (i == 2 for i in o.values())):
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
114 raise Exception("Missing Pair")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
115 return ( 'bzcat', list( (os.path.basename(i), "%s_1.fastq.bz" % i,"%s_2.fastq.bz" % i) for i in o.keys()), 'PE')
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
116
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
117 ### scan for single-end files
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
118 #############################
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
119
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
120 ### unzipped input
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
121 fastq_files = walk_dir(base, "*.fastq")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
122 if len(fastq_files):
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
123 return ( 'cat', list( (os.path.basename(re.sub(r'.fastq$', '', i)), i) for i in fastq_files), 'SE')
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
124
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
125 ### unzipped input
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
126 fastq_files = walk_dir(base, "*.fq")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
127 if len(fastq_files):
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
128 return ( 'cat', list( (os.path.basename(re.sub(r'.fq$', '', i)), i) for i in fastq_files), 'SE')
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
129
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
130 ### gzipped input
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
131 fastq_files = walk_dir(base, "*.fastq.gz")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
132 if len(fastq_files):
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
133 return ( 'zcat', list( (os.path.basename(re.sub(r'.fastq.gz$', '', i)), i) for i in fastq_files), 'SE')
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
134
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
135 ### bzipped input
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
136 fastq_files = walk_dir(base, "*.fastq.bz")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
137 if len(fastq_files):
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
138 return ( 'bzcat', list( (os.path.basename(re.sub(r'.fastq.bz$', '', i)), i) for i in fastq_files), 'SE')
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
139
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
140 raise Exception("Unable to determine input type")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
141
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
142
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
143 def spreadsheet2dict(spreadFile):
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
144 """
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
145 Takes the filename of the spreadsheet, loads the data and organizes
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
146 it into a dictionary"""
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
147
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
148 spreadDict = {}
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
149 key2field = {}
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
150 for l, line in enumerate(open(spreadFile)):
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
151 sl = line.strip().split('\t')
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
152 if l == 0:
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
153 for k, key in enumerate(sl):
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
154 key2field[key] = k
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
155 else:
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
156 spreadDict[sl[key2field['analysis_id']]] = sl
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
157
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
158 return (spreadDict, key2field)
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
159
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
160
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
161 def spreadsheet2RGdict(spreadFile, analysisID):
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
162 """Compiles a read group dictionary from the information
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
163 in the spreadFile for the given analysisID."""
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
164
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
165 sD, k2f = spreadsheet2dict(spreadFile)
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
166
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
167 try:
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
168 rec = sD[analysisID]
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
169 except KeyError:
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
170 raise Exception('Information for analysis ID %s could not be found in %s' % (analysisID, spreadFile))
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
171
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
172 ### build dictionary
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
173 RG_dict = { 'ID' : '%s:%s' % (rec[k2f['center_name']], analysisID),
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
174 'CN' : rec[k2f['center_name']],
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
175 'LB' : 'RNA-Seq:%s:%s' % (rec[k2f['center_name']], rec[k2f['lib_id']]),
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
176 'PL' : rec[k2f['platform']],
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
177 'PM' : rec[k2f['platform_model']],
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
178 'SM' : rec[k2f['specimen_id']],
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
179 'SI' : rec[k2f['submitted_sample_id']]}
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
180
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
181 files = []
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
182 if 'fastq_files' in k2f:
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
183 if not rec[k2f['fastq_files']].strip(' ') in ['N/A', 'NA', 'no', '']:
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
184 files = rec[k2f['fastq_files']].strip(' ').split(' ')
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
185
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
186 return (RG_dict, files)
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
187
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
188
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
189 def xml2RGdict(xmlfile):
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
190
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
191 ### read xml in
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
192 root = etree.parse(xmlfile)
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
193 rtree = root.find('Result')
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
194
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
195 ### analysis_id
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
196 analysis_id = rtree.find('analysis_id').text
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
197 center = rtree.find('center_name').text
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
198 try:
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
199 date_string = rtree.find('analysis_xml/ANALYSIS_SET/ANALYSIS').attrib['analysis_date']
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
200 except KeyError:
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
201 date_string = rtree.find('run_xml/RUN_SET/RUN').attrib['run_date']
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
202 sample_id = rtree.find('sample_id').text
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
203 submitter_id = rtree.find('legacy_sample_id').text
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
204 library_id = rtree.find('experiment_xml/EXPERIMENT_SET/EXPERIMENT').attrib['alias']
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
205 platform = rtree.find('experiment_xml/EXPERIMENT_SET/EXPERIMENT/PLATFORM').getchildren()[0].tag
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
206 instrument = rtree.find('experiment_xml/EXPERIMENT_SET/EXPERIMENT/PLATFORM/*/INSTRUMENT_MODEL').text
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
207
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
208 ### build dictionary
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
209 RG_dict = { 'ID' : '%s:%s' % (center, analysis_id),
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
210 'CN' : center,
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
211 'DT' : date_string,
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
212 'LB' : 'RNA-Seq:%s:%s' % (center, library_id),
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
213 'PL' : platform,
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
214 'PM' : instrument,
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
215 'SM' : sample_id,
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
216 'SI' : submitter_id}
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
217
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
218 return RG_dict
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
219
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
220
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
221 if __name__ == "__main__":
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
222
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
223 parser = argparse.ArgumentParser(description="ICGC RNA-Seq alignment wrapper for STAR alignments.", formatter_class=argparse.ArgumentDefaultsHelpFormatter, usage='%(prog)s [options]', add_help=False)
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
224 required = parser.add_argument_group("Required input parameters")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
225 required.add_argument("--genomeDir", default=None, help="Directory containing the reference genome index", required=True)
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
226 required.add_argument("--tarFileIn", default=None, help="Input file containing the sequence information", required=True)
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
227 optional = parser.add_argument_group("optional input parameters")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
228 optional.add_argument("--out", default="out.bam", help="Name of the output BAM file")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
229 optional.add_argument("--workDir", default="./", help="Work directory")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
230 optional.add_argument("--metaDataTab", default=None, help="File containing metadata for the alignment header")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
231 optional.add_argument("--analysisID", default=None, help="Analysis ID to be considered in the metadata file")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
232 optional.add_argument("--keepJunctions", default=False, action='store_true', help="keeps the junction file as {--out}.junctions")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
233 optional.add_argument("--useTMP", default=None, help="environment variable that is used as prefix for temprary data")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
234 optional.add_argument("-h", "--help", action='store_true', help="show this help message and exit")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
235 star = parser.add_argument_group("STAR input parameters")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
236 star.add_argument("--runThreadN", type=int, default=4, help="Number of threads")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
237 star.add_argument("--outFilterMultimapScoreRange", type=int, default=1, help="outFilterMultimapScoreRange")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
238 star.add_argument("--outFilterMultimapNmax", type=int, default=20, help="outFilterMultimapNmax")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
239 star.add_argument("--outFilterMismatchNmax", type=int, default=10, help="outFilterMismatchNmax")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
240 star.add_argument("--alignIntronMax", type=int, default=500000, help="alignIntronMax")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
241 star.add_argument("--alignMatesGapMax", type=int, default=1000000, help="alignMatesGapMax")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
242 star.add_argument("--sjdbScore", type=int, default=2, help="sjdbScore")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
243 star.add_argument("--limitBAMsortRAM", type=int, default=0, help="limitBAMsortRAM")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
244 star.add_argument("--alignSJDBoverhangMin", type=int, default=1, help="alignSJDBoverhangMin")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
245 star.add_argument("--genomeLoad", default="NoSharedMemory", help="genomeLoad")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
246 star.add_argument("--genomeFastaFiles", default=None, help="genome sequence in fasta format to rebuild index")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
247 star.add_argument("--outFilterMatchNminOverLread", type=float, default=0.33, help="outFilterMatchNminOverLread")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
248 star.add_argument("--outFilterScoreMinOverLread", type=float, default=0.33, help="outFilterScoreMinOverLread")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
249 star.add_argument("--twopass1readsN", type=int, default=-1, help="twopass1readsN (-1 means all reads are used for remapping)")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
250 star.add_argument("--sjdbOverhang", type=int, default=100, help="sjdbOverhang (only necessary for two-pass mode)")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
251 star.add_argument("--outSAMstrandField", default="intronMotif", help="outSAMstrandField")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
252 star.add_argument("--outSAMattributes", default=["NH", "HI", "NM", "MD", "AS", "XS"], help="outSAMattributes")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
253 star.add_argument("--outSAMunmapped", default="Within", help="outSAMunmapped")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
254 star.add_argument("--outSAMtype", default=["BAM", "SortedByCoordinate"], help="outSAMtype")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
255 star.add_argument("--outSAMheaderHD", default=["@HD", "VN:1.4"], help="outSAMheaderHD")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
256 star.add_argument("--outSAMattrRGline", default=None, help="RG attribute line submitted to outSAMattrRGline")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
257 star.add_argument("--outSAMattrRGfile", default=None, help="File containing the RG attribute line submitted to outSAMattrRGline")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
258 star.add_argument("--outSAMattrRGxml", default=None, help="XML-File in TCGA format to compile RG attribute line")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
259
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
260 ### check completeness of command line inputs
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
261 if len(sys.argv) < 2:
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
262 parser.print_help()
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
263 sys.exit(0)
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
264
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
265 args = parser.parse_args()
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
266
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
267 ### some sanity checks on command line parameters
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
268 if args.metaDataTab is not None:
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
269 if not os.path.exists(args.metaDataTab):
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
270 raise Exception("File provided via --metaDataTab does not exist\nFile: %s" % args.metaDataTab)
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
271 if args.analysisID is None:
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
272 raise Exception("When providing information in a metadata file, a value for --analysisID is required")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
273 if args.outSAMattrRGxml is not None and not os.path.exists(args.outSAMattrRGxml):
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
274 raise Exception("File provided via --outSAMattrRGxml does not exist\nFile: %s" % args.outSAMattrRGxml)
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
275 if args.outSAMattrRGfile is not None and not os.path.exists(args.outSAMattrRGfile):
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
276 raise Exception("File provided via --outSAMattrRGfile does not exist\nFile: %s" % args.outSAMattrRGfile)
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
277
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
278 ### handling of input file (unpacking, etc. )
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
279 if args.useTMP is not None:
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
280 workdir = tempfile.mkdtemp(dir=os.environ[args.useTMP], prefix="star_inputdir_")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
281 else:
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
282 workdir = tempfile.mkdtemp(dir=args.workDir, prefix="star_inputdir_")
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
283 if args.tarFileIn.endswith(".gz"):
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
284 tarcmd = "tar xvzf %s -C %s" % (args.tarFileIn, workdir)
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
285 elif args.tarFileIn.endswith(".bz"):
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
286 tarcmd = "tar xvjf %s -C %s" % (args.tarFileIn, workdir)
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
287 elif args.tarFileIn.endswith(".tar"):
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
288 tarcmd = "tar xvf %s -C %s" % (args.tarFileIn, workdir)
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
289 elif args.tarFileIn.endswith(".sra"):
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
290 tarcmd = "fastq-dump --gzip --split-3 --outdir %s %s" % (workdir, args.tarFileIn)
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
291 else:
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
292 raise Exception("Unknown input file extension for file %s" % (args.tarFileIn))
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
293 subprocess.check_call(tarcmd, shell=True)
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
294
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
295 ### collect fastq information from extraction dir
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
296 align_sets = scan_workdir(os.path.abspath(workdir))
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
297
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
298 ### process read group information
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
299 files = []
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
300 if args.metaDataTab is not None:
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
301 (RG_dict, files_tmp) = spreadsheet2RGdict(args.metaDataTab, args.analysisID)
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
302 files.extend(files_tmp)
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
303 elif args.outSAMattrRGxml is not None:
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
304 RG_dict = xml2RGdict(args.outSAMattrRGxml)
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
305 elif args.outSAMattrRGline is not None:
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
306 RG_dict = dict([(x.split(':', 1)[0], x.split(':', 1)[1]) for x in args.outSAMattrRGline.split()])
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
307 elif args.outSAMattrRGfile is not None:
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
308 _fh = open(args.outSAMattrRGfile, 'r')
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
309 RG_dict = dict([(x.split(':', 1)[0], x.split(':', 1)[1]) for x in _fh.next().strip().split()])
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
310 _fh.close()
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
311 else:
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
312 RG_dict = {'ID' : '', 'SM' : ''}
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
313
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
314 ### post-process RG-dict to comply with STAR conventions
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
315 for key in RG_dict:
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
316 sl = RG_dict[key].split(' ')
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
317 if len(sl) > 1:
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
318 RG_dict[key] = '"%s"' % RG_dict[key]
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
319
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
320 ### use list of fastq input files for whitelisting
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
321 if len(files) > 0:
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
322 align_sets = (align_sets[0], [x for x in align_sets[1] if (re.sub('(_[12]){,1}.fastq(.(gz|bz2|bz))*', '', os.path.basename(x[1])) in files)], align_sets[2])
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
323 if len(align_sets[1]) == 0:
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
324 print >> sys.stderr, 'All input files have been filtered out - no input remaining. Terminating.'
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
325 sys.exit()
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
326
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
327 ### use filename stub as read group label
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
328 RG_dict['RG'] = []
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
329 for fn in [x[1] for x in align_sets[1]]:
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
330 fn_stub = re.sub('(_[12]){,1}.fastq(.(gz|bz2|bz))*', '', os.path.basename(fn))
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
331 fn_stub = re.sub('(_[12]){,1}_sequence.txt(.(gz|bz2|bz))*', '', fn_stub)
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
332 fn_stub = re.sub('_read[12]', '', fn_stub)
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
333 fn_stub = re.sub('_R[12]_001$', '', fn_stub)
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
334 RG_dict['RG'].append(fn_stub)
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
335
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
336 ### prepare template string
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
337 if align_sets[2] == 'PE':
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
338 read_str = '${fastq_left} ${fastq_right}'
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
339 else:
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
340 read_str = '${fastq_left}'
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
341
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
342 ### simulate two pass alignment until STAR fully implements this
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
343 if args.twopass1readsN != 0:
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
344
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
345 ### run first round of alignments and only record junctions
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
346 align_template_str_1st = """STAR \
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
347 --genomeDir ${genomeDir} --readFilesIn %s \
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
348 --runThreadN ${runThreadN} \
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
349 --outFilterMultimapScoreRange ${outFilterMultimapScoreRange} \
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
350 --outFilterMultimapNmax ${outFilterMultimapNmax} \
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
351 --outFilterMismatchNmax ${outFilterMismatchNmax} \
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
352 --alignIntronMax ${alignIntronMax} \
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
353 --alignMatesGapMax ${alignMatesGapMax} \
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
354 --sjdbScore ${sjdbScore} \
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
355 --alignSJDBoverhangMin ${alignSJDBoverhangMin} \
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
356 --genomeLoad ${genomeLoad} \
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
357 --readFilesCommand ${readFilesCommand} \
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
358 --outFilterMatchNminOverLread ${outFilterMatchNminOverLread} \
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
359 --outFilterScoreMinOverLread ${outFilterScoreMinOverLread} \
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
360 --sjdbOverhang ${sjdbOverhang} \
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
361 --outSAMstrandField ${outSAMstrandField} \
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
362 --outSAMtype None \
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
363 --outSAMmode None""" % read_str
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
364
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
365 if args.twopass1readsN > 0:
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
366 align_template_str_1st += " --readMapNumber %i" % args.twopass1readsN
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
367
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
368 cmd = string.Template(align_template_str_1st).safe_substitute({
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
369 'genomeDir' : os.path.abspath(args.genomeDir),
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
370 'runThreadN' : args.runThreadN,
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
371 'outFilterMultimapScoreRange' : args.outFilterMultimapScoreRange,
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
372 'outFilterMultimapNmax' : args.outFilterMultimapNmax,
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
373 'outFilterMismatchNmax' : args.outFilterMismatchNmax,
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
374 'fastq_left' : ','.join([os.path.join(x[0], x[1]) for x in align_sets[1]]),
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
375 'alignIntronMax' : args.alignIntronMax,
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
376 'alignMatesGapMax': args.alignMatesGapMax,
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
377 'sjdbScore': args.sjdbScore,
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
378 'alignSJDBoverhangMin' : args.alignSJDBoverhangMin,
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
379 'genomeLoad' : args.genomeLoad,
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
380 'readFilesCommand' : align_sets[0],
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
381 'outFilterMatchNminOverLread' : args.outFilterMatchNminOverLread,
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
382 'outFilterScoreMinOverLread' : args.outFilterScoreMinOverLread,
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
383 'sjdbOverhang' : args.sjdbOverhang,
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
384 'outSAMstrandField' : args.outSAMstrandField
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
385 })
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
386 if align_sets[2] == 'PE':
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
387 cmd = string.Template(cmd).substitute({
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
388 'fastq_right' : ','.join([os.path.join(x[0], x[2]) for x in align_sets[1]])
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
389 })
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
390
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
391 ### take temp directory from environment variable
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
392 if args.useTMP is not None:
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
393 align_dir_1st = os.path.abspath( tempfile.mkdtemp(dir=os.environ[args.useTMP], prefix="star_aligndir_1st_") )
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
394 genome_dir_1st = os.path.abspath( tempfile.mkdtemp(dir=os.environ[args.useTMP], prefix="star_genomedir_1st_") )
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
395 else:
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
396 align_dir_1st = os.path.abspath( tempfile.mkdtemp(dir=args.workDir, prefix="star_aligndir_1st_") )
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
397 genome_dir_1st = os.path.abspath( tempfile.mkdtemp(dir=args.workDir, prefix="star_genomedir_1st_") )
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
398 print "Running", cmd
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
399 subprocess.check_call(cmd, shell=True, cwd=align_dir_1st)
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
400
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
401 ### build index using provided genome fasta as well as junctions from first run
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
402 cmd = """STAR --runMode genomeGenerate --genomeDir %s \
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
403 --genomeFastaFiles %s \
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
404 --sjdbOverhang %i \
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
405 --runThreadN %i \
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
406 --sjdbFileChrStartEnd %s""" % (genome_dir_1st, args.genomeFastaFiles, args.sjdbOverhang, args.runThreadN, os.path.join(align_dir_1st, 'SJ.out.tab'))
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
407 print "Running", cmd
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
408 subprocess.check_call(cmd, shell=True, cwd=align_dir_1st)
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
409
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
410 ### replace index for the second run with the one currently built
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
411 genome_dir = genome_dir_1st
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
412 else:
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
413 genome_dir = os.path.abspath(args.genomeDir)
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
414
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
415
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
416 align_template_str = """STAR \
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
417 --genomeDir ${genomeDir} --readFilesIn %s \
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
418 --runThreadN ${runThreadN} \
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
419 --outFilterMultimapScoreRange ${outFilterMultimapScoreRange} \
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
420 --outFilterMultimapNmax ${outFilterMultimapNmax} \
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
421 --outFilterMismatchNmax ${outFilterMismatchNmax} \
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
422 --alignIntronMax ${alignIntronMax} \
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
423 --alignMatesGapMax ${alignMatesGapMax} \
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
424 --sjdbScore ${sjdbScore} \
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
425 --alignSJDBoverhangMin ${alignSJDBoverhangMin} \
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
426 --genomeLoad ${genomeLoad} \
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
427 --limitBAMsortRAM ${limitBAMsortRAM} \
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
428 --readFilesCommand ${readFilesCommand} \
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
429 --outFilterMatchNminOverLread ${outFilterMatchNminOverLread} \
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
430 --outFilterScoreMinOverLread ${outFilterScoreMinOverLread} \
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
431 --sjdbOverhang ${sjdbOverhang} \
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
432 --outSAMstrandField ${outSAMstrandField} \
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
433 --outSAMattributes ${outSAMattributes} \
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
434 --outSAMunmapped ${outSAMunmapped} \
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
435 --outSAMtype ${outSAMtype} \
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
436 --outSAMheaderHD ${outSAMheaderHD}""" % read_str
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
437
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
438 #--twopass1readsN ${twopass1readsN} \
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
439
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
440 cmd = string.Template(align_template_str).safe_substitute({
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
441 'genomeDir' : genome_dir,
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
442 'runThreadN' : args.runThreadN,
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
443 'fastq_left' : ','.join([os.path.join(x[0], x[1]) for x in align_sets[1]]), #os.path.abspath(pair[1]),
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
444 'outFilterMultimapScoreRange' : args.outFilterMultimapScoreRange,
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
445 'outFilterMultimapNmax' : args.outFilterMultimapNmax,
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
446 'outFilterMismatchNmax' : args.outFilterMismatchNmax,
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
447 'alignIntronMax' : args.alignIntronMax,
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
448 'alignMatesGapMax': args.alignMatesGapMax,
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
449 'sjdbScore': args.sjdbScore,
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
450 'alignSJDBoverhangMin' : args.alignSJDBoverhangMin,
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
451 'genomeLoad' : args.genomeLoad,
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
452 'limitBAMsortRAM' : args.limitBAMsortRAM,
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
453 'readFilesCommand' : align_sets[0],
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
454 'outFilterMatchNminOverLread' : args.outFilterMatchNminOverLread,
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
455 'outFilterScoreMinOverLread' : args.outFilterScoreMinOverLread,
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
456 'sjdbOverhang' : args.sjdbOverhang,
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
457 'outSAMstrandField' : args.outSAMstrandField,
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
458 'outSAMattributes' : " ".join(args.outSAMattributes),
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
459 'outSAMunmapped' : args.outSAMunmapped,
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
460 'outSAMtype' : " ".join(args.outSAMtype),
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
461 'outSAMheaderHD' : " ".join(args.outSAMheaderHD)
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
462 })
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
463 # 'twopass1readsN' : args.twopass1readsN,
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
464 if align_sets[2] == 'PE':
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
465 cmd = string.Template(cmd).substitute({
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
466 'fastq_right' : ','.join([os.path.join(x[0], x[2]) for x in align_sets[1]]) # os.path.abspath(pair[2]),
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
467 })
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
468
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
469 ### convert RG_dict into formatted RG line
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
470 RG_line = []
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
471 for r, readgroup in enumerate(align_sets[1]):
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
472 if 'RG' in RG_dict:
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
473 tmp = 'ID:%s:%s' % (RG_dict['ID'], RG_dict['RG'][r])
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
474 else:
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
475 tmp = 'ID:%s:%s' % (RG_dict['ID'], readgroup[0])
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
476 if len(RG_dict) > 1:
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
477 tmp += '\t'
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
478 tmp += '\t'.join(['%s:%s' % (key, RG_dict[key]) for key in RG_dict if key not in ['ID', 'RG', 'SI']])
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
479 ### add read group label
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
480 if 'RG' in RG_dict and 'CN' in RG_dict:
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
481 tmp += '\tPU:%s:%s' % (RG_dict['CN'], RG_dict['RG'][r])
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
482 RG_line.append('%s' % tmp)
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
483 cmd += ' --outSAMattrRGline %s' % ' , '.join(RG_line)
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
484
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
485 ### handle comment lines
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
486 comment_file = None
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
487 if 'SI' in RG_dict:
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
488 if args.useTMP is not None:
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
489 comment_file = os.path.abspath( tempfile.mkstemp(dir=os.environ[args.useTMP], prefix="star_comments_")[1] )
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
490 else:
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
491 comment_file = os.path.abspath( tempfile.mkstemp(dir=args.workDir, prefix="star_comments_")[1] )
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
492
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
493 fd_com = open(comment_file, 'w')
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
494 fd_com.write('@CO\tsubmitter_sample_id:%s\n' % RG_dict['SI'])
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
495
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
496 fd_com.flush()
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
497 fd_com.close()
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
498
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
499 cmd += ' --outSAMheaderCommentFile %s' % comment_file
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
500
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
501
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
502 ### take temp directory from environment variable
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
503 if args.useTMP is not None:
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
504 align_dir = os.path.abspath( tempfile.mkdtemp(dir=os.environ[args.useTMP], prefix="star_aligndir_") )
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
505 else:
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
506 align_dir = os.path.abspath( tempfile.mkdtemp(dir=args.workDir, prefix="star_aligndir_") )
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
507 print "Running", cmd
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
508 subprocess.check_call(cmd, shell=True, cwd=align_dir)
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
509
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
510 ### move output file
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
511 if 'BAM' in args.outSAMtype and 'SortedByCoordinate' in args.outSAMtype:
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
512 shutil.move(os.path.join(align_dir, 'Aligned.sortedByCoord.out.bam'), args.out)
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
513 elif 'BAM' in args.outSAMtype and 'Unsorted' in args.outSAMtype:
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
514 shutil.move(os.path.join(align_dir, 'Aligned.out.bam'), args.out)
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
515 else:
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
516 raise Exception('STAR output file could not be determined')
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
517
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
518 ### move junctions if to be kept
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
519 if args.keepJunctions:
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
520 shutil.move(os.path.join(align_dir, 'SJ.out.tab'), args.out + '.junctions')
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
521
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
522 ### clean up working directory
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
523 shutil.rmtree(workdir)
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
524 shutil.rmtree(align_dir)
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
525 if args.twopass1readsN != 0:
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
526 shutil.rmtree(align_dir_1st)
be0f5f54462d Uploaded
daumsoft
parents:
diff changeset
527 shutil.rmtree(genome_dir_1st)