0
|
1 #!/usr/bin/env python
|
|
2
|
|
3
|
|
4 """
|
|
5 Script to scan and extract TCGA data and compile it into the cgData format
|
|
6
|
|
7 Usage::
|
|
8
|
|
9 tcga2cgdata.py [options]
|
|
10
|
|
11 Options::
|
|
12
|
|
13 -h, --help show this help message and exit
|
|
14 -a, --platform-list Get list of platforms
|
|
15 -p PLATFORM, --platform=PLATFORM
|
|
16 Platform Selection
|
|
17 -l, --supported List Supported Platforms
|
|
18 -f FILELIST, --filelist=FILELIST
|
|
19 List files needed to convert TCGA project basename
|
|
20 into cgData
|
|
21 -b BASENAME, --basename=BASENAME
|
|
22 Convert TCGA project basename into cgData
|
|
23 -m MIRROR, --mirror=MIRROR
|
|
24 Mirror Location
|
|
25 -w WORKDIR_BASE, --workdir=WORKDIR_BASE
|
|
26 Working directory
|
|
27 -o OUTDIR, --out-dir=OUTDIR
|
|
28 Working directory
|
|
29 -c CANCER, --cancer=CANCER
|
|
30 List Archives by cancer type
|
|
31 -d DOWNLOAD, --download=DOWNLOAD
|
|
32 Download files for archive
|
|
33 -e LEVEL, --level=LEVEL
|
|
34 Data Level
|
|
35 -s CHECKSUM, --check-sum=CHECKSUM
|
|
36 Check project md5
|
|
37 -r, --sanitize Remove race/ethnicity from clinical data
|
|
38
|
|
39
|
|
40 Example::
|
|
41
|
|
42 ./scripts/tcga2cgdata.py -b intgen.org_KIRC_bio -m /inside/depot -e 1 -r -w tmp
|
|
43
|
|
44
|
|
45 """
|
|
46
|
|
47 from xml.dom.minidom import parseString
|
|
48 import urllib
|
|
49 import urllib2
|
|
50 import os
|
|
51 import csv
|
|
52 import sys
|
|
53 import hashlib
|
|
54 import tempfile
|
|
55 import re
|
|
56 import copy
|
|
57 import json
|
|
58 import datetime
|
|
59 import hashlib
|
|
60 import subprocess
|
|
61 from glob import glob
|
|
62 import shutil
|
|
63 import subprocess
|
|
64 from argparse import ArgumentParser
|
|
65
|
|
66
|
|
67
|
|
68
|
|
69 """
|
|
70
|
|
71 Net query code
|
|
72
|
|
73 """
|
|
74
|
|
class dccwsItem(object):
    """Iterable wrapper around a paged TCGA DCC web-service XML query.

    Subclasses set ``self.url``.  Iterating an instance fetches that URL,
    yields one dict per ``<class>`` element of the ``<queryResponse>`` and
    then follows the ``<next>`` link (if any) until no further page exists.
    """

    baseURL = "http://tcga-data.nci.nih.gov/tcgadccws/GetXML?query="

    def __init__(self):
        self.url = None

    def __iter__(self):
        """Yield a ``{name: value}`` dict for every record of the query.

        For each child element of a record, the key is its ``name``
        attribute; the value is its ``xlink:href`` attribute when present,
        otherwise the concatenated text of the element.
        """
        page_url = self.url
        while page_url is not None:
            handle = urllib.urlopen(page_url)
            try:
                data = handle.read()
            finally:
                # release the connection even if the read fails
                handle.close()
            dom = parseString(data)

            # there might not be any archives for a dataset
            responses = dom.getElementsByTagName('queryResponse')
            if len(responses) > 0:
                response = responses.pop()
                for cls in response.getElementsByTagName('class'):
                    outData = {}
                    for node in cls.childNodes:
                        # Text/comment nodes carry no attributes; calling
                        # getAttribute on them would raise AttributeError.
                        if node.nodeType != node.ELEMENT_NODE:
                            continue
                        nodeName = node.getAttribute("name")
                        if node.hasAttribute("xlink:href"):
                            outData[nodeName] = node.getAttribute("xlink:href")
                        else:
                            outData[nodeName] = getText(node.childNodes)
                    yield outData

            # follow the pagination link, or stop when there is none
            nextLinks = dom.getElementsByTagName('next')
            if len(nextLinks) > 0:
                page_url = nextLinks.pop().getAttribute('xlink:href')
            else:
                page_url = None
108
|
|
109
|
|
class CustomQuery(dccwsItem):
    """Query given either as a full URL or as a bare query expression.

    A value starting with ``http://`` or ``https://`` is used verbatim;
    anything else is appended to ``dccwsItem.baseURL``.
    """

    def __init__(self, query):
        super(CustomQuery, self).__init__()
        # Links handed back by the service are already absolute URLs;
        # accept both schemes so secure links are not prefixed again.
        if query.startswith(("http://", "https://")):
            self.url = query
        else:
            self.url = dccwsItem.baseURL + query
117
|
|
118
|
|
def getText(nodelist):
    """Return the concatenated data of the text nodes in *nodelist*.

    Only nodes whose type is TEXT_NODE contribute; other nodes (and their
    descendants) are ignored.  An empty list gives an empty string.
    """
    return ''.join(
        child.data
        for child in nodelist
        if child.nodeType == child.TEXT_NODE
    )
125
|
|
126 """
|
|
127
|
|
128 Build Configuration
|
|
129
|
|
130 """
|
|
131
|
|
class BuildConf:
    """Settings for converting one TCGA project into cgData output files.

    The constructor stores the platform, project base name, version string,
    metadata dict and archive list.  ``addOptions`` must be called before
    any of the ``getOut*`` methods, because it supplies the output
    locations they read.
    """

    def __init__(self, platform, name, version, meta, tarlist):
        self.platform = platform
        self.name = name
        self.version = version
        self.meta = meta
        self.tarlist = tarlist
        self.abbr = ''
        self.uuid_table = None
        if 'diseaseAbbr' in meta:
            self.abbr = meta['diseaseAbbr']

    def addOptions(self, opts):
        """Copy output settings from the parsed options object *opts*.

        Reads ``workdir_base``, ``outdir``, ``sanitize``, ``outpath``,
        ``metapath``, ``errorpath``, ``clinical_type``, ``out_clinical``
        (an iterable of ``(type, path, meta)`` triples) and ``uuid_table``
        (path of a two-column tab-separated file, or None).
        """
        self.workdir_base = opts.workdir_base
        self.outdir = opts.outdir
        self.sanitize = opts.sanitize
        self.outpath = opts.outpath
        self.metapath = opts.metapath
        self.errorpath = opts.errorpath
        self.clinical_type = opts.clinical_type

        # keyed by the ".<type>" suffix that is passed to the getOut* methods
        self.clinical_type_map = {}
        for t, path, meta in opts.out_clinical:
            self.clinical_type_map["." + t] = (path, meta)

        if opts.uuid_table is not None:
            self.uuid_table = {}
            with open(opts.uuid_table) as handle:
                for line in handle:
                    tmp = line.rstrip().split("\t")
                    if len(tmp) < 2:
                        # blank or one-column line: nothing to map
                        continue
                    self.uuid_table[tmp[0]] = tmp[1]

    def translateUUID(self, uuid):
        """Return the table entry for *uuid*, or *uuid* itself if unmapped."""
        if self.uuid_table is None or uuid not in self.uuid_table:
            return uuid
        return self.uuid_table[uuid]

    def getOutPath(self, name):
        """Return the data file path for the output with suffix *name*.

        An explicit ``outpath`` wins, then a per-clinical-type path, then
        ``<outdir>/<project name><name>``.
        """
        if self.outpath is not None:
            return self.outpath
        if name in self.clinical_type_map:
            return self.clinical_type_map[name][0]
        return os.path.join(self.outdir, self.name) + name

    def getOutMeta(self, name):
        """Return the metadata (JSON) file path for suffix *name*."""
        if self.outpath is not None:
            if self.metapath is not None:
                return self.metapath
            return self.outpath + ".json"
        if name in self.clinical_type_map:
            return self.clinical_type_map[name][1]
        return os.path.join(self.outdir, self.name) + name + ".json"

    def getOutError(self, name):
        """Return the error log file path for suffix *name*."""
        if self.outpath is not None:
            if self.errorpath is not None:
                return self.errorpath
            return self.outpath + ".error"
        return os.path.join(self.outdir, self.name) + name + ".error"
191
|
|
192
|
|
def getBaseBuildConf(basename, level, mirror):
    """Query the TCGA web service and build a BuildConf for *basename*.

    Two queries are made: the latest archives of type ``Level_<level>``
    and the latest ``mage-tab`` archives.  For each archive found, its
    ``addedDate`` is recorded, its public URL is added to
    ``meta['sourceUrl']`` and ``mirror + deployLocation`` is mapped to the
    platform name.  Platform, disease, tissue and center metadata are
    filled in from the first Level archive.

    Returns a BuildConf whose version is the most recent ``addedDate``
    (``YYYY-MM-DD``), or None when neither query returned an archive.
    """
    dates = []
    print("TCGA Query for:  %s" % (basename,))
    q = CustomQuery("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=Level_%s]]" % (basename, level))
    urls = {}
    meta = None
    platform = None
    for e in q:
        dates.append(datetime.datetime.strptime(e['addedDate'], "%m-%d-%Y"))
        if meta is None:
            # descriptive metadata is looked up once, from the first archive
            meta = {"sourceUrl": []}
            for e2 in CustomQuery(e['platform']):
                platform = e2['name']
                meta['platform'] = e2['name']
                meta['platformTitle'] = e2['displayName']
            for e2 in CustomQuery(e['disease']):
                meta['diseaseAbbr'] = e2['abbreviation']
                meta['diseaseTitle'] = e2['name']
                for e3 in CustomQuery(e2['tissueCollection']):
                    meta['tissue'] = e3['name']
            for e2 in CustomQuery(e['center']):
                meta['centerTitle'] = e2['displayName']
                meta['center'] = e2['name']
        meta['sourceUrl'].append("http://tcga-data.nci.nih.gov/" + e['deployLocation'])
        urls[mirror + e['deployLocation']] = platform

    print("TCGA Query for mage-tab:  %s" % (basename,))
    q = CustomQuery("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=mage-tab]]" % (basename))
    for e in q:
        dates.append(datetime.datetime.strptime(e['addedDate'], "%m-%d-%Y"))
        platform = None
        for e2 in CustomQuery(e['platform']):
            print(e2)
            platform = e2['name']
        if meta is None:
            # no Level archive matched; still record the mage-tab source
            # instead of failing on a None metadata dict
            meta = {"sourceUrl": []}
        meta['sourceUrl'].append("http://tcga-data.nci.nih.gov/" + e['deployLocation'])
        urls[mirror + e['deployLocation']] = platform

    if len(dates) == 0:
        print("No Files found")
        return

    # the newest archive date becomes the version of the build
    versionDate = max(dates).strftime("%Y-%m-%d")

    return BuildConf(platform, basename, versionDate, meta, urls)
239
|
|
240
|
|
241
|
|
242
|
|
243
|
|
class TableReader:
    """Iterate over a ``key<TAB>json`` file as ``(key, value)`` pairs.

    A path of None, or a path that does not exist, yields nothing.
    """

    def __init__(self, path):
        self.path = path

    def __iter__(self):
        if self.path is not None and os.path.exists(self.path):
            # the context manager closes the file even when the consumer
            # stops early or a row fails to parse
            with open(self.path) as handle:
                for line in handle:
                    tmp = line.rstrip().split("\t")
                    yield tmp[0], json.loads(tmp[1])
255
|
|
256
|
|
class FileImporter:
    """Base class that unpacks archives and scans the extracted files.

    ``run`` extracts every archive in ``config.tarlist`` into a fresh
    working directory, walks the tree and dispatches each file to
    ``mageScan`` or ``fileScan``, then calls ``fileBuild``.  Those three
    methods are not defined here; subclasses must provide them.
    """

    # optional regular expressions (as strings) applied to file base names
    fileInclude = None
    fileExclude = None

    # base-name patterns that are never passed to fileScan
    # NOTE(review): "DESCRIPTIO$" may be a truncated "DESCRIPTION" - confirm
    excludes = [
        "MANIFEST.txt$",
        "CHANGES_DCC.txt$",
        "README_DCC.txt$",
        "README.txt$",
        "CHANGES.txt$",
        "DCC_ALTERED_FILES.txt$",
        r'.wig$',
        "DESCRIPTIO$"
    ]

    def __init__(self, config):
        self.config = config

    def extractTars(self):
        """Unpack every archive of the config into a new temp directory."""
        self.work_dir = tempfile.mkdtemp(dir=self.config.workdir_base)
        print("Extract to  %s" % (self.work_dir,))
        for path in self.config.tarlist:
            subprocess.check_call(["tar", "xvzf", path, "-C", self.work_dir], stderr=sys.stdout)

    def run(self):
        """Extract, scan all files, close the emit ports, then build."""
        self.extractTars()

        filterInclude = None
        filterExclude = None
        if self.fileInclude is not None:
            filterInclude = re.compile(self.fileInclude)
        if self.fileExclude is not None:
            filterExclude = re.compile(self.fileExclude)
        self.inc = 0
        self.out = {}
        self.errors = []
        self.ext_meta = {}
        self.scandirs(self.work_dir, filterInclude, filterExclude)
        # flush the intermediate tables before fileBuild reads them back
        for o in self.out:
            self.out[o].close()
        self.fileBuild()
        #shutil.rmtree(self.work_dir)

    def checkExclude(self, name):
        """Return True when *name* matches one of the ``excludes`` patterns."""
        for pattern in self.excludes:
            if re.search(pattern, name):
                return True
        return False

    def scandirs(self, path, filterInclude=None, filterExclude=None):
        """Recursively visit *path* and dispatch every file found.

        MAGE-TAB files go to ``mageScan``.  Other files go to ``fileScan``
        unless excluded by ``excludes`` or by the compiled include/exclude
        filters, which are matched against the file's base name.
        """
        if os.path.isdir(path):
            for a in glob(os.path.join(path, "*")):
                self.scandirs(a, filterInclude, filterExclude)
            return

        name = os.path.basename(path)
        if self.isMage(path):
            self.mageScan(path)
        elif not self.checkExclude(name):
            included = filterInclude is None or filterInclude.match(name)
            excluded = filterExclude is not None and filterExclude.match(name)
            if included and not excluded:
                self.fileScan(path)

    def isMage(self, path):
        """Return True for SDRF, IDF and DESCRIPTION.txt files."""
        return path.endswith(('.sdrf.txt', '.idf.txt', 'DESCRIPTION.txt'))

    def emit(self, key, data, port):
        """Append ``key<TAB>json(data)`` to the work-dir file named *port*.

        The file is opened on first use and stays open in ``self.out``.
        """
        if port not in self.out:
            self.out[port] = open(self.work_dir + "/" + port, "w")
        self.out[port].write("%s\t%s\n" % (key, json.dumps(data)))

    def emitFile(self, name, meta, file):
        """Copy *file* to its output path and write metadata and errors.

        The MD5 of the copied bytes is stored in ``meta['md5']`` before
        *meta* is written as JSON.  An error file is written only when
        errors have been recorded with ``addError``.
        """
        md5 = hashlib.md5()
        with open(self.config.getOutPath(name), "wb") as oHandle:
            with open(file, 'rb') as f:
                # the sentinel must be bytes: a text '' never equals the
                # b'' returned at end of file on Python 3
                for chunk in iter(lambda: f.read(8192), b''):
                    md5.update(chunk)
                    oHandle.write(chunk)
        meta['md5'] = md5.hexdigest()
        with open(self.config.getOutMeta(name), "w") as mHandle:
            mHandle.write(json.dumps(meta))
        if len(self.errors):
            with open(self.config.getOutError(name), "w") as eHandle:
                for msg in self.errors:
                    eHandle.write(msg + "\n")

    def addError(self, msg):
        """Record *msg* for the error file written by ``emitFile``."""
        self.errors.append(msg)
351
|
|
352
|
|
# Column-header spellings found in TCGA data files, mapped to the single
# name used for that column in the emitted records.
commonMap = dict([
    ("Chromosome", "chrom"),
    ("Start", "loc.start"),
    ("End", "loc.end"),
    ("mean", "seg.mean"),
    ("Segment_Mean", "seg.mean"),
])


# IDF row labels mapped to the metadata key their value is stored under.
idfMap = dict([
    ("Investigation Title", "title"),
    ("Experiment Description", "experimentalDescription"),
    ("Person Affiliation", "dataProducer"),
    ("Date of Experiment", "experimentalDate"),
])
368
|
|
class TCGAGeneticImport(FileImporter):
    """Shared scanning logic for probe-matrix and segment importers.

    Subclasses are expected to define ``probeFields`` (read by
    ``fileScan``) and ``fileBuild``.
    """

    def mageScan(self, path):
        """Collect target names and metadata from one MAGE-TAB file.

        * ``.sdrf.txt``: for every data row, emit to the ``targets`` port a
          mapping from each file/hybridization/sample/extract identifier to
          the row's "Extract Name".  Rows whose "Material Type" is one of
          genomic_DNA, total_RNA or "MDA cell line" are skipped.
        * ``.idf.txt``: copy the rows listed in ``idfMap`` into
          ``self.ext_meta``.
        * ``DESCRIPTION.txt``: store the whole text as the description.
        """
        if path.endswith(".sdrf.txt"):
            skipTypes = ["genomic_DNA", "total_RNA", "MDA cell line"]
            with open(path, "rU") as iHandle:
                colNum = None
                for row in csv.reader(iHandle, delimiter="\t"):
                    if colNum is None:
                        # first row is the header: column title -> index
                        colNum = {}
                        for i in range(len(row)):
                            colNum[row[i]] = i
                        continue
                    try:
                        # this lookup sits inside the guard too, so a blank
                        # row no longer aborts the scan
                        if "Material Type" in colNum and row[colNum["Material Type"]] in skipTypes:
                            continue
                        extract = row[colNum["Extract Name"]]
                        if "Derived Array Data File" in colNum:
                            self.emit(row[colNum["Derived Array Data File"]].split('.')[0], extract, "targets")
                            self.emit(row[colNum["Derived Array Data File"]], extract, "targets")
                        if "Derived Array Data Matrix File" in colNum:
                            self.emit(row[colNum["Derived Array Data Matrix File"]], extract, "targets")
                        if "Derived Data File" in colNum:
                            self.emit(row[colNum["Derived Data File"]].split('.')[0], extract, "targets")
                            self.emit(row[colNum["Derived Data File"]], extract, "targets")
                        if "Hybridization Name" in colNum:
                            self.emit(row[colNum["Hybridization Name"]], extract, "targets")
                        if "Sample Name" in colNum:
                            self.emit(row[colNum["Sample Name"]], extract, "targets")
                        self.emit(extract, extract, "targets")
                    except IndexError:
                        pass  # there can be blank lines in the SDRF
        if path.endswith(".idf.txt"):
            with open(path) as iHandle:
                for line in iHandle:
                    # drop the line ending so it does not end up in the value
                    row = line.rstrip("\r\n").split("\t")
                    if len(row) > 1 and row[0] in idfMap:
                        self.ext_meta[idfMap[row[0]]] = row[1]
        if path.endswith("DESCRIPTION.txt"):
            with open(path) as handle:
                self.ext_meta['description'] = handle.read()

    def translateUUID(self, uuid):
        """Delegate UUID translation to the build configuration."""
        return self.config.translateUUID(uuid)

    def getTargetMap(self):
        """Sort the ``targets`` table and return it as a dict.

        The sorted copy is written to ``targets.sort`` in the work dir.
        When a key occurs more than once the last row wins.
        """
        sortedPath = self.work_dir + "/targets.sort"
        # argument list instead of a shell string, so the work-dir path is
        # never interpreted by a shell
        with open(sortedPath, "w") as sortedOut:
            subprocess.call(["sort", "-k", "1", self.work_dir + "/targets"], stdout=sortedOut)
        tTrans = {}
        for key, value in TableReader(sortedPath):
            tTrans[key] = value
        return tTrans

    def fileScan(self, path):
        """Parse one tab-separated data file and emit its rows.

        The first header cell selects the layout:

        1. "Chromosome"/"chromosome": segment file for a single sample; the
           key is the file name up to its first '.'.
        2. "Hybridization REF"/"Sample REF": matrix with a second header
           row naming the value type of every column.
        3. second header cell is "chrom": segment file whose first column
           holds the sample.

        Layouts 1 and 3 emit to ``segments``; layout 2 and any other file
        emit to ``probes``.  Header names are normalised via ``commonMap``.
        """
        with open(path) as iHandle:
            mode = None
            target = None
            colName = None
            colType = None
            for line in iHandle:
                if colName is None:
                    colName = line.rstrip().split("\t")
                    if colName[0] == "Hybridization REF" or colName[0] == "Sample REF":
                        mode = 2
                    elif colName[0] == "Chromosome" or colName[0] == "chromosome":
                        mode = 1
                        # seg files are named by the filename before the '.' extension
                        target = os.path.basename(path).split('.')[0]
                    elif len(colName) > 1 and colName[1] == "chrom":
                        mode = 3
                        target = os.path.basename(path).split('.')[0]

                    for i in range(len(colName)):
                        if colName[i] in commonMap:
                            colName[i] = commonMap[colName[i]]
                elif mode == 2 and colType is None:
                    # second header row of a matrix file: per-column value type
                    colType = line.rstrip().split("\t")
                    for i in range(len(colType)):
                        if colType[i] in commonMap:
                            colType[i] = commonMap[colType[i]]
                else:
                    tmp = line.rstrip().split("\t")
                    if mode == 2:
                        out = {}
                        for col in colName[1:]:
                            out[col] = {"target": col}
                        for i in range(1, len(colType)):
                            try:
                                if colType[i] in self.probeFields:
                                    out[colName[i]][colType[i]] = tmp[i]
                            except IndexError:
                                # row shorter than the header: value missing
                                out[colName[i]][colType[i]] = "NA"
                        for col in out:
                            self.emit(tmp[0], out[col], "probes")
                    else:
                        out = {}
                        for i in range(len(colName)):
                            out[colName[i]] = tmp[i]
                        out['file'] = os.path.basename(path)
                        if mode == 1:
                            self.emit(target, out, "segments")
                        elif mode == 3:
                            self.emit(tmp[0], out, "segments")
                        else:
                            self.emit(tmp[0], out, "probes")
486
|
|
487
|
|
488
|
|
489
|
|
class TCGASegmentImport(TCGAGeneticImport):
    """Importer that turns segment files into one five-column segment file.

    Subclasses are expected to define ``dataSubType`` (read by ``getMeta``).
    """

    def fileScan(self, path):
        """Parse one tab-separated segment file and emit rows to ``segments``.

        The header selects the layout:

        1. first cell "Chromosome"/"chromosome": one sample per file; the
           key is the file name up to its first '.'.
        2. second cell "chrom": the first column of every row is the key.

        Rows of files matching neither layout are parsed but not emitted.
        Header names are normalised via ``commonMap``.
        """
        with open(path) as iHandle:
            mode = None
            target = None
            colName = None
            for line in iHandle:
                if colName is None:
                    colName = line.rstrip().split("\t")
                    if colName[0] == "Chromosome" or colName[0] == "chromosome":
                        mode = 1
                        # seg files are named by the filename before the '.' extension
                        target = os.path.basename(path).split('.')[0]
                    elif len(colName) > 1 and colName[1] == "chrom":
                        mode = 2

                    for i in range(len(colName)):
                        if colName[i] in commonMap:
                            colName[i] = commonMap[colName[i]]
                else:
                    tmp = line.rstrip().split("\t")
                    out = {}
                    for i in range(len(colName)):
                        out[colName[i]] = tmp[i]
                    out['file'] = os.path.basename(path)
                    if mode == 1:
                        self.emit(target, out, "segments")
                    elif mode == 2:
                        self.emit(tmp[0], out, "segments")

    def getMeta(self, name):
        """Return the metadata dict written next to the segment file.

        Values from ``self.ext_meta`` and then ``self.config.meta``
        override the defaults built here.
        """
        matrixInfo = {
            '@context': "http://purl.org/cgdata/",
            '@type': 'bed5',
            '@id': name,
            "lastModified": self.config.version,
            'rowKeySrc': {
                '@type': 'idDAG',
                '@id': "tcga.%s" % (self.config.abbr)
            },
            'dataSubType': {"@id": self.dataSubType},
            'dataProducer': 'TCGA Import',
            "accessMap": "public", "redistribution": "yes"
        }
        matrixInfo.update(self.ext_meta)
        matrixInfo.update(self.config.meta)
        return matrixInfo

    def fileBuild(self):
        """Write the sorted segments as ``chrom, start, end, name, value``.

        Segment keys are resolved through the target map and the UUID
        table.  Unknown keys and rows lacking a required field are
        recorded as errors.  Nothing is emitted when no segment was read.
        """
        tTrans = self.getTargetMap()
        sortedPath = self.work_dir + "/segments.sort"
        with open(sortedPath, "w") as sortedOut:
            subprocess.call(["sort", "-k", "1", self.work_dir + "/segments"], stdout=sortedOut)

        startField = "loc.start"
        endField = "loc.end"
        valField = "seg.mean"
        chromeField = "chrom"

        segPath = "%s/segment_file" % (self.work_dir)
        segFile = None
        try:
            for key, value in TableReader(sortedPath):
                if segFile is None:
                    segFile = open(segPath, "w")
                try:
                    curName = self.translateUUID(tTrans[key])
                except KeyError:
                    self.addError("TargetInfo Not Found: %s" % (key))
                    continue
                if curName is None:
                    continue
                try:
                    # normalise to a lower-case "chr" prefix followed by the
                    # upper-cased chromosome label
                    chrom = value[chromeField].lower()
                    if not chrom.startswith("chr"):
                        chrom = "chr" + chrom
                    chrom = chrom.upper().replace("CHR", "chr")
                    segFile.write("%s\t%s\t%s\t%s\t%s\n" % (chrom, value[startField], value[endField], curName, value[valField]))
                except KeyError:
                    self.addError("Field error: %s" % (str(value)))
        finally:
            if segFile is not None:
                segFile.close()

        if segFile is None:
            # no segments were read, so there is no file to publish
            return

        self.emitFile("", self.getMeta(self.config.name), segPath)
594
|
|
595
|
|
596
|
|
class TCGAMatrixImport(TCGAGeneticImport):
    """Importer that pivots emitted probe values into a probe x sample matrix.

    Subclasses are expected to define ``dataSubType``, ``probeMap`` and
    ``probeFields``.
    """

    def getMeta(self, name):
        """Return the metadata dict written next to the matrix file.

        Values from ``self.ext_meta`` and then ``self.config.meta``
        override the defaults built here.
        """
        matrixInfo = {
            "@context": 'http://purl.org/cgdata/',
            '@type': 'genomicMatrix',
            '@id': name,
            "lastModified": self.config.version,
            'dataSubType': {"@id": self.dataSubType},
            'dataProducer': 'TCGA',
            "accessMap": "public",
            "redistribution": "yes",
            'rowKeySrc': {
                "@type": "probe", "@id": self.probeMap
            },
            'columnKeySrc': {
                "@type": "idDAG", "@id": "tcga.%s" % (self.config.abbr)
            }
        }
        matrixInfo.update(self.ext_meta)
        matrixInfo.update(self.config.meta)
        return matrixInfo

    def _writeRow(self, matrixFile, probe, curData, tTrans, tEnum):
        """Write one probe row; return 1 if written, 0 if it was all "NA"."""
        out = ["NA"] * len(tEnum)
        for target in curData:
            try:
                ttarget = self.translateUUID(tTrans[target])
                if ttarget is not None:
                    out[tEnum[ttarget]] = str(curData[target])
            except KeyError:
                self.addError("TargetInfo Not Found: %s" % (target))
        if out.count("NA") == len(tEnum):
            return 0
        matrixFile.write("%s\t%s\n" % (probe, "\t".join(out)))
        return 1

    def fileBuild(self):
        """Assemble the probe matrix from the sorted ``probes`` table.

        The target table supplies the column for every sample label.  Rows
        are grouped by probe name (the table is sorted, so equal keys are
        adjacent).  The matrix is emitted only if at least one row holds a
        value.
        """
        probePath = self.work_dir + "/probes.sort"
        with open(probePath, "w") as sortedOut:
            subprocess.call(["sort", "-k", "1", self.work_dir + "/probes"], stdout=sortedOut)

        # getTargetMap sorts and loads the targets table itself
        tTrans = self.getTargetMap()

        # assign a column index to each distinct translated sample label
        tEnum = {}
        for t in tTrans:
            tlabel = self.translateUUID(tTrans[t])
            if tlabel is not None and tlabel not in tEnum:
                tEnum[tlabel] = len(tEnum)

        matrixPath = "%s/matrix_file" % (self.work_dir)
        matrixFile = None
        curName = None
        curData = {}
        rowCount = 0
        try:
            for key, value in TableReader(probePath):
                if matrixFile is None:
                    matrixFile = open(matrixPath, "w")
                    header = ["NA"] * len(tEnum)
                    for target in tEnum:
                        header[tEnum[target]] = target
                    matrixFile.write("%s\t%s\n" % ("#probe", "\t".join(header)))

                if curName != key:
                    if curName is not None:
                        rowCount += self._writeRow(matrixFile, curName, curData, tTrans, tEnum)
                    curName = key
                    curData = {}
                if "target" in value:
                    for probeField in self.probeFields:
                        if probeField in value:
                            curData[value["target"]] = value[probeField]
                elif "file" in value:
                    for probeField in self.probeFields:
                        if probeField in value:
                            curData[value["file"]] = value[probeField]

            # the final probe has no following key change to trigger its
            # write, so flush it here
            if curName is not None:
                rowCount += self._writeRow(matrixFile, curName, curData, tTrans, tEnum)
        finally:
            if matrixFile is not None:
                matrixFile.close()

        matrixName = self.config.name
        if rowCount > 0:
            self.emitFile("", self.getMeta(matrixName), matrixPath)
683
|
|
684
|
|
# XML namespace URI used to look up the "admin" element of clinical files.
adminNS = "http://tcga.nci/bcr/xml/administration/2.3"
686
|
|
687
|
|
class TCGAClinicalImport(FileImporter):
    """Importer that turns clinical XML files into tab-separated matrices.

    Each XML file is parsed into per-entity records (patient, sample,
    portion, analyte, aliquot, drug, radiation) that are emitted to ports
    of the same name; ``fileBuild`` then pivots every port into a matrix.
    """

    def fileScan(self, path):
        """Read the file at *path*, parse it as XML and extract its records."""
        handle = open(path)
        data = handle.read()
        handle.close()
        xml=parseString(data)
        self.parseXMLFile(xml)

    def getText(self, nodelist):
        """Return the concatenated data of the text nodes in *nodelist*.

        Note: the code in this class calls the module-level ``getText``
        function rather than this method.
        """
        rc = []
        for node in nodelist:
            if node.nodeType == node.TEXT_NODE:
                rc.append(node.data)
        return ''.join(rc)


    def parseXMLFile(self, dom):
        """Walk a parsed clinical document and emit one record per entity.

        Only elements whose ``procurement_status`` attribute is "Completed"
        contribute values (children of "auxiliary"-prefixed elements are
        taken regardless).  Records are keyed by the matching ``bcr_*_barcode``
        element, except drug and radiation records, which are keyed by the
        patient barcode.
        """
        # values of the administration block, added to the patient record
        admin = {}
        for node in dom.getElementsByTagNameNS( adminNS, "admin"):
            for cNode in node.childNodes:
                if cNode.nodeType == cNode.ELEMENT_NODE:
                    admin[ cNode.localName ] = {}
                    admin[ cNode.localName ]['value'] = getText( cNode.childNodes )

        # first pass: patient-level fields
        name = None
        patient = {}
        patientName = None
        for node in dom.childNodes[0].childNodes:
            if node.nodeType == node.ELEMENT_NODE:
                if node.localName == 'patient':
                    for elm in node.childNodes:
                        if elm.nodeType == elm.ELEMENT_NODE:
                            if ( elm.localName == 'bcr_patient_barcode' ):
                                name = getText( elm.childNodes )
                                patientName = name

                            if ( elm.getAttribute( 'procurement_status' ) == "Completed" ):
                                patient[ elm.localName ] = {}
                                patient[ elm.localName ]['value'] = getText( elm.childNodes )
                                patient[ elm.localName ]['tier'] = elm.getAttribute( 'tier' )
                                patient[ elm.localName ]['precision'] = elm.getAttribute( 'precision' )

                            # auxiliary blocks keep their values two levels down
                            if elm.prefix == "auxiliary":
                                for aux in elm.childNodes:
                                    if aux.nodeType == aux.ELEMENT_NODE:
                                        for auxval in aux.childNodes:
                                            if auxval.nodeType == auxval.ELEMENT_NODE:
                                                patient[ auxval.localName ] = {}
                                                patient[ auxval.localName ]['value'] = getText( auxval.childNodes )
                                                patient[ auxval.localName ]['tier'] = auxval.getAttribute( 'tier' )
                                                patient[ auxval.localName ]['precision'] = auxval.getAttribute( 'precision' )

        if name is not None:
            for key in admin:
                patient[ key ] = admin[ key ]
            self.emit( name, patient, "patient" )

        # second pass: samples (with nested portions/analytes/aliquots),
        # drugs and radiations found under the patient element
        for node in dom.childNodes[0].childNodes:
            if node.nodeType == node.ELEMENT_NODE and node.localName == 'patient':
                for samples in node.childNodes:
                    if samples.nodeType == samples.ELEMENT_NODE and samples.localName == 'samples':
                        for sample in samples.childNodes:
                            if sample.nodeType == samples.ELEMENT_NODE and sample.localName == 'sample':
                                sampleData = {}
                                for value in sample.childNodes:
                                    if value.nodeType == value.ELEMENT_NODE:
                                        if value.localName == 'bcr_sample_barcode' :
                                            name = getText( value.childNodes )
                                        if value.getAttribute( 'procurement_status' ) == "Completed" :
                                            sampleData[ value.localName ] = {}
                                            sampleData[ value.localName ]['value'] = getText( value.childNodes )

                                        if value.localName == 'portions' :
                                            for portions in value.childNodes:
                                                if portions.nodeType == value.ELEMENT_NODE and portions.localName == "portion":
                                                    portionName = None
                                                    portionData = {}
                                                    for portion in portions.childNodes:
                                                        if portion.nodeType == value.ELEMENT_NODE:
                                                            if portion.localName == "analytes":
                                                                for analytes in portion.childNodes:
                                                                    if analytes.nodeType == analytes.ELEMENT_NODE and analytes.localName =="analyte":
                                                                        analyteName = None
                                                                        analyteData = {}
                                                                        for analyte in analytes.childNodes:
                                                                            if analyte.nodeType == value.ELEMENT_NODE:
                                                                                if analyte.localName == "aliquots":
                                                                                    for aliquots in analyte.childNodes:
                                                                                        if aliquots.nodeType == aliquots.ELEMENT_NODE and aliquots.localName =="aliquot":
                                                                                            aliquotName = None
                                                                                            aliquotData = {}
                                                                                            for aliquot in aliquots.childNodes:
                                                                                                if aliquot.nodeType == value.ELEMENT_NODE:
                                                                                                    if aliquot.localName == "bcr_aliquot_barcode":
                                                                                                        aliquotName = getText(aliquot.childNodes)
                                                                                                    if aliquot.getAttribute( 'procurement_status' ) == "Completed" :
                                                                                                        aliquotData[ aliquot.localName ] = {}
                                                                                                        aliquotData[ aliquot.localName ]['value'] = getText( aliquot.childNodes )
                                                                                            if aliquotName is not None and len(aliquotData):
                                                                                                self.emit( aliquotName, aliquotData, 'aliquot' )


                                                                                if analyte.localName == "bcr_analyte_barcode":
                                                                                    analyteName = getText(analyte.childNodes)
                                                                                if analyte.getAttribute( 'procurement_status' ) == "Completed" :
                                                                                    analyteData[ analyte.localName ] = {}
                                                                                    analyteData[ analyte.localName ]['value'] = getText( analyte.childNodes )
                                                                        if analyteName is not None and len(analyteData):
                                                                            self.emit( analyteName, analyteData, 'analyte' )

                                                            if portion.localName == "bcr_portion_barcode":
                                                                portionName = getText( portion.childNodes )
                                                            if portion.getAttribute( 'procurement_status' ) == "Completed" :
                                                                portionData[ portion.localName ] = {}
                                                                portionData[ portion.localName ]['value'] = getText( portion.childNodes )
                                                    if portionName is not None and len(portionData):
                                                        self.emit( portionName, portionData, 'portion' )


                                #patientName = re.sub( r'\-...$', "", name )
                                # the sample row carries both its own fields and
                                # the patient's; fileBuild merges rows by key
                                self.emit( name, sampleData, "sample" )
                                self.emit( name, patient, "sample")
                    elif samples.nodeType == samples.ELEMENT_NODE and samples.localName == 'drugs':
                        for drug in samples.childNodes:
                            if drug.nodeType == samples.ELEMENT_NODE and drug.localName == 'drug':
                                drugData = {}
                                for value in drug.childNodes:
                                    if value.nodeType == value.ELEMENT_NODE:
                                        if value.localName == 'bcr_drug_barcode' :
                                            name = getText( value.childNodes )
                                        if value.getAttribute( 'procurement_status' ) == "Completed" :
                                            drugData[ value.localName ] = {}
                                            drugData[ value.localName ]['value'] = getText( value.childNodes )

                                #patientName = re.sub( r'\-...$', "", name )
                                self.emit( patientName, drugData, "drug" )
                    elif samples.nodeType == samples.ELEMENT_NODE and samples.localName == 'radiations':
                        for rad in samples.childNodes:
                            if rad.nodeType == samples.ELEMENT_NODE and rad.localName == 'radiation':
                                radData = {}
                                for value in rad.childNodes:
                                    if value.nodeType == value.ELEMENT_NODE:
                                        if value.localName == 'bcr_radiation_barcode' :
                                            name = getText( value.childNodes )
                                        if value.getAttribute( 'procurement_status' ) == "Completed" :
                                            radData[ value.localName ] = {}
                                            radData[ value.localName ]['value'] = getText( value.childNodes )

                                #patientName = re.sub( r'\-...$', "", name )
                                self.emit( patientName, radData, "radiation" )



    def getMeta(self, name):
        """Return the metadata dict written next to a clinical matrix.

        Values from ``self.ext_meta`` and then ``self.config.meta``
        override the defaults built here.
        """
        fileInfo = {
            "@context" : "http://purl.org/cgdata/",
            "@type" : "clinicalMatrix",
            "@id" : name,
            "lastModified" : self.config.version,
            'dataSubType' : { "@id" : "clinical" },
            "rowKeySrc" : {
                "@type" : "idDAG", "@id" : "tcga.%s" % (self.config.abbr)
            }

        }
        fileInfo.update(self.ext_meta)
        fileInfo.update(self.config.meta)
        return fileInfo

    def fileBuild(self):
        """Pivot every emitted record table into a matrix file and emit it.

        For each record type present in the work dir (or only
        ``config.clinical_type`` when set), rows sharing a key are merged,
        one column is created per field name, and the result is written as
        ``<type>_file``.  With ``config.sanitize`` set, the 'race' and
        'ethnicity' columns are left out.
        """

        matrixList = [ "patient", "sample", "radiation", "drug", "portion", "analyte", "aliquot" ]
        if self.config.clinical_type is not None:
            matrixList = [ self.config.clinical_type ]

        for matrixName in matrixList:
            if os.path.exists( "%s/%s" % (self.work_dir, matrixName)):
                subprocess.call("cat %s/%s | sort -k 1 > %s/%s.sort" % (self.work_dir, matrixName, self.work_dir, matrixName), shell=True)
                handle = TableReader(self.work_dir + "/" + matrixName + ".sort")
                matrix = {}
                # column name -> column index, in order of first appearance
                colEnum = {}
                for key, value in handle:
                    if key not in matrix:
                        matrix[key] = {}
                    for col in value:
                        matrix[key][col] = value[col]
                        if col not in colEnum:
                            if not self.config.sanitize or col not in [ 'race', 'ethnicity' ]:
                                colEnum[col] = len(colEnum)

                handle = open( os.path.join(self.work_dir, matrixName + "_file"), "w")
                cols = [None] * (len(colEnum))
                for col in colEnum:
                    cols[colEnum[col]] = col
                handle.write("sample\t%s\n" % ("\t".join(cols)))
                for key in matrix:
                    cols = [""] * (len(colEnum))
                    for col in colEnum:
                        if col in matrix[key]:
                            cols[colEnum[col]] = matrix[key][col]['value']
                    # non-ASCII characters are replaced when the row is encoded
                    handle.write("%s\t%s\n" % (key, "\t".join(cols).encode("ASCII", "replace")))
                handle.close()
                self.emitFile( "." + matrixName, self.getMeta(self.config.name + "." + matrixName), "%s/%s_file" % (self.work_dir, matrixName))
892
|
|
893
|
|
class AgilentImport(TCGAMatrixImport):
    """
    Matrix-import settings used by the 'AgilentG4502A_07*' entries of tcgaConfig.
    """
    dataType = 'genomicMatrix'
    dataSubType = 'geneExp'
    sampleMap = 'tcga.iddag'
    probeMap = 'hugo'
    # Value column(s) to extract -- presumably matched against the data
    # column headers by the base class; confirm in TCGAMatrixImport.
    probeFields = ['log2 lowess normalized (cy5/cy3) collapsed by gene symbol']
|
|
900
|
|
901
|
|
class CGH1x1mImport(TCGASegmentImport):
    """
    Segment-import settings used by the 'CGH-1x1M_G4447A' entry of tcgaConfig.
    """
    dataType = 'genomicSegment'
    dataSubType = 'cna'
    sampleMap = 'tcga.iddag'
    probeFields = ['seg.mean']
|
|
907
|
|
class SNP6Import(TCGASegmentImport):
    """
    Segment importer used by the 'Genome_Wide_SNP_6' entry of tcgaConfig.

    Only files ending in ".hg19.seg.txt" are scanned; their rows are emitted
    to the "hg19_segment" stream and later written out as five-column
    (chrom, start, end, sample, value) lines.
    """
    assembly = 'hg19'
    dataSubType = 'cna'
    sampleMap ='tcga.iddag'
    dataType = 'genomicSegment'
    probeFields = ['seg.mean']

    def fileScan(self, path):
        """
        Emit every data row of an hg19 segment file.

        The first line is the header; column names found in commonMap are
        replaced by their mapped name.  Each following row is emitted under
        its first column, with the remaining columns as a dict, to the
        "hg19_segment" stream.  Files with any other suffix are ignored
        (hg18 handling was already disabled in this importer).
        """
        if not path.endswith(".hg19.seg.txt"):
            return
        outport = "hg19_segment"

        with open(path) as handle:
            colName = None
            for line in handle:
                fields = line.rstrip().split("\t")
                if colName is None:
                    colName = [commonMap.get(col, col) for col in fields]
                    continue
                if fields == [""]:
                    # Blank line: nothing to emit.
                    continue
                out = {}
                for i in range(1, len(colName)):
                    # rstrip() drops trailing empty cells, so a row may be
                    # shorter than the header; treat the lost cells as empty.
                    if i < len(fields):
                        out[ colName[i] ] = fields[i]
                    else:
                        out[ colName[i] ] = ""
                self.emit( fields[0], out, outport )

    def fileBuild(self):
        """
        Write the sorted hg19 segment stream as a segment file and emit it.

        Each row key is translated through the target map and
        self.translateUUID; rows whose translated name is None are dropped.
        Rows whose key is missing from the target map (or which lack one of
        the expected columns) are reported through self.addError.  If the
        stream holds no rows at all, an error is recorded and nothing is
        emitted.
        """
        tmap = self.getTargetMap()

        startField = "loc.start"
        endField = "loc.end"
        valField = "seg.mean"
        chromeField = "chrom"

        for base in ['hg19']:
            rawPath = "%s/%s_segment" % (self.work_dir, base)
            sortedPath = rawPath + ".sort"
            outPath = rawPath + ".out"

            # No shell: the work directory may contain spaces or metacharacters.
            with open(sortedPath, "w") as sortedHandle:
                subprocess.call(["sort", "-k", "1", rawPath], stdout=sortedHandle)

            segFile = None
            try:
                for key, value in TableReader(sortedPath):
                    if segFile is None:
                        segFile = open(outPath, "w")
                    try:
                        curName = self.translateUUID(tmap[key])
                        if curName is not None:
                            chrom = value[ chromeField ].lower()
                            if not chrom.startswith("chr"):
                                chrom = "chr" + chrom
                            # Lower-case "chr" prefix, upper-case remainder (e.g. chrX).
                            chrom = chrom.upper().replace("CHR", "chr")
                            segFile.write( "%s\t%s\t%s\t%s\t%s\n" % ( chrom, value[ startField ], value[ endField ], curName, value[ valField ] ) )
                    except KeyError:
                        self.addError( "TargetInfo Not Found: %s" % (key))
            finally:
                if segFile is not None:
                    segFile.close()

            if segFile is None:
                # Previously this fell through to None.close() and crashed.
                self.addError( "No segment data found: %s" % (sortedPath))
                continue

            self.emitFile("." + base, self.getMeta(self.config.name + "." + base), outPath)
|
|
976
|
|
977
|
|
class HmiRNAImport(TCGAMatrixImport):
    """
    Matrix-import settings used by the 'H-miRNA_8x15K' and 'H-miRNA_8x15Kv2'
    entries of tcgaConfig.
    """
    dataType = 'genomicMatrix'
    dataSubType = 'miRNAExp'
    sampleMap = 'tcga.iddag'
    probeMap = 'agilentHumanMiRNA'
    probeFields = ['unc_DWD_Batch_adjusted']
|
|
984
|
|
class CGH244AImport(TCGASegmentImport):
    """
    Segment-import settings used by the 'HG-CGH-244A' entry of tcgaConfig.
    """
    dataType = 'genomicSegment'
    dataSubType = 'cna'
    sampleMap = 'tcga.iddag'
    probeFields = ['Segment_Mean']
|
|
990
|
|
class CGH415K_G4124A(TCGASegmentImport):
    """
    Segment-import settings used by the 'HG-CGH-415K_G4124A' entry of tcgaConfig.
    """
    dataType = 'genomicSegment'
    dataSubType = 'cna'
    sampleMap = 'tcga.iddag'
    # Input column names -- presumably read by TCGASegmentImport when it
    # builds the segment file; confirm against the base class.
    chromeField = 'Chromosome'
    startField = 'Start'
    endField = 'End'
    probeFields = ['Segment_Mean']
|
|
999
|
|
1000
|
|
class IlluminaHiSeq_DNASeqC(TCGASegmentImport):
    """
    Segment-import settings used by the 'IlluminaHiSeq_DNASeqC' entry of
    tcgaConfig.  Samples whose barcode marks them as normal are censored.
    """
    dataSubType = 'cna'
    sampleMap = 'tcga.iddag'
    chromeField = 'Chromosome'
    dataType = 'genomicSegment'
    endField = 'End'
    probeFields = ['Segment_Mean']
    startField = 'Start'

    def translateUUID(self, uuid):
        """
        Translate uuid through self.config.translateUUID.

        Returns None when the translated name matches 'TCGA-..-....-1'
        (sample-type digit 1, i.e. a normal sample), or when the config
        translation itself produced None; otherwise the translated name.
        """
        out = self.config.translateUUID(uuid)
        if out is None:
            # Callers already treat None as "skip this sample"; without this
            # guard re.search() would raise TypeError.
            return None
        #censor out normal ids
        if re.search(r'^TCGA-..-....-1', out):
            return None
        return out
|
|
1016
|
|
class HT_HGU133A(TCGAMatrixImport):
    """
    Matrix-import settings used by the 'HT_HG-U133A' entry of tcgaConfig.
    """
    dataType = 'genomicMatrix'
    dataSubType = 'geneExp'
    sampleMap = 'tcga.iddag'
    probeMap = 'affyU133a'
    probeFields = ['Signal']
|
|
1023
|
|
class HuEx1_0stv2(TCGAMatrixImport):
    """
    Matrix-import settings used by the 'HuEx-1_0-st-v2' entry of tcgaConfig.
    """
    dataType = 'genomicMatrix'
    # NOTE(review): 'miRNAExp' combined with the 'hugo' probe map and the
    # gene-level file filter below looks like a copy/paste slip for
    # 'geneExp' -- confirm before changing, since it ends up in metadata.
    dataSubType = 'miRNAExp'
    sampleMap = 'tcga.iddag'
    probeMap = 'hugo'
    probeFields = ['Signal']
    # Pattern naming the files to import ("...gene.txt" and "...sdrf.txt").
    fileInclude = '^.*gene.txt$|^.*sdrf.txt$'
|
|
1031
|
|
class Human1MDuoImport(TCGASegmentImport):
    """
    Segment-import settings used by the 'Human1MDuo' entry of tcgaConfig.
    """
    dataType = 'genomicSegment'
    dataSubType = 'cna'
    sampleMap = 'tcga.iddag'
    probeFields = ['mean']
|
|
1037
|
|
class HumanHap550(TCGASegmentImport):
    """
    Segment-import settings used by the 'HumanHap550' entry of tcgaConfig.
    """
    dataType = 'genomicSegment'
    dataSubType = 'cna'
    sampleMap = 'tcga.iddag'
    probeFields = ['mean']
|
|
1043
|
|
class HumanMethylation27(TCGAMatrixImport):
    """
    Matrix-import settings used by the 'HumanMethylation27' entry of tcgaConfig.
    """
    dataType = 'genomicMatrix'
    dataSubType = 'DNAMethylation'
    sampleMap = 'tcga.iddag'
    probeMap = 'illuminaMethyl27K_gpl8490'
    # Both capitalisations of the beta-value column are listed.
    probeFields = ['Beta_Value', 'Beta_value']
    # Pattern naming files to leave out ("...adf.txt").
    fileExclude = '.*.adf.txt'
|
|
1051
|
|
1052
|
|
class HumanMethylation450(TCGAMatrixImport):
    """
    Matrix importer used by the 'HumanMethylation450' entry of tcgaConfig.
    """
    dataSubType = 'DNAMethylation'
    probeMap = 'illuminaHumanMethylation450'
    sampleMap = 'tcga.iddag'
    dataType = 'genomicMatrix'
    fileExclude = '.*.adf.txt'
    probeFields = ['Beta_value', 'Beta_Value']

    def fileScan(self, path):
        """
        Scan one TCGA data file (given by path) and emit its probe values.

        Only files with a two-line header are processed: the first line must
        start with "Hybridization REF" or "Sample REF" and names the target of
        each column, the second line names the field held by each column.
        Names found in commonMap are replaced by their mapped value.

        For every data row, one record per target is emitted on the 'probes'
        stream, keyed by the row's first column.  Each record carries the
        target name plus, for every column whose field is in self.probeFields,
        the value formatted as "%.4f" -- or "NA" when the cell is missing or
        not numeric.  Files with any other header are read and ignored.
        """
        mode = None
        #modes
        #1 - two col header matrix file
        colName = None
        colType = None
        # `with` closes the input file; the handle used to be leaked.
        with open(path) as iHandle:
            for line in iHandle:
                fields = line.rstrip().split("\t")
                if colName is None:
                    # The mode is decided on the raw header, before mapping.
                    if fields[0] == "Hybridization REF" or fields[0] == "Sample REF":
                        mode = 1
                    colName = [commonMap.get(name, name) for name in fields]
                elif mode == 1 and colType is None:
                    colType = [commonMap.get(name, name) for name in fields]
                elif mode == 1:
                    out = {}
                    for col in colName[1:]:
                        out[ col ] = { "target" : col }
                    for i in range(1, len(colType)):
                        try:
                            if colType[i] in self.probeFields:
                                out[ colName[i] ][ colType[i] ] = "%.4f" % float(fields[i])
                        except (IndexError, ValueError):
                            # Missing cell (short row) or non-numeric value.
                            out[ colName[i] ][ colType[i] ] = "NA"
                    for col in out:
                        self.emit( fields[0], out[col], "probes" )
|
|
1104
|
|
class Illumina_RNASeq(TCGAMatrixImport):
    """
    Matrix-import settings used by the 'IlluminaGA_RNASeq' entry of tcgaConfig.
    """
    dataSubType = 'geneExp'
    sampleMap = 'tcga.iddag'
    probeMap = 'hugo.unc'
    probeFields = ['RPKM']
    # Pattern naming the files to import: gene quantification and SDRF files.
    fileInclude = r'^.*\.gene.quantification.txt$|^.*sdrf.txt$'
|
|
1111
|
|
class Illumina_RNASeqV2(TCGAMatrixImport):
    """
    Matrix-import settings used by the 'IlluminaHiSeq_RNASeqV2' entry of tcgaConfig.
    """
    dataSubType = 'geneExp'
    sampleMap = 'tcga.iddag'
    probeMap = 'hugo.unc'
    probeFields = ['normalized_count']
    # Pattern naming the files to import: RSEM normalized gene results and SDRF files.
    fileInclude = r'^.*rsem.genes.normalized_results$|^.*sdrf.txt$'
|
|
1118
|
|
class IlluminaHiSeq_RNASeq(TCGAMatrixImport):
    """
    Matrix-import settings used by the 'IlluminaHiSeq_RNASeq' entry of tcgaConfig.
    """
    dataSubType = 'geneExp'
    sampleMap = 'tcga.iddag'
    probeMap = 'hugo.unc'
    probeFields = ['RPKM']
    # NOTE(review): unlike Illumina_RNASeq this pattern does not include
    # the sdrf.txt files -- confirm that is intentional.
    fileInclude = r'^.*gene.quantification.txt$'
|
|
1125
|
|
class MDA_RPPA_Core(TCGAMatrixImport):
    """
    Matrix importer used by the 'MDA_RPPA_Core' entry of tcgaConfig.
    """
    sampleMap = 'tcga.iddag'
    probeMap = "md_anderson_antibodies"
    dataSubType = "RPPA"
    fileExclude = r'^.*.antibody_annotation.txt'
    probeFields = [ 'Protein Expression' ]

    def getTargetMap(self):
        """
        Return the target map read from "<work_dir>/targets".

        The file is sorted into "targets.sort" first; every value then has
        each occurrence of ".SD" removed before being stored under its key.
        """
        rawPath = "%s/targets" % (self.work_dir)
        sortedPath = self.work_dir + "/targets.sort"
        # Invoke sort directly (no shell) so a work directory containing
        # spaces or shell metacharacters cannot break the command.
        with open(sortedPath, "w") as sortedHandle:
            subprocess.call(["sort", "-k", "1", rawPath], stdout=sortedHandle)

        tTrans = {}
        for key, value in TableReader(sortedPath):
            tTrans[ key ] = re.sub(r'\.SD', '', value)
        return tTrans
|
|
1141
|
|
1142
|
|
class Illumina_miRNASeq(TCGAMatrixImport):
    """
    Matrix-import settings used by the 'IlluminaGA_miRNASeq' and
    'IlluminaHiSeq_miRNASeq' entries of tcgaConfig.
    """
    # NOTE(review): HmiRNAImport labels its data 'miRNAExp'; confirm that the
    # different label here is deliberate.
    dataSubType = 'miRNA'
    sampleMap = 'tcga.iddag'
    probeMap = 'hsa.mirna'
    probeFields = ['reads_per_million_miRNA_mapped']
    # Pattern naming the files to import: miRNA quantification files.
    fileInclude = '^.*.mirna.quantification.txt$'
|
|
1149
|
|
1150
|
|
class bioImport(TCGAClinicalImport):
    """
    Clinical-import settings used by the 'bio' entry of tcgaConfig.
    """
    # Pattern naming the files to import: XML files.
    fileInclude = '.*.xml$'
    sampleMap = 'tcga.iddag'
|
|
1154
|
|
# Maps a TCGA platform name to the importer class that handles it.
# Used for membership tests ("is this platform supported?") and to pick
# the class to instantiate for an archive's platform.
tcgaConfig = {
    'AgilentG4502A_07':       AgilentImport,
    'AgilentG4502A_07_1':     AgilentImport,
    'AgilentG4502A_07_2':     AgilentImport,
    'AgilentG4502A_07_3':     AgilentImport,
    'CGH-1x1M_G4447A':        CGH1x1mImport,
    'Genome_Wide_SNP_6':      SNP6Import,
    'H-miRNA_8x15K':          HmiRNAImport,
    'H-miRNA_8x15Kv2':        HmiRNAImport,
    'HG-CGH-244A':            CGH244AImport,
    'HG-CGH-415K_G4124A':     CGH415K_G4124A,
    'HT_HG-U133A':            HT_HGU133A,
    'HuEx-1_0-st-v2':         HuEx1_0stv2,
    'Human1MDuo':             Human1MDuoImport,
    'HumanHap550':            HumanHap550,
    'IlluminaHiSeq_DNASeqC':  IlluminaHiSeq_DNASeqC,
    'HumanMethylation27':     HumanMethylation27,
    'HumanMethylation450':    HumanMethylation450,
    'IlluminaHiSeq_RNASeq':   IlluminaHiSeq_RNASeq,
    'IlluminaGA_RNASeq':      Illumina_RNASeq,
    'IlluminaHiSeq_RNASeqV2': Illumina_RNASeqV2,
    'MDA_RPPA_Core':          MDA_RPPA_Core,
    'IlluminaGA_miRNASeq':    Illumina_miRNASeq,
    'IlluminaHiSeq_miRNASeq': Illumina_miRNASeq,
    'bio':                    bioImport
}
|
|
1181
|
|
def fileDigest( file ):
    """
    Return the hexadecimal MD5 digest of the file at path `file`.

    The file is read in binary mode in 8192-byte chunks, so large archives
    are never loaded into memory at once.
    """
    md5 = hashlib.md5()
    with open(file,'rb') as f:
        # The sentinel must be a bytes literal: a binary read returns bytes,
        # and on Python 3 the end-of-file value b'' never equals '' (the loop
        # would never terminate).  On Python 2 b'' and '' are the same value.
        for chunk in iter(lambda: f.read(8192), b''):
            md5.update(chunk)
    return md5.hexdigest()
|
|
1188
|
|
1189
|
|
def platform_list():
    """
    Yield the 'name' of every entry returned by the "Platform" query.
    """
    for platform in CustomQuery("Platform"):
        yield platform['name']
|
|
1194
|
|
def supported_list():
    """
    Yield the names from the "Platform" query that have an importer
    registered in tcgaConfig.
    """
    for platform in CustomQuery("Platform"):
        platformName = platform['name']
        if platformName not in tcgaConfig:
            continue
        yield platformName
|
|
1200
|
|
def platform_archives(platform):
    """
    Yield each distinct 'baseName' among the latest archives of `platform`,
    in the order the query returns them.
    """
    seen = {}
    for archive in CustomQuery("Archive[Platform[@name=%s]][@isLatest=1]" % platform):
        baseName = archive['baseName']
        if baseName in seen:
            continue
        yield baseName
        seen[baseName] = True
|
|
1209
|
|
1210
|
|
def _print_unique_basenames(query):
    """Print the 'baseName' of every archive matched by `query`, once each, in result order."""
    seen = set()
    for entry in CustomQuery(query):
        name = entry['baseName']
        if name not in seen:
            print(name)
            seen.add(name)


def _archive_locations(basename, level):
    """
    Return the 'deployLocation' of the latest Level_<level> archives of
    `basename`, followed by those of its latest mage-tab archives.
    """
    locations = []
    for query in (
        "Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=Level_%s]]" % (basename, level),
        "Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=mage-tab]]" % (basename),
    ):
        for entry in CustomQuery(query):
            locations.append( entry['deployLocation'] )
    return locations


def _with_md5(locations):
    """Return `locations` with each entry followed by its ".md5" companion."""
    out = []
    for location in locations:
        out.append( location )
        out.append( location + ".md5" )
    return out


def _require_mirror(options):
    """Exit with status 1 (message on stderr) unless a mirror location was given."""
    if options.mirror is None:
        sys.stderr.write("Need mirror location\n")
        sys.exit(1)


def _run_import(conf, options):
    """
    Apply the command-line options to `conf` and run the importer registered
    for its platform; exit with status 1 if the platform is not supported.
    """
    conf.addOptions(options)
    if conf.platform not in tcgaConfig:
        sys.stderr.write("Platform %s not supported\n" % (conf.platform))
        sys.exit(1)
    ext = tcgaConfig[conf.platform](conf)
    ext.run()


if __name__ == "__main__":

    parser = ArgumentParser()
    #Stack.addJobTreeOptions(parser)

    parser.add_argument("-a", "--platform-list", dest="platform_list", action="store_true", help="Get list of platforms", default=False)
    parser.add_argument("-u", "--uuid", dest="uuid_table", help="UUID to Barcode Table", default=None)
    parser.add_argument("-t", "--uuid-download", dest="uuid_download", help="Download UUID/Barcode Table", default=False)
    parser.add_argument("-z", "--all-archives", dest="all_archives", action="store_true", help="List all archives", default=False)
    parser.add_argument("-p", "--platform", dest="platform", help="Platform Selection", default=None)
    parser.add_argument("-l", "--supported", dest="supported_list", action="store_true", help="List Supported Platforms", default=None)
    parser.add_argument("-f", "--filelist", dest="filelist", help="List files needed to convert TCGA project basename into cgData", default=None)
    parser.add_argument("-b", "--basename", dest="basename", help="Convert TCGA project basename into cgData", default=None)
    parser.add_argument("-m", "--mirror", dest="mirror", help="Mirror Location", default=None)
    parser.add_argument("-w", "--workdir", dest="workdir_base", help="Working directory", default="/tmp")
    # The help text used to repeat "Working directory" from --workdir.
    parser.add_argument("--out-dir", dest="outdir", help="Output directory", default="./")
    parser.add_argument("-o", "--out", dest="outpath", help="Output Dest", default=None)
    parser.add_argument("--out-error", dest="errorpath", help="Output Error", default=None)
    parser.add_argument("--out-meta", dest="metapath", help="Output Meta", default=None)
    parser.add_argument("-c", "--cancer", dest="cancer", help="List Archives by cancer type", default=None)
    parser.add_argument("-d", "--download", dest="download", help="Download files for archive", default=None)
    parser.add_argument("-e", "--level", dest="level", help="Data Level ", default="3")
    parser.add_argument("-s", "--check-sum", dest="checksum", help="Check project md5", default=None)
    parser.add_argument("-r", "--sanitize", dest="sanitize", action="store_true", help="Remove race/ethnicity from clinical data", default=False)
    parser.add_argument("-x", "--clinical", dest="clinical", help="Process clinical info", default=None)
    parser.add_argument("--clinical-basename", dest="clinical_basename", help="Select Clinical Data by basename", default=None)
    parser.add_argument("--clinical-type", dest="clinical_type", help="Clinical Data Type", default=None)
    parser.add_argument("--all-clinical", dest="all_clinical", action="store_true", help="List all clinical archives", default=False)
    parser.add_argument("--out-clinical", dest="out_clinical", action="append", nargs=3, default=[])
    parser.add_argument("--samples", dest="get_samples", action="store_true", default=False)

    options = parser.parse_args()

    # --uuid-download FILE: save the UUID/barcode table to FILE.
    if options.uuid_download:
        url="https://tcga-data.nci.nih.gov/uuid/uuidBrowserExport.htm"
        data = {}
        data['exportType'] = 'tab'
        data['cols'] = "uuid,barcode"
        urllib.urlretrieve( url, options.uuid_download, data=urllib.urlencode(data))

    if options.platform_list:
        for e in platform_list():
            print(e)

    if options.supported_list:
        for e in supported_list():
            print(e)

    if options.platform:
        for name in platform_archives( options.platform ):
            print(name)

    if options.all_archives:
        _print_unique_basenames("Archive[@isLatest=1][ArchiveType[@type=Level_%s]]" % (options.level))

    if options.all_clinical:
        _print_unique_basenames("Archive[@isLatest=1][Platform[@alias=bio]]")

    # --samples: list aliquots with submitted level-three data as tumor/normal.
    if options.get_samples:
        url="https://tcga-data.nci.nih.gov/datareports/aliquotExport.htm"
        data = {}

        data['exportType'] = 'tab'
        data['cols'] = 'aliquotId,disease,bcrBatch,center,platform,levelOne,levelTwo,levelThree'
        data['filterReq'] = json.dumps({"disease":"","levelOne":"","aliquotId":"","center":"","levelTwo":"","bcrBatch":"","platform":"","levelThree":""})
        data['formFilter'] = json.dumps({"disease":"","levelOne":"","aliquotId":"","center":"","levelTwo":"","bcrBatch":"","platform":"","levelThree":""})
        handle = urllib.urlopen( url + "?" + urllib.urlencode(data))
        try:
            for line in handle:
                tmp = line.rstrip().split("\t")
                # rstrip() drops trailing empty cells, so a row can be short;
                # skip rows that cannot hold a barcode and a level-three status.
                if len(tmp) < 8 or len(tmp[0]) < 14:
                    continue
                if tmp[7] == "Submitted":
                    # Character 13 of the barcode: '0' is reported as Tumor,
                    # '1' as Normal; anything else is not listed.
                    if tmp[0][13]=='0':
                        print("\t".join( [ tmp[0], tmp[1], "Tumor", tmp[4] ] ))
                    elif tmp[0][13] == '1':
                        print("\t".join( [ tmp[0], tmp[1], "Normal", tmp[4] ] ))
        finally:
            handle.close()

    if options.cancer is not None:
        _print_unique_basenames("Archive[@isLatest=1][Disease[@abbreviation=%s]][ArchiveType[@type=Level_%s]]" % (options.cancer, options.level))

    if options.filelist:
        for location in _archive_locations(options.filelist, options.level):
            print(location)

    # --check-sum BASENAME: compare each mirrored archive against its .md5 file.
    if options.checksum:
        # Without a mirror os.path.join(None, ...) used to raise.
        _require_mirror(options)

        for url in _archive_locations(options.checksum, options.level):
            dst = os.path.join(options.mirror, re.sub("^/", "", url))
            if not os.path.exists( dst ):
                print("NOT_FOUND: %s" % (dst))
                continue
            if not os.path.exists( dst + ".md5" ):
                print("MD5_NOT_FOUND %s" % (dst))
                continue

            with open( dst + ".md5" ) as handle:
                omd5 = handle.readline().split(' ')[0]

            nmd5 = fileDigest( dst )
            if omd5 != nmd5:
                print("CORRUPT: %s" % (dst))
            else:
                print("OK: %s" % (dst))

    # --download: fetch archives (and their .md5 files) into the mirror.
    if options.download is not None:
        if options.mirror is None:
            print("Define mirror location")
            sys.exit(1)

        urls = []

        # The --download value is used as the basename only when no other
        # selector (--basename, --clinical, --clinical-basename) is given.
        if options.basename is None and options.clinical is None and options.clinical_basename is None:
            urls.extend( _with_md5( _archive_locations(options.download, options.level) ) )

        if options.basename:
            urls.extend( _with_md5( _archive_locations(options.basename, options.level) ) )

        if options.clinical:
            q = CustomQuery("Archive[@isLatest=1][Disease[@abbreviation=%s]][Platform[@alias=bio]]" % (options.clinical))
            urls.extend( _with_md5( [ e['deployLocation'] for e in q ] ) )

        if options.clinical_basename:
            q = CustomQuery("Archive[@isLatest=1][@baseName=%s]" % (options.clinical_basename))
            urls.extend( _with_md5( [ e['deployLocation'] for e in q ] ) )

        for url in urls:
            src = "https://tcga-data.nci.nih.gov/" + url
            dst = os.path.join(options.mirror, re.sub("^/", "", url))
            dstDir = os.path.dirname(dst)
            if not os.path.exists(dstDir):
                print("mkdir %s" % (dstDir))
                os.makedirs(dstDir)
            # Files already present in the mirror are not fetched again.
            if not os.path.exists( dst ):
                print("download %s to %s" % (src, dst))
                urllib.urlretrieve(src, dst)

    # --basename: convert one archive from the mirror.
    if options.basename:
        _require_mirror(options)
        conf = getBaseBuildConf(options.basename, options.level, options.mirror)
        _run_import(conf, options)

    # --clinical DISEASE: convert every clinical ("bio") archive of the disease.
    if options.clinical:
        _require_mirror(options)

        q = CustomQuery("Archive[@isLatest=1][Disease[@abbreviation=%s]][Platform[@alias=bio]]" % (options.clinical))
        basenames = []
        for s in q:
            # Keep each basename once, in query order, so runs are repeatable.
            if s['baseName'] not in basenames:
                basenames.append( s['baseName'] )

        for base in basenames:
            conf = getBaseBuildConf(base, 1, options.mirror)
            _run_import(conf, options)

    # --clinical-basename: convert one clinical archive selected by basename.
    if options.clinical_basename:
        _require_mirror(options)
        conf = getBaseBuildConf(options.clinical_basename, 1, options.mirror)
        _run_import(conf, options)
|
|
1444
|