comparison tcga_import/tcgaImport.py @ 0:f1c71f5363ae draft default tip

Uploaded
author kellrott
date Tue, 30 Oct 2012 14:23:49 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:f1c71f5363ae
1 #!/usr/bin/env python
2
3
4 """
5 Script to scan and extract TCGA data and compile it into the cgData
6
7 Usage::
8
9 tcga2cgdata.py [options]
10
11 Options::
12
13 -h, --help show this help message and exit
14 -a, --platform-list Get list of platforms
15 -p PLATFORM, --platform=PLATFORM
16 Platform Selection
17 -l, --supported List Supported Platforms
18 -f FILELIST, --filelist=FILELIST
19 List files needed to convert TCGA project basename
20 into cgData
21 -b BASENAME, --basename=BASENAME
22 Convert TCGA project basename into cgData
23 -m MIRROR, --mirror=MIRROR
24 Mirror Location
25 -w WORKDIR_BASE, --workdir=WORKDIR_BASE
26 Working directory
27 -o OUTDIR, --out-dir=OUTDIR
28 Output directory
29 -c CANCER, --cancer=CANCER
30 List Archives by cancer type
31 -d DOWNLOAD, --download=DOWNLOAD
32 Download files for archive
33 -e LEVEL, --level=LEVEL
34 Data Level
35 -s CHECKSUM, --check-sum=CHECKSUM
36 Check project md5
37 -r, --sanitize Remove race/ethnicity from clinical data
38
39
40 Example::
41
42 ./scripts/tcga2cgdata.py -b intgen.org_KIRC_bio -m /inside/depot -e 1 -r -w tmp
43
44
45 """
46
47 from xml.dom.minidom import parseString
48 import urllib
49 import urllib2
50 import os
51 import csv
52 import sys
53 import hashlib
54 import tempfile
55 import re
56 import copy
57 import json
58 import datetime
59 import hashlib
60 import subprocess
61 from glob import glob
62 import shutil
63 import subprocess
64 from argparse import ArgumentParser
65
66
67
68
69 """
70
71 Net query code
72
73 """
74
class dccwsItem(object):
    """Iterable over the records returned by a TCGA DCC web-service query.

    ``self.url`` holds the address of the first result page (set by a
    subclass).  Iterating fetches that page, yields one dict per ``class``
    element of the response, then follows the ``next`` link until the
    service stops advertising one.
    """

    baseURL = "http://tcga-data.nci.nih.gov/tcgadccws/GetXML?query="

    def __init__(self):
        self.url = None

    def __iter__(self):
        """Yield one ``{field name: value}`` dict per record, page by page.

        A field that carries an ``xlink:href`` attribute is reported as that
        link; every other field is reported as its text content.
        """
        pageURL = self.url
        while pageURL is not None:
            handle = urllib.urlopen(pageURL)
            try:
                data = handle.read()
            finally:
                # Release the connection even when the read fails.
                handle.close()
            dom = parseString(data)

            # there might not be any archives for a dataset
            responses = dom.getElementsByTagName('queryResponse')
            if responses:
                for cls in responses[-1].getElementsByTagName('class'):
                    outData = {}
                    for node in cls.childNodes:
                        # Whitespace text and comment children have no
                        # attributes; only element children are fields.
                        if node.nodeType != node.ELEMENT_NODE:
                            continue
                        nodeName = node.getAttribute("name")
                        if node.hasAttribute("xlink:href"):
                            outData[nodeName] = node.getAttribute("xlink:href")
                        else:
                            outData[nodeName] = getText(node.childNodes)
                    yield outData

            nextLinks = dom.getElementsByTagName('next')
            if nextLinks:
                pageURL = nextLinks[-1].getAttribute('xlink:href')
            else:
                pageURL = None
108
109
class CustomQuery(dccwsItem):
    """Query given either as a complete URL or as a bare query expression.

    A bare expression is appended to ``dccwsItem.baseURL``; a string that
    already starts with an HTTP(S) scheme is used verbatim.
    """

    def __init__(self, query):
        super(CustomQuery, self).__init__()
        # Accept https links as well, so a secure link is never glued onto
        # baseURL as if it were a query expression.
        if query.startswith(("http://", "https://")):
            self.url = query
        else:
            self.url = dccwsItem.baseURL + query
117
118
def getText(nodelist):
    """Return the concatenated data of the text nodes found in *nodelist*.

    Only direct members of *nodelist* are inspected; nodes of any other type
    (elements, comments, ...) are skipped.
    """
    return ''.join(
        node.data for node in nodelist if node.nodeType == node.TEXT_NODE
    )
125
126 """
127
128 Build Configuration
129
130 """
131
class BuildConf:
    """Settings for one archive conversion: what to build and where to put it.

    Attributes set by the constructor:
        platform, name, version, meta, tarlist -- stored as given
        abbr       -- ``meta['diseaseAbbr']`` when present, else ''
        uuid_table -- None until addOptions() loads a translation table
    """

    def __init__(self, platform, name, version, meta, tarlist):
        self.platform = platform
        self.name = name
        self.version = version
        self.meta = meta
        self.tarlist = tarlist
        self.abbr = ''
        self.uuid_table = None
        if 'diseaseAbbr' in meta:
            self.abbr = meta['diseaseAbbr']

    def addOptions(self, opts):
        """Copy the output-related settings from the parsed options *opts*.

        ``opts.out_clinical`` is iterated as (type, path, meta path) triples
        and indexed under ``"." + type``.  When ``opts.uuid_table`` is not
        None it names a tab-separated file whose first two columns are loaded
        as a key -> replacement table.
        """
        self.workdir_base = opts.workdir_base
        self.outdir = opts.outdir
        self.sanitize = opts.sanitize
        self.outpath = opts.outpath
        self.metapath = opts.metapath
        self.errorpath = opts.errorpath
        self.clinical_type = opts.clinical_type

        self.clinical_type_map = {}
        for t, path, meta in opts.out_clinical:
            self.clinical_type_map["." + t] = (path, meta)

        if opts.uuid_table is not None:
            self.uuid_table = {}
            with open(opts.uuid_table) as handle:
                for line in handle:
                    tmp = line.rstrip().split("\t")
                    # Blank or single-column lines carry no mapping.
                    if len(tmp) < 2:
                        continue
                    self.uuid_table[tmp[0]] = tmp[1]

    def translateUUID(self, uuid):
        """Return the table entry for *uuid*, or *uuid* itself if unmapped."""
        if self.uuid_table is None or uuid not in self.uuid_table:
            return uuid
        return self.uuid_table[uuid]

    def getOutPath(self, name):
        """Return the data output path for the suffix *name*.

        Precedence: explicit ``outpath``, then a per-clinical-type path, then
        ``<outdir>/<archive name><name>``.
        """
        if self.outpath is not None:
            return self.outpath
        if name in self.clinical_type_map:
            return self.clinical_type_map[name][0]
        return os.path.join(self.outdir, self.name) + name

    def getOutMeta(self, name):
        """Return the metadata (JSON) output path for the suffix *name*."""
        if self.outpath is not None:
            if self.metapath is not None:
                return self.metapath
            return self.outpath + ".json"
        if name in self.clinical_type_map:
            return self.clinical_type_map[name][1]
        return os.path.join(self.outdir, self.name) + name + ".json"

    def getOutError(self, name):
        """Return the error-log output path for the suffix *name*."""
        if self.outpath is not None:
            if self.errorpath is not None:
                return self.errorpath
            return self.outpath + ".error"
        return os.path.join(self.outdir, self.name) + name + ".error"
191
192
def getBaseBuildConf(basename, level, mirror):
    """Query the DCC for the latest archives of *basename*; build a BuildConf.

    Two queries are issued: the ``Level_<level>`` archives and the mage-tab
    archives.  Platform, disease and center details are taken from the first
    data archive returned.  The version is the most recent ``addedDate`` seen.

    Returns None (after printing a notice) when neither query returns an
    archive; otherwise a BuildConf whose tarlist maps
    ``mirror + deployLocation`` to a platform name.
    """
    dates = []
    urls = {}
    meta = None
    platform = None

    print("TCGA Query for:  %s" % (basename,))
    q = CustomQuery("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=Level_%s]]" % (basename, level))
    for e in q:
        dates.append(datetime.datetime.strptime(e['addedDate'], "%m-%d-%Y"))
        if meta is None:
            # Descriptive metadata is looked up once, from the first archive.
            meta = {"sourceUrl": []}
            for e2 in CustomQuery(e['platform']):
                platform = e2['name']
                meta['platform'] = e2['name']
                meta['platformTitle'] = e2['displayName']
            for e2 in CustomQuery(e['disease']):
                meta['diseaseAbbr'] = e2['abbreviation']
                meta['diseaseTitle'] = e2['name']
                for e3 in CustomQuery(e2['tissueCollection']):
                    meta['tissue'] = e3['name']
            for e2 in CustomQuery(e['center']):
                meta['centerTitle'] = e2['displayName']
                meta['center'] = e2['name']
        meta['sourceUrl'].append("http://tcga-data.nci.nih.gov/" + e['deployLocation'])
        urls[mirror + e['deployLocation']] = platform

    print("TCGA Query for mage-tab:  %s" % (basename,))
    q = CustomQuery("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=mage-tab]]" % (basename))
    for e in q:
        dates.append(datetime.datetime.strptime(e['addedDate'], "%m-%d-%Y"))
        platform = None
        for e2 in CustomQuery(e['platform']):
            print(e2)
            platform = e2['name']
        # No data archive may have been found above; without this guard the
        # append below fails on None.
        if meta is None:
            meta = {"sourceUrl": []}
        meta['sourceUrl'].append("http://tcga-data.nci.nih.gov/" + e['deployLocation'])
        urls[mirror + e['deployLocation']] = platform

    if not dates:
        print("No Files found")
        return None
    versionDate = max(dates).strftime("%Y-%m-%d")

    return BuildConf(platform, basename, versionDate, meta, urls)
239
240
241
242
243
class TableReader:
    """Iterate over a ``key<TAB>json`` file as (key, decoded value) pairs.

    Iterating yields nothing when *path* is None or does not exist.
    """

    def __init__(self, path):
        self.path = path

    def __iter__(self):
        if self.path is None or not os.path.exists(self.path):
            return
        # The with-block closes the file even when the consumer stops early
        # or a line fails to decode.
        with open(self.path) as handle:
            for line in handle:
                tmp = line.rstrip().split("\t")
                yield tmp[0], json.loads(tmp[1])
255
256
class FileImporter:
    """Base class: unpack archives, scan their files, then build the output.

    Subclasses provide ``mageScan(path)``, ``fileScan(path)`` and
    ``fileBuild()``.  They may set ``fileInclude`` / ``fileExclude`` to regular
    expressions matched against the start of each file's base name.
    """

    fileInclude = None
    fileExclude = None

    # Patterns searched for in a file's base name; a hit means "never scan".
    excludes = [
        "MANIFEST.txt$",
        "CHANGES_DCC.txt$",
        "README_DCC.txt$",
        "README.txt$",
        "CHANGES.txt$",
        "DCC_ALTERED_FILES.txt$",
        r'.wig$',
        "DESCRIPTIO$"
    ]

    def __init__(self, config):
        self.config = config

    def extractTars(self):
        """Unpack every archive of the configuration into a new temp dir."""
        self.work_dir = tempfile.mkdtemp(dir=self.config.workdir_base)
        print("Extract to  %s" % (self.work_dir,))
        for path in self.config.tarlist:
            subprocess.check_call(["tar", "xvzf", path, "-C", self.work_dir], stderr=sys.stdout)

    def run(self):
        """Extract, scan every file, close the intermediate tables, build."""
        self.extractTars()

        filterInclude = None
        filterExclude = None
        if self.fileInclude is not None:
            filterInclude = re.compile(self.fileInclude)
        if self.fileExclude is not None:
            filterExclude = re.compile(self.fileExclude)
        self.inc = 0
        self.out = {}
        self.errors = []
        self.ext_meta = {}
        self.scandirs(self.work_dir, filterInclude, filterExclude)
        # Flush the intermediate tables before fileBuild() reads them back.
        for port in self.out:
            self.out[port].close()
        self.fileBuild()
        #shutil.rmtree(self.work_dir)

    def checkExclude(self, name):
        """Return True when *name* matches one of the ``excludes`` patterns."""
        for pattern in self.excludes:
            if re.search(pattern, name):
                return True
        return False

    def scandirs(self, path, filterInclude=None, filterExclude=None):
        """Walk *path* recursively and dispatch each file to its scanner.

        MAGE-TAB files go to mageScan(); any other file goes to fileScan()
        unless it is excluded or rejected by the include/exclude filters.
        """
        if os.path.isdir(path):
            for entry in glob(os.path.join(path, "*")):
                self.scandirs(entry, filterInclude, filterExclude)
            return

        name = os.path.basename(path)
        if self.isMage(path):
            self.mageScan(path)
            return
        if self.checkExclude(name):
            return
        included = filterInclude is None or filterInclude.match(name)
        excluded = filterExclude is not None and filterExclude.match(name)
        if included and not excluded:
            self.fileScan(path)

    def isMage(self, path):
        """Return True for SDRF, IDF and DESCRIPTION.txt files."""
        return path.endswith(('.sdrf.txt', '.idf.txt', 'DESCRIPTION.txt'))

    def emit(self, key, data, port):
        """Append ``key<TAB>json(data)`` to the table *port* in the work dir.

        The table file is opened on first use and stays open until run()
        closes it.
        """
        if port not in self.out:
            self.out[port] = open(self.work_dir + "/" + port, "w")
        self.out[port].write("%s\t%s\n" % (key, json.dumps(data)))

    def emitFile(self, name, meta, file):
        """Publish *file* as the output for suffix *name*.

        Copies the file to the configured output path, stores its MD5 digest
        in ``meta['md5']``, writes *meta* as JSON to the metadata path and,
        if any errors were recorded, writes them one per line to the error
        path.
        """
        md5 = hashlib.md5()
        with open(file, 'rb') as src:
            with open(self.config.getOutPath(name), "wb") as dst:
                # The sentinel must be bytes: a str '' never equals the b''
                # returned at end of file on Python 3.
                for chunk in iter(lambda: src.read(8192), b''):
                    md5.update(chunk)
                    dst.write(chunk)
        meta['md5'] = md5.hexdigest()
        with open(self.config.getOutMeta(name), "w") as mHandle:
            mHandle.write(json.dumps(meta))
        if len(self.errors):
            with open(self.config.getOutError(name), "w") as eHandle:
                for msg in self.errors:
                    eHandle.write(msg + "\n")

    def addError(self, msg):
        """Record *msg* for the error file written by emitFile()."""
        self.errors.append(msg)
351
352
# Column titles that the file scanners rewrite to a common name.
commonMap = {
    "mean": "seg.mean",
    "Segment_Mean": "seg.mean",
    "Start": "loc.start",
    "End": "loc.end",
    "Chromosome": "chrom",
}

# IDF row labels (first column) and the metadata key each one is stored under.
idfMap = {
    "Investigation Title": "title",
    "Experiment Description": "experimentalDescription",
    "Person Affiliation": "dataProducer",
    "Date of Experiment": "experimentalDate",
}
368
class TCGAGeneticImport(FileImporter):
    """Shared scanning logic for genomic (matrix and segment) archives.

    Subclasses supply ``probeFields`` and ``fileBuild()``.
    """

    # SDRF columns whose value is registered as an alias of the row's
    # "Extract Name".  The flag asks for the text before the first '.' to be
    # registered as an additional alias.
    sdrfTargetColumns = (
        ("Derived Array Data File", True),
        ("Derived Array Data Matrix File", False),
        ("Derived Data File", True),
        ("Hybridization Name", False),
        ("Sample Name", False),
    )

    # SDRF rows with one of these "Material Type" values are not registered.
    skippedMaterialTypes = ("genomic_DNA", "total_RNA", "MDA cell line")

    def mageScan(self, path):
        """Harvest target aliases (SDRF) and descriptive metadata (IDF, DESCRIPTION)."""
        if path.endswith(".sdrf.txt"):
            self._scanSDRF(path)
        if path.endswith(".idf.txt"):
            self._scanIDF(path)
        if path.endswith("DESCRIPTION.txt"):
            with open(path) as handle:
                self.ext_meta['description'] = handle.read()

    def _scanSDRF(self, path):
        """Emit one "targets" record per alias found in each SDRF row."""
        with open(path, "rU") as iHandle:
            colNum = None
            for row in csv.reader(iHandle, delimiter="\t"):
                if colNum is None:
                    colNum = dict((title, i) for i, title in enumerate(row))
                    continue
                try:
                    # Kept inside the guard: blank rows must be skipped here
                    # as well, not raise.
                    if "Material Type" in colNum and row[colNum["Material Type"]] in self.skippedMaterialTypes:
                        continue
                    extract = row[colNum["Extract Name"]]
                    for column, withStem in self.sdrfTargetColumns:
                        if column in colNum:
                            alias = row[colNum[column]]
                            if withStem:
                                self.emit(alias.split('.')[0], extract, "targets")
                            self.emit(alias, extract, "targets")
                    self.emit(extract, extract, "targets")
                except IndexError:
                    pass  # there can be blank lines in the SDRF

    def _scanIDF(self, path):
        """Copy the IDF rows listed in idfMap into the extra metadata."""
        with open(path) as iHandle:
            for line in iHandle:
                # Drop the line terminator so that a two-column row does not
                # store its value with a trailing newline.
                row = line.rstrip("\r\n").split("\t")
                if len(row) > 1 and row[0] in idfMap:
                    self.ext_meta[idfMap[row[0]]] = row[1]

    def translateUUID(self, uuid):
        """Map *uuid* through the configuration's translation table."""
        return self.config.translateUUID(uuid)

    def getTargetMap(self):
        """Sort the "targets" table and load it as an alias -> target dict."""
        # Argument list instead of a shell string: paths containing spaces or
        # shell metacharacters are passed through untouched.
        subprocess.call(["sort", "-k", "1",
                         "-o", "%s/targets.sort" % (self.work_dir),
                         "%s/targets" % (self.work_dir)])
        tTrans = {}
        for key, value in TableReader(self.work_dir + "/targets.sort"):
            tTrans[key] = value
        return tTrans

    def fileScan(self, path):
        """
        Read one tab-separated data file and emit its rows.

        The first line decides how the rest is interpreted:

        * mode 2 - first column "Hybridization REF" or "Sample REF": matrix
          file with two header lines (column names, then column types).
          Emits to "probes", one record per column name, keyed by the row's
          first field.
        * mode 1 - first column "Chromosome"/"chromosome": segment file for a
          single sample.  Emits to "segments", keyed by the file name up to
          its first '.'.
        * mode 3 - second column "chrom": segment file with the sample in the
          first column.  Emits to "segments", keyed by the first field.
        * otherwise - every row is emitted to "probes", keyed by its first
          field.
        """
        mode = None
        target = None
        colName = None
        colType = None
        fileName = os.path.basename(path)
        with open(path) as iHandle:
            for line in iHandle:
                tmp = line.rstrip().split("\t")
                if colName is None:
                    if tmp[0] in ("Hybridization REF", "Sample REF"):
                        mode = 2
                    elif tmp[0] in ("Chromosome", "chromosome"):
                        mode = 1
                        # seg files are named by the filename before the '.' extension
                        target = fileName.split('.')[0]
                    elif len(tmp) > 1 and tmp[1] == "chrom":
                        mode = 3
                    colName = [commonMap.get(col, col) for col in tmp]
                elif mode == 2 and colType is None:
                    colType = [commonMap.get(col, col) for col in tmp]
                elif mode == 2:
                    out = {}
                    for col in colName[1:]:
                        out[col] = {"target": col}
                    for i in range(1, len(colType)):
                        try:
                            if colType[i] in self.probeFields:
                                out[colName[i]][colType[i]] = tmp[i]
                        except IndexError:
                            # Row shorter than the header: missing value.
                            out[colName[i]][colType[i]] = "NA"
                    for col in out:
                        self.emit(tmp[0], out[col], "probes")
                else:
                    out = {}
                    for i, col in enumerate(colName):
                        out[col] = tmp[i]
                    out['file'] = fileName
                    if mode == 1:
                        self.emit(target, out, "segments")
                    elif mode == 3:
                        self.emit(tmp[0], out, "segments")
                    else:
                        self.emit(tmp[0], out, "probes")
486
487
488
489
class TCGASegmentImport(TCGAGeneticImport):
    """Importer that turns segment files into a single five-column table.

    Subclasses supply ``dataSubType``.
    """

    def fileScan(self, path):
        """
        Read one segment file and emit each data row to the "segments" table.

        * mode 1 - first column "Chromosome"/"chromosome": one sample per
          file; rows are keyed by the file name up to its first '.'.
        * mode 2 - second column "chrom": rows are keyed by their first field.

        Files matching neither layout are read but produce no records.
        """
        mode = None
        target = None
        colName = None
        fileName = os.path.basename(path)
        with open(path) as iHandle:
            for line in iHandle:
                tmp = line.rstrip().split("\t")
                if colName is None:
                    if tmp[0] in ("Chromosome", "chromosome"):
                        mode = 1
                        # seg files are named by the filename before the '.' extension
                        target = fileName.split('.')[0]
                    elif len(tmp) > 1 and tmp[1] == "chrom":
                        mode = 2
                    colName = [commonMap.get(col, col) for col in tmp]
                    continue
                out = {}
                for i, col in enumerate(colName):
                    out[col] = tmp[i]
                out['file'] = fileName
                if mode == 1:
                    self.emit(target, out, "segments")
                elif mode == 2:
                    self.emit(tmp[0], out, "segments")

    def getMeta(self, name):
        """Return the metadata dict published next to the segment table.

        Defaults are overridden first by the MAGE-TAB metadata, then by the
        archive metadata from the configuration.
        """
        matrixInfo = {
            '@context': "http://purl.org/cgdata/",
            '@type': 'bed5',
            '@id': name,
            "lastModified": self.config.version,
            'rowKeySrc': {
                '@type': 'idDAG',
                '@id': "tcga.%s" % (self.config.abbr)
            },
            'dataSubType': {"@id": self.dataSubType},
            'dataProducer': 'TCGA Import',
            "accessMap": "public", "redistribution": "yes"
        }
        matrixInfo.update(self.ext_meta)
        matrixInfo.update(self.config.meta)
        return matrixInfo

    def fileBuild(self):
        """Write chrom/start/end/sample/value rows and publish them.

        Segment keys are resolved through the target map and translateUUID();
        unresolved keys and rows lacking a required field are recorded as
        errors.  Nothing is published when the "segments" table is empty.
        """
        tTrans = self.getTargetMap()
        subprocess.call(["sort", "-k", "1",
                         "-o", "%s/segments.sort" % (self.work_dir),
                         "%s/segments" % (self.work_dir)])
        sHandle = TableReader(self.work_dir + "/segments.sort")

        startField = "loc.start"
        endField = "loc.end"
        valField = "seg.mean"
        chromeField = "chrom"

        segPath = "%s/segment_file" % (self.work_dir)
        segFile = None
        try:
            for key, value in sHandle:
                if segFile is None:
                    segFile = open(segPath, "w")
                try:
                    target = tTrans[key]
                except KeyError:
                    self.addError("TargetInfo Not Found: %s" % (key))
                    continue
                curName = self.translateUUID(target)
                if curName is None:
                    continue
                try:
                    chrom = value[chromeField].lower()
                    if not chrom.startswith("chr"):
                        chrom = "chr" + chrom
                    chrom = chrom.upper().replace("CHR", "chr")
                    segFile.write("%s\t%s\t%s\t%s\t%s\n" % (chrom, value[startField], value[endField], curName, value[valField]))
                except KeyError:
                    self.addError("Field error: %s" % (str(value)))
        finally:
            if segFile is not None:
                segFile.close()

        if segFile is None:
            # No segment rows were read, so there is no file to publish.
            return

        matrixName = self.config.name
        self.emitFile("", self.getMeta(matrixName), segPath)
594
595
596
class TCGAMatrixImport(TCGAGeneticImport):
    """Importer that assembles a probe-by-sample matrix.

    Subclasses supply ``dataSubType``, ``probeMap`` and ``probeFields``.
    """

    def getMeta(self, name):
        """Return the metadata dict published next to the matrix.

        Defaults are overridden first by the MAGE-TAB metadata, then by the
        archive metadata from the configuration.
        """
        matrixInfo = {
            "@context": 'http://purl.org/cgdata/',
            '@type': 'genomicMatrix',
            '@id': name,
            "lastModified": self.config.version,
            'dataSubType': {"@id": self.dataSubType},
            'dataProducer': 'TCGA',
            "accessMap": "public",
            "redistribution": "yes",
            'rowKeySrc': {
                "@type": "probe", "@id": self.probeMap
            },
            'columnKeySrc': {
                "@type": "idDAG", "@id": "tcga.%s" % (self.config.abbr)
            }
        }
        matrixInfo.update(self.ext_meta)
        matrixInfo.update(self.config.meta)
        return matrixInfo

    def _writeProbeRow(self, matrixFile, probe, curData, tTrans, tEnum):
        """Write one matrix row; return True if it held at least one value."""
        out = ["NA"] * len(tEnum)
        for target in curData:
            try:
                ttarget = self.translateUUID(tTrans[target])
                if ttarget is not None:
                    out[tEnum[ttarget]] = str(curData[target])
            except KeyError:
                self.addError("TargetInfo Not Found: %s" % (target))
        if out.count("NA") == len(tEnum):
            # Nothing resolved for this probe: leave the row out.
            return False
        matrixFile.write("%s\t%s\n" % (probe, "\t".join(out)))
        return True

    def fileBuild(self):
        """Pivot the sorted "probes" table into a matrix and publish it.

        Columns are the distinct translated targets; each row collects the
        values of one probe.  The matrix is published only when at least one
        row was written.
        """
        subprocess.call(["sort", "-k", "1",
                         "-o", "%s/probes.sort" % (self.work_dir),
                         "%s/probes" % (self.work_dir)])
        pHandle = TableReader(self.work_dir + "/probes.sort")

        # use the target table to create a name translation table
        # (getTargetMap sorts and loads it), and enumerate the target names so
        # that each gets a column number
        tTrans = self.getTargetMap()
        tEnum = {}
        for t in tTrans:
            tlabel = self.translateUUID(tTrans[t])
            if tlabel is not None and tlabel not in tEnum:
                tEnum[tlabel] = len(tEnum)

        matrixPath = "%s/matrix_file" % (self.work_dir)
        matrixFile = None
        curName = None
        curData = {}
        rowCount = 0
        try:
            for key, value in pHandle:
                if matrixFile is None:
                    matrixFile = open(matrixPath, "w")
                    header = ["NA"] * len(tEnum)
                    for target in tEnum:
                        header[tEnum[target]] = target
                    matrixFile.write("%s\t%s\n" % ("#probe", "\t".join(header)))

                if curName != key:
                    if curName is not None:
                        if self._writeProbeRow(matrixFile, curName, curData, tTrans, tEnum):
                            rowCount += 1
                    curName = key
                    curData = {}

                if "target" in value:
                    column = value["target"]
                elif "file" in value:
                    column = value["file"]
                else:
                    continue
                for probeField in self.probeFields:
                    if probeField in value:
                        curData[column] = value[probeField]

            # The rows above are only written when the key changes, so the
            # last probe still has to be flushed here.
            if curName is not None:
                if self._writeProbeRow(matrixFile, curName, curData, tTrans, tEnum):
                    rowCount += 1
        finally:
            if matrixFile is not None:
                matrixFile.close()

        matrixName = self.config.name
        if rowCount > 0:
            self.emitFile("", self.getMeta(matrixName), matrixPath)
683
684
# XML namespace in which the clinical parser looks up "admin" elements.
adminNS = "http://tcga.nci/bcr/xml/administration/2.3"
686
687
class TCGAClinicalImport(FileImporter):
    """Importer for clinical XML files.

    Each XML file is flattened into per-entity tables (patient, sample,
    portion, analyte, aliquot, drug, radiation); fileBuild() then turns every
    table into a tab-separated matrix with one row per key.
    """

    # (record tag, barcode tag, tag of the container holding the next level)
    # from the outermost to the innermost level below a sample's "portions".
    specimenLevels = (
        ("portion", "bcr_portion_barcode", "analytes"),
        ("analyte", "bcr_analyte_barcode", "aliquots"),
        ("aliquot", "bcr_aliquot_barcode", None),
    )

    def fileScan(self, path):
        """Parse the XML file at *path* and emit its records."""
        with open(path) as handle:
            data = handle.read()
        self.parseXMLFile(parseString(data))

    def getText(self, nodelist):
        """Return the concatenated text-node data of *nodelist*."""
        return getText(nodelist)

    @staticmethod
    def _elements(node):
        """Return the element children of *node* (no text or comment nodes)."""
        return [child for child in node.childNodes
                if child.nodeType == child.ELEMENT_NODE]

    @staticmethod
    def _isCompleted(elm):
        """True when *elm* is marked ``procurement_status="Completed"``."""
        return elm.getAttribute('procurement_status') == "Completed"

    def _emitSpecimens(self, container, level=0):
        """Emit the records of *container* and, recursively, of nested levels.

        A record is emitted to the table named after its tag when it has a
        barcode and at least one completed field.
        """
        tag, barcodeTag, nestedTag = self.specimenLevels[level]
        for record in self._elements(container):
            if record.localName != tag:
                continue
            barcode = None
            data = {}
            for field in self._elements(record):
                if nestedTag is not None and field.localName == nestedTag:
                    self._emitSpecimens(field, level + 1)
                if field.localName == barcodeTag:
                    barcode = getText(field.childNodes)
                if self._isCompleted(field):
                    data[field.localName] = {'value': getText(field.childNodes)}
            if barcode is not None and len(data):
                self.emit(barcode, data, tag)

    def _emitTreatments(self, container, tag, patientName):
        """Emit each *tag* record of *container* to table *tag*, keyed by patient."""
        for record in self._elements(container):
            if record.localName != tag:
                continue
            data = {}
            for field in self._elements(record):
                if self._isCompleted(field):
                    data[field.localName] = {'value': getText(field.childNodes)}
            self.emit(patientName, data, tag)

    def _emitSample(self, sample, name, patient):
        """Emit one sample (plus nested specimens); return the key used.

        *name* is the key to fall back on when the sample has no
        ``bcr_sample_barcode`` of its own.  The patient record is emitted
        under the same key so that sample rows also carry patient columns.
        """
        sampleData = {}
        for value in self._elements(sample):
            if value.localName == 'bcr_sample_barcode':
                name = getText(value.childNodes)
            if self._isCompleted(value):
                sampleData[value.localName] = {'value': getText(value.childNodes)}
            if value.localName == 'portions':
                self._emitSpecimens(value)
        self.emit(name, sampleData, "sample")
        self.emit(name, patient, "sample")
        return name

    def parseXMLFile(self, dom):
        """Flatten one clinical document into the intermediate tables."""
        admin = {}
        for node in dom.getElementsByTagNameNS(adminNS, "admin"):
            for cNode in self._elements(node):
                admin[cNode.localName] = {'value': getText(cNode.childNodes)}

        # documentElement rather than childNodes[0]: a leading comment or
        # processing instruction must not be mistaken for the root element.
        patientNodes = [node for node in self._elements(dom.documentElement)
                        if node.localName == 'patient']

        name = None
        patient = {}
        patientName = None
        for node in patientNodes:
            for elm in self._elements(node):
                if elm.localName == 'bcr_patient_barcode':
                    name = getText(elm.childNodes)
                    patientName = name

                if self._isCompleted(elm):
                    patient[elm.localName] = {
                        'value': getText(elm.childNodes),
                        'tier': elm.getAttribute('tier'),
                        'precision': elm.getAttribute('precision'),
                    }

                if elm.prefix == "auxiliary":
                    # Auxiliary values are taken regardless of their status.
                    for aux in self._elements(elm):
                        for auxval in self._elements(aux):
                            patient[auxval.localName] = {
                                'value': getText(auxval.childNodes),
                                'tier': auxval.getAttribute('tier'),
                                'precision': auxval.getAttribute('precision'),
                            }

        if name is not None:
            for key in admin:
                patient[key] = admin[key]
            self.emit(name, patient, "patient")

        for node in patientNodes:
            for group in self._elements(node):
                if group.localName == 'samples':
                    for sample in self._elements(group):
                        if sample.localName == 'sample':
                            name = self._emitSample(sample, name, patient)
                elif group.localName == 'drugs':
                    self._emitTreatments(group, 'drug', patientName)
                elif group.localName == 'radiations':
                    self._emitTreatments(group, 'radiation', patientName)

    def getMeta(self, name):
        """Return the metadata dict published next to a clinical matrix.

        Defaults are overridden first by the extra metadata collected during
        scanning, then by the archive metadata from the configuration.
        """
        fileInfo = {
            "@context": "http://purl.org/cgdata/",
            "@type": "clinicalMatrix",
            "@id": name,
            "lastModified": self.config.version,
            'dataSubType': {"@id": "clinical"},
            "rowKeySrc": {
                "@type": "idDAG", "@id": "tcga.%s" % (self.config.abbr)
            }
        }
        fileInfo.update(self.ext_meta)
        fileInfo.update(self.config.meta)
        return fileInfo

    def fileBuild(self):
        """Build and publish one matrix per table that was produced.

        Records sharing a key are merged into one row.  With the ``sanitize``
        option the 'race' and 'ethnicity' columns are left out.  When
        ``clinical_type`` is configured only that table is built.
        """
        matrixList = ["patient", "sample", "radiation", "drug", "portion", "analyte", "aliquot"]
        if self.config.clinical_type is not None:
            matrixList = [self.config.clinical_type]

        for matrixName in matrixList:
            tablePath = "%s/%s" % (self.work_dir, matrixName)
            if not os.path.exists(tablePath):
                continue
            # Argument list instead of a shell pipeline: paths with spaces or
            # shell metacharacters are passed through untouched.
            subprocess.call(["sort", "-k", "1", "-o", tablePath + ".sort", tablePath])

            matrix = {}
            colEnum = {}
            for key, value in TableReader(self.work_dir + "/" + matrixName + ".sort"):
                if key not in matrix:
                    matrix[key] = {}
                for col in value:
                    matrix[key][col] = value[col]
                    if col not in colEnum:
                        if not self.config.sanitize or col not in ['race', 'ethnicity']:
                            colEnum[col] = len(colEnum)

            with open(os.path.join(self.work_dir, matrixName + "_file"), "w") as handle:
                cols = [None] * (len(colEnum))
                for col in colEnum:
                    cols[colEnum[col]] = col
                handle.write("sample\t%s\n" % ("\t".join(cols)))
                for key in matrix:
                    cols = [""] * (len(colEnum))
                    for col in colEnum:
                        if col in matrix[key]:
                            cols[colEnum[col]] = matrix[key][col]['value']
                    handle.write("%s\t%s\n" % (key, "\t".join(cols).encode("ASCII", "replace")))
            self.emitFile("." + matrixName, self.getMeta(self.config.name + "." + matrixName), "%s/%s_file" % (self.work_dir, matrixName))
892
893
class AgilentImport(TCGAMatrixImport):
    """Matrix importer settings: 'geneExp' data with the 'hugo' probe map."""

    dataType = 'genomicMatrix'
    dataSubType = 'geneExp'
    sampleMap = 'tcga.iddag'
    probeMap = 'hugo'
    # Column type whose values are copied into the matrix.
    probeFields = ['log2 lowess normalized (cy5/cy3) collapsed by gene symbol']
900
901
class CGH1x1mImport(TCGASegmentImport):
    """Segment importer settings: 'cna' data."""

    dataType = 'genomicSegment'
    dataSubType = 'cna'
    sampleMap = 'tcga.iddag'
    probeFields = ['seg.mean']
907
class SNP6Import(TCGASegmentImport):
    """Segment importer that only reads ``*.hg19.seg.txt`` files."""

    assembly = 'hg19'
    dataSubType = 'cna'
    sampleMap = 'tcga.iddag'
    dataType = 'genomicSegment'
    probeFields = ['seg.mean']

    def fileScan(self, path):
        """Emit the rows of an hg19 segment file to the "hg19_segment" table.

        Rows are keyed by their first field; the remaining columns are stored
        under their (commonMap-normalised) header names.  Other files are
        ignored.
        """
        outport = None
        #if path.endswith(".hg18.seg.txt"):
        #    outport = "hg18_segment"
        if path.endswith(".hg19.seg.txt"):
            outport = "hg19_segment"
        if outport is None:
            return

        colName = None
        with open(path) as handle:
            for line in handle:
                tmp = line.rstrip().split("\t")
                if colName is None:
                    colName = [commonMap.get(col, col) for col in tmp]
                    continue
                out = {}
                for i in range(1, len(colName)):
                    out[colName[i]] = tmp[i]
                self.emit(tmp[0], out, outport)

    def fileBuild(self):
        """Write and publish one chrom/start/end/sample/value file per assembly.

        Unresolved keys and rows lacking a required field are recorded as
        errors.  An assembly whose table is empty publishes nothing.
        """
        tmap = self.getTargetMap()

        startField = "loc.start"
        endField = "loc.end"
        valField = "seg.mean"
        chromeField = "chrom"

        for base in ['hg19']:
            tablePath = "%s/%s_segment" % (self.work_dir, base)
            subprocess.call(["sort", "-k", "1", "-o", tablePath + ".sort", tablePath])
            sHandle = TableReader(self.work_dir + "/%s_segment.sort" % (base))

            outPath = "%s/%s_segment.out" % (self.work_dir, base)
            segFile = None
            try:
                for key, value in sHandle:
                    if segFile is None:
                        segFile = open(outPath, "w")
                    try:
                        target = tmap[key]
                    except KeyError:
                        self.addError("TargetInfo Not Found: %s" % (key))
                        continue
                    curName = self.translateUUID(target)
                    if curName is None:
                        continue
                    try:
                        chrom = value[chromeField].lower()
                        if not chrom.startswith("chr"):
                            chrom = "chr" + chrom
                        chrom = chrom.upper().replace("CHR", "chr")
                        segFile.write("%s\t%s\t%s\t%s\t%s\n" % (chrom, value[startField], value[endField], curName, value[valField]))
                    except KeyError:
                        # A missing column is a field problem, not a missing
                        # target; report it as such.
                        self.addError("Field error: %s" % (str(value)))
            finally:
                if segFile is not None:
                    segFile.close()

            if segFile is None:
                # No rows for this assembly, so there is no file to publish.
                continue

            self.emitFile("." + base, self.getMeta(self.config.name + "." + base), outPath)
976
977
class HmiRNAImport(TCGAMatrixImport):
    """Matrix importer settings: 'miRNAExp' data with the 'agilentHumanMiRNA' probe map."""

    dataType = 'genomicMatrix'
    dataSubType = 'miRNAExp'
    sampleMap = 'tcga.iddag'
    probeMap = 'agilentHumanMiRNA'
    # Column type whose values are copied into the matrix.
    probeFields = ['unc_DWD_Batch_adjusted']
984
class CGH244AImport(TCGASegmentImport):
    """Segment import settings registered for the HG-CGH-244A platform."""

    dataType = 'genomicSegment'
    dataSubType = 'cna'
    sampleMap = 'tcga.iddag'
    probeFields = ['Segment_Mean']
990
class CGH415K_G4124A(TCGASegmentImport):
    """Segment import settings registered for the HG-CGH-415K_G4124A platform."""

    dataType = 'genomicSegment'
    dataSubType = 'cna'
    sampleMap = 'tcga.iddag'
    probeFields = ['Segment_Mean']

    # Column names for the segment coordinates.
    # NOTE(review): where the base class reads these is not visible here - confirm.
    chromeField = 'Chromosome'
    startField = 'Start'
    endField = 'End'
999
1000
class IlluminaHiSeq_DNASeqC(TCGASegmentImport):
    """Segment import settings registered for the IlluminaHiSeq_DNASeqC platform."""

    dataType = 'genomicSegment'
    dataSubType = 'cna'
    sampleMap = 'tcga.iddag'
    probeFields = ['Segment_Mean']

    # Column names for the segment coordinates.
    # NOTE(review): where the base class reads these is not visible here - confirm.
    chromeField = 'Chromosome'
    startField = 'Start'
    endField = 'End'

    def translateUUID(self, uuid):
        """
        Translate ``uuid`` through the build configuration.

        Returns ``None`` instead of the translated id when it matches the
        normal-sample barcode pattern, so that normals are censored out.
        """
        barcode = self.config.translateUUID(uuid)
        isNormal = re.search(r'^TCGA-..-....-1', barcode)
        return None if isNormal else barcode
1016
class HT_HGU133A(TCGAMatrixImport):
    """Matrix import settings registered for the HT_HG-U133A platform."""

    dataType = 'genomicMatrix'
    dataSubType = 'geneExp'
    sampleMap = 'tcga.iddag'
    probeMap = 'affyU133a'
    probeFields = ['Signal']
1023
class HuEx1_0stv2(TCGAMatrixImport):
    """Matrix import settings registered for the HuEx-1_0-st-v2 platform."""

    dataType = 'genomicMatrix'
    # NOTE(review): 'miRNAExp' looks surprising next to a 'hugo' probe map and
    # '*gene.txt' input files - confirm this is not a copy/paste of another class.
    dataSubType = 'miRNAExp'
    sampleMap = 'tcga.iddag'
    probeMap = 'hugo'
    probeFields = ['Signal']
    # Only file names matching this pattern are listed for inclusion.
    fileInclude = '^.*gene.txt$|^.*sdrf.txt$'
1031
class Human1MDuoImport(TCGASegmentImport):
    """Segment import settings registered for the Human1MDuo platform."""

    dataType = 'genomicSegment'
    dataSubType = 'cna'
    sampleMap = 'tcga.iddag'
    probeFields = ['mean']
1037
class HumanHap550(TCGASegmentImport):
    """Segment import settings registered for the HumanHap550 platform."""

    dataType = 'genomicSegment'
    dataSubType = 'cna'
    sampleMap = 'tcga.iddag'
    probeFields = ['mean']
1043
class HumanMethylation27(TCGAMatrixImport):
    """Matrix import settings registered for the HumanMethylation27 platform."""

    dataType = 'genomicMatrix'
    dataSubType = 'DNAMethylation'
    sampleMap = 'tcga.iddag'
    probeMap = 'illuminaMethyl27K_gpl8490'
    # Both capitalisations of the beta value column are accepted.
    probeFields = ['Beta_Value', 'Beta_value']
    # File names matching this pattern are listed for exclusion.
    fileExclude = '.*.adf.txt'
1051
1052
class HumanMethylation450(TCGAMatrixImport):
    """Matrix import settings and file scanner registered for the HumanMethylation450 platform."""

    dataSubType = 'DNAMethylation'
    probeMap = 'illuminaHumanMethylation450'
    sampleMap = 'tcga.iddag'
    dataType = 'genomicMatrix'
    fileExclude = '.*.adf.txt'
    # Both capitalisations of the beta value column are accepted.
    probeFields = ['Beta_value', 'Beta_Value']

    def fileScan(self, path):
        """
        Scan one tab separated data file and emit its probe values.

        The first line holds the column names.  When its first cell is
        "Hybridization REF" or "Sample REF" the file has a two row header and
        the second line holds the column types; any other file produces no
        output.  Names and types found in ``commonMap`` are replaced by their
        mapped value.

        For every data row one dictionary per distinct column name is built,
        holding ``"target"`` (the column name) plus the value of each column
        whose type is listed in ``self.probeFields``.  Values are formatted
        with four decimals, or "NA" when the cell is missing or not numeric.
        Each dictionary is emitted on the "probes" port keyed by the first
        cell of the row.
        """
        colName = None
        colType = None
        twoRowHeader = False

        # 'with' guarantees the input file is closed, also on errors.
        with open(path) as iHandle:
            for line in iHandle:
                fields = line.rstrip().split("\t")

                if colName is None:
                    # The header test is done on the raw, unmapped first cell.
                    twoRowHeader = fields[0] in ("Hybridization REF", "Sample REF")
                    colName = [commonMap[name] if name in commonMap else name for name in fields]
                elif twoRowHeader and colType is None:
                    colType = [commonMap[name] if name in commonMap else name for name in fields]
                elif twoRowHeader:
                    out = {}
                    for col in colName[1:]:
                        out[col] = {"target": col}
                    for i in range(1, len(colType)):
                        try:
                            if colType[i] in self.probeFields:
                                out[colName[i]][colType[i]] = "%.4f" % float(fields[i])
                        except (IndexError, ValueError):
                            # Short row or non numeric cell.
                            out[colName[i]][colType[i]] = "NA"
                    for col in out:
                        self.emit(fields[0], out[col], "probes")
1104
class Illumina_RNASeq(TCGAMatrixImport):
    """Matrix import settings registered for the IlluminaGA_RNASeq platform."""

    dataSubType = 'geneExp'
    sampleMap = 'tcga.iddag'
    probeMap = 'hugo.unc'
    probeFields = ['RPKM']
    # Only file names matching this pattern are listed for inclusion.
    fileInclude = r'^.*\.gene.quantification.txt$|^.*sdrf.txt$'
1111
class Illumina_RNASeqV2(TCGAMatrixImport):
    """Matrix import settings registered for the IlluminaHiSeq_RNASeqV2 platform."""

    dataSubType = 'geneExp'
    sampleMap = 'tcga.iddag'
    probeMap = 'hugo.unc'
    probeFields = ['normalized_count']
    # Only file names matching this pattern are listed for inclusion.
    fileInclude = r'^.*rsem.genes.normalized_results$|^.*sdrf.txt$'
1118
class IlluminaHiSeq_RNASeq(TCGAMatrixImport):
    """Matrix import settings registered for the IlluminaHiSeq_RNASeq platform."""

    dataSubType = 'geneExp'
    sampleMap = 'tcga.iddag'
    probeMap = 'hugo.unc'
    probeFields = ['RPKM']
    # Only file names matching this pattern are listed for inclusion.
    fileInclude = r'^.*gene.quantification.txt$'
1125
class MDA_RPPA_Core(TCGAMatrixImport):
    """Matrix import settings and target map builder registered for the MDA_RPPA_Core platform."""

    sampleMap = 'tcga.iddag'
    probeMap = "md_anderson_antibodies"
    dataSubType = "RPPA"
    # File names matching this pattern are listed for exclusion.
    fileExclude = r'^.*.antibody_annotation.txt'
    probeFields = [ 'Protein Expression' ]

    def getTargetMap(self):
        """
        Build the key to target mapping from ``<work_dir>/targets``.

        The table is sorted on its key into ``targets.sort`` and read back
        through ``TableReader`` as ``(key, value)`` pairs.  Every ".SD"
        occurrence is removed from the value before it is stored.

        Returns a dictionary mapping each key to its cleaned value.
        """
        rawPath = self.work_dir + "/targets"
        sortPath = rawPath + ".sort"

        # Run sort with an argument list instead of a shell command line so
        # that a work_dir containing spaces or shell metacharacters is safe.
        with open(sortPath, "w") as sortOut:
            subprocess.call(["sort", "-k", "1", rawPath], stdout=sortOut)

        tTrans = {}
        for key, value in TableReader(sortPath):
            tTrans[key] = re.sub(r'\.SD', '', value)
        return tTrans
1141
1142
class Illumina_miRNASeq(TCGAMatrixImport):
    """Matrix import settings registered for the IlluminaGA_miRNASeq and IlluminaHiSeq_miRNASeq platforms."""

    dataSubType = 'miRNA'
    sampleMap = 'tcga.iddag'
    probeMap = 'hsa.mirna'
    probeFields = ['reads_per_million_miRNA_mapped']
    # Only file names matching this pattern are listed for inclusion.
    fileInclude = '^.*.mirna.quantification.txt$'
1149
1150
class bioImport(TCGAClinicalImport):
    """Clinical import settings registered for the 'bio' platform."""

    # Only file names matching this pattern are listed for inclusion.
    fileInclude = '.*.xml$'
    sampleMap = 'tcga.iddag'
1154
# Registry of supported platforms: platform name -> importer class.
# A platform counts as supported when its name is a key of this table.
tcgaConfig = {
    'AgilentG4502A_07': AgilentImport,
    'AgilentG4502A_07_1': AgilentImport,
    'AgilentG4502A_07_2': AgilentImport,
    'AgilentG4502A_07_3': AgilentImport,
    'CGH-1x1M_G4447A': CGH1x1mImport,
    'Genome_Wide_SNP_6': SNP6Import,
    'H-miRNA_8x15K': HmiRNAImport,
    'H-miRNA_8x15Kv2': HmiRNAImport,
    'HG-CGH-244A': CGH244AImport,
    'HG-CGH-415K_G4124A': CGH415K_G4124A,
    'HT_HG-U133A': HT_HGU133A,
    'HuEx-1_0-st-v2': HuEx1_0stv2,
    'Human1MDuo': Human1MDuoImport,
    'HumanHap550': HumanHap550,
    'HumanMethylation27': HumanMethylation27,
    'HumanMethylation450': HumanMethylation450,
    'IlluminaGA_RNASeq': Illumina_RNASeq,
    'IlluminaGA_miRNASeq': Illumina_miRNASeq,
    'IlluminaHiSeq_DNASeqC': IlluminaHiSeq_DNASeqC,
    'IlluminaHiSeq_RNASeq': IlluminaHiSeq_RNASeq,
    'IlluminaHiSeq_RNASeqV2': Illumina_RNASeqV2,
    'IlluminaHiSeq_miRNASeq': Illumina_miRNASeq,
    'MDA_RPPA_Core': MDA_RPPA_Core,
    'bio': bioImport,
}
1181
def fileDigest( file ):
    """
    Return the hexadecimal MD5 digest of the file at path ``file``.

    The file is read in binary mode in 8192 byte chunks so that large
    archives are never loaded into memory at once.
    """
    md5 = hashlib.md5()
    with open(file, 'rb') as f:
        # The sentinel must be a bytes literal: a binary read returns b'' at
        # end of file, which never equals the text string '' on Python 3 and
        # would make this loop run forever.  b'' is valid on Python 2.6+ too.
        for chunk in iter(lambda: f.read(8192), b''):
            md5.update(chunk)
    return md5.hexdigest()
1188
1189
def platform_list():
    """Yield the ``name`` of every record returned by the "Platform" query."""
    for entry in CustomQuery("Platform"):
        yield entry['name']
1194
def supported_list():
    """Yield the name of each "Platform" query record that has an entry in ``tcgaConfig``."""
    for entry in CustomQuery("Platform"):
        if entry['name'] not in tcgaConfig:
            continue
        yield entry['name']
1200
def platform_archives(platform):
    """
    Yield the distinct ``baseName`` values of the latest archives of ``platform``.

    Each base name is yielded once, the first time it is seen.
    """
    seen = set()
    for entry in CustomQuery("Archive[Platform[@name=%s]][@isLatest=1]" % platform):
        base = entry['baseName']
        if base in seen:
            continue
        yield base
        seen.add(base)
1209
1210
def _print_unique_basenames(query):
    """Print each distinct ``baseName`` returned by the query, once, in first-seen order."""
    seen = set()
    for entry in CustomQuery(query):
        name = entry['baseName']
        if name not in seen:
            seen.add(name)
            print(name)


def _deploy_locations(query):
    """Yield the ``deployLocation`` of every record returned by the query."""
    for entry in CustomQuery(query):
        yield entry['deployLocation']


def _archive_locations(basename, level):
    """Yield deploy locations of the latest Level_<level> archives of ``basename``, then of its mage-tab archives."""
    for location in _deploy_locations("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=Level_%s]]" % (basename, level)):
        yield location
    for location in _deploy_locations("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=mage-tab]]" % (basename)):
        yield location


def _with_md5(locations):
    """Yield every location followed by the location of its ``.md5`` companion file."""
    for location in locations:
        yield location
        yield location + ".md5"


def _mirror_path(mirror, url):
    """Return the local path below ``mirror`` that corresponds to the deploy location ``url``."""
    return os.path.join(mirror, re.sub("^/", "", url))


def _require_mirror(options):
    """Exit with status 1 and a message on stderr when no mirror location was given."""
    if options.mirror is None:
        sys.stderr.write("Need mirror location\n")
        sys.exit(1)


def _run_import(basename, level, options):
    """Build the configuration for ``basename`` and run the importer registered for its platform."""
    conf = getBaseBuildConf(basename, level, options.mirror)
    conf.addOptions(options)
    if conf.platform not in tcgaConfig:
        sys.stderr.write("Platform %s not supported\n" % (conf.platform))
        sys.exit(1)

    ext = tcgaConfig[conf.platform](conf)
    ext.run()


if __name__ == "__main__":
    # Command line entry point.  Every option selects an independent action;
    # the actions run in the order below when several options are combined.

    parser = ArgumentParser()
    #Stack.addJobTreeOptions(parser)

    parser.add_argument("-a", "--platform-list", dest="platform_list", action="store_true", help="Get list of platforms", default=False)
    parser.add_argument("-u", "--uuid", dest="uuid_table", help="UUID to Barcode Table", default=None)
    parser.add_argument("-t", "--uuid-download", dest="uuid_download", help="Download UUID/Barcode Table", default=False)
    parser.add_argument("-z", "--all-archives", dest="all_archives", action="store_true", help="List all archives", default=False)
    parser.add_argument("-p", "--platform", dest="platform", help="Platform Selection", default=None)
    parser.add_argument("-l", "--supported", dest="supported_list", action="store_true", help="List Supported Platforms", default=None)
    parser.add_argument("-f", "--filelist", dest="filelist", help="List files needed to convert TCGA project basename into cgData", default=None)
    parser.add_argument("-b", "--basename", dest="basename", help="Convert TCGA project basename into cgData", default=None)
    parser.add_argument("-m", "--mirror", dest="mirror", help="Mirror Location", default=None)
    parser.add_argument("-w", "--workdir", dest="workdir_base", help="Working directory", default="/tmp")
    # The help text used to repeat "Working directory" from --workdir.
    parser.add_argument("--out-dir", dest="outdir", help="Output directory", default="./")
    parser.add_argument("-o", "--out", dest="outpath", help="Output Dest", default=None)
    parser.add_argument("--out-error", dest="errorpath", help="Output Error", default=None)
    parser.add_argument("--out-meta", dest="metapath", help="Output Meta", default=None)
    parser.add_argument("-c", "--cancer", dest="cancer", help="List Archives by cancer type", default=None)
    parser.add_argument("-d", "--download", dest="download", help="Download files for archive", default=None)
    parser.add_argument("-e", "--level", dest="level", help="Data Level ", default="3")
    parser.add_argument("-s", "--check-sum", dest="checksum", help="Check project md5", default=None)
    parser.add_argument("-r", "--sanitize", dest="sanitize", action="store_true", help="Remove race/ethnicity from clinical data", default=False)
    parser.add_argument("-x", "--clinical", dest="clinical", help="Process clinical info", default=None)
    parser.add_argument("--clinical-basename", dest="clinical_basename", help="Select Clinical Data by basename", default=None)
    parser.add_argument("--clinical-type", dest="clinical_type", help="Clinical Data Type", default=None)
    parser.add_argument("--all-clinical", dest="all_clinical", action="store_true", help="List all clinical archives", default=False)
    parser.add_argument("--out-clinical", dest="out_clinical", action="append", nargs=3, default=[])
    parser.add_argument("--samples", dest="get_samples", action="store_true", default=False)

    options = parser.parse_args()

    # Download the UUID/barcode table to the given path.
    if options.uuid_download:
        url = "https://tcga-data.nci.nih.gov/uuid/uuidBrowserExport.htm"
        data = {
            'exportType': 'tab',
            'cols': "uuid,barcode",
        }
        urllib.urlretrieve(url, options.uuid_download, data=urllib.urlencode(data))

    if options.platform_list:
        for name in platform_list():
            print(name)

    if options.supported_list:
        for name in supported_list():
            print(name)

    if options.platform:
        for name in platform_archives(options.platform):
            print(name)

    if options.all_archives:
        _print_unique_basenames("Archive[@isLatest=1][ArchiveType[@type=Level_%s]]" % (options.level))

    if options.all_clinical:
        _print_unique_basenames("Archive[@isLatest=1][Platform[@alias=bio]]")

    # List submitted aliquots as: id, disease, Tumor/Normal, platform.
    if options.get_samples:
        url = "https://tcga-data.nci.nih.gov/datareports/aliquotExport.htm"
        blankFilter = json.dumps({"disease":"","levelOne":"","aliquotId":"","center":"","levelTwo":"","bcrBatch":"","platform":"","levelThree":""})
        data = {
            'exportType': 'tab',
            'cols': 'aliquotId,disease,bcrBatch,center,platform,levelOne,levelTwo,levelThree',
            'filterReq': blankFilter,
            'formFilter': blankFilter,
        }
        handle = urllib.urlopen(url + "?" + urllib.urlencode(data))
        try:
            for line in handle:
                tmp = line.rstrip().split("\t")
                # Skip blank or truncated lines instead of failing on them.
                if len(tmp) < 8 or len(tmp[0]) < 14:
                    continue
                if tmp[7] == "Submitted":
                    # Character 13 of the aliquot id selects tumor or normal.
                    if tmp[0][13] == '0':
                        print("\t".join([tmp[0], tmp[1], "Tumor", tmp[4]]))
                    elif tmp[0][13] == '1':
                        print("\t".join([tmp[0], tmp[1], "Normal", tmp[4]]))
        finally:
            handle.close()

    if options.cancer is not None:
        _print_unique_basenames("Archive[@isLatest=1][Disease[@abbreviation=%s]][ArchiveType[@type=Level_%s]]" % (options.cancer, options.level))

    if options.filelist:
        for location in _archive_locations(options.filelist, options.level):
            print(location)

    # Compare every mirrored archive of a basename against its .md5 file.
    if options.checksum:
        # The mirror is needed to locate the files; it used to be joined
        # unchecked and failed with a traceback when missing.
        _require_mirror(options)

        for url in list(_archive_locations(options.checksum, options.level)):
            dst = _mirror_path(options.mirror, url)
            if not os.path.exists(dst):
                print("NOT_FOUND: %s" % (dst))
                continue
            if not os.path.exists(dst + ".md5"):
                print("MD5_NOT_FOUND %s" % (dst))
                continue

            with open(dst + ".md5") as handle:
                # Split on any whitespace so that a file holding only the
                # hash and a newline is parsed correctly as well.
                fields = handle.readline().split()
            omd5 = fields[0] if fields else ''

            nmd5 = fileDigest(dst)
            if omd5 != nmd5:
                print("CORRUPT: %s" % (dst))
            else:
                print("OK: %s" % (dst))

    # Download archives (and their .md5 files) into the mirror.
    if options.download is not None:
        if options.mirror is None:
            print("Define mirror location")
            sys.exit(1)

        urls = []

        # The value given to --download is only used as basename when no
        # other selection option is present.
        if options.basename is None and options.clinical is None and options.clinical_basename is None:
            urls.extend(_with_md5(_archive_locations(options.download, options.level)))

        if options.basename:
            urls.extend(_with_md5(_archive_locations(options.basename, options.level)))

        if options.clinical:
            urls.extend(_with_md5(_deploy_locations("Archive[@isLatest=1][Disease[@abbreviation=%s]][Platform[@alias=bio]]" % (options.clinical))))

        if options.clinical_basename:
            urls.extend(_with_md5(_deploy_locations("Archive[@isLatest=1][@baseName=%s]" % (options.clinical_basename))))

        for url in urls:
            src = "https://tcga-data.nci.nih.gov/" + url
            dst = _mirror_path(options.mirror, url)
            dstDir = os.path.dirname(dst)
            if not os.path.exists(dstDir):
                print("mkdir %s" % (dstDir))
                os.makedirs(dstDir)
            # Files already present in the mirror are not fetched again.
            if not os.path.exists(dst):
                print("download %s to %s" % (src, dst))
                urllib.urlretrieve(src, dst)

    # Convert one project basename.
    if options.basename:
        _require_mirror(options)
        _run_import(options.basename, options.level, options)

    # Convert every clinical archive of one disease.
    if options.clinical:
        _require_mirror(options)

        basenames = []
        for entry in CustomQuery("Archive[@isLatest=1][Disease[@abbreviation=%s]][Platform[@alias=bio]]" % (options.clinical)):
            if entry['baseName'] not in basenames:
                basenames.append(entry['baseName'])

        for base in basenames:
            _run_import(base, 1, options)

    # Convert one clinical archive selected by basename.
    if options.clinical_basename:
        _require_mirror(options)
        _run_import(options.clinical_basename, 1, options)
1444