tcga_import: tcga_import/tcgaImport.py comparison

comparison tcga_import/tcgaImport.py @ 0:f1c71f5363ae draft default tip

Uploaded

author	kellrott
date	Tue, 30 Oct 2012 14:23:49 -0400
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:f1c71f5363ae
+#!/usr/bin/env python
+"""
+Script to scan and extract TCGA data and compile it into cgData files
+Usage::
+tcga2cgdata.py [options]
+Options::
+-h, --help            show this help message and exit
+-a, --platform-list   Get list of platforms
+-p PLATFORM, --platform=PLATFORM
+Platform Selection
+-l, --supported       List Supported Platforms
+-f FILELIST, --filelist=FILELIST
+List files needed to convert TCGA project basename
+into cgData
+-b BASENAME, --basename=BASENAME
+Convert TCGA project basename into cgData
+-m MIRROR, --mirror=MIRROR
+Mirror Location
+-w WORKDIR_BASE, --workdir=WORKDIR_BASE
+Working directory
+-o OUTDIR, --out-dir=OUTDIR
+Output directory
+-c CANCER, --cancer=CANCER
+List Archives by cancer type
+-d DOWNLOAD, --download=DOWNLOAD
+Download files for archive
+-e LEVEL, --level=LEVEL
+Data Level
+-s CHECKSUM, --check-sum=CHECKSUM
+Check project md5
+-r, --sanitize        Remove race/ethnicity from clinical data
+Example::
+./scripts/tcga2cgdata.py -b intgen.org_KIRC_bio -m /inside/depot -e 1 -r -w tmp
+"""
+from xml.dom.minidom import parseString
+import urllib
+import urllib2
+import os
+import csv
+import sys
+import hashlib
+import tempfile
+import re
+import copy
+import json
+import datetime
+import hashlib
+import subprocess
+from glob import glob
+import shutil
+import subprocess
+from argparse import ArgumentParser
+"""
+Net query code
+"""
class dccwsItem(object):
    """Iterable over the records returned by a TCGA DCC web-service query.

    Subclasses are expected to set ``self.url``.  Iterating fetches that URL,
    yields one dict per ``<class>`` element found in the ``<queryResponse>``
    and then follows the ``<next>`` link of each page until a page has none.
    """

    baseURL = "http://tcga-data.nci.nih.gov/tcgadccws/GetXML?query="

    def __init__(self):
        self.url = None

    def __iter__(self):
        """Yield a ``{name: value}`` dict for every record of the query.

        For each child element of a ``<class>`` record the key is the child's
        ``name`` attribute; the value is its ``xlink:href`` attribute when
        present, otherwise the text contained in the child.
        """
        pageUrl = self.url
        while pageUrl is not None:
            handle = urllib.urlopen(pageUrl)
            try:
                data = handle.read()
            finally:
                # release the connection even when the read fails
                handle.close()
            dom = parseString(data)
            # there might not be any archives for a dataset
            responses = dom.getElementsByTagName('queryResponse')
            if len(responses) > 0:
                response = responses.pop()
                for cls in response.getElementsByTagName('class'):
                    outData = {}
                    for node in cls.childNodes:
                        # Text/comment nodes (e.g. whitespace between the
                        # elements) have no attributes; skip them.
                        if node.nodeType != node.ELEMENT_NODE:
                            continue
                        nodeName = node.getAttribute("name")
                        if node.hasAttribute("xlink:href"):
                            outData[nodeName] = node.getAttribute("xlink:href")
                        else:
                            outData[nodeName] = getText(node.childNodes)
                    yield outData
            # follow the pagination link, if the page carries one
            nextLinks = dom.getElementsByTagName('next')
            if len(nextLinks) > 0:
                pageUrl = nextLinks.pop().getAttribute('xlink:href')
            else:
                pageUrl = None
class CustomQuery(dccwsItem):
    """A DCC web-service query given either as a full URL or a query string.

    ``query`` is used verbatim when it already is an absolute http(s) URL
    (as the ``xlink:href`` values returned by the service are); otherwise it
    is appended to ``dccwsItem.baseURL``.
    """

    def __init__(self, query):
        super(CustomQuery, self).__init__()
        # accept https links as well as http ones as "already absolute"
        if query.startswith(("http://", "https://")):
            self.url = query
        else:
            self.url = dccwsItem.baseURL + query
def getText(nodelist):
    """Return the concatenated data of the text nodes in ``nodelist``.

    Only nodes whose type is ``TEXT_NODE`` contribute; other nodes
    (elements, comments, ...) are ignored and not descended into.
    """
    pieces = (item.data for item in nodelist
              if item.nodeType == item.TEXT_NODE)
    return ''.join(pieces)
+"""
+Build Configuration
+"""
class BuildConf:
    """Settings for converting one TCGA archive set.

    Holds the platform name, the project base name, the version string, the
    metadata dict and the list of tarballs, plus (after ``addOptions``) the
    output locations and the optional UUID translation table.
    """

    def __init__(self, platform, name, version, meta, tarlist):
        self.platform = platform
        self.name = name
        self.version = version
        self.meta = meta
        self.tarlist = tarlist
        self.abbr = ''
        self.uuid_table = None
        if 'diseaseAbbr' in meta:
            self.abbr = meta['diseaseAbbr']

    def addOptions(self, opts):
        """Copy the run options from ``opts`` onto this configuration.

        ``opts`` must provide ``workdir_base``, ``outdir``, ``sanitize``,
        ``outpath``, ``metapath``, ``errorpath``, ``clinical_type``,
        ``out_clinical`` (an iterable of ``(type, path, meta)`` triples) and
        ``uuid_table`` (path of a two-column tab-separated file, or None).
        """
        self.workdir_base = opts.workdir_base
        self.outdir = opts.outdir
        self.sanitize = opts.sanitize
        self.outpath = opts.outpath
        self.metapath = opts.metapath
        self.errorpath = opts.errorpath
        self.clinical_type = opts.clinical_type
        self.clinical_type_map = {}
        for t, path, meta in opts.out_clinical:
            self.clinical_type_map["." + t] = (path, meta)
        if opts.uuid_table is not None:
            self.uuid_table = {}
            # 'with' guarantees the table file is closed again
            with open(opts.uuid_table) as handle:
                for line in handle:
                    tmp = line.rstrip().split("\t")
                    if len(tmp) < 2:
                        # blank or incomplete line: nothing to map
                        continue
                    self.uuid_table[tmp[0]] = tmp[1]

    def translateUUID(self, uuid):
        """Return the table entry for ``uuid``, or ``uuid`` itself when there
        is no table or no entry for it."""
        if self.uuid_table is None or uuid not in self.uuid_table:
            return uuid
        return self.uuid_table[uuid]

    def _defaultBase(self, name):
        # <outdir>/<project name><suffix>, the fallback output location
        return os.path.join(self.outdir, self.name) + name

    def getOutPath(self, name):
        """Return the data output path for the file suffix ``name``.

        An explicit ``outpath`` wins, then a per-clinical-type path, then the
        default ``<outdir>/<project name><name>``.
        """
        if self.outpath is not None:
            return self.outpath
        if name in self.clinical_type_map:
            return self.clinical_type_map[name][0]
        return self._defaultBase(name)

    def getOutMeta(self, name):
        """Return the metadata (JSON) output path for the suffix ``name``."""
        if self.outpath is not None:
            if self.metapath is not None:
                return self.metapath
            return self.outpath + ".json"
        if name in self.clinical_type_map:
            return self.clinical_type_map[name][1]
        return self._defaultBase(name) + ".json"

    def getOutError(self, name):
        """Return the error-log output path for the suffix ``name``."""
        if self.outpath is not None:
            if self.errorpath is not None:
                return self.errorpath
            return self.outpath + ".error"
        return self._defaultBase(name) + ".error"
def getBaseBuildConf(basename, level, mirror):
    """Query the TCGA DCC for ``basename`` and build a ``BuildConf``.

    Two queries are issued: one for the latest archives of data level
    ``level`` and one for the latest mage-tab archives.  The metadata
    (platform, disease, tissue, center) is filled from the first data-level
    archive; every archive contributes a source URL and a tarball path
    (``mirror`` + the archive's deploy location).  The version is the most
    recent ``addedDate`` over all archives.

    Returns None (after printing a message) when no archive was found.
    """
    dates = []
    sys.stdout.write("TCGA Query for:  %s\n" % (basename,))
    q = CustomQuery("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=Level_%s]]" % (basename, level))
    urls = {}
    meta = None
    platform = None
    for e in q:
        dates.append(datetime.datetime.strptime(e['addedDate'], "%m-%d-%Y"))
        if meta is None:
            # descriptive metadata is taken from the first archive only
            meta = {"sourceUrl": []}
            for e2 in CustomQuery(e['platform']):
                platform = e2['name']
                meta['platform'] = e2['name']
                meta['platformTitle'] = e2['displayName']
            for e2 in CustomQuery(e['disease']):
                meta['diseaseAbbr'] = e2['abbreviation']
                meta['diseaseTitle'] = e2['name']
                for e3 in CustomQuery(e2['tissueCollection']):
                    meta['tissue'] = e3['name']
            for e2 in CustomQuery(e['center']):
                meta['centerTitle'] = e2['displayName']
                meta['center'] = e2['name']
        meta['sourceUrl'].append("http://tcga-data.nci.nih.gov/" + e['deployLocation'])
        urls[mirror + e['deployLocation']] = platform

    sys.stdout.write("TCGA Query for mage-tab:  %s\n" % (basename,))
    q = CustomQuery("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=mage-tab]]" % (basename))
    for e in q:
        dates.append(datetime.datetime.strptime(e['addedDate'], "%m-%d-%Y"))
        platform = None
        for e2 in CustomQuery(e['platform']):
            sys.stdout.write("%s\n" % (e2,))
            platform = e2['name']
        if meta is None:
            # only mage-tab archives exist: there is no data-level metadata,
            # but the source URLs still need somewhere to go
            meta = {"sourceUrl": []}
        meta['sourceUrl'].append("http://tcga-data.nci.nih.gov/" + e['deployLocation'])
        urls[mirror + e['deployLocation']] = platform

    if len(dates) == 0:
        sys.stdout.write("No Files found\n")
        return
    versionDate = max(dates).strftime("%Y-%m-%d")
    # NOTE(review): 'platform' is whatever the last loop iteration assigned
    # (the mage-tab archive's platform when one exists) - confirm intended.
    return BuildConf(platform, basename, versionDate, meta, urls)
class TableReader:
    """Iterate over a ``key<TAB>json`` table file.

    Each line yields ``(key, value)`` where ``value`` is the JSON-decoded
    second column.  A ``path`` of None or a missing file yields nothing.
    """

    def __init__(self, path):
        self.path = path

    def __iter__(self):
        if self.path is None or not os.path.exists(self.path):
            return
        # 'with' closes the file even when the consumer stops iterating early
        with open(self.path) as handle:
            for line in handle:
                tmp = line.rstrip().split("\t")
                if len(tmp) < 2:
                    # blank or truncated line: no JSON column to decode
                    continue
                yield tmp[0], json.loads(tmp[1])
class FileImporter:
    """Base class driving the conversion of a set of TCGA archives.

    ``run`` extracts the tarballs into a scratch directory, walks the
    extracted tree and hands each file to ``mageScan`` or ``fileScan``, then
    calls ``fileBuild``.  ``fileScan``, ``mageScan`` and ``fileBuild`` are
    not defined here; subclasses provide them.
    """

    # Optional regular expressions (strings) applied to file base names.
    fileInclude = None
    fileExclude = None

    # Base names matching any of these patterns are never scanned.
    excludes = [
        "MANIFEST.txt$",
        "CHANGES_DCC.txt$",
        "README_DCC.txt$",
        "README.txt$",
        "CHANGES.txt$",
        "DCC_ALTERED_FILES.txt$",
        r'.wig$',
        "DESCRIPTIO$"
    ]

    def __init__(self, config):
        self.config = config

    def extractTars(self):
        """Unpack every tarball of the configuration into a new temp dir."""
        self.work_dir = tempfile.mkdtemp(dir=self.config.workdir_base)
        sys.stdout.write("Extract to  %s\n" % (self.work_dir,))
        for path in self.config.tarlist:
            subprocess.check_call(["tar", "xvzf", path, "-C", self.work_dir], stderr=sys.stdout)

    def run(self):
        """Extract, scan and build the output for this configuration."""
        self.extractTars()
        filterInclude = None
        filterExclude = None
        if self.fileInclude is not None:
            filterInclude = re.compile(self.fileInclude)
        if self.fileExclude is not None:
            filterExclude = re.compile(self.fileExclude)
        self.inc = 0
        self.out = {}
        self.errors = []
        self.ext_meta = {}
        try:
            self.scandirs(self.work_dir, filterInclude, filterExclude)
        finally:
            # the intermediate tables must be flushed/closed before they are
            # sorted and read back, and must not leak when scanning fails
            for o in self.out:
                self.out[o].close()
        self.fileBuild()
        #shutil.rmtree(self.work_dir)

    def checkExclude(self, name):
        """Return True when ``name`` matches one of ``excludes``."""
        for pattern in self.excludes:
            if re.search(pattern, name):
                return True
        return False

    def scandirs(self, path, filterInclude=None, filterExclude=None):
        """Recursively visit ``path`` and dispatch every file found.

        MAGE-TAB files go to ``mageScan``; any other file goes to
        ``fileScan`` unless its base name is excluded, fails to match
        ``filterInclude`` or matches ``filterExclude``.
        """
        if os.path.isdir(path):
            for entry in glob(os.path.join(path, "*")):
                self.scandirs(entry, filterInclude, filterExclude)
            return
        if self.isMage(path):
            self.mageScan(path)
            return
        name = os.path.basename(path)
        if self.checkExclude(name):
            return
        if filterInclude is not None and not filterInclude.match(name):
            return
        if filterExclude is not None and filterExclude.match(name):
            return
        self.fileScan(path)

    def isMage(self, path):
        """Return True for SDRF, IDF and DESCRIPTION.txt files."""
        return path.endswith(('.sdrf.txt', '.idf.txt', 'DESCRIPTION.txt'))

    def emit(self, key, data, port):
        """Append ``key<TAB>json(data)`` to the table ``port`` in work_dir."""
        if port not in self.out:
            self.out[port] = open(os.path.join(self.work_dir, port), "w")
        self.out[port].write("%s\t%s\n" % (key, json.dumps(data)))

    def emitFile(self, name, meta, file):
        """Publish ``file`` as the output with suffix ``name``.

        Copies ``file`` to the configured output path, stores its md5 in
        ``meta['md5']``, writes ``meta`` as JSON to the metadata path and,
        when errors were collected, writes them one per line to the error
        path.
        """
        md5 = hashlib.md5()
        with open(file, 'rb') as src:
            with open(self.config.getOutPath(name), "wb") as dst:
                # the sentinel is a bytes object because the file is read in
                # binary mode; a text '' would never compare equal to the
                # end-of-file value where bytes and str are distinct types
                for chunk in iter(lambda: src.read(8192), b''):
                    md5.update(chunk)
                    dst.write(chunk)
        meta['md5'] = md5.hexdigest()
        with open(self.config.getOutMeta(name), "w") as mHandle:
            mHandle.write(json.dumps(meta))
        if len(self.errors):
            with open(self.config.getOutError(name), "w") as eHandle:
                for msg in self.errors:
                    eHandle.write(msg + "\n")

    def addError(self, msg):
        """Remember ``msg`` for the error file written by ``emitFile``."""
        self.errors.append(msg)
# Column names found in the input files -> the common names used by the
# importers (looked up for every header cell while scanning files).
commonMap = {
    # genomic position
    "Chromosome": "chrom",
    "Start": "loc.start",
    "End": "loc.end",
    # segment value
    "Segment_Mean": "seg.mean",
    "mean": "seg.mean",
}
# First-column labels of an IDF file -> the metadata key under which the
# second column of that row is stored.
idfMap = {
    "Investigation Title": "title",
    "Experiment Description": "experimentalDescription",
    "Date of Experiment": "experimentalDate",
    "Person Affiliation": "dataProducer",
}
class TCGAGeneticImport(FileImporter):
    """Shared scanning logic for the genetic (matrix / segment) importers.

    MAGE-TAB files are turned into a ``targets`` table (file or hybridization
    identifier -> extract name) and extra metadata; data files are turned
    into ``probes`` or ``segments`` tables.  ``self.probeFields`` is read by
    ``fileScan`` but not defined in this class; the platform subclasses
    supply it.
    """

    # SDRF columns whose cell identifies a data file / hybridization.  The
    # flag tells whether the part before the first '.' is registered too.
    _sdrfKeyColumns = (
        ("Derived Array Data File", True),
        ("Derived Array Data Matrix File", False),
        ("Derived Data File", True),
        ("Hybridization Name", False),
        ("Sample Name", False),
    )
    # Rows whose "Material Type" cell is one of these are not registered.
    _skippedMaterials = ("genomic_DNA", "total_RNA", "MDA cell line")

    def mageScan(self, path):
        """Extract target mappings and metadata from a MAGE-TAB file."""
        if path.endswith(".sdrf.txt"):
            self._scanSdrf(path)
        if path.endswith(".idf.txt"):
            with open(path) as iHandle:
                for line in iHandle:
                    # drop the line ending so a value in the last column does
                    # not carry a newline into the metadata
                    row = line.rstrip("\r\n").split("\t")
                    if len(row) > 1 and row[0] in idfMap:
                        self.ext_meta[idfMap[row[0]]] = row[1]
        if path.endswith("DESCRIPTION.txt"):
            with open(path) as handle:
                self.ext_meta['description'] = handle.read()

    def _scanSdrf(self, path):
        # Emit one 'targets' record per identifying cell of every SDRF row.
        with open(path, "rU") as iHandle:
            colNum = None
            for row in csv.reader(iHandle, delimiter="\t"):
                if colNum is None:
                    # header row: column name -> index (last duplicate wins)
                    colNum = dict((title, i) for i, title in enumerate(row))
                    continue
                try:
                    if "Material Type" in colNum and row[colNum["Material Type"]] in self._skippedMaterials:
                        continue
                    extract = row[colNum["Extract Name"]]
                    for column, withStem in self._sdrfKeyColumns:
                        if column not in colNum:
                            continue
                        cell = row[colNum[column]]
                        if withStem:
                            self.emit(cell.split('.')[0], extract, "targets")
                        self.emit(cell, extract, "targets")
                    self.emit(extract, extract, "targets")
                except IndexError:
                    pass  # there can be blank lines in the SDRF

    def translateUUID(self, uuid):
        """Translate ``uuid`` through the configuration's UUID table."""
        return self.config.translateUUID(uuid)

    def getTargetMap(self):
        """Sort the ``targets`` table and return it as a dict."""
        source = self.work_dir + "/targets"
        sortedPath = self.work_dir + "/targets.sort"
        # argument list + explicit stdout instead of a shell string, so the
        # work directory name is never interpreted by a shell
        with open(sortedPath, "w") as sortedOut:
            subprocess.call(["sort", "-k", "1", source], stdout=sortedOut)
        tTrans = {}
        for key, value in TableReader(sortedPath):
            tTrans[key] = value
        return tTrans

    def fileScan(self, path):
        """
        This function takes a TCGA level 3 genetic file and tries to extract
        probe levels or segments from it.  The values are emitted to the
        'probes' or 'segments' table depending on the layout detected from
        the header line:

        1 - segment file, one sample per file / no sample info inside file
        2 - matrix file with a two line header (names, then value types)
        3 - segment file with the sample information inside the file

        Any other layout is emitted row by row to 'probes'.
        """
        fileName = os.path.basename(path)
        mode = None
        target = None
        colName = None
        colType = None
        with open(path) as iHandle:
            for line in iHandle:
                if colName is None:
                    colName = line.rstrip().split("\t")
                    if colName[0] in ("Hybridization REF", "Sample REF"):
                        mode = 2
                    elif colName[0] in ("Chromosome", "chromosome"):
                        mode = 1
                        # seg files are named by the filename before the '.' extention
                        target = fileName.split('.')[0]
                    elif len(colName) > 1 and colName[1] == "chrom":
                        mode = 3
                    colName = [commonMap.get(c, c) for c in colName]
                elif mode == 2 and colType is None:
                    colType = [commonMap.get(c, c) for c in line.rstrip().split("\t")]
                else:
                    tmp = line.rstrip().split("\t")
                    if mode == 2:
                        out = {}
                        for col in colName[1:]:
                            out[col] = {"target": col}
                        for i in range(1, len(colType)):
                            if colType[i] in self.probeFields:
                                # rows shorter than the header count as NA
                                out[colName[i]][colType[i]] = tmp[i] if i < len(tmp) else "NA"
                        for col in out:
                            self.emit(tmp[0], out[col], "probes")
                    else:
                        out = {}
                        for i, title in enumerate(colName):
                            out[title] = tmp[i]
                        out['file'] = fileName
                        if mode == 1:
                            self.emit(target, out, "segments")
                        elif mode == 3:
                            self.emit(tmp[0], out, "segments")
                        else:
                            self.emit(tmp[0], out, "probes")
class TCGASegmentImport(TCGAGeneticImport):
    """Importer producing a five-column segment file from segment tables.

    ``self.dataSubType`` is read by ``getMeta`` but not defined here; the
    platform subclasses supply it.
    """

    def fileScan(self, path):
        """
        This function takes a TCGA level 3 segment file and emits every data
        row to the 'segments' table.  The layout is detected from the header:

        1 - one sample per file / no sample info inside file: rows are keyed
            by the file name up to the first '.'
        2 - sample information inside file (second column is 'chrom'): rows
            are keyed by their first column

        Rows of files with any other header are read but not emitted.
        """
        fileName = os.path.basename(path)
        mode = None
        target = None
        colName = None
        with open(path) as iHandle:
            for line in iHandle:
                if colName is None:
                    colName = line.rstrip().split("\t")
                    if colName[0] in ("Chromosome", "chromosome"):
                        mode = 1
                        # seg files are named by the filename before the '.' extention
                        target = fileName.split('.')[0]
                    elif len(colName) > 1 and colName[1] == "chrom":
                        mode = 2
                    colName = [commonMap.get(c, c) for c in colName]
                else:
                    tmp = line.rstrip().split("\t")
                    out = {}
                    for i, title in enumerate(colName):
                        out[title] = tmp[i]
                    out['file'] = fileName
                    if mode == 1:
                        self.emit(target, out, "segments")
                    elif mode == 2:
                        self.emit(tmp[0], out, "segments")

    def getMeta(self, name):
        """Return the metadata dict for the segment file named ``name``.

        Values from ``ext_meta`` and then from the configuration's metadata
        override the defaults built here.
        """
        matrixInfo = {
            '@context': "http://purl.org/cgdata/",
            '@type': 'bed5',
            '@id': name,
            "lastModified": self.config.version,
            'rowKeySrc': {
                '@type': 'idDAG',
                '@id': "tcga.%s" % (self.config.abbr)
            },
            'dataSubType': {"@id": self.dataSubType},
            'dataProducer': 'TCGA Import',
            "accessMap": "public", "redistribution": "yes"
        }
        matrixInfo.update(self.ext_meta)
        matrixInfo.update(self.config.meta)
        return matrixInfo

    def fileBuild(self):
        """Write the segment file from the sorted 'segments' table.

        Each output line is ``chrom, start, end, sample, value``.  Keys
        missing from the target map and rows missing a field are recorded
        with ``addError``; rows whose translated name is None are dropped.
        Nothing is emitted when the table is empty.
        """
        # use the target table to create a name translation table
        tTrans = self.getTargetMap()
        rawPath = "%s/segments" % (self.work_dir)
        sortedPath = "%s/segments.sort" % (self.work_dir)
        outPath = "%s/segment_file" % (self.work_dir)
        with open(sortedPath, "w") as sortedOut:
            subprocess.call(["sort", "-k", "1", rawPath], stdout=sortedOut)

        startField = "loc.start"
        endField = "loc.end"
        valField = "seg.mean"
        chromeField = "chrom"
        segFile = None
        try:
            for key, value in TableReader(sortedPath):
                if segFile is None:
                    segFile = open(outPath, "w")
                try:
                    curName = self.translateUUID(tTrans[key])
                except KeyError:
                    self.addError("TargetInfo Not Found: %s" % (key))
                    continue
                if curName is None:
                    continue
                try:
                    # normalise to a lower-case 'chr' prefix with the rest
                    # of the name in upper case, e.g. 'x' -> 'chrX'
                    chrom = value[chromeField].lower()
                    if not chrom.startswith("chr"):
                        chrom = "chr" + chrom
                    chrom = chrom.upper().replace("CHR", "chr")
                    segFile.write("%s\t%s\t%s\t%s\t%s\n" % (chrom, value[startField], value[endField], curName, value[valField]))
                except KeyError:
                    self.addError("Field error: %s" % (str(value)))
        finally:
            if segFile is not None:
                segFile.close()
        if segFile is None:
            # no segment rows at all: there is no file to publish
            return
        matrixName = self.config.name
        self.emitFile("", self.getMeta(matrixName), outPath)
class TCGAMatrixImport(TCGAGeneticImport):
    """Importer producing a probe x sample matrix from the 'probes' table.

    ``self.dataSubType``, ``self.probeMap`` and ``self.probeFields`` are read
    here but not defined; the platform subclasses supply them.
    """

    def getMeta(self, name):
        """Return the metadata dict for the matrix named ``name``.

        Values from ``ext_meta`` and then from the configuration's metadata
        override the defaults built here.
        """
        matrixInfo = {
            "@context": 'http://purl.org/cgdata/',
            '@type': 'genomicMatrix',
            '@id': name,
            "lastModified": self.config.version,
            'dataSubType': {"@id": self.dataSubType},
            'dataProducer': 'TCGA',
            "accessMap": "public",
            "redistribution": "yes",
            'rowKeySrc': {
                "@type": "probe", "@id": self.probeMap
            },
            'columnKeySrc': {
                "@type": "idDAG", "@id": "tcga.%s" % (self.config.abbr)
            }
        }
        matrixInfo.update(self.ext_meta)
        matrixInfo.update(self.config.meta)
        return matrixInfo

    def _sortTable(self, table):
        # Sort <work_dir>/<table> into <work_dir>/<table>.sort without going
        # through a shell; returns the path of the sorted file.
        sortedPath = "%s/%s.sort" % (self.work_dir, table)
        with open(sortedPath, "w") as sortedOut:
            subprocess.call(["sort", "-k", "1", "%s/%s" % (self.work_dir, table)], stdout=sortedOut)
        return sortedPath

    def _writeProbeRow(self, matrixFile, probe, curData, tTrans, tEnum):
        # Write the matrix row of one probe; returns 1 when a row was
        # written and 0 when every column was "NA" (such rows are skipped).
        out = ["NA"] * len(tEnum)
        for target in curData:
            try:
                ttarget = self.translateUUID(tTrans[target])
                if ttarget is not None:
                    out[tEnum[ttarget]] = str(curData[target])
            except KeyError:
                self.addError("TargetInfo Not Found: %s" % (target))
        if out.count("NA") == len(tEnum):
            return 0
        matrixFile.write("%s\t%s\n" % (probe, "\t".join(out)))
        return 1

    def fileBuild(self):
        """Build the matrix file from the sorted 'probes' table.

        The columns are the distinct translated target names; rows are the
        probes in sorted order.  The matrix is only emitted when at least one
        row holds a value.
        """
        # use the target table to create a name translation table
        # also setup target name enumeration, so they will have columns
        # numbers
        probesPath = self._sortTable("probes")
        self._sortTable("targets")
        tTrans = self.getTargetMap()
        tEnum = {}
        for t in tTrans:
            tlabel = self.translateUUID(tTrans[t])
            if tlabel is not None and tlabel not in tEnum:
                tEnum[tlabel] = len(tEnum)

        matrixPath = "%s/matrix_file" % (self.work_dir)
        matrixFile = None
        curName = None
        curData = {}
        rowCount = 0
        try:
            for key, value in TableReader(probesPath):
                if matrixFile is None:
                    matrixFile = open(matrixPath, "w")
                    header = ["NA"] * len(tEnum)
                    for target in tEnum:
                        header[tEnum[target]] = target
                    matrixFile.write("%s\t%s\n" % ("#probe", "\t".join(header)))
                if curName != key:
                    # the table is sorted, so a new key closes the previous probe
                    if curName is not None:
                        rowCount += self._writeProbeRow(matrixFile, curName, curData, tTrans, tEnum)
                    curName = key
                    curData = {}
                if "target" in value:
                    idField = "target"
                elif "file" in value:
                    idField = "file"
                else:
                    idField = None
                if idField is not None:
                    for probeField in self.probeFields:
                        if probeField in value:
                            curData[value[idField]] = value[probeField]
            if curName is not None:
                # flush the final probe, which no later key change triggers
                rowCount += self._writeProbeRow(matrixFile, curName, curData, tTrans, tEnum)
        finally:
            if matrixFile is not None:
                matrixFile.close()
        matrixName = self.config.name
        if rowCount > 0:
            self.emitFile("", self.getMeta(matrixName), matrixPath)
# XML namespace URI used to look up the <admin> elements of a clinical XML
# document (passed to getElementsByTagNameNS in parseXMLFile).
adminNS = "http://tcga.nci/bcr/xml/administration/2.3"
class TCGAClinicalImport(FileImporter):
    """Importer turning clinical XML files into tab-separated matrices.

    Every XML file is parsed into 'patient', 'sample', 'portion', 'analyte',
    'aliquot', 'drug' and 'radiation' tables; ``fileBuild`` merges each table
    by key and publishes it as one matrix.
    """

    # specimen level -> (barcode element, child container element, child level)
    _specimenLevels = {
        'portion': ('bcr_portion_barcode', 'analytes', 'analyte'),
        'analyte': ('bcr_analyte_barcode', 'aliquots', 'aliquot'),
        'aliquot': ('bcr_aliquot_barcode', None, None),
    }

    def fileScan(self, path):
        """Parse the XML file at ``path`` and emit its records."""
        with open(path) as handle:
            data = handle.read()
        self.parseXMLFile(parseString(data))

    def getText(self, nodelist):
        """Return the concatenated data of the text nodes in ``nodelist``."""
        return ''.join(node.data for node in nodelist if node.nodeType == node.TEXT_NODE)

    @staticmethod
    def _elements(parent, localName=None):
        # Yield the element children of 'parent', optionally only those with
        # the given local name.
        for child in parent.childNodes:
            if child.nodeType != child.ELEMENT_NODE:
                continue
            if localName is None or child.localName == localName:
                yield child

    @staticmethod
    def _isCompleted(elm):
        # Only fields marked procurement_status="Completed" are recorded.
        return elm.getAttribute('procurement_status') == "Completed"

    @staticmethod
    def _tieredValue(elm):
        # Patient-level record: text plus the 'tier' and 'precision' attributes.
        return {
            'value': getText(elm.childNodes),
            'tier': elm.getAttribute('tier'),
            'precision': elm.getAttribute('precision'),
        }

    def _collectCompleted(self, parent, barcodeTag):
        # Scan the element children of 'parent'.  Returns (barcode, data):
        # the text of the 'barcodeTag' child (None when absent) and a dict
        # {localName: {'value': text}} of the completed children.
        barcode = None
        data = {}
        for elm in self._elements(parent):
            if elm.localName == barcodeTag:
                barcode = getText(elm.childNodes)
            if self._isCompleted(elm):
                data[elm.localName] = {'value': getText(elm.childNodes)}
        return barcode, data

    def _scanSpecimen(self, node, level):
        # Emit the nested records below 'node' (portion -> analyte ->
        # aliquot), then 'node' itself when it has a barcode and any data.
        barcodeTag, containerTag, childLevel = self._specimenLevels[level]
        barcode, data = self._collectCompleted(node, barcodeTag)
        if containerTag is not None:
            for container in self._elements(node, containerTag):
                for child in self._elements(container, childLevel):
                    self._scanSpecimen(child, childLevel)
        if barcode is not None and len(data):
            self.emit(barcode, data, level)

    def parseXMLFile(self, dom):
        """Emit the patient, specimen and treatment records found in ``dom``.

        Patient fields (plus the ``admin`` block) go to 'patient'.  Each
        sample is emitted to 'sample' twice under the sample barcode: once
        with its own fields and once with the patient fields, so that the
        merged sample row carries both.  Drug and radiation records are
        emitted under the patient barcode.
        """
        admin = {}
        for node in dom.getElementsByTagNameNS(adminNS, "admin"):
            for cNode in self._elements(node):
                admin[cNode.localName] = {'value': getText(cNode.childNodes)}

        # documentElement is the root element even when the document starts
        # with a comment or processing instruction
        root = dom.documentElement

        name = None
        patient = {}
        patientName = None
        for node in self._elements(root, 'patient'):
            for elm in self._elements(node):
                if elm.localName == 'bcr_patient_barcode':
                    name = getText(elm.childNodes)
                    patientName = name
                if self._isCompleted(elm):
                    patient[elm.localName] = self._tieredValue(elm)
                if elm.prefix == "auxiliary":
                    for aux in self._elements(elm):
                        for auxval in self._elements(aux):
                            patient[auxval.localName] = self._tieredValue(auxval)

        if name is not None:
            for key in admin:
                patient[key] = admin[key]
            self.emit(name, patient, "patient")

        for node in self._elements(root, 'patient'):
            for section in self._elements(node):
                if section.localName == 'samples':
                    for sample in self._elements(section, 'sample'):
                        barcode, sampleData = self._collectCompleted(sample, 'bcr_sample_barcode')
                        if barcode is not None:
                            # otherwise the previously seen barcode is reused
                            name = barcode
                        for portions in self._elements(sample, 'portions'):
                            for portion in self._elements(portions, 'portion'):
                                self._scanSpecimen(portion, 'portion')
                        self.emit(name, sampleData, "sample")
                        self.emit(name, patient, "sample")
                elif section.localName == 'drugs':
                    for drug in self._elements(section, 'drug'):
                        barcode, drugData = self._collectCompleted(drug, 'bcr_drug_barcode')
                        if barcode is not None:
                            name = barcode
                        self.emit(patientName, drugData, "drug")
                elif section.localName == 'radiations':
                    for rad in self._elements(section, 'radiation'):
                        barcode, radData = self._collectCompleted(rad, 'bcr_radiation_barcode')
                        if barcode is not None:
                            name = barcode
                        self.emit(patientName, radData, "radiation")

    def getMeta(self, name):
        """Return the metadata dict for the clinical matrix named ``name``.

        Values from ``ext_meta`` and then from the configuration's metadata
        override the defaults built here.
        """
        fileInfo = {
            "@context": "http://purl.org/cgdata/",
            "@type": "clinicalMatrix",
            "@id": name,
            "lastModified": self.config.version,
            'dataSubType': {"@id": "clinical"},
            "rowKeySrc": {
                "@type": "idDAG", "@id": "tcga.%s" % (self.config.abbr)
            }
        }
        fileInfo.update(self.ext_meta)
        fileInfo.update(self.config.meta)
        return fileInfo

    def fileBuild(self):
        """Merge every emitted table by key and publish it as a matrix.

        Only ``config.clinical_type`` is built when it is set.  With
        ``config.sanitize`` the 'race' and 'ethnicity' columns are left out.
        """
        matrixList = ["patient", "sample", "radiation", "drug", "portion", "analyte", "aliquot"]
        if self.config.clinical_type is not None:
            matrixList = [self.config.clinical_type]

        for matrixName in matrixList:
            rawPath = "%s/%s" % (self.work_dir, matrixName)
            if not os.path.exists(rawPath):
                continue
            sortedPath = rawPath + ".sort"
            # argument list + explicit stdout instead of a shell pipeline
            with open(sortedPath, "w") as sortedOut:
                subprocess.call(["sort", "-k", "1", rawPath], stdout=sortedOut)

            matrix = {}
            colEnum = {}
            for key, value in TableReader(sortedPath):
                row = matrix.setdefault(key, {})
                for col in value:
                    row[col] = value[col]
                    if col in colEnum:
                        continue
                    if not self.config.sanitize or col not in ['race', 'ethnicity']:
                        colEnum[col] = len(colEnum)

            outPath = "%s/%s_file" % (self.work_dir, matrixName)
            with open(outPath, "w") as handle:
                cols = [None] * (len(colEnum))
                for col in colEnum:
                    cols[colEnum[col]] = col
                handle.write("sample\t%s\n" % ("\t".join(cols)))
                for key in matrix:
                    cols = [""] * (len(colEnum))
                    for col in colEnum:
                        if col in matrix[key]:
                            cols[colEnum[col]] = matrix[key][col]['value']
                    handle.write("%s\t%s\n" % (key, "\t".join(cols).encode("ASCII", "replace")))
            self.emitFile("." + matrixName, self.getMeta(self.config.name + "." + matrixName), outPath)
class AgilentImport(TCGAMatrixImport):
    """Matrix import settings for the Agilent gene-expression platforms."""

    # value column copied into the matrix
    probeFields = ['log2 lowess normalized (cy5/cy3) collapsed by gene symbol']
    probeMap = 'hugo'
    dataType = 'genomicMatrix'
    dataSubType = 'geneExp'
    sampleMap = 'tcga.iddag'
class CGH1x1mImport(TCGASegmentImport):
    """Segment import settings for the CGH-1x1M_G4447A platform."""

    probeFields = ['seg.mean']
    dataType = 'genomicSegment'
    dataSubType = 'cna'
    sampleMap = 'tcga.iddag'
class SNP6Import(TCGASegmentImport):
    """Segment importer for Genome_Wide_SNP_6 archives (hg19 files only)."""

    assembly = 'hg19'
    dataSubType = 'cna'
    sampleMap = 'tcga.iddag'
    dataType = 'genomicSegment'
    probeFields = ['seg.mean']

    def fileScan(self, path):
        """Emit the rows of a ``.hg19.seg.txt`` file to 'hg19_segment'.

        Rows are keyed by their first column; the remaining columns are
        stored under their (commonMap-translated) header names.  Files with
        any other name are ignored.
        """
        outport = None
        #if path.endswith(".hg18.seg.txt"):
        #    outport = "hg18_segment"
        if path.endswith(".hg19.seg.txt"):
            outport = "hg19_segment"
        if outport is None:
            return
        with open(path) as handle:
            colName = None
            for line in handle:
                if colName is None:
                    colName = [commonMap.get(c, c) for c in line.rstrip().split("\t")]
                else:
                    tmp = line.rstrip().split("\t")
                    out = {}
                    for i in range(1, len(colName)):
                        out[colName[i]] = tmp[i]
                    self.emit(tmp[0], out, outport)

    def fileBuild(self):
        """Write and publish one segment file per genome build ('hg19').

        Each output line is ``chrom, start, end, sample, value``.  Keys
        missing from the target map and rows missing a field are recorded
        with ``addError``; rows whose translated name is None are dropped.
        A build without any segment rows is skipped.
        """
        tmap = self.getTargetMap()

        startField = "loc.start"
        endField = "loc.end"
        valField = "seg.mean"
        chromeField = "chrom"

        for base in ['hg19']:
            rawPath = "%s/%s_segment" % (self.work_dir, base)
            sortedPath = rawPath + ".sort"
            outPath = rawPath + ".out"
            # argument list + explicit stdout instead of a shell string
            with open(sortedPath, "w") as sortedOut:
                subprocess.call(["sort", "-k", "1", rawPath], stdout=sortedOut)

            segFile = None
            try:
                for key, value in TableReader(sortedPath):
                    if segFile is None:
                        segFile = open(outPath, "w")
                    try:
                        curName = self.translateUUID(tmap[key])
                    except KeyError:
                        self.addError("TargetInfo Not Found: %s" % (key))
                        continue
                    if curName is None:
                        continue
                    try:
                        # e.g. 'x' -> 'chrX', 'CHR1' -> 'chr1'
                        chrom = value[chromeField].lower()
                        if not chrom.startswith("chr"):
                            chrom = "chr" + chrom
                        chrom = chrom.upper().replace("CHR", "chr")
                        segFile.write("%s\t%s\t%s\t%s\t%s\n" % (chrom, value[startField], value[endField], curName, value[valField]))
                    except KeyError:
                        # a missing column is a field problem, not a missing target
                        self.addError("Field error: %s" % (str(value)))
            finally:
                if segFile is not None:
                    segFile.close()
            if segFile is None:
                # nothing was scanned for this build: no file to publish
                continue
            self.emitFile("." + base, self.getMeta(self.config.name + "." + base), outPath)
class HmiRNAImport(TCGAMatrixImport):
    """Matrix import settings for the H-miRNA_8x15K platform."""

    probeFields = ['unc_DWD_Batch_adjusted']
    probeMap = 'agilentHumanMiRNA'
    dataType = 'genomicMatrix'
    dataSubType = 'miRNAExp'
    sampleMap = 'tcga.iddag'
class CGH244AImport(TCGASegmentImport):
    """Segment import settings; values come from the 'Segment_Mean' column."""

    probeFields = ['Segment_Mean']
    dataType = 'genomicSegment'
    dataSubType = 'cna'
    sampleMap = 'tcga.iddag'
class CGH415K_G4124A(TCGASegmentImport):
    """Segment import settings with explicit source column names."""

    dataType = 'genomicSegment'
    dataSubType = 'cna'
    sampleMap = 'tcga.iddag'
    probeFields = ['Segment_Mean']
    # Source column names for chromosome / start / end.
    # NOTE(review): TCGASegmentImport.fileBuild uses its own local field
    # names, so these attributes may be unused - confirm.
    chromeField = 'Chromosome'
    startField = 'Start'
    endField = 'End'
class IlluminaHiSeq_DNASeqC(TCGASegmentImport):
    """Segment import settings that additionally drop censored sample ids."""

    dataType = 'genomicSegment'
    dataSubType = 'cna'
    sampleMap = 'tcga.iddag'
    probeFields = ['Segment_Mean']
    # Source column names for chromosome / start / end.
    # NOTE(review): TCGASegmentImport.fileBuild uses its own local field
    # names, so these attributes may be unused - confirm.
    chromeField = 'Chromosome'
    startField = 'Start'
    endField = 'End'

    def translateUUID(self, uuid):
        """Translate ``uuid`` via the configuration; return None for ids
        matching ``TCGA-..-....-1`` so that callers skip them."""
        translated = self.config.translateUUID(uuid)
        # censor out normal ids
        censored = re.search(r'^TCGA-..-....-1', translated) is not None
        if censored:
            return None
        return translated
class HT_HGU133A(TCGAMatrixImport):
    """Matrix import settings; values come from the 'Signal' column."""

    probeFields = ['Signal']
    probeMap = 'affyU133a'
    dataType = 'genomicMatrix'
    dataSubType = 'geneExp'
    sampleMap = 'tcga.iddag'
class HuEx1_0stv2(TCGAMatrixImport):
    """Matrix import settings restricted to ``*gene.txt`` and SDRF files."""

    # only file names matching this pattern are scanned
    fileInclude = '^.*gene.txt$|^.*sdrf.txt$'
    probeFields = ['Signal']
    probeMap = 'hugo'
    dataType = 'genomicMatrix'
    # NOTE(review): 'miRNAExp' combined with a 'hugo' probe map looks
    # unusual - confirm the intended sub type.
    dataSubType = 'miRNAExp'
    sampleMap = 'tcga.iddag'
class Human1MDuoImport(TCGASegmentImport):
    """Segment import settings; values come from the 'mean' column."""

    probeFields = ['mean']
    dataType = 'genomicSegment'
    dataSubType = 'cna'
    sampleMap = 'tcga.iddag'
class HumanHap550(TCGASegmentImport):
    """Segment import settings; values come from the 'mean' column."""

    probeFields = ['mean']
    dataType = 'genomicSegment'
    dataSubType = 'cna'
    sampleMap = 'tcga.iddag'
class HumanMethylation27(TCGAMatrixImport):
    """Matrix import settings for DNA methylation beta values."""

    # file names matching this pattern are not scanned
    fileExclude = '.*.adf.txt'
    # both capitalisations of the value column are accepted
    probeFields = ['Beta_Value', 'Beta_value']
    probeMap = 'illuminaMethyl27K_gpl8490'
    dataType = 'genomicMatrix'
    dataSubType = 'DNAMethylation'
    sampleMap = 'tcga.iddag'
class HumanMethylation450(TCGAMatrixImport):
    """Matrix importer for DNA methylation beta values, rounded on import."""

    dataSubType = 'DNAMethylation'
    probeMap = 'illuminaHumanMethylation450'
    sampleMap = 'tcga.iddag'
    dataType = 'genomicMatrix'
    fileExclude = '.*.adf.txt'
    probeFields = ['Beta_value', 'Beta_Value']

    def fileScan(self, path):
        """
        This function takes a TCGA level 3 matrix file with a two line
        header (sample names, then value types) and emits, per data row and
        sample column, the values of the ``probeFields`` columns to the
        'probes' table.  Values are formatted with four decimals; missing or
        non-numeric cells become "NA".

        Files whose first header cell is neither "Hybridization REF" nor
        "Sample REF" are read but nothing is emitted for them.
        """
        mode = None
        colName = None
        colType = None
        # 'with' closes the input file once it has been scanned
        with open(path) as iHandle:
            for line in iHandle:
                if colName is None:
                    colName = line.rstrip().split("\t")
                    if colName[0] in ("Hybridization REF", "Sample REF"):
                        mode = 1
                    colName = [commonMap.get(c, c) for c in colName]
                elif mode == 1 and colType is None:
                    colType = [commonMap.get(c, c) for c in line.rstrip().split("\t")]
                elif mode == 1:
                    tmp = line.rstrip().split("\t")
                    out = {}
                    for col in colName[1:]:
                        out[col] = {"target": col}
                    for i in range(1, len(colType)):
                        if colType[i] not in self.probeFields:
                            continue
                        try:
                            cell = "%.4f" % float(tmp[i])
                        except (IndexError, ValueError):
                            cell = "NA"
                        out[colName[i]][colType[i]] = cell
                    for col in out:
                        self.emit(tmp[0], out[col], "probes")
class Illumina_RNASeq(TCGAMatrixImport):
    """Matrix-import settings registered in ``tcgaConfig`` under 'IlluminaGA_RNASeq'."""

    # Regular expression selecting the archive files to read
    # (gene quantification tables and the SDRF sheet).
    fileInclude = r'^.*\.gene.quantification.txt$|^.*sdrf.txt$'
    # Column type kept as the probe value.
    probeFields = ['RPKM']

    # Metadata identifiers; presumably consumed by the base class when the
    # output description is written -- TODO confirm.
    dataSubType = 'geneExp'
    probeMap = 'hugo.unc'
    sampleMap = 'tcga.iddag'
class Illumina_RNASeqV2(TCGAMatrixImport):
    """Matrix-import settings registered in ``tcgaConfig`` under 'IlluminaHiSeq_RNASeqV2'."""

    # Regular expression selecting the archive files to read
    # (normalized RSEM gene results and the SDRF sheet).
    fileInclude = r'^.*rsem.genes.normalized_results$|^.*sdrf.txt$'
    # Column type kept as the probe value.
    probeFields = ['normalized_count']

    # Metadata identifiers; presumably consumed by the base class when the
    # output description is written -- TODO confirm.
    dataSubType = 'geneExp'
    probeMap = 'hugo.unc'
    sampleMap = 'tcga.iddag'
class IlluminaHiSeq_RNASeq(TCGAMatrixImport):
    """Matrix-import settings registered in ``tcgaConfig`` under 'IlluminaHiSeq_RNASeq'."""

    # Regular expression selecting the archive files to read.
    # NOTE(review): unlike Illumina_RNASeq this pattern does not include the
    # sdrf.txt sheet -- confirm that is intentional.
    fileInclude = r'^.*gene.quantification.txt$'
    # Column type kept as the probe value.
    probeFields = ['RPKM']

    # Metadata identifiers; presumably consumed by the base class when the
    # output description is written -- TODO confirm.
    dataSubType = 'geneExp'
    probeMap = 'hugo.unc'
    sampleMap = 'tcga.iddag'
class MDA_RPPA_Core(TCGAMatrixImport):
    """Matrix-import settings registered in ``tcgaConfig`` under 'MDA_RPPA_Core'.

    Overrides ``getTargetMap`` so that the ``.SD`` marker is removed from
    every target value.
    """

    sampleMap = 'tcga.iddag'
    probeMap = "md_anderson_antibodies"
    dataSubType = "RPPA"
    # Regular expression for archive files that must be skipped.
    fileExclude = r'^.*.antibody_annotation.txt'
    # Column type kept as the probe value.
    probeFields = [ 'Protein Expression' ]

    def getTargetMap(self):
        """
        Build the key -> target translation table from the work directory.

        Sorts ``<work_dir>/targets`` into ``<work_dir>/targets.sort`` with the
        external ``sort -k 1`` command, then reads the sorted file through
        ``TableReader``.

        :return: dict mapping every key of the table to its value with each
                 ``.SD`` occurrence removed
        """
        targets_path = self.work_dir + "/targets"
        sorted_path = self.work_dir + "/targets.sort"
        # An argument list (no shell) keeps work directories containing
        # spaces or shell metacharacters from breaking the command.
        with open(sorted_path, "w") as sorted_handle:
            subprocess.call(["sort", "-k", "1", targets_path], stdout=sorted_handle)
        handle = TableReader(sorted_path)
        tTrans = {}
        for key, value in handle:
            tTrans[ key ] = re.sub(r'\.SD', '', value)
        return tTrans
class Illumina_miRNASeq(TCGAMatrixImport):
    """Matrix-import settings shared by the 'IlluminaGA_miRNASeq' and
    'IlluminaHiSeq_miRNASeq' entries of ``tcgaConfig``."""

    # Regular expression selecting the archive files to read.
    fileInclude = '^.*.mirna.quantification.txt$'
    # Column type kept as the probe value.
    probeFields = ['reads_per_million_miRNA_mapped']

    # Metadata identifiers; presumably consumed by the base class when the
    # output description is written -- TODO confirm.
    dataSubType = 'miRNA'
    probeMap = 'hsa.mirna'
    sampleMap = 'tcga.iddag'
class bioImport(TCGAClinicalImport):
    """Clinical-import settings registered in ``tcgaConfig`` under 'bio'."""

    # Regular expression selecting the archive files to read (XML documents).
    fileInclude = '.*.xml$'
    sampleMap = 'tcga.iddag'
# Registry of supported platforms: maps a platform name to the import class
# that is instantiated with the build configuration and then run.
tcgaConfig = {
    'AgilentG4502A_07': AgilentImport,
    'AgilentG4502A_07_1': AgilentImport,
    'AgilentG4502A_07_2': AgilentImport,
    'AgilentG4502A_07_3': AgilentImport,
    'CGH-1x1M_G4447A': CGH1x1mImport,
    'Genome_Wide_SNP_6': SNP6Import,
    'H-miRNA_8x15K': HmiRNAImport,
    'H-miRNA_8x15Kv2': HmiRNAImport,
    'HG-CGH-244A': CGH244AImport,
    'HG-CGH-415K_G4124A': CGH415K_G4124A,
    'HT_HG-U133A': HT_HGU133A,
    'HuEx-1_0-st-v2': HuEx1_0stv2,
    'Human1MDuo': Human1MDuoImport,
    'HumanHap550': HumanHap550,
    'IlluminaHiSeq_DNASeqC': IlluminaHiSeq_DNASeqC,
    'HumanMethylation27': HumanMethylation27,
    'HumanMethylation450': HumanMethylation450,
    'IlluminaHiSeq_RNASeq': IlluminaHiSeq_RNASeq,
    'IlluminaGA_RNASeq': Illumina_RNASeq,
    'IlluminaHiSeq_RNASeqV2': Illumina_RNASeqV2,
    'MDA_RPPA_Core': MDA_RPPA_Core,
    'IlluminaGA_miRNASeq': Illumina_miRNASeq,
    'IlluminaHiSeq_miRNASeq': Illumina_miRNASeq,
    'bio': bioImport,
}
def fileDigest( file ):
    """
    Return the MD5 checksum of a file as a hexadecimal string.

    The file is read in binary mode in 8192-byte chunks, so large archives
    are never held in memory at once.

    :param file: path of the file to hash
    :return: 32-character lowercase hexadecimal MD5 digest
    """
    md5 = hashlib.md5()
    with open(file, 'rb') as f:
        # The sentinel must be a bytes literal: a binary read returns b'' at
        # end of file, which never equals the text string '' on Python 3
        # (on Python 2 the two literals are the same object type).
        for chunk in iter(lambda: f.read(8192), b''):
            md5.update(chunk)
    return md5.hexdigest()
def platform_list():
    """Yield the 'name' field of every record returned by the "Platform" query."""
    for record in CustomQuery("Platform"):
        yield record['name']
def supported_list():
    """Yield the name of every queried platform that has an entry in ``tcgaConfig``."""
    for record in CustomQuery("Platform"):
        if record['name'] not in tcgaConfig:
            continue
        yield record['name']
def platform_archives(platform):
    """
    Yield the distinct 'baseName' values of the latest archives of a platform.

    Names are yielded in the order the query first returns them; repeated
    base names are reported only once.

    :param platform: platform name substituted into the archive query
    """
    seen = set()
    query = CustomQuery("Archive[Platform[@name=%s]][@isLatest=1]" % platform)
    for record in query:
        base = record['baseName']
        if base in seen:
            continue
        seen.add(base)
        yield base
def _print_unique_basenames(query):
    """Print every distinct 'baseName' of *query* once, in first-seen order."""
    seen = set()
    for entry in query:
        name = entry['baseName']
        if name not in seen:
            print(name)
            seen.add(name)


def _archive_locations(basename, level):
    """Yield the 'deployLocation' of the latest Level_<level> archives of
    *basename*, followed by those of its mage-tab archives."""
    q = CustomQuery("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=Level_%s]]" % (basename, level))
    for e in q:
        yield e['deployLocation']
    q = CustomQuery("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=mage-tab]]" % (basename))
    for e in q:
        yield e['deployLocation']


def _with_md5(locations):
    """Return a list holding each location followed by its '.md5' companion."""
    urls = []
    for location in locations:
        urls.append(location)
        urls.append(location + ".md5")
    return urls


def _mirror_path(mirror, url):
    """Return the local path under *mirror* that corresponds to *url*."""
    # A leading '/' would make os.path.join discard the mirror prefix.
    return os.path.join(mirror, re.sub("^/", "", url))


def _require_mirror(options):
    """Exit with status 1 when no mirror location was given on the command line."""
    if options.mirror is None:
        sys.stderr.write("Need mirror location\n")
        sys.exit(1)


def _run_import(basename, level, options):
    """Build the configuration for *basename* and run the matching importer.

    Exits with status 1 when the archive's platform has no entry in
    ``tcgaConfig``.
    """
    conf = getBaseBuildConf(basename, level, options.mirror)
    conf.addOptions(options)
    if conf.platform not in tcgaConfig:
        sys.stderr.write("Platform %s not supported\n" % (conf.platform))
        sys.exit(1)
    ext = tcgaConfig[conf.platform](conf)
    ext.run()


# Command-line entry point. Every requested action is independent; several
# may be combined in one invocation and they run in the order written below.
# Output uses single-argument print(...) calls, which emit the same text
# under the Python 2 print statement semantics this file is written for.
if __name__ == "__main__":
    parser = ArgumentParser()
    #Stack.addJobTreeOptions(parser)
    parser.add_argument("-a", "--platform-list", dest="platform_list", action="store_true", help="Get list of platforms", default=False)
    parser.add_argument("-u", "--uuid", dest="uuid_table", help="UUID to Barcode Table", default=None)
    parser.add_argument("-t", "--uuid-download", dest="uuid_download", help="Download UUID/Barcode Table", default=False)
    parser.add_argument("-z", "--all-archives", dest="all_archives", action="store_true", help="List all archives", default=False)
    parser.add_argument("-p", "--platform", dest="platform", help="Platform Selection", default=None)
    parser.add_argument("-l", "--supported", dest="supported_list", action="store_true", help="List Supported Platforms", default=None)
    parser.add_argument("-f", "--filelist", dest="filelist", help="List files needed to convert TCGA project basename into cgData", default=None)
    parser.add_argument("-b", "--basename", dest="basename", help="Convert TCGA project basename into cgData", default=None)
    parser.add_argument("-m", "--mirror", dest="mirror", help="Mirror Location", default=None)
    parser.add_argument("-w", "--workdir", dest="workdir_base", help="Working directory", default="/tmp")
    # Help text previously duplicated --workdir's "Working directory".
    parser.add_argument("--out-dir", dest="outdir", help="Output directory", default="./")
    parser.add_argument("-o", "--out", dest="outpath", help="Output Dest", default=None)
    parser.add_argument("--out-error", dest="errorpath", help="Output Error", default=None)
    parser.add_argument("--out-meta", dest="metapath", help="Output Meta", default=None)
    parser.add_argument("-c", "--cancer", dest="cancer", help="List Archives by cancer type", default=None)
    parser.add_argument("-d", "--download", dest="download", help="Download files for archive", default=None)
    parser.add_argument("-e", "--level", dest="level", help="Data Level ", default="3")
    parser.add_argument("-s", "--check-sum", dest="checksum", help="Check project md5", default=None)
    parser.add_argument("-r", "--sanitize", dest="sanitize", action="store_true", help="Remove race/ethnicity from clinical data", default=False)
    parser.add_argument("-x", "--clinical", dest="clinical", help="Process clinical info", default=None)
    parser.add_argument("--clinical-basename", dest="clinical_basename", help="Select Clinical Data by basename", default=None)
    parser.add_argument("--clinical-type", dest="clinical_type", help="Clinical Data Type", default=None)
    parser.add_argument("--all-clinical", dest="all_clinical", action="store_true", help="List all clinical archives", default=False)
    parser.add_argument("--out-clinical", dest="out_clinical", action="append", nargs=3, default=[])
    parser.add_argument("--samples", dest="get_samples", action="store_true", default=False)
    options = parser.parse_args()

    # -t: save the UUID/barcode table to the given file name.
    if options.uuid_download:
        url = "https://tcga-data.nci.nih.gov/uuid/uuidBrowserExport.htm"
        data = {}
        data['exportType'] = 'tab'
        data['cols'] = "uuid,barcode"
        urllib.urlretrieve( url, options.uuid_download, data=urllib.urlencode(data))

    # -a / -l / -p: plain listings.
    if options.platform_list:
        for e in platform_list():
            print(e)

    if options.supported_list:
        for e in supported_list():
            print(e)

    if options.platform:
        for name in platform_archives( options.platform ):
            print(name)

    # -z / --all-clinical: distinct archive base names.
    if options.all_archives:
        _print_unique_basenames(
            CustomQuery("Archive[@isLatest=1][ArchiveType[@type=Level_%s]]" % (options.level)))

    if options.all_clinical:
        _print_unique_basenames(
            CustomQuery("Archive[@isLatest=1][Platform[@alias=bio]]"))

    # --samples: list aliquots whose column 8 reads "Submitted", labelled
    # Tumor/Normal from character 14 of the aliquot id.
    if options.get_samples:
        url = "https://tcga-data.nci.nih.gov/datareports/aliquotExport.htm"
        empty_filter = json.dumps({"disease":"","levelOne":"","aliquotId":"","center":"","levelTwo":"","bcrBatch":"","platform":"","levelThree":""})
        data = {}
        data['exportType'] = 'tab'
        data['cols'] = 'aliquotId,disease,bcrBatch,center,platform,levelOne,levelTwo,levelThree'
        data['filterReq'] = empty_filter
        data['formFilter'] = empty_filter
        handle = urllib.urlopen( url + "?" + urllib.urlencode(data))
        try:
            sample_types = {'0': "Tumor", '1': "Normal"}
            for line in handle:
                tmp = line.rstrip().split("\t")
                # rstrip() also removes trailing empty tab-separated cells,
                # so short rows must be skipped instead of indexed.
                if len(tmp) < 8 or tmp[7] != "Submitted":
                    continue
                if len(tmp[0]) < 14:
                    continue
                label = sample_types.get(tmp[0][13])
                if label is not None:
                    print("\t".join( [ tmp[0], tmp[1], label, tmp[4] ] ))
        finally:
            handle.close()

    # -c: distinct archive base names for one disease abbreviation.
    if options.cancer is not None:
        _print_unique_basenames(
            CustomQuery("Archive[@isLatest=1][Disease[@abbreviation=%s]][ArchiveType[@type=Level_%s]]" % (options.cancer, options.level)))

    # -f: list the archive locations needed for a base name.
    if options.filelist:
        for location in _archive_locations(options.filelist, options.level):
            print(location)

    # -s: compare every mirrored archive with its stored .md5 file.
    if options.checksum:
        # Without a mirror the path join below would fail with a traceback.
        _require_mirror(options)
        for url in list(_archive_locations(options.checksum, options.level)):
            dst = _mirror_path(options.mirror, url)
            if not os.path.exists( dst ):
                print("NOT_FOUND: %s" % dst)
                continue
            if not os.path.exists( dst + ".md5" ):
                print("MD5_NOT_FOUND %s" % dst)
                continue
            with open( dst + ".md5" ) as handle:
                # Whitespace split so that a bare "<hash>\n" line does not
                # keep its newline and compare as different.
                fields = handle.readline().split()
            omd5 = fields[0] if fields else ''
            nmd5 = fileDigest( dst )
            if omd5 != nmd5:
                print("CORRUPT: %s" % dst)
            else:
                print("OK: %s" % dst)

    # -d: copy archives (and their .md5 files) into the mirror.
    if options.download is not None:
        if options.mirror is None:
            print("Define mirror location")
            sys.exit(1)

        urls = []
        # The -d value itself names the archive only when no other selector
        # (-b, -x, --clinical-basename) was given.
        if options.basename is None and options.clinical is None and options.clinical_basename is None:
            urls.extend(_with_md5(_archive_locations(options.download, options.level)))
        if options.basename:
            urls.extend(_with_md5(_archive_locations(options.basename, options.level)))
        if options.clinical:
            q = CustomQuery("Archive[@isLatest=1][Disease[@abbreviation=%s]][Platform[@alias=bio]]" % (options.clinical))
            urls.extend(_with_md5(e['deployLocation'] for e in q))
        if options.clinical_basename:
            q = CustomQuery("Archive[@isLatest=1][@baseName=%s]" % (options.clinical_basename))
            urls.extend(_with_md5(e['deployLocation'] for e in q))

        for url in urls:
            src = "https://tcga-data.nci.nih.gov/" + url
            dst = _mirror_path(options.mirror, url)
            dst_dir = os.path.dirname(dst)
            if not os.path.exists(dst_dir):
                print("mkdir %s" % dst_dir)
                os.makedirs(dst_dir)
            # Files already present in the mirror are not fetched again.
            if not os.path.exists( dst ):
                print("download %s to %s" % (src, dst))
                urllib.urlretrieve(src, dst)

    # -b: convert one archive base name at the requested data level.
    if options.basename:
        _require_mirror(options)
        _run_import(options.basename, options.level, options)

    # -x: convert every clinical archive of a disease (level 1).
    if options.clinical:
        _require_mirror(options)
        q = CustomQuery("Archive[@isLatest=1][Disease[@abbreviation=%s]][Platform[@alias=bio]]" % (options.clinical))
        basenames = {}
        for s in q:
            basenames[s['baseName']] = True
        for base in basenames:
            _run_import(base, 1, options)

    # --clinical-basename: convert one clinical archive (level 1).
    if options.clinical_basename:
        _require_mirror(options)
        _run_import(options.clinical_basename, 1, options)

Mercurial > repos > kellrott > tcga_import

comparison tcga_import/tcgaImport.py @ 0:f1c71f5363ae draft default tip