# HG changeset patch # User kellrott # Date 1351621429 14400 # Node ID f1c71f5363aee6723b62d0d849afbe5838e609d1 Uploaded diff -r 000000000000 -r f1c71f5363ae tcga_import/._tcgaImport.py Binary file tcga_import/._tcgaImport.py has changed diff -r 000000000000 -r f1c71f5363ae tcga_import/tcgaImport.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tcga_import/tcgaImport.py Tue Oct 30 14:23:49 2012 -0400 @@ -0,0 +1,1444 @@ +#!/usr/bin/env python + + +""" +Script to scan and extract TCGA data and compile it into the cgData + +Usage:: + + tcga2cgdata.py [options] + +Options:: + + -h, --help show this help message and exit + -a, --platform-list Get list of platforms + -p PLATFORM, --platform=PLATFORM + Platform Selection + -l, --supported List Supported Platforms + -f FILELIST, --filelist=FILELIST + List files needed to convert TCGA project basename + into cgData + -b BASENAME, --basename=BASENAME + Convert TCGA project basename into cgData + -m MIRROR, --mirror=MIRROR + Mirror Location + -w WORKDIR_BASE, --workdir=WORKDIR_BASE + Working directory + -o OUTDIR, --out-dir=OUTDIR + Working directory + -c CANCER, --cancer=CANCER + List Archives by cancer type + -d DOWNLOAD, --download=DOWNLOAD + Download files for archive + -e LEVEL, --level=LEVEL + Data Level + -s CHECKSUM, --check-sum=CHECKSUM + Check project md5 + -r, --sanitize Remove race/ethnicity from clinical data + + +Example:: + + ./scripts/tcga2cgdata.py -b intgen.org_KIRC_bio -m /inside/depot -e 1 -r -w tmp + + +""" + +from xml.dom.minidom import parseString +import urllib +import urllib2 +import os +import csv +import sys +import hashlib +import tempfile +import re +import copy +import json +import datetime +import hashlib +import subprocess +from glob import glob +import shutil +import subprocess +from argparse import ArgumentParser + + + + +""" + +Net query code + +""" + +class dccwsItem(object): + baseURL = "http://tcga-data.nci.nih.gov/tcgadccws/GetXML?query=" + + def __init__(self): + self.url = None + + def __iter__(self): + next = self.url + while next != None: + handle = urllib.urlopen(next) + data = handle.read() + handle.close() + dom = parseString(data) + # there might not be any archives for a dataset + if len(dom.getElementsByTagName('queryResponse')) > 0: + response = dom.getElementsByTagName('queryResponse').pop() + classList = response.getElementsByTagName('class') + for cls in classList: + className = cls.getAttribute("recordNumber") + outData = {} + #aObj = Archive() + for node in cls.childNodes: + nodeName = node.getAttribute("name") + if node.hasAttribute("xlink:href"): + outData[ nodeName ] = node.getAttribute("xlink:href") + else: + outData[ nodeName ] = getText( node.childNodes ) + yield outData + if len( dom.getElementsByTagName('next') ) > 0: + nextElm = dom.getElementsByTagName('next').pop() + next = nextElm.getAttribute( 'xlink:href' ) + else: + next = None + + +class CustomQuery(dccwsItem): + def __init__(self, query): + super(CustomQuery, self).__init__() + if query.startswith("http://"): + self.url = query + else: + self.url = dccwsItem.baseURL + query + + +def getText(nodelist): + rc = [] + for node in nodelist: + if node.nodeType == node.TEXT_NODE: + rc.append(node.data) + return ''.join(rc) + +""" + +Build Configuration + +""" + +class BuildConf: + def __init__(self, platform, name, version, meta, tarlist): + self.platform = platform + self.name = name + self.version = version + self.meta = meta + self.tarlist = tarlist + self.abbr = '' + self.uuid_table = None + if 'diseaseAbbr' in meta: + self.abbr 
= meta['diseaseAbbr'] + + def addOptions(self, opts): + self.workdir_base = opts.workdir_base + self.outdir = opts.outdir + self.sanitize = opts.sanitize + self.outpath = opts.outpath + self.metapath = opts.metapath + self.errorpath = opts.errorpath + self.clinical_type = opts.clinical_type + + self.clinical_type_map = {} + for t, path, meta in opts.out_clinical: + self.clinical_type_map[ "." + t] = (path, meta) + + if opts.uuid_table is not None: + self.uuid_table = {} + handle = open(opts.uuid_table) + for line in handle: + tmp = line.rstrip().split("\t") + self.uuid_table[tmp[0]] = tmp[1] + + def translateUUID(self, uuid): + if self.uuid_table is None or uuid not in self.uuid_table: + return uuid + return self.uuid_table[uuid] + + def getOutPath(self, name): + if self.outpath is not None: + return self.outpath + if name in self.clinical_type_map: + return self.clinical_type_map[name][0] + return os.path.join(self.outdir, self.name) + name + + def getOutMeta(self, name): + if self.outpath is not None: + if self.metapath is not None: + return self.metapath + return self.outpath + ".json" + if name in self.clinical_type_map: + return self.clinical_type_map[name][1] + return os.path.join(self.outdir, self.name) + name + ".json" + + def getOutError(self, name): + if self.outpath is not None: + if self.errorpath is not None: + return self.errorpath + return self.outpath + ".error" + return os.path.join(self.outdir, self.name) + name + ".error" + + +def getBaseBuildConf(basename, level, mirror): + dates = [] + print "TCGA Query for: ", basename + q = CustomQuery("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=Level_%s]]" % (basename, level)) + urls = {} + meta = None + platform = None + for e in q: + dates.append( datetime.datetime.strptime( e['addedDate'], "%m-%d-%Y" ) ) + if meta is None: + meta = {"sourceUrl" : []} + for e2 in CustomQuery(e['platform']): + platform = e2['name'] + meta['platform'] = e2['name'] + meta['platformTitle'] = e2['displayName'] + for e2 in CustomQuery(e['disease']): + meta['diseaseAbbr'] = e2['abbreviation'] + meta['diseaseTitle'] = e2['name'] + for e3 in CustomQuery(e2['tissueCollection']): + meta['tissue'] = e3['name'] + for e2 in CustomQuery(e['center']): + meta['centerTitle'] = e2['displayName'] + meta['center'] = e2['name'] + meta['sourceUrl'].append( "http://tcga-data.nci.nih.gov/" + e['deployLocation'] ) + urls[ mirror + e['deployLocation'] ] = platform + + print "TCGA Query for mage-tab: ", basename + q = CustomQuery("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=mage-tab]]" % (basename)) + for e in q: + dates.append( datetime.datetime.strptime( e['addedDate'], "%m-%d-%Y" ) ) + q2 = CustomQuery(e['platform']) + platform = None + for e2 in q2: + print e2 + platform = e2['name'] + meta['sourceUrl'].append( "http://tcga-data.nci.nih.gov/" + e['deployLocation'] ) + urls[ mirror + e['deployLocation'] ] = platform + + if len(dates) == 0: + print "No Files found" + return + dates.sort() + dates.reverse() + versionDate = dates[0].strftime( "%Y-%m-%d" ) + + return BuildConf(platform, basename, versionDate, meta, urls) + + + + + +class TableReader: + def __init__(self, path): + self.path = path + + def __iter__(self): + if self.path is not None and os.path.exists(self.path): + handle = open(self.path) + for line in handle: + tmp = line.rstrip().split("\t") + yield tmp[0], json.loads(tmp[1]) + handle.close() + + +class FileImporter: + + fileInclude = None + fileExclude = None + + excludes = [ + "MANIFEST.txt$", + "CHANGES_DCC.txt$", + "README_DCC.txt$", + 
"README.txt$", + "CHANGES.txt$", + "DCC_ALTERED_FILES.txt$", + r'.wig$', + "DESCRIPTIO$" + ] + + def __init__(self, config): + self.config = config + + def extractTars(self): + self.work_dir = tempfile.mkdtemp(dir=self.config.workdir_base) + print "Extract to ", self.work_dir + for path in self.config.tarlist: + subprocess.check_call([ "tar", "xvzf", path, "-C", self.work_dir], stderr=sys.stdout) + + def run(self): + self.extractTars() + + filterInclude = None + filterExclude = None + if self.fileInclude is not None: + filterInclude = re.compile(self.fileInclude) + if self.fileExclude is not None: + filterExclude = re.compile(self.fileExclude) + self.inc = 0 + self.out = {} + self.errors = [] + self.ext_meta = {} + self.scandirs(self.work_dir, filterInclude, filterExclude) + for o in self.out: + self.out[o].close() + self.fileBuild() + #shutil.rmtree(self.work_dir) + + def checkExclude( self, name ): + for e in self.excludes: + if re.search( e, name ): + return True + return False + + def scandirs(self, path, filterInclude=None, filterExclude=None): + if os.path.isdir(path): + for a in glob(os.path.join(path, "*")): + self.scandirs(a, filterInclude, filterExclude) + else: + name = os.path.basename(path) + if self.isMage(path): + self.mageScan(path) + else: + if not self.checkExclude(name): + if (filterInclude is None or filterInclude.match(name)) and (filterExclude is None or not filterExclude.match(name)): + self.fileScan(path) + + def isMage(self, path): + if path.endswith( '.sdrf.txt' ) or path.endswith( '.idf.txt' ) or path.endswith("DESCRIPTION.txt"): + return True + + + def emit(self, key, data, port): + if port not in self.out: + self.out[port] = open(self.work_dir + "/" + port, "w") + self.out[port].write( "%s\t%s\n" % (key, json.dumps(data))) + + def emitFile(self, name, meta, file): + md5 = hashlib.md5() + oHandle = open(self.config.getOutPath(name), "wb") + with open(file,'rb') as f: + for chunk in iter(lambda: f.read(8192), ''): + md5.update(chunk) + oHandle.write(chunk) + oHandle.close() + md5str = md5.hexdigest() + meta['md5'] = md5str + mHandle = open(self.config.getOutMeta(name), "w") + mHandle.write( json.dumps(meta)) + mHandle.close() + if len(self.errors): + eHandle = open( self.config.getOutError(name), "w" ) + for msg in self.errors: + eHandle.write( msg + "\n" ) + eHandle.close() + + def addError(self, msg): + self.errors.append(msg) + + +commonMap = { + "mean" : "seg.mean", + "Segment_Mean" : "seg.mean", + "Start" : "loc.start", + "End" : "loc.end", + "Chromosome" : "chrom" +} + + +idfMap = { + "Investigation Title" : "title", + "Experiment Description" : "experimentalDescription", + "Person Affiliation" : "dataProducer", + "Date of Experiment" : "experimentalDate" +} + +class TCGAGeneticImport(FileImporter): + + + + def mageScan(self, path): + if path.endswith(".sdrf.txt"): + iHandle = open(path, "rU") + read = csv.reader( iHandle, delimiter="\t" ) + colNum = None + for row in read: + if colNum is None: + colNum = {} + for i in range(len(row)): + colNum[ row[i] ] = i + else: + if not colNum.has_key("Material Type") or ( not row[ colNum[ "Material Type" ] ] in [ "genomic_DNA", "total_RNA", "MDA cell line" ] ): + try: + if colNum.has_key( "Derived Array Data File" ): + self.emit( row[ colNum[ "Derived Array Data File" ] ].split('.')[0], row[ colNum[ "Extract Name" ] ], "targets" ) + self.emit( row[ colNum[ "Derived Array Data File" ] ], row[ colNum[ "Extract Name" ] ], "targets" ) + if colNum.has_key("Derived Array Data Matrix File" ): + self.emit( row[ colNum[ 
"Derived Array Data Matrix File" ] ], row[ colNum[ "Extract Name" ] ], "targets" ) + if colNum.has_key( "Derived Data File"): + self.emit( row[ colNum[ "Derived Data File" ] ].split('.')[0], row[ colNum[ "Extract Name" ] ], "targets" ) + self.emit( row[ colNum[ "Derived Data File" ] ], row[ colNum[ "Extract Name" ] ], "targets" ) + if colNum.has_key( "Hybridization Name" ): + self.emit( row[ colNum[ "Hybridization Name" ] ] , row[ colNum[ "Extract Name" ] ], "targets" ) + if colNum.has_key( "Sample Name" ): + self.emit( row[ colNum[ "Sample Name" ] ] , row[ colNum[ "Extract Name" ] ], "targets" ) + self.emit( row[ colNum[ "Extract Name" ] ] , row[ colNum[ "Extract Name" ] ], "targets" ) + except IndexError: + pass #there can be blank lines in the SDRF + if path.endswith(".idf.txt"): + iHandle = open(path) + for line in iHandle: + row = line.split("\t") + if len(row): + if row[0] in idfMap: + self.ext_meta[ idfMap[row[0]] ] = row[1] + iHandle.close() + if path.endswith("DESCRIPTION.txt"): + handle = open(path) + self.ext_meta['description'] = handle.read() + handle.close() + + def translateUUID(self, uuid): + return self.config.translateUUID(uuid) + + def getTargetMap(self): + subprocess.call("sort -k 1 %s/targets > %s/targets.sort" % (self.work_dir, self.work_dir), shell=True) + handle = TableReader(self.work_dir + "/targets.sort") + tTrans = {} + for key, value in handle: + tTrans[ key ] = value + return tTrans + + def fileScan(self, path): + """ + This function takes a TCGA level 3 genetic file (file name and input handle), + and tries to extract probe levels or target mappings (experimental ID to TCGA barcode) + it emits these values to a handle, using the 'targets' and 'probes' string to identify + the type of data being emited + """ + iHandle = open(path) + mode = None + #modes + #1 - segmentFile - one sample per file/no sample info inside file + #2 - two col header matrix file + #3 - segmentFile - sample information inside file + target = None + colName = None + colType = None + for line in iHandle: + if colName is None: + colName = line.rstrip().split("\t") + if colName[0] == "Hybridization REF" or colName[0] == "Sample REF": + mode=2 + elif colName[0] == "Chromosome" or colName[0] == "chromosome": + mode=1 + target=os.path.basename( path ).split('.')[0] #seg files are named by the filename before the '.' extention + elif colName[1] == "chrom": + mode = 3 + target=os.path.basename( path ).split('.')[0] #seg files are named by the filename before the '.' 
extention + + for i in range(len(colName)): + if commonMap.has_key( colName[i] ): + colName[i] = commonMap[ colName[i] ] + elif mode==2 and colType is None: + colType=line.rstrip().split("\t") + for i in range(len(colType)): + if commonMap.has_key( colType[i] ): + colType[i] = commonMap[ colType[i] ] + else: + tmp = line.rstrip().split("\t") + if mode == 2: + out={} + for col in colName[1:]: + out[ col ] = { "target" : col } + for i in range(1,len(colType)): + try: + if colType[i] in self.probeFields: + out[ colName[i] ][ colType[i] ] = tmp[i] + except IndexError: + out[ colName[i] ][ colType[i] ] = "NA" + for col in out: + self.emit( tmp[0], out[col], "probes" ) + else: + out = {} + for i in range(len(colName)): + out[ colName[i] ] = tmp[i] + out['file'] = os.path.basename(path) + if mode==1: + self.emit( target, out, "segments" ) + elif mode == 3: + self.emit( tmp[0], out, "segments" ) + else: + self.emit( tmp[0], out, "probes" ) + + + + +class TCGASegmentImport(TCGAGeneticImport): + + + def fileScan(self, path): + """ + This function takes a TCGA level 3 genetic file (file name and input handle), + and tries to extract probe levels or target mappings (experimental ID to TCGA barcode) + it emits these values to a handle, using the 'targets' and 'probes' string to identify + the type of data being emited + """ + iHandle = open(path) + mode = None + #modes + #1 - segmentFile - one sample per file/no sample info inside file + #2 - segmentFile - sample information inside file + target = None + colName = None + colType = None + for line in iHandle: + if colName is None: + colName = line.rstrip().split("\t") + if colName[0] == "Chromosome" or colName[0] == "chromosome": + mode=1 + target=os.path.basename( path ).split('.')[0] #seg files are named by the filename before the '.' 
extention + elif colName[1] == "chrom": + mode = 2 + + for i in range(len(colName)): + if commonMap.has_key( colName[i] ): + colName[i] = commonMap[ colName[i] ] + else: + tmp = line.rstrip().split("\t") + out = {} + for i in range(len(colName)): + out[ colName[i] ] = tmp[i] + out['file'] = os.path.basename(path) + if mode==1: + self.emit( target, out, "segments" ) + elif mode == 2: + self.emit( tmp[0], out, "segments" ) + + + def getMeta(self, name): + matrixInfo = { + '@context' : "http://purl.org/cgdata/", + '@type' : 'bed5', + '@id' : name, + "lastModified" : self.config.version, + 'rowKeySrc' : { + '@type' : 'idDAG', + '@id' : "tcga.%s" % (self.config.abbr) + }, + 'dataSubType' : { "@id" : self.dataSubType }, + 'dataProducer' : 'TCGA Import', + "accessMap" : "public", "redistribution" : "yes" + } + matrixInfo.update(self.ext_meta) + matrixInfo.update(self.config.meta) + return matrixInfo + + def fileBuild(self): + #use the target table to create a name translation table + #also setup target name enumeration, so they will have columns + #numbers + + tTrans = self.getTargetMap() + subprocess.call("sort -k 1 %s/segments > %s/segments.sort" % (self.work_dir, self.work_dir), shell=True) + sHandle = TableReader(self.work_dir + "/segments.sort") + + segFile = None + curName = None + + curData = {} + missingCount = 0 + + startField = "loc.start" + endField = "loc.end" + valField = "seg.mean" + chromeField = "chrom" + + segFile = None + + for key, value in sHandle: + if segFile is None: + segFile = open("%s/segment_file" % (self.work_dir), "w") + try: + curName = self.translateUUID(tTrans[key]) # "-".join( tTrans[ key ].split('-')[0:4] ) + if curName is not None: + try: + chrom = value[ chromeField ].lower() + if not chrom.startswith("chr"): + chrom = "chr" + chrom + chrom = chrom.upper().replace("CHR", "chr") + #segFile.write( "%s\t%s\t%s\t%s\t.\t%s\n" % ( curName, chrom, int(value[ startField ])+1, value[ endField ], value[ valField ] ) ) + segFile.write( "%s\t%s\t%s\t%s\t%s\n" % ( chrom, value[ startField ], value[ endField ], curName, value[ valField ] ) ) + except KeyError: + self.addError( "Field error: %s" % (str(value))) + except KeyError: + self.addError( "TargetInfo Not Found: %s" % (key)) + + segFile.close() + matrixName = self.config.name + + self.emitFile( "", self.getMeta(matrixName), "%s/segment_file" % (self.work_dir) ) + + + +class TCGAMatrixImport(TCGAGeneticImport): + + def getMeta(self, name): + matrixInfo = { + "@context" : 'http://purl.org/cgdata/', + '@type' : 'genomicMatrix', + '@id' : name, + "lastModified" : self.config.version, + 'dataSubType' : { "@id" : self.dataSubType }, + 'dataProducer' : 'TCGA', + "accessMap" : "public", + "redistribution" : "yes", + 'rowKeySrc' : { + "@type" : "probe", "@id" : self.probeMap + }, + 'columnKeySrc' : { + "@type" : "idDAG", "@id" : "tcga.%s" % (self.config.abbr) + } + } + matrixInfo.update(self.ext_meta) + matrixInfo.update(self.config.meta) + return matrixInfo + + def fileBuild(self): + #use the target table to create a name translation table + #also setup target name enumeration, so they will have columns + #numbers + + subprocess.call("sort -k 1 %s/probes > %s/probes.sort" % (self.work_dir, self.work_dir), shell=True) + subprocess.call("sort -k 1 %s/targets > %s/targets.sort" % (self.work_dir, self.work_dir), shell=True) + + handles = {} + handles[ "geneticExtract:targets" ] = TableReader(self.work_dir + "/targets.sort") + handles[ "geneticExtract:probes" ] = TableReader(self.work_dir + "/probes.sort") + + tTrans = 
self.getTargetMap() + + tEnum = {} + for t in tTrans: + tlabel = self.translateUUID(tTrans[t]) + if tlabel is not None and tlabel not in tEnum: + tEnum[tlabel] = len(tEnum) + + matrixFile = None + segFile = None + + curName = None + curData = {} + missingCount = 0 + rowCount = 0 + pHandle = handles["geneticExtract:probes"] + for key, value in pHandle: + if matrixFile is None: + matrixFile = open("%s/matrix_file" % (self.work_dir), "w" ) + out = ["NA"] * len(tEnum) + for target in tEnum: + out[ tEnum[ target ] ] = target + matrixFile.write( "%s\t%s\n" % ( "#probe", "\t".join( out ) ) ) + + if curName != key: + if curName is not None: + out = ["NA"] * len(tEnum) + for target in curData: + try: + ttarget = self.translateUUID(tTrans[target]) + if ttarget is not None: + out[ tEnum[ ttarget ] ] = str( curData[ target ] ) + except KeyError: + self.addError( "TargetInfo Not Found: %s" % (target)) + if out.count("NA") != len(tEnum): + rowCount += 1 + matrixFile.write( "%s\t%s\n" % ( curName, "\t".join( out ) ) ) + curName = key + curData = {} + if "target" in value: + for probeField in self.probeFields: + if probeField in value: + curData[ value[ "target" ] ] = value[ probeField ] + elif "file" in value: + for probeField in self.probeFields: + if probeField in value: + curData[ value[ "file" ] ] = value[ probeField ] + matrixFile.close() + matrixName = self.config.name + if rowCount > 0: + self.emitFile( "", self.getMeta(matrixName), "%s/matrix_file" % (self.work_dir) ) + + +adminNS = "http://tcga.nci/bcr/xml/administration/2.3" + + +class TCGAClinicalImport(FileImporter): + + def fileScan(self, path): + handle = open(path) + data = handle.read() + handle.close() + xml=parseString(data) + self.parseXMLFile(xml) + + def getText(self, nodelist): + rc = [] + for node in nodelist: + if node.nodeType == node.TEXT_NODE: + rc.append(node.data) + return ''.join(rc) + + + def parseXMLFile(self, dom): + admin = {} + for node in dom.getElementsByTagNameNS( adminNS, "admin"): + for cNode in node.childNodes: + if cNode.nodeType == cNode.ELEMENT_NODE: + admin[ cNode.localName ] = {} + admin[ cNode.localName ]['value'] = getText( cNode.childNodes ) + + name = None + patient = {} + patientName = None + for node in dom.childNodes[0].childNodes: + if node.nodeType == node.ELEMENT_NODE: + if node.localName == 'patient': + for elm in node.childNodes: + if elm.nodeType == elm.ELEMENT_NODE: + if ( elm.localName == 'bcr_patient_barcode' ): + name = getText( elm.childNodes ) + patientName = name + + if ( elm.getAttribute( 'procurement_status' ) == "Completed" ): + patient[ elm.localName ] = {} + patient[ elm.localName ]['value'] = getText( elm.childNodes ) + patient[ elm.localName ]['tier'] = elm.getAttribute( 'tier' ) + patient[ elm.localName ]['precision'] = elm.getAttribute( 'precision' ) + + if elm.prefix == "auxiliary": + for aux in elm.childNodes: + if aux.nodeType == aux.ELEMENT_NODE: + for auxval in aux.childNodes: + if auxval.nodeType == auxval.ELEMENT_NODE: + patient[ auxval.localName ] = {} + patient[ auxval.localName ]['value'] = getText( auxval.childNodes ) + patient[ auxval.localName ]['tier'] = auxval.getAttribute( 'tier' ) + patient[ auxval.localName ]['precision'] = auxval.getAttribute( 'precision' ) + + if name is not None: + for key in admin: + patient[ key ] = admin[ key ] + self.emit( name, patient, "patient" ) + + for node in dom.childNodes[0].childNodes: + if node.nodeType == node.ELEMENT_NODE and node.localName == 'patient': + for samples in node.childNodes: + if samples.nodeType == 
samples.ELEMENT_NODE and samples.localName == 'samples': + for sample in samples.childNodes: + if sample.nodeType == samples.ELEMENT_NODE and sample.localName == 'sample': + sampleData = {} + for value in sample.childNodes: + if value.nodeType == value.ELEMENT_NODE: + if value.localName == 'bcr_sample_barcode' : + name = getText( value.childNodes ) + if value.getAttribute( 'procurement_status' ) == "Completed" : + sampleData[ value.localName ] = {} + sampleData[ value.localName ]['value'] = getText( value.childNodes ) + + if value.localName == 'portions' : + for portions in value.childNodes: + if portions.nodeType == value.ELEMENT_NODE and portions.localName == "portion": + portionName = None + portionData = {} + for portion in portions.childNodes: + if portion.nodeType == value.ELEMENT_NODE: + if portion.localName == "analytes": + for analytes in portion.childNodes: + if analytes.nodeType == analytes.ELEMENT_NODE and analytes.localName =="analyte": + analyteName = None + analyteData = {} + for analyte in analytes.childNodes: + if analyte.nodeType == value.ELEMENT_NODE: + if analyte.localName == "aliquots": + for aliquots in analyte.childNodes: + if aliquots.nodeType == aliquots.ELEMENT_NODE and aliquots.localName =="aliquot": + aliquotName = None + aliquotData = {} + for aliquot in aliquots.childNodes: + if aliquot.nodeType == value.ELEMENT_NODE: + if aliquot.localName == "bcr_aliquot_barcode": + aliquotName = getText(aliquot.childNodes) + if aliquot.getAttribute( 'procurement_status' ) == "Completed" : + aliquotData[ aliquot.localName ] = {} + aliquotData[ aliquot.localName ]['value'] = getText( aliquot.childNodes ) + if aliquotName is not None and len(aliquotData): + self.emit( aliquotName, aliquotData, 'aliquot' ) + + + if analyte.localName == "bcr_analyte_barcode": + analyteName = getText(analyte.childNodes) + if analyte.getAttribute( 'procurement_status' ) == "Completed" : + analyteData[ analyte.localName ] = {} + analyteData[ analyte.localName ]['value'] = getText( analyte.childNodes ) + if analyteName is not None and len(analyteData): + self.emit( analyteName, analyteData, 'analyte' ) + + if portion.localName == "bcr_portion_barcode": + portionName = getText( portion.childNodes ) + if portion.getAttribute( 'procurement_status' ) == "Completed" : + portionData[ portion.localName ] = {} + portionData[ portion.localName ]['value'] = getText( portion.childNodes ) + if portionName is not None and len(portionData): + self.emit( portionName, portionData, 'portion' ) + + + #patientName = re.sub( r'\-...$', "", name ) + self.emit( name, sampleData, "sample" ) + self.emit( name, patient, "sample") + elif samples.nodeType == samples.ELEMENT_NODE and samples.localName == 'drugs': + for drug in samples.childNodes: + if drug.nodeType == samples.ELEMENT_NODE and drug.localName == 'drug': + drugData = {} + for value in drug.childNodes: + if value.nodeType == value.ELEMENT_NODE: + if value.localName == 'bcr_drug_barcode' : + name = getText( value.childNodes ) + if value.getAttribute( 'procurement_status' ) == "Completed" : + drugData[ value.localName ] = {} + drugData[ value.localName ]['value'] = getText( value.childNodes ) + + #patientName = re.sub( r'\-...$', "", name ) + self.emit( patientName, drugData, "drug" ) + elif samples.nodeType == samples.ELEMENT_NODE and samples.localName == 'radiations': + for rad in samples.childNodes: + if rad.nodeType == samples.ELEMENT_NODE and rad.localName == 'radiation': + radData = {} + for value in rad.childNodes: + if value.nodeType == value.ELEMENT_NODE: + 
if value.localName == 'bcr_radiation_barcode' : + name = getText( value.childNodes ) + if value.getAttribute( 'procurement_status' ) == "Completed" : + radData[ value.localName ] = {} + radData[ value.localName ]['value'] = getText( value.childNodes ) + + #patientName = re.sub( r'\-...$', "", name ) + self.emit( patientName, radData, "radiation" ) + + + + def getMeta(self, name): + fileInfo = { + "@context" : "http://purl.org/cgdata/", + "@type" : "clinicalMatrix", + "@id" : name, + "lastModified" : self.config.version, + 'dataSubType' : { "@id" : "clinical" }, + "rowKeySrc" : { + "@type" : "idDAG", "@id" : "tcga.%s" % (self.config.abbr) + } + + } + fileInfo.update(self.ext_meta) + fileInfo.update(self.config.meta) + return fileInfo + + def fileBuild(self): + + matrixList = [ "patient", "sample", "radiation", "drug", "portion", "analyte", "aliquot" ] + if self.config.clinical_type is not None: + matrixList = [ self.config.clinical_type ] + + for matrixName in matrixList: + if os.path.exists( "%s/%s" % (self.work_dir, matrixName)): + subprocess.call("cat %s/%s | sort -k 1 > %s/%s.sort" % (self.work_dir, matrixName, self.work_dir, matrixName), shell=True) + handle = TableReader(self.work_dir + "/" + matrixName + ".sort") + matrix = {} + colEnum = {} + for key, value in handle: + if key not in matrix: + matrix[key] = {} + for col in value: + matrix[key][col] = value[col] + if col not in colEnum: + if not self.config.sanitize or col not in [ 'race', 'ethnicity' ]: + colEnum[col] = len(colEnum) + + handle = open( os.path.join(self.work_dir, matrixName + "_file"), "w") + cols = [None] * (len(colEnum)) + for col in colEnum: + cols[colEnum[col]] = col + handle.write("sample\t%s\n" % ("\t".join(cols))) + for key in matrix: + cols = [""] * (len(colEnum)) + for col in colEnum: + if col in matrix[key]: + cols[colEnum[col]] = matrix[key][col]['value'] + handle.write("%s\t%s\n" % (key, "\t".join(cols).encode("ASCII", "replace"))) + handle.close() + self.emitFile( "." + matrixName, self.getMeta(self.config.name + "." 
+ matrixName), "%s/%s_file" % (self.work_dir, matrixName)) + + +class AgilentImport(TCGAMatrixImport): + dataSubType = 'geneExp' + probeMap = 'hugo' + sampleMap = 'tcga.iddag' + dataType = 'genomicMatrix' + probeFields = ['log2 lowess normalized (cy5/cy3) collapsed by gene symbol'] + + +class CGH1x1mImport(TCGASegmentImport): + dataSubType = 'cna' + sampleMap = 'tcga.iddag' + dataType = 'genomicSegment' + probeFields = ['seg.mean'] + +class SNP6Import(TCGASegmentImport): + assembly = 'hg19' + dataSubType = 'cna' + sampleMap ='tcga.iddag' + dataType = 'genomicSegment' + probeFields = ['seg.mean'] + + def fileScan(self, path): + outport = None + #if path.endswith(".hg18.seg.txt"): + # outport = "hg18_segment" + if path.endswith(".hg19.seg.txt"): + outport = "hg19_segment" + + if outport is not None: + handle = open(path) + colName = None + for line in handle: + if colName is None: + colName = line.rstrip().split("\t") + for i, col in enumerate(colName): + if commonMap.has_key( col ): + colName[i] = commonMap[ col ] + else: + tmp = line.rstrip().split("\t") + out = {} + for i in range(1, len(colName)): + out[ colName[i] ] = tmp[i] + self.emit( tmp[0], out, outport ) + handle.close() + + def fileBuild(self): + tmap = self.getTargetMap() + + for base in ['hg19']: + subprocess.call("sort -k 1 %s/%s_segment > %s/%s_segment.sort" % (self.work_dir, base, self.work_dir, base), shell=True) + handle = TableReader(self.work_dir + "/%s_segment.sort" % (base)) + + segFile = None + curName = None + curData = {} + missingCount = 0 + + startField = "loc.start" + endField = "loc.end" + valField = "seg.mean" + chromeField = "chrom" + + segFile = None + sHandle = handle + for key, value in sHandle: + if segFile is None: + segFile = open("%s/%s_segment.out" % (self.work_dir, base), "w") + try: + curName = self.translateUUID(tmap[key]) + if curName is not None: + chrom = value[ chromeField ].lower() + if not chrom.startswith("chr"): + chrom = "chr" + chrom + chrom = chrom.upper().replace("CHR", "chr") + #segFile.write( "%s\t%s\t%s\t%s\t.\t%s\n" % ( curName, chrom, int(value[ startField ])+1, value[ endField ], value[ valField ] ) ) + segFile.write( "%s\t%s\t%s\t%s\t%s\n" % ( chrom, value[ startField ], value[ endField ], curName, value[ valField ] ) ) + except KeyError: + self.addError( "TargetInfo Not Found: %s" % (key)) + + segFile.close() + + self.emitFile("." + base, self.getMeta(self.config.name + "." 
+ base), "%s/%s_segment.out" % (self.work_dir, base)) + + +class HmiRNAImport(TCGAMatrixImport): + dataSubType = 'miRNAExp' + probeMap = 'agilentHumanMiRNA' + sampleMap = 'tcga.iddag' + dataType = 'genomicMatrix' + probeFields = ['unc_DWD_Batch_adjusted'] + +class CGH244AImport(TCGASegmentImport): + dataSubType = 'cna' + sampleMap = 'tcga.iddag' + dataType = 'genomicSegment' + probeFields = ['Segment_Mean'] + +class CGH415K_G4124A(TCGASegmentImport): + dataSubType = 'cna' + sampleMap = 'tcga.iddag' + chromeField = 'Chromosome' + dataType = 'genomicSegment' + endField = 'End' + probeFields = ['Segment_Mean'] + startField = 'Start' + + +class IlluminaHiSeq_DNASeqC(TCGASegmentImport): + dataSubType = 'cna' + sampleMap = 'tcga.iddag' + chromeField = 'Chromosome' + dataType = 'genomicSegment' + endField = 'End' + probeFields = ['Segment_Mean'] + startField = 'Start' + + def translateUUID(self, uuid): + out = self.config.translateUUID(uuid) + #censor out normal ids + if re.search(r'^TCGA-..-....-1', out): + return None + return out + +class HT_HGU133A(TCGAMatrixImport): + dataSubType = 'geneExp' + probeMap = 'affyU133a' + sampleMap = 'tcga.iddag' + dataType = 'genomicMatrix' + probeFields = ['Signal'] + +class HuEx1_0stv2(TCGAMatrixImport): + dataSubType = 'miRNAExp' + probeMap = 'hugo' + sampleMap = 'tcga.iddag' + dataType = 'genomicMatrix' + probeFields = ['Signal'] + fileInclude = '^.*gene.txt$|^.*sdrf.txt$' + +class Human1MDuoImport(TCGASegmentImport): + dataSubType = 'cna' + sampleMap = 'tcga.iddag' + dataType = 'genomicSegment' + probeFields = ['mean'] + +class HumanHap550(TCGASegmentImport): + dataSubType = 'cna' + sampleMap = 'tcga.iddag' + dataType = 'genomicSegment' + probeFields = ['mean'] + +class HumanMethylation27(TCGAMatrixImport): + dataSubType = 'DNAMethylation' + probeMap= 'illuminaMethyl27K_gpl8490' + sampleMap= 'tcga.iddag' + dataType= 'genomicMatrix' + fileExclude= '.*.adf.txt' + probeFields = ['Beta_Value', 'Beta_value'] + + +class HumanMethylation450(TCGAMatrixImport): + dataSubType = 'DNAMethylation' + probeMap = 'illuminaHumanMethylation450' + sampleMap = 'tcga.iddag' + dataType = 'genomicMatrix' + fileExclude = '.*.adf.txt' + probeFields = ['Beta_value', 'Beta_Value'] + + def fileScan(self, path): + """ + This function takes a TCGA level 3 genetic file (file name and input handle), + and tries to extract probe levels or target mappings (experimental ID to TCGA barcode) + it emits these values to a handle, using the 'targets' and 'probes' string to identify + the type of data being emited + """ + iHandle = open(path) + mode = None + #modes + #1 - two col header matrix file + target = None + colName = None + colType = None + for line in iHandle: + if colName is None: + colName = line.rstrip().split("\t") + if colName[0] == "Hybridization REF" or colName[0] == "Sample REF": + mode=1 + for i in range(len(colName)): + if commonMap.has_key( colName[i] ): + colName[i] = commonMap[ colName[i] ] + elif mode==1 and colType is None: + colType=line.rstrip().split("\t") + for i in range(len(colType)): + if commonMap.has_key( colType[i] ): + colType[i] = commonMap[ colType[i] ] + else: + tmp = line.rstrip().split("\t") + if mode == 1: + out={} + for col in colName[1:]: + out[ col ] = { "target" : col } + for i in range(1,len(colType)): + try: + if colType[i] in self.probeFields: + out[ colName[i] ][ colType[i] ] = "%.4f" % float(tmp[i]) + except IndexError: + out[ colName[i] ][ colType[i] ] = "NA" + except ValueError: + out[ colName[i] ][ colType[i] ] = "NA" + for col in out: + 
self.emit( tmp[0], out[col], "probes" ) + +class Illumina_RNASeq(TCGAMatrixImport): + sampleMap= 'tcga.iddag' + dataSubType= 'geneExp' + fileInclude= r'^.*\.gene.quantification.txt$|^.*sdrf.txt$' + probeFields = ['RPKM'] + probeMap= 'hugo.unc' + +class Illumina_RNASeqV2(TCGAMatrixImport): + sampleMap= 'tcga.iddag' + dataSubType= 'geneExp' + fileInclude= r'^.*rsem.genes.normalized_results$|^.*sdrf.txt$' + probeFields = ['normalized_count'] + probeMap= 'hugo.unc' + +class IlluminaHiSeq_RNASeq(TCGAMatrixImport): + sampleMap= 'tcga.iddag' + dataSubType= 'geneExp' + fileInclude= r'^.*gene.quantification.txt$' + probeFields = ['RPKM'] + probeMap= 'hugo.unc' + +class MDA_RPPA_Core(TCGAMatrixImport): + sampleMap = 'tcga.iddag' + probeMap = "md_anderson_antibodies" + dataSubType = "RPPA" + fileExclude = r'^.*.antibody_annotation.txt' + probeFields = [ 'Protein Expression' ] + + def getTargetMap(self): + subprocess.call("sort -k 1 %s/targets > %s/targets.sort" % (self.work_dir, self.work_dir), shell=True) + handle = TableReader(self.work_dir + "/targets.sort") + tTrans = {} + for key, value in handle: + value = re.sub(r'\.SD', '', value) + tTrans[ key ] = value + return tTrans + + +class Illumina_miRNASeq(TCGAMatrixImport): + sampleMap= 'tcga.iddag' + dataSubType= 'miRNA' + fileInclude= '^.*.mirna.quantification.txt$' + probeFields = ['reads_per_million_miRNA_mapped'] + probeMap= 'hsa.mirna' + + +class bioImport(TCGAClinicalImport): + sampleMap = 'tcga.iddag' + fileInclude = '.*.xml$' + +tcgaConfig = { + 'AgilentG4502A_07' : AgilentImport, + 'AgilentG4502A_07_1' : AgilentImport, + 'AgilentG4502A_07_2' : AgilentImport, + 'AgilentG4502A_07_3': AgilentImport, + 'CGH-1x1M_G4447A': CGH1x1mImport, + 'Genome_Wide_SNP_6': SNP6Import, + 'H-miRNA_8x15K': HmiRNAImport, + 'H-miRNA_8x15Kv2': HmiRNAImport, + 'HG-CGH-244A': CGH244AImport, + 'HG-CGH-415K_G4124A': CGH415K_G4124A, + 'HT_HG-U133A': HT_HGU133A, + 'HuEx-1_0-st-v2': HuEx1_0stv2, + 'Human1MDuo': Human1MDuoImport, + 'HumanHap550': HumanHap550, + 'IlluminaHiSeq_DNASeqC' : IlluminaHiSeq_DNASeqC, + 'HumanMethylation27': HumanMethylation27, + 'HumanMethylation450': HumanMethylation450, + 'IlluminaHiSeq_RNASeq': IlluminaHiSeq_RNASeq, + 'IlluminaGA_RNASeq' : Illumina_RNASeq, + 'IlluminaHiSeq_RNASeqV2' : Illumina_RNASeqV2, + 'MDA_RPPA_Core' : MDA_RPPA_Core, + 'IlluminaGA_miRNASeq' : Illumina_miRNASeq, + 'IlluminaHiSeq_miRNASeq' : Illumina_miRNASeq, + 'bio' : bioImport +} + +def fileDigest( file ): + md5 = hashlib.md5() + with open(file,'rb') as f: + for chunk in iter(lambda: f.read(8192), ''): + md5.update(chunk) + return md5.hexdigest() + + +def platform_list(): + q = CustomQuery("Platform") + for e in q: + yield e['name'] + +def supported_list(): + q = CustomQuery("Platform") + for e in q: + if e['name'] in tcgaConfig: + yield e['name'] + +def platform_archives(platform): + q = CustomQuery("Archive[Platform[@name=%s]][@isLatest=1]" % platform) + out = {} + for e in q: + name = e['baseName'] + if name not in out: + yield name + out[name] = True + + +if __name__ == "__main__": + + parser = ArgumentParser() + #Stack.addJobTreeOptions(parser) + + parser.add_argument("-a", "--platform-list", dest="platform_list", action="store_true", help="Get list of platforms", default=False) + parser.add_argument("-u", "--uuid", dest="uuid_table", help="UUID to Barcode Table", default=None) + parser.add_argument("-t", "--uuid-download", dest="uuid_download", help="Download UUID/Barcode Table", default=False) + parser.add_argument("-z", "--all-archives", dest="all_archives", 
action="store_true", help="List all archives", default=False) + parser.add_argument("-p", "--platform", dest="platform", help="Platform Selection", default=None) + parser.add_argument("-l", "--supported", dest="supported_list", action="store_true", help="List Supported Platforms", default=None) + parser.add_argument("-f", "--filelist", dest="filelist", help="List files needed to convert TCGA project basename into cgData", default=None) + parser.add_argument("-b", "--basename", dest="basename", help="Convert TCGA project basename into cgData", default=None) + parser.add_argument("-m", "--mirror", dest="mirror", help="Mirror Location", default=None) + parser.add_argument("-w", "--workdir", dest="workdir_base", help="Working directory", default="/tmp") + parser.add_argument("--out-dir", dest="outdir", help="Working directory", default="./") + parser.add_argument("-o", "--out", dest="outpath", help="Output Dest", default=None) + parser.add_argument("--out-error", dest="errorpath", help="Output Error", default=None) + parser.add_argument("--out-meta", dest="metapath", help="Output Meta", default=None) + parser.add_argument("-c", "--cancer", dest="cancer", help="List Archives by cancer type", default=None) + parser.add_argument("-d", "--download", dest="download", help="Download files for archive", default=None) + parser.add_argument("-e", "--level", dest="level", help="Data Level ", default="3") + parser.add_argument("-s", "--check-sum", dest="checksum", help="Check project md5", default=None) + parser.add_argument("-r", "--sanitize", dest="sanitize", action="store_true", help="Remove race/ethnicity from clinical data", default=False) + parser.add_argument("-x", "--clinical", dest="clinical", help="Process clinical info", default=None) + parser.add_argument("--clinical-basename", dest="clinical_basename", help="Select Clinical Data by basename", default=None) + parser.add_argument("--clinical-type", dest="clinical_type", help="Clinical Data Type", default=None) + parser.add_argument("--all-clinical", dest="all_clinical", action="store_true", help="List all clinical archives", default=False) + parser.add_argument("--out-clinical", dest="out_clinical", action="append", nargs=3, default=[]) + parser.add_argument("--samples", dest="get_samples", action="store_true", default=False) + + options = parser.parse_args() + + if options.uuid_download: + url="https://tcga-data.nci.nih.gov/uuid/uuidBrowserExport.htm" + data = {} + data['exportType'] = 'tab' + data['cols'] = "uuid,barcode" + urllib.urlretrieve( url, options.uuid_download, data=urllib.urlencode(data)) + + if options.platform_list: + for e in platform_list(): + print e + + if options.supported_list: + for e in supported_list(): + print e + + if options.platform: + for name in platform_archives( options.platform ): + print name + + if options.all_archives: + q = CustomQuery("Archive[@isLatest=1][ArchiveType[@type=Level_%s]]" % (options.level)) + out = {} + for e in q: + name = e['baseName'] + if name not in out: + print name + out[name] = True + + if options.all_clinical: + q = CustomQuery("Archive[@isLatest=1][Platform[@alias=bio]]") + out = {} + for e in q: + name = e['baseName'] + if name not in out: + print name + out[name] = True + + if options.get_samples: + url="https://tcga-data.nci.nih.gov/datareports/aliquotExport.htm" + data = {} + + data['exportType'] = 'tab' + data['cols'] = 'aliquotId,disease,bcrBatch,center,platform,levelOne,levelTwo,levelThree' + data['filterReq'] = 
json.dumps({"disease":"","levelOne":"","aliquotId":"","center":"","levelTwo":"","bcrBatch":"","platform":"","levelThree":""}) + data['formFilter'] = json.dumps({"disease":"","levelOne":"","aliquotId":"","center":"","levelTwo":"","bcrBatch":"","platform":"","levelThree":""}) + handle = urllib.urlopen( url + "?" + urllib.urlencode(data)) + + for line in handle: + tmp = line.rstrip().split("\t") + if tmp[7] == "Submitted": + if tmp[0][13]=='0': + print "\t".join( [ tmp[0], tmp[1], "Tumor", tmp[4] ] ) + elif tmp[0][13] == '1': + print "\t".join( [ tmp[0], tmp[1], "Normal", tmp[4] ] ) + + + if options.cancer is not None: + q = CustomQuery("Archive[@isLatest=1][Disease[@abbreviation=%s]][ArchiveType[@type=Level_%s]]" % (options.cancer, options.level)) + out = {} + for e in q: + name = e['baseName'] + if name not in out: + print name + out[name] = True + + if options.filelist: + q = CustomQuery("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=Level_%s]]" % (options.filelist, options.level)) + for e in q: + print e['deployLocation'] + q = CustomQuery("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=mage-tab]]" % (options.filelist)) + for e in q: + print e['deployLocation'] + + if options.checksum: + urls = [] + q = CustomQuery("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=Level_%s]]" % (options.checksum, options.level)) + for e in q: + urls.append( e['deployLocation'] ) + q = CustomQuery("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=mage-tab]]" % (options.checksum)) + for e in q: + urls.append( e['deployLocation'] ) + + for url in urls: + dst = os.path.join(options.mirror, re.sub("^/", "", url)) + if not os.path.exists( dst ): + print "NOT_FOUND:", dst + continue + if not os.path.exists( dst + ".md5" ): + print "MD5_NOT_FOUND", dst + continue + + handle = open( dst + ".md5" ) + line = handle.readline() + omd5 = line.split(' ')[0] + handle.close() + + nmd5 = fileDigest( dst ) + if omd5 != nmd5: + print "CORRUPT:", dst + else: + print "OK:", dst + + + if options.download is not None: + if options.mirror is None: + print "Define mirror location" + sys.exit(1) + + urls = [] + + if options.basename is None and options.clinical is None and options.clinical_basename is None: + q = CustomQuery("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=Level_%s]]" % (options.download, options.level)) + for e in q: + urls.append( e['deployLocation'] ) + urls.append( e['deployLocation'] + ".md5" ) + + q = CustomQuery("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=mage-tab]]" % (options.download)) + for e in q: + urls.append( e['deployLocation'] ) + urls.append( e['deployLocation'] + ".md5" ) + + if options.basename: + q = CustomQuery("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=Level_%s]]" % (options.basename, options.level)) + for e in q: + urls.append( e['deployLocation'] ) + urls.append( e['deployLocation'] + ".md5" ) + + q = CustomQuery("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=mage-tab]]" % (options.basename)) + for e in q: + urls.append( e['deployLocation'] ) + urls.append( e['deployLocation'] + ".md5" ) + + if options.clinical: + q = CustomQuery("Archive[@isLatest=1][Disease[@abbreviation=%s]][Platform[@alias=bio]]" % (options.clinical)) + for e in q: + urls.append( e['deployLocation'] ) + urls.append( e['deployLocation'] + ".md5" ) + + if options.clinical_basename: + q = CustomQuery("Archive[@isLatest=1][@baseName=%s]" % (options.clinical_basename)) + for e in q: + urls.append( e['deployLocation'] ) + urls.append( e['deployLocation'] + ".md5" ) + + + + 
for url in urls: + src = "https://tcga-data.nci.nih.gov/" + url + dst = os.path.join(options.mirror, re.sub("^/", "", url)) + dir = os.path.dirname(dst) + if not os.path.exists(dir): + print "mkdir", dir + os.makedirs(dir) + if not os.path.exists( dst ): + print "download %s to %s" % (src, dst) + urllib.urlretrieve(src, dst) + + if options.basename: + if options.mirror is None: + sys.stderr.write("Need mirror location\n") + sys.exit(1) + + conf = getBaseBuildConf(options.basename, options.level, options.mirror) + conf.addOptions(options) + if conf.platform not in tcgaConfig: + sys.stderr.write("Platform %s not supported\n" % (conf.platform)) + sys.exit(1) + + ext = tcgaConfig[conf.platform](conf) + ext.run() + + + if options.clinical: + if options.mirror is None: + sys.stderr.write("Need mirror location\n") + sys.exit(1) + + q = CustomQuery("Archive[@isLatest=1][Disease[@abbreviation=%s]][Platform[@alias=bio]]" % (options.clinical)) + basenames = {} + for s in q: + basenames[s['baseName']] = True + + for base in basenames: + conf = getBaseBuildConf(base, 1, options.mirror) + conf.addOptions(options) + + ext = tcgaConfig[conf.platform](conf) + ext.run() + + if options.clinical_basename: + if options.mirror is None: + sys.stderr.write("Need mirror location\n") + sys.exit(1) + + + conf = getBaseBuildConf(options.clinical_basename, 1, options.mirror) + conf.addOptions(options) + + ext = tcgaConfig[conf.platform](conf) + ext.run() + diff -r 000000000000 -r f1c71f5363ae tcga_import/tcga_import.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tcga_import/tcga_import.xml Tue Oct 30 14:23:49 2012 -0400 @@ -0,0 +1,181 @@ + + TCGA Import + tcgaImport.py +-w ./ +#if $iparam.import_mode == "uuid": +--uuid-download ${out_uuid} +#end if +#if $iparam.import_mode == "sample_list": +--samples > ${out_sample_list} +#end if +#if $iparam.import_mode == "archive_list": +-z > ${archive_list_out} +#end if +#if $iparam.import_mode == "archive": +-m ${__tool_data_path__} +--download - +--basename ${iparam.base} +#if str($iparam.uuid_mapping) != "None": +--uuid ${iparam.uuid_mapping} +#end if +--out ${archive_out} +--out-meta ${archive_meta} +#end if +#if $iparam.import_mode == "clinical_list": +--all-clinical > ${clinical_list_out} +#end if +#if $iparam.import_mode == "clinical": +-m ${__tool_data_path__} +--download - +--clinical-basename ${iparam.clinical_archive} +--out-clinical patient ${out_patient} ${out_patient_meta} +--out-clinical sample ${out_sample} ${out_sample_meta} +--out-clinical radiation ${out_radiation} ${out_radiation_meta} +--out-clinical drug ${out_drug} ${out_drug_meta} +--out-clinical portion ${out_portion} ${out_portion_meta} +--out-clinical analyte ${out_analyte} ${out_analyte_meta} +--out-clinical aliquot ${out_aliquot} ${out_aliquot_meta} + +#end if + + + + + + + + + + + + + + + + + + + + + + + + + + + iparam['import_mode'] == "uuid" + + + iparam['import_mode'] == "sample_list" + + + iparam['import_mode'] == "archive_list" + + + iparam['import_mode'] == "archive" + + + iparam['import_mode'] == "archive" + + + + iparam['import_mode'] == "clinical_list" + + + + + iparam['import_mode'] == "clinical" + + + iparam['import_mode'] == "clinical" + + + iparam['import_mode'] == "clinical" + + + iparam['import_mode'] == "clinical" + + + iparam['import_mode'] == "clinical" + + + iparam['import_mode'] == "clinical" + + + iparam['import_mode'] == "clinical" + + + + iparam['import_mode'] == "clinical" + + + iparam['import_mode'] == "clinical" + + + iparam['import_mode'] == "clinical" + + + 
iparam['import_mode'] == "clinical"
iparam['import_mode'] == "clinical"
iparam['import_mode'] == "clinical"
iparam['import_mode'] == "clinical"
+
+TCGA Import Tool
+================
+
+An import tool for data from `The Cancer Genome Atlas <https://tcga-data.nci.nih.gov>`_.
+This tool specializes in importing Level 3, publicly accessible TCGA data, which includes
+processed genomic data and most of the clinical information. Although this data is publicly
+available, it comes in a variety of file formats that must be collated and formatted before
+analytical use. This tool reads across a family of tarballs and compiles the data into a
+single matrix. There are also import mechanisms for retrieving TCGA sample ID lists and
+UUID conversion tables.
+
+Modes:
+
+*TCGA UUID/Barcode Mapping*
+    A two-column mapping from TCGA UUIDs to barcode IDs.
+
+*TCGA Sample List*
+    A list of TCGA samples, with sample type, cancer type, and platform.
+
+*List TCGA Archives*
+    A list of all valid archives found at TCGA.
+
+*Import Archive*
+    Import a TCGA archive using a name found in the archive list. This stores a mirror
+    of the downloaded files in the Galaxy tool-data folder and produces either a matrix or a
+    BED5 file, with an associated metadata description in JSON.
+
+*List TCGA Clinical Data*
+    List all valid TCGA clinical repositories.
+
+*Clinical Import*
+    Import a TCGA clinical repository using a name found in the clinical data list. This stores
+    a mirror of the downloaded files in the Galaxy tool-data folder and produces seven matrices
+    with their associated metadata, one for each clinical level:
+
+    - patient
+    - sample
+    - radiation
+    - drug
+    - portion
+    - analyte
+    - aliquot
+
+Note: TCGA archives can be large, and this is not the most optimized piece of code in the
+world, so an import can take a while for the larger archives.
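+Example
+-------
+
+Outside of Galaxy, the converter can also be run from the command line. This is a minimal
+sketch based on the usage notes inside ``tcgaImport.py``; the archive basename, mirror path,
+and working directory are illustrative values taken from those notes, so substitute archive
+names and paths that exist on your system::
+
+    # mirror the Level 1 archives for the KIRC clinical ("bio") project, then
+    # convert them, removing race/ethnicity fields from the output (-r)
+    ./tcgaImport.py --download - -b intgen.org_KIRC_bio -m /inside/depot -e 1 -r -w tmp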