Mercurial > repos > kellrott > tcga_import
view tcga_import/tcgaImport.py @ 0:f1c71f5363ae draft default tip
Uploaded
author | kellrott |
---|---|
date | Tue, 30 Oct 2012 14:23:49 -0400 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env python """ Script to scan and extract TCGA data and compile it into the cgData Usage:: tcga2cgdata.py [options] Options:: -h, --help show this help message and exit -a, --platform-list Get list of platforms -p PLATFORM, --platform=PLATFORM Platform Selection -l, --supported List Supported Platforms -f FILELIST, --filelist=FILELIST List files needed to convert TCGA project basename into cgData -b BASENAME, --basename=BASENAME Convert TCGA project basename into cgData -m MIRROR, --mirror=MIRROR Mirror Location -w WORKDIR_BASE, --workdir=WORKDIR_BASE Working directory -o OUTDIR, --out-dir=OUTDIR Working directory -c CANCER, --cancer=CANCER List Archives by cancer type -d DOWNLOAD, --download=DOWNLOAD Download files for archive -e LEVEL, --level=LEVEL Data Level -s CHECKSUM, --check-sum=CHECKSUM Check project md5 -r, --sanitize Remove race/ethnicity from clinical data Example:: ./scripts/tcga2cgdata.py -b intgen.org_KIRC_bio -m /inside/depot -e 1 -r -w tmp """ from xml.dom.minidom import parseString import urllib import urllib2 import os import csv import sys import hashlib import tempfile import re import copy import json import datetime import hashlib import subprocess from glob import glob import shutil import subprocess from argparse import ArgumentParser """ Net query code """ class dccwsItem(object): baseURL = "http://tcga-data.nci.nih.gov/tcgadccws/GetXML?query=" def __init__(self): self.url = None def __iter__(self): next = self.url while next != None: handle = urllib.urlopen(next) data = handle.read() handle.close() dom = parseString(data) # there might not be any archives for a dataset if len(dom.getElementsByTagName('queryResponse')) > 0: response = dom.getElementsByTagName('queryResponse').pop() classList = response.getElementsByTagName('class') for cls in classList: className = cls.getAttribute("recordNumber") outData = {} #aObj = Archive() for node in cls.childNodes: nodeName = node.getAttribute("name") if node.hasAttribute("xlink:href"): outData[ nodeName ] = node.getAttribute("xlink:href") else: outData[ nodeName ] = getText( node.childNodes ) yield outData if len( dom.getElementsByTagName('next') ) > 0: nextElm = dom.getElementsByTagName('next').pop() next = nextElm.getAttribute( 'xlink:href' ) else: next = None class CustomQuery(dccwsItem): def __init__(self, query): super(CustomQuery, self).__init__() if query.startswith("http://"): self.url = query else: self.url = dccwsItem.baseURL + query def getText(nodelist): rc = [] for node in nodelist: if node.nodeType == node.TEXT_NODE: rc.append(node.data) return ''.join(rc) """ Build Configuration """ class BuildConf: def __init__(self, platform, name, version, meta, tarlist): self.platform = platform self.name = name self.version = version self.meta = meta self.tarlist = tarlist self.abbr = '' self.uuid_table = None if 'diseaseAbbr' in meta: self.abbr = meta['diseaseAbbr'] def addOptions(self, opts): self.workdir_base = opts.workdir_base self.outdir = opts.outdir self.sanitize = opts.sanitize self.outpath = opts.outpath self.metapath = opts.metapath self.errorpath = opts.errorpath self.clinical_type = opts.clinical_type self.clinical_type_map = {} for t, path, meta in opts.out_clinical: self.clinical_type_map[ "." + t] = (path, meta) if opts.uuid_table is not None: self.uuid_table = {} handle = open(opts.uuid_table) for line in handle: tmp = line.rstrip().split("\t") self.uuid_table[tmp[0]] = tmp[1] def translateUUID(self, uuid): if self.uuid_table is None or uuid not in self.uuid_table: return uuid return self.uuid_table[uuid] def getOutPath(self, name): if self.outpath is not None: return self.outpath if name in self.clinical_type_map: return self.clinical_type_map[name][0] return os.path.join(self.outdir, self.name) + name def getOutMeta(self, name): if self.outpath is not None: if self.metapath is not None: return self.metapath return self.outpath + ".json" if name in self.clinical_type_map: return self.clinical_type_map[name][1] return os.path.join(self.outdir, self.name) + name + ".json" def getOutError(self, name): if self.outpath is not None: if self.errorpath is not None: return self.errorpath return self.outpath + ".error" return os.path.join(self.outdir, self.name) + name + ".error" def getBaseBuildConf(basename, level, mirror): dates = [] print "TCGA Query for: ", basename q = CustomQuery("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=Level_%s]]" % (basename, level)) urls = {} meta = None platform = None for e in q: dates.append( datetime.datetime.strptime( e['addedDate'], "%m-%d-%Y" ) ) if meta is None: meta = {"sourceUrl" : []} for e2 in CustomQuery(e['platform']): platform = e2['name'] meta['platform'] = e2['name'] meta['platformTitle'] = e2['displayName'] for e2 in CustomQuery(e['disease']): meta['diseaseAbbr'] = e2['abbreviation'] meta['diseaseTitle'] = e2['name'] for e3 in CustomQuery(e2['tissueCollection']): meta['tissue'] = e3['name'] for e2 in CustomQuery(e['center']): meta['centerTitle'] = e2['displayName'] meta['center'] = e2['name'] meta['sourceUrl'].append( "http://tcga-data.nci.nih.gov/" + e['deployLocation'] ) urls[ mirror + e['deployLocation'] ] = platform print "TCGA Query for mage-tab: ", basename q = CustomQuery("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=mage-tab]]" % (basename)) for e in q: dates.append( datetime.datetime.strptime( e['addedDate'], "%m-%d-%Y" ) ) q2 = CustomQuery(e['platform']) platform = None for e2 in q2: print e2 platform = e2['name'] meta['sourceUrl'].append( "http://tcga-data.nci.nih.gov/" + e['deployLocation'] ) urls[ mirror + e['deployLocation'] ] = platform if len(dates) == 0: print "No Files found" return dates.sort() dates.reverse() versionDate = dates[0].strftime( "%Y-%m-%d" ) return BuildConf(platform, basename, versionDate, meta, urls) class TableReader: def __init__(self, path): self.path = path def __iter__(self): if self.path is not None and os.path.exists(self.path): handle = open(self.path) for line in handle: tmp = line.rstrip().split("\t") yield tmp[0], json.loads(tmp[1]) handle.close() class FileImporter: fileInclude = None fileExclude = None excludes = [ "MANIFEST.txt$", "CHANGES_DCC.txt$", "README_DCC.txt$", "README.txt$", "CHANGES.txt$", "DCC_ALTERED_FILES.txt$", r'.wig$', "DESCRIPTIO$" ] def __init__(self, config): self.config = config def extractTars(self): self.work_dir = tempfile.mkdtemp(dir=self.config.workdir_base) print "Extract to ", self.work_dir for path in self.config.tarlist: subprocess.check_call([ "tar", "xvzf", path, "-C", self.work_dir], stderr=sys.stdout) def run(self): self.extractTars() filterInclude = None filterExclude = None if self.fileInclude is not None: filterInclude = re.compile(self.fileInclude) if self.fileExclude is not None: filterExclude = re.compile(self.fileExclude) self.inc = 0 self.out = {} self.errors = [] self.ext_meta = {} self.scandirs(self.work_dir, filterInclude, filterExclude) for o in self.out: self.out[o].close() self.fileBuild() #shutil.rmtree(self.work_dir) def checkExclude( self, name ): for e in self.excludes: if re.search( e, name ): return True return False def scandirs(self, path, filterInclude=None, filterExclude=None): if os.path.isdir(path): for a in glob(os.path.join(path, "*")): self.scandirs(a, filterInclude, filterExclude) else: name = os.path.basename(path) if self.isMage(path): self.mageScan(path) else: if not self.checkExclude(name): if (filterInclude is None or filterInclude.match(name)) and (filterExclude is None or not filterExclude.match(name)): self.fileScan(path) def isMage(self, path): if path.endswith( '.sdrf.txt' ) or path.endswith( '.idf.txt' ) or path.endswith("DESCRIPTION.txt"): return True def emit(self, key, data, port): if port not in self.out: self.out[port] = open(self.work_dir + "/" + port, "w") self.out[port].write( "%s\t%s\n" % (key, json.dumps(data))) def emitFile(self, name, meta, file): md5 = hashlib.md5() oHandle = open(self.config.getOutPath(name), "wb") with open(file,'rb') as f: for chunk in iter(lambda: f.read(8192), ''): md5.update(chunk) oHandle.write(chunk) oHandle.close() md5str = md5.hexdigest() meta['md5'] = md5str mHandle = open(self.config.getOutMeta(name), "w") mHandle.write( json.dumps(meta)) mHandle.close() if len(self.errors): eHandle = open( self.config.getOutError(name), "w" ) for msg in self.errors: eHandle.write( msg + "\n" ) eHandle.close() def addError(self, msg): self.errors.append(msg) commonMap = { "mean" : "seg.mean", "Segment_Mean" : "seg.mean", "Start" : "loc.start", "End" : "loc.end", "Chromosome" : "chrom" } idfMap = { "Investigation Title" : "title", "Experiment Description" : "experimentalDescription", "Person Affiliation" : "dataProducer", "Date of Experiment" : "experimentalDate" } class TCGAGeneticImport(FileImporter): def mageScan(self, path): if path.endswith(".sdrf.txt"): iHandle = open(path, "rU") read = csv.reader( iHandle, delimiter="\t" ) colNum = None for row in read: if colNum is None: colNum = {} for i in range(len(row)): colNum[ row[i] ] = i else: if not colNum.has_key("Material Type") or ( not row[ colNum[ "Material Type" ] ] in [ "genomic_DNA", "total_RNA", "MDA cell line" ] ): try: if colNum.has_key( "Derived Array Data File" ): self.emit( row[ colNum[ "Derived Array Data File" ] ].split('.')[0], row[ colNum[ "Extract Name" ] ], "targets" ) self.emit( row[ colNum[ "Derived Array Data File" ] ], row[ colNum[ "Extract Name" ] ], "targets" ) if colNum.has_key("Derived Array Data Matrix File" ): self.emit( row[ colNum[ "Derived Array Data Matrix File" ] ], row[ colNum[ "Extract Name" ] ], "targets" ) if colNum.has_key( "Derived Data File"): self.emit( row[ colNum[ "Derived Data File" ] ].split('.')[0], row[ colNum[ "Extract Name" ] ], "targets" ) self.emit( row[ colNum[ "Derived Data File" ] ], row[ colNum[ "Extract Name" ] ], "targets" ) if colNum.has_key( "Hybridization Name" ): self.emit( row[ colNum[ "Hybridization Name" ] ] , row[ colNum[ "Extract Name" ] ], "targets" ) if colNum.has_key( "Sample Name" ): self.emit( row[ colNum[ "Sample Name" ] ] , row[ colNum[ "Extract Name" ] ], "targets" ) self.emit( row[ colNum[ "Extract Name" ] ] , row[ colNum[ "Extract Name" ] ], "targets" ) except IndexError: pass #there can be blank lines in the SDRF if path.endswith(".idf.txt"): iHandle = open(path) for line in iHandle: row = line.split("\t") if len(row): if row[0] in idfMap: self.ext_meta[ idfMap[row[0]] ] = row[1] iHandle.close() if path.endswith("DESCRIPTION.txt"): handle = open(path) self.ext_meta['description'] = handle.read() handle.close() def translateUUID(self, uuid): return self.config.translateUUID(uuid) def getTargetMap(self): subprocess.call("sort -k 1 %s/targets > %s/targets.sort" % (self.work_dir, self.work_dir), shell=True) handle = TableReader(self.work_dir + "/targets.sort") tTrans = {} for key, value in handle: tTrans[ key ] = value return tTrans def fileScan(self, path): """ This function takes a TCGA level 3 genetic file (file name and input handle), and tries to extract probe levels or target mappings (experimental ID to TCGA barcode) it emits these values to a handle, using the 'targets' and 'probes' string to identify the type of data being emited """ iHandle = open(path) mode = None #modes #1 - segmentFile - one sample per file/no sample info inside file #2 - two col header matrix file #3 - segmentFile - sample information inside file target = None colName = None colType = None for line in iHandle: if colName is None: colName = line.rstrip().split("\t") if colName[0] == "Hybridization REF" or colName[0] == "Sample REF": mode=2 elif colName[0] == "Chromosome" or colName[0] == "chromosome": mode=1 target=os.path.basename( path ).split('.')[0] #seg files are named by the filename before the '.' extention elif colName[1] == "chrom": mode = 3 target=os.path.basename( path ).split('.')[0] #seg files are named by the filename before the '.' extention for i in range(len(colName)): if commonMap.has_key( colName[i] ): colName[i] = commonMap[ colName[i] ] elif mode==2 and colType is None: colType=line.rstrip().split("\t") for i in range(len(colType)): if commonMap.has_key( colType[i] ): colType[i] = commonMap[ colType[i] ] else: tmp = line.rstrip().split("\t") if mode == 2: out={} for col in colName[1:]: out[ col ] = { "target" : col } for i in range(1,len(colType)): try: if colType[i] in self.probeFields: out[ colName[i] ][ colType[i] ] = tmp[i] except IndexError: out[ colName[i] ][ colType[i] ] = "NA" for col in out: self.emit( tmp[0], out[col], "probes" ) else: out = {} for i in range(len(colName)): out[ colName[i] ] = tmp[i] out['file'] = os.path.basename(path) if mode==1: self.emit( target, out, "segments" ) elif mode == 3: self.emit( tmp[0], out, "segments" ) else: self.emit( tmp[0], out, "probes" ) class TCGASegmentImport(TCGAGeneticImport): def fileScan(self, path): """ This function takes a TCGA level 3 genetic file (file name and input handle), and tries to extract probe levels or target mappings (experimental ID to TCGA barcode) it emits these values to a handle, using the 'targets' and 'probes' string to identify the type of data being emited """ iHandle = open(path) mode = None #modes #1 - segmentFile - one sample per file/no sample info inside file #2 - segmentFile - sample information inside file target = None colName = None colType = None for line in iHandle: if colName is None: colName = line.rstrip().split("\t") if colName[0] == "Chromosome" or colName[0] == "chromosome": mode=1 target=os.path.basename( path ).split('.')[0] #seg files are named by the filename before the '.' extention elif colName[1] == "chrom": mode = 2 for i in range(len(colName)): if commonMap.has_key( colName[i] ): colName[i] = commonMap[ colName[i] ] else: tmp = line.rstrip().split("\t") out = {} for i in range(len(colName)): out[ colName[i] ] = tmp[i] out['file'] = os.path.basename(path) if mode==1: self.emit( target, out, "segments" ) elif mode == 2: self.emit( tmp[0], out, "segments" ) def getMeta(self, name): matrixInfo = { '@context' : "http://purl.org/cgdata/", '@type' : 'bed5', '@id' : name, "lastModified" : self.config.version, 'rowKeySrc' : { '@type' : 'idDAG', '@id' : "tcga.%s" % (self.config.abbr) }, 'dataSubType' : { "@id" : self.dataSubType }, 'dataProducer' : 'TCGA Import', "accessMap" : "public", "redistribution" : "yes" } matrixInfo.update(self.ext_meta) matrixInfo.update(self.config.meta) return matrixInfo def fileBuild(self): #use the target table to create a name translation table #also setup target name enumeration, so they will have columns #numbers tTrans = self.getTargetMap() subprocess.call("sort -k 1 %s/segments > %s/segments.sort" % (self.work_dir, self.work_dir), shell=True) sHandle = TableReader(self.work_dir + "/segments.sort") segFile = None curName = None curData = {} missingCount = 0 startField = "loc.start" endField = "loc.end" valField = "seg.mean" chromeField = "chrom" segFile = None for key, value in sHandle: if segFile is None: segFile = open("%s/segment_file" % (self.work_dir), "w") try: curName = self.translateUUID(tTrans[key]) # "-".join( tTrans[ key ].split('-')[0:4] ) if curName is not None: try: chrom = value[ chromeField ].lower() if not chrom.startswith("chr"): chrom = "chr" + chrom chrom = chrom.upper().replace("CHR", "chr") #segFile.write( "%s\t%s\t%s\t%s\t.\t%s\n" % ( curName, chrom, int(value[ startField ])+1, value[ endField ], value[ valField ] ) ) segFile.write( "%s\t%s\t%s\t%s\t%s\n" % ( chrom, value[ startField ], value[ endField ], curName, value[ valField ] ) ) except KeyError: self.addError( "Field error: %s" % (str(value))) except KeyError: self.addError( "TargetInfo Not Found: %s" % (key)) segFile.close() matrixName = self.config.name self.emitFile( "", self.getMeta(matrixName), "%s/segment_file" % (self.work_dir) ) class TCGAMatrixImport(TCGAGeneticImport): def getMeta(self, name): matrixInfo = { "@context" : 'http://purl.org/cgdata/', '@type' : 'genomicMatrix', '@id' : name, "lastModified" : self.config.version, 'dataSubType' : { "@id" : self.dataSubType }, 'dataProducer' : 'TCGA', "accessMap" : "public", "redistribution" : "yes", 'rowKeySrc' : { "@type" : "probe", "@id" : self.probeMap }, 'columnKeySrc' : { "@type" : "idDAG", "@id" : "tcga.%s" % (self.config.abbr) } } matrixInfo.update(self.ext_meta) matrixInfo.update(self.config.meta) return matrixInfo def fileBuild(self): #use the target table to create a name translation table #also setup target name enumeration, so they will have columns #numbers subprocess.call("sort -k 1 %s/probes > %s/probes.sort" % (self.work_dir, self.work_dir), shell=True) subprocess.call("sort -k 1 %s/targets > %s/targets.sort" % (self.work_dir, self.work_dir), shell=True) handles = {} handles[ "geneticExtract:targets" ] = TableReader(self.work_dir + "/targets.sort") handles[ "geneticExtract:probes" ] = TableReader(self.work_dir + "/probes.sort") tTrans = self.getTargetMap() tEnum = {} for t in tTrans: tlabel = self.translateUUID(tTrans[t]) if tlabel is not None and tlabel not in tEnum: tEnum[tlabel] = len(tEnum) matrixFile = None segFile = None curName = None curData = {} missingCount = 0 rowCount = 0 pHandle = handles["geneticExtract:probes"] for key, value in pHandle: if matrixFile is None: matrixFile = open("%s/matrix_file" % (self.work_dir), "w" ) out = ["NA"] * len(tEnum) for target in tEnum: out[ tEnum[ target ] ] = target matrixFile.write( "%s\t%s\n" % ( "#probe", "\t".join( out ) ) ) if curName != key: if curName is not None: out = ["NA"] * len(tEnum) for target in curData: try: ttarget = self.translateUUID(tTrans[target]) if ttarget is not None: out[ tEnum[ ttarget ] ] = str( curData[ target ] ) except KeyError: self.addError( "TargetInfo Not Found: %s" % (target)) if out.count("NA") != len(tEnum): rowCount += 1 matrixFile.write( "%s\t%s\n" % ( curName, "\t".join( out ) ) ) curName = key curData = {} if "target" in value: for probeField in self.probeFields: if probeField in value: curData[ value[ "target" ] ] = value[ probeField ] elif "file" in value: for probeField in self.probeFields: if probeField in value: curData[ value[ "file" ] ] = value[ probeField ] matrixFile.close() matrixName = self.config.name if rowCount > 0: self.emitFile( "", self.getMeta(matrixName), "%s/matrix_file" % (self.work_dir) ) adminNS = "http://tcga.nci/bcr/xml/administration/2.3" class TCGAClinicalImport(FileImporter): def fileScan(self, path): handle = open(path) data = handle.read() handle.close() xml=parseString(data) self.parseXMLFile(xml) def getText(self, nodelist): rc = [] for node in nodelist: if node.nodeType == node.TEXT_NODE: rc.append(node.data) return ''.join(rc) def parseXMLFile(self, dom): admin = {} for node in dom.getElementsByTagNameNS( adminNS, "admin"): for cNode in node.childNodes: if cNode.nodeType == cNode.ELEMENT_NODE: admin[ cNode.localName ] = {} admin[ cNode.localName ]['value'] = getText( cNode.childNodes ) name = None patient = {} patientName = None for node in dom.childNodes[0].childNodes: if node.nodeType == node.ELEMENT_NODE: if node.localName == 'patient': for elm in node.childNodes: if elm.nodeType == elm.ELEMENT_NODE: if ( elm.localName == 'bcr_patient_barcode' ): name = getText( elm.childNodes ) patientName = name if ( elm.getAttribute( 'procurement_status' ) == "Completed" ): patient[ elm.localName ] = {} patient[ elm.localName ]['value'] = getText( elm.childNodes ) patient[ elm.localName ]['tier'] = elm.getAttribute( 'tier' ) patient[ elm.localName ]['precision'] = elm.getAttribute( 'precision' ) if elm.prefix == "auxiliary": for aux in elm.childNodes: if aux.nodeType == aux.ELEMENT_NODE: for auxval in aux.childNodes: if auxval.nodeType == auxval.ELEMENT_NODE: patient[ auxval.localName ] = {} patient[ auxval.localName ]['value'] = getText( auxval.childNodes ) patient[ auxval.localName ]['tier'] = auxval.getAttribute( 'tier' ) patient[ auxval.localName ]['precision'] = auxval.getAttribute( 'precision' ) if name is not None: for key in admin: patient[ key ] = admin[ key ] self.emit( name, patient, "patient" ) for node in dom.childNodes[0].childNodes: if node.nodeType == node.ELEMENT_NODE and node.localName == 'patient': for samples in node.childNodes: if samples.nodeType == samples.ELEMENT_NODE and samples.localName == 'samples': for sample in samples.childNodes: if sample.nodeType == samples.ELEMENT_NODE and sample.localName == 'sample': sampleData = {} for value in sample.childNodes: if value.nodeType == value.ELEMENT_NODE: if value.localName == 'bcr_sample_barcode' : name = getText( value.childNodes ) if value.getAttribute( 'procurement_status' ) == "Completed" : sampleData[ value.localName ] = {} sampleData[ value.localName ]['value'] = getText( value.childNodes ) if value.localName == 'portions' : for portions in value.childNodes: if portions.nodeType == value.ELEMENT_NODE and portions.localName == "portion": portionName = None portionData = {} for portion in portions.childNodes: if portion.nodeType == value.ELEMENT_NODE: if portion.localName == "analytes": for analytes in portion.childNodes: if analytes.nodeType == analytes.ELEMENT_NODE and analytes.localName =="analyte": analyteName = None analyteData = {} for analyte in analytes.childNodes: if analyte.nodeType == value.ELEMENT_NODE: if analyte.localName == "aliquots": for aliquots in analyte.childNodes: if aliquots.nodeType == aliquots.ELEMENT_NODE and aliquots.localName =="aliquot": aliquotName = None aliquotData = {} for aliquot in aliquots.childNodes: if aliquot.nodeType == value.ELEMENT_NODE: if aliquot.localName == "bcr_aliquot_barcode": aliquotName = getText(aliquot.childNodes) if aliquot.getAttribute( 'procurement_status' ) == "Completed" : aliquotData[ aliquot.localName ] = {} aliquotData[ aliquot.localName ]['value'] = getText( aliquot.childNodes ) if aliquotName is not None and len(aliquotData): self.emit( aliquotName, aliquotData, 'aliquot' ) if analyte.localName == "bcr_analyte_barcode": analyteName = getText(analyte.childNodes) if analyte.getAttribute( 'procurement_status' ) == "Completed" : analyteData[ analyte.localName ] = {} analyteData[ analyte.localName ]['value'] = getText( analyte.childNodes ) if analyteName is not None and len(analyteData): self.emit( analyteName, analyteData, 'analyte' ) if portion.localName == "bcr_portion_barcode": portionName = getText( portion.childNodes ) if portion.getAttribute( 'procurement_status' ) == "Completed" : portionData[ portion.localName ] = {} portionData[ portion.localName ]['value'] = getText( portion.childNodes ) if portionName is not None and len(portionData): self.emit( portionName, portionData, 'portion' ) #patientName = re.sub( r'\-...$', "", name ) self.emit( name, sampleData, "sample" ) self.emit( name, patient, "sample") elif samples.nodeType == samples.ELEMENT_NODE and samples.localName == 'drugs': for drug in samples.childNodes: if drug.nodeType == samples.ELEMENT_NODE and drug.localName == 'drug': drugData = {} for value in drug.childNodes: if value.nodeType == value.ELEMENT_NODE: if value.localName == 'bcr_drug_barcode' : name = getText( value.childNodes ) if value.getAttribute( 'procurement_status' ) == "Completed" : drugData[ value.localName ] = {} drugData[ value.localName ]['value'] = getText( value.childNodes ) #patientName = re.sub( r'\-...$', "", name ) self.emit( patientName, drugData, "drug" ) elif samples.nodeType == samples.ELEMENT_NODE and samples.localName == 'radiations': for rad in samples.childNodes: if rad.nodeType == samples.ELEMENT_NODE and rad.localName == 'radiation': radData = {} for value in rad.childNodes: if value.nodeType == value.ELEMENT_NODE: if value.localName == 'bcr_radiation_barcode' : name = getText( value.childNodes ) if value.getAttribute( 'procurement_status' ) == "Completed" : radData[ value.localName ] = {} radData[ value.localName ]['value'] = getText( value.childNodes ) #patientName = re.sub( r'\-...$', "", name ) self.emit( patientName, radData, "radiation" ) def getMeta(self, name): fileInfo = { "@context" : "http://purl.org/cgdata/", "@type" : "clinicalMatrix", "@id" : name, "lastModified" : self.config.version, 'dataSubType' : { "@id" : "clinical" }, "rowKeySrc" : { "@type" : "idDAG", "@id" : "tcga.%s" % (self.config.abbr) } } fileInfo.update(self.ext_meta) fileInfo.update(self.config.meta) return fileInfo def fileBuild(self): matrixList = [ "patient", "sample", "radiation", "drug", "portion", "analyte", "aliquot" ] if self.config.clinical_type is not None: matrixList = [ self.config.clinical_type ] for matrixName in matrixList: if os.path.exists( "%s/%s" % (self.work_dir, matrixName)): subprocess.call("cat %s/%s | sort -k 1 > %s/%s.sort" % (self.work_dir, matrixName, self.work_dir, matrixName), shell=True) handle = TableReader(self.work_dir + "/" + matrixName + ".sort") matrix = {} colEnum = {} for key, value in handle: if key not in matrix: matrix[key] = {} for col in value: matrix[key][col] = value[col] if col not in colEnum: if not self.config.sanitize or col not in [ 'race', 'ethnicity' ]: colEnum[col] = len(colEnum) handle = open( os.path.join(self.work_dir, matrixName + "_file"), "w") cols = [None] * (len(colEnum)) for col in colEnum: cols[colEnum[col]] = col handle.write("sample\t%s\n" % ("\t".join(cols))) for key in matrix: cols = [""] * (len(colEnum)) for col in colEnum: if col in matrix[key]: cols[colEnum[col]] = matrix[key][col]['value'] handle.write("%s\t%s\n" % (key, "\t".join(cols).encode("ASCII", "replace"))) handle.close() self.emitFile( "." + matrixName, self.getMeta(self.config.name + "." + matrixName), "%s/%s_file" % (self.work_dir, matrixName)) class AgilentImport(TCGAMatrixImport): dataSubType = 'geneExp' probeMap = 'hugo' sampleMap = 'tcga.iddag' dataType = 'genomicMatrix' probeFields = ['log2 lowess normalized (cy5/cy3) collapsed by gene symbol'] class CGH1x1mImport(TCGASegmentImport): dataSubType = 'cna' sampleMap = 'tcga.iddag' dataType = 'genomicSegment' probeFields = ['seg.mean'] class SNP6Import(TCGASegmentImport): assembly = 'hg19' dataSubType = 'cna' sampleMap ='tcga.iddag' dataType = 'genomicSegment' probeFields = ['seg.mean'] def fileScan(self, path): outport = None #if path.endswith(".hg18.seg.txt"): # outport = "hg18_segment" if path.endswith(".hg19.seg.txt"): outport = "hg19_segment" if outport is not None: handle = open(path) colName = None for line in handle: if colName is None: colName = line.rstrip().split("\t") for i, col in enumerate(colName): if commonMap.has_key( col ): colName[i] = commonMap[ col ] else: tmp = line.rstrip().split("\t") out = {} for i in range(1, len(colName)): out[ colName[i] ] = tmp[i] self.emit( tmp[0], out, outport ) handle.close() def fileBuild(self): tmap = self.getTargetMap() for base in ['hg19']: subprocess.call("sort -k 1 %s/%s_segment > %s/%s_segment.sort" % (self.work_dir, base, self.work_dir, base), shell=True) handle = TableReader(self.work_dir + "/%s_segment.sort" % (base)) segFile = None curName = None curData = {} missingCount = 0 startField = "loc.start" endField = "loc.end" valField = "seg.mean" chromeField = "chrom" segFile = None sHandle = handle for key, value in sHandle: if segFile is None: segFile = open("%s/%s_segment.out" % (self.work_dir, base), "w") try: curName = self.translateUUID(tmap[key]) if curName is not None: chrom = value[ chromeField ].lower() if not chrom.startswith("chr"): chrom = "chr" + chrom chrom = chrom.upper().replace("CHR", "chr") #segFile.write( "%s\t%s\t%s\t%s\t.\t%s\n" % ( curName, chrom, int(value[ startField ])+1, value[ endField ], value[ valField ] ) ) segFile.write( "%s\t%s\t%s\t%s\t%s\n" % ( chrom, value[ startField ], value[ endField ], curName, value[ valField ] ) ) except KeyError: self.addError( "TargetInfo Not Found: %s" % (key)) segFile.close() self.emitFile("." + base, self.getMeta(self.config.name + "." + base), "%s/%s_segment.out" % (self.work_dir, base)) class HmiRNAImport(TCGAMatrixImport): dataSubType = 'miRNAExp' probeMap = 'agilentHumanMiRNA' sampleMap = 'tcga.iddag' dataType = 'genomicMatrix' probeFields = ['unc_DWD_Batch_adjusted'] class CGH244AImport(TCGASegmentImport): dataSubType = 'cna' sampleMap = 'tcga.iddag' dataType = 'genomicSegment' probeFields = ['Segment_Mean'] class CGH415K_G4124A(TCGASegmentImport): dataSubType = 'cna' sampleMap = 'tcga.iddag' chromeField = 'Chromosome' dataType = 'genomicSegment' endField = 'End' probeFields = ['Segment_Mean'] startField = 'Start' class IlluminaHiSeq_DNASeqC(TCGASegmentImport): dataSubType = 'cna' sampleMap = 'tcga.iddag' chromeField = 'Chromosome' dataType = 'genomicSegment' endField = 'End' probeFields = ['Segment_Mean'] startField = 'Start' def translateUUID(self, uuid): out = self.config.translateUUID(uuid) #censor out normal ids if re.search(r'^TCGA-..-....-1', out): return None return out class HT_HGU133A(TCGAMatrixImport): dataSubType = 'geneExp' probeMap = 'affyU133a' sampleMap = 'tcga.iddag' dataType = 'genomicMatrix' probeFields = ['Signal'] class HuEx1_0stv2(TCGAMatrixImport): dataSubType = 'miRNAExp' probeMap = 'hugo' sampleMap = 'tcga.iddag' dataType = 'genomicMatrix' probeFields = ['Signal'] fileInclude = '^.*gene.txt$|^.*sdrf.txt$' class Human1MDuoImport(TCGASegmentImport): dataSubType = 'cna' sampleMap = 'tcga.iddag' dataType = 'genomicSegment' probeFields = ['mean'] class HumanHap550(TCGASegmentImport): dataSubType = 'cna' sampleMap = 'tcga.iddag' dataType = 'genomicSegment' probeFields = ['mean'] class HumanMethylation27(TCGAMatrixImport): dataSubType = 'DNAMethylation' probeMap= 'illuminaMethyl27K_gpl8490' sampleMap= 'tcga.iddag' dataType= 'genomicMatrix' fileExclude= '.*.adf.txt' probeFields = ['Beta_Value', 'Beta_value'] class HumanMethylation450(TCGAMatrixImport): dataSubType = 'DNAMethylation' probeMap = 'illuminaHumanMethylation450' sampleMap = 'tcga.iddag' dataType = 'genomicMatrix' fileExclude = '.*.adf.txt' probeFields = ['Beta_value', 'Beta_Value'] def fileScan(self, path): """ This function takes a TCGA level 3 genetic file (file name and input handle), and tries to extract probe levels or target mappings (experimental ID to TCGA barcode) it emits these values to a handle, using the 'targets' and 'probes' string to identify the type of data being emited """ iHandle = open(path) mode = None #modes #1 - two col header matrix file target = None colName = None colType = None for line in iHandle: if colName is None: colName = line.rstrip().split("\t") if colName[0] == "Hybridization REF" or colName[0] == "Sample REF": mode=1 for i in range(len(colName)): if commonMap.has_key( colName[i] ): colName[i] = commonMap[ colName[i] ] elif mode==1 and colType is None: colType=line.rstrip().split("\t") for i in range(len(colType)): if commonMap.has_key( colType[i] ): colType[i] = commonMap[ colType[i] ] else: tmp = line.rstrip().split("\t") if mode == 1: out={} for col in colName[1:]: out[ col ] = { "target" : col } for i in range(1,len(colType)): try: if colType[i] in self.probeFields: out[ colName[i] ][ colType[i] ] = "%.4f" % float(tmp[i]) except IndexError: out[ colName[i] ][ colType[i] ] = "NA" except ValueError: out[ colName[i] ][ colType[i] ] = "NA" for col in out: self.emit( tmp[0], out[col], "probes" ) class Illumina_RNASeq(TCGAMatrixImport): sampleMap= 'tcga.iddag' dataSubType= 'geneExp' fileInclude= r'^.*\.gene.quantification.txt$|^.*sdrf.txt$' probeFields = ['RPKM'] probeMap= 'hugo.unc' class Illumina_RNASeqV2(TCGAMatrixImport): sampleMap= 'tcga.iddag' dataSubType= 'geneExp' fileInclude= r'^.*rsem.genes.normalized_results$|^.*sdrf.txt$' probeFields = ['normalized_count'] probeMap= 'hugo.unc' class IlluminaHiSeq_RNASeq(TCGAMatrixImport): sampleMap= 'tcga.iddag' dataSubType= 'geneExp' fileInclude= r'^.*gene.quantification.txt$' probeFields = ['RPKM'] probeMap= 'hugo.unc' class MDA_RPPA_Core(TCGAMatrixImport): sampleMap = 'tcga.iddag' probeMap = "md_anderson_antibodies" dataSubType = "RPPA" fileExclude = r'^.*.antibody_annotation.txt' probeFields = [ 'Protein Expression' ] def getTargetMap(self): subprocess.call("sort -k 1 %s/targets > %s/targets.sort" % (self.work_dir, self.work_dir), shell=True) handle = TableReader(self.work_dir + "/targets.sort") tTrans = {} for key, value in handle: value = re.sub(r'\.SD', '', value) tTrans[ key ] = value return tTrans class Illumina_miRNASeq(TCGAMatrixImport): sampleMap= 'tcga.iddag' dataSubType= 'miRNA' fileInclude= '^.*.mirna.quantification.txt$' probeFields = ['reads_per_million_miRNA_mapped'] probeMap= 'hsa.mirna' class bioImport(TCGAClinicalImport): sampleMap = 'tcga.iddag' fileInclude = '.*.xml$' tcgaConfig = { 'AgilentG4502A_07' : AgilentImport, 'AgilentG4502A_07_1' : AgilentImport, 'AgilentG4502A_07_2' : AgilentImport, 'AgilentG4502A_07_3': AgilentImport, 'CGH-1x1M_G4447A': CGH1x1mImport, 'Genome_Wide_SNP_6': SNP6Import, 'H-miRNA_8x15K': HmiRNAImport, 'H-miRNA_8x15Kv2': HmiRNAImport, 'HG-CGH-244A': CGH244AImport, 'HG-CGH-415K_G4124A': CGH415K_G4124A, 'HT_HG-U133A': HT_HGU133A, 'HuEx-1_0-st-v2': HuEx1_0stv2, 'Human1MDuo': Human1MDuoImport, 'HumanHap550': HumanHap550, 'IlluminaHiSeq_DNASeqC' : IlluminaHiSeq_DNASeqC, 'HumanMethylation27': HumanMethylation27, 'HumanMethylation450': HumanMethylation450, 'IlluminaHiSeq_RNASeq': IlluminaHiSeq_RNASeq, 'IlluminaGA_RNASeq' : Illumina_RNASeq, 'IlluminaHiSeq_RNASeqV2' : Illumina_RNASeqV2, 'MDA_RPPA_Core' : MDA_RPPA_Core, 'IlluminaGA_miRNASeq' : Illumina_miRNASeq, 'IlluminaHiSeq_miRNASeq' : Illumina_miRNASeq, 'bio' : bioImport } def fileDigest( file ): md5 = hashlib.md5() with open(file,'rb') as f: for chunk in iter(lambda: f.read(8192), ''): md5.update(chunk) return md5.hexdigest() def platform_list(): q = CustomQuery("Platform") for e in q: yield e['name'] def supported_list(): q = CustomQuery("Platform") for e in q: if e['name'] in tcgaConfig: yield e['name'] def platform_archives(platform): q = CustomQuery("Archive[Platform[@name=%s]][@isLatest=1]" % platform) out = {} for e in q: name = e['baseName'] if name not in out: yield name out[name] = True if __name__ == "__main__": parser = ArgumentParser() #Stack.addJobTreeOptions(parser) parser.add_argument("-a", "--platform-list", dest="platform_list", action="store_true", help="Get list of platforms", default=False) parser.add_argument("-u", "--uuid", dest="uuid_table", help="UUID to Barcode Table", default=None) parser.add_argument("-t", "--uuid-download", dest="uuid_download", help="Download UUID/Barcode Table", default=False) parser.add_argument("-z", "--all-archives", dest="all_archives", action="store_true", help="List all archives", default=False) parser.add_argument("-p", "--platform", dest="platform", help="Platform Selection", default=None) parser.add_argument("-l", "--supported", dest="supported_list", action="store_true", help="List Supported Platforms", default=None) parser.add_argument("-f", "--filelist", dest="filelist", help="List files needed to convert TCGA project basename into cgData", default=None) parser.add_argument("-b", "--basename", dest="basename", help="Convert TCGA project basename into cgData", default=None) parser.add_argument("-m", "--mirror", dest="mirror", help="Mirror Location", default=None) parser.add_argument("-w", "--workdir", dest="workdir_base", help="Working directory", default="/tmp") parser.add_argument("--out-dir", dest="outdir", help="Working directory", default="./") parser.add_argument("-o", "--out", dest="outpath", help="Output Dest", default=None) parser.add_argument("--out-error", dest="errorpath", help="Output Error", default=None) parser.add_argument("--out-meta", dest="metapath", help="Output Meta", default=None) parser.add_argument("-c", "--cancer", dest="cancer", help="List Archives by cancer type", default=None) parser.add_argument("-d", "--download", dest="download", help="Download files for archive", default=None) parser.add_argument("-e", "--level", dest="level", help="Data Level ", default="3") parser.add_argument("-s", "--check-sum", dest="checksum", help="Check project md5", default=None) parser.add_argument("-r", "--sanitize", dest="sanitize", action="store_true", help="Remove race/ethnicity from clinical data", default=False) parser.add_argument("-x", "--clinical", dest="clinical", help="Process clinical info", default=None) parser.add_argument("--clinical-basename", dest="clinical_basename", help="Select Clinical Data by basename", default=None) parser.add_argument("--clinical-type", dest="clinical_type", help="Clinical Data Type", default=None) parser.add_argument("--all-clinical", dest="all_clinical", action="store_true", help="List all clinical archives", default=False) parser.add_argument("--out-clinical", dest="out_clinical", action="append", nargs=3, default=[]) parser.add_argument("--samples", dest="get_samples", action="store_true", default=False) options = parser.parse_args() if options.uuid_download: url="https://tcga-data.nci.nih.gov/uuid/uuidBrowserExport.htm" data = {} data['exportType'] = 'tab' data['cols'] = "uuid,barcode" urllib.urlretrieve( url, options.uuid_download, data=urllib.urlencode(data)) if options.platform_list: for e in platform_list(): print e if options.supported_list: for e in supported_list(): print e if options.platform: for name in platform_archives( options.platform ): print name if options.all_archives: q = CustomQuery("Archive[@isLatest=1][ArchiveType[@type=Level_%s]]" % (options.level)) out = {} for e in q: name = e['baseName'] if name not in out: print name out[name] = True if options.all_clinical: q = CustomQuery("Archive[@isLatest=1][Platform[@alias=bio]]") out = {} for e in q: name = e['baseName'] if name not in out: print name out[name] = True if options.get_samples: url="https://tcga-data.nci.nih.gov/datareports/aliquotExport.htm" data = {} data['exportType'] = 'tab' data['cols'] = 'aliquotId,disease,bcrBatch,center,platform,levelOne,levelTwo,levelThree' data['filterReq'] = json.dumps({"disease":"","levelOne":"","aliquotId":"","center":"","levelTwo":"","bcrBatch":"","platform":"","levelThree":""}) data['formFilter'] = json.dumps({"disease":"","levelOne":"","aliquotId":"","center":"","levelTwo":"","bcrBatch":"","platform":"","levelThree":""}) handle = urllib.urlopen( url + "?" + urllib.urlencode(data)) for line in handle: tmp = line.rstrip().split("\t") if tmp[7] == "Submitted": if tmp[0][13]=='0': print "\t".join( [ tmp[0], tmp[1], "Tumor", tmp[4] ] ) elif tmp[0][13] == '1': print "\t".join( [ tmp[0], tmp[1], "Normal", tmp[4] ] ) if options.cancer is not None: q = CustomQuery("Archive[@isLatest=1][Disease[@abbreviation=%s]][ArchiveType[@type=Level_%s]]" % (options.cancer, options.level)) out = {} for e in q: name = e['baseName'] if name not in out: print name out[name] = True if options.filelist: q = CustomQuery("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=Level_%s]]" % (options.filelist, options.level)) for e in q: print e['deployLocation'] q = CustomQuery("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=mage-tab]]" % (options.filelist)) for e in q: print e['deployLocation'] if options.checksum: urls = [] q = CustomQuery("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=Level_%s]]" % (options.checksum, options.level)) for e in q: urls.append( e['deployLocation'] ) q = CustomQuery("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=mage-tab]]" % (options.checksum)) for e in q: urls.append( e['deployLocation'] ) for url in urls: dst = os.path.join(options.mirror, re.sub("^/", "", url)) if not os.path.exists( dst ): print "NOT_FOUND:", dst continue if not os.path.exists( dst + ".md5" ): print "MD5_NOT_FOUND", dst continue handle = open( dst + ".md5" ) line = handle.readline() omd5 = line.split(' ')[0] handle.close() nmd5 = fileDigest( dst ) if omd5 != nmd5: print "CORRUPT:", dst else: print "OK:", dst if options.download is not None: if options.mirror is None: print "Define mirror location" sys.exit(1) urls = [] if options.basename is None and options.clinical is None and options.clinical_basename is None: q = CustomQuery("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=Level_%s]]" % (options.download, options.level)) for e in q: urls.append( e['deployLocation'] ) urls.append( e['deployLocation'] + ".md5" ) q = CustomQuery("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=mage-tab]]" % (options.download)) for e in q: urls.append( e['deployLocation'] ) urls.append( e['deployLocation'] + ".md5" ) if options.basename: q = CustomQuery("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=Level_%s]]" % (options.basename, options.level)) for e in q: urls.append( e['deployLocation'] ) urls.append( e['deployLocation'] + ".md5" ) q = CustomQuery("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=mage-tab]]" % (options.basename)) for e in q: urls.append( e['deployLocation'] ) urls.append( e['deployLocation'] + ".md5" ) if options.clinical: q = CustomQuery("Archive[@isLatest=1][Disease[@abbreviation=%s]][Platform[@alias=bio]]" % (options.clinical)) for e in q: urls.append( e['deployLocation'] ) urls.append( e['deployLocation'] + ".md5" ) if options.clinical_basename: q = CustomQuery("Archive[@isLatest=1][@baseName=%s]" % (options.clinical_basename)) for e in q: urls.append( e['deployLocation'] ) urls.append( e['deployLocation'] + ".md5" ) for url in urls: src = "https://tcga-data.nci.nih.gov/" + url dst = os.path.join(options.mirror, re.sub("^/", "", url)) dir = os.path.dirname(dst) if not os.path.exists(dir): print "mkdir", dir os.makedirs(dir) if not os.path.exists( dst ): print "download %s to %s" % (src, dst) urllib.urlretrieve(src, dst) if options.basename: if options.mirror is None: sys.stderr.write("Need mirror location\n") sys.exit(1) conf = getBaseBuildConf(options.basename, options.level, options.mirror) conf.addOptions(options) if conf.platform not in tcgaConfig: sys.stderr.write("Platform %s not supported\n" % (conf.platform)) sys.exit(1) ext = tcgaConfig[conf.platform](conf) ext.run() if options.clinical: if options.mirror is None: sys.stderr.write("Need mirror location\n") sys.exit(1) q = CustomQuery("Archive[@isLatest=1][Disease[@abbreviation=%s]][Platform[@alias=bio]]" % (options.clinical)) basenames = {} for s in q: basenames[s['baseName']] = True for base in basenames: conf = getBaseBuildConf(base, 1, options.mirror) conf.addOptions(options) ext = tcgaConfig[conf.platform](conf) ext.run() if options.clinical_basename: if options.mirror is None: sys.stderr.write("Need mirror location\n") sys.exit(1) conf = getBaseBuildConf(options.clinical_basename, 1, options.mirror) conf.addOptions(options) ext = tcgaConfig[conf.platform](conf) ext.run()