view tcga_import/tcgaImport.py @ 0:f1c71f5363ae draft default tip

Uploaded
author kellrott
date Tue, 30 Oct 2012 14:23:49 -0400
parents
children
line wrap: on
line source

#!/usr/bin/env python


"""
Script to scan and extract TCGA data and compile it into the cgData

Usage::
    
    tcga2cgdata.py [options]

Options::
    
      -h, --help            show this help message and exit
      -a, --platform-list   Get list of platforms
      -p PLATFORM, --platform=PLATFORM
                            Platform Selection
      -l, --supported       List Supported Platforms
      -f FILELIST, --filelist=FILELIST
                            List files needed to convert TCGA project basename
                            into cgData
      -b BASENAME, --basename=BASENAME
                            Convert TCGA project basename into cgData
      -m MIRROR, --mirror=MIRROR
                            Mirror Location
      -w WORKDIR_BASE, --workdir=WORKDIR_BASE
                            Working directory
      -o OUTDIR, --out-dir=OUTDIR
                            Working directory
      -c CANCER, --cancer=CANCER
                            List Archives by cancer type
      -d DOWNLOAD, --download=DOWNLOAD
                            Download files for archive
      -e LEVEL, --level=LEVEL
                            Data Level
      -s CHECKSUM, --check-sum=CHECKSUM
                            Check project md5
      -r, --sanitize        Remove race/ethnicity from clinical data


Example::
    
    ./scripts/tcga2cgdata.py -b intgen.org_KIRC_bio -m /inside/depot -e 1 -r -w tmp


"""

from xml.dom.minidom import parseString
import urllib
import urllib2
import os
import csv
import sys
import hashlib
import tempfile
import re
import copy
import json
import datetime
import hashlib
import subprocess
from glob import glob
import shutil
import subprocess
from argparse import ArgumentParser




"""

Net query code

"""

class dccwsItem(object):
    baseURL = "http://tcga-data.nci.nih.gov/tcgadccws/GetXML?query="

    def __init__(self):
        self.url = None
            
    def __iter__(self):
        next = self.url        
        while next != None:
            handle = urllib.urlopen(next)
            data = handle.read()
            handle.close()
            dom = parseString(data)
            # there might not be any archives for a dataset
            if len(dom.getElementsByTagName('queryResponse')) > 0:
                response = dom.getElementsByTagName('queryResponse').pop()
                classList = response.getElementsByTagName('class')
                for cls in classList:
                    className = cls.getAttribute("recordNumber")
                    outData = {}
                    #aObj = Archive()
                    for node in cls.childNodes:
                        nodeName = node.getAttribute("name")
                        if node.hasAttribute("xlink:href"):
                            outData[ nodeName ] = node.getAttribute("xlink:href")            
                        else:
                            outData[ nodeName ] = getText( node.childNodes )
                    yield outData
            if len( dom.getElementsByTagName('next') ) > 0:
                nextElm = dom.getElementsByTagName('next').pop()
                next = nextElm.getAttribute( 'xlink:href' )
            else:
                next = None


class CustomQuery(dccwsItem):
    def __init__(self, query):
        super(CustomQuery, self).__init__()
        if query.startswith("http://"):
            self.url = query
        else:
            self.url = dccwsItem.baseURL + query

    
def getText(nodelist):
    rc = []
    for node in nodelist:
        if node.nodeType == node.TEXT_NODE:
            rc.append(node.data)
    return ''.join(rc)
    
"""

Build Configuration

"""

class BuildConf:
    def __init__(self, platform, name, version, meta, tarlist):
        self.platform = platform
        self.name = name
        self.version = version
        self.meta = meta
        self.tarlist = tarlist
        self.abbr = ''
        self.uuid_table = None
        if 'diseaseAbbr' in meta:
            self.abbr = meta['diseaseAbbr']
    
    def addOptions(self, opts):
        self.workdir_base = opts.workdir_base
        self.outdir = opts.outdir
        self.sanitize = opts.sanitize
        self.outpath = opts.outpath
        self.metapath = opts.metapath
        self.errorpath = opts.errorpath
        self.clinical_type = opts.clinical_type

        self.clinical_type_map = {}
        for t, path, meta in opts.out_clinical:
            self.clinical_type_map[ "." + t] = (path, meta)
        
        if opts.uuid_table is not None:
            self.uuid_table = {}
            handle = open(opts.uuid_table)
            for line in handle:
                tmp = line.rstrip().split("\t")
                self.uuid_table[tmp[0]] = tmp[1]
    
    def translateUUID(self, uuid):
        if self.uuid_table is None or uuid not in self.uuid_table:
            return uuid
        return self.uuid_table[uuid]
    
    def getOutPath(self, name):
        if self.outpath is not None:
            return self.outpath
        if name in self.clinical_type_map:
            return self.clinical_type_map[name][0]
        return os.path.join(self.outdir, self.name) + name

    def getOutMeta(self, name):
        if self.outpath is not None:
            if self.metapath is not None:
                return self.metapath
            return self.outpath + ".json"
        if name in self.clinical_type_map:
            return self.clinical_type_map[name][1]
        return os.path.join(self.outdir, self.name) + name + ".json"

    def getOutError(self, name):
        if self.outpath is not None:
            if self.errorpath is not None:
                return self.errorpath
            return self.outpath + ".error"
        return os.path.join(self.outdir, self.name) + name + ".error"


def getBaseBuildConf(basename, level, mirror):
    dates = []
    print "TCGA Query for: ", basename
    q = CustomQuery("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=Level_%s]]" % (basename, level))
    urls = {}
    meta = None
    platform = None
    for e in q:
        dates.append( datetime.datetime.strptime( e['addedDate'], "%m-%d-%Y" ) )
        if meta is None:
            meta = {"sourceUrl" : []}            
            for e2 in CustomQuery(e['platform']):
                platform = e2['name']
                meta['platform'] = e2['name']
                meta['platformTitle'] = e2['displayName']
            for e2 in CustomQuery(e['disease']):
                meta['diseaseAbbr'] = e2['abbreviation']
                meta['diseaseTitle'] = e2['name']
                for e3 in CustomQuery(e2['tissueCollection']):
                    meta['tissue'] = e3['name']
            for e2 in CustomQuery(e['center']):
                meta['centerTitle'] = e2['displayName']
                meta['center'] = e2['name']
        meta['sourceUrl'].append( "http://tcga-data.nci.nih.gov/" + e['deployLocation'] )
        urls[ mirror + e['deployLocation'] ] = platform

    print "TCGA Query for mage-tab: ", basename
    q = CustomQuery("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=mage-tab]]" % (basename))
    for e in q:
        dates.append( datetime.datetime.strptime( e['addedDate'], "%m-%d-%Y" ) )
        q2 = CustomQuery(e['platform'])
        platform = None
        for e2 in q2:
            print e2
            platform = e2['name']
        meta['sourceUrl'].append( "http://tcga-data.nci.nih.gov/" + e['deployLocation'] )
        urls[ mirror + e['deployLocation'] ] = platform
    
    if len(dates) == 0:
        print "No Files found"
        return
    dates.sort()
    dates.reverse()
    versionDate = dates[0].strftime( "%Y-%m-%d" )
    
    return BuildConf(platform, basename, versionDate, meta, urls)
    




class TableReader:
    def __init__(self, path):
        self.path = path
    
    def __iter__(self):
        if self.path is not None and os.path.exists(self.path):
            handle = open(self.path)
            for line in handle:
                tmp = line.rstrip().split("\t")
                yield tmp[0], json.loads(tmp[1])
            handle.close()


class FileImporter:
    
    fileInclude = None
    fileExclude = None
    
    excludes = [
         "MANIFEST.txt$",
         "CHANGES_DCC.txt$",
         "README_DCC.txt$",
         "README.txt$",
         "CHANGES.txt$",
         "DCC_ALTERED_FILES.txt$", 
         r'.wig$',    
         "DESCRIPTIO$"
    ]
    
    def __init__(self, config):
        self.config = config
        
    def extractTars(self):        
        self.work_dir = tempfile.mkdtemp(dir=self.config.workdir_base)
        print "Extract to ", self.work_dir
        for path in self.config.tarlist:
            subprocess.check_call([ "tar", "xvzf", path, "-C", self.work_dir], stderr=sys.stdout)
        
    def run(self):        
        self.extractTars()
        
        filterInclude = None
        filterExclude = None
        if self.fileInclude is not None:
            filterInclude = re.compile(self.fileInclude)
        if self.fileExclude is not None:
            filterExclude = re.compile(self.fileExclude)
        self.inc = 0
        self.out = {}
        self.errors = []
        self.ext_meta = {}
        self.scandirs(self.work_dir, filterInclude, filterExclude)
        for o in self.out:
            self.out[o].close()
        self.fileBuild()
        #shutil.rmtree(self.work_dir)       
    
    def checkExclude( self, name ):
        for e in self.excludes:
            if re.search( e, name ):
                return True
        return False
    
    def scandirs(self, path, filterInclude=None, filterExclude=None):
        if os.path.isdir(path):
            for a in glob(os.path.join(path, "*")):
                self.scandirs(a, filterInclude, filterExclude)
        else:
            name = os.path.basename(path)
            if self.isMage(path):
                self.mageScan(path)
            else:
                if not self.checkExclude(name):
                    if (filterInclude is None or filterInclude.match(name)) and (filterExclude is None or not filterExclude.match(name)):
                        self.fileScan(path)
                        
    def isMage(self, path):
        if path.endswith( '.sdrf.txt' ) or path.endswith( '.idf.txt' ) or path.endswith("DESCRIPTION.txt"):
            return True

    
    def emit(self, key, data, port):
        if port not in self.out:
            self.out[port] = open(self.work_dir + "/" + port, "w")
        self.out[port].write( "%s\t%s\n" % (key, json.dumps(data)))

    def emitFile(self, name, meta, file):
        md5 = hashlib.md5()
        oHandle = open(self.config.getOutPath(name), "wb")
        with open(file,'rb') as f: 
            for chunk in iter(lambda: f.read(8192), ''): 
                md5.update(chunk)
                oHandle.write(chunk)
        oHandle.close()
        md5str = md5.hexdigest()
        meta['md5'] = md5str
        mHandle = open(self.config.getOutMeta(name), "w")
        mHandle.write( json.dumps(meta))
        mHandle.close()
        if len(self.errors):
            eHandle = open( self.config.getOutError(name), "w" )
            for msg in self.errors:
                eHandle.write( msg + "\n" )
            eHandle.close()
    
    def addError(self, msg):
        self.errors.append(msg)

        
commonMap = {
    "mean" : "seg.mean",
    "Segment_Mean" : "seg.mean",
    "Start" : "loc.start",
    "End" : "loc.end",
    "Chromosome" : "chrom"
}


idfMap = {
    "Investigation Title" : "title",
    "Experiment Description" : "experimentalDescription",
    "Person Affiliation" : "dataProducer",
    "Date of Experiment" : "experimentalDate"
}

class TCGAGeneticImport(FileImporter):      
    

    
    def mageScan(self, path):
        if path.endswith(".sdrf.txt"):
            iHandle = open(path, "rU")
            read = csv.reader( iHandle, delimiter="\t" )
            colNum = None
            for row in read:
                if colNum is None:
                    colNum = {}
                    for i in range(len(row)):
                        colNum[ row[i] ] = i
                else:
                    if not colNum.has_key("Material Type") or ( not row[ colNum[ "Material Type" ] ] in [ "genomic_DNA", "total_RNA", "MDA cell line" ] ):
                        try:
                            if colNum.has_key( "Derived Array Data File" ):
                                self.emit( row[ colNum[ "Derived Array Data File" ] ].split('.')[0], row[ colNum[ "Extract Name" ] ], "targets" )
                                self.emit( row[ colNum[ "Derived Array Data File" ] ], row[ colNum[ "Extract Name" ] ], "targets" )
                            if colNum.has_key("Derived Array Data Matrix File" ):
                                self.emit( row[ colNum[ "Derived Array Data Matrix File" ] ], row[ colNum[ "Extract Name" ] ], "targets" )    
                            if colNum.has_key( "Derived Data File"):
                                self.emit( row[ colNum[ "Derived Data File" ] ].split('.')[0], row[ colNum[ "Extract Name" ] ], "targets" )  
                                self.emit( row[ colNum[ "Derived Data File" ] ], row[ colNum[ "Extract Name" ] ], "targets" )    
                            if colNum.has_key( "Hybridization Name" ):
                                self.emit( row[ colNum[ "Hybridization Name" ] ] , row[ colNum[ "Extract Name" ] ], "targets" )
                            if colNum.has_key( "Sample Name" ):
                                self.emit( row[ colNum[ "Sample Name" ] ] , row[ colNum[ "Extract Name" ] ], "targets" )
                            self.emit( row[ colNum[ "Extract Name" ] ] , row[ colNum[ "Extract Name" ] ], "targets" )
                        except IndexError:
                            pass #there can be blank lines in the SDRF
        if path.endswith(".idf.txt"):
            iHandle = open(path)
            for line in iHandle:
                row = line.split("\t")
                if len(row):
                    if row[0] in idfMap:
                        self.ext_meta[ idfMap[row[0]] ] = row[1]
            iHandle.close()
        if path.endswith("DESCRIPTION.txt"):
            handle = open(path)
            self.ext_meta['description'] = handle.read()
            handle.close()
    
    def translateUUID(self, uuid):
        return self.config.translateUUID(uuid)
    
    def getTargetMap(self):
        subprocess.call("sort -k 1 %s/targets > %s/targets.sort" % (self.work_dir, self.work_dir), shell=True)                
        handle = TableReader(self.work_dir + "/targets.sort")
        tTrans = {}
        for key, value in handle:
            tTrans[ key ] = value
        return tTrans
    
    def fileScan(self, path):
        """
        This function takes a TCGA level 3 genetic file (file name and input handle),
        and tries to extract probe levels or target mappings (experimental ID to TCGA barcode)
        it emits these values to a handle, using the 'targets' and 'probes' string to identify 
        the type of data being emited
        """
        iHandle = open(path)
        mode = None
        #modes
        #1 - segmentFile - one sample per file/no sample info inside file
        #2 - two col header matrix file
        #3 - segmentFile - sample information inside file
        target = None
        colName = None
        colType = None
        for line in iHandle:
            if colName is None:
                colName = line.rstrip().split("\t")                     
                if colName[0] == "Hybridization REF" or colName[0] == "Sample REF":
                    mode=2
                elif colName[0] == "Chromosome" or colName[0] == "chromosome":
                    mode=1
                    target=os.path.basename( path ).split('.')[0] #seg files are named by the filename before the '.' extention
                elif colName[1] == "chrom":
                    mode = 3
                    target=os.path.basename( path ).split('.')[0] #seg files are named by the filename before the '.' extention
                    
                for i in range(len(colName)):
                    if commonMap.has_key( colName[i] ):
                        colName[i] = commonMap[ colName[i] ]
            elif mode==2 and colType is None:
                colType=line.rstrip().split("\t")
                for i in range(len(colType)):
                    if commonMap.has_key( colType[i] ):
                        colType[i] = commonMap[ colType[i] ]
            else:
                tmp = line.rstrip().split("\t")
                if mode == 2:
                    out={}
                    for col in colName[1:]:
                        out[ col ] = { "target" : col }
                    for i in range(1,len(colType)):
                        try:
                            if colType[i] in self.probeFields:
                                out[ colName[i] ][ colType[i] ] = tmp[i]
                        except IndexError:
                            out[ colName[i] ][ colType[i] ] = "NA"
                    for col in out:
                        self.emit( tmp[0], out[col], "probes" )
                else:
                    out = {}
                    for i in range(len(colName)):
                        out[ colName[i] ] = tmp[i]
                    out['file'] = os.path.basename(path)
                    if mode==1:
                        self.emit( target, out, "segments" )
                    elif mode == 3:
                        self.emit( tmp[0], out, "segments" )
                    else:
                        self.emit( tmp[0], out, "probes" )

        


class TCGASegmentImport(TCGAGeneticImport):
    
    
    def fileScan(self, path):
        """
        This function takes a TCGA level 3 genetic file (file name and input handle),
        and tries to extract probe levels or target mappings (experimental ID to TCGA barcode)
        it emits these values to a handle, using the 'targets' and 'probes' string to identify 
        the type of data being emited
        """
        iHandle = open(path)
        mode = None
        #modes
        #1 - segmentFile - one sample per file/no sample info inside file
        #2 - segmentFile - sample information inside file
        target = None
        colName = None
        colType = None
        for line in iHandle:
            if colName is None:
                colName = line.rstrip().split("\t")                     
                if colName[0] == "Chromosome" or colName[0] == "chromosome":
                    mode=1
                    target=os.path.basename( path ).split('.')[0] #seg files are named by the filename before the '.' extention
                elif colName[1] == "chrom":
                    mode = 2
                    
                for i in range(len(colName)):
                    if commonMap.has_key( colName[i] ):
                        colName[i] = commonMap[ colName[i] ]
            else:
                tmp = line.rstrip().split("\t")
                out = {}
                for i in range(len(colName)):
                    out[ colName[i] ] = tmp[i]
                out['file'] = os.path.basename(path)
                if mode==1:
                    self.emit( target, out, "segments" )
                elif mode == 2:
                    self.emit( tmp[0], out, "segments" )

    
    def getMeta(self, name):
        matrixInfo = { 
            '@context' : "http://purl.org/cgdata/",
            '@type' : 'bed5', 
            '@id' : name, 
            "lastModified" : self.config.version,
            'rowKeySrc' : {
                    '@type' :  'idDAG',
                    '@id' : "tcga.%s" % (self.config.abbr)
            },
            'dataSubType' : { "@id" : self.dataSubType },
            'dataProducer' : 'TCGA Import',
            "accessMap" : "public", "redistribution" : "yes" 
        }
        matrixInfo.update(self.ext_meta)
        matrixInfo.update(self.config.meta)
        return matrixInfo
    
    def fileBuild(self):
        #use the target table to create a name translation table
        #also setup target name enumeration, so they will have columns
        #numbers

        tTrans = self.getTargetMap()        
        subprocess.call("sort -k 1 %s/segments > %s/segments.sort" % (self.work_dir, self.work_dir), shell=True)
        sHandle = TableReader(self.work_dir + "/segments.sort")

        segFile = None
        curName = None
        
        curData = {}
        missingCount = 0

        startField  = "loc.start"
        endField    = "loc.end"
        valField    = "seg.mean"
        chromeField = "chrom"
        
        segFile = None

        for key, value in sHandle:
            if segFile is None:
                segFile = open("%s/segment_file"  % (self.work_dir), "w")
            try:
                curName = self.translateUUID(tTrans[key]) # "-".join( tTrans[ key ].split('-')[0:4] )
                if curName is not None:
                    try:
                        chrom = value[ chromeField ].lower()
                        if not chrom.startswith("chr"):
                            chrom = "chr" + chrom
                        chrom = chrom.upper().replace("CHR", "chr")
                        #segFile.write( "%s\t%s\t%s\t%s\t.\t%s\n" % ( curName, chrom, int(value[ startField ])+1, value[ endField ], value[ valField ] ) )
                        segFile.write( "%s\t%s\t%s\t%s\t%s\n" % ( chrom, value[ startField ], value[ endField ], curName, value[ valField ] ) )
                    except KeyError:
                         self.addError( "Field error: %s" % (str(value)))
            except KeyError:
                self.addError( "TargetInfo Not Found: %s" % (key))
            
        segFile.close()
        matrixName = self.config.name

        self.emitFile( "", self.getMeta(matrixName), "%s/segment_file"  % (self.work_dir) )     



class TCGAMatrixImport(TCGAGeneticImport):
    
    def getMeta(self, name):
        matrixInfo = { 
            "@context" : 'http://purl.org/cgdata/',
            '@type' : 'genomicMatrix', 
            '@id' : name, 
            "lastModified" : self.config.version,
            'dataSubType' : { "@id" : self.dataSubType },
            'dataProducer' : 'TCGA', 
            "accessMap" : "public", 
            "redistribution" : "yes",
            'rowKeySrc' : {
                "@type" : "probe", "@id" : self.probeMap
            },
            'columnKeySrc' : {
                "@type" : "idDAG", "@id" :  "tcga.%s" % (self.config.abbr)
            }
        }
        matrixInfo.update(self.ext_meta)
        matrixInfo.update(self.config.meta)
        return matrixInfo
        
    def fileBuild(self):
        #use the target table to create a name translation table
        #also setup target name enumeration, so they will have columns
        #numbers        
        
        subprocess.call("sort -k 1 %s/probes > %s/probes.sort" % (self.work_dir, self.work_dir), shell=True)
        subprocess.call("sort -k 1 %s/targets > %s/targets.sort" % (self.work_dir, self.work_dir), shell=True)
                
        handles = {}
        handles[ "geneticExtract:targets" ] = TableReader(self.work_dir + "/targets.sort")
        handles[ "geneticExtract:probes" ] = TableReader(self.work_dir + "/probes.sort")

        tTrans = self.getTargetMap()
        
        tEnum = {}
        for t in tTrans:
            tlabel = self.translateUUID(tTrans[t])
            if tlabel is not None and tlabel not in tEnum:
                tEnum[tlabel] = len(tEnum)
                
        matrixFile = None
        segFile = None

        curName = None
        curData = {}
        missingCount = 0
        rowCount = 0
        pHandle = handles["geneticExtract:probes"]
        for key, value in pHandle:
            if matrixFile is None:
                matrixFile = open("%s/matrix_file" % (self.work_dir), "w" )            
                out = ["NA"] * len(tEnum)
                for target in tEnum:
                    out[ tEnum[ target ] ] = target
                matrixFile.write( "%s\t%s\n" % ( "#probe", "\t".join( out ) ) )        
            
            if curName != key:
                if curName is not None:
                    out = ["NA"] * len(tEnum)
                    for target in curData:
                        try:
                            ttarget = self.translateUUID(tTrans[target])
                            if ttarget is not None:
                                out[ tEnum[ ttarget ] ] = str( curData[ target ] )
                        except KeyError:
                            self.addError( "TargetInfo Not Found: %s" % (target))
                    if out.count("NA") != len(tEnum):
                        rowCount += 1
                        matrixFile.write( "%s\t%s\n" % ( curName, "\t".join( out ) ) )  
                curName = key
                curData = {}
            if "target" in value:
                for probeField in self.probeFields:
                    if probeField in value:
                        curData[ value[ "target" ] ] = value[ probeField ]
            elif "file" in value:
                for probeField in self.probeFields:
                    if probeField in value:
                        curData[ value[ "file" ] ] = value[ probeField ]
        matrixFile.close()
        matrixName = self.config.name    
        if rowCount > 0:
            self.emitFile( "", self.getMeta(matrixName), "%s/matrix_file"  % (self.work_dir) )


adminNS = "http://tcga.nci/bcr/xml/administration/2.3"


class TCGAClinicalImport(FileImporter):
    
    def fileScan(self, path):
        handle = open(path)
        data = handle.read()
        handle.close()
        xml=parseString(data)
        self.parseXMLFile(xml)
            
    def getText(self, nodelist):
        rc = []
        for node in nodelist:
            if node.nodeType == node.TEXT_NODE:
                rc.append(node.data)
        return ''.join(rc)


    def parseXMLFile(self, dom):    
        admin = {}
        for node in dom.getElementsByTagNameNS( adminNS, "admin"):
            for cNode in node.childNodes:
                if cNode.nodeType == cNode.ELEMENT_NODE:
                    admin[ cNode.localName ] = {}
                    admin[ cNode.localName ]['value'] = getText( cNode.childNodes )
        
        name = None
        patient = {}
        patientName = None
        for node in dom.childNodes[0].childNodes:
            if node.nodeType == node.ELEMENT_NODE:
                if node.localName == 'patient':
                    for elm in node.childNodes:
                        if elm.nodeType == elm.ELEMENT_NODE:
                            if ( elm.localName == 'bcr_patient_barcode' ):
                                name = getText( elm.childNodes )
                                patientName = name
                                
                            if ( elm.getAttribute( 'procurement_status' ) == "Completed" ):
                                patient[ elm.localName ] = {}
                                patient[ elm.localName ]['value'] = getText( elm.childNodes )
                                patient[ elm.localName ]['tier']  = elm.getAttribute( 'tier' )
                                patient[ elm.localName ]['precision'] = elm.getAttribute( 'precision' )

                            if elm.prefix == "auxiliary":
                                for aux in elm.childNodes:
                                    if aux.nodeType == aux.ELEMENT_NODE:
                                        for auxval in aux.childNodes:
                                            if auxval.nodeType == auxval.ELEMENT_NODE:
                                                patient[ auxval.localName ] = {}
                                                patient[ auxval.localName ]['value'] = getText( auxval.childNodes )
                                                patient[ auxval.localName ]['tier']  = auxval.getAttribute( 'tier' )
                                                patient[ auxval.localName ]['precision'] = auxval.getAttribute( 'precision' )

        if name is not None:
            for key in admin:
                patient[ key ] = admin[ key ]
            self.emit( name, patient, "patient" )

        for node in dom.childNodes[0].childNodes:
            if node.nodeType == node.ELEMENT_NODE and node.localName == 'patient':
                for samples in node.childNodes:
                    if samples.nodeType == samples.ELEMENT_NODE and samples.localName == 'samples':                        
                        for sample in samples.childNodes:
                            if sample.nodeType == samples.ELEMENT_NODE and sample.localName == 'sample':
                                sampleData = {}
                                for value in sample.childNodes:
                                    if value.nodeType == value.ELEMENT_NODE:                                        
                                        if value.localName == 'bcr_sample_barcode' :
                                            name = getText( value.childNodes )
                                        if value.getAttribute( 'procurement_status' ) == "Completed" :
                                            sampleData[ value.localName ] = {}
                                            sampleData[ value.localName ]['value'] = getText( value.childNodes )
                                        
                                        if value.localName == 'portions' :
                                            for portions in value.childNodes:
                                                if portions.nodeType == value.ELEMENT_NODE and portions.localName == "portion":
                                                    portionName = None
                                                    portionData = {}
                                                    for portion in portions.childNodes:
                                                        if portion.nodeType == value.ELEMENT_NODE:
                                                            if portion.localName == "analytes":
                                                                for analytes in portion.childNodes:
                                                                    if analytes.nodeType == analytes.ELEMENT_NODE and analytes.localName =="analyte":
                                                                        analyteName = None
                                                                        analyteData = {}
                                                                        for analyte in analytes.childNodes:
                                                                            if analyte.nodeType == value.ELEMENT_NODE:
                                                                                if analyte.localName == "aliquots":
                                                                                    for aliquots in analyte.childNodes:
                                                                                        if aliquots.nodeType == aliquots.ELEMENT_NODE and aliquots.localName =="aliquot":
                                                                                            aliquotName = None
                                                                                            aliquotData = {}
                                                                                            for aliquot in aliquots.childNodes:
                                                                                                if aliquot.nodeType == value.ELEMENT_NODE:
                                                                                                    if aliquot.localName == "bcr_aliquot_barcode":
                                                                                                        aliquotName = getText(aliquot.childNodes)
                                                                                                    if aliquot.getAttribute( 'procurement_status' ) == "Completed" :
                                                                                                        aliquotData[ aliquot.localName ] = {}
                                                                                                        aliquotData[ aliquot.localName ]['value'] = getText( aliquot.childNodes )
                                                                                            if aliquotName is not None and len(aliquotData):
                                                                                                self.emit( aliquotName, aliquotData, 'aliquot' )
                                                                                    
                                                                            
                                                                                if analyte.localName == "bcr_analyte_barcode":
                                                                                    analyteName = getText(analyte.childNodes)
                                                                                if analyte.getAttribute( 'procurement_status' ) == "Completed" :
                                                                                    analyteData[ analyte.localName ] = {}
                                                                                    analyteData[ analyte.localName ]['value'] = getText( analyte.childNodes )
                                                                        if analyteName is not None and len(analyteData):
                                                                            self.emit( analyteName, analyteData, 'analyte' )
                                                                            
                                                            if portion.localName == "bcr_portion_barcode":
                                                                portionName = getText( portion.childNodes )
                                                            if portion.getAttribute( 'procurement_status' ) == "Completed" :
                                                                portionData[ portion.localName ] = {}
                                                                portionData[ portion.localName ]['value'] = getText( portion.childNodes )
                                                    if portionName is not None and len(portionData):
                                                        self.emit( portionName, portionData, 'portion' )
                                                
                                            
                                #patientName = re.sub( r'\-...$', "", name )
                                self.emit( name, sampleData, "sample" )
                                self.emit( name, patient, "sample")
                    elif samples.nodeType == samples.ELEMENT_NODE and samples.localName == 'drugs':
                        for drug in samples.childNodes:
                            if drug.nodeType == samples.ELEMENT_NODE and drug.localName == 'drug':
                                drugData = {}
                                for value in drug.childNodes:
                                    if value.nodeType == value.ELEMENT_NODE:                                        
                                        if value.localName == 'bcr_drug_barcode' :
                                            name = getText( value.childNodes )
                                        if value.getAttribute( 'procurement_status' ) == "Completed" :
                                            drugData[ value.localName ] = {}
                                            drugData[ value.localName ]['value'] = getText( value.childNodes )
                                            
                                #patientName = re.sub( r'\-...$', "", name )
                                self.emit( patientName, drugData, "drug" )
                    elif samples.nodeType == samples.ELEMENT_NODE and samples.localName == 'radiations':
                        for rad in samples.childNodes:
                            if rad.nodeType == samples.ELEMENT_NODE and rad.localName == 'radiation':
                                radData = {}
                                for value in rad.childNodes:
                                    if value.nodeType == value.ELEMENT_NODE:                                        
                                        if value.localName == 'bcr_radiation_barcode' :
                                            name = getText( value.childNodes )
                                        if value.getAttribute( 'procurement_status' ) == "Completed" :
                                            radData[ value.localName ] = {}
                                            radData[ value.localName ]['value'] = getText( value.childNodes )
                                            
                                #patientName = re.sub( r'\-...$', "", name )
                                self.emit( patientName, radData, "radiation" )
                    

    
    def getMeta(self, name):
        fileInfo = {
            "@context" : "http://purl.org/cgdata/",
            "@type" : "clinicalMatrix",
            "@id" : name,
            "lastModified" :  self.config.version,
            'dataSubType' : { "@id" : "clinical" },
            "rowKeySrc" : {
                "@type" : "idDAG", "@id" :  "tcga.%s" % (self.config.abbr)
            }
            
        }
        fileInfo.update(self.ext_meta)
        fileInfo.update(self.config.meta)
        return fileInfo
    
    def fileBuild(self):

        matrixList = [ "patient", "sample", "radiation", "drug", "portion", "analyte", "aliquot" ]
        if self.config.clinical_type is not None:
            matrixList = [ self.config.clinical_type ]

        for matrixName in matrixList:
            if os.path.exists( "%s/%s" % (self.work_dir, matrixName)):
                subprocess.call("cat %s/%s | sort -k 1 > %s/%s.sort" % (self.work_dir, matrixName, self.work_dir, matrixName), shell=True)
                handle = TableReader(self.work_dir + "/" + matrixName + ".sort")
                matrix = {}
                colEnum = {}
                for key, value in handle:
                    if key not in matrix:
                        matrix[key] = {}
                    for col in value:
                        matrix[key][col] = value[col]
                        if col not in colEnum:
                            if not self.config.sanitize or col not in [ 'race', 'ethnicity' ]:
                                colEnum[col] = len(colEnum)
                
                handle = open( os.path.join(self.work_dir, matrixName + "_file"), "w")
                cols = [None] * (len(colEnum))
                for col in colEnum:
                    cols[colEnum[col]] = col
                handle.write("sample\t%s\n" % ("\t".join(cols)))
                for key in matrix:
                    cols = [""] * (len(colEnum))
                    for col in colEnum:
                        if col in matrix[key]:
                            cols[colEnum[col]] = matrix[key][col]['value']
                    handle.write("%s\t%s\n" % (key, "\t".join(cols).encode("ASCII", "replace")))
                handle.close()
                self.emitFile( "." + matrixName, self.getMeta(self.config.name + "." + matrixName), "%s/%s_file"  % (self.work_dir, matrixName)) 


class AgilentImport(TCGAMatrixImport):
    dataSubType = 'geneExp'
    probeMap = 'hugo'
    sampleMap = 'tcga.iddag'
    dataType = 'genomicMatrix'
    probeFields = ['log2 lowess normalized (cy5/cy3) collapsed by gene symbol']
   

class CGH1x1mImport(TCGASegmentImport):
    dataSubType = 'cna'
    sampleMap = 'tcga.iddag'
    dataType = 'genomicSegment'
    probeFields = ['seg.mean']

class SNP6Import(TCGASegmentImport):
    assembly = 'hg19'
    dataSubType = 'cna'
    sampleMap ='tcga.iddag'
    dataType = 'genomicSegment'
    probeFields = ['seg.mean']
    
    def fileScan(self, path):
        outport = None
        #if path.endswith(".hg18.seg.txt"):
        #    outport = "hg18_segment"
        if path.endswith(".hg19.seg.txt"):
            outport = "hg19_segment"
        
        if outport is not None:
            handle = open(path)
            colName = None
            for line in handle:
                if colName is None:
                    colName = line.rstrip().split("\t")                     
                    for i, col in enumerate(colName):
                        if commonMap.has_key( col ):
                            colName[i] = commonMap[ col ]
                else:
                    tmp = line.rstrip().split("\t")
                    out = {}
                    for i in range(1, len(colName)):
                        out[ colName[i] ] = tmp[i]
                    self.emit( tmp[0], out, outport )                    
            handle.close()
    
    def fileBuild(self):
        tmap = self.getTargetMap()  
        
        for base in ['hg19']:              
            subprocess.call("sort -k 1 %s/%s_segment > %s/%s_segment.sort" % (self.work_dir, base, self.work_dir, base), shell=True)                
            handle = TableReader(self.work_dir + "/%s_segment.sort" % (base))

            segFile = None
            curName = None
            curData = {}
            missingCount = 0

            startField  = "loc.start"
            endField    = "loc.end"
            valField    = "seg.mean"
            chromeField = "chrom"
            
            segFile = None
            sHandle = handle
            for key, value in sHandle:
                if segFile is None:
                    segFile = open("%s/%s_segment.out"  % (self.work_dir, base), "w")
                try:
                    curName = self.translateUUID(tmap[key])
                    if curName is not None:
                        chrom = value[ chromeField ].lower()
                        if not chrom.startswith("chr"):
                            chrom = "chr" + chrom
                        chrom = chrom.upper().replace("CHR", "chr")
                        #segFile.write( "%s\t%s\t%s\t%s\t.\t%s\n" % ( curName, chrom, int(value[ startField ])+1, value[ endField ], value[ valField ] ) )
                        segFile.write( "%s\t%s\t%s\t%s\t%s\n" % ( chrom, value[ startField ], value[ endField ], curName, value[ valField ] ) )
                except KeyError:
                    self.addError( "TargetInfo Not Found: %s" % (key))
            
            segFile.close()
            
            self.emitFile("." + base, self.getMeta(self.config.name + "." + base), "%s/%s_segment.out"  % (self.work_dir, base))
        

class HmiRNAImport(TCGAMatrixImport):
    dataSubType = 'miRNAExp'
    probeMap = 'agilentHumanMiRNA'
    sampleMap = 'tcga.iddag'
    dataType = 'genomicMatrix'
    probeFields = ['unc_DWD_Batch_adjusted']

class CGH244AImport(TCGASegmentImport):
    dataSubType = 'cna'
    sampleMap = 'tcga.iddag'
    dataType = 'genomicSegment'
    probeFields = ['Segment_Mean']

class CGH415K_G4124A(TCGASegmentImport):
    dataSubType = 'cna'
    sampleMap = 'tcga.iddag'
    chromeField = 'Chromosome'
    dataType = 'genomicSegment'
    endField = 'End'
    probeFields = ['Segment_Mean']
    startField = 'Start'


class IlluminaHiSeq_DNASeqC(TCGASegmentImport):
    dataSubType = 'cna'
    sampleMap = 'tcga.iddag'
    chromeField = 'Chromosome'
    dataType = 'genomicSegment'
    endField = 'End'
    probeFields = ['Segment_Mean']
    startField = 'Start'
    
    def translateUUID(self, uuid):
        out = self.config.translateUUID(uuid)
        #censor out normal ids
        if re.search(r'^TCGA-..-....-1', out):
            return None
        return out

class HT_HGU133A(TCGAMatrixImport):
    dataSubType = 'geneExp'
    probeMap = 'affyU133a'
    sampleMap = 'tcga.iddag'
    dataType = 'genomicMatrix'
    probeFields = ['Signal']

class HuEx1_0stv2(TCGAMatrixImport):
    dataSubType = 'miRNAExp'
    probeMap = 'hugo'
    sampleMap = 'tcga.iddag'
    dataType = 'genomicMatrix'
    probeFields = ['Signal']
    fileInclude = '^.*gene.txt$|^.*sdrf.txt$'

class Human1MDuoImport(TCGASegmentImport):
    dataSubType = 'cna'
    sampleMap = 'tcga.iddag'
    dataType = 'genomicSegment'
    probeFields = ['mean']

class HumanHap550(TCGASegmentImport):
    dataSubType = 'cna'
    sampleMap = 'tcga.iddag'
    dataType = 'genomicSegment'
    probeFields = ['mean']

class HumanMethylation27(TCGAMatrixImport):
    dataSubType = 'DNAMethylation'
    probeMap= 'illuminaMethyl27K_gpl8490'
    sampleMap= 'tcga.iddag'
    dataType= 'genomicMatrix'
    fileExclude= '.*.adf.txt'
    probeFields = ['Beta_Value', 'Beta_value']
    

class HumanMethylation450(TCGAMatrixImport):
    dataSubType =  'DNAMethylation'
    probeMap =  'illuminaHumanMethylation450'
    sampleMap =  'tcga.iddag'
    dataType =  'genomicMatrix'
    fileExclude = '.*.adf.txt'
    probeFields =  ['Beta_value', 'Beta_Value']

    def fileScan(self, path):
        """
        This function takes a TCGA level 3 genetic file (file name and input handle),
        and tries to extract probe levels or target mappings (experimental ID to TCGA barcode)
        it emits these values to a handle, using the 'targets' and 'probes' string to identify 
        the type of data being emited
        """
        iHandle = open(path)
        mode = None
        #modes
        #1 - two col header matrix file
        target = None
        colName = None
        colType = None
        for line in iHandle:
            if colName is None:
                colName = line.rstrip().split("\t")                     
                if colName[0] == "Hybridization REF" or colName[0] == "Sample REF":
                    mode=1                    
                for i in range(len(colName)):
                    if commonMap.has_key( colName[i] ):
                        colName[i] = commonMap[ colName[i] ]
            elif mode==1 and colType is None:
                colType=line.rstrip().split("\t")
                for i in range(len(colType)):
                    if commonMap.has_key( colType[i] ):
                        colType[i] = commonMap[ colType[i] ]
            else:
                tmp = line.rstrip().split("\t")
                if mode == 1:
                    out={}
                    for col in colName[1:]:
                        out[ col ] = { "target" : col }
                    for i in range(1,len(colType)):
                        try:
                            if colType[i] in self.probeFields:
                                out[ colName[i] ][ colType[i] ] = "%.4f" % float(tmp[i])
                        except IndexError:
                            out[ colName[i] ][ colType[i] ] = "NA"
                        except ValueError:
                            out[ colName[i] ][ colType[i] ] = "NA"
                    for col in out:
                        self.emit( tmp[0], out[col], "probes" )
                
class Illumina_RNASeq(TCGAMatrixImport):
    sampleMap= 'tcga.iddag'
    dataSubType= 'geneExp'
    fileInclude= r'^.*\.gene.quantification.txt$|^.*sdrf.txt$'
    probeFields = ['RPKM']
    probeMap= 'hugo.unc'

class Illumina_RNASeqV2(TCGAMatrixImport):
    sampleMap= 'tcga.iddag'
    dataSubType= 'geneExp'
    fileInclude= r'^.*rsem.genes.normalized_results$|^.*sdrf.txt$'
    probeFields = ['normalized_count']
    probeMap= 'hugo.unc'

class IlluminaHiSeq_RNASeq(TCGAMatrixImport):
    sampleMap= 'tcga.iddag'
    dataSubType= 'geneExp'
    fileInclude= r'^.*gene.quantification.txt$'
    probeFields = ['RPKM']
    probeMap= 'hugo.unc'

class MDA_RPPA_Core(TCGAMatrixImport):
    sampleMap = 'tcga.iddag'
    probeMap = "md_anderson_antibodies"
    dataSubType = "RPPA"
    fileExclude = r'^.*.antibody_annotation.txt'
    probeFields = [ 'Protein Expression' ]

    def getTargetMap(self):
        subprocess.call("sort -k 1 %s/targets > %s/targets.sort" % (self.work_dir, self.work_dir), shell=True)                
        handle = TableReader(self.work_dir + "/targets.sort")
        tTrans = {}
        for key, value in handle:
            value = re.sub(r'\.SD', '', value)
            tTrans[ key ] = value
        return tTrans


class Illumina_miRNASeq(TCGAMatrixImport):
    sampleMap= 'tcga.iddag'
    dataSubType= 'miRNA'
    fileInclude= '^.*.mirna.quantification.txt$'
    probeFields = ['reads_per_million_miRNA_mapped']
    probeMap= 'hsa.mirna'


class bioImport(TCGAClinicalImport):
    sampleMap = 'tcga.iddag'
    fileInclude = '.*.xml$'
        
tcgaConfig = {
    'AgilentG4502A_07' : AgilentImport,
    'AgilentG4502A_07_1' : AgilentImport,
    'AgilentG4502A_07_2' : AgilentImport,
    'AgilentG4502A_07_3': AgilentImport,
    'CGH-1x1M_G4447A': CGH1x1mImport,    
    'Genome_Wide_SNP_6': SNP6Import,
    'H-miRNA_8x15K': HmiRNAImport,
    'H-miRNA_8x15Kv2': HmiRNAImport,
    'HG-CGH-244A': CGH244AImport,
    'HG-CGH-415K_G4124A': CGH415K_G4124A,
    'HT_HG-U133A': HT_HGU133A,
    'HuEx-1_0-st-v2': HuEx1_0stv2,
    'Human1MDuo': Human1MDuoImport,
    'HumanHap550': HumanHap550,
    'IlluminaHiSeq_DNASeqC' : IlluminaHiSeq_DNASeqC,
    'HumanMethylation27': HumanMethylation27,
    'HumanMethylation450': HumanMethylation450,
    'IlluminaHiSeq_RNASeq': IlluminaHiSeq_RNASeq,
    'IlluminaGA_RNASeq' : Illumina_RNASeq,
    'IlluminaHiSeq_RNASeqV2' : Illumina_RNASeqV2,
    'MDA_RPPA_Core' : MDA_RPPA_Core,
    'IlluminaGA_miRNASeq' : Illumina_miRNASeq,
    'IlluminaHiSeq_miRNASeq' : Illumina_miRNASeq,
    'bio' : bioImport
}

def fileDigest( file ):
    md5 = hashlib.md5()
    with open(file,'rb') as f: 
        for chunk in iter(lambda: f.read(8192), ''): 
            md5.update(chunk)
    return md5.hexdigest()


def platform_list():
    q = CustomQuery("Platform")
    for e in q:
        yield e['name']

def supported_list():
    q = CustomQuery("Platform")
    for e in q:
        if e['name'] in tcgaConfig:
            yield e['name']

def platform_archives(platform):
    q = CustomQuery("Archive[Platform[@name=%s]][@isLatest=1]" % platform)
    out = {}
    for e in q:
        name = e['baseName']
        if name not in out:
            yield name
            out[name] = True


if __name__ == "__main__":
    
    parser = ArgumentParser()
    #Stack.addJobTreeOptions(parser) 
    
    parser.add_argument("-a", "--platform-list", dest="platform_list", action="store_true", help="Get list of platforms", default=False)
    parser.add_argument("-u", "--uuid", dest="uuid_table", help="UUID to Barcode Table", default=None)
    parser.add_argument("-t", "--uuid-download", dest="uuid_download", help="Download UUID/Barcode Table", default=False)
    parser.add_argument("-z", "--all-archives", dest="all_archives", action="store_true", help="List all archives", default=False)
    parser.add_argument("-p", "--platform", dest="platform", help="Platform Selection", default=None)
    parser.add_argument("-l", "--supported", dest="supported_list", action="store_true", help="List Supported Platforms", default=None)
    parser.add_argument("-f", "--filelist", dest="filelist", help="List files needed to convert TCGA project basename into cgData", default=None)
    parser.add_argument("-b", "--basename", dest="basename", help="Convert TCGA project basename into cgData", default=None)
    parser.add_argument("-m", "--mirror", dest="mirror", help="Mirror Location", default=None)
    parser.add_argument("-w", "--workdir", dest="workdir_base", help="Working directory", default="/tmp")
    parser.add_argument("--out-dir", dest="outdir", help="Working directory", default="./")
    parser.add_argument("-o", "--out", dest="outpath", help="Output Dest", default=None)    
    parser.add_argument("--out-error", dest="errorpath", help="Output Error", default=None)    
    parser.add_argument("--out-meta", dest="metapath", help="Output Meta", default=None)    
    parser.add_argument("-c", "--cancer", dest="cancer", help="List Archives by cancer type", default=None)
    parser.add_argument("-d", "--download", dest="download", help="Download files for archive", default=None)
    parser.add_argument("-e", "--level", dest="level", help="Data Level ", default="3")
    parser.add_argument("-s", "--check-sum", dest="checksum", help="Check project md5", default=None)
    parser.add_argument("-r", "--sanitize", dest="sanitize", action="store_true", help="Remove race/ethnicity from clinical data", default=False) 
    parser.add_argument("-x", "--clinical", dest="clinical", help="Process clinical info", default=None) 
    parser.add_argument("--clinical-basename", dest="clinical_basename", help="Select Clinical Data by basename", default=None) 
    parser.add_argument("--clinical-type", dest="clinical_type", help="Clinical Data Type", default=None)
    parser.add_argument("--all-clinical", dest="all_clinical", action="store_true", help="List all clinical archives", default=False)
    parser.add_argument("--out-clinical", dest="out_clinical", action="append", nargs=3, default=[])
    parser.add_argument("--samples", dest="get_samples", action="store_true", default=False)

    options = parser.parse_args()
    
    if options.uuid_download:
        url="https://tcga-data.nci.nih.gov/uuid/uuidBrowserExport.htm"
        data = {}
        data['exportType'] = 'tab'
        data['cols'] = "uuid,barcode"
        urllib.urlretrieve( url, options.uuid_download, data=urllib.urlencode(data))
    
    if options.platform_list:
        for e in platform_list():
            print e
    
    if options.supported_list:
        for e in supported_list():
            print e
            
    if options.platform:
        for name in platform_archives( options.platform ):
            print name

    if options.all_archives:
        q = CustomQuery("Archive[@isLatest=1][ArchiveType[@type=Level_%s]]" % (options.level))
        out = {}
        for e in q:
            name = e['baseName']
            if name not in out:
                print name
                out[name] = True

    if options.all_clinical:
        q = CustomQuery("Archive[@isLatest=1][Platform[@alias=bio]]")
        out = {}
        for e in q:
            name = e['baseName']
            if name not in out:
                print name
                out[name] = True

    if options.get_samples:
        url="https://tcga-data.nci.nih.gov/datareports/aliquotExport.htm"
        data = {}
    
        data['exportType'] = 'tab'
        data['cols'] = 'aliquotId,disease,bcrBatch,center,platform,levelOne,levelTwo,levelThree'
        data['filterReq'] = json.dumps({"disease":"","levelOne":"","aliquotId":"","center":"","levelTwo":"","bcrBatch":"","platform":"","levelThree":""})
        data['formFilter'] = json.dumps({"disease":"","levelOne":"","aliquotId":"","center":"","levelTwo":"","bcrBatch":"","platform":"","levelThree":""})
        handle = urllib.urlopen( url + "?" + urllib.urlencode(data))
    
        for line in handle:
            tmp = line.rstrip().split("\t")
            if tmp[7] == "Submitted":
                if tmp[0][13]=='0':
                    print "\t".join( [ tmp[0], tmp[1], "Tumor", tmp[4] ] )
                elif tmp[0][13] == '1':
                    print "\t".join( [ tmp[0], tmp[1], "Normal", tmp[4] ] )

        
    if options.cancer is not None:
        q = CustomQuery("Archive[@isLatest=1][Disease[@abbreviation=%s]][ArchiveType[@type=Level_%s]]" % (options.cancer, options.level))
        out = {}
        for e in q:
            name = e['baseName']
            if name not in out:
                print name
                out[name] = True

    if options.filelist:
        q = CustomQuery("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=Level_%s]]" % (options.filelist, options.level))
        for e in q:
            print e['deployLocation']
        q = CustomQuery("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=mage-tab]]" % (options.filelist))
        for e in q:
            print e['deployLocation']

    if options.checksum:
        urls = []
        q = CustomQuery("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=Level_%s]]" % (options.checksum, options.level))
        for e in q:
            urls.append( e['deployLocation'] )
        q = CustomQuery("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=mage-tab]]" % (options.checksum))
        for e in q:
            urls.append( e['deployLocation'] )
        
        for url in urls:
            dst = os.path.join(options.mirror, re.sub("^/", "", url))
            if not os.path.exists( dst ):
                print "NOT_FOUND:", dst
                continue
            if not os.path.exists( dst + ".md5" ):
                print "MD5_NOT_FOUND", dst
                continue

            handle = open( dst + ".md5" )
            line = handle.readline()
            omd5 = line.split(' ')[0]
            handle.close()

            nmd5 = fileDigest( dst )
            if omd5 != nmd5:
                print "CORRUPT:", dst
            else:
                print "OK:", dst


    if options.download is not None:
        if options.mirror is None:
            print "Define mirror location"
            sys.exit(1)

        urls = []
        
        if options.basename is None and options.clinical is None and options.clinical_basename is None:
            q = CustomQuery("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=Level_%s]]" % (options.download, options.level))
            for e in q:
                urls.append( e['deployLocation'] )
                urls.append( e['deployLocation'] + ".md5" )
                
            q = CustomQuery("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=mage-tab]]" % (options.download))
            for e in q:
                urls.append( e['deployLocation'] )
                urls.append( e['deployLocation'] + ".md5" )

        if options.basename:
            q = CustomQuery("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=Level_%s]]" % (options.basename, options.level))
            for e in q:
                urls.append( e['deployLocation'] )
                urls.append( e['deployLocation'] + ".md5" )

            q = CustomQuery("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=mage-tab]]" % (options.basename))
            for e in q:
                urls.append( e['deployLocation'] )
                urls.append( e['deployLocation'] + ".md5" )

        if options.clinical:
            q = CustomQuery("Archive[@isLatest=1][Disease[@abbreviation=%s]][Platform[@alias=bio]]" % (options.clinical))
            for e in q:
                urls.append( e['deployLocation'] )
                urls.append( e['deployLocation'] + ".md5" )

        if options.clinical_basename:
            q = CustomQuery("Archive[@isLatest=1][@baseName=%s]" % (options.clinical_basename))
            for e in q:
                urls.append( e['deployLocation'] )
                urls.append( e['deployLocation'] + ".md5" )



        for url in urls:
            src = "https://tcga-data.nci.nih.gov/" + url
            dst = os.path.join(options.mirror, re.sub("^/", "", url))
            dir = os.path.dirname(dst)
            if not os.path.exists(dir):
                print "mkdir", dir
                os.makedirs(dir)
            if not os.path.exists( dst ):
                print "download %s to %s" % (src, dst)
                urllib.urlretrieve(src, dst)

    if options.basename:
        if options.mirror is None:
            sys.stderr.write("Need mirror location\n")
            sys.exit(1)
        
        conf = getBaseBuildConf(options.basename, options.level, options.mirror)
        conf.addOptions(options)
        if conf.platform not in tcgaConfig:
            sys.stderr.write("Platform %s not supported\n" % (conf.platform))
            sys.exit(1)

        ext = tcgaConfig[conf.platform](conf)
        ext.run()


    if options.clinical:
        if options.mirror is None:
            sys.stderr.write("Need mirror location\n")
            sys.exit(1)
        
        q = CustomQuery("Archive[@isLatest=1][Disease[@abbreviation=%s]][Platform[@alias=bio]]" % (options.clinical))
        basenames = {}
        for s in q:
            basenames[s['baseName']] = True

        for base in basenames:
            conf = getBaseBuildConf(base, 1, options.mirror)
            conf.addOptions(options)
            
            ext = tcgaConfig[conf.platform](conf)
            ext.run()

    if options.clinical_basename:
        if options.mirror is None:
            sys.stderr.write("Need mirror location\n")
            sys.exit(1)
        

        conf = getBaseBuildConf(options.clinical_basename, 1, options.mirror)
        conf.addOptions(options)
        
        ext = tcgaConfig[conf.platform](conf)
        ext.run()