Mercurial > repos > kellrott > tcga_import

Binary file tcga_import/._tcgaImport.py has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tcga_import/tcgaImport.py	Tue Oct 30 14:23:49 2012 -0400
@@ -0,0 +1,1444 @@
+#!/usr/bin/env python
+
+
+"""
+Script to scan and extract TCGA data and compile it into the cgData format
+
+Usage::
+
+    tcga2cgdata.py [options]
+
+Options::
+
+      -h, --help            show this help message and exit
+      -a, --platform-list   Get list of platforms
+      -p PLATFORM, --platform=PLATFORM
+                            Platform Selection
+      -l, --supported       List Supported Platforms
+      -f FILELIST, --filelist=FILELIST
+                            List files needed to convert TCGA project basename
+                            into cgData
+      -b BASENAME, --basename=BASENAME
+                            Convert TCGA project basename into cgData
+      -m MIRROR, --mirror=MIRROR
+                            Mirror Location
+      -w WORKDIR_BASE, --workdir=WORKDIR_BASE
+                            Working directory
+      -o OUTDIR, --out-dir=OUTDIR
+                            Working directory
+      -c CANCER, --cancer=CANCER
+                            List Archives by cancer type
+      -d DOWNLOAD, --download=DOWNLOAD
+                            Download files for archive
+      -e LEVEL, --level=LEVEL
+                            Data Level
+      -s CHECKSUM, --check-sum=CHECKSUM
+                            Check project md5
+      -r, --sanitize        Remove race/ethnicity from clinical data
+
+
+Example::
+
+    ./scripts/tcga2cgdata.py -b intgen.org_KIRC_bio -m /inside/depot -e 1 -r -w tmp
+
+
+"""
+
+from xml.dom.minidom import parseString
+import urllib
+import urllib2
+import os
+import csv
+import sys
+import hashlib
+import tempfile
+import re
+import copy
+import json
+import datetime
+import hashlib
+import subprocess
+from glob import glob
+import shutil
+import subprocess
+from argparse import ArgumentParser
+
+
+
+
+"""
+
+Net query code
+
+"""
+
+class dccwsItem(object):
+    baseURL = "http://tcga-data.nci.nih.gov/tcgadccws/GetXML?query="
+
+    def __init__(self):
+        self.url = None
+
+    def __iter__(self):
+        next = self.url
+        while next != None:
+            handle = urllib.urlopen(next)
+            data = handle.read()
+            handle.close()
+            dom = parseString(data)
+            # there might not be any archives for a dataset
+            if len(dom.getElementsByTagName('queryResponse')) > 0:
+                response = dom.getElementsByTagName('queryResponse').pop()
+                classList = response.getElementsByTagName('class')
+                for cls in classList:
+                    className = cls.getAttribute("recordNumber")
+                    outData = {}
+                    #aObj = Archive()
+                    for node in cls.childNodes:
+                        nodeName = node.getAttribute("name")
+                        if node.hasAttribute("xlink:href"):
+                            outData[ nodeName ] = node.getAttribute("xlink:href")
+                        else:
+                            outData[ nodeName ] = getText( node.childNodes )
+                    yield outData
+            if len( dom.getElementsByTagName('next') ) > 0:
+                nextElm = dom.getElementsByTagName('next').pop()
+                next = nextElm.getAttribute( 'xlink:href' )
+            else:
+                next = None
+
+
+class CustomQuery(dccwsItem):
+    def __init__(self, query):
+        super(CustomQuery, self).__init__()
+        if query.startswith("http://"):
+            self.url = query
+        else:
+            self.url = dccwsItem.baseURL + query
+
+
+def getText(nodelist):
+    rc = []
+    for node in nodelist:
+        if node.nodeType == node.TEXT_NODE:
+            rc.append(node.data)
+    return ''.join(rc)
+
+"""
+
+Build Configuration
+
+"""
+
+class BuildConf:
+    def __init__(self, platform, name, version, meta, tarlist):
+        self.platform = platform
+        self.name = name
+        self.version = version
+        self.meta = meta
+        self.tarlist = tarlist
+        self.abbr = ''
+        self.uuid_table = None
+        if 'diseaseAbbr' in meta:
+            self.abbr = meta['diseaseAbbr']
+
+    def addOptions(self, opts):
+        self.workdir_base = opts.workdir_base
+        self.outdir = opts.outdir
+        self.sanitize = opts.sanitize
+        self.outpath = opts.outpath
+        self.metapath = opts.metapath
+        self.errorpath = opts.errorpath
+        self.clinical_type = opts.clinical_type
+
+        self.clinical_type_map = {}
+        for t, path, meta in opts.out_clinical:
+            self.clinical_type_map[ "." + t] = (path, meta)
+
+        if opts.uuid_table is not None:
+            self.uuid_table = {}
+            handle = open(opts.uuid_table)
+            for line in handle:
+                tmp = line.rstrip().split("\t")
+                self.uuid_table[tmp[0]] = tmp[1]
+
+    def translateUUID(self, uuid):
+        if self.uuid_table is None or uuid not in self.uuid_table:
+            return uuid
+        return self.uuid_table[uuid]
+
+    def getOutPath(self, name):
+        if self.outpath is not None:
+            return self.outpath
+        if name in self.clinical_type_map:
+            return self.clinical_type_map[name][0]
+        return os.path.join(self.outdir, self.name) + name
+
+    def getOutMeta(self, name):
+        if self.outpath is not None:
+            if self.metapath is not None:
+                return self.metapath
+            return self.outpath + ".json"
+        if name in self.clinical_type_map:
+            return self.clinical_type_map[name][1]
+        return os.path.join(self.outdir, self.name) + name + ".json"
+
+    def getOutError(self, name):
+        if self.outpath is not None:
+            if self.errorpath is not None:
+                return self.errorpath
+            return self.outpath + ".error"
+        return os.path.join(self.outdir, self.name) + name + ".error"
+
+
+def getBaseBuildConf(basename, level, mirror):
+    dates = []
+    print "TCGA Query for: ", basename
+    q = CustomQuery("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=Level_%s]]" % (basename, level))
+    urls = {}
+    meta = None
+    platform = None
+    for e in q:
+        dates.append( datetime.datetime.strptime( e['addedDate'], "%m-%d-%Y" ) )
+        if meta is None:
+            meta = {"sourceUrl" : []}
+            for e2 in CustomQuery(e['platform']):
+                platform = e2['name']
+                meta['platform'] = e2['name']
+                meta['platformTitle'] = e2['displayName']
+            for e2 in CustomQuery(e['disease']):
+                meta['diseaseAbbr'] = e2['abbreviation']
+                meta['diseaseTitle'] = e2['name']
+                for e3 in CustomQuery(e2['tissueCollection']):
+                    meta['tissue'] = e3['name']
+            for e2 in CustomQuery(e['center']):
+                meta['centerTitle'] = e2['displayName']
+                meta['center'] = e2['name']
+        meta['sourceUrl'].append( "http://tcga-data.nci.nih.gov/" + e['deployLocation'] )
+        urls[ mirror + e['deployLocation'] ] = platform
+
+    print "TCGA Query for mage-tab: ", basename
+    q = CustomQuery("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=mage-tab]]" % (basename))
+    for e in q:
+        dates.append( datetime.datetime.strptime( e['addedDate'], "%m-%d-%Y" ) )
+        q2 = CustomQuery(e['platform'])
+        platform = None
+        for e2 in q2:
+            print e2
+            platform = e2['name']
+        meta['sourceUrl'].append( "http://tcga-data.nci.nih.gov/" + e['deployLocation'] )
+        urls[ mirror + e['deployLocation'] ] = platform
+
+    if len(dates) == 0:
+        print "No Files found"
+        return
+    dates.sort()
+    dates.reverse()
+    versionDate = dates[0].strftime( "%Y-%m-%d" )
+
+    return BuildConf(platform, basename, versionDate, meta, urls)
+
+
+
+
+
+class TableReader:
+    def __init__(self, path):
+        self.path = path
+
+    def __iter__(self):
+        if self.path is not None and os.path.exists(self.path):
+            handle = open(self.path)
+            for line in handle:
+                tmp = line.rstrip().split("\t")
+                yield tmp[0], json.loads(tmp[1])
+            handle.close()
+
+
+class FileImporter:
+
+    fileInclude = None
+    fileExclude = None
+
+    excludes = [
+         "MANIFEST.txt$",
+         "CHANGES_DCC.txt$",
+         "README_DCC.txt$",
+         "README.txt$",
+         "CHANGES.txt$",
+         "DCC_ALTERED_FILES.txt$",
+         r'.wig$',
+         "DESCRIPTIO$"
+    ]
+
+    def __init__(self, config):
+        self.config = config
+
+    def extractTars(self):
+        self.work_dir = tempfile.mkdtemp(dir=self.config.workdir_base)
+        print "Extract to ", self.work_dir
+        for path in self.config.tarlist:
+            subprocess.check_call([ "tar", "xvzf", path, "-C", self.work_dir], stderr=sys.stdout)
+
+    def run(self):
+        self.extractTars()
+
+        filterInclude = None
+        filterExclude = None
+        if self.fileInclude is not None:
+            filterInclude = re.compile(self.fileInclude)
+        if self.fileExclude is not None:
+            filterExclude = re.compile(self.fileExclude)
+        self.inc = 0
+        self.out = {}
+        self.errors = []
+        self.ext_meta = {}
+        self.scandirs(self.work_dir, filterInclude, filterExclude)
+        for o in self.out:
+            self.out[o].close()
+        self.fileBuild()
+        #shutil.rmtree(self.work_dir)
+
+    def checkExclude( self, name ):
+        for e in self.excludes:
+            if re.search( e, name ):
+                return True
+        return False
+
+    def scandirs(self, path, filterInclude=None, filterExclude=None):
+        if os.path.isdir(path):
+            for a in glob(os.path.join(path, "*")):
+                self.scandirs(a, filterInclude, filterExclude)
+        else:
+            name = os.path.basename(path)
+            if self.isMage(path):
+                self.mageScan(path)
+            else:
+                if not self.checkExclude(name):
+                    if (filterInclude is None or filterInclude.match(name)) and (filterExclude is None or not filterExclude.match(name)):
+                        self.fileScan(path)
+
+    def isMage(self, path):
+        if path.endswith( '.sdrf.txt' ) or path.endswith( '.idf.txt' ) or path.endswith("DESCRIPTION.txt"):
+            return True
+
+
+    def emit(self, key, data, port):
+        if port not in self.out:
+            self.out[port] = open(self.work_dir + "/" + port, "w")
+        self.out[port].write( "%s\t%s\n" % (key, json.dumps(data)))
+
+    def emitFile(self, name, meta, file):
+        md5 = hashlib.md5()
+        oHandle = open(self.config.getOutPath(name), "wb")
+        with open(file,'rb') as f:
+            for chunk in iter(lambda: f.read(8192), ''):
+                md5.update(chunk)
+                oHandle.write(chunk)
+        oHandle.close()
+        md5str = md5.hexdigest()
+        meta['md5'] = md5str
+        mHandle = open(self.config.getOutMeta(name), "w")
+        mHandle.write( json.dumps(meta))
+        mHandle.close()
+        if len(self.errors):
+            eHandle = open( self.config.getOutError(name), "w" )
+            for msg in self.errors:
+                eHandle.write( msg + "\n" )
+            eHandle.close()
+
+    def addError(self, msg):
+        self.errors.append(msg)
+
+
+# Column names found in input data files mapped to the normalized field
+# names used by the importers; the fileScan methods rewrite any header
+# entry that appears as a key here.
+commonMap = {
+    "mean" : "seg.mean",
+    "Segment_Mean" : "seg.mean",
+    "Start" : "loc.start",
+    "End" : "loc.end",
+    "Chromosome" : "chrom"
+}
+
+
+# Row labels of an .idf.txt file mapped to the metadata keys under which
+# mageScan stores the row's value in ext_meta.
+idfMap = {
+    "Investigation Title" : "title",
+    "Experiment Description" : "experimentalDescription",
+    "Person Affiliation" : "dataProducer",
+    "Date of Experiment" : "experimentalDate"
+}
+
+class TCGAGeneticImport(FileImporter):
+
+
+
+    def mageScan(self, path):
+        if path.endswith(".sdrf.txt"):
+            iHandle = open(path, "rU")
+            read = csv.reader( iHandle, delimiter="\t" )
+            colNum = None
+            for row in read:
+                if colNum is None:
+                    colNum = {}
+                    for i in range(len(row)):
+                        colNum[ row[i] ] = i
+                else:
+                    if not colNum.has_key("Material Type") or ( not row[ colNum[ "Material Type" ] ] in [ "genomic_DNA", "total_RNA", "MDA cell line" ] ):
+                        try:
+                            if colNum.has_key( "Derived Array Data File" ):
+                                self.emit( row[ colNum[ "Derived Array Data File" ] ].split('.')[0], row[ colNum[ "Extract Name" ] ], "targets" )
+                                self.emit( row[ colNum[ "Derived Array Data File" ] ], row[ colNum[ "Extract Name" ] ], "targets" )
+                            if colNum.has_key("Derived Array Data Matrix File" ):
+                                self.emit( row[ colNum[ "Derived Array Data Matrix File" ] ], row[ colNum[ "Extract Name" ] ], "targets" )
+                            if colNum.has_key( "Derived Data File"):
+                                self.emit( row[ colNum[ "Derived Data File" ] ].split('.')[0], row[ colNum[ "Extract Name" ] ], "targets" )
+                                self.emit( row[ colNum[ "Derived Data File" ] ], row[ colNum[ "Extract Name" ] ], "targets" )
+                            if colNum.has_key( "Hybridization Name" ):
+                                self.emit( row[ colNum[ "Hybridization Name" ] ] , row[ colNum[ "Extract Name" ] ], "targets" )
+                            if colNum.has_key( "Sample Name" ):
+                                self.emit( row[ colNum[ "Sample Name" ] ] , row[ colNum[ "Extract Name" ] ], "targets" )
+                            self.emit( row[ colNum[ "Extract Name" ] ] , row[ colNum[ "Extract Name" ] ], "targets" )
+                        except IndexError:
+                            pass #there can be blank lines in the SDRF
+        if path.endswith(".idf.txt"):
+            iHandle = open(path)
+            for line in iHandle:
+                row = line.split("\t")
+                if len(row):
+                    if row[0] in idfMap:
+                        self.ext_meta[ idfMap[row[0]] ] = row[1]
+            iHandle.close()
+        if path.endswith("DESCRIPTION.txt"):
+            handle = open(path)
+            self.ext_meta['description'] = handle.read()
+            handle.close()
+
+    def translateUUID(self, uuid):
+        return self.config.translateUUID(uuid)
+
+    def getTargetMap(self):
+        subprocess.call("sort -k 1 %s/targets > %s/targets.sort" % (self.work_dir, self.work_dir), shell=True)
+        handle = TableReader(self.work_dir + "/targets.sort")
+        tTrans = {}
+        for key, value in handle:
+            tTrans[ key ] = value
+        return tTrans
+
+    def fileScan(self, path):
+        """
+        This function takes a TCGA level 3 genetic file (file name and input handle),
+        and tries to extract probe levels or target mappings (experimental ID to TCGA barcode)
+        it emits these values to a handle, using the 'targets' and 'probes' string to identify
+        the type of data being emited
+        """
+        iHandle = open(path)
+        mode = None
+        #modes
+        #1 - segmentFile - one sample per file/no sample info inside file
+        #2 - two col header matrix file
+        #3 - segmentFile - sample information inside file
+        target = None
+        colName = None
+        colType = None
+        for line in iHandle:
+            if colName is None:
+                colName = line.rstrip().split("\t")
+                if colName[0] == "Hybridization REF" or colName[0] == "Sample REF":
+                    mode=2
+                elif colName[0] == "Chromosome" or colName[0] == "chromosome":
+                    mode=1
+                    target=os.path.basename( path ).split('.')[0] #seg files are named by the filename before the '.' extention
+                elif colName[1] == "chrom":
+                    mode = 3
+                    target=os.path.basename( path ).split('.')[0] #seg files are named by the filename before the '.' extention
+
+                for i in range(len(colName)):
+                    if commonMap.has_key( colName[i] ):
+                        colName[i] = commonMap[ colName[i] ]
+            elif mode==2 and colType is None:
+                colType=line.rstrip().split("\t")
+                for i in range(len(colType)):
+                    if commonMap.has_key( colType[i] ):
+                        colType[i] = commonMap[ colType[i] ]
+            else:
+                tmp = line.rstrip().split("\t")
+                if mode == 2:
+                    out={}
+                    for col in colName[1:]:
+                        out[ col ] = { "target" : col }
+                    for i in range(1,len(colType)):
+                        try:
+                            if colType[i] in self.probeFields:
+                                out[ colName[i] ][ colType[i] ] = tmp[i]
+                        except IndexError:
+                            out[ colName[i] ][ colType[i] ] = "NA"
+                    for col in out:
+                        self.emit( tmp[0], out[col], "probes" )
+                else:
+                    out = {}
+                    for i in range(len(colName)):
+                        out[ colName[i] ] = tmp[i]
+                    out['file'] = os.path.basename(path)
+                    if mode==1:
+                        self.emit( target, out, "segments" )
+                    elif mode == 3:
+                        self.emit( tmp[0], out, "segments" )
+                    else:
+                        self.emit( tmp[0], out, "probes" )
+
+
+
+
+class TCGASegmentImport(TCGAGeneticImport):
+
+
+    def fileScan(self, path):
+        """
+        This function takes a TCGA level 3 genetic file (file name and input handle),
+        and tries to extract probe levels or target mappings (experimental ID to TCGA barcode)
+        it emits these values to a handle, using the 'targets' and 'probes' string to identify
+        the type of data being emited
+        """
+        iHandle = open(path)
+        mode = None
+        #modes
+        #1 - segmentFile - one sample per file/no sample info inside file
+        #2 - segmentFile - sample information inside file
+        target = None
+        colName = None
+        colType = None
+        for line in iHandle:
+            if colName is None:
+                colName = line.rstrip().split("\t")
+                if colName[0] == "Chromosome" or colName[0] == "chromosome":
+                    mode=1
+                    target=os.path.basename( path ).split('.')[0] #seg files are named by the filename before the '.' extention
+                elif colName[1] == "chrom":
+                    mode = 2
+
+                for i in range(len(colName)):
+                    if commonMap.has_key( colName[i] ):
+                        colName[i] = commonMap[ colName[i] ]
+            else:
+                tmp = line.rstrip().split("\t")
+                out = {}
+                for i in range(len(colName)):
+                    out[ colName[i] ] = tmp[i]
+                out['file'] = os.path.basename(path)
+                if mode==1:
+                    self.emit( target, out, "segments" )
+                elif mode == 2:
+                    self.emit( tmp[0], out, "segments" )
+
+
+    def getMeta(self, name):
+        matrixInfo = {
+            '@context' : "http://purl.org/cgdata/",
+            '@type' : 'bed5',
+            '@id' : name,
+            "lastModified" : self.config.version,
+            'rowKeySrc' : {
+                    '@type' :  'idDAG',
+                    '@id' : "tcga.%s" % (self.config.abbr)
+            },
+            'dataSubType' : { "@id" : self.dataSubType },
+            'dataProducer' : 'TCGA Import',
+            "accessMap" : "public", "redistribution" : "yes"
+        }
+        matrixInfo.update(self.ext_meta)
+        matrixInfo.update(self.config.meta)
+        return matrixInfo
+
+    def fileBuild(self):
+        #use the target table to create a name translation table
+        #also setup target name enumeration, so they will have columns
+        #numbers
+
+        tTrans = self.getTargetMap()
+        subprocess.call("sort -k 1 %s/segments > %s/segments.sort" % (self.work_dir, self.work_dir), shell=True)
+        sHandle = TableReader(self.work_dir + "/segments.sort")
+
+        segFile = None
+        curName = None
+
+        curData = {}
+        missingCount = 0
+
+        startField  = "loc.start"
+        endField    = "loc.end"
+        valField    = "seg.mean"
+        chromeField = "chrom"
+
+        segFile = None
+
+        for key, value in sHandle:
+            if segFile is None:
+                segFile = open("%s/segment_file"  % (self.work_dir), "w")
+            try:
+                curName = self.translateUUID(tTrans[key]) # "-".join( tTrans[ key ].split('-')[0:4] )
+                if curName is not None:
+                    try:
+                        chrom = value[ chromeField ].lower()
+                        if not chrom.startswith("chr"):
+                            chrom = "chr" + chrom
+                        chrom = chrom.upper().replace("CHR", "chr")
+                        #segFile.write( "%s\t%s\t%s\t%s\t.\t%s\n" % ( curName, chrom, int(value[ startField ])+1, value[ endField ], value[ valField ] ) )
+                        segFile.write( "%s\t%s\t%s\t%s\t%s\n" % ( chrom, value[ startField ], value[ endField ], curName, value[ valField ] ) )
+                    except KeyError:
+                         self.addError( "Field error: %s" % (str(value)))
+            except KeyError:
+                self.addError( "TargetInfo Not Found: %s" % (key))
+
+        segFile.close()
+        matrixName = self.config.name
+
+        self.emitFile( "", self.getMeta(matrixName), "%s/segment_file"  % (self.work_dir) )
+
+
+
+class TCGAMatrixImport(TCGAGeneticImport):
+
+    def getMeta(self, name):
+        matrixInfo = {
+            "@context" : 'http://purl.org/cgdata/',
+            '@type' : 'genomicMatrix',
+            '@id' : name,
+            "lastModified" : self.config.version,
+            'dataSubType' : { "@id" : self.dataSubType },
+            'dataProducer' : 'TCGA',
+            "accessMap" : "public",
+            "redistribution" : "yes",
+            'rowKeySrc' : {
+                "@type" : "probe", "@id" : self.probeMap
+            },
+            'columnKeySrc' : {
+                "@type" : "idDAG", "@id" :  "tcga.%s" % (self.config.abbr)
+            }
+        }
+        matrixInfo.update(self.ext_meta)
+        matrixInfo.update(self.config.meta)
+        return matrixInfo
+
+    def fileBuild(self):
+        #use the target table to create a name translation table
+        #also setup target name enumeration, so they will have columns
+        #numbers
+
+        subprocess.call("sort -k 1 %s/probes > %s/probes.sort" % (self.work_dir, self.work_dir), shell=True)
+        subprocess.call("sort -k 1 %s/targets > %s/targets.sort" % (self.work_dir, self.work_dir), shell=True)
+
+        handles = {}
+        handles[ "geneticExtract:targets" ] = TableReader(self.work_dir + "/targets.sort")
+        handles[ "geneticExtract:probes" ] = TableReader(self.work_dir + "/probes.sort")
+
+        tTrans = self.getTargetMap()
+
+        tEnum = {}
+        for t in tTrans:
+            tlabel = self.translateUUID(tTrans[t])
+            if tlabel is not None and tlabel not in tEnum:
+                tEnum[tlabel] = len(tEnum)
+
+        matrixFile = None
+        segFile = None
+
+        curName = None
+        curData = {}
+        missingCount = 0
+        rowCount = 0
+        pHandle = handles["geneticExtract:probes"]
+        for key, value in pHandle:
+            if matrixFile is None:
+                matrixFile = open("%s/matrix_file" % (self.work_dir), "w" )
+                out = ["NA"] * len(tEnum)
+                for target in tEnum:
+                    out[ tEnum[ target ] ] = target
+                matrixFile.write( "%s\t%s\n" % ( "#probe", "\t".join( out ) ) )
+
+            if curName != key:
+                if curName is not None:
+                    out = ["NA"] * len(tEnum)
+                    for target in curData:
+                        try:
+                            ttarget = self.translateUUID(tTrans[target])
+                            if ttarget is not None:
+                                out[ tEnum[ ttarget ] ] = str( curData[ target ] )
+                        except KeyError:
+                            self.addError( "TargetInfo Not Found: %s" % (target))
+                    if out.count("NA") != len(tEnum):
+                        rowCount += 1
+                        matrixFile.write( "%s\t%s\n" % ( curName, "\t".join( out ) ) )
+                curName = key
+                curData = {}
+            if "target" in value:
+                for probeField in self.probeFields:
+                    if probeField in value:
+                        curData[ value[ "target" ] ] = value[ probeField ]
+            elif "file" in value:
+                for probeField in self.probeFields:
+                    if probeField in value:
+                        curData[ value[ "file" ] ] = value[ probeField ]
+        matrixFile.close()
+        matrixName = self.config.name
+        if rowCount > 0:
+            self.emitFile( "", self.getMeta(matrixName), "%s/matrix_file"  % (self.work_dir) )
+
+
+# XML namespace URI used by TCGAClinicalImport.parseXMLFile to locate the "admin" elements
+adminNS = "http://tcga.nci/bcr/xml/administration/2.3"
+
+
+class TCGAClinicalImport(FileImporter):
+
+    def fileScan(self, path):
+        handle = open(path)
+        data = handle.read()
+        handle.close()
+        xml=parseString(data)
+        self.parseXMLFile(xml)
+
+    def getText(self, nodelist):
+        rc = []
+        for node in nodelist:
+            if node.nodeType == node.TEXT_NODE:
+                rc.append(node.data)
+        return ''.join(rc)
+
+
+    def parseXMLFile(self, dom):
+        admin = {}
+        for node in dom.getElementsByTagNameNS( adminNS, "admin"):
+            for cNode in node.childNodes:
+                if cNode.nodeType == cNode.ELEMENT_NODE:
+                    admin[ cNode.localName ] = {}
+                    admin[ cNode.localName ]['value'] = getText( cNode.childNodes )
+
+        name = None
+        patient = {}
+        patientName = None
+        for node in dom.childNodes[0].childNodes:
+            if node.nodeType == node.ELEMENT_NODE:
+                if node.localName == 'patient':
+                    for elm in node.childNodes:
+                        if elm.nodeType == elm.ELEMENT_NODE:
+                            if ( elm.localName == 'bcr_patient_barcode' ):
+                                name = getText( elm.childNodes )
+                                patientName = name
+
+                            if ( elm.getAttribute( 'procurement_status' ) == "Completed" ):
+                                patient[ elm.localName ] = {}
+                                patient[ elm.localName ]['value'] = getText( elm.childNodes )
+                                patient[ elm.localName ]['tier']  = elm.getAttribute( 'tier' )
+                                patient[ elm.localName ]['precision'] = elm.getAttribute( 'precision' )
+
+                            if elm.prefix == "auxiliary":
+                                for aux in elm.childNodes:
+                                    if aux.nodeType == aux.ELEMENT_NODE:
+                                        for auxval in aux.childNodes:
+                                            if auxval.nodeType == auxval.ELEMENT_NODE:
+                                                patient[ auxval.localName ] = {}
+                                                patient[ auxval.localName ]['value'] = getText( auxval.childNodes )
+                                                patient[ auxval.localName ]['tier']  = auxval.getAttribute( 'tier' )
+                                                patient[ auxval.localName ]['precision'] = auxval.getAttribute( 'precision' )
+
+        if name is not None:
+            for key in admin:
+                patient[ key ] = admin[ key ]
+            self.emit( name, patient, "patient" )
+
+        for node in dom.childNodes[0].childNodes:
+            if node.nodeType == node.ELEMENT_NODE and node.localName == 'patient':
+                for samples in node.childNodes:
+                    if samples.nodeType == samples.ELEMENT_NODE and samples.localName == 'samples':
+                        for sample in samples.childNodes:
+                            if sample.nodeType == samples.ELEMENT_NODE and sample.localName == 'sample':
+                                sampleData = {}
+                                for value in sample.childNodes:
+                                    if value.nodeType == value.ELEMENT_NODE:
+                                        if value.localName == 'bcr_sample_barcode' :
+                                            name = getText( value.childNodes )
+                                        if value.getAttribute( 'procurement_status' ) == "Completed" :
+                                            sampleData[ value.localName ] = {}
+                                            sampleData[ value.localName ]['value'] = getText( value.childNodes )
+
+                                        if value.localName == 'portions' :
+                                            for portions in value.childNodes:
+                                                if portions.nodeType == value.ELEMENT_NODE and portions.localName == "portion":
+                                                    portionName = None
+                                                    portionData = {}
+                                                    for portion in portions.childNodes:
+                                                        if portion.nodeType == value.ELEMENT_NODE:
+                                                            if portion.localName == "analytes":
+                                                                for analytes in portion.childNodes:
+                                                                    if analytes.nodeType == analytes.ELEMENT_NODE and analytes.localName =="analyte":
+                                                                        analyteName = None
+                                                                        analyteData = {}
+                                                                        for analyte in analytes.childNodes:
+                                                                            if analyte.nodeType == value.ELEMENT_NODE:
+                                                                                if analyte.localName == "aliquots":
+                                                                                    for aliquots in analyte.childNodes:
+                                                                                        if aliquots.nodeType == aliquots.ELEMENT_NODE and aliquots.localName =="aliquot":
+                                                                                            aliquotName = None
+                                                                                            aliquotData = {}
+                                                                                            for aliquot in aliquots.childNodes:
+                                                                                                if aliquot.nodeType == value.ELEMENT_NODE:
+                                                                                                    if aliquot.localName == "bcr_aliquot_barcode":
+                                                                                                        aliquotName = getText(aliquot.childNodes)
+                                                                                                    if aliquot.getAttribute( 'procurement_status' ) == "Completed" :
+                                                                                                        aliquotData[ aliquot.localName ] = {}
+                                                                                                        aliquotData[ aliquot.localName ]['value'] = getText( aliquot.childNodes )
+                                                                                            if aliquotName is not None and len(aliquotData):
+                                                                                                self.emit( aliquotName, aliquotData, 'aliquot' )
+
+
+                                                                                if analyte.localName == "bcr_analyte_barcode":
+                                                                                    analyteName = getText(analyte.childNodes)
+                                                                                if analyte.getAttribute( 'procurement_status' ) == "Completed" :
+                                                                                    analyteData[ analyte.localName ] = {}
+                                                                                    analyteData[ analyte.localName ]['value'] = getText( analyte.childNodes )
+                                                                        if analyteName is not None and len(analyteData):
+                                                                            self.emit( analyteName, analyteData, 'analyte' )
+
+                                                            if portion.localName == "bcr_portion_barcode":
+                                                                portionName = getText( portion.childNodes )
+                                                            if portion.getAttribute( 'procurement_status' ) == "Completed" :
+                                                                portionData[ portion.localName ] = {}
+                                                                portionData[ portion.localName ]['value'] = getText( portion.childNodes )
+                                                    if portionName is not None and len(portionData):
+                                                        self.emit( portionName, portionData, 'portion' )
+
+
+                                #patientName = re.sub( r'\-...$', "", name )
+                                self.emit( name, sampleData, "sample" )
+                                self.emit( name, patient, "sample")
+                    elif samples.nodeType == samples.ELEMENT_NODE and samples.localName == 'drugs':
+                        for drug in samples.childNodes:
+                            if drug.nodeType == samples.ELEMENT_NODE and drug.localName == 'drug':
+                                drugData = {}
+                                for value in drug.childNodes:
+                                    if value.nodeType == value.ELEMENT_NODE:
+                                        if value.localName == 'bcr_drug_barcode' :
+                                            name = getText( value.childNodes )
+                                        if value.getAttribute( 'procurement_status' ) == "Completed" :
+                                            drugData[ value.localName ] = {}
+                                            drugData[ value.localName ]['value'] = getText( value.childNodes )
+
+                                #patientName = re.sub( r'\-...$', "", name )
+                                self.emit( patientName, drugData, "drug" )
+                    elif samples.nodeType == samples.ELEMENT_NODE and samples.localName == 'radiations':
+                        for rad in samples.childNodes:
+                            if rad.nodeType == samples.ELEMENT_NODE and rad.localName == 'radiation':
+                                radData = {}
+                                for value in rad.childNodes:
+                                    if value.nodeType == value.ELEMENT_NODE:
+                                        if value.localName == 'bcr_radiation_barcode' :
+                                            name = getText( value.childNodes )
+                                        if value.getAttribute( 'procurement_status' ) == "Completed" :
+                                            radData[ value.localName ] = {}
+                                            radData[ value.localName ]['value'] = getText( value.childNodes )
+
+                                #patientName = re.sub( r'\-...$', "", name )
+                                self.emit( patientName, radData, "radiation" )
+
+
+
+    def getMeta(self, name):
+        """
+        Build the metadata dictionary that describes the clinical matrix `name`.
+
+        The built-in fields are overlaid first with self.ext_meta and then with
+        self.config.meta, so on duplicate keys self.config.meta wins.
+        """
+        meta = {}
+        meta["@context"] = "http://purl.org/cgdata/"
+        meta["@type"] = "clinicalMatrix"
+        meta["@id"] = name
+        meta["lastModified"] = self.config.version
+        meta['dataSubType'] = { "@id" : "clinical" }
+        meta["rowKeySrc"] = {
+            "@type" : "idDAG",
+            "@id" :  "tcga.%s" % (self.config.abbr),
+        }
+        # Later overlays take precedence over earlier ones.
+        for overlay in (self.ext_meta, self.config.meta):
+            meta.update(overlay)
+        return meta
+
+    def fileBuild(self):
+        """
+        Assemble one tab-delimited clinical matrix per record type and emit it.
+
+        For every matrix name (all seven record types, or only
+        self.config.clinical_type when that is set) whose scan output exists in
+        self.work_dir, the emitted records are merged per row key, written to
+        "<work_dir>/<matrixName>_file" with a "sample" header column, and handed
+        to self.emitFile together with the metadata from self.getMeta.
+
+        When self.config.sanitize is true the 'race' and 'ethnicity' columns are
+        left out of the output.
+        """
+        matrixList = [ "patient", "sample", "radiation", "drug", "portion", "analyte", "aliquot" ]
+        if self.config.clinical_type is not None:
+            matrixList = [ self.config.clinical_type ]
+
+        for matrixName in matrixList:
+            srcPath = "%s/%s" % (self.work_dir, matrixName)
+            if not os.path.exists( srcPath ):
+                continue
+
+            sortPath = self.work_dir + "/" + matrixName + ".sort"
+            # An argument list (no shell) keeps paths containing spaces or shell
+            # metacharacters from being re-interpreted.
+            subprocess.call( ["sort", "-k", "1", "-o", sortPath, srcPath] )
+
+            matrix = {}
+            colEnum = {}
+            for key, value in TableReader(sortPath):
+                if key not in matrix:
+                    matrix[key] = {}
+                for col in value:
+                    matrix[key][col] = value[col]
+                    if col not in colEnum:
+                        if not self.config.sanitize or col not in [ 'race', 'ethnicity' ]:
+                            # Columns are numbered in order of first appearance.
+                            colEnum[col] = len(colEnum)
+
+            header = [None] * (len(colEnum))
+            for col in colEnum:
+                header[colEnum[col]] = col
+
+            # 'with' guarantees the output is flushed and closed even if a row fails.
+            with open( os.path.join(self.work_dir, matrixName + "_file"), "w") as handle:
+                handle.write("sample\t%s\n" % ("\t".join(header)))
+                # Sorted keys give a reproducible row order; plain dict order is arbitrary.
+                for key in sorted(matrix):
+                    cols = [""] * (len(colEnum))
+                    for col in colEnum:
+                        if col in matrix[key]:
+                            cols[colEnum[col]] = matrix[key][col]['value']
+                    handle.write("%s\t%s\n" % (key, "\t".join(cols).encode("ASCII", "replace")))
+
+            self.emitFile( "." + matrixName, self.getMeta(self.config.name + "." + matrixName), "%s/%s_file"  % (self.work_dir, matrixName))
+
+
+# Import settings for the Agilent gene-expression arrays: tcgaConfig maps the
+# AgilentG4502A_07, AgilentG4502A_07_1, _07_2 and _07_3 platform keys to this class.
+# The attributes are plain class-level constants; presumably they are read by the
+# TCGAMatrixImport base class, which is defined outside this section.
+class AgilentImport(TCGAMatrixImport):
+    dataSubType = 'geneExp'
+    probeMap = 'hugo'
+    sampleMap = 'tcga.iddag'
+    dataType = 'genomicMatrix'
+    # Name of the value column to extract from the data files.
+    probeFields = ['log2 lowess normalized (cy5/cy3) collapsed by gene symbol']
+
+
+# Import settings for the 'CGH-1x1M_G4447A' platform key in tcgaConfig
+# (copy-number segments, value column 'seg.mean').
+class CGH1x1mImport(TCGASegmentImport):
+    dataSubType = 'cna'
+    sampleMap = 'tcga.iddag'
+    dataType = 'genomicSegment'
+    probeFields = ['seg.mean']
+
+class SNP6Import(TCGASegmentImport):
+    """
+    Importer for the 'Genome_Wide_SNP_6' platform key in tcgaConfig.
+
+    Only files ending in ".hg19.seg.txt" are scanned; their rows are emitted to
+    the "hg19_segment" port and later rewritten as a five-column segment file.
+    """
+    assembly = 'hg19'
+    dataSubType = 'cna'
+    sampleMap ='tcga.iddag'
+    dataType = 'genomicSegment'
+    probeFields = ['seg.mean']
+
+    def fileScan(self, path):
+        """
+        Emit every data row of an hg19 segment file.
+
+        The first line is the header; header names found in commonMap are
+        replaced by their mapped name.  Each following row is emitted under its
+        first column, with the remaining columns as a {column name: value} dict.
+        Files that do not end in ".hg19.seg.txt" are ignored.
+        """
+        outport = None
+        #if path.endswith(".hg18.seg.txt"):
+        #    outport = "hg18_segment"
+        if path.endswith(".hg19.seg.txt"):
+            outport = "hg19_segment"
+
+        if outport is None:
+            return
+
+        # 'with' closes the input even when a malformed row raises.
+        with open(path) as handle:
+            colName = None
+            for line in handle:
+                if colName is None:
+                    colName = line.rstrip().split("\t")
+                    for i, col in enumerate(colName):
+                        if col in commonMap:
+                            colName[i] = commonMap[ col ]
+                    continue
+                if not line.strip():
+                    # A blank (e.g. trailing) line carries no segment.
+                    continue
+                tmp = line.rstrip().split("\t")
+                out = {}
+                for i in range(1, len(colName)):
+                    out[ colName[i] ] = tmp[i]
+                self.emit( tmp[0], out, outport )
+
+    def fileBuild(self):
+        """
+        Write "<work_dir>/hg19_segment.out" from the emitted segments and emit it.
+
+        Each output line is: chrom, start, end, translated sample name, value.
+        Rows whose translated name is None are dropped; a KeyError raised while
+        handling a row is recorded through self.addError.  When no rows were
+        emitted at all, an error is recorded and no file is emitted.
+        """
+        tmap = self.getTargetMap()
+
+        startField  = "loc.start"
+        endField    = "loc.end"
+        valField    = "seg.mean"
+        chromeField = "chrom"
+
+        for base in ['hg19']:
+            subprocess.call("sort -k 1 %s/%s_segment > %s/%s_segment.sort" % (self.work_dir, base, self.work_dir, base), shell=True)
+            handle = TableReader(self.work_dir + "/%s_segment.sort" % (base))
+
+            outPath = "%s/%s_segment.out"  % (self.work_dir, base)
+            segFile = None
+            try:
+                for key, value in handle:
+                    if segFile is None:
+                        # Opened lazily so that no output file appears without input rows.
+                        segFile = open(outPath, "w")
+                    try:
+                        curName = self.translateUUID(tmap[key])
+                        if curName is not None:
+                            chrom = value[ chromeField ].lower()
+                            if not chrom.startswith("chr"):
+                                chrom = "chr" + chrom
+                            # Normalise to a lower-case "chr" prefix with an upper-case name (chrX, chr1).
+                            chrom = chrom.upper().replace("CHR", "chr")
+                            segFile.write( "%s\t%s\t%s\t%s\t%s\n" % ( chrom, value[ startField ], value[ endField ], curName, value[ valField ] ) )
+                    except KeyError:
+                        self.addError( "TargetInfo Not Found: %s" % (key))
+            finally:
+                if segFile is not None:
+                    segFile.close()
+
+            if segFile is None:
+                # Previously this path failed with AttributeError on None.close().
+                self.addError( "No segment data found for %s" % (base))
+                continue
+
+            self.emitFile("." + base, self.getMeta(self.config.name + "." + base), outPath)
+
+
+# Import settings shared by the 'H-miRNA_8x15K' and 'H-miRNA_8x15Kv2' platform
+# keys in tcgaConfig (miRNA expression matrix).
+class HmiRNAImport(TCGAMatrixImport):
+    dataSubType = 'miRNAExp'
+    probeMap = 'agilentHumanMiRNA'
+    sampleMap = 'tcga.iddag'
+    dataType = 'genomicMatrix'
+    probeFields = ['unc_DWD_Batch_adjusted']
+
+# Import settings for the 'HG-CGH-244A' platform key in tcgaConfig
+# (copy-number segments, value column 'Segment_Mean').
+class CGH244AImport(TCGASegmentImport):
+    dataSubType = 'cna'
+    sampleMap = 'tcga.iddag'
+    dataType = 'genomicSegment'
+    probeFields = ['Segment_Mean']
+
+# Import settings for the 'HG-CGH-415K_G4124A' platform key in tcgaConfig.
+# chromeField/startField/endField name the chromosome, start and end columns of
+# the segment files; presumably they override defaults of TCGASegmentImport,
+# which is defined outside this section.
+class CGH415K_G4124A(TCGASegmentImport):
+    dataSubType = 'cna'
+    sampleMap = 'tcga.iddag'
+    chromeField = 'Chromosome'
+    dataType = 'genomicSegment'
+    endField = 'End'
+    probeFields = ['Segment_Mean']
+    startField = 'Start'
+
+
+class IlluminaHiSeq_DNASeqC(TCGASegmentImport):
+    """
+    Importer for the 'IlluminaHiSeq_DNASeqC' platform key in tcgaConfig.
+
+    Segment settings match CGH415K_G4124A; in addition, identifiers whose
+    translated name has a '1' right after the 'TCGA-..-....-' prefix are dropped.
+    """
+    dataSubType = 'cna'
+    dataType = 'genomicSegment'
+    sampleMap = 'tcga.iddag'
+    probeFields = ['Segment_Mean']
+    chromeField = 'Chromosome'
+    startField = 'Start'
+    endField = 'End'
+
+    def translateUUID(self, uuid):
+        """
+        Translate `uuid` through self.config.translateUUID.
+
+        Returns None instead of the translated name when it matches
+        '^TCGA-..-....-1' (the original comment calls these "normal ids").
+        """
+        translated = self.config.translateUUID(uuid)
+        isNormal = re.search(r'^TCGA-..-....-1', translated) is not None
+        if isNormal:
+            return None
+        return translated
+
+# Import settings for the 'HT_HG-U133A' platform key in tcgaConfig
+# (gene-expression matrix, value column 'Signal').
+class HT_HGU133A(TCGAMatrixImport):
+    dataSubType = 'geneExp'
+    probeMap = 'affyU133a'
+    sampleMap = 'tcga.iddag'
+    dataType = 'genomicMatrix'
+    probeFields = ['Signal']
+
+# Import settings for the 'HuEx-1_0-st-v2' platform key in tcgaConfig.
+# NOTE(review): dataSubType is 'miRNAExp' although the probe map is 'hugo' and
+# only "gene.txt" files are included -- possibly meant to be 'geneExp'; confirm.
+class HuEx1_0stv2(TCGAMatrixImport):
+    dataSubType = 'miRNAExp'
+    probeMap = 'hugo'
+    sampleMap = 'tcga.iddag'
+    dataType = 'genomicMatrix'
+    probeFields = ['Signal']
+    # Regular expression selecting file names ending in "gene.txt" or "sdrf.txt".
+    fileInclude = '^.*gene.txt$|^.*sdrf.txt$'
+
+# Import settings for the 'Human1MDuo' platform key in tcgaConfig
+# (copy-number segments, value column 'mean').
+class Human1MDuoImport(TCGASegmentImport):
+    dataSubType = 'cna'
+    sampleMap = 'tcga.iddag'
+    dataType = 'genomicSegment'
+    probeFields = ['mean']
+
+# Import settings for the 'HumanHap550' platform key in tcgaConfig
+# (copy-number segments, value column 'mean').
+class HumanHap550(TCGASegmentImport):
+    dataSubType = 'cna'
+    sampleMap = 'tcga.iddag'
+    dataType = 'genomicSegment'
+    probeFields = ['mean']
+
+# Import settings for the 'HumanMethylation27' platform key in tcgaConfig
+# (DNA methylation matrix).
+class HumanMethylation27(TCGAMatrixImport):
+    dataSubType = 'DNAMethylation'
+    probeMap= 'illuminaMethyl27K_gpl8490'
+    sampleMap= 'tcga.iddag'
+    dataType= 'genomicMatrix'
+    # Regular expression for file names to skip (the dots are unescaped wildcards).
+    fileExclude= '.*.adf.txt'
+    # Both capitalisations of the beta-value column name are accepted.
+    probeFields = ['Beta_Value', 'Beta_value']
+
+
+class HumanMethylation450(TCGAMatrixImport):
+    """
+    Importer for the 'HumanMethylation450' platform key in tcgaConfig.
+
+    Overrides fileScan to read matrix files that carry two header lines.
+    """
+    dataSubType =  'DNAMethylation'
+    probeMap =  'illuminaHumanMethylation450'
+    sampleMap =  'tcga.iddag'
+    dataType =  'genomicMatrix'
+    fileExclude = '.*.adf.txt'
+    probeFields =  ['Beta_value', 'Beta_Value']
+
+    def fileScan(self, path):
+        """
+        Scan one data file and emit its probe values on the "probes" port.
+
+        Only files whose first header cell is "Hybridization REF" or
+        "Sample REF" are processed (mode 1): the first line names the target of
+        every column and the second line names the value type of every column.
+        Header cells found in commonMap are replaced by their mapped name.
+
+        For each data row, one record per target is emitted under the row's
+        first column.  A record holds {"target": <column name>} plus every
+        column whose type is listed in self.probeFields, formatted with four
+        decimals, or "NA" when the cell is missing or not a number.
+        Files in any other layout are read but produce no output.
+        """
+        # 'with' closes the input; the original never closed this handle.
+        with open(path) as iHandle:
+            mode = None
+            #modes
+            #1 - two col header matrix file
+            colName = None
+            colType = None
+            for line in iHandle:
+                if colName is None:
+                    colName = line.rstrip().split("\t")
+                    if colName[0] == "Hybridization REF" or colName[0] == "Sample REF":
+                        mode=1
+                    for i, col in enumerate(colName):
+                        if col in commonMap:
+                            colName[i] = commonMap[ col ]
+                elif mode==1 and colType is None:
+                    colType=line.rstrip().split("\t")
+                    for i, col in enumerate(colType):
+                        if col in commonMap:
+                            colType[i] = commonMap[ col ]
+                else:
+                    tmp = line.rstrip().split("\t")
+                    if mode == 1:
+                        out={}
+                        for col in colName[1:]:
+                            out[ col ] = { "target" : col }
+                        for i in range(1,len(colType)):
+                            try:
+                                if colType[i] in self.probeFields:
+                                    out[ colName[i] ][ colType[i] ] = "%.4f" % float(tmp[i])
+                            except (IndexError, ValueError):
+                                # Short row or non-numeric cell.
+                                out[ colName[i] ][ colType[i] ] = "NA"
+                        for col in out:
+                            self.emit( tmp[0], out[col], "probes" )
+
+# Import settings for the 'IlluminaGA_RNASeq' platform key in tcgaConfig
+# (gene expression, value column 'RPKM').
+class Illumina_RNASeq(TCGAMatrixImport):
+    sampleMap= 'tcga.iddag'
+    dataSubType= 'geneExp'
+    # Regular expression selecting gene quantification files and sdrf files.
+    fileInclude= r'^.*\.gene.quantification.txt$|^.*sdrf.txt$'
+    probeFields = ['RPKM']
+    probeMap= 'hugo.unc'
+
+# Import settings for the 'IlluminaHiSeq_RNASeqV2' platform key in tcgaConfig
+# (gene expression, value column 'normalized_count').
+class Illumina_RNASeqV2(TCGAMatrixImport):
+    sampleMap= 'tcga.iddag'
+    dataSubType= 'geneExp'
+    # Regular expression selecting "rsem.genes.normalized_results" files and sdrf files.
+    fileInclude= r'^.*rsem.genes.normalized_results$|^.*sdrf.txt$'
+    probeFields = ['normalized_count']
+    probeMap= 'hugo.unc'
+
+# Import settings for the 'IlluminaHiSeq_RNASeq' platform key in tcgaConfig
+# (gene expression, value column 'RPKM').  Unlike Illumina_RNASeq, the include
+# pattern here does not list sdrf files.
+class IlluminaHiSeq_RNASeq(TCGAMatrixImport):
+    sampleMap= 'tcga.iddag'
+    dataSubType= 'geneExp'
+    fileInclude= r'^.*gene.quantification.txt$'
+    probeFields = ['RPKM']
+    probeMap= 'hugo.unc'
+
+class MDA_RPPA_Core(TCGAMatrixImport):
+    """
+    Importer for the 'MDA_RPPA_Core' platform key in tcgaConfig.
+
+    Antibody annotation files are excluded, and the target map is rebuilt with
+    every ".SD" removed from the mapped values.
+    """
+    dataSubType = "RPPA"
+    probeMap = "md_anderson_antibodies"
+    sampleMap = 'tcga.iddag'
+    probeFields = [ 'Protein Expression' ]
+    fileExclude = r'^.*.antibody_annotation.txt'
+
+    def getTargetMap(self):
+        """
+        Return {key: value} read from the sorted "targets" table in
+        self.work_dir, with each ".SD" occurrence stripped from the value.
+        """
+        subprocess.call("sort -k 1 %s/targets > %s/targets.sort" % (self.work_dir, self.work_dir), shell=True)
+        sortedTargets = TableReader(self.work_dir + "/targets.sort")
+        return dict(
+            (targetKey, re.sub(r'\.SD', '', targetValue))
+            for targetKey, targetValue in sortedTargets
+        )
+
+
+# Import settings shared by the 'IlluminaGA_miRNASeq' and 'IlluminaHiSeq_miRNASeq'
+# platform keys in tcgaConfig (value column 'reads_per_million_miRNA_mapped').
+class Illumina_miRNASeq(TCGAMatrixImport):
+    sampleMap= 'tcga.iddag'
+    dataSubType= 'miRNA'
+    # Regular expression selecting miRNA quantification files.
+    fileInclude= '^.*.mirna.quantification.txt$'
+    probeFields = ['reads_per_million_miRNA_mapped']
+    probeMap= 'hsa.mirna'
+
+
+# Import settings for the 'bio' (clinical) platform key in tcgaConfig; only
+# file names matching the XML pattern below are included.
+class bioImport(TCGAClinicalImport):
+    sampleMap = 'tcga.iddag'
+    fileInclude = '.*.xml$'
+
+# Registry of supported platforms: maps a platform name to the importer class
+# that handles it.  The command-line driver looks up conf.platform here and
+# instantiates the class with the build configuration; supported_list() reports
+# only platforms that have an entry in this table.
+tcgaConfig = {
+    'AgilentG4502A_07' : AgilentImport,
+    'AgilentG4502A_07_1' : AgilentImport,
+    'AgilentG4502A_07_2' : AgilentImport,
+    'AgilentG4502A_07_3': AgilentImport,
+    'CGH-1x1M_G4447A': CGH1x1mImport,
+    'Genome_Wide_SNP_6': SNP6Import,
+    'H-miRNA_8x15K': HmiRNAImport,
+    'H-miRNA_8x15Kv2': HmiRNAImport,
+    'HG-CGH-244A': CGH244AImport,
+    'HG-CGH-415K_G4124A': CGH415K_G4124A,
+    'HT_HG-U133A': HT_HGU133A,
+    'HuEx-1_0-st-v2': HuEx1_0stv2,
+    'Human1MDuo': Human1MDuoImport,
+    'HumanHap550': HumanHap550,
+    'IlluminaHiSeq_DNASeqC' : IlluminaHiSeq_DNASeqC,
+    'HumanMethylation27': HumanMethylation27,
+    'HumanMethylation450': HumanMethylation450,
+    'IlluminaHiSeq_RNASeq': IlluminaHiSeq_RNASeq,
+    'IlluminaGA_RNASeq' : Illumina_RNASeq,
+    'IlluminaHiSeq_RNASeqV2' : Illumina_RNASeqV2,
+    'MDA_RPPA_Core' : MDA_RPPA_Core,
+    'IlluminaGA_miRNASeq' : Illumina_miRNASeq,
+    'IlluminaHiSeq_miRNASeq' : Illumina_miRNASeq,
+    'bio' : bioImport
+}
+
+def fileDigest( file ):
+    """
+    Return the hexadecimal MD5 digest of the file at path `file`.
+
+    The file is read in 8192-byte chunks so large archives are never loaded
+    into memory at once.
+    """
+    md5 = hashlib.md5()
+    with open(file,'rb') as f:
+        # The sentinel must be a bytes literal: a binary read returns bytes, and
+        # on Python 3 bytes never compare equal to '' (the loop would not end).
+        # On Python 2, b'' and '' are the same value.
+        for chunk in iter(lambda: f.read(8192), b''):
+            md5.update(chunk)
+    return md5.hexdigest()
+
+
+def platform_list():
+    """Yield the 'name' of every entry returned by the "Platform" query."""
+    for platformEntry in CustomQuery("Platform"):
+        yield platformEntry['name']
+
+def supported_list():
+    """Yield the platform names from the "Platform" query that have an importer in tcgaConfig."""
+    for platformEntry in CustomQuery("Platform"):
+        if platformEntry['name'] not in tcgaConfig:
+            continue
+        yield platformEntry['name']
+
+def platform_archives(platform):
+    """
+    Yield each distinct baseName among the latest archives of `platform`,
+    in the order the query first returns them.
+    """
+    alreadySeen = set()
+    for archive in CustomQuery("Archive[Platform[@name=%s]][@isLatest=1]" % platform):
+        baseName = archive['baseName']
+        if baseName in alreadySeen:
+            continue
+        yield baseName
+        alreadySeen.add(baseName)
+
+
+if __name__ == "__main__":
+    # Command-line driver.  Every option group below is independent: listing
+    # options print to stdout, --download/--check-sum work against the local
+    # mirror, and --basename/--clinical/--clinical-basename run an importer
+    # selected through tcgaConfig.
+
+    parser = ArgumentParser()
+    #Stack.addJobTreeOptions(parser)
+
+    parser.add_argument("-a", "--platform-list", dest="platform_list", action="store_true", help="Get list of platforms", default=False)
+    parser.add_argument("-u", "--uuid", dest="uuid_table", help="UUID to Barcode Table", default=None)
+    parser.add_argument("-t", "--uuid-download", dest="uuid_download", help="Download UUID/Barcode Table", default=False)
+    parser.add_argument("-z", "--all-archives", dest="all_archives", action="store_true", help="List all archives", default=False)
+    parser.add_argument("-p", "--platform", dest="platform", help="Platform Selection", default=None)
+    parser.add_argument("-l", "--supported", dest="supported_list", action="store_true", help="List Supported Platforms", default=None)
+    parser.add_argument("-f", "--filelist", dest="filelist", help="List files needed to convert TCGA project basename into cgData", default=None)
+    parser.add_argument("-b", "--basename", dest="basename", help="Convert TCGA project basename into cgData", default=None)
+    parser.add_argument("-m", "--mirror", dest="mirror", help="Mirror Location", default=None)
+    parser.add_argument("-w", "--workdir", dest="workdir_base", help="Working directory", default="/tmp")
+    parser.add_argument("--out-dir", dest="outdir", help="Output directory", default="./")
+    parser.add_argument("-o", "--out", dest="outpath", help="Output Dest", default=None)
+    parser.add_argument("--out-error", dest="errorpath", help="Output Error", default=None)
+    parser.add_argument("--out-meta", dest="metapath", help="Output Meta", default=None)
+    parser.add_argument("-c", "--cancer", dest="cancer", help="List Archives by cancer type", default=None)
+    parser.add_argument("-d", "--download", dest="download", help="Download files for archive", default=None)
+    parser.add_argument("-e", "--level", dest="level", help="Data Level ", default="3")
+    parser.add_argument("-s", "--check-sum", dest="checksum", help="Check project md5", default=None)
+    parser.add_argument("-r", "--sanitize", dest="sanitize", action="store_true", help="Remove race/ethnicity from clinical data", default=False)
+    parser.add_argument("-x", "--clinical", dest="clinical", help="Process clinical info", default=None)
+    parser.add_argument("--clinical-basename", dest="clinical_basename", help="Select Clinical Data by basename", default=None)
+    parser.add_argument("--clinical-type", dest="clinical_type", help="Clinical Data Type", default=None)
+    parser.add_argument("--all-clinical", dest="all_clinical", action="store_true", help="List all clinical archives", default=False)
+    parser.add_argument("--out-clinical", dest="out_clinical", action="append", nargs=3, default=[])
+    parser.add_argument("--samples", dest="get_samples", action="store_true", default=False)
+
+    options = parser.parse_args()
+
+    def _printBaseNames(query):
+        """Print each distinct baseName returned by `query`, in first-seen order."""
+        seen = set()
+        for e in CustomQuery(query):
+            name = e['baseName']
+            if name not in seen:
+                print name
+                seen.add(name)
+
+    def _locations(query):
+        """Return the deployLocation of every entry returned by `query`."""
+        return [ e['deployLocation'] for e in CustomQuery(query) ]
+
+    def _archiveLocations(baseName, level):
+        """Return the latest Level_<level> archive locations of `baseName`, then its mage-tab ones."""
+        out = _locations("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=Level_%s]]" % (baseName, level))
+        out.extend(_locations("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=mage-tab]]" % (baseName)))
+        return out
+
+    def _withMd5(locations):
+        """Return `locations` with each entry followed by its ".md5" companion."""
+        out = []
+        for loc in locations:
+            out.append( loc )
+            out.append( loc + ".md5" )
+        return out
+
+    if options.uuid_download:
+        url="https://tcga-data.nci.nih.gov/uuid/uuidBrowserExport.htm"
+        data = {}
+        data['exportType'] = 'tab'
+        data['cols'] = "uuid,barcode"
+        urllib.urlretrieve( url, options.uuid_download, data=urllib.urlencode(data))
+
+    if options.platform_list:
+        for e in platform_list():
+            print e
+
+    if options.supported_list:
+        for e in supported_list():
+            print e
+
+    if options.platform:
+        for name in platform_archives( options.platform ):
+            print name
+
+    if options.all_archives:
+        _printBaseNames("Archive[@isLatest=1][ArchiveType[@type=Level_%s]]" % (options.level))
+
+    if options.all_clinical:
+        _printBaseNames("Archive[@isLatest=1][Platform[@alias=bio]]")
+
+    if options.get_samples:
+        url="https://tcga-data.nci.nih.gov/datareports/aliquotExport.htm"
+        data = {}
+
+        data['exportType'] = 'tab'
+        data['cols'] = 'aliquotId,disease,bcrBatch,center,platform,levelOne,levelTwo,levelThree'
+        data['filterReq'] = json.dumps({"disease":"","levelOne":"","aliquotId":"","center":"","levelTwo":"","bcrBatch":"","platform":"","levelThree":""})
+        data['formFilter'] = json.dumps({"disease":"","levelOne":"","aliquotId":"","center":"","levelTwo":"","bcrBatch":"","platform":"","levelThree":""})
+        handle = urllib.urlopen( url + "?" + urllib.urlencode(data))
+        try:
+            for line in handle:
+                tmp = line.rstrip().split("\t")
+                # Column 7 and character 13 of the aliquot id are read below;
+                # skip rows too short to have them (blank or truncated lines).
+                if len(tmp) < 8 or len(tmp[0]) < 14:
+                    continue
+                if tmp[7] == "Submitted":
+                    if tmp[0][13]=='0':
+                        print "\t".join( [ tmp[0], tmp[1], "Tumor", tmp[4] ] )
+                    elif tmp[0][13] == '1':
+                        print "\t".join( [ tmp[0], tmp[1], "Normal", tmp[4] ] )
+        finally:
+            handle.close()
+
+    if options.cancer is not None:
+        _printBaseNames("Archive[@isLatest=1][Disease[@abbreviation=%s]][ArchiveType[@type=Level_%s]]" % (options.cancer, options.level))
+
+    if options.filelist:
+        for loc in _archiveLocations(options.filelist, options.level):
+            print loc
+
+    if options.checksum:
+        # The mirror path is joined with every archive location below.
+        if options.mirror is None:
+            print "Define mirror location"
+            sys.exit(1)
+
+        for url in _archiveLocations(options.checksum, options.level):
+            dst = os.path.join(options.mirror, re.sub("^/", "", url))
+            if not os.path.exists( dst ):
+                print "NOT_FOUND:", dst
+                continue
+            if not os.path.exists( dst + ".md5" ):
+                print "MD5_NOT_FOUND", dst
+                continue
+
+            # The expected digest is the first space-separated token of the .md5 file.
+            with open( dst + ".md5" ) as handle:
+                omd5 = handle.readline().split(' ')[0]
+
+            nmd5 = fileDigest( dst )
+            if omd5 != nmd5:
+                print "CORRUPT:", dst
+            else:
+                print "OK:", dst
+
+    if options.download is not None:
+        if options.mirror is None:
+            print "Define mirror location"
+            sys.exit(1)
+
+        urls = []
+
+        # The --download value is used as the archive basename only when none
+        # of the other selectors is given.
+        if options.basename is None and options.clinical is None and options.clinical_basename is None:
+            urls.extend( _withMd5( _archiveLocations(options.download, options.level) ) )
+
+        if options.basename:
+            urls.extend( _withMd5( _archiveLocations(options.basename, options.level) ) )
+
+        if options.clinical:
+            urls.extend( _withMd5( _locations("Archive[@isLatest=1][Disease[@abbreviation=%s]][Platform[@alias=bio]]" % (options.clinical)) ) )
+
+        if options.clinical_basename:
+            urls.extend( _withMd5( _locations("Archive[@isLatest=1][@baseName=%s]" % (options.clinical_basename)) ) )
+
+        for url in urls:
+            src = "https://tcga-data.nci.nih.gov/" + url
+            dst = os.path.join(options.mirror, re.sub("^/", "", url))
+            dstDir = os.path.dirname(dst)
+            if not os.path.exists(dstDir):
+                print "mkdir", dstDir
+                os.makedirs(dstDir)
+            # Files already present in the mirror are not fetched again.
+            if not os.path.exists( dst ):
+                print "download %s to %s" % (src, dst)
+                urllib.urlretrieve(src, dst)
+
+    if options.basename:
+        if options.mirror is None:
+            sys.stderr.write("Need mirror location\n")
+            sys.exit(1)
+
+        conf = getBaseBuildConf(options.basename, options.level, options.mirror)
+        conf.addOptions(options)
+        if conf.platform not in tcgaConfig:
+            sys.stderr.write("Platform %s not supported\n" % (conf.platform))
+            sys.exit(1)
+
+        ext = tcgaConfig[conf.platform](conf)
+        ext.run()
+
+    if options.clinical:
+        if options.mirror is None:
+            sys.stderr.write("Need mirror location\n")
+            sys.exit(1)
+
+        q = CustomQuery("Archive[@isLatest=1][Disease[@abbreviation=%s]][Platform[@alias=bio]]" % (options.clinical))
+        basenames = {}
+        for s in q:
+            basenames[s['baseName']] = True
+
+        for base in basenames:
+            conf = getBaseBuildConf(base, 1, options.mirror)
+            conf.addOptions(options)
+
+            ext = tcgaConfig[conf.platform](conf)
+            ext.run()
+
+    if options.clinical_basename:
+        if options.mirror is None:
+            sys.stderr.write("Need mirror location\n")
+            sys.exit(1)
+
+        conf = getBaseBuildConf(options.clinical_basename, 1, options.mirror)
+        conf.addOptions(options)
+        # Same guard as the --basename branch: the basename is user supplied.
+        if conf.platform not in tcgaConfig:
+            sys.stderr.write("Platform %s not supported\n" % (conf.platform))
+            sys.exit(1)
+
+        ext = tcgaConfig[conf.platform](conf)
+        ext.run()
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tcga_import/tcga_import.xml	Tue Oct 30 14:23:49 2012 -0400
@@ -0,0 +1,181 @@
+<tool id="tcga_import" name="TCGA Import" version="1.0.0">
+  <description>TCGA Import</description>
+  <!-- Builds the tcgaImport.py argument list. Only the #if branch whose value
+       matches the selected iparam.import_mode contributes arguments. The
+       sample_list, archive_list and clinical_list branches redirect stdout
+       into their output dataset; the other branches pass output paths as
+       arguments. -->
+  <command interpreter="python">tcgaImport.py
+-w ./
+#if $iparam.import_mode == "uuid":
+--uuid-download ${out_uuid}
+#end if
+#if $iparam.import_mode == "sample_list":
+--samples > ${out_sample_list}
+#end if
+#if $iparam.import_mode == "archive_list":
+-z > ${archive_list_out}
+#end if
+#if $iparam.import_mode == "archive":
+-m ${__tool_data_path__}
+--download -
+--basename ${iparam.base}
+#if str($iparam.uuid_mapping) != "None":
+--uuid ${iparam.uuid_mapping}
+#end if
+--out ${archive_out}
+--out-meta ${archive_meta}
+#end if
+#if $iparam.import_mode == "clinical_list":
+--all-clinical > ${clinical_list_out}
+#end if
+#if $iparam.import_mode == "clinical":
+-m ${__tool_data_path__}
+--download -
+--clinical-basename ${iparam.clinical_archive}
+--out-clinical patient ${out_patient} ${out_patient_meta}
+--out-clinical sample ${out_sample} ${out_sample_meta}
+--out-clinical radiation ${out_radiation} ${out_radiation_meta}
+--out-clinical drug ${out_drug} ${out_drug_meta}
+--out-clinical portion ${out_portion} ${out_portion_meta}
+--out-clinical analyte ${out_analyte} ${out_analyte_meta}
+--out-clinical aliquot ${out_aliquot} ${out_aliquot_meta}
+
+#end if
+  </command>
+  <inputs>
+      <conditional name="iparam">
+        <!-- Mode selector. Every <when> value below must match one of these
+             <option> values exactly; the command template and the output
+             filters compare against the same strings. -->
+        <param name="import_mode" type="select" label="Import Mode">
+          <option value="uuid">TCGA UUID/Barcode Mapping</option>
+          <option value="sample_list">TCGA Sample List</option>
+          <option value="archive_list">List TCGA Archives</option>
+          <option value="archive">Import Archive</option>
+          <option value="clinical_list">List TCGA Clinical Data</option>
+          <option value="clinical">Clinical Import</option>
+        </param>
+        <when value="uuid"/>
+        <!-- Was "samples", which matched no option; the option value is "sample_list". -->
+        <when value="sample_list"/>
+        <when value="archive_list"/>
+        <when value="archive">
+          <param name="base" type="text" size="90" value="unc.edu_COAD_IlluminaHiSeq_RNASeqV2" label="Archive Basename"/>
+          <param name="uuid_mapping" type="data" format="tabular" optional="true" label="UUID-Barcode Mapping"/>
+        </when>
+        <when value="clinical_list"/>
+        <when value="clinical">
+          <param name="clinical_archive" size="90" value="intgen.org_OV_bio" type="text" label="Clinical Archive Basename"/>
+        </when>
+      </conditional>
+  </inputs>
+  <outputs>
+    <!-- Every output carries a <filter> on iparam['import_mode'], tying it to
+         one import mode. -->
+    <data name="out_uuid" format="tabular" label="UUID/Barcode Mapping">
+      <filter>iparam['import_mode'] == "uuid"</filter>
+    </data>
+    <data name="out_sample_list" format="tabular" label="TCGA Samples">
+      <filter>iparam['import_mode'] == "sample_list"</filter>
+    </data>
+    <data name="archive_list_out" format="tabular" label="TCGA Archive List">
+      <filter>iparam['import_mode'] == "archive_list"</filter>
+    </data>
+    <!-- Archive import: the data file plus its metadata file. -->
+    <data name="archive_out" format="tabular" label="${iparam.base}">
+      <filter>iparam['import_mode'] == "archive"</filter>
+    </data>
+    <data name="archive_meta" format="txt" label="${iparam.base} Meta">
+      <filter>iparam['import_mode'] == "archive"</filter>
+    </data>
+
+    <data name="clinical_list_out" format="tabular" label="TCGA Clinical List">
+      <filter>iparam['import_mode'] == "clinical_list"</filter>
+    </data>
+
+
+    <!-- Clinical import: one tabular output per clinical level. -->
+    <data name="out_patient" format="tabular" label="${iparam.clinical_archive} Patient">
+        <filter>iparam['import_mode'] == "clinical"</filter>
+    </data>
+    <data name="out_sample" format="tabular" label="${iparam.clinical_archive} Sample">
+        <filter>iparam['import_mode'] == "clinical"</filter>
+    </data>
+    <data name="out_radiation" format="tabular" label="${iparam.clinical_archive} Radiation">
+        <filter>iparam['import_mode'] == "clinical"</filter>
+    </data>
+    <data name="out_drug" format="tabular" label="${iparam.clinical_archive} Drug">
+        <filter>iparam['import_mode'] == "clinical"</filter>
+    </data>
+    <data name="out_portion" format="tabular" label="${iparam.clinical_archive} Portion">
+        <filter>iparam['import_mode'] == "clinical"</filter>
+    </data>
+    <data name="out_analyte" format="tabular" label="${iparam.clinical_archive} Analyte">
+        <filter>iparam['import_mode'] == "clinical"</filter>
+    </data>
+    <data name="out_aliquot" format="tabular" label="${iparam.clinical_archive} Aliquot">
+        <filter>iparam['import_mode'] == "clinical"</filter>
+    </data>
+
+    <!-- Clinical import: one metadata (txt) output per clinical level, paired
+         with the tabular output of the same name in the command line. -->
+    <data name="out_patient_meta" format="txt" label="${iparam.clinical_archive} Patient Meta">
+        <filter>iparam['import_mode'] == "clinical"</filter>
+    </data>
+    <data name="out_sample_meta" format="txt" label="${iparam.clinical_archive} Sample Meta">
+        <filter>iparam['import_mode'] == "clinical"</filter>
+    </data>
+    <data name="out_radiation_meta" format="txt" label="${iparam.clinical_archive} Radiation Meta">
+        <filter>iparam['import_mode'] == "clinical"</filter>
+    </data>
+    <data name="out_drug_meta" format="txt" label="${iparam.clinical_archive} Drug Meta">
+        <filter>iparam['import_mode'] == "clinical"</filter>
+    </data>
+    <data name="out_portion_meta" format="txt" label="${iparam.clinical_archive} Portion Meta">
+        <filter>iparam['import_mode'] == "clinical"</filter>
+    </data>
+    <data name="out_analyte_meta" format="txt" label="${iparam.clinical_archive} Analyte Meta">
+        <filter>iparam['import_mode'] == "clinical"</filter>
+    </data>
+    <data name="out_aliquot_meta" format="txt" label="${iparam.clinical_archive} Aliquot Meta">
+        <filter>iparam['import_mode'] == "clinical"</filter>
+    </data>
+
+
+  </outputs>
+  <help>
+TCGA Import Tool
+================
+
+An import tool for data from `The Cancer Genome Atlas Data &lt;https://tcga-data.nci.nih.gov&gt;`_.
+This tool specializes in importing Level 3, publicly accessible data from the TCGA. This includes
+processed genomic data and a majority of the clinical info. While this data is publicly available, it
+is frequently in a variety of file formats that need to be collated and formatted before analytical use. This tool
+attempts to read across a family of tarballs and compile the data into a single matrix.
+There are also import mechanisms to find TCGA sample id lists and UUID conversion tables.
+
+Modes:
+
+*TCGA UUID/Barcode Mapping*
+  A two column mapping from TCGA UUID to barcode IDs
+
+*TCGA Sample List*
+   A list of TCGA samples, with sample type, cancer type, and platform.
+
+*List TCGA Archives*
+   A list of all valid archives found at TCGA.
+
+*Import Archive*
+   Import a TCGA archive using a name found in the archive list. This will store a mirror
+   of downloaded files in the Galaxy tool-data folder. This will either produce a matrix or
+   BED5 file, with an associated meta data description in JSON
+
+*List TCGA Clinical Data*
+   List all valid TCGA clinical repositories
+
+*Clinical Import*
+   Import a TCGA clinical repository using a name found in the clinical data list. This will store
+   a mirror of downloaded files in the Galaxy tool-data folder. This produces 7 matrices and their
+   associated meta data. The different levels of clinical matrix include
+
+    - patient
+    - sample
+    - radiation
+    - drug
+    - portion
+    - analyte
+    - aliquot
+
+
+Note: TCGA archives can be large, and this isn't the most optimized piece of code in the world, so import can take a while for some
+of the larger TCGA archives.
+
+
+  </help>
+</tool>