Mercurial > repos > kellrott > tcga_import
comparison tcga_import/tcgaImport.py @ 0:f1c71f5363ae draft default tip
Uploaded
author | kellrott |
---|---|
date | Tue, 30 Oct 2012 14:23:49 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:f1c71f5363ae |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 | |
4 """ | |
5 Script to scan and extract TCGA data and compile it into the cgData | |
6 | |
7 Usage:: | |
8 | |
9 tcga2cgdata.py [options] | |
10 | |
11 Options:: | |
12 | |
13 -h, --help show this help message and exit | |
14 -a, --platform-list Get list of platforms | |
15 -p PLATFORM, --platform=PLATFORM | |
16 Platform Selection | |
17 -l, --supported List Supported Platforms | |
18 -f FILELIST, --filelist=FILELIST | |
19 List files needed to convert TCGA project basename | |
20 into cgData | |
21 -b BASENAME, --basename=BASENAME | |
22 Convert TCGA project basename into cgData | |
23 -m MIRROR, --mirror=MIRROR | |
24 Mirror Location | |
25 -w WORKDIR_BASE, --workdir=WORKDIR_BASE | |
26 Working directory | |
27 -o OUTDIR, --out-dir=OUTDIR | |
28 Output directory | |
29 -c CANCER, --cancer=CANCER | |
30 List Archives by cancer type | |
31 -d DOWNLOAD, --download=DOWNLOAD | |
32 Download files for archive | |
33 -e LEVEL, --level=LEVEL | |
34 Data Level | |
35 -s CHECKSUM, --check-sum=CHECKSUM | |
36 Check project md5 | |
37 -r, --sanitize Remove race/ethnicity from clinical data | |
38 | |
39 | |
40 Example:: | |
41 | |
42 ./scripts/tcga2cgdata.py -b intgen.org_KIRC_bio -m /inside/depot -e 1 -r -w tmp | |
43 | |
44 | |
45 """ | |
46 | |
47 from xml.dom.minidom import parseString | |
48 import urllib | |
49 import urllib2 | |
50 import os | |
51 import csv | |
52 import sys | |
53 import hashlib | |
54 import tempfile | |
55 import re | |
56 import copy | |
57 import json | |
58 import datetime | |
59 import hashlib | |
60 import subprocess | |
61 from glob import glob | |
62 import shutil | |
63 import subprocess | |
64 from argparse import ArgumentParser | |
65 | |
66 | |
67 | |
68 | |
69 """ | |
70 | |
71 Net query code | |
72 | |
73 """ | |
74 | |
class dccwsItem(object):
    """Iterable wrapper around one query against the TCGA DCC web service.

    Subclasses set ``self.url``.  Iterating the object fetches that URL,
    yields one dict per ``<class>`` record found in the XML response and
    then follows the ``<next>`` paging link until no such link is present.
    """

    baseURL = "http://tcga-data.nci.nih.gov/tcgadccws/GetXML?query="

    def __init__(self):
        self.url = None

    def __iter__(self):
        """Yield a ``{field name: value}`` dict for every record.

        A field's value is its ``xlink:href`` attribute when it has one,
        otherwise the concatenated text of its child nodes.
        """
        pageUrl = self.url
        while pageUrl is not None:
            handle = urllib.urlopen(pageUrl)
            try:
                data = handle.read()
            finally:
                # Release the connection even when the read fails.
                handle.close()
            dom = parseString(data)
            # there might not be any archives for a dataset
            responses = dom.getElementsByTagName('queryResponse')
            if responses:
                # Same node the original obtained with .pop(): the last one.
                for cls in responses[-1].getElementsByTagName('class'):
                    outData = {}
                    for node in cls.childNodes:
                        # Text/whitespace nodes carry no attributes; only
                        # element children describe fields.
                        if node.nodeType != node.ELEMENT_NODE:
                            continue
                        nodeName = node.getAttribute("name")
                        if node.hasAttribute("xlink:href"):
                            outData[nodeName] = node.getAttribute("xlink:href")
                        else:
                            outData[nodeName] = getText(node.childNodes)
                    yield outData
            nextLinks = dom.getElementsByTagName('next')
            if nextLinks:
                pageUrl = nextLinks[-1].getAttribute('xlink:href')
            else:
                pageUrl = None
108 | |
109 | |
class CustomQuery(dccwsItem):
    """A web-service query given as a full URL or as a bare query string.

    A value starting with ``http://`` or ``https://`` is used verbatim;
    anything else is appended to ``dccwsItem.baseURL``.
    """

    def __init__(self, query):
        super(CustomQuery, self).__init__()
        # Secure links are full URLs as well; prefixing them with baseURL
        # would build an unusable address.
        if query.startswith(("http://", "https://")):
            self.url = query
        else:
            self.url = dccwsItem.baseURL + query
117 | |
118 | |
def getText(nodelist):
    """Return the joined data of the text nodes directly inside *nodelist*.

    Nodes of any other type (elements, comments, ...) are ignored; their
    descendants are not visited.
    """
    return ''.join(
        item.data for item in nodelist if item.nodeType == item.TEXT_NODE
    )
125 | |
126 """ | |
127 | |
128 Build Configuration | |
129 | |
130 """ | |
131 | |
class BuildConf(object):
    """Settings for one import: archive identity plus output locations.

    The constructor stores what the archive query produced (platform,
    base name, version date, metadata dict, tarball list).  ``addOptions``
    then copies the command-line options that control where output goes.
    """

    def __init__(self, platform, name, version, meta, tarlist):
        self.platform = platform
        self.name = name
        self.version = version
        self.meta = meta
        self.tarlist = tarlist
        self.abbr = ''
        self.uuid_table = None
        if 'diseaseAbbr' in meta:
            self.abbr = meta['diseaseAbbr']

    def addOptions(self, opts):
        """Copy output-related settings from the parsed options *opts*.

        ``opts.out_clinical`` is iterated as ``(type, path, meta)`` triples.
        ``opts.uuid_table``, when not None, names a tab-separated file whose
        first two columns map an identifier to its replacement.
        """
        self.workdir_base = opts.workdir_base
        self.outdir = opts.outdir
        self.sanitize = opts.sanitize
        self.outpath = opts.outpath
        self.metapath = opts.metapath
        self.errorpath = opts.errorpath
        self.clinical_type = opts.clinical_type

        # Keys carry a leading dot so they match the ".<type>" names that
        # are passed to getOutPath()/getOutMeta().
        self.clinical_type_map = {}
        for t, path, meta in opts.out_clinical:
            self.clinical_type_map["." + t] = (path, meta)

        if opts.uuid_table is not None:
            self.uuid_table = {}
            with open(opts.uuid_table) as handle:
                for line in handle:
                    tmp = line.rstrip().split("\t")
                    # Blank or single-column rows carry no mapping.
                    if len(tmp) < 2:
                        continue
                    self.uuid_table[tmp[0]] = tmp[1]

    def translateUUID(self, uuid):
        """Return the mapped value for *uuid*, or *uuid* itself if unmapped."""
        if self.uuid_table is None or uuid not in self.uuid_table:
            return uuid
        return self.uuid_table[uuid]

    def getOutPath(self, name):
        """Return the data output path for the output called *name*.

        Precedence: explicit ``outpath``, then the clinical type map, then
        ``<outdir>/<archive name><name>``.
        """
        if self.outpath is not None:
            return self.outpath
        if name in self.clinical_type_map:
            return self.clinical_type_map[name][0]
        return os.path.join(self.outdir, self.name) + name

    def getOutMeta(self, name):
        """Return the metadata (JSON) output path for the output *name*."""
        if self.outpath is not None:
            if self.metapath is not None:
                return self.metapath
            return self.outpath + ".json"
        if name in self.clinical_type_map:
            return self.clinical_type_map[name][1]
        return os.path.join(self.outdir, self.name) + name + ".json"

    def getOutError(self, name):
        """Return the error-log output path for the output *name*."""
        if self.outpath is not None:
            if self.errorpath is not None:
                return self.errorpath
            return self.outpath + ".error"
        return os.path.join(self.outdir, self.name) + name + ".error"
191 | |
192 | |
def getBaseBuildConf(basename, level, mirror):
    """Query the DCC web service and build a BuildConf for *basename*.

    Two queries are made: the latest archives of ``Level_<level>`` and the
    latest ``mage-tab`` archives.  Every archive found contributes its
    ``addedDate``, a source URL and an entry in the tarball map, which is
    keyed by ``mirror + deployLocation``.

    Returns a BuildConf whose version is the most recent ``addedDate``, or
    None (after printing "No Files found") when neither query matched.
    """
    dates = []
    # The doubled space reproduces the output of the original print statement.
    print("TCGA Query for:  %s" % (basename,))
    q = CustomQuery("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=Level_%s]]" % (basename, level))
    urls = {}
    meta = None
    platform = None
    for e in q:
        dates.append(datetime.datetime.strptime(e['addedDate'], "%m-%d-%Y"))
        if meta is None:
            # Descriptive metadata is looked up once, from the first archive.
            meta = {"sourceUrl": []}
            for e2 in CustomQuery(e['platform']):
                platform = e2['name']
                meta['platform'] = e2['name']
                meta['platformTitle'] = e2['displayName']
            for e2 in CustomQuery(e['disease']):
                meta['diseaseAbbr'] = e2['abbreviation']
                meta['diseaseTitle'] = e2['name']
                for e3 in CustomQuery(e2['tissueCollection']):
                    meta['tissue'] = e3['name']
            for e2 in CustomQuery(e['center']):
                meta['centerTitle'] = e2['displayName']
                meta['center'] = e2['name']
        meta['sourceUrl'].append("http://tcga-data.nci.nih.gov/" + e['deployLocation'])
        urls[mirror + e['deployLocation']] = platform

    print("TCGA Query for mage-tab:  %s" % (basename,))
    q = CustomQuery("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=mage-tab]]" % (basename))
    for e in q:
        dates.append(datetime.datetime.strptime(e['addedDate'], "%m-%d-%Y"))
        platform = None
        for e2 in CustomQuery(e['platform']):
            print(e2)
            platform = e2['name']
        if meta is None:
            # Only mage-tab archives were found: there is no metadata from
            # the level query, but the source URL list must still exist.
            meta = {"sourceUrl": []}
        meta['sourceUrl'].append("http://tcga-data.nci.nih.gov/" + e['deployLocation'])
        urls[mirror + e['deployLocation']] = platform

    if not dates:
        print("No Files found")
        return None
    versionDate = max(dates).strftime("%Y-%m-%d")

    return BuildConf(platform, basename, versionDate, meta, urls)
239 | |
240 | |
241 | |
242 | |
243 | |
class TableReader(object):
    """Iterate over a ``key<TAB>json`` file as ``(key, decoded value)`` pairs.

    Iteration yields nothing when *path* is None or does not exist.
    """

    def __init__(self, path):
        self.path = path

    def __iter__(self):
        if self.path is None or not os.path.exists(self.path):
            return
        # The with-block closes the file even if the consumer stops early
        # or a row fails to decode.
        with open(self.path) as handle:
            for line in handle:
                line = line.rstrip()
                if not line:
                    continue  # blank lines carry no record
                # Split once only: everything after the first tab is JSON.
                tmp = line.split("\t", 1)
                yield tmp[0], json.loads(tmp[1])
255 | |
256 | |
class FileImporter(object):
    """Base importer: unpack the archives of a build and scan every file.

    Subclasses are expected to provide ``fileScan(path)``,
    ``mageScan(path)`` and ``fileBuild()``; this class calls them but does
    not define them.  Records produced during the scan are written with
    ``emit`` into per-"port" files inside the work directory.
    """

    # Optional regular expressions (strings) matched against file base names.
    fileInclude = None
    fileExclude = None

    # Base-name patterns that are never passed to fileScan().
    # NOTE(review): "DESCRIPTIO$" is kept exactly as found; it may be a
    # truncated "DESCRIPTION" pattern -- confirm before changing.
    excludes = [
        r"MANIFEST\.txt$",
        r"CHANGES_DCC\.txt$",
        r"README_DCC\.txt$",
        r"README\.txt$",
        r"CHANGES\.txt$",
        r"DCC_ALTERED_FILES\.txt$",
        r"\.wig$",
        "DESCRIPTIO$",
    ]

    def __init__(self, config):
        self.config = config

    def extractTars(self):
        """Unpack every archive in ``config.tarlist`` into a new temp dir."""
        self.work_dir = tempfile.mkdtemp(dir=self.config.workdir_base)
        # The doubled space reproduces the original print statement output.
        print("Extract to  %s" % (self.work_dir,))
        for path in self.config.tarlist:
            subprocess.check_call(["tar", "xvzf", path, "-C", self.work_dir], stderr=sys.stdout)

    def run(self):
        """Extract the archives, scan all files, then call fileBuild()."""
        self.extractTars()

        filterInclude = None
        filterExclude = None
        if self.fileInclude is not None:
            filterInclude = re.compile(self.fileInclude)
        if self.fileExclude is not None:
            filterExclude = re.compile(self.fileExclude)
        self.inc = 0
        self.out = {}
        self.errors = []
        self.ext_meta = {}
        self.scandirs(self.work_dir, filterInclude, filterExclude)
        # Flush the port files before fileBuild() reads them back.
        for handle in self.out.values():
            handle.close()
        self.fileBuild()
        #shutil.rmtree(self.work_dir)

    def checkExclude(self, name):
        """Return True when *name* matches one of the ``excludes`` patterns."""
        return any(re.search(pattern, name) for pattern in self.excludes)

    def scandirs(self, path, filterInclude=None, filterExclude=None):
        """Walk *path* recursively and dispatch each file.

        MAGE-TAB files go to mageScan(); any other file goes to fileScan()
        unless it is excluded or rejected by the include/exclude filters.
        """
        if os.path.isdir(path):
            for entry in glob(os.path.join(path, "*")):
                self.scandirs(entry, filterInclude, filterExclude)
            return
        if self.isMage(path):
            self.mageScan(path)
            return
        name = os.path.basename(path)
        if self.checkExclude(name):
            return
        if filterInclude is not None and not filterInclude.match(name):
            return
        if filterExclude is not None and filterExclude.match(name):
            return
        self.fileScan(path)

    def isMage(self, path):
        """Return True for SDRF, IDF and DESCRIPTION.txt files."""
        return path.endswith(('.sdrf.txt', '.idf.txt', 'DESCRIPTION.txt'))

    def emit(self, key, data, port):
        """Append ``key<TAB>json(data)`` to the work-dir file named *port*."""
        if port not in self.out:
            self.out[port] = open(self.work_dir + "/" + port, "w")
        self.out[port].write("%s\t%s\n" % (key, json.dumps(data)))

    def emitFile(self, name, meta, file):
        """Copy *file* to its output path and write metadata and errors.

        The MD5 of the copied bytes is stored in ``meta['md5']`` before the
        metadata is written as JSON.  An error file is written only when
        errors were recorded with addError().
        """
        md5 = hashlib.md5()
        with open(self.config.getOutPath(name), "wb") as oHandle:
            with open(file, 'rb') as f:
                # b'' is what a binary read returns at end of file; the
                # text sentinel '' never matches it on Python 3.
                for chunk in iter(lambda: f.read(8192), b''):
                    md5.update(chunk)
                    oHandle.write(chunk)
        meta['md5'] = md5.hexdigest()
        with open(self.config.getOutMeta(name), "w") as mHandle:
            mHandle.write(json.dumps(meta))
        if len(self.errors):
            with open(self.config.getOutError(name), "w") as eHandle:
                for msg in self.errors:
                    eHandle.write(msg + "\n")

    def addError(self, msg):
        """Record *msg* for the error file written by emitFile()."""
        self.errors.append(msg)
351 | |
352 | |
# Column-header aliases: the fileScan() methods replace a header found here
# with the mapped name, so segment files with differing headers end up with
# the same field names ("seg.mean", "loc.start", "loc.end", "chrom").
commonMap = {
    "mean" : "seg.mean",
    "Segment_Mean" : "seg.mean",
    "Start" : "loc.start",
    "End" : "loc.end",
    "Chromosome" : "chrom"
}
360 | |
361 | |
# IDF row labels (first column of an .idf.txt file) that mageScan() copies
# into the extra metadata, mapped to the metadata key they are stored under.
idfMap = {
    "Investigation Title" : "title",
    "Experiment Description" : "experimentalDescription",
    "Person Affiliation" : "dataProducer",
    "Date of Experiment" : "experimentalDate"
}
368 | |
class TCGAGeneticImport(FileImporter):
    """Shared scanning logic for the genetic (matrix/segment) importers.

    ``fileScan`` reads ``self.probeFields``, which this class does not
    define; the concrete importer classes supply it.
    """

    # SDRF columns whose cells are emitted as keys of the "targets" table,
    # in emission order.  The flag says whether the part of the cell before
    # its first '.' is emitted as an additional key.
    _sdrfKeyColumns = (
        ("Derived Array Data File", True),
        ("Derived Array Data Matrix File", False),
        ("Derived Data File", True),
        ("Hybridization Name", False),
        ("Sample Name", False),
    )

    # Rows with one of these "Material Type" values produce no targets.
    _sdrfSkippedMaterials = ("genomic_DNA", "total_RNA", "MDA cell line")

    def mageScan(self, path):
        """Extract target mappings and metadata from a MAGE-TAB file.

        * ``.sdrf.txt``: for every data row, emit file / hybridization /
          sample / extract names to the "targets" port, each mapped to the
          row's "Extract Name".
        * ``.idf.txt``: copy the rows listed in ``idfMap`` into ``ext_meta``.
        * ``DESCRIPTION.txt``: store the whole text as the description.
        """
        if path.endswith(".sdrf.txt"):
            with open(path, "rU") as iHandle:
                colNum = None
                for row in csv.reader(iHandle, delimiter="\t"):
                    if colNum is None:
                        colNum = {}
                        for i, label in enumerate(row):
                            colNum[label] = i
                        continue
                    try:
                        # This lookup is inside the guard as well, so blank
                        # or short rows are skipped instead of aborting.
                        if "Material Type" in colNum and row[colNum["Material Type"]] in self._sdrfSkippedMaterials:
                            continue
                        for column, alsoStem in self._sdrfKeyColumns:
                            if column not in colNum:
                                continue
                            cell = row[colNum[column]]
                            extract = row[colNum["Extract Name"]]
                            if alsoStem:
                                self.emit(cell.split('.')[0], extract, "targets")
                            self.emit(cell, extract, "targets")
                        extract = row[colNum["Extract Name"]]
                        self.emit(extract, extract, "targets")
                    except IndexError:
                        pass  # there can be blank lines in the SDRF
        if path.endswith(".idf.txt"):
            with open(path) as iHandle:
                for line in iHandle:
                    row = line.split("\t")
                    if row[0] in idfMap:
                        self.ext_meta[idfMap[row[0]]] = row[1]
        if path.endswith("DESCRIPTION.txt"):
            with open(path) as handle:
                self.ext_meta['description'] = handle.read()

    def translateUUID(self, uuid):
        """Translate *uuid* through the configured UUID table, if any."""
        return self.config.translateUUID(uuid)

    def getTargetMap(self):
        """Sort the "targets" port file and return it as a dict."""
        sortedPath = self.work_dir + "/targets.sort"
        # Argument list plus explicit redirection: same effect as the shell
        # command "sort -k 1 targets > targets.sort", but safe for work
        # directories whose path contains spaces or shell metacharacters.
        with open(sortedPath, "w") as sortedOut:
            subprocess.call(["sort", "-k", "1", self.work_dir + "/targets"], stdout=sortedOut)
        tTrans = {}
        for key, value in TableReader(sortedPath):
            tTrans[key] = value
        return tTrans

    def fileScan(self, path):
        """
        This function takes a TCGA level 3 genetic file and tries to extract
        probe levels or segments from it. It emits these values using the
        'probes' and 'segments' port names to identify the type of data
        being emitted.

        The layout is chosen from the first header line:

        * mode 1 - segment file, one sample per file / no sample info
          inside the file (first column "Chromosome" or "chromosome")
        * mode 2 - matrix file with a two-line header (first column
          "Hybridization REF" or "Sample REF")
        * mode 3 - segment file with sample information inside the file
          (second column "chrom")
        * otherwise every row is emitted to 'probes' keyed by its first
          column.
        """
        mode = None
        target = None
        colName = None
        colType = None
        fileName = os.path.basename(path)
        with open(path) as iHandle:
            for line in iHandle:
                if colName is None:
                    colName = line.rstrip().split("\t")
                    if colName[0] in ("Hybridization REF", "Sample REF"):
                        mode = 2
                    elif colName[0] in ("Chromosome", "chromosome"):
                        mode = 1
                        # seg files are named by the filename before the '.' extension
                        target = fileName.split('.')[0]
                    elif len(colName) > 1 and colName[1] == "chrom":
                        mode = 3
                        target = fileName.split('.')[0]
                    colName = [commonMap.get(col, col) for col in colName]
                elif mode == 2 and colType is None:
                    # Second header line of a matrix file: one value type
                    # per column.
                    colType = [commonMap.get(col, col) for col in line.rstrip().split("\t")]
                else:
                    tmp = line.rstrip().split("\t")
                    if tmp == ['']:
                        continue  # blank data lines carry no record
                    if mode == 2:
                        out = {}
                        for col in colName[1:]:
                            out[col] = {"target": col}
                        for i in range(1, len(colType)):
                            try:
                                if colType[i] in self.probeFields:
                                    out[colName[i]][colType[i]] = tmp[i]
                            except IndexError:
                                # Row shorter than the header: value missing.
                                out[colName[i]][colType[i]] = "NA"
                        for col in out:
                            self.emit(tmp[0], out[col], "probes")
                    else:
                        out = {}
                        for i, col in enumerate(colName):
                            out[col] = tmp[i]
                        out['file'] = fileName
                        if mode == 1:
                            self.emit(target, out, "segments")
                        elif mode == 3:
                            self.emit(tmp[0], out, "segments")
                        else:
                            self.emit(tmp[0], out, "probes")
486 | |
487 | |
488 | |
489 | |
class TCGASegmentImport(TCGAGeneticImport):
    """Importer that turns segment files into one five-column segment file.

    ``getMeta`` reads ``self.dataSubType``, which the concrete importer
    classes supply.
    """

    def fileScan(self, path):
        """
        This function takes a TCGA level 3 segment file and emits one
        record per data row to the 'segments' port.

        The layout is chosen from the header line:

        * mode 1 - one sample per file / no sample info inside the file
          (first column "Chromosome" or "chromosome"); rows are keyed by
          the file name before its first '.'
        * mode 2 - sample information inside the file (second column
          "chrom"); rows are keyed by their first column
        * any other header: nothing is emitted.
        """
        mode = None
        target = None
        colName = None
        fileName = os.path.basename(path)
        with open(path) as iHandle:
            for line in iHandle:
                if colName is None:
                    colName = line.rstrip().split("\t")
                    if colName[0] in ("Chromosome", "chromosome"):
                        mode = 1
                        # seg files are named by the filename before the '.' extension
                        target = fileName.split('.')[0]
                    elif len(colName) > 1 and colName[1] == "chrom":
                        mode = 2
                    colName = [commonMap.get(col, col) for col in colName]
                else:
                    tmp = line.rstrip().split("\t")
                    if tmp == ['']:
                        continue  # blank data lines carry no record
                    out = {}
                    for i, col in enumerate(colName):
                        out[col] = tmp[i]
                    out['file'] = fileName
                    if mode == 1:
                        self.emit(target, out, "segments")
                    elif mode == 2:
                        self.emit(tmp[0], out, "segments")

    def getMeta(self, name):
        """Return the metadata dict written next to the segment file.

        Values from ``ext_meta`` and then ``config.meta`` override the
        defaults built here.
        """
        matrixInfo = {
            '@context' : "http://purl.org/cgdata/",
            '@type' : 'bed5',
            '@id' : name,
            "lastModified" : self.config.version,
            'rowKeySrc' : {
                '@type' : 'idDAG',
                '@id' : "tcga.%s" % (self.config.abbr)
            },
            'dataSubType' : { "@id" : self.dataSubType },
            'dataProducer' : 'TCGA Import',
            "accessMap" : "public", "redistribution" : "yes"
        }
        matrixInfo.update(self.ext_meta)
        matrixInfo.update(self.config.meta)
        return matrixInfo

    def fileBuild(self):
        """Write the collected segments as ``chrom start end sample value``.

        The sample name of each segment is looked up in the target map and
        passed through the UUID translation.  Segments without target
        information or without the required fields are reported with
        addError() and skipped.  Nothing is emitted when no segment was
        collected.
        """
        # use the target table to create a name translation table
        tTrans = self.getTargetMap()

        rawPath = "%s/segments" % (self.work_dir)
        sortedPath = "%s/segments.sort" % (self.work_dir)
        # Same effect as the shell command "sort -k 1 segments >
        # segments.sort", without passing the path through a shell.
        with open(sortedPath, "w") as sortedOut:
            subprocess.call(["sort", "-k", "1", rawPath], stdout=sortedOut)

        startField = "loc.start"
        endField = "loc.end"
        valField = "seg.mean"
        chromeField = "chrom"

        segPath = "%s/segment_file" % (self.work_dir)
        segFile = None
        try:
            for key, value in TableReader(sortedPath):
                if segFile is None:
                    segFile = open(segPath, "w")
                try:
                    curName = self.translateUUID(tTrans[key])
                except KeyError:
                    self.addError("TargetInfo Not Found: %s" % (key))
                    continue
                if curName is None:
                    continue
                try:
                    chrom = value[chromeField].lower()
                    if not chrom.startswith("chr"):
                        chrom = "chr" + chrom
                    # Normalise to a lower-case "chr" prefix followed by
                    # an upper-case chromosome name, e.g. "chrX".
                    chrom = chrom.upper().replace("CHR", "chr")
                    fields = (chrom, value[startField], value[endField], curName, value[valField])
                except KeyError:
                    self.addError("Field error: %s" % (str(value)))
                    continue
                segFile.write("%s\t%s\t%s\t%s\t%s\n" % fields)
        finally:
            if segFile is not None:
                segFile.close()

        if segFile is None:
            # No segments were collected, so there is no file to publish.
            return

        matrixName = self.config.name
        self.emitFile("", self.getMeta(matrixName), segPath)
594 | |
595 | |
596 | |
class TCGAMatrixImport(TCGAGeneticImport):
    """Importer that pivots probe records into a probe-by-sample matrix.

    ``getMeta`` and ``fileBuild`` read ``self.dataSubType``,
    ``self.probeMap`` and ``self.probeFields``, which the concrete importer
    classes supply.
    """

    def getMeta(self, name):
        """Return the metadata dict written next to the matrix file.

        Values from ``ext_meta`` and then ``config.meta`` override the
        defaults built here.
        """
        matrixInfo = {
            "@context" : 'http://purl.org/cgdata/',
            '@type' : 'genomicMatrix',
            '@id' : name,
            "lastModified" : self.config.version,
            'dataSubType' : { "@id" : self.dataSubType },
            'dataProducer' : 'TCGA',
            "accessMap" : "public",
            "redistribution" : "yes",
            'rowKeySrc' : {
                "@type" : "probe", "@id" : self.probeMap
            },
            'columnKeySrc' : {
                "@type" : "idDAG", "@id" : "tcga.%s" % (self.config.abbr)
            }
        }
        matrixInfo.update(self.ext_meta)
        matrixInfo.update(self.config.meta)
        return matrixInfo

    def _writeMatrixRow(self, matrixFile, probe, data, tTrans, tEnum):
        """Write one matrix row; return True if the row held any value.

        *data* maps a raw target name to the probe value.  Targets missing
        from *tTrans* are reported with addError().  Rows consisting only
        of "NA" are not written.
        """
        out = ["NA"] * len(tEnum)
        for target in data:
            try:
                ttarget = self.translateUUID(tTrans[target])
                if ttarget is not None:
                    out[tEnum[ttarget]] = str(data[target])
            except KeyError:
                self.addError("TargetInfo Not Found: %s" % (target))
        if out.count("NA") == len(tEnum):
            return False
        matrixFile.write("%s\t%s\n" % (probe, "\t".join(out)))
        return True

    def fileBuild(self):
        """Build the matrix file from the sorted "probes" port and emit it.

        Records are sorted by probe name so that all values of one probe
        are adjacent; each run of equal keys becomes one row.  The file is
        emitted only when at least one row was written.
        """
        rawPath = "%s/probes" % (self.work_dir)
        sortedPath = "%s/probes.sort" % (self.work_dir)
        # Same effect as the shell command "sort -k 1 probes > probes.sort",
        # without passing the path through a shell.
        with open(sortedPath, "w") as sortedOut:
            subprocess.call(["sort", "-k", "1", rawPath], stdout=sortedOut)

        # use the target table to create a name translation table
        # (getTargetMap sorts the targets file itself)
        tTrans = self.getTargetMap()

        # also setup target name enumeration, so they will have column
        # numbers
        tEnum = {}
        for t in tTrans:
            tlabel = self.translateUUID(tTrans[t])
            if tlabel is not None and tlabel not in tEnum:
                tEnum[tlabel] = len(tEnum)

        matrixPath = "%s/matrix_file" % (self.work_dir)
        matrixFile = None
        curName = None
        curData = {}
        rowCount = 0
        try:
            for key, value in TableReader(sortedPath):
                if matrixFile is None:
                    matrixFile = open(matrixPath, "w")
                    header = ["NA"] * len(tEnum)
                    for target in tEnum:
                        header[tEnum[target]] = target
                    matrixFile.write("%s\t%s\n" % ("#probe", "\t".join(header)))

                if curName != key:
                    if curName is not None:
                        if self._writeMatrixRow(matrixFile, curName, curData, tTrans, tEnum):
                            rowCount += 1
                    curName = key
                    curData = {}
                if "target" in value:
                    for probeField in self.probeFields:
                        if probeField in value:
                            curData[value["target"]] = value[probeField]
                elif "file" in value:
                    for probeField in self.probeFields:
                        if probeField in value:
                            curData[value["file"]] = value[probeField]

            # Rows are flushed when the key changes, so the final probe is
            # still pending once the input is exhausted.
            if curName is not None:
                if self._writeMatrixRow(matrixFile, curName, curData, tTrans, tEnum):
                    rowCount += 1
        finally:
            # matrixFile stays None when no probe record was collected.
            if matrixFile is not None:
                matrixFile.close()

        matrixName = self.config.name
        if rowCount > 0:
            self.emitFile("", self.getMeta(matrixName), matrixPath)
683 | |
684 | |
# XML namespace URI passed to getElementsByTagNameNS() by
# TCGAClinicalImport.parseXMLFile to find the <admin> elements.
adminNS = "http://tcga.nci/bcr/xml/administration/2.3"
686 | |
687 | |
class TCGAClinicalImport(FileImporter):
    """Importer that turns clinical XML files into per-type matrices.

    ``parseXMLFile`` emits one record stream per entity type ("patient",
    "sample", "portion", "analyte", "aliquot", "drug", "radiation");
    ``fileBuild`` merges each stream into a tab-separated matrix.
    """

    def fileScan(self, path):
        """Parse the XML file at *path* and emit its records."""
        with open(path) as handle:
            data = handle.read()
        self.parseXMLFile(parseString(data))

    def getText(self, nodelist):
        """Return the joined data of the text nodes directly in *nodelist*."""
        return ''.join(
            node.data for node in nodelist if node.nodeType == node.TEXT_NODE
        )

    def _patientField(self, elm):
        """Return the value/tier/precision record stored for a patient field."""
        return {
            'value': self.getText(elm.childNodes),
            'tier': elm.getAttribute('tier'),
            'precision': elm.getAttribute('precision'),
        }

    def _collectCompleted(self, parent, barcodeTag, name=None, nested=None):
        """Collect the "Completed" fields of the element children of *parent*.

        Returns ``(name, data)``: *name* is the text of the child called
        *barcodeTag* (or the value passed in when there is none) and *data*
        maps each child whose ``procurement_status`` is "Completed" to
        ``{'value': text}``.  *nested*, when given, is a
        ``(child name, handler)`` pair; the handler is called with every
        child of that name.
        """
        data = {}
        for child in parent.childNodes:
            if child.nodeType != child.ELEMENT_NODE:
                continue
            if child.localName == barcodeTag:
                name = self.getText(child.childNodes)
            if child.getAttribute('procurement_status') == "Completed":
                data[child.localName] = {'value': self.getText(child.childNodes)}
            if nested is not None and child.localName == nested[0]:
                nested[1](child)
        return name, data

    def _emitItems(self, container, itemTag, barcodeTag, port, nested=None):
        """Emit one *port* record per *itemTag* child of *container*.

        Items without a barcode or without any completed field are skipped.
        """
        for item in container.childNodes:
            if item.nodeType == item.ELEMENT_NODE and item.localName == itemTag:
                itemName, itemData = self._collectCompleted(item, barcodeTag, None, nested)
                if itemName is not None and len(itemData):
                    self.emit(itemName, itemData, port)

    def _scanAliquots(self, container):
        """Emit the aliquots found inside an <aliquots> element."""
        self._emitItems(container, "aliquot", "bcr_aliquot_barcode", 'aliquot')

    def _scanAnalytes(self, container):
        """Emit the analytes (and their aliquots) of an <analytes> element."""
        self._emitItems(container, "analyte", "bcr_analyte_barcode", 'analyte',
                        ("aliquots", self._scanAliquots))

    def _scanPortions(self, container):
        """Emit the portions (and their analytes) of a <portions> element."""
        self._emitItems(container, "portion", "bcr_portion_barcode", 'portion',
                        ("analytes", self._scanAnalytes))

    def parseXMLFile(self, dom):
        """Emit patient, sample, drug and radiation records found in *dom*.

        Patient fields (plus the values of the <admin> block) are emitted
        to "patient" and repeated under every sample name in "sample".
        Drug and radiation records are keyed by the patient barcode.
        """
        admin = {}
        for node in dom.getElementsByTagNameNS(adminNS, "admin"):
            for cNode in node.childNodes:
                if cNode.nodeType == cNode.ELEMENT_NODE:
                    admin[cNode.localName] = {'value': self.getText(cNode.childNodes)}

        # documentElement is the root element even when a comment or a
        # processing instruction precedes it; childNodes[0] is not.
        root = dom.documentElement
        patientNodes = [
            node for node in root.childNodes
            if node.nodeType == node.ELEMENT_NODE and node.localName == 'patient'
        ]

        name = None
        patient = {}
        patientName = None
        for node in patientNodes:
            for elm in node.childNodes:
                if elm.nodeType != elm.ELEMENT_NODE:
                    continue
                if elm.localName == 'bcr_patient_barcode':
                    name = self.getText(elm.childNodes)
                    patientName = name
                if elm.getAttribute('procurement_status') == "Completed":
                    patient[elm.localName] = self._patientField(elm)
                if elm.prefix == "auxiliary":
                    # Auxiliary values sit two levels down and are taken
                    # regardless of their procurement status.
                    for aux in elm.childNodes:
                        if aux.nodeType != aux.ELEMENT_NODE:
                            continue
                        for auxval in aux.childNodes:
                            if auxval.nodeType == auxval.ELEMENT_NODE:
                                patient[auxval.localName] = self._patientField(auxval)

        if name is not None:
            patient.update(admin)
            self.emit(name, patient, "patient")

        # ``name`` deliberately carries over between items: an item without
        # its own barcode element is emitted under the last barcode seen.
        for node in patientNodes:
            for group in node.childNodes:
                if group.nodeType != group.ELEMENT_NODE:
                    continue
                if group.localName == 'samples':
                    for sample in group.childNodes:
                        if sample.nodeType == sample.ELEMENT_NODE and sample.localName == 'sample':
                            name, sampleData = self._collectCompleted(
                                sample, 'bcr_sample_barcode', name,
                                ("portions", self._scanPortions))
                            self.emit(name, sampleData, "sample")
                            self.emit(name, patient, "sample")
                elif group.localName == 'drugs':
                    for drug in group.childNodes:
                        if drug.nodeType == drug.ELEMENT_NODE and drug.localName == 'drug':
                            name, drugData = self._collectCompleted(drug, 'bcr_drug_barcode', name)
                            self.emit(patientName, drugData, "drug")
                elif group.localName == 'radiations':
                    for rad in group.childNodes:
                        if rad.nodeType == rad.ELEMENT_NODE and rad.localName == 'radiation':
                            name, radData = self._collectCompleted(rad, 'bcr_radiation_barcode', name)
                            self.emit(patientName, radData, "radiation")

    def getMeta(self, name):
        """Return the metadata dict written next to a clinical matrix.

        Values from ``ext_meta`` and then ``config.meta`` override the
        defaults built here.
        """
        fileInfo = {
            "@context" : "http://purl.org/cgdata/",
            "@type" : "clinicalMatrix",
            "@id" : name,
            "lastModified" : self.config.version,
            'dataSubType' : { "@id" : "clinical" },
            "rowKeySrc" : {
                "@type" : "idDAG", "@id" : "tcga.%s" % (self.config.abbr)
            }
        }
        fileInfo.update(self.ext_meta)
        fileInfo.update(self.config.meta)
        return fileInfo

    def fileBuild(self):
        """Merge each emitted record stream into a matrix and emit it.

        Only ``config.clinical_type`` is built when it is set.  With
        ``config.sanitize`` the 'race' and 'ethnicity' columns are left out.
        """
        matrixList = [ "patient", "sample", "radiation", "drug", "portion", "analyte", "aliquot" ]
        if self.config.clinical_type is not None:
            matrixList = [ self.config.clinical_type ]

        for matrixName in matrixList:
            rawPath = "%s/%s" % (self.work_dir, matrixName)
            if not os.path.exists(rawPath):
                continue
            sortedPath = rawPath + ".sort"
            # Same effect as "cat file | sort -k 1 > file.sort", without
            # passing the path through a shell.
            with open(sortedPath, "w") as sortedOut:
                subprocess.call(["sort", "-k", "1", rawPath], stdout=sortedOut)

            matrix = {}
            colEnum = {}
            for key, value in TableReader(sortedPath):
                row = matrix.setdefault(key, {})
                for col in value:
                    row[col] = value[col]
                    if col not in colEnum:
                        if not self.config.sanitize or col not in [ 'race', 'ethnicity' ]:
                            colEnum[col] = len(colEnum)

            outPath = os.path.join(self.work_dir, matrixName + "_file")
            with open(outPath, "w") as handle:
                cols = [None] * (len(colEnum))
                for col in colEnum:
                    cols[colEnum[col]] = col
                handle.write("sample\t%s\n" % ("\t".join(cols)))
                for key in matrix:
                    cols = [""] * (len(colEnum))
                    for col in colEnum:
                        if col in matrix[key]:
                            cols[colEnum[col]] = matrix[key][col]['value']
                    # Non-ASCII characters are replaced by '?'; decoding
                    # again keeps the line a text string on Python 2 and 3.
                    line = "\t".join(cols).encode("ASCII", "replace").decode("ASCII")
                    handle.write("%s\t%s\n" % (key, line))
            self.emitFile( "." + matrixName, self.getMeta(self.config.name + "." + matrixName), "%s/%s_file" % (self.work_dir, matrixName))
892 | |
893 | |
class AgilentImport(TCGAMatrixImport):
    """Gene-expression matrix settings for the platforms mapped to this class in tcgaConfig."""

    dataType = "genomicMatrix"
    dataSubType = "geneExp"
    sampleMap = "tcga.iddag"
    probeMap = "hugo"
    # Name of the data column to import (presumably matched by the
    # TCGAMatrixImport scanner -- confirm against the base class).
    probeFields = ["log2 lowess normalized (cy5/cy3) collapsed by gene symbol"]
900 | |
901 | |
class CGH1x1mImport(TCGASegmentImport):
    """Copy-number segment settings for the platform mapped to this class in tcgaConfig."""

    dataType = "genomicSegment"
    dataSubType = "cna"
    sampleMap = "tcga.iddag"
    probeFields = ["seg.mean"]
907 | |
class SNP6Import(TCGASegmentImport):
    """Copy-number segment import that only consumes hg19 segment files."""

    assembly = 'hg19'
    dataSubType = 'cna'
    sampleMap = 'tcga.iddag'
    dataType = 'genomicSegment'
    probeFields = ['seg.mean']

    def fileScan(self, path):
        """Emit every data row of an hg19 segment file to the "hg19_segment" port.

        Files whose name does not end in ".hg19.seg.txt" are ignored (hg18
        segment files are deliberately not imported). The first line is the
        header; header names found in commonMap are replaced by their mapped
        name. Each later line is emitted keyed by its first column, with the
        remaining columns as a {column name: value} dict.
        """
        if not path.endswith(".hg19.seg.txt"):
            return
        outport = "hg19_segment"

        colName = None
        # 'with' closes the file even if emit() or a malformed row raises.
        with open(path) as handle:
            for line in handle:
                fields = line.rstrip().split("\t")
                if colName is None:
                    colName = [commonMap[col] if col in commonMap else col for col in fields]
                else:
                    out = {}
                    for i in range(1, len(colName)):
                        out[colName[i]] = fields[i]
                    self.emit(fields[0], out, outport)

    def fileBuild(self):
        """Write the scanned hg19 segments as a five-column file and emit it.

        Output columns are: chromosome, start, end, translated sample name,
        segment value. Chromosome names are normalised to a lowercase "chr"
        prefix followed by the upper-cased remainder. Rows whose key is
        missing from the target map are reported through self.addError; rows
        whose translated name is None are dropped. If no rows were scanned
        at all, nothing is emitted.
        """
        tmap = self.getTargetMap()

        startField = "loc.start"
        endField = "loc.end"
        valField = "seg.mean"
        chromeField = "chrom"

        for base in ['hg19']:
            rawPath = "%s/%s_segment" % (self.work_dir, base)
            sortedPath = self.work_dir + "/%s_segment.sort" % (base)
            outPath = "%s/%s_segment.out" % (self.work_dir, base)

            # Argument-list form avoids shell interpretation of the paths.
            with open(sortedPath, "w") as sortedHandle:
                subprocess.call(["sort", "-k", "1", rawPath], stdout=sortedHandle)

            segFile = None
            try:
                for key, value in TableReader(sortedPath):
                    if segFile is None:
                        # Opened lazily so no output file exists without rows.
                        segFile = open(outPath, "w")
                    try:
                        curName = self.translateUUID(tmap[key])
                        if curName is not None:
                            chrom = value[chromeField].lower()
                            if not chrom.startswith("chr"):
                                chrom = "chr" + chrom
                            chrom = chrom.upper().replace("CHR", "chr")
                            segFile.write("%s\t%s\t%s\t%s\t%s\n" % (chrom, value[startField], value[endField], curName, value[valField]))
                    except KeyError:
                        self.addError("TargetInfo Not Found: %s" % (key))
            finally:
                if segFile is not None:
                    segFile.close()

            if segFile is None:
                # No segment rows were scanned; previously this crashed on
                # None.close(). There is no file to emit, so skip it.
                continue

            self.emitFile("." + base, self.getMeta(self.config.name + "." + base), outPath)
976 | |
977 | |
class HmiRNAImport(TCGAMatrixImport):
    """miRNA-expression matrix settings for the platforms mapped to this class in tcgaConfig."""

    dataType = "genomicMatrix"
    dataSubType = "miRNAExp"
    sampleMap = "tcga.iddag"
    probeMap = "agilentHumanMiRNA"
    probeFields = ["unc_DWD_Batch_adjusted"]
984 | |
class CGH244AImport(TCGASegmentImport):
    """Copy-number segment settings for the platform mapped to this class in tcgaConfig."""

    dataType = "genomicSegment"
    dataSubType = "cna"
    sampleMap = "tcga.iddag"
    probeFields = ["Segment_Mean"]
990 | |
class CGH415K_G4124A(TCGASegmentImport):
    """Copy-number segment settings for the platform mapped to this class in tcgaConfig."""

    dataType = "genomicSegment"
    dataSubType = "cna"
    sampleMap = "tcga.iddag"
    probeFields = ["Segment_Mean"]
    # Column names for the segment coordinates in this platform's files
    # (presumably read by TCGASegmentImport -- confirm against the base class).
    chromeField = "Chromosome"
    startField = "Start"
    endField = "End"
999 | |
1000 | |
class IlluminaHiSeq_DNASeqC(TCGASegmentImport):
    """Copy-number segment settings that additionally drop normal samples."""

    dataType = "genomicSegment"
    dataSubType = "cna"
    sampleMap = "tcga.iddag"
    probeFields = ["Segment_Mean"]
    # Column names for the segment coordinates in this platform's files
    # (presumably read by TCGASegmentImport -- confirm against the base class).
    chromeField = "Chromosome"
    startField = "Start"
    endField = "End"

    def translateUUID(self, uuid):
        """Translate `uuid` through the config; return None for normal-sample ids.

        An id counts as normal when the character following the third dash
        of the translated "TCGA-..-....-" prefix is '1'.
        """
        translated = self.config.translateUUID(uuid)
        is_normal = re.search(r'^TCGA-..-....-1', translated) is not None
        return None if is_normal else translated
1016 | |
class HT_HGU133A(TCGAMatrixImport):
    """Gene-expression matrix settings for the platform mapped to this class in tcgaConfig."""

    dataType = "genomicMatrix"
    dataSubType = "geneExp"
    sampleMap = "tcga.iddag"
    probeMap = "affyU133a"
    probeFields = ["Signal"]
1023 | |
class HuEx1_0stv2(TCGAMatrixImport):
    """Matrix settings for the platform mapped to this class in tcgaConfig."""

    dataType = "genomicMatrix"
    # NOTE(review): 'miRNAExp' combined with the 'hugo' probe map and
    # gene-level input files looks inconsistent -- confirm the intended subtype.
    dataSubType = "miRNAExp"
    sampleMap = "tcga.iddag"
    probeMap = "hugo"
    probeFields = ["Signal"]
    # Regex of file names to process: gene-level tables and SDRF sheets.
    fileInclude = '^.*gene.txt$|^.*sdrf.txt$'
1031 | |
class Human1MDuoImport(TCGASegmentImport):
    """Copy-number segment settings for the platform mapped to this class in tcgaConfig."""

    dataType = "genomicSegment"
    dataSubType = "cna"
    sampleMap = "tcga.iddag"
    probeFields = ["mean"]
1037 | |
class HumanHap550(TCGASegmentImport):
    """Copy-number segment settings for the platform mapped to this class in tcgaConfig."""

    dataType = "genomicSegment"
    dataSubType = "cna"
    sampleMap = "tcga.iddag"
    probeFields = ["mean"]
1043 | |
class HumanMethylation27(TCGAMatrixImport):
    """DNA-methylation matrix settings for the platform mapped to this class in tcgaConfig."""

    dataType = "genomicMatrix"
    dataSubType = "DNAMethylation"
    sampleMap = "tcga.iddag"
    probeMap = "illuminaMethyl27K_gpl8490"
    # Regex of file names to skip (array design files).
    fileExclude = '.*.adf.txt'
    # Both capitalisations of the beta-value column are accepted.
    probeFields = ["Beta_Value", "Beta_value"]
1051 | |
1052 | |
class HumanMethylation450(TCGAMatrixImport):
    """DNA-methylation matrix settings with a dedicated two-header-row scanner."""

    dataSubType = 'DNAMethylation'
    probeMap = 'illuminaHumanMethylation450'
    sampleMap = 'tcga.iddag'
    dataType = 'genomicMatrix'
    fileExclude = '.*.adf.txt'
    probeFields = ['Beta_value', 'Beta_Value']

    def fileScan(self, path):
        """Scan one tab-separated data file and emit its probe values.

        The first line holds the column names. Only files whose first header
        cell is "Hybridization REF" or "Sample REF" are processed; for those
        the second line holds the column types. Names and types found in
        commonMap are replaced by their mapped value.

        For every later line, one record per distinct column name is emitted
        to the "probes" port, keyed by the line's first cell. A record holds
        {"target": <column name>} plus, for each column whose type is listed
        in self.probeFields, the value formatted with four decimals, or "NA"
        when the cell is missing or not numeric.

        Files with any other header are read through without emitting.
        """
        mode = None
        # modes
        # 1 - two col header matrix file
        colName = None
        colType = None
        # 'with' closes the input even if emit() raises; the original leaked it.
        with open(path) as iHandle:
            for line in iHandle:
                if colName is None:
                    colName = line.rstrip().split("\t")
                    # The mode check uses the raw header cell, before mapping.
                    if colName[0] == "Hybridization REF" or colName[0] == "Sample REF":
                        mode = 1
                    for i in range(len(colName)):
                        if colName[i] in commonMap:
                            colName[i] = commonMap[colName[i]]
                elif mode == 1 and colType is None:
                    colType = line.rstrip().split("\t")
                    for i in range(len(colType)):
                        if colType[i] in commonMap:
                            colType[i] = commonMap[colType[i]]
                else:
                    tmp = line.rstrip().split("\t")
                    if mode == 1:
                        out = {}
                        for col in colName[1:]:
                            out[col] = {"target": col}
                        for i in range(1, len(colType)):
                            try:
                                if colType[i] in self.probeFields:
                                    out[colName[i]][colType[i]] = "%.4f" % float(tmp[i])
                            except (IndexError, ValueError):
                                # Short row or non-numeric cell.
                                out[colName[i]][colType[i]] = "NA"
                        for col in out:
                            self.emit(tmp[0], out[col], "probes")
1104 | |
class Illumina_RNASeq(TCGAMatrixImport):
    """Gene-expression matrix settings for the platform mapped to this class in tcgaConfig."""

    dataSubType = "geneExp"
    sampleMap = "tcga.iddag"
    probeMap = "hugo.unc"
    probeFields = ["RPKM"]
    # Regex of file names to process: gene quantification tables and SDRF sheets.
    fileInclude = r'^.*\.gene.quantification.txt$|^.*sdrf.txt$'
1111 | |
class Illumina_RNASeqV2(TCGAMatrixImport):
    """Gene-expression matrix settings for the platform mapped to this class in tcgaConfig."""

    dataSubType = "geneExp"
    sampleMap = "tcga.iddag"
    probeMap = "hugo.unc"
    probeFields = ["normalized_count"]
    # Regex of file names to process: normalised gene results and SDRF sheets.
    fileInclude = r'^.*rsem.genes.normalized_results$|^.*sdrf.txt$'
1118 | |
class IlluminaHiSeq_RNASeq(TCGAMatrixImport):
    """Gene-expression matrix settings for the platform mapped to this class in tcgaConfig."""

    dataSubType = "geneExp"
    sampleMap = "tcga.iddag"
    probeMap = "hugo.unc"
    probeFields = ["RPKM"]
    # Regex of file names to process: gene quantification tables only.
    fileInclude = r'^.*gene.quantification.txt$'
1125 | |
class MDA_RPPA_Core(TCGAMatrixImport):
    """RPPA matrix settings with target names stripped of their ".SD" marker."""

    sampleMap = 'tcga.iddag'
    probeMap = "md_anderson_antibodies"
    dataSubType = "RPPA"
    fileExclude = r'^.*.antibody_annotation.txt'
    probeFields = ['Protein Expression']

    def getTargetMap(self):
        """Return {key: target} read from the sorted "targets" scratch file.

        Every occurrence of ".SD" is removed from the target value before it
        is stored.
        """
        rawPath = "%s/targets" % (self.work_dir)
        sortedPath = self.work_dir + "/targets.sort"

        # Argument-list form avoids shell interpretation of the paths.
        with open(sortedPath, "w") as sortedHandle:
            subprocess.call(["sort", "-k", "1", rawPath], stdout=sortedHandle)

        tTrans = {}
        for key, value in TableReader(sortedPath):
            tTrans[key] = re.sub(r'\.SD', '', value)
        return tTrans
1141 | |
1142 | |
class Illumina_miRNASeq(TCGAMatrixImport):
    """miRNA matrix settings for the platforms mapped to this class in tcgaConfig."""

    dataSubType = "miRNA"
    sampleMap = "tcga.iddag"
    probeMap = "hsa.mirna"
    probeFields = ["reads_per_million_miRNA_mapped"]
    # Regex of file names to process: miRNA quantification tables.
    fileInclude = '^.*.mirna.quantification.txt$'
1149 | |
1150 | |
class bioImport(TCGAClinicalImport):
    """Clinical import settings registered under the 'bio' key of tcgaConfig."""

    # Regex of file names to process: XML documents only.
    fileInclude = '.*.xml$'
    sampleMap = "tcga.iddag"
1154 | |
# Registry of supported platforms: platform name -> importer class.
# The script entry point instantiates tcgaConfig[conf.platform](conf), and
# supported_list() reports only platforms that appear as keys here.
tcgaConfig = {
    "AgilentG4502A_07": AgilentImport,
    "AgilentG4502A_07_1": AgilentImport,
    "AgilentG4502A_07_2": AgilentImport,
    "AgilentG4502A_07_3": AgilentImport,
    "CGH-1x1M_G4447A": CGH1x1mImport,
    "Genome_Wide_SNP_6": SNP6Import,
    "H-miRNA_8x15K": HmiRNAImport,
    "H-miRNA_8x15Kv2": HmiRNAImport,
    "HG-CGH-244A": CGH244AImport,
    "HG-CGH-415K_G4124A": CGH415K_G4124A,
    "HT_HG-U133A": HT_HGU133A,
    "HuEx-1_0-st-v2": HuEx1_0stv2,
    "Human1MDuo": Human1MDuoImport,
    "HumanHap550": HumanHap550,
    "IlluminaHiSeq_DNASeqC": IlluminaHiSeq_DNASeqC,
    "HumanMethylation27": HumanMethylation27,
    "HumanMethylation450": HumanMethylation450,
    "IlluminaHiSeq_RNASeq": IlluminaHiSeq_RNASeq,
    "IlluminaGA_RNASeq": Illumina_RNASeq,
    "IlluminaHiSeq_RNASeqV2": Illumina_RNASeqV2,
    "MDA_RPPA_Core": MDA_RPPA_Core,
    "IlluminaGA_miRNASeq": Illumina_miRNASeq,
    "IlluminaHiSeq_miRNASeq": Illumina_miRNASeq,
    "bio": bioImport,
}
1181 | |
def fileDigest( file ):
    """Return the hexadecimal MD5 digest of the file at path `file`.

    The file is read in binary mode in 8192-byte chunks, so large archives
    are never loaded into memory at once.
    """
    md5 = hashlib.md5()
    with open(file, 'rb') as f:
        # The sentinel must be a bytes literal: binary reads return bytes,
        # and a text '' never compares equal to them where bytes != str,
        # which would make this loop spin forever at end of file.
        for chunk in iter(lambda: f.read(8192), b''):
            md5.update(chunk)
    return md5.hexdigest()
1188 | |
1189 | |
def platform_list():
    """Yield the 'name' of every entry returned by the "Platform" query."""
    for entry in CustomQuery("Platform"):
        yield entry['name']
1194 | |
def supported_list():
    """Yield the platform names from the "Platform" query that tcgaConfig can import."""
    for entry in CustomQuery("Platform"):
        platform = entry['name']
        if platform not in tcgaConfig:
            continue
        yield platform
1200 | |
def platform_archives(platform):
    """Yield each distinct latest-archive baseName for `platform`, in first-seen order."""
    seen = set()
    for entry in CustomQuery("Archive[Platform[@name=%s]][@isLatest=1]" % platform):
        base = entry['baseName']
        if base in seen:
            continue
        yield base
        seen.add(base)
1209 | |
1210 | |
if __name__ == "__main__":
    # Command-line entry point. Every option below triggers an independent
    # action; several may be combined in one invocation, and they run in the
    # order in which they appear in this block.

    LEVEL_ARCHIVE_QUERY = "Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=Level_%s]]"
    MAGE_TAB_ARCHIVE_QUERY = "Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=mage-tab]]"
    CLINICAL_ARCHIVE_QUERY = "Archive[@isLatest=1][Disease[@abbreviation=%s]][Platform[@alias=bio]]"

    def _print_unique_basenames(query):
        # Print each distinct archive baseName matched by query, first-seen order.
        seen = set()
        for e in CustomQuery(query):
            name = e['baseName']
            if name not in seen:
                print(name)
                seen.add(name)

    def _deploy_locations(query):
        # Yield the deployLocation of every archive matched by query.
        for e in CustomQuery(query):
            yield e['deployLocation']

    def _project_locations(basename, level):
        # Yield the data-level archives of a project, then its mage-tab archives.
        for loc in _deploy_locations(LEVEL_ARCHIVE_QUERY % (basename, level)):
            yield loc
        for loc in _deploy_locations(MAGE_TAB_ARCHIVE_QUERY % (basename)):
            yield loc

    def _mirror_path(url):
        # Map a deploy location onto its path below the local mirror.
        return os.path.join(options.mirror, re.sub("^/", "", url))

    def _require_mirror():
        # Abort with exit status 1 when no mirror location was given.
        if options.mirror is None:
            sys.stderr.write("Need mirror location\n")
            sys.exit(1)

    def _build(basename, level):
        # Configure and run the importer registered for the archive's platform.
        conf = getBaseBuildConf(basename, level, options.mirror)
        conf.addOptions(options)
        if conf.platform not in tcgaConfig:
            sys.stderr.write("Platform %s not supported\n" % (conf.platform))
            sys.exit(1)
        ext = tcgaConfig[conf.platform](conf)
        ext.run()

    parser = ArgumentParser()
    #Stack.addJobTreeOptions(parser)

    parser.add_argument("-a", "--platform-list", dest="platform_list", action="store_true", help="Get list of platforms", default=False)
    parser.add_argument("-u", "--uuid", dest="uuid_table", help="UUID to Barcode Table", default=None)
    parser.add_argument("-t", "--uuid-download", dest="uuid_download", help="Download UUID/Barcode Table", default=False)
    parser.add_argument("-z", "--all-archives", dest="all_archives", action="store_true", help="List all archives", default=False)
    parser.add_argument("-p", "--platform", dest="platform", help="Platform Selection", default=None)
    parser.add_argument("-l", "--supported", dest="supported_list", action="store_true", help="List Supported Platforms", default=None)
    parser.add_argument("-f", "--filelist", dest="filelist", help="List files needed to convert TCGA project basename into cgData", default=None)
    parser.add_argument("-b", "--basename", dest="basename", help="Convert TCGA project basename into cgData", default=None)
    parser.add_argument("-m", "--mirror", dest="mirror", help="Mirror Location", default=None)
    parser.add_argument("-w", "--workdir", dest="workdir_base", help="Working directory", default="/tmp")
    # Help text previously repeated "Working directory" from --workdir.
    parser.add_argument("--out-dir", dest="outdir", help="Output directory", default="./")
    parser.add_argument("-o", "--out", dest="outpath", help="Output Dest", default=None)
    parser.add_argument("--out-error", dest="errorpath", help="Output Error", default=None)
    parser.add_argument("--out-meta", dest="metapath", help="Output Meta", default=None)
    parser.add_argument("-c", "--cancer", dest="cancer", help="List Archives by cancer type", default=None)
    parser.add_argument("-d", "--download", dest="download", help="Download files for archive", default=None)
    parser.add_argument("-e", "--level", dest="level", help="Data Level ", default="3")
    parser.add_argument("-s", "--check-sum", dest="checksum", help="Check project md5", default=None)
    parser.add_argument("-r", "--sanitize", dest="sanitize", action="store_true", help="Remove race/ethnicity from clinical data", default=False)
    parser.add_argument("-x", "--clinical", dest="clinical", help="Process clinical info", default=None)
    parser.add_argument("--clinical-basename", dest="clinical_basename", help="Select Clinical Data by basename", default=None)
    parser.add_argument("--clinical-type", dest="clinical_type", help="Clinical Data Type", default=None)
    parser.add_argument("--all-clinical", dest="all_clinical", action="store_true", help="List all clinical archives", default=False)
    parser.add_argument("--out-clinical", dest="out_clinical", action="append", nargs=3, default=[])
    parser.add_argument("--samples", dest="get_samples", action="store_true", default=False)

    options = parser.parse_args()

    # --- UUID/barcode table download ------------------------------------
    if options.uuid_download:
        url = "https://tcga-data.nci.nih.gov/uuid/uuidBrowserExport.htm"
        data = {}
        data['exportType'] = 'tab'
        data['cols'] = "uuid,barcode"
        urllib.urlretrieve(url, options.uuid_download, data=urllib.urlencode(data))

    # --- listings ---------------------------------------------------------
    if options.platform_list:
        for e in platform_list():
            print(e)

    if options.supported_list:
        for e in supported_list():
            print(e)

    if options.platform:
        for name in platform_archives(options.platform):
            print(name)

    if options.all_archives:
        _print_unique_basenames("Archive[@isLatest=1][ArchiveType[@type=Level_%s]]" % (options.level))

    if options.all_clinical:
        _print_unique_basenames("Archive[@isLatest=1][Platform[@alias=bio]]")

    if options.get_samples:
        url = "https://tcga-data.nci.nih.gov/datareports/aliquotExport.htm"
        emptyFilter = json.dumps({"disease":"","levelOne":"","aliquotId":"","center":"","levelTwo":"","bcrBatch":"","platform":"","levelThree":""})
        data = {}
        data['exportType'] = 'tab'
        data['cols'] = 'aliquotId,disease,bcrBatch,center,platform,levelOne,levelTwo,levelThree'
        data['filterReq'] = emptyFilter
        data['formFilter'] = emptyFilter
        handle = urllib.urlopen(url + "?" + urllib.urlencode(data))

        for line in handle:
            tmp = line.rstrip().split("\t")
            # rstrip() drops trailing empty columns, so a row may be short;
            # such rows cannot be "Submitted" and are skipped, not crashed on.
            if len(tmp) > 7 and tmp[7] == "Submitted":
                # Slicing yields '' instead of raising on a short aliquot id.
                sampleClass = tmp[0][13:14]
                if sampleClass == '0':
                    print("\t".join([tmp[0], tmp[1], "Tumor", tmp[4]]))
                elif sampleClass == '1':
                    print("\t".join([tmp[0], tmp[1], "Normal", tmp[4]]))

    if options.cancer is not None:
        _print_unique_basenames("Archive[@isLatest=1][Disease[@abbreviation=%s]][ArchiveType[@type=Level_%s]]" % (options.cancer, options.level))

    if options.filelist:
        for loc in _project_locations(options.filelist, options.level):
            print(loc)

    # --- verify mirrored archives against their .md5 files ---------------
    if options.checksum:
        # The mirror path is required here too; without this check a missing
        # --mirror failed inside os.path.join.
        _require_mirror()

        for url in list(_project_locations(options.checksum, options.level)):
            dst = _mirror_path(url)
            if not os.path.exists(dst):
                print("NOT_FOUND: %s" % (dst,))
                continue
            if not os.path.exists(dst + ".md5"):
                print("MD5_NOT_FOUND %s" % (dst,))
                continue

            with open(dst + ".md5") as handle:
                omd5 = handle.readline().split(' ')[0]

            nmd5 = fileDigest(dst)
            if omd5 != nmd5:
                print("CORRUPT: %s" % (dst,))
            else:
                print("OK: %s" % (dst,))

    # --- download archives (and their .md5 files) into the mirror --------
    if options.download is not None:
        if options.mirror is None:
            print("Define mirror location")
            sys.exit(1)

        urls = []

        # With no other selector, the -d argument itself names the project.
        if options.basename is None and options.clinical is None and options.clinical_basename is None:
            urls.extend(_project_locations(options.download, options.level))

        if options.basename:
            urls.extend(_project_locations(options.basename, options.level))

        if options.clinical:
            urls.extend(_deploy_locations(CLINICAL_ARCHIVE_QUERY % (options.clinical)))

        if options.clinical_basename:
            urls.extend(_deploy_locations("Archive[@isLatest=1][@baseName=%s]" % (options.clinical_basename)))

        for url in urls:
            # Each archive is followed immediately by its checksum file.
            for target in (url, url + ".md5"):
                src = "https://tcga-data.nci.nih.gov/" + target
                dst = _mirror_path(target)
                dstDir = os.path.dirname(dst)
                if not os.path.exists(dstDir):
                    print("mkdir %s" % (dstDir,))
                    os.makedirs(dstDir)
                if not os.path.exists(dst):
                    print("download %s to %s" % (src, dst))
                    urllib.urlretrieve(src, dst)

    # --- conversions ------------------------------------------------------
    if options.basename:
        _require_mirror()
        _build(options.basename, options.level)

    if options.clinical:
        _require_mirror()

        basenames = {}
        for s in CustomQuery(CLINICAL_ARCHIVE_QUERY % (options.clinical)):
            basenames[s['baseName']] = True

        # Clinical archives are always built from level 1 data.
        for base in basenames:
            _build(base, 1)

    if options.clinical_basename:
        _require_mirror()
        _build(options.clinical_basename, 1)
1444 |