annotate tcga_import/tcgaImport.py @ 0:f1c71f5363ae draft default tip

Uploaded
author kellrott
date Tue, 30 Oct 2012 14:23:49 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1 #!/usr/bin/env python
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
2
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
3
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
4 """
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
5 Script to scan and extract TCGA data and compile it into the cgData format
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
6
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
7 Usage::
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
8
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
9 tcga2cgdata.py [options]
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
10
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
11 Options::
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
12
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
13 -h, --help show this help message and exit
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
14 -a, --platform-list Get list of platforms
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
15 -p PLATFORM, --platform=PLATFORM
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
16 Platform Selection
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
17 -l, --supported List Supported Platforms
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
18 -f FILELIST, --filelist=FILELIST
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
19 List files needed to convert TCGA project basename
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
20 into cgData
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
21 -b BASENAME, --basename=BASENAME
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
22 Convert TCGA project basename into cgData
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
23 -m MIRROR, --mirror=MIRROR
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
24 Mirror Location
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
25 -w WORKDIR_BASE, --workdir=WORKDIR_BASE
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
26 Working directory
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
27 -o OUTDIR, --out-dir=OUTDIR
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
28 Output directory
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
29 -c CANCER, --cancer=CANCER
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
30 List Archives by cancer type
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
31 -d DOWNLOAD, --download=DOWNLOAD
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
32 Download files for archive
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
33 -e LEVEL, --level=LEVEL
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
34 Data Level
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
35 -s CHECKSUM, --check-sum=CHECKSUM
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
36 Check project md5
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
37 -r, --sanitize Remove race/ethnicity from clinical data
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
38
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
39
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
40 Example::
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
41
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
42 ./scripts/tcga2cgdata.py -b intgen.org_KIRC_bio -m /inside/depot -e 1 -r -w tmp
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
43
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
44
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
45 """
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
46
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
47 from xml.dom.minidom import parseString
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
48 import urllib
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
49 import urllib2
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
50 import os
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
51 import csv
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
52 import sys
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
53 import hashlib
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
54 import tempfile
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
55 import re
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
56 import copy
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
57 import json
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
58 import datetime
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
59 import hashlib
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
60 import subprocess
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
61 from glob import glob
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
62 import shutil
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
63 import subprocess
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
64 from argparse import ArgumentParser
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
65
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
66
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
67
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
68
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
69 """
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
70
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
71 Net query code
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
72
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
73 """
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
74
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
class dccwsItem(object):
    """Iterable wrapper around a paged TCGA DCC web-service XML query.

    Subclasses set ``self.url``. Iterating fetches that URL, yields one dict
    per ``<class>`` element of the ``<queryResponse>``, and then follows the
    ``xlink:href`` of the ``<next>`` element until a page has no ``<next>``.
    """

    baseURL = "http://tcga-data.nci.nih.gov/tcgadccws/GetXML?query="

    def __init__(self):
        # URL of the first result page; None means "nothing to iterate".
        self.url = None

    def __iter__(self):
        """Yield a ``{field name: value}`` dict for every record of the query.

        A field's value is its ``xlink:href`` attribute when it has one,
        otherwise the concatenated text of its child nodes.
        """
        page_url = self.url
        while page_url is not None:
            handle = urllib.urlopen(page_url)
            try:
                data = handle.read()
            finally:
                # Release the connection even when the read fails.
                handle.close()
            dom = parseString(data)

            # there might not be any archives for a dataset
            responses = dom.getElementsByTagName('queryResponse')
            if len(responses) > 0:
                response = responses[-1]
                for cls in response.getElementsByTagName('class'):
                    outData = {}
                    for node in cls.childNodes:
                        # Text/comment children (e.g. whitespace between
                        # fields) have no attributes; only elements are fields.
                        if node.nodeType != node.ELEMENT_NODE:
                            continue
                        nodeName = node.getAttribute("name")
                        if node.hasAttribute("xlink:href"):
                            outData[nodeName] = node.getAttribute("xlink:href")
                        else:
                            outData[nodeName] = getText(node.childNodes)
                    yield outData

            # Follow the link to the next page, if the server supplied one.
            next_links = dom.getElementsByTagName('next')
            if len(next_links) > 0:
                page_url = next_links[-1].getAttribute('xlink:href')
            else:
                page_url = None
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
108
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
109
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
class CustomQuery(dccwsItem):
    """Query given either as a full ``http://`` URL or as a bare query string.

    A bare query string is appended to ``dccwsItem.baseURL``.
    """

    def __init__(self, query):
        super(CustomQuery, self).__init__()
        is_full_url = query.startswith("http://")
        self.url = query if is_full_url else dccwsItem.baseURL + query
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
117
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
118
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
def getText(nodelist):
    """Return the concatenated ``data`` of the text nodes in *nodelist*.

    Only the nodes directly in *nodelist* are inspected; nodes of any other
    type are skipped.
    """
    return ''.join(
        item.data for item in nodelist if item.nodeType == item.TEXT_NODE
    )
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
125
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
126 """
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
127
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
128 Build Configuration
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
129
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
130 """
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
131
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
class BuildConf:
    """Settings for converting one TCGA archive base name.

    Holds the platform, base name, version string, metadata dict and archive
    list given to the constructor, plus the output options copied in by
    ``addOptions``. The ``getOut*`` methods and ``translateUUID`` read
    attributes that only ``addOptions`` sets, so call it first.
    """

    def __init__(self, platform, name, version, meta, tarlist):
        self.platform = platform
        self.name = name
        self.version = version
        self.meta = meta
        self.tarlist = tarlist
        # Optional UUID -> replacement id mapping, loaded by addOptions().
        self.uuid_table = None
        # Disease abbreviation, or '' when the metadata carries none.
        self.abbr = meta['diseaseAbbr'] if 'diseaseAbbr' in meta else ''

    def addOptions(self, opts):
        """Copy the output-related settings from the parsed options *opts*.

        ``opts.out_clinical`` is iterated as ``(type, path, meta path)``
        triples. When ``opts.uuid_table`` is not None it is opened as a
        tab-separated file whose first two columns become the UUID mapping.
        """
        self.workdir_base = opts.workdir_base
        self.outdir = opts.outdir
        self.sanitize = opts.sanitize
        self.outpath = opts.outpath
        self.metapath = opts.metapath
        self.errorpath = opts.errorpath
        self.clinical_type = opts.clinical_type

        # Keys carry a leading "." so they can be looked up by file suffix.
        self.clinical_type_map = {}
        for t, path, meta in opts.out_clinical:
            self.clinical_type_map["." + t] = (path, meta)

        if opts.uuid_table is not None:
            self.uuid_table = {}
            # 'with' closes the table even if a malformed row raises.
            with open(opts.uuid_table) as handle:
                for line in handle:
                    if not line.strip():
                        # Tolerate blank lines (e.g. a trailing newline).
                        continue
                    tmp = line.rstrip().split("\t")
                    self.uuid_table[tmp[0]] = tmp[1]

    def translateUUID(self, uuid):
        """Return the mapped value for *uuid*, or *uuid* itself if unmapped."""
        if self.uuid_table is None or uuid not in self.uuid_table:
            return uuid
        return self.uuid_table[uuid]

    def getOutPath(self, name):
        """Return the data output path for the suffix *name*.

        Order of precedence: the explicit ``outpath``, the path registered
        for *name* in the clinical type map, then ``<outdir>/<name of this
        build><name>``.
        """
        if self.outpath is not None:
            return self.outpath
        if name in self.clinical_type_map:
            return self.clinical_type_map[name][0]
        return os.path.join(self.outdir, self.name) + name

    def getOutMeta(self, name):
        """Return the metadata (JSON) output path for the suffix *name*.

        With an explicit ``outpath`` this is ``metapath`` when set, otherwise
        ``outpath + ".json"``. Without one it falls back to the clinical type
        map and then to the default path plus ``".json"``.
        """
        if self.outpath is not None:
            if self.metapath is not None:
                return self.metapath
            return self.outpath + ".json"
        if name in self.clinical_type_map:
            return self.clinical_type_map[name][1]
        return os.path.join(self.outdir, self.name) + name + ".json"

    def getOutError(self, name):
        """Return the error-log output path for the suffix *name*.

        With an explicit ``outpath`` this is ``errorpath`` when set, otherwise
        ``outpath + ".error"``. Without one it is the default path plus
        ``".error"``.
        """
        if self.outpath is not None:
            if self.errorpath is not None:
                return self.errorpath
            return self.outpath + ".error"
        return os.path.join(self.outdir, self.name) + name + ".error"
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
191
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
192
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
def getBaseBuildConf(basename, level, mirror):
    """Query the TCGA DCC for *basename* and build its ``BuildConf``.

    Two queries are issued: the latest archives of type ``Level_<level>`` and
    the latest ``mage-tab`` archives. For each archive found, its added date
    is collected, its public URL is appended to ``meta['sourceUrl']`` and
    ``mirror + deployLocation`` is recorded as a path to extract. Platform,
    disease and center details are looked up once, from the first data
    archive.

    Returns a ``BuildConf`` whose version is the newest added date formatted
    as ``YYYY-MM-DD``, or ``None`` (after printing a message) when neither
    query returns an archive.
    """
    dates = []
    urls = {}
    meta = None
    platform = None

    # Single-argument print() produces the same output under Python 2.
    print("TCGA Query for:  %s" % (basename,))
    q = CustomQuery("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=Level_%s]]" % (basename, level))
    for e in q:
        dates.append(datetime.datetime.strptime(e['addedDate'], "%m-%d-%Y"))
        if meta is None:
            # Descriptive metadata is fetched once, from the first archive.
            meta = {"sourceUrl": []}
            for e2 in CustomQuery(e['platform']):
                platform = e2['name']
                meta['platform'] = e2['name']
                meta['platformTitle'] = e2['displayName']
            for e2 in CustomQuery(e['disease']):
                meta['diseaseAbbr'] = e2['abbreviation']
                meta['diseaseTitle'] = e2['name']
                for e3 in CustomQuery(e2['tissueCollection']):
                    meta['tissue'] = e3['name']
            for e2 in CustomQuery(e['center']):
                meta['centerTitle'] = e2['displayName']
                meta['center'] = e2['name']
        meta['sourceUrl'].append("http://tcga-data.nci.nih.gov/" + e['deployLocation'])
        urls[mirror + e['deployLocation']] = platform

    print("TCGA Query for mage-tab:  %s" % (basename,))
    q = CustomQuery("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=mage-tab]]" % (basename))
    for e in q:
        dates.append(datetime.datetime.strptime(e['addedDate'], "%m-%d-%Y"))
        platform = None
        for e2 in CustomQuery(e['platform']):
            print(e2)
            platform = e2['name']
        if meta is None:
            # Only mage-tab archives exist: without this the append below
            # would fail on None.
            meta = {"sourceUrl": []}
        meta['sourceUrl'].append("http://tcga-data.nci.nih.gov/" + e['deployLocation'])
        urls[mirror + e['deployLocation']] = platform

    if not dates:
        print("No Files found")
        return None

    # The newest archive date becomes the version of the build.
    versionDate = max(dates).strftime("%Y-%m-%d")

    return BuildConf(platform, basename, versionDate, meta, urls)
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
239
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
240
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
241
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
242
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
243
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
class TableReader:
    """Iterate over a ``key<TAB>json`` file as ``(key, decoded value)`` pairs.

    Iterating yields nothing when the path is None or the file does not exist.
    """

    def __init__(self, path):
        self.path = path

    def __iter__(self):
        """Yield ``(first column, json.loads(second column))`` for each line."""
        if self.path is None or not os.path.exists(self.path):
            return
        # 'with' closes the file even when the consumer stops early or a
        # line fails to parse.
        with open(self.path) as handle:
            for line in handle:
                fields = line.rstrip().split("\t")
                yield fields[0], json.loads(fields[1])
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
255
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
256
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
class FileImporter:
    """Base class that unpacks the configured archives and scans their files.

    ``run`` extracts every archive into a fresh working directory, walks it,
    and dispatches each file to ``mageScan`` or ``fileScan``; it finishes by
    calling ``fileBuild``. Those three methods are not defined in this class.
    """

    # Optional regular expressions (strings) applied to file base names;
    # None disables the respective filter.
    fileInclude = None
    fileExclude = None

    # Base-name patterns (searched with re.search) that are never scanned.
    # NOTE(review): "DESCRIPTIO$" looks like a truncated "DESCRIPTION" -
    # left exactly as found; confirm the intent.
    excludes = [
        "MANIFEST.txt$",
        "CHANGES_DCC.txt$",
        "README_DCC.txt$",
        "README.txt$",
        "CHANGES.txt$",
        "DCC_ALTERED_FILES.txt$",
        r'\.wig$',
        "DESCRIPTIO$"
    ]

    def __init__(self, config):
        self.config = config

    def extractTars(self):
        """Unpack every archive in ``config.tarlist`` into a new temp dir.

        The directory is created under ``config.workdir_base`` and stored in
        ``self.work_dir``. ``tar`` errors raise ``CalledProcessError``.
        """
        self.work_dir = tempfile.mkdtemp(dir=self.config.workdir_base)
        print("Extract to  %s" % (self.work_dir,))
        for path in self.config.tarlist:
            subprocess.check_call(["tar", "xvzf", path, "-C", self.work_dir], stderr=sys.stdout)

    def run(self):
        """Extract the archives, scan all files and build the output."""
        self.extractTars()

        filterInclude = None
        filterExclude = None
        if self.fileInclude is not None:
            filterInclude = re.compile(self.fileInclude)
        if self.fileExclude is not None:
            filterExclude = re.compile(self.fileExclude)

        # Per-run state used by emit()/addError() and the scan hooks.
        self.inc = 0
        self.out = {}
        self.errors = []
        self.ext_meta = {}

        self.scandirs(self.work_dir, filterInclude, filterExclude)
        # Flush the intermediate port files before they are read back.
        for handle in self.out.values():
            handle.close()
        self.fileBuild()
        # The working directory is intentionally left in place.
        #shutil.rmtree(self.work_dir)

    def checkExclude(self, name):
        """Return True when *name* matches any pattern in ``excludes``."""
        for pattern in self.excludes:
            if re.search(pattern, name):
                return True
        return False

    def scandirs(self, path, filterInclude=None, filterExclude=None):
        """Recursively visit *path* and dispatch every file found.

        MAGE-TAB files (see ``isMage``) go to ``mageScan``. Any other file
        goes to ``fileScan`` unless its base name is excluded or rejected by
        the compiled include/exclude filters.
        """
        if os.path.isdir(path):
            for child in glob(os.path.join(path, "*")):
                self.scandirs(child, filterInclude, filterExclude)
            return

        if self.isMage(path):
            self.mageScan(path)
            return

        name = os.path.basename(path)
        if self.checkExclude(name):
            return
        included = filterInclude is None or filterInclude.match(name)
        excluded = filterExclude is not None and filterExclude.match(name)
        if included and not excluded:
            self.fileScan(path)

    def isMage(self, path):
        """Return True for ``.sdrf.txt``, ``.idf.txt`` and ``DESCRIPTION.txt``."""
        return path.endswith(('.sdrf.txt', '.idf.txt', 'DESCRIPTION.txt'))

    def emit(self, key, data, port):
        """Append ``key<TAB>json(data)`` to the file *port* in the work dir.

        The file is opened on first use and kept open in ``self.out``.
        """
        if port not in self.out:
            self.out[port] = open(self.work_dir + "/" + port, "w")
        self.out[port].write("%s\t%s\n" % (key, json.dumps(data)))

    def emitFile(self, name, meta, file):
        """Copy *file* to its output path and write its metadata alongside.

        The MD5 of the copied bytes is stored in ``meta['md5']`` before *meta*
        is dumped as JSON to ``config.getOutMeta(name)``. Collected error
        messages, if any, are written one per line to
        ``config.getOutError(name)``.
        """
        md5 = hashlib.md5()
        with open(self.config.getOutPath(name), "wb") as oHandle:
            with open(file, 'rb') as f:
                # b'' is what a binary read returns at end of file.
                for chunk in iter(lambda: f.read(8192), b''):
                    md5.update(chunk)
                    oHandle.write(chunk)
        meta['md5'] = md5.hexdigest()

        with open(self.config.getOutMeta(name), "w") as mHandle:
            mHandle.write(json.dumps(meta))

        if len(self.errors):
            with open(self.config.getOutError(name), "w") as eHandle:
                for msg in self.errors:
                    eHandle.write(msg + "\n")

    def addError(self, msg):
        """Record *msg* so that ``emitFile`` writes it to the error file."""
        self.errors.append(msg)
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
351
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
352
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
# Maps alternative column headers onto the names "seg.mean", "loc.start",
# "loc.end" and "chrom".
# NOTE(review): the code that consumes this table is not visible in this part
# of the file - presumably it normalises segment-file headers; confirm.
commonMap = {
    "mean" : "seg.mean",
    "Segment_Mean" : "seg.mean",
    "Start" : "loc.start",
    "End" : "loc.end",
    "Chromosome" : "chrom"
}


# Maps descriptive field labels onto metadata key names.
# NOTE(review): the labels look like MAGE-TAB IDF field names - confirm
# against the code that reads the .idf.txt files.
idfMap = {
    "Investigation Title" : "title",
    "Experiment Description" : "experimentalDescription",
    "Person Affiliation" : "dataProducer",
    "Date of Experiment" : "experimentalDate"
}
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
368
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
class TCGAGeneticImport(FileImporter):
    """Shared scanning logic for TCGA genetic data importers.

    ``mageScan`` collects name mappings and metadata from ``*.sdrf.txt``,
    ``*.idf.txt`` and ``DESCRIPTION.txt`` files; ``fileScan`` converts
    tab-separated data files into ``probes`` / ``segments`` records.

    The methods rely on ``self.emit``, ``self.ext_meta``, ``self.work_dir``,
    ``self.config`` and ``self.probeFields``, which are not defined in this
    class (presumably supplied by FileImporter or by subclasses -- verify).
    """

    # SDRF columns whose values are emitted as aliases of the row's
    # "Extract Name", in emission order.  The flag marks columns for which
    # the text before the first '.' is emitted as an additional alias.
    _sdrfAliasColumns = (
        ("Derived Array Data File", True),
        ("Derived Array Data Matrix File", False),
        ("Derived Data File", True),
        ("Hybridization Name", False),
        ("Sample Name", False),
    )

    # SDRF rows whose "Material Type" is one of these values are not emitted.
    _sdrfSkippedMaterials = ("genomic_DNA", "total_RNA", "MDA cell line")

    def mageScan(self, path):
        """Extract target mappings or metadata from one archive file.

        The action depends on the suffix of ``path``:

        * ``.sdrf.txt``: emit ``targets`` records mapping file names,
          hybridization names and sample names to the row's "Extract Name".
        * ``.idf.txt``: copy the fields listed in ``idfMap`` into
          ``self.ext_meta``.
        * ``DESCRIPTION.txt``: store the whole file as
          ``self.ext_meta['description']``.

        Any other path is ignored.
        """
        if path.endswith(".sdrf.txt"):
            self._scanSDRF(path)
        if path.endswith(".idf.txt"):
            self._scanIDF(path)
        if path.endswith("DESCRIPTION.txt"):
            with open(path) as handle:
                self.ext_meta['description'] = handle.read()

    def _scanSDRF(self, path):
        """Emit a ``targets`` record for every alias found in an SDRF table."""
        # "rU" (universal newlines) is kept from the original Python 2 code.
        with open(path, "rU") as iHandle:
            colNum = None
            for row in csv.reader(iHandle, delimiter="\t"):
                if colNum is None:
                    # First row is the header: remember each column's index
                    # (for duplicated names the last occurrence wins).
                    colNum = dict((name, i) for i, name in enumerate(row))
                    continue
                try:
                    # The material-type test lives inside the try so that
                    # blank or short rows are skipped instead of raising.
                    if ("Material Type" in colNum
                            and row[colNum["Material Type"]] in self._sdrfSkippedMaterials):
                        continue
                    extract = row[colNum["Extract Name"]]
                    for column, stripExtension in self._sdrfAliasColumns:
                        if column not in colNum:
                            continue
                        alias = row[colNum[column]]
                        if stripExtension:
                            self.emit(alias.split('.')[0], extract, "targets")
                        self.emit(alias, extract, "targets")
                    # The extract name always maps to itself.
                    self.emit(extract, extract, "targets")
                except IndexError:
                    # There can be blank (or truncated) lines in the SDRF;
                    # whatever was emitted before the short field is kept.
                    continue

    def _scanIDF(self, path):
        """Copy the IDF fields named in ``idfMap`` into ``self.ext_meta``."""
        with open(path) as iHandle:
            for line in iHandle:
                row = line.split("\t")
                # A label without a value field is ignored rather than
                # raising IndexError.  The line is not stripped, so the
                # stored value keeps a trailing newline when the value is
                # the last field on the line.
                if len(row) > 1 and row[0] in idfMap:
                    self.ext_meta[idfMap[row[0]]] = row[1]

    def translateUUID(self, uuid):
        """Return ``self.config.translateUUID(uuid)``."""
        return self.config.translateUUID(uuid)

    def _sortTable(self, name):
        """Sort ``<work_dir>/<name>`` into ``<work_dir>/<name>.sort``.

        Runs the external ``sort -k 1`` with an argument list (no shell), so
        work directories containing spaces or shell metacharacters are
        handled.  As before, the exit status of ``sort`` is not checked.

        Returns the path of the sorted file.
        """
        source = "%s/%s" % (self.work_dir, name)
        sortedPath = source + ".sort"
        with open(sortedPath, "w") as sortedHandle:
            subprocess.call(["sort", "-k", "1", source], stdout=sortedHandle)
        return sortedPath

    def getTargetMap(self):
        """Return a dict built from the sorted ``targets`` table.

        The ``targets`` file in the work directory is sorted and read back
        through ``TableReader``; for repeated keys the last value read wins.
        """
        tTrans = {}
        for key, value in TableReader(self._sortTable("targets")):
            tTrans[key] = value
        return tTrans

    def _mapCommonNames(self, names):
        """Return ``names`` with every entry found in ``commonMap`` replaced."""
        return [commonMap[name] if name in commonMap else name for name in names]

    def _emitMatrixRow(self, colName, colType, fields):
        """Emit one ``probes`` record per distinct column name of a matrix row.

        ``colName`` and ``colType`` are the two header rows, ``fields`` the
        data row.  Each record holds ``"target"`` plus one entry for every
        column whose type is in ``self.probeFields``; a column missing from
        a short data row is recorded as ``"NA"``.
        """
        out = {}
        for col in colName[1:]:
            out[col] = {"target": col}
        for i in range(1, len(colType)):
            try:
                if colType[i] in self.probeFields:
                    out[colName[i]][colType[i]] = fields[i]
            except IndexError:
                out[colName[i]][colType[i]] = "NA"
        for col in out:
            self.emit(fields[0], out[col], "probes")

    def fileScan(self, path):
        """Scan a tab-separated TCGA data file and emit its rows.

        The first line is the header and selects how the rest is read:

        * first cell ``Hybridization REF`` or ``Sample REF``: matrix file
          with a second header line of value types; every data row is
          emitted as ``probes`` records (see ``_emitMatrixRow``).
        * first cell ``Chromosome`` / ``chromosome``: every row is emitted
          as a ``segments`` record keyed by the file name up to the first
          ``'.'``.
        * second cell ``chrom``: every row is emitted as a ``segments``
          record keyed by the row's first field.
        * anything else: every row is emitted as a ``probes`` record keyed
          by the row's first field.

        Header names found in ``commonMap`` are replaced by their mapped
        value.  Outside the matrix case each record maps header name to
        field value and carries the file's base name under ``'file'``.

        Raises IndexError when, outside the matrix case, a data row has
        fewer fields than the header (trailing whitespace, including empty
        trailing fields, is stripped from each line before splitting).
        """
        fileName = os.path.basename(path)
        mode = None
        target = None
        colName = None
        colType = None
        with open(path) as iHandle:
            for line in iHandle:
                fields = line.rstrip().split("\t")
                if colName is None:
                    if fields[0] in ("Hybridization REF", "Sample REF"):
                        mode = 2
                    elif fields[0] in ("Chromosome", "chromosome"):
                        mode = 1
                        # seg files are named by the filename before the '.' extension
                        target = fileName.split('.')[0]
                    elif len(fields) > 1 and fields[1] == "chrom":
                        # len() guard: a one-column header used to raise IndexError here
                        mode = 3
                    colName = self._mapCommonNames(fields)
                elif mode == 2 and colType is None:
                    colType = self._mapCommonNames(fields)
                elif mode == 2:
                    self._emitMatrixRow(colName, colType, fields)
                else:
                    # Indexing (not zip) keeps the IndexError on short rows.
                    out = dict((name, fields[i]) for i, name in enumerate(colName))
                    out['file'] = fileName
                    if mode == 1:
                        self.emit(target, out, "segments")
                    elif mode == 3:
                        self.emit(fields[0], out, "segments")
                    else:
                        self.emit(fields[0], out, "probes")
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
486
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
487
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
488
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
489
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
class TCGASegmentImport(TCGAGeneticImport):
    """Importer that turns TCGA segment files into a single ``bed5`` file."""

    def fileScan(self, path):
        """Scan a tab-separated segment file and emit ``segments`` records.

        The header line selects the record key:

        * first cell ``Chromosome`` / ``chromosome``: one sample per file;
          rows are keyed by the file name up to the first ``'.'``.
        * second cell ``chrom``: sample information is inside the file;
          rows are keyed by their first field.

        Rows of files matching neither layout are parsed but not emitted.
        Header names found in ``commonMap`` are replaced by their mapped
        value; each record maps header name to field value and carries the
        file's base name under ``'file'``.

        Raises IndexError when a data row has fewer fields than the header
        (trailing whitespace is stripped from each line before splitting).
        """
        fileName = os.path.basename(path)
        mode = None
        target = None
        colName = None
        with open(path) as iHandle:
            for line in iHandle:
                fields = line.rstrip().split("\t")
                if colName is None:
                    if fields[0] in ("Chromosome", "chromosome"):
                        mode = 1
                        # seg files are named by the filename before the '.' extension
                        target = fileName.split('.')[0]
                    elif len(fields) > 1 and fields[1] == "chrom":
                        # len() guard: a one-column header used to raise IndexError here
                        mode = 2
                    colName = [commonMap[name] if name in commonMap else name
                               for name in fields]
                    continue
                # Indexing (not zip) keeps the IndexError on short rows.
                out = dict((name, fields[i]) for i, name in enumerate(colName))
                out['file'] = fileName
                if mode == 1:
                    self.emit(target, out, "segments")
                elif mode == 2:
                    self.emit(fields[0], out, "segments")

    def getMeta(self, name):
        """Return the metadata dictionary for the segment file ``name``.

        Entries from ``self.ext_meta`` and then ``self.config.meta`` are
        applied on top of the defaults, so ``self.config.meta`` wins.
        """
        matrixInfo = {
            '@context': "http://purl.org/cgdata/",
            '@type': 'bed5',
            '@id': name,
            "lastModified": self.config.version,
            'rowKeySrc': {
                '@type': 'idDAG',
                '@id': "tcga.%s" % (self.config.abbr)
            },
            'dataSubType': {"@id": self.dataSubType},
            'dataProducer': 'TCGA Import',
            "accessMap": "public",
            "redistribution": "yes"
        }
        matrixInfo.update(self.ext_meta)
        matrixInfo.update(self.config.meta)
        return matrixInfo

    def fileBuild(self):
        """Write ``<work_dir>/segment_file`` from the emitted records and emit it.

        The ``segments`` table is sorted, each record's key is mapped
        through the target table and ``translateUUID``, and one line
        ``chrom, loc.start, loc.end, name, seg.mean`` (tab-separated) is
        written per record.  Records whose key is missing from the target
        table, or that lack one of the fields, are reported with
        ``self.addError``; records whose translated name is ``None`` are
        dropped silently.

        If there are no segment records nothing is written or emitted
        (previously this raised AttributeError on ``None.close()``).
        """
        tTrans = self.getTargetMap()

        # Argument list instead of a shell string, so work directories with
        # spaces or shell metacharacters work.  Exit status is not checked,
        # as before.
        sortedPath = self.work_dir + "/segments.sort"
        with open(sortedPath, "w") as sortedHandle:
            subprocess.call(["sort", "-k", "1", "%s/segments" % (self.work_dir)],
                            stdout=sortedHandle)
        sHandle = TableReader(sortedPath)

        startField = "loc.start"
        endField = "loc.end"
        valField = "seg.mean"
        chromeField = "chrom"

        segPath = "%s/segment_file" % (self.work_dir)
        segFile = None
        try:
            for key, value in sHandle:
                if segFile is None:
                    # Opened lazily, on the first record.
                    segFile = open(segPath, "w")
                try:
                    curName = self.translateUUID(tTrans[key])
                    if curName is not None:
                        try:
                            # Normalise to a lower-case "chr" prefix followed
                            # by an upper-cased remainder, e.g. "x" -> "chrX".
                            chrom = value[chromeField].lower()
                            if not chrom.startswith("chr"):
                                chrom = "chr" + chrom
                            chrom = chrom.upper().replace("CHR", "chr")
                            segFile.write("%s\t%s\t%s\t%s\t%s\n" % (
                                chrom, value[startField], value[endField],
                                curName, value[valField]))
                        except KeyError:
                            self.addError("Field error: %s" % (str(value)))
                except KeyError:
                    self.addError("TargetInfo Not Found: %s" % (key))
        finally:
            if segFile is not None:
                segFile.close()

        if segFile is None:
            # No records, so no segment file exists to emit.
            return

        matrixName = self.config.name
        self.emitFile("", self.getMeta(matrixName), segPath)
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
594
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
595
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
596
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
class TCGAMatrixImport(TCGAGeneticImport):
    """Importer that assembles emitted probe records into a genomic matrix."""

    def getMeta(self, name):
        """Return the metadata dictionary for the matrix ``name``.

        Entries from ``self.ext_meta`` and then ``self.config.meta`` are
        applied on top of the defaults, so ``self.config.meta`` wins.
        """
        matrixInfo = {
            "@context": 'http://purl.org/cgdata/',
            '@type': 'genomicMatrix',
            '@id': name,
            "lastModified": self.config.version,
            'dataSubType': {"@id": self.dataSubType},
            'dataProducer': 'TCGA',
            "accessMap": "public",
            "redistribution": "yes",
            'rowKeySrc': {
                "@type": "probe", "@id": self.probeMap
            },
            'columnKeySrc': {
                "@type": "idDAG", "@id": "tcga.%s" % (self.config.abbr)
            }
        }
        matrixInfo.update(self.ext_meta)
        matrixInfo.update(self.config.meta)
        return matrixInfo

    def _writeProbeRow(self, matrixFile, probe, curData, tTrans, tEnum):
        """Write the matrix row for ``probe``; return True if a row was written.

        ``curData`` maps raw target names to values.  Each name is mapped
        through ``tTrans`` and ``translateUUID`` to find its column in
        ``tEnum``; names that cannot be resolved are reported with
        ``self.addError``.  A row consisting only of ``"NA"`` is not
        written.
        """
        out = ["NA"] * len(tEnum)
        for target in curData:
            try:
                ttarget = self.translateUUID(tTrans[target])
                if ttarget is not None:
                    out[tEnum[ttarget]] = str(curData[target])
            except KeyError:
                self.addError("TargetInfo Not Found: %s" % (target))
        if out.count("NA") == len(tEnum):
            return False
        matrixFile.write("%s\t%s\n" % (probe, "\t".join(out)))
        return True

    def fileBuild(self):
        """Write ``<work_dir>/matrix_file`` from the emitted records and emit it.

        The target table is used to build the name translation and to give
        every translated target name a column number.  The sorted ``probes``
        table is then read; consecutive records sharing a key form one
        matrix row.  The file is emitted only if at least one row was
        written.

        Fixes relative to the earlier implementation: the row for the last
        probe key is now written (it used to be dropped because rows were
        only flushed when the *next* key arrived), and an empty probes table
        no longer raises AttributeError on ``None.close()``.
        """
        # Argument list instead of a shell string, so work directories with
        # spaces or shell metacharacters work.  Exit status is not checked,
        # as before.  The targets table is sorted by getTargetMap().
        probesPath = self.work_dir + "/probes.sort"
        with open(probesPath, "w") as sortedHandle:
            subprocess.call(["sort", "-k", "1", "%s/probes" % (self.work_dir)],
                            stdout=sortedHandle)
        pHandle = TableReader(probesPath)

        tTrans = self.getTargetMap()

        # Column number of every distinct translated target name.
        tEnum = {}
        for t in tTrans:
            tlabel = self.translateUUID(tTrans[t])
            if tlabel is not None and tlabel not in tEnum:
                tEnum[tlabel] = len(tEnum)

        matrixPath = "%s/matrix_file" % (self.work_dir)
        matrixFile = None
        curName = None
        curData = {}
        rowCount = 0
        try:
            for key, value in pHandle:
                if matrixFile is None:
                    # Opened lazily, on the first record; write the header.
                    matrixFile = open(matrixPath, "w")
                    out = ["NA"] * len(tEnum)
                    for target in tEnum:
                        out[tEnum[target]] = target
                    matrixFile.write("%s\t%s\n" % ("#probe", "\t".join(out)))

                if curName != key:
                    # Key changed: flush the row collected for the previous key.
                    if curName is not None and self._writeProbeRow(
                            matrixFile, curName, curData, tTrans, tEnum):
                        rowCount += 1
                    curName = key
                    curData = {}

                # Records are identified by "target" if present, else "file".
                if "target" in value:
                    column = value["target"]
                elif "file" in value:
                    column = value["file"]
                else:
                    continue
                for probeField in self.probeFields:
                    if probeField in value:
                        curData[column] = value[probeField]

            # Flush the final key, which has no successor to trigger it.
            if curName is not None and self._writeProbeRow(
                    matrixFile, curName, curData, tTrans, tEnum):
                rowCount += 1
        finally:
            if matrixFile is not None:
                matrixFile.close()

        matrixName = self.config.name
        if rowCount > 0:
            self.emitFile("", self.getMeta(matrixName), matrixPath)
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
683
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
684
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
# XML namespace in which TCGAClinicalImport.parseXMLFile looks up "admin" elements.
adminNS = "http://tcga.nci/bcr/xml/administration/2.3"
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
686
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
687
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
688 class TCGAClinicalImport(FileImporter):
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
689
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
def fileScan(self, path):
    """Read the file at ``path``, parse it as XML and hand it to parseXMLFile.

    The file is read inside a ``with`` block so the handle is closed even
    if reading fails.
    """
    with open(path) as handle:
        data = handle.read()
    xml = parseString(data)
    self.parseXMLFile(xml)
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
696
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
def getText(self, nodelist):
    """Return the concatenated ``data`` of the text nodes in ``nodelist``.

    Only nodes whose ``nodeType`` equals their ``TEXT_NODE`` constant
    contribute; all other nodes are ignored, and nested nodes are not
    descended into.
    """
    return ''.join(
        child.data for child in nodelist
        if child.nodeType == child.TEXT_NODE
    )
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
703
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
704
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
705 def parseXMLFile(self, dom):
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
706 admin = {}
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
707 for node in dom.getElementsByTagNameNS( adminNS, "admin"):
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
708 for cNode in node.childNodes:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
709 if cNode.nodeType == cNode.ELEMENT_NODE:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
710 admin[ cNode.localName ] = {}
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
711 admin[ cNode.localName ]['value'] = getText( cNode.childNodes )
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
712
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
713 name = None
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
714 patient = {}
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
715 patientName = None
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
716 for node in dom.childNodes[0].childNodes:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
717 if node.nodeType == node.ELEMENT_NODE:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
718 if node.localName == 'patient':
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
719 for elm in node.childNodes:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
720 if elm.nodeType == elm.ELEMENT_NODE:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
721 if ( elm.localName == 'bcr_patient_barcode' ):
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
722 name = getText( elm.childNodes )
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
723 patientName = name
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
724
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
725 if ( elm.getAttribute( 'procurement_status' ) == "Completed" ):
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
726 patient[ elm.localName ] = {}
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
727 patient[ elm.localName ]['value'] = getText( elm.childNodes )
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
728 patient[ elm.localName ]['tier'] = elm.getAttribute( 'tier' )
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
729 patient[ elm.localName ]['precision'] = elm.getAttribute( 'precision' )
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
730
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
731 if elm.prefix == "auxiliary":
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
732 for aux in elm.childNodes:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
733 if aux.nodeType == aux.ELEMENT_NODE:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
734 for auxval in aux.childNodes:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
735 if auxval.nodeType == auxval.ELEMENT_NODE:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
736 patient[ auxval.localName ] = {}
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
737 patient[ auxval.localName ]['value'] = getText( auxval.childNodes )
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
738 patient[ auxval.localName ]['tier'] = auxval.getAttribute( 'tier' )
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
739 patient[ auxval.localName ]['precision'] = auxval.getAttribute( 'precision' )
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
740
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
741 if name is not None:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
742 for key in admin:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
743 patient[ key ] = admin[ key ]
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
744 self.emit( name, patient, "patient" )
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
745
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
746 for node in dom.childNodes[0].childNodes:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
747 if node.nodeType == node.ELEMENT_NODE and node.localName == 'patient':
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
748 for samples in node.childNodes:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
749 if samples.nodeType == samples.ELEMENT_NODE and samples.localName == 'samples':
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
750 for sample in samples.childNodes:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
751 if sample.nodeType == samples.ELEMENT_NODE and sample.localName == 'sample':
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
752 sampleData = {}
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
753 for value in sample.childNodes:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
754 if value.nodeType == value.ELEMENT_NODE:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
755 if value.localName == 'bcr_sample_barcode' :
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
756 name = getText( value.childNodes )
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
757 if value.getAttribute( 'procurement_status' ) == "Completed" :
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
758 sampleData[ value.localName ] = {}
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
759 sampleData[ value.localName ]['value'] = getText( value.childNodes )
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
760
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
761 if value.localName == 'portions' :
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
762 for portions in value.childNodes:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
763 if portions.nodeType == value.ELEMENT_NODE and portions.localName == "portion":
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
764 portionName = None
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
765 portionData = {}
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
766 for portion in portions.childNodes:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
767 if portion.nodeType == value.ELEMENT_NODE:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
768 if portion.localName == "analytes":
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
769 for analytes in portion.childNodes:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
770 if analytes.nodeType == analytes.ELEMENT_NODE and analytes.localName =="analyte":
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
771 analyteName = None
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
772 analyteData = {}
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
773 for analyte in analytes.childNodes:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
774 if analyte.nodeType == value.ELEMENT_NODE:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
775 if analyte.localName == "aliquots":
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
776 for aliquots in analyte.childNodes:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
777 if aliquots.nodeType == aliquots.ELEMENT_NODE and aliquots.localName =="aliquot":
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
778 aliquotName = None
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
779 aliquotData = {}
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
780 for aliquot in aliquots.childNodes:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
781 if aliquot.nodeType == value.ELEMENT_NODE:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
782 if aliquot.localName == "bcr_aliquot_barcode":
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
783 aliquotName = getText(aliquot.childNodes)
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
784 if aliquot.getAttribute( 'procurement_status' ) == "Completed" :
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
785 aliquotData[ aliquot.localName ] = {}
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
786 aliquotData[ aliquot.localName ]['value'] = getText( aliquot.childNodes )
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
787 if aliquotName is not None and len(aliquotData):
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
788 self.emit( aliquotName, aliquotData, 'aliquot' )
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
789
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
790
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
791 if analyte.localName == "bcr_analyte_barcode":
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
792 analyteName = getText(analyte.childNodes)
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
793 if analyte.getAttribute( 'procurement_status' ) == "Completed" :
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
794 analyteData[ analyte.localName ] = {}
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
795 analyteData[ analyte.localName ]['value'] = getText( analyte.childNodes )
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
796 if analyteName is not None and len(analyteData):
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
797 self.emit( analyteName, analyteData, 'analyte' )
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
798
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
799 if portion.localName == "bcr_portion_barcode":
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
800 portionName = getText( portion.childNodes )
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
801 if portion.getAttribute( 'procurement_status' ) == "Completed" :
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
802 portionData[ portion.localName ] = {}
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
803 portionData[ portion.localName ]['value'] = getText( portion.childNodes )
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
804 if portionName is not None and len(portionData):
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
805 self.emit( portionName, portionData, 'portion' )
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
806
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
807
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
808 #patientName = re.sub( r'\-...$', "", name )
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
809 self.emit( name, sampleData, "sample" )
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
810 self.emit( name, patient, "sample")
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
811 elif samples.nodeType == samples.ELEMENT_NODE and samples.localName == 'drugs':
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
812 for drug in samples.childNodes:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
813 if drug.nodeType == samples.ELEMENT_NODE and drug.localName == 'drug':
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
814 drugData = {}
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
815 for value in drug.childNodes:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
816 if value.nodeType == value.ELEMENT_NODE:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
817 if value.localName == 'bcr_drug_barcode' :
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
818 name = getText( value.childNodes )
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
819 if value.getAttribute( 'procurement_status' ) == "Completed" :
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
820 drugData[ value.localName ] = {}
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
821 drugData[ value.localName ]['value'] = getText( value.childNodes )
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
822
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
823 #patientName = re.sub( r'\-...$', "", name )
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
824 self.emit( patientName, drugData, "drug" )
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
825 elif samples.nodeType == samples.ELEMENT_NODE and samples.localName == 'radiations':
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
826 for rad in samples.childNodes:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
827 if rad.nodeType == samples.ELEMENT_NODE and rad.localName == 'radiation':
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
828 radData = {}
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
829 for value in rad.childNodes:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
830 if value.nodeType == value.ELEMENT_NODE:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
831 if value.localName == 'bcr_radiation_barcode' :
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
832 name = getText( value.childNodes )
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
833 if value.getAttribute( 'procurement_status' ) == "Completed" :
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
834 radData[ value.localName ] = {}
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
835 radData[ value.localName ]['value'] = getText( value.childNodes )
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
836
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
837 #patientName = re.sub( r'\-...$', "", name )
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
838 self.emit( patientName, radData, "radiation" )
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
839
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
840
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
841
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
def getMeta(self, name):
    """Return the cgData metadata dict for the clinical matrix ``name``.

    ``name`` is stored as the record's ``@id``.  The base record is then
    overlaid with ``self.ext_meta`` and finally ``self.config.meta``, so
    the later source wins whenever the same key appears more than once.
    """
    meta = {}
    meta["@context"] = "http://purl.org/cgdata/"
    meta["@type"] = "clinicalMatrix"
    meta["@id"] = name
    meta["lastModified"] = self.config.version
    meta['dataSubType'] = { "@id" : "clinical" }
    meta["rowKeySrc"] = {
        "@type" : "idDAG",
        "@id" : "tcga.%s" % (self.config.abbr),
    }
    # Overlay order matters: config.meta is applied last and therefore wins.
    meta.update(self.ext_meta)
    meta.update(self.config.meta)
    return meta
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
857
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
def fileBuild(self):
    """Pivot the scratch clinical records into one tab-separated matrix each.

    The matrices processed are the seven clinical record types, or only
    ``self.config.clinical_type`` when that is not None.  For every matrix
    whose scratch file ``<work_dir>/<matrixName>`` exists:

    1. the scratch file is sorted with ``sort -k 1`` into
       ``<work_dir>/<matrixName>.sort``;
    2. the sorted file is read through ``TableReader`` as ``(key, value)``
       pairs, where ``value`` maps a column name to a record holding at
       least a ``'value'`` entry;
    3. a table with one row per key and one column per column name seen is
       written to ``<work_dir>/<matrixName>_file`` (header cell ``sample``);
    4. the file is passed to ``self.emitFile`` with the metadata returned by
       ``self.getMeta``.

    When ``self.config.sanitize`` is true the ``race`` and ``ethnicity``
    columns are omitted from the output.
    """
    matrixList = [ "patient", "sample", "radiation", "drug", "portion", "analyte", "aliquot" ]
    if self.config.clinical_type is not None:
        matrixList = [ self.config.clinical_type ]

    # Columns dropped from the output when sanitising is requested.
    sanitizedCols = [ 'race', 'ethnicity' ]

    for matrixName in matrixList:
        rawPath = "%s/%s" % (self.work_dir, matrixName)
        if not os.path.exists(rawPath):
            continue

        # Run sort with an argument list (no shell) so that paths containing
        # spaces or shell metacharacters are passed through untouched.
        sortPath = rawPath + ".sort"
        with open(sortPath, "wb") as sortOut:
            subprocess.call(["sort", "-k", "1", rawPath], stdout=sortOut)

        reader = TableReader(sortPath)
        matrix = {}
        colEnum = {}  # column name -> output column index, in first-seen order
        for key, value in reader:
            if key not in matrix:
                matrix[key] = {}
            for col in value:
                matrix[key][col] = value[col]
                if col not in colEnum:
                    if not self.config.sanitize or col not in sanitizedCols:
                        colEnum[col] = len(colEnum)

        outPath = "%s/%s_file" % (self.work_dir, matrixName)
        with open(outPath, "w") as out:
            header = [None] * (len(colEnum))
            for col in colEnum:
                header[colEnum[col]] = col
            out.write("sample\t%s\n" % ("\t".join(header)))
            for key in matrix:
                row = [""] * (len(colEnum))
                for col in colEnum:
                    if col in matrix[key]:
                        row[colEnum[col]] = matrix[key][col]['value']
                # Non-ASCII characters are replaced while encoding the row.
                out.write("%s\t%s\n" % (key, "\t".join(row).encode("ASCII", "replace")))
        self.emitFile( "." + matrixName, self.getMeta(self.config.name + "." + matrixName), outPath)
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
892
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
893
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
class AgilentImport(TCGAMatrixImport):
    """Attribute-only subclass of TCGAMatrixImport (Agilent, per the name).

    No methods are defined here; the class only sets the attributes below.
    NOTE(review): how TCGAMatrixImport consumes them is not visible in this
    part of the file.
    """
    dataType = 'genomicMatrix'
    dataSubType = 'geneExp'
    sampleMap = 'tcga.iddag'
    probeMap = 'hugo'
    # Presumably the data-column label(s) picked up as probe values -- TODO confirm.
    probeFields = ['log2 lowess normalized (cy5/cy3) collapsed by gene symbol']
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
900
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
901
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
class CGH1x1mImport(TCGASegmentImport):
    """Attribute-only subclass of TCGASegmentImport.

    No methods are defined here; the class only sets the attributes below.
    NOTE(review): how TCGASegmentImport consumes them is not visible in this
    part of the file.
    """
    dataType = 'genomicSegment'
    dataSubType = 'cna'
    sampleMap = 'tcga.iddag'
    # Presumably the column holding the segment value -- TODO confirm.
    probeFields = ['seg.mean']
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
907
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
class SNP6Import(TCGASegmentImport):
    """Segment importer that handles ``*.hg19.seg.txt`` files itself.

    ``fileScan`` emits one record per data row of an hg19 segment file and
    ``fileBuild`` turns the collected records into a five-column segment
    file.  Everything else is inherited from TCGASegmentImport.
    """
    assembly = 'hg19'
    dataSubType = 'cna'
    sampleMap ='tcga.iddag'
    dataType = 'genomicSegment'
    probeFields = ['seg.mean']

    def fileScan(self, path):
        """Emit every data row of an hg19 segment file.

        Files whose name does not end in ``.hg19.seg.txt`` are ignored (an
        hg18 branch existed in this method but was disabled).  The first
        line is the header; header names found in ``commonMap`` are replaced
        by their mapped name.  Each later line is emitted on the
        ``hg19_segment`` port, keyed by its first field, with the remaining
        fields stored under the header names.

        Raises IndexError if a data row has fewer fields than the header.
        """
        if not path.endswith(".hg19.seg.txt"):
            return
        outport = "hg19_segment"

        # The with-block guarantees the handle is closed even if emit() raises.
        with open(path) as handle:
            colName = None
            for line in handle:
                fields = line.rstrip().split("\t")
                if colName is None:
                    colName = [commonMap[col] if col in commonMap else col
                               for col in fields]
                else:
                    out = {}
                    for i in range(1, len(colName)):
                        out[ colName[i] ] = fields[i]
                    self.emit( fields[0], out, outport )

    def fileBuild(self):
        """Write and emit the hg19 segment file from the scanned records.

        The scratch file ``<work_dir>/hg19_segment`` is sorted with
        ``sort -k 1`` and read back through ``TableReader``.  Each record's
        key is looked up in ``self.getTargetMap()`` and passed through
        ``self.translateUUID``; records that translate to None are dropped.
        Surviving records are written as
        ``chrom, loc.start, loc.end, name, seg.mean`` (tab-separated).

        A KeyError while handling a record (unknown target key or missing
        field) is reported through ``self.addError`` and the record is
        skipped.  If no record was read at all, no output file is created
        and nothing is emitted.
        """
        tmap = self.getTargetMap()

        startField = "loc.start"
        endField = "loc.end"
        valField = "seg.mean"
        chromeField = "chrom"

        for base in ['hg19']:
            rawPath = "%s/%s_segment" % (self.work_dir, base)
            sortPath = rawPath + ".sort"
            outPath = rawPath + ".out"

            # Run sort with an argument list (no shell) so that paths with
            # spaces or shell metacharacters are passed through untouched.
            with open(sortPath, "wb") as sortOut:
                subprocess.call(["sort", "-k", "1", rawPath], stdout=sortOut)
            handle = TableReader(sortPath)

            segFile = None  # opened lazily, on the first record
            try:
                for key, value in handle:
                    if segFile is None:
                        segFile = open(outPath, "w")
                    try:
                        curName = self.translateUUID(tmap[key])
                        if curName is not None:
                            # Normalise the chromosome label to a lower-case
                            # "chr" prefix followed by an upper-case name.
                            chrom = value[ chromeField ].lower()
                            if not chrom.startswith("chr"):
                                chrom = "chr" + chrom
                            chrom = chrom.upper().replace("CHR", "chr")
                            segFile.write( "%s\t%s\t%s\t%s\t%s\n" % ( chrom, value[ startField ], value[ endField ], curName, value[ valField ] ) )
                    except KeyError:
                        self.addError( "TargetInfo Not Found: %s" % (key))
            finally:
                if segFile is not None:
                    segFile.close()

            # Previously an empty input crashed on None.close(); with nothing
            # written there is no file to emit, so skip this assembly.
            if segFile is None:
                continue

            self.emitFile("." + base, self.getMeta(self.config.name + "." + base), outPath)
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
976
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
977
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
class HmiRNAImport(TCGAMatrixImport):
    """Attribute-only subclass of TCGAMatrixImport.

    No methods are defined here; the class only sets the attributes below.
    NOTE(review): how TCGAMatrixImport consumes them is not visible in this
    part of the file.
    """
    dataType = 'genomicMatrix'
    dataSubType = 'miRNAExp'
    sampleMap = 'tcga.iddag'
    probeMap = 'agilentHumanMiRNA'
    # Presumably the data-column label(s) picked up as probe values -- TODO confirm.
    probeFields = ['unc_DWD_Batch_adjusted']
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
984
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
class CGH244AImport(TCGASegmentImport):
    """Attribute-only subclass of TCGASegmentImport.

    No methods are defined here; the class only sets the attributes below.
    NOTE(review): how TCGASegmentImport consumes them is not visible in this
    part of the file.
    """
    dataType = 'genomicSegment'
    dataSubType = 'cna'
    sampleMap = 'tcga.iddag'
    # Presumably the column holding the segment value -- TODO confirm.
    probeFields = ['Segment_Mean']
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
990
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
class CGH415K_G4124A(TCGASegmentImport):
    """Attribute-only subclass of TCGASegmentImport.

    No methods are defined here; the class only sets the attributes below.
    NOTE(review): how TCGASegmentImport consumes them is not visible in this
    part of the file.
    """
    dataType = 'genomicSegment'
    dataSubType = 'cna'
    sampleMap = 'tcga.iddag'

    # Input column names; presumably read by the base class when it builds
    # segments -- TODO confirm.
    chromeField = 'Chromosome'
    startField = 'Start'
    endField = 'End'
    probeFields = ['Segment_Mean']
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
999
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1000
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
class IlluminaHiSeq_DNASeqC(TCGASegmentImport):
    """TCGASegmentImport subclass that filters some translated ids.

    Besides the class attributes, it overrides ``translateUUID`` so that ids
    matching ``TCGA-..-....-1`` are reported as None.
    NOTE(review): how TCGASegmentImport consumes the attributes is not
    visible in this part of the file.
    """
    dataSubType = 'cna'
    sampleMap = 'tcga.iddag'
    chromeField = 'Chromosome'
    dataType = 'genomicSegment'
    endField = 'End'
    probeFields = ['Segment_Mean']
    startField = 'Start'

    def translateUUID(self, uuid):
        """Translate ``uuid`` via ``self.config.translateUUID`` and filter it.

        Returns None when the translated id starts with ``TCGA-..-....-1``
        (the original comment calls these "normal ids"), or when the
        underlying translation itself returned None.  Otherwise returns the
        translated id unchanged.
        """
        out = self.config.translateUUID(uuid)
        # Guard: re.search() raises TypeError on None, so pass a failed
        # translation through as None instead of crashing.
        if out is None:
            return None
        #censor out normal ids
        if re.search(r'^TCGA-..-....-1', out):
            return None
        return out
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1016
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
class HT_HGU133A(TCGAMatrixImport):
    """Attribute-only subclass of TCGAMatrixImport.

    No methods are defined here; the class only sets the attributes below.
    NOTE(review): how TCGAMatrixImport consumes them is not visible in this
    part of the file.
    """
    dataType = 'genomicMatrix'
    dataSubType = 'geneExp'
    sampleMap = 'tcga.iddag'
    probeMap = 'affyU133a'
    # Presumably the data-column label(s) picked up as probe values -- TODO confirm.
    probeFields = ['Signal']
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1023
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
class HuEx1_0stv2(TCGAMatrixImport):
    """Attribute-only subclass of TCGAMatrixImport.

    No methods are defined here; the class only sets the attributes below.
    NOTE(review): how TCGAMatrixImport consumes them is not visible in this
    part of the file.
    """
    dataType = 'genomicMatrix'
    # NOTE(review): 'miRNAExp' combined with probeMap 'hugo' and a
    # '*gene.txt' file filter looks like it may have been copied from
    # HmiRNAImport -- confirm this is the intended sub-type.
    dataSubType = 'miRNAExp'
    sampleMap = 'tcga.iddag'
    probeMap = 'hugo'
    probeFields = ['Signal']
    # Regular expression; presumably selects which archive files are scanned
    # -- TODO confirm against the base class.
    fileInclude = '^.*gene.txt$|^.*sdrf.txt$'
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1031
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
class Human1MDuoImport(TCGASegmentImport):
    """Attribute-only subclass of TCGASegmentImport.

    No methods are defined here; the class only sets the attributes below.
    NOTE(review): how TCGASegmentImport consumes them is not visible in this
    part of the file.
    """
    dataType = 'genomicSegment'
    dataSubType = 'cna'
    sampleMap = 'tcga.iddag'
    # Presumably the column holding the segment value -- TODO confirm.
    probeFields = ['mean']
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1037
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
class HumanHap550(TCGASegmentImport):
    """Attribute-only subclass of TCGASegmentImport.

    No methods are defined here; the class only sets the attributes below.
    NOTE(review): how TCGASegmentImport consumes them is not visible in this
    part of the file.
    """
    dataType = 'genomicSegment'
    dataSubType = 'cna'
    sampleMap = 'tcga.iddag'
    # Presumably the column holding the segment value -- TODO confirm.
    probeFields = ['mean']
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1043
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
class HumanMethylation27(TCGAMatrixImport):
    """Attribute-only subclass of TCGAMatrixImport.

    No methods are defined here; the class only sets the attributes below.
    NOTE(review): how TCGAMatrixImport consumes them is not visible in this
    part of the file.
    """
    dataType = 'genomicMatrix'
    dataSubType = 'DNAMethylation'
    sampleMap = 'tcga.iddag'
    probeMap = 'illuminaMethyl27K_gpl8490'
    # Both capitalisations of the value column label are listed.
    probeFields = ['Beta_Value', 'Beta_value']
    # Regular expression; presumably names files to leave out of the scan
    # -- TODO confirm against the base class.
    fileExclude = '.*.adf.txt'
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1051
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1052
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
class HumanMethylation450(TCGAMatrixImport):
    """TCGAMatrixImport subclass with its own two-header-row file parser.

    NOTE(review): how TCGAMatrixImport consumes the class attributes is not
    visible in this part of the file.
    """
    dataSubType = 'DNAMethylation'
    probeMap = 'illuminaHumanMethylation450'
    sampleMap = 'tcga.iddag'
    dataType = 'genomicMatrix'
    fileExclude = '.*.adf.txt'
    probeFields = ['Beta_value', 'Beta_Value']

    def fileScan(self, path):
        """Parse a two-header-row matrix file and emit its probe values.

        The file is only processed when the first cell of its first line is
        ``Hybridization REF`` or ``Sample REF``; data rows of any other file
        are read and ignored.  The first line gives one target name per
        column and the second line one value-type label per column; both are
        passed through ``commonMap`` where a mapping exists.

        For each data row, every column whose type label is listed in
        ``self.probeFields`` is formatted as ``"%.4f"`` (or ``"NA"`` when
        the cell is missing or not numeric) and grouped by target name.  One
        record per target is then emitted on the ``probes`` port, keyed by
        the row's first field.
        """
        mode = None
        #modes
        #1 - two col header matrix file
        colName = None
        colType = None
        # The with-block closes the file; the handle was previously leaked.
        with open(path) as iHandle:
            for line in iHandle:
                fields = line.rstrip().split("\t")
                if colName is None:
                    colName = fields
                    # The mode check uses the raw header cell, before mapping.
                    if colName[0] == "Hybridization REF" or colName[0] == "Sample REF":
                        mode = 1
                    for i, col in enumerate(colName):
                        if col in commonMap:
                            colName[i] = commonMap[ col ]
                elif mode == 1 and colType is None:
                    colType = fields
                    for i, col in enumerate(colType):
                        if col in commonMap:
                            colType[i] = commonMap[ col ]
                elif mode == 1:
                    out = {}
                    for col in colName[1:]:
                        out[ col ] = { "target" : col }
                    for i in range(1, len(colType)):
                        try:
                            if colType[i] in self.probeFields:
                                out[ colName[i] ][ colType[i] ] = "%.4f" % float(fields[i])
                        except (IndexError, ValueError):
                            # Short row or non-numeric cell.
                            out[ colName[i] ][ colType[i] ] = "NA"
                    for col in out:
                        self.emit( fields[0], out[col], "probes" )
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1104
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
class Illumina_RNASeq(TCGAMatrixImport):
    """Import settings registered in ``tcgaConfig`` under 'IlluminaGA_RNASeq'.

    The class only declares configuration values; all processing is
    inherited from ``TCGAMatrixImport``.
    """
    # NOTE(review): the exact meaning of each attribute is defined by
    # TCGAMatrixImport, which is outside this section -- confirm there.
    dataSubType = 'geneExp'
    sampleMap = 'tcga.iddag'
    probeMap = 'hugo.unc'
    # Column type(s) whose values are kept (presumably matched against the
    # second header row of each data file -- verify against the base class).
    probeFields = ['RPKM']
    # Regular expression selecting gene quantification files and SDRF files.
    fileInclude = r'^.*\.gene.quantification.txt$|^.*sdrf.txt$'
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1111
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
class Illumina_RNASeqV2(TCGAMatrixImport):
    """Import settings registered in ``tcgaConfig`` under 'IlluminaHiSeq_RNASeqV2'.

    The class only declares configuration values; all processing is
    inherited from ``TCGAMatrixImport``.
    """
    # NOTE(review): the exact meaning of each attribute is defined by
    # TCGAMatrixImport, which is outside this section -- confirm there.
    dataSubType = 'geneExp'
    sampleMap = 'tcga.iddag'
    probeMap = 'hugo.unc'
    # Column type(s) whose values are kept (presumably matched against the
    # second header row of each data file -- verify against the base class).
    probeFields = ['normalized_count']
    # Regular expression selecting RSEM normalized gene results and SDRF files.
    fileInclude = r'^.*rsem.genes.normalized_results$|^.*sdrf.txt$'
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1118
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
class IlluminaHiSeq_RNASeq(TCGAMatrixImport):
    """Import settings registered in ``tcgaConfig`` under 'IlluminaHiSeq_RNASeq'.

    The class only declares configuration values; all processing is
    inherited from ``TCGAMatrixImport``.
    """
    # NOTE(review): the exact meaning of each attribute is defined by
    # TCGAMatrixImport, which is outside this section -- confirm there.
    dataSubType = 'geneExp'
    sampleMap = 'tcga.iddag'
    probeMap = 'hugo.unc'
    # Column type(s) whose values are kept (presumably matched against the
    # second header row of each data file -- verify against the base class).
    probeFields = ['RPKM']
    # Regular expression selecting gene quantification files only; unlike
    # Illumina_RNASeq this pattern has no SDRF alternative.
    fileInclude = r'^.*gene.quantification.txt$'
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1125
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
class MDA_RPPA_Core(TCGAMatrixImport):
    """Import settings registered in ``tcgaConfig`` under 'MDA_RPPA_Core'.

    Declares the configuration values for this platform and overrides
    ``getTargetMap`` so that the ``.SD`` marker is removed from target names.
    """
    # NOTE(review): the exact meaning of each attribute is defined by
    # TCGAMatrixImport, which is outside this section -- confirm there.
    sampleMap = 'tcga.iddag'
    probeMap = "md_anderson_antibodies"
    dataSubType = "RPPA"
    # Regular expression for files to leave out (antibody annotation tables).
    fileExclude = r'^.*.antibody_annotation.txt'
    probeFields = [ 'Protein Expression' ]

    def getTargetMap(self):
        """Return a dict of target key -> target value with '.SD' removed.

        The ``targets`` file in ``self.work_dir`` is sorted on its first key
        by the external ``sort`` program into ``targets.sort``; that sorted
        file is then read through ``TableReader`` as (key, value) pairs.

        Returns:
            dict mapping every key read from ``targets.sort`` to its value
            with each occurrence of the text ``.SD`` stripped out.
        """
        unsorted_path = self.work_dir + "/targets"
        sorted_path = self.work_dir + "/targets.sort"

        # Run sort with an argument list and an explicit output file rather
        # than a shell command line, so a work_dir containing spaces or shell
        # metacharacters cannot break (or alter) the command.
        with open(sorted_path, "w") as sorted_out:
            subprocess.call(["sort", "-k", "1", unsorted_path], stdout=sorted_out)

        handle = TableReader(sorted_path)
        tTrans = {}
        for key, value in handle:
            tTrans[ key ] = re.sub(r'\.SD', '', value)
        return tTrans
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1141
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1142
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
class Illumina_miRNASeq(TCGAMatrixImport):
    """Import settings shared by the two miRNASeq entries of ``tcgaConfig``.

    Registered under both 'IlluminaGA_miRNASeq' and 'IlluminaHiSeq_miRNASeq'.
    The class only declares configuration values; all processing is
    inherited from ``TCGAMatrixImport``.
    """
    # NOTE(review): the exact meaning of each attribute is defined by
    # TCGAMatrixImport, which is outside this section -- confirm there.
    dataSubType = 'miRNA'
    sampleMap = 'tcga.iddag'
    probeMap = 'hsa.mirna'
    # Column type(s) whose values are kept (presumably matched against the
    # second header row of each data file -- verify against the base class).
    probeFields = ['reads_per_million_miRNA_mapped']
    # Regular expression selecting miRNA quantification files.
    fileInclude = '^.*.mirna.quantification.txt$'
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1149
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1150
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
class bioImport(TCGAClinicalImport):
    """Import settings registered in ``tcgaConfig`` under 'bio'.

    The class only declares configuration values; all processing is
    inherited from ``TCGAClinicalImport``.
    """
    # Regular expression selecting the XML files of an archive.
    fileInclude = '.*.xml$'
    # NOTE(review): how sampleMap is used is defined by the base class,
    # which is outside this section -- confirm there.
    sampleMap = 'tcga.iddag'
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1154
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
# Registry of supported platforms: maps a TCGA platform name to the importer
# class that is instantiated with the build configuration and then run.
# Platforms missing from this table are reported as "not supported".
tcgaConfig = {
    'AgilentG4502A_07': AgilentImport,
    'AgilentG4502A_07_1': AgilentImport,
    'AgilentG4502A_07_2': AgilentImport,
    'AgilentG4502A_07_3': AgilentImport,
    'CGH-1x1M_G4447A': CGH1x1mImport,
    'Genome_Wide_SNP_6': SNP6Import,
    'H-miRNA_8x15K': HmiRNAImport,
    'H-miRNA_8x15Kv2': HmiRNAImport,
    'HG-CGH-244A': CGH244AImport,
    'HG-CGH-415K_G4124A': CGH415K_G4124A,
    'HT_HG-U133A': HT_HGU133A,
    'HuEx-1_0-st-v2': HuEx1_0stv2,
    'Human1MDuo': Human1MDuoImport,
    'HumanHap550': HumanHap550,
    'IlluminaHiSeq_DNASeqC': IlluminaHiSeq_DNASeqC,
    'HumanMethylation27': HumanMethylation27,
    'HumanMethylation450': HumanMethylation450,
    'IlluminaHiSeq_RNASeq': IlluminaHiSeq_RNASeq,
    'IlluminaGA_RNASeq': Illumina_RNASeq,
    'IlluminaHiSeq_RNASeqV2': Illumina_RNASeqV2,
    'MDA_RPPA_Core': MDA_RPPA_Core,
    'IlluminaGA_miRNASeq': Illumina_miRNASeq,
    'IlluminaHiSeq_miRNASeq': Illumina_miRNASeq,
    'bio': bioImport,
}
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1181
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
def fileDigest( file ):
    """Return the hexadecimal MD5 digest of the file at path ``file``.

    The file is read in binary mode in 8192-byte chunks, so files of any
    size can be hashed without loading them into memory at once.

    Args:
        file: path of the file to hash.

    Returns:
        The MD5 digest as a 32-character lowercase hex string.

    Raises:
        IOError/OSError: if the file cannot be opened or read.
    """
    md5 = hashlib.md5()
    with open(file, 'rb') as f:
        # A binary read returns an empty *bytes* object at end of file, so the
        # sentinel must be b''. A text '' sentinel only terminates where
        # bytes and str are the same type and otherwise loops forever.
        for chunk in iter(lambda: f.read(8192), b''):
            md5.update(chunk)
    return md5.hexdigest()
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1188
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1189
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
def platform_list():
    """Yield the 'name' field of every entry returned by the "Platform" query."""
    for entry in CustomQuery("Platform"):
        yield entry['name']
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1194
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
def supported_list():
    """Yield the names from the "Platform" query that have a ``tcgaConfig`` entry."""
    for entry in CustomQuery("Platform"):
        platform_name = entry['name']
        if platform_name not in tcgaConfig:
            continue
        yield platform_name
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1200
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
def platform_archives(platform):
    """Yield each distinct archive 'baseName' for ``platform``, in first-seen order.

    Only archives flagged ``isLatest=1`` are queried; a base name that shows
    up in several results is yielded once.
    """
    results = CustomQuery("Archive[Platform[@name=%s]][@isLatest=1]" % platform)
    already_seen = set()
    for entry in results:
        base = entry['baseName']
        if base in already_seen:
            continue
        yield base
        already_seen.add(base)
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1209
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1210
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1211 if __name__ == "__main__":
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1212
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1213 parser = ArgumentParser()
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1214 #Stack.addJobTreeOptions(parser)
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1215
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1216 parser.add_argument("-a", "--platform-list", dest="platform_list", action="store_true", help="Get list of platforms", default=False)
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1217 parser.add_argument("-u", "--uuid", dest="uuid_table", help="UUID to Barcode Table", default=None)
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1218 parser.add_argument("-t", "--uuid-download", dest="uuid_download", help="Download UUID/Barcode Table", default=False)
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1219 parser.add_argument("-z", "--all-archives", dest="all_archives", action="store_true", help="List all archives", default=False)
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1220 parser.add_argument("-p", "--platform", dest="platform", help="Platform Selection", default=None)
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1221 parser.add_argument("-l", "--supported", dest="supported_list", action="store_true", help="List Supported Platforms", default=None)
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1222 parser.add_argument("-f", "--filelist", dest="filelist", help="List files needed to convert TCGA project basename into cgData", default=None)
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1223 parser.add_argument("-b", "--basename", dest="basename", help="Convert TCGA project basename into cgData", default=None)
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1224 parser.add_argument("-m", "--mirror", dest="mirror", help="Mirror Location", default=None)
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1225 parser.add_argument("-w", "--workdir", dest="workdir_base", help="Working directory", default="/tmp")
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1226 parser.add_argument("--out-dir", dest="outdir", help="Working directory", default="./")
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1227 parser.add_argument("-o", "--out", dest="outpath", help="Output Dest", default=None)
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1228 parser.add_argument("--out-error", dest="errorpath", help="Output Error", default=None)
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1229 parser.add_argument("--out-meta", dest="metapath", help="Output Meta", default=None)
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1230 parser.add_argument("-c", "--cancer", dest="cancer", help="List Archives by cancer type", default=None)
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1231 parser.add_argument("-d", "--download", dest="download", help="Download files for archive", default=None)
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1232 parser.add_argument("-e", "--level", dest="level", help="Data Level ", default="3")
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1233 parser.add_argument("-s", "--check-sum", dest="checksum", help="Check project md5", default=None)
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1234 parser.add_argument("-r", "--sanitize", dest="sanitize", action="store_true", help="Remove race/ethnicity from clinical data", default=False)
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1235 parser.add_argument("-x", "--clinical", dest="clinical", help="Process clinical info", default=None)
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1236 parser.add_argument("--clinical-basename", dest="clinical_basename", help="Select Clinical Data by basename", default=None)
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1237 parser.add_argument("--clinical-type", dest="clinical_type", help="Clinical Data Type", default=None)
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1238 parser.add_argument("--all-clinical", dest="all_clinical", action="store_true", help="List all clinical archives", default=False)
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1239 parser.add_argument("--out-clinical", dest="out_clinical", action="append", nargs=3, default=[])
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1240 parser.add_argument("--samples", dest="get_samples", action="store_true", default=False)
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1241
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1242 options = parser.parse_args()
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1243
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1244 if options.uuid_download:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1245 url="https://tcga-data.nci.nih.gov/uuid/uuidBrowserExport.htm"
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1246 data = {}
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1247 data['exportType'] = 'tab'
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1248 data['cols'] = "uuid,barcode"
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1249 urllib.urlretrieve( url, options.uuid_download, data=urllib.urlencode(data))
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1250
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1251 if options.platform_list:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1252 for e in platform_list():
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1253 print e
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1254
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1255 if options.supported_list:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1256 for e in supported_list():
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1257 print e
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1258
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1259 if options.platform:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1260 for name in platform_archives( options.platform ):
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1261 print name
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1262
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1263 if options.all_archives:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1264 q = CustomQuery("Archive[@isLatest=1][ArchiveType[@type=Level_%s]]" % (options.level))
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1265 out = {}
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1266 for e in q:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1267 name = e['baseName']
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1268 if name not in out:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1269 print name
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1270 out[name] = True
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1271
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1272 if options.all_clinical:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1273 q = CustomQuery("Archive[@isLatest=1][Platform[@alias=bio]]")
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1274 out = {}
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1275 for e in q:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1276 name = e['baseName']
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1277 if name not in out:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1278 print name
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1279 out[name] = True
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1280
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1281 if options.get_samples:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1282 url="https://tcga-data.nci.nih.gov/datareports/aliquotExport.htm"
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1283 data = {}
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1284
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1285 data['exportType'] = 'tab'
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1286 data['cols'] = 'aliquotId,disease,bcrBatch,center,platform,levelOne,levelTwo,levelThree'
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1287 data['filterReq'] = json.dumps({"disease":"","levelOne":"","aliquotId":"","center":"","levelTwo":"","bcrBatch":"","platform":"","levelThree":""})
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1288 data['formFilter'] = json.dumps({"disease":"","levelOne":"","aliquotId":"","center":"","levelTwo":"","bcrBatch":"","platform":"","levelThree":""})
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1289 handle = urllib.urlopen( url + "?" + urllib.urlencode(data))
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1290
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1291 for line in handle:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1292 tmp = line.rstrip().split("\t")
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1293 if tmp[7] == "Submitted":
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1294 if tmp[0][13]=='0':
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1295 print "\t".join( [ tmp[0], tmp[1], "Tumor", tmp[4] ] )
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1296 elif tmp[0][13] == '1':
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1297 print "\t".join( [ tmp[0], tmp[1], "Normal", tmp[4] ] )
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1298
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1299
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1300 if options.cancer is not None:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1301 q = CustomQuery("Archive[@isLatest=1][Disease[@abbreviation=%s]][ArchiveType[@type=Level_%s]]" % (options.cancer, options.level))
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1302 out = {}
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1303 for e in q:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1304 name = e['baseName']
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1305 if name not in out:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1306 print name
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1307 out[name] = True
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1308
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1309 if options.filelist:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1310 q = CustomQuery("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=Level_%s]]" % (options.filelist, options.level))
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1311 for e in q:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1312 print e['deployLocation']
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1313 q = CustomQuery("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=mage-tab]]" % (options.filelist))
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1314 for e in q:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1315 print e['deployLocation']
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1316
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1317 if options.checksum:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1318 urls = []
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1319 q = CustomQuery("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=Level_%s]]" % (options.checksum, options.level))
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1320 for e in q:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1321 urls.append( e['deployLocation'] )
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1322 q = CustomQuery("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=mage-tab]]" % (options.checksum))
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1323 for e in q:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1324 urls.append( e['deployLocation'] )
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1325
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1326 for url in urls:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1327 dst = os.path.join(options.mirror, re.sub("^/", "", url))
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1328 if not os.path.exists( dst ):
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1329 print "NOT_FOUND:", dst
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1330 continue
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1331 if not os.path.exists( dst + ".md5" ):
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1332 print "MD5_NOT_FOUND", dst
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1333 continue
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1334
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1335 handle = open( dst + ".md5" )
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1336 line = handle.readline()
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1337 omd5 = line.split(' ')[0]
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1338 handle.close()
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1339
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1340 nmd5 = fileDigest( dst )
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1341 if omd5 != nmd5:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1342 print "CORRUPT:", dst
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1343 else:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1344 print "OK:", dst
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1345
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1346
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1347 if options.download is not None:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1348 if options.mirror is None:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1349 print "Define mirror location"
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1350 sys.exit(1)
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1351
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1352 urls = []
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1353
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1354 if options.basename is None and options.clinical is None and options.clinical_basename is None:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1355 q = CustomQuery("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=Level_%s]]" % (options.download, options.level))
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1356 for e in q:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1357 urls.append( e['deployLocation'] )
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1358 urls.append( e['deployLocation'] + ".md5" )
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1359
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1360 q = CustomQuery("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=mage-tab]]" % (options.download))
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1361 for e in q:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1362 urls.append( e['deployLocation'] )
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1363 urls.append( e['deployLocation'] + ".md5" )
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1364
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1365 if options.basename:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1366 q = CustomQuery("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=Level_%s]]" % (options.basename, options.level))
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1367 for e in q:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1368 urls.append( e['deployLocation'] )
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1369 urls.append( e['deployLocation'] + ".md5" )
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1370
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1371 q = CustomQuery("Archive[@baseName=%s][@isLatest=1][ArchiveType[@type=mage-tab]]" % (options.basename))
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1372 for e in q:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1373 urls.append( e['deployLocation'] )
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1374 urls.append( e['deployLocation'] + ".md5" )
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1375
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1376 if options.clinical:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1377 q = CustomQuery("Archive[@isLatest=1][Disease[@abbreviation=%s]][Platform[@alias=bio]]" % (options.clinical))
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1378 for e in q:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1379 urls.append( e['deployLocation'] )
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1380 urls.append( e['deployLocation'] + ".md5" )
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1381
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1382 if options.clinical_basename:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1383 q = CustomQuery("Archive[@isLatest=1][@baseName=%s]" % (options.clinical_basename))
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1384 for e in q:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1385 urls.append( e['deployLocation'] )
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1386 urls.append( e['deployLocation'] + ".md5" )
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1387
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1388
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1389
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1390 for url in urls:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1391 src = "https://tcga-data.nci.nih.gov/" + url
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1392 dst = os.path.join(options.mirror, re.sub("^/", "", url))
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1393 dir = os.path.dirname(dst)
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1394 if not os.path.exists(dir):
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1395 print "mkdir", dir
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1396 os.makedirs(dir)
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1397 if not os.path.exists( dst ):
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1398 print "download %s to %s" % (src, dst)
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1399 urllib.urlretrieve(src, dst)
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1400
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1401 if options.basename:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1402 if options.mirror is None:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1403 sys.stderr.write("Need mirror location\n")
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1404 sys.exit(1)
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1405
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1406 conf = getBaseBuildConf(options.basename, options.level, options.mirror)
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1407 conf.addOptions(options)
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1408 if conf.platform not in tcgaConfig:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1409 sys.stderr.write("Platform %s not supported\n" % (conf.platform))
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1410 sys.exit(1)
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1411
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1412 ext = tcgaConfig[conf.platform](conf)
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1413 ext.run()
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1414
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1415
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1416 if options.clinical:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1417 if options.mirror is None:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1418 sys.stderr.write("Need mirror location\n")
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1419 sys.exit(1)
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1420
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1421 q = CustomQuery("Archive[@isLatest=1][Disease[@abbreviation=%s]][Platform[@alias=bio]]" % (options.clinical))
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1422 basenames = {}
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1423 for s in q:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1424 basenames[s['baseName']] = True
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1425
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1426 for base in basenames:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1427 conf = getBaseBuildConf(base, 1, options.mirror)
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1428 conf.addOptions(options)
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1429
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1430 ext = tcgaConfig[conf.platform](conf)
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1431 ext.run()
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1432
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1433 if options.clinical_basename:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1434 if options.mirror is None:
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1435 sys.stderr.write("Need mirror location\n")
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1436 sys.exit(1)
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1437
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1438
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1439 conf = getBaseBuildConf(options.clinical_basename, 1, options.mirror)
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1440 conf.addOptions(options)
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1441
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1442 ext = tcgaConfig[conf.platform](conf)
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1443 ext.run()
f1c71f5363ae Uploaded
kellrott
parents:
diff changeset
1444