Mercurial > repos > ynewton > extract_tumor_vs_normal_tcga_ids
changeset 0:72b0123a8587 draft
Uploaded
author | ynewton |
---|---|
date | Thu, 17 Jan 2013 20:07:57 -0500 |
parents | |
children | 5240f8985481 |
files | get_normal_vs_tumor_sample_ids.py |
diffstat | 1 files changed, 49 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
"""Split the TCGA sample IDs of an expression matrix into normal and tumor lists.

Only the first (header) line of the tab-separated matrix given by
``--in_matrix`` is read.  The first header cell is treated as the row-label
column and skipped; every remaining cell that looks like a TCGA barcode is
written, one ID per line, to ``--out_normals`` or ``--out_tumor``.

Example barcode: TCGA-A2-A0D2-01A-21R-A034-07
"""
import optparse
import re

# The fourth dash-separated field of the barcode decides the class:
# a leading "1" (e.g. 11A) is collected as normal, a leading "0" (e.g. 01A)
# as tumor.  The two patterns are mutually exclusive.
NORMAL_SAMPLE_RE = re.compile(r'TCGA-\w\w-\w\w\w\w-1\d\w-.*')
TUMOR_SAMPLE_RE = re.compile(r'TCGA-\w\w-\w\w\w\w-0\d\w-.*')


def _read_sample_ids(matrix_path):
    """Return ``(normal_ids, tumor_ids)`` found in the header of *matrix_path*.

    Both lists keep the column order of the header.  An empty file yields two
    empty lists.
    """
    with open(matrix_path, 'r') as matrix_file:
        header = matrix_file.readline()

    # Strip trailing whitespace only: a leading tab means the first cell is
    # empty, and removing it would shift the columns so that the first sample
    # is mistaken for the row-label column and dropped.
    sample_fields = header.rstrip().split("\t")[1:]

    normal_ids = [field for field in sample_fields if NORMAL_SAMPLE_RE.match(field)]
    tumor_ids = [field for field in sample_fields if TUMOR_SAMPLE_RE.match(field)]
    return normal_ids, tumor_ids


def _write_ids(path, sample_ids):
    """Write *sample_ids* to *path*, one per line, followed by a final newline.

    An empty list still produces a single newline, which is what the
    original ``print >> file`` statement emitted.
    """
    with open(path, 'w') as out_file:
        out_file.write("\n".join(sample_ids) + "\n")


def main(argv=None):
    """Parse the command line and write the normal and tumor ID files.

    argv -- argument list without the program name; ``None`` means
            ``sys.argv[1:]``.

    Exits through ``parser.error`` (status 2) when a required option is
    missing.
    """
    parser = optparse.OptionParser()
    parser.add_option("--in_matrix", dest="in_matrix", action="store", default="", help="")
    parser.add_option("--out_normals", dest="out_normals", action="store", default="", help="")
    parser.add_option("--out_tumor", dest="out_tumor", action="store", default="", help="")
    opts, args = parser.parse_args(argv)

    # All three options default to "", which open() cannot use; fail early
    # with a usage message instead of an opaque I/O error.
    if not (opts.in_matrix and opts.out_normals and opts.out_tumor):
        parser.error("--in_matrix, --out_normals and --out_tumor are all required")

    normal_samples, tumor_samples = _read_sample_ids(opts.in_matrix)
    _write_ids(opts.out_normals, normal_samples)
    _write_ids(opts.out_tumor, tumor_samples)


if __name__ == "__main__":
    main()