Mercurial > repos > ynewton > extract_tumor_vs_normal_tcga_ids

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/get_normal_vs_tumor_sample_ids.py	Thu Jan 17 20:07:57 2013 -0500
@@ -0,0 +1,49 @@
+"""Split the TCGA sample IDs in a matrix header into normal and tumor lists.
+
+Only the first (header) line of the tab-separated --in_matrix file is read;
+matching sample IDs are written one per line to --out_normals and --out_tumor.
+"""
+import optparse, re
+
+parser = optparse.OptionParser()
+parser.add_option("--in_matrix", dest="in_matrix", action="store", default="", help="tab-separated matrix whose header row holds the sample IDs")
+parser.add_option("--out_normals", dest="out_normals", action="store", default="", help="output file for normal sample IDs, one per line")
+parser.add_option("--out_tumor", dest="out_tumor", action="store", default="", help="output file for tumor sample IDs, one per line")
+opts, args = parser.parse_args()
+if not (opts.in_matrix and opts.out_normals and opts.out_tumor):
+    parser.error("--in_matrix, --out_normals and --out_tumor are all required")
+
+# Process input arguments:
+input_expression_file_name = opts.in_matrix
+output_normal_file_name = opts.out_normals
+output_tumor_file_name = opts.out_tumor
+
+# Example barcode: TCGA-A2-A0D2-01A-21R-A034-07. The two digits opening the
+# fourth field decide the class here: 1x is collected as normal, 0x as tumor.
+# Raw strings keep \w and \d from being read as string escapes.
+normal_sample_re = re.compile(r'TCGA-\w\w-\w\w\w\w-1\d\w-.*')
+tumor_sample_re = re.compile(r"TCGA-\w\w-\w\w\w\w-0\d\w-.*")
+
+
+def read_header_ids(path):
+    """Return the header cells of the matrix at *path*, minus the first column."""
+    with open(path, 'r') as matrix_file:
+        header = matrix_file.readline()
+    # rstrip(), not strip(): stripping the left side would swallow the tab after
+    # an empty corner cell, shifting the columns so the first sample is dropped.
+    return header.rstrip().split("\t")[1:]
+
+
+def write_ids(path, sample_ids):
+    """Write *sample_ids* to *path*, one per line, with a trailing newline."""
+    with open(path, 'w') as out_file:
+        # write() instead of the Python 2-only "print >>" statement, so the
+        # script runs unchanged on Python 3; the bytes written are the same.
+        out_file.write("\n".join(sample_ids) + "\n")
+
+
+sample_ids = read_header_ids(input_expression_file_name)
+normal_samples = [s for s in sample_ids if normal_sample_re.match(s)]
+tumor_samples = [s for s in sample_ids if tumor_sample_re.match(s)]
+write_ids(output_normal_file_name, normal_samples)
+write_ids(output_tumor_file_name, tumor_samples)
\ No newline at end of file