Mercurial > repos > lnguyen > sort_by_tissue
comparison sort_by_tissue.py @ 0:3155d867c056 draft default tip
planemo upload
author | lnguyen |
---|---|
date | Fri, 15 Sep 2017 11:04:37 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:3155d867c056 |
---|---|
1 import argparse | |
2 import re | |
3 | |
4 def options(): | |
5 parser = argparse.ArgumentParser() | |
6 parser.add_argument("--input",nargs="+", required=True, help="MaxQuant file") | |
7 parser.add_argument("--hpa", required=True, help="HPA file") | |
8 parser.add_argument("--tissues_del", required=True, help="List of tissues which expressed genes in are discarded") | |
9 parser.add_argument("--tissues_keep", help="List of tissues to keep regardless being expressed in list tissues_del..") | |
10 parser.add_argument("-o", "--output", default="HPA_selection.txt") | |
11 parser.add_argument("--trash", default="Trash.txt", help="Write filtered genes into a file") | |
12 parser.add_argument("--trash_file_detail", default="Trash_detail.txt", help="Write filtered genes with detailed information into a file") | |
13 parser.add_argument("--na_file", default="NaN.txt", help="Write genes whose name not found in HPA file") | |
14 | |
15 args = parser.parse_args() | |
16 #print(args.mq, args.hpa, args.tissues_del, args.tissues_keep, args.output, args.trash, args.trash_file_detail) | |
17 | |
18 filterHPA(args.input, args.hpa, args.tissues_del, args.tissues_keep, args.output, args.trash, args.trash_file_detail, args.na_file) | |
19 | |
20 def isnumber(format, n): | |
21 # Check if an element is integer or float number | |
22 float_format = re.compile("^[\-]?[1-9][0-9]*\.?[0-9]+$") | |
23 int_format = re.compile("^[\-]?[1-9][0-9]*$") | |
24 test = "" | |
25 if format == "int": | |
26 test = re.match(int_format, n) | |
27 elif format == "float": | |
28 test = re.match(float_format, n) | |
29 if test: | |
30 return True | |
31 else: | |
32 return False | |
33 | |
34 def readHPA(HPAfile, tissues_del, tissues_keep): | |
35 # Read HPA file | |
36 hpa = open(HPAfile, "r") | |
37 hpa = hpa.readlines() | |
38 # Extract lists of genes expressed in tissues to keep and in tissue to delete | |
39 tdel_dict = {} | |
40 if tissues_del: | |
41 tissues_del = tissues_del.split(",") | |
42 else: | |
43 tissues_del = [] | |
44 #print("List of tissues to del", tissues_del) | |
45 tkeep_dict = {} | |
46 if tissues_keep: | |
47 tissues_keep = tissues_keep.split(",") | |
48 else: | |
49 tissues_keep = [] | |
50 #print("List of tissues to keep", tissues_keep) | |
51 for line in hpa[1:]: | |
52 ensg = line.replace('"', "").split(",")[0] | |
53 tissue = line.replace('"', "").split(",")[2] | |
54 for t in tissues_del: | |
55 if tissue == t: | |
56 if t not in tdel_dict: | |
57 tdel_dict[t] = [ensg] | |
58 else: | |
59 if ensg not in tdel_dict[t]: | |
60 tdel_dict[t].append(ensg) | |
61 for k in tissues_keep: | |
62 if tissue == k: | |
63 if k not in tkeep_dict: | |
64 tkeep_dict[k] = [ensg] | |
65 else: | |
66 if ensg not in tkeep_dict[k]: | |
67 tkeep_dict[k].append(ensg) | |
68 | |
69 return tdel_dict, tkeep_dict | |
70 | |
71 def filterHPA(input, HPAfile, tissues_del, tissues_keep, output, trash_file, trash_file_detail, na_file): | |
72 input_type = input[1] | |
73 if input_type == "file": | |
74 input_file = input[0] | |
75 header = input[2] | |
76 ncol = input[3] | |
77 file_content = open(input_file, "r").readlines() | |
78 if isnumber("int", ncol.replace("c", "")): | |
79 if header == "true": | |
80 header = file_content[0] | |
81 content = file_content[1:] #[x.strip() for x in [line.split("\t")[int(ncol.replace("c", ""))-1].split(";")[0] for line in file_content[1:]]] # take only first IDs | |
82 else: | |
83 header = "" | |
84 content = file_content[:] #[x.strip() for x in [line.split("\t")[int(ncol.replace("c", ""))-1].split(";")[0] for line in file_content]] # take only first IDs | |
85 #print(file_content[1:13]) | |
86 ncol = int(ncol.replace("c", "")) - 1 | |
87 else: | |
88 raise ValueError("Please fill in the right format of column number") | |
89 else: | |
90 print(input[0]) | |
91 header = "" | |
92 content = input[0].split() | |
93 | |
94 # Read HPA file | |
95 hpa = open(HPAfile, "r") | |
96 hpa = hpa.readlines() | |
97 | |
98 # Get dictionary of tissues : genes | |
99 tdel_dict, tkeep_dict = readHPA(HPAfile, tissues_del, tissues_keep) | |
100 #print("Dictionary of tissue:genes to del", tdel_dict) | |
101 #print("Dictionary of tissue:genes to keep", tkeep_dict) | |
102 | |
103 # Filter | |
104 string = header.strip() + "\t" + "Filtered" + "\n" | |
105 filtered_genes = [] | |
106 filtered_lines = [] | |
107 na_genes = [] | |
108 #print(len(mq)) | |
109 for l in content: | |
110 line_string = l.rstrip() + "\t" #.replace("^M", "") | |
111 if input_type == "file": | |
112 gene = l.split("\t")[ncol].split(";")[0].replace('"', "") | |
113 else: | |
114 gene = l | |
115 if gene == "": | |
116 line_string += "NA - No ENSG ID" + "\n" | |
117 string += line_string | |
118 elif gene == "NA": | |
119 line_string += "NA - No ENSG ID" + "\n" | |
120 else: | |
121 tissue = sorted(set([t.split(",")[2].replace('"', "") for t in hpa if gene in t])) | |
122 if all (gene not in genes for genes in tdel_dict.values()): | |
123 if len(tissue) != 0: | |
124 print("Not in del list", gene, len(tissue)) | |
125 line_string += ",".join(tissue) + "\n" | |
126 string += line_string | |
127 else: | |
128 print("No tissue information", gene) | |
129 line_string += "NA - no tissue information" + "\n" | |
130 string += line_string | |
131 na_genes.append(gene) | |
132 else: | |
133 if all (gene not in genes for genes in tkeep_dict.values()): | |
134 print("In del list only", gene) | |
135 filtered_genes.append(gene) | |
136 filtered_lines.append(l) | |
137 else: | |
138 print("In both del and keep", gene, len(tissue)) | |
139 line_string += ",".join(tissue) + "\n" | |
140 string += line_string | |
141 | |
142 # Generate output file | |
143 output = open(output, "w") | |
144 output.write(string) | |
145 | |
146 # Generate file of unknown gene name | |
147 na_file = open(na_file, "w") | |
148 na_file.write("\n".join(na_genes)) | |
149 | |
150 # Generate trash files | |
151 output_trash = open(trash_file, "w") | |
152 output_trash.write("\n".join(filtered_lines)) | |
153 | |
154 output_trash_detail = open(trash_file_detail, "w") | |
155 print("Deleted genes", filtered_genes) | |
156 for gene in filtered_genes: | |
157 lines = [line for line in hpa if gene in line] | |
158 output_trash_detail.write("".join(lines)) | |
159 | |
160 if __name__ == "__main__": | |
161 options() | |
162 |