Mercurial > repos > lnguyen > sort_by_tissue
comparison hpa_tissue_distribution.py @ 0:3155d867c056 draft default tip
planemo upload
author | lnguyen |
---|---|
date | Fri, 15 Sep 2017 11:04:37 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:3155d867c056 |
---|---|
1 import argparse | |
2 import re | |
3 | |
4 def options(): | |
5 parser = argparse.ArgumentParser() | |
6 parser.add_argument("--input",nargs="+", required=True, help="List of IDs") | |
7 parser.add_argument("--hpa", required=True, help="HPA file") | |
8 parser.add_argument("--tissues_del", required=True, help="List of tissues which expressed genes in are discarded") | |
9 parser.add_argument("--tissues_keep", help="List of tissues to keep regardless being expressed in list tissues_del..") | |
10 parser.add_argument("-o", "--output", default="HPA_selection.txt") | |
11 parser.add_argument("--trash", default="Trash.txt", help="Write filtered genes into a file") | |
12 parser.add_argument("--trash_file_detail", default="Trash_detail.txt", help="Write filtered genes with detailed information into a file") | |
13 parser.add_argument("--na_file", default="NaN.txt", help="Write genes whose name not found in HPA file") | |
14 parser.add_argument("--ncol", default="None", help="Number of column to filter") | |
15 | |
16 args = parser.parse_args() | |
17 #print(args.mq, args.hpa, args.tissues_del, args.tissues_keep, args.output, args.trash, args.trash_file_detail) | |
18 | |
19 filterHPA(args.input, args.hpa, args.tissues_del, args.tissues_keep, args.output, args.trash, args.trash_file_detail, args.na_file) | |
20 | |
21 def isnumber(format, n): | |
22 float_format = re.compile("^[\-]?[1-9][0-9]*\.?[0-9]+$") | |
23 int_format = re.compile("^[\-]?[1-9][0-9]*$") | |
24 test = "" | |
25 if format == "int": | |
26 test = re.match(int_format, n) | |
27 elif format == "float": | |
28 test = re.match(float_format, n) | |
29 if test: | |
30 return True | |
31 else: | |
32 return False | |
33 | |
34 def readHPA(HPAfile, tissues_del, tissues_keep): | |
35 # Read HPA file: | |
36 hpa = open(HPAfile, "r") | |
37 hpa = hpa.readlines() | |
38 # Extract tissues genes lists | |
39 tdel_dict = {} | |
40 tissues_del = tissues_del.split(",") | |
41 print("List of tissues to del", tissues_del) | |
42 tkeep_dict = {} | |
43 tissues_keep = tissues_keep.split(",") | |
44 print("List of tissues to keep", tissues_keep) | |
45 for line in hpa[1:]: | |
46 name = line.replace('"', "").split(",")[1] | |
47 tissue = line.replace('"', "").split(",")[2] | |
48 for t in tissues_del: | |
49 if tissue == t: | |
50 if t not in tdel_dict: | |
51 tdel_dict[t] = [name] | |
52 else: | |
53 if name not in tdel_dict[t]: | |
54 tdel_dict[t].append(name) | |
55 for k in tissues_keep: | |
56 if tissue == k: | |
57 if k not in tkeep_dict: | |
58 tkeep_dict[k] = [name] | |
59 else: | |
60 if name not in tkeep_dict[k]: | |
61 tkeep_dict[k].append(name) | |
62 | |
63 return tdel_dict, tkeep_dict | |
64 | |
65 def filterHPA(input, HPAfile, tissues_del, tissues_keep, output, trash_file, trash_file_detail, na_file, ncol): | |
66 | |
67 if input[1] == "list": | |
68 content = input.split() | |
69 else if input.split(",")[1] == "file": | |
70 filename = input.split(",")[0] | |
71 file = open(filename, "r") | |
72 file_content = file.readlines() | |
73 file.close() | |
74 if header == "true": | |
75 header = file_content[0] | |
76 content = file_content[1:] | |
77 else: | |
78 header = "" | |
79 content = file_content[:] | |
80 | |
81 # Remove empty lines | |
82 [content.remove(blank) for blank in content if blank.isspace()] | |
83 | |
84 # Read HPA file | |
85 hpa = open(HPAfile, "r") | |
86 hpa = hpa.readlines() | |
87 | |
88 # Get dictionary of tissues : genes | |
89 tdel_dict, tkeep_dict = readHPA(HPAfile, tissues_del, tissues_keep) | |
90 #print("Dictionary of tissue:genes to del", tdel_dict) | |
91 #print("Dictionary of tissue:genes to keep", tkeep_dict) | |
92 | |
93 # Extract gene names and protein ids column number | |
94 print(ncol.replace("c", "")) | |
95 if isnumber("int", ncol.replace("c", "")): | |
96 gene_names_index = int(ncol.replace("c", "")) - 1 | |
97 print(gene_names_index, type(gene_names_index)) | |
98 for i in range(len(column_names)): | |
99 if column_names[i] == "Majority protein IDs": | |
100 prot_id_index = i | |
101 if prot_id_index == "": | |
102 raise ValueError("Could not find 'Majority protein IDs' column") | |
103 else: | |
104 raise ValueError("Please fill in the right format of column number") | |
105 | |
106 # Filter | |
107 string = mq[0].rstrip() | |
108 string = string.replace("^M", "") + "\t" + "Filtered" + "\n" | |
109 filtered_genes = [] | |
110 filtered_prots = [] | |
111 na_genes = [] | |
112 #print(len(mq)) | |
113 for line in mq[1:]: | |
114 prot_string = line.rstrip() + "\t" | |
115 line = line.split("\t") | |
116 name = line[gene_names_index].split(";")[0].replace('"', "") | |
117 prot = line[prot_id_index].split(";")[0].replace('"', "") | |
118 | |
119 if name == "": | |
120 prot_string += "NaN - No gene name" + "\n" | |
121 string += prot_string | |
122 else: | |
123 tissue = sorted(set([t.split(",")[2].replace('"', "") for t in hpa if name in t])) | |
124 | |
125 if all (name not in genes for genes in tdel_dict.values()): | |
126 if len(tissue) != 0: | |
127 print("Not in del list", name, len(tissue)) | |
128 prot_string += ",".join(tissue) + "\n" | |
129 string += prot_string | |
130 else: | |
131 print("No tissue information", name) | |
132 prot_string += "NaN - no tissue information" + "\n" | |
133 string += prot_string | |
134 na_genes.append(name) | |
135 else: | |
136 if all (name not in genes for genes in tkeep_dict.values()): | |
137 print("In del list only", name) | |
138 filtered_genes.append(name) | |
139 filtered_prots.append(prot) | |
140 else: | |
141 print("In both del and keep", name, len(tissue)) | |
142 prot_string += ",".join(tissue) + "\n" | |
143 string += prot_string | |
144 | |
145 # Generate output file | |
146 output = open(output, "w") | |
147 output.write(string) | |
148 | |
149 # Generate file of unknown gene name | |
150 na_file = open(na_file, "w") | |
151 na_file.write("\n".join(na_genes)) | |
152 | |
153 # Generate trash files | |
154 output_trash = open(trash_file, "w") | |
155 output_trash.write("\n".join(filtered_prots)) | |
156 | |
157 output_trash_detail = open(trash_file_detail, "w") | |
158 print("Deleted genes", filtered_genes) | |
159 for gene in filtered_genes: | |
160 lines = [line for line in hpa if gene in line] | |
161 output_trash_detail.write("".join(lines)) | |
162 | |
163 if __name__ == "__main__": | |
164 options() | |
165 | |
166 # python biofilter2.py --mq ../proteinGroups_Maud.txt --hpa /db/proteinatlas/normal_tissue.csv --tissues_del "retina" --tissues_keep "tonsil" --trash "Trash3.txt" --trash_file_detail "Trash_detail3.txt" -o test-data/output3.txt --na_file "Unknown.txt" |