annotate hpa_tissue_distribution.py @ 0:3155d867c056 draft default tip

planemo upload
author lnguyen
date Fri, 15 Sep 2017 11:04:37 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
3155d867c056 planemo upload
lnguyen
parents:
diff changeset
1 import argparse
3155d867c056 planemo upload
lnguyen
parents:
diff changeset
2 import re
3155d867c056 planemo upload
lnguyen
parents:
diff changeset
3
3155d867c056 planemo upload
lnguyen
parents:
diff changeset
def options():
    """Parse the command line and run the HPA tissue filter.

    Arguments read from the command line:
      --input              one or more tokens describing the IDs to filter
                           (passed to filterHPA unchanged)
      --hpa                path of the HPA file
      --tissues_del        comma-separated tissues whose genes are discarded
      --tissues_keep       optional comma-separated tissues that rescue a gene
      -o / --output        path of the kept-rows file
      --trash              path of the file listing discarded entries
      --trash_file_detail  path of the file with the HPA lines of discarded genes
      --na_file            path of the file listing genes without HPA data
      --ncol               column selector, forwarded to filterHPA

    The parsed values are handed to filterHPA; nothing is returned.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", nargs="+", required=True, help="List of IDs")
    parser.add_argument("--hpa", required=True, help="HPA file")
    parser.add_argument("--tissues_del", required=True, help="List of tissues which expressed genes in are discarded")
    parser.add_argument("--tissues_keep", help="List of tissues to keep regardless being expressed in list tissues_del..")
    parser.add_argument("-o", "--output", default="HPA_selection.txt")
    parser.add_argument("--trash", default="Trash.txt", help="Write filtered genes into a file")
    parser.add_argument("--trash_file_detail", default="Trash_detail.txt", help="Write filtered genes with detailed information into a file")
    parser.add_argument("--na_file", default="NaN.txt", help="Write genes whose name not found in HPA file")
    parser.add_argument("--ncol", default="None", help="Number of column to filter")

    args = parser.parse_args()

    # filterHPA declares nine parameters; the column selector is the last one
    # and must be forwarded, otherwise the call fails with a TypeError.
    filterHPA(
        args.input,
        args.hpa,
        args.tissues_del,
        args.tissues_keep,
        args.output,
        args.trash,
        args.trash_file_detail,
        args.na_file,
        args.ncol,
    )
3155d867c056 planemo upload
lnguyen
parents:
diff changeset
20
3155d867c056 planemo upload
lnguyen
parents:
diff changeset
def isnumber(format, n):
    """Tell whether the string ``n`` is written as a number of kind ``format``.

    format -- "int" or "float"; any other value yields False.
    n      -- the text to check (must be a string).

    "int" accepts an optional minus sign followed by digits that do not start
    with 0 (so "0" and "007" are rejected, exactly as before).
    "float" accepts an optional minus sign, an integer part without leading
    zeros (a lone "0" is allowed) and an optional fractional part, so "5",
    "0.5" and "12.25" are all recognised.

    Returns True or False.
    """
    # Raw strings: "\." inside a normal string literal is an invalid escape.
    patterns = {
        "int": r"^-?[1-9][0-9]*$",
        "float": r"^-?(0|[1-9][0-9]*)(\.[0-9]+)?$",
    }
    pattern = patterns.get(format)
    if pattern is None:
        return False
    return re.match(pattern, n) is not None
3155d867c056 planemo upload
lnguyen
parents:
diff changeset
33
3155d867c056 planemo upload
lnguyen
parents:
diff changeset
def readHPA(HPAfile, tissues_del, tissues_keep):
    """Collect, per requested tissue, the gene names listed for it in the HPA file.

    HPAfile      -- path of the HPA file; the first line is skipped as a header,
                    the second comma-separated field is used as the gene name
                    and the third as the tissue.
    tissues_del  -- comma-separated tissue names (or None for no tissue).
    tissues_keep -- comma-separated tissue names (or None for no tissue).

    Returns (tdel_dict, tkeep_dict). Each dictionary maps a requested tissue
    that occurs in the file to the list of its gene names, without duplicates
    and in order of first appearance.

    NOTE(review): fields are obtained by removing double quotes and splitting
    on commas, so a quoted value that itself contains a comma is cut at that
    comma. This matches how the tissue lists are split; confirm against the
    HPA files in use before switching to a real CSV parser.
    """
    tissues_del = tissues_del.split(",") if tissues_del is not None else []
    print("List of tissues to del", tissues_del)
    tissues_keep = tissues_keep.split(",") if tissues_keep is not None else []
    print("List of tissues to keep", tissues_keep)

    wanted_del = set(tissues_del)
    wanted_keep = set(tissues_keep)
    tdel_dict = {}
    tkeep_dict = {}
    # (tissue, gene) pairs already stored; keeps de-duplication O(1) per line.
    seen_del = set()
    seen_keep = set()

    with open(HPAfile, "r") as handle:
        next(handle, None)  # skip the header line
        for line in handle:
            fields = line.rstrip("\r\n").replace('"', "").split(",")
            if len(fields) < 3:
                # Blank or truncated line: nothing to record.
                continue
            name = fields[1]
            tissue = fields[2]
            pair = (tissue, name)
            if tissue in wanted_del and pair not in seen_del:
                seen_del.add(pair)
                tdel_dict.setdefault(tissue, []).append(name)
            if tissue in wanted_keep and pair not in seen_keep:
                seen_keep.add(pair)
                tkeep_dict.setdefault(tissue, []).append(name)

    return tdel_dict, tkeep_dict
3155d867c056 planemo upload
lnguyen
parents:
diff changeset
64
3155d867c056 planemo upload
lnguyen
parents:
diff changeset
def _parse_input_spec(input):
    """Return (mode, source, has_header) decoded from the ``--input`` value.

    Accepted shapes (``input`` may be a string or a list of tokens):
      [ids, "list"]                 -> ("list", ids, False)
      ["path,file"]                 -> ("file", path, False)
      ["path,file,true"]            -> ("file", path, True)
      ["path", "file", "true"]      -> ("file", path, True)
    """
    tokens = [input] if isinstance(input, str) else list(input)
    if not tokens:
        raise ValueError("Input must be '<IDs> list' or '<path>,file[,true]'")
    if len(tokens) > 1 and tokens[1] == "list":
        return "list", tokens[0], False
    fields = tokens[0].split(",") + tokens[1:]
    if len(fields) > 1 and fields[1] == "file":
        return "file", fields[0], len(fields) > 2 and fields[2] == "true"
    raise ValueError("Input must be '<IDs> list' or '<path>,file[,true]'")


def _index_hpa(HPAfile):
    """Map every HPA gene name to its set of tissues and to its raw lines."""
    tissues_by_gene = {}
    lines_by_gene = {}
    with open(HPAfile, "r") as handle:
        next(handle, None)  # skip the header line
        for raw in handle:
            fields = raw.rstrip("\r\n").replace('"', "").split(",")
            if len(fields) < 3:
                continue
            tissues_by_gene.setdefault(fields[1], set()).add(fields[2])
            lines_by_gene.setdefault(fields[1], []).append(raw)
    return tissues_by_gene, lines_by_gene


def _first_id(cells, index):
    """Return the first ';'-separated entry of cells[index] without quotes ("" if absent)."""
    if index >= len(cells):
        return ""
    return cells[index].split(";")[0].replace('"', "")


def filterHPA(input, HPAfile, tissues_del, tissues_keep, output, trash_file, trash_file_detail, na_file, ncol):
    """Filter input rows by the tissues their gene is listed for in the HPA file.

    input             -- description of the rows to filter, see _parse_input_spec.
                         In "list" mode the IDs are whitespace-separated and each
                         ID is a one-column row; in "file" mode rows are the
                         tab-separated lines of the file.
                         NOTE(review): the previous revision never assigned the
                         header flag it tested; reading it as an optional third
                         "true" field is an assumption to confirm with the
                         Galaxy wrapper.
    HPAfile           -- path of the HPA file (gene name in field 2, tissue in field 3).
    tissues_del       -- comma-separated tissues; genes listed there are discarded...
    tissues_keep      -- ...unless they are also listed for one of these tissues.
    output            -- path of the file receiving the kept rows, each with an
                         extra last column holding the gene's tissues or a
                         "NaN - ..." remark.
    trash_file        -- path of the file receiving one ID per discarded row.
    trash_file_detail -- path of the file receiving the HPA lines of discarded genes.
    na_file           -- path of the file listing genes with no HPA entry.
    ncol              -- 1-based column holding the gene name, e.g. "c3" or "3".

    Raises ValueError when ``input`` has an unknown shape, when ``ncol`` is not
    a positive column number, or when a header is present but has no
    'Majority protein IDs' column.

    Behaviour notes:
      * Genes are matched to HPA entries by exact gene name; substring matching
        would attribute e.g. KITLG tissues to KIT.
      * Without a header there are no column names, so the gene name itself is
        written to the trash file in place of a protein ID.
      * The "... Filtered" header row is only written when the input has a header.
    """
    mode, source, has_header = _parse_input_spec(input)
    header = ""
    if mode == "list":
        content = source.split()
    else:
        with open(source, "r") as handle:
            file_content = handle.readlines()
        if has_header and file_content:
            header = file_content[0]
            content = file_content[1:]
        else:
            content = file_content

    # Remove empty lines (rebuilding the list; removing while iterating skips items)
    content = [row for row in content if row.strip()]

    # Get dictionary of tissues : genes, flattened to sets for O(1) membership
    tdel_dict, tkeep_dict = readHPA(HPAfile, tissues_del, tissues_keep)
    del_genes = set()
    for genes in tdel_dict.values():
        del_genes.update(genes)
    keep_genes = set()
    for genes in tkeep_dict.values():
        keep_genes.update(genes)

    # One pass over the HPA file instead of one scan per input row
    tissues_by_gene, lines_by_gene = _index_hpa(HPAfile)

    # Extract gene names and protein ids column number
    column = str(ncol).replace("c", "")
    print(column)
    if not isnumber("int", column) or int(column) < 1:
        raise ValueError("Please fill in the right format of column number")
    gene_names_index = int(column) - 1
    print(gene_names_index, type(gene_names_index))

    prot_id_index = None
    if header:
        column_names = header.rstrip("\r\n").replace('"', "").split("\t")
        for i, column_name in enumerate(column_names):
            if column_name == "Majority protein IDs":
                prot_id_index = i
        if prot_id_index is None:
            raise ValueError("Could not find 'Majority protein IDs' column")

    # Filter
    kept_rows = []
    if header:
        kept_rows.append(header.rstrip().replace("^M", "") + "\t" + "Filtered" + "\n")
    filtered_genes = []
    filtered_prots = []
    na_genes = []
    for row in content:
        prot_string = row.rstrip() + "\t"
        # Split without the line ending so the last column carries no "\n"
        cells = row.rstrip("\r\n").split("\t")
        name = _first_id(cells, gene_names_index)
        prot = name if prot_id_index is None else _first_id(cells, prot_id_index)

        if name == "":
            kept_rows.append(prot_string + "NaN - No gene name" + "\n")
            continue

        tissue = sorted(tissues_by_gene.get(name, ()))
        if name not in del_genes:
            if len(tissue) != 0:
                print("Not in del list", name, len(tissue))
                kept_rows.append(prot_string + ",".join(tissue) + "\n")
            else:
                print("No tissue information", name)
                kept_rows.append(prot_string + "NaN - no tissue information" + "\n")
                na_genes.append(name)
        elif name not in keep_genes:
            print("In del list only", name)
            filtered_genes.append(name)
            filtered_prots.append(prot)
        else:
            print("In both del and keep", name, len(tissue))
            kept_rows.append(prot_string + ",".join(tissue) + "\n")

    # Generate output file
    with open(output, "w") as handle:
        handle.write("".join(kept_rows))

    # Generate file of unknown gene name
    with open(na_file, "w") as handle:
        handle.write("\n".join(na_genes))

    # Generate trash files
    with open(trash_file, "w") as handle:
        handle.write("\n".join(filtered_prots))

    print("Deleted genes", filtered_genes)
    with open(trash_file_detail, "w") as handle:
        for gene in filtered_genes:
            handle.write("".join(lines_by_gene.get(gene, [])))
3155d867c056 planemo upload
lnguyen
parents:
diff changeset
162
3155d867c056 planemo upload
lnguyen
parents:
diff changeset
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    options()
3155d867c056 planemo upload
lnguyen
parents:
diff changeset
165
3155d867c056 planemo upload
lnguyen
parents:
diff changeset
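# Example invocation from an earlier revision (NOTE: the argument parser in options() defines --input and --ncol, not --mq, so adapt before use):
# python biofilter2.py --mq ../proteinGroups_Maud.txt --hpa /db/proteinatlas/normal_tissue.csv --tissues_del "retina" --tissues_keep "tonsil" --trash "Trash3.txt" --trash_file_detail "Trash_detail3.txt" -o test-data/output3.txt --na_file "Unknown.txt"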