0
|
1 import argparse
|
|
2 import re
|
|
3
|
|
4 def options():
|
|
5 parser = argparse.ArgumentParser()
|
|
6 parser.add_argument("--input",nargs="+", required=True, help="MaxQuant file")
|
|
7 parser.add_argument("--hpa", required=True, help="HPA file")
|
|
8 parser.add_argument("--tissues_del", required=True, help="List of tissues which expressed genes in are discarded")
|
|
9 parser.add_argument("--tissues_keep", help="List of tissues to keep regardless being expressed in list tissues_del..")
|
|
10 parser.add_argument("-o", "--output", default="HPA_selection.txt")
|
|
11 parser.add_argument("--trash", default="Trash.txt", help="Write filtered genes into a file")
|
|
12 parser.add_argument("--trash_file_detail", default="Trash_detail.txt", help="Write filtered genes with detailed information into a file")
|
|
13 parser.add_argument("--na_file", default="NaN.txt", help="Write genes whose name not found in HPA file")
|
|
14
|
|
15 args = parser.parse_args()
|
|
16 #print(args.mq, args.hpa, args.tissues_del, args.tissues_keep, args.output, args.trash, args.trash_file_detail)
|
|
17
|
|
18 filterHPA(args.input, args.hpa, args.tissues_del, args.tissues_keep, args.output, args.trash, args.trash_file_detail, args.na_file)
|
|
19
|
|
20 def isnumber(format, n):
|
|
21 # Check if an element is integer or float number
|
|
22 float_format = re.compile("^[\-]?[1-9][0-9]*\.?[0-9]+$")
|
|
23 int_format = re.compile("^[\-]?[1-9][0-9]*$")
|
|
24 test = ""
|
|
25 if format == "int":
|
|
26 test = re.match(int_format, n)
|
|
27 elif format == "float":
|
|
28 test = re.match(float_format, n)
|
|
29 if test:
|
|
30 return True
|
|
31 else:
|
|
32 return False
|
|
33
|
|
34 def readHPA(HPAfile, tissues_del, tissues_keep):
|
|
35 # Read HPA file
|
|
36 hpa = open(HPAfile, "r")
|
|
37 hpa = hpa.readlines()
|
|
38 # Extract lists of genes expressed in tissues to keep and in tissue to delete
|
|
39 tdel_dict = {}
|
|
40 if tissues_del:
|
|
41 tissues_del = tissues_del.split(",")
|
|
42 else:
|
|
43 tissues_del = []
|
|
44 #print("List of tissues to del", tissues_del)
|
|
45 tkeep_dict = {}
|
|
46 if tissues_keep:
|
|
47 tissues_keep = tissues_keep.split(",")
|
|
48 else:
|
|
49 tissues_keep = []
|
|
50 #print("List of tissues to keep", tissues_keep)
|
|
51 for line in hpa[1:]:
|
|
52 ensg = line.replace('"', "").split(",")[0]
|
|
53 tissue = line.replace('"', "").split(",")[2]
|
|
54 for t in tissues_del:
|
|
55 if tissue == t:
|
|
56 if t not in tdel_dict:
|
|
57 tdel_dict[t] = [ensg]
|
|
58 else:
|
|
59 if ensg not in tdel_dict[t]:
|
|
60 tdel_dict[t].append(ensg)
|
|
61 for k in tissues_keep:
|
|
62 if tissue == k:
|
|
63 if k not in tkeep_dict:
|
|
64 tkeep_dict[k] = [ensg]
|
|
65 else:
|
|
66 if ensg not in tkeep_dict[k]:
|
|
67 tkeep_dict[k].append(ensg)
|
|
68
|
|
69 return tdel_dict, tkeep_dict
|
|
70
|
|
71 def filterHPA(input, HPAfile, tissues_del, tissues_keep, output, trash_file, trash_file_detail, na_file):
|
|
72 input_type = input[1]
|
|
73 if input_type == "file":
|
|
74 input_file = input[0]
|
|
75 header = input[2]
|
|
76 ncol = input[3]
|
|
77 file_content = open(input_file, "r").readlines()
|
|
78 if isnumber("int", ncol.replace("c", "")):
|
|
79 if header == "true":
|
|
80 header = file_content[0]
|
|
81 content = file_content[1:] #[x.strip() for x in [line.split("\t")[int(ncol.replace("c", ""))-1].split(";")[0] for line in file_content[1:]]] # take only first IDs
|
|
82 else:
|
|
83 header = ""
|
|
84 content = file_content[:] #[x.strip() for x in [line.split("\t")[int(ncol.replace("c", ""))-1].split(";")[0] for line in file_content]] # take only first IDs
|
|
85 #print(file_content[1:13])
|
|
86 ncol = int(ncol.replace("c", "")) - 1
|
|
87 else:
|
|
88 raise ValueError("Please fill in the right format of column number")
|
|
89 else:
|
|
90 print(input[0])
|
|
91 header = ""
|
|
92 content = input[0].split()
|
|
93
|
|
94 # Read HPA file
|
|
95 hpa = open(HPAfile, "r")
|
|
96 hpa = hpa.readlines()
|
|
97
|
|
98 # Get dictionary of tissues : genes
|
|
99 tdel_dict, tkeep_dict = readHPA(HPAfile, tissues_del, tissues_keep)
|
|
100 #print("Dictionary of tissue:genes to del", tdel_dict)
|
|
101 #print("Dictionary of tissue:genes to keep", tkeep_dict)
|
|
102
|
|
103 # Filter
|
|
104 string = header.strip() + "\t" + "Filtered" + "\n"
|
|
105 filtered_genes = []
|
|
106 filtered_lines = []
|
|
107 na_genes = []
|
|
108 #print(len(mq))
|
|
109 for l in content:
|
|
110 line_string = l.rstrip() + "\t" #.replace("^M", "")
|
|
111 if input_type == "file":
|
|
112 gene = l.split("\t")[ncol].split(";")[0].replace('"', "")
|
|
113 else:
|
|
114 gene = l
|
|
115 if gene == "":
|
|
116 line_string += "NA - No ENSG ID" + "\n"
|
|
117 string += line_string
|
|
118 elif gene == "NA":
|
|
119 line_string += "NA - No ENSG ID" + "\n"
|
|
120 else:
|
|
121 tissue = sorted(set([t.split(",")[2].replace('"', "") for t in hpa if gene in t]))
|
|
122 if all (gene not in genes for genes in tdel_dict.values()):
|
|
123 if len(tissue) != 0:
|
|
124 print("Not in del list", gene, len(tissue))
|
|
125 line_string += ",".join(tissue) + "\n"
|
|
126 string += line_string
|
|
127 else:
|
|
128 print("No tissue information", gene)
|
|
129 line_string += "NA - no tissue information" + "\n"
|
|
130 string += line_string
|
|
131 na_genes.append(gene)
|
|
132 else:
|
|
133 if all (gene not in genes for genes in tkeep_dict.values()):
|
|
134 print("In del list only", gene)
|
|
135 filtered_genes.append(gene)
|
|
136 filtered_lines.append(l)
|
|
137 else:
|
|
138 print("In both del and keep", gene, len(tissue))
|
|
139 line_string += ",".join(tissue) + "\n"
|
|
140 string += line_string
|
|
141
|
|
142 # Generate output file
|
|
143 output = open(output, "w")
|
|
144 output.write(string)
|
|
145
|
|
146 # Generate file of unknown gene name
|
|
147 na_file = open(na_file, "w")
|
|
148 na_file.write("\n".join(na_genes))
|
|
149
|
|
150 # Generate trash files
|
|
151 output_trash = open(trash_file, "w")
|
|
152 output_trash.write("\n".join(filtered_lines))
|
|
153
|
|
154 output_trash_detail = open(trash_file_detail, "w")
|
|
155 print("Deleted genes", filtered_genes)
|
|
156 for gene in filtered_genes:
|
|
157 lines = [line for line in hpa if gene in line]
|
|
158 output_trash_detail.write("".join(lines))
|
|
159
|
|
160 if __name__ == "__main__":
|
|
161 options()
|
|
162
|