comparison sort_by_tissue.py @ 0:3155d867c056 draft default tip

planemo upload
author lnguyen
date Fri, 15 Sep 2017 11:04:37 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:3155d867c056
1 import argparse
2 import re
3
4 def options():
5 parser = argparse.ArgumentParser()
6 parser.add_argument("--input",nargs="+", required=True, help="MaxQuant file")
7 parser.add_argument("--hpa", required=True, help="HPA file")
8 parser.add_argument("--tissues_del", required=True, help="List of tissues which expressed genes in are discarded")
9 parser.add_argument("--tissues_keep", help="List of tissues to keep regardless being expressed in list tissues_del..")
10 parser.add_argument("-o", "--output", default="HPA_selection.txt")
11 parser.add_argument("--trash", default="Trash.txt", help="Write filtered genes into a file")
12 parser.add_argument("--trash_file_detail", default="Trash_detail.txt", help="Write filtered genes with detailed information into a file")
13 parser.add_argument("--na_file", default="NaN.txt", help="Write genes whose name not found in HPA file")
14
15 args = parser.parse_args()
16 #print(args.mq, args.hpa, args.tissues_del, args.tissues_keep, args.output, args.trash, args.trash_file_detail)
17
18 filterHPA(args.input, args.hpa, args.tissues_del, args.tissues_keep, args.output, args.trash, args.trash_file_detail, args.na_file)
19
20 def isnumber(format, n):
21 # Check if an element is integer or float number
22 float_format = re.compile("^[\-]?[1-9][0-9]*\.?[0-9]+$")
23 int_format = re.compile("^[\-]?[1-9][0-9]*$")
24 test = ""
25 if format == "int":
26 test = re.match(int_format, n)
27 elif format == "float":
28 test = re.match(float_format, n)
29 if test:
30 return True
31 else:
32 return False
33
34 def readHPA(HPAfile, tissues_del, tissues_keep):
35 # Read HPA file
36 hpa = open(HPAfile, "r")
37 hpa = hpa.readlines()
38 # Extract lists of genes expressed in tissues to keep and in tissue to delete
39 tdel_dict = {}
40 if tissues_del:
41 tissues_del = tissues_del.split(",")
42 else:
43 tissues_del = []
44 #print("List of tissues to del", tissues_del)
45 tkeep_dict = {}
46 if tissues_keep:
47 tissues_keep = tissues_keep.split(",")
48 else:
49 tissues_keep = []
50 #print("List of tissues to keep", tissues_keep)
51 for line in hpa[1:]:
52 ensg = line.replace('"', "").split(",")[0]
53 tissue = line.replace('"', "").split(",")[2]
54 for t in tissues_del:
55 if tissue == t:
56 if t not in tdel_dict:
57 tdel_dict[t] = [ensg]
58 else:
59 if ensg not in tdel_dict[t]:
60 tdel_dict[t].append(ensg)
61 for k in tissues_keep:
62 if tissue == k:
63 if k not in tkeep_dict:
64 tkeep_dict[k] = [ensg]
65 else:
66 if ensg not in tkeep_dict[k]:
67 tkeep_dict[k].append(ensg)
68
69 return tdel_dict, tkeep_dict
70
71 def filterHPA(input, HPAfile, tissues_del, tissues_keep, output, trash_file, trash_file_detail, na_file):
72 input_type = input[1]
73 if input_type == "file":
74 input_file = input[0]
75 header = input[2]
76 ncol = input[3]
77 file_content = open(input_file, "r").readlines()
78 if isnumber("int", ncol.replace("c", "")):
79 if header == "true":
80 header = file_content[0]
81 content = file_content[1:] #[x.strip() for x in [line.split("\t")[int(ncol.replace("c", ""))-1].split(";")[0] for line in file_content[1:]]] # take only first IDs
82 else:
83 header = ""
84 content = file_content[:] #[x.strip() for x in [line.split("\t")[int(ncol.replace("c", ""))-1].split(";")[0] for line in file_content]] # take only first IDs
85 #print(file_content[1:13])
86 ncol = int(ncol.replace("c", "")) - 1
87 else:
88 raise ValueError("Please fill in the right format of column number")
89 else:
90 print(input[0])
91 header = ""
92 content = input[0].split()
93
94 # Read HPA file
95 hpa = open(HPAfile, "r")
96 hpa = hpa.readlines()
97
98 # Get dictionary of tissues : genes
99 tdel_dict, tkeep_dict = readHPA(HPAfile, tissues_del, tissues_keep)
100 #print("Dictionary of tissue:genes to del", tdel_dict)
101 #print("Dictionary of tissue:genes to keep", tkeep_dict)
102
103 # Filter
104 string = header.strip() + "\t" + "Filtered" + "\n"
105 filtered_genes = []
106 filtered_lines = []
107 na_genes = []
108 #print(len(mq))
109 for l in content:
110 line_string = l.rstrip() + "\t" #.replace("^M", "")
111 if input_type == "file":
112 gene = l.split("\t")[ncol].split(";")[0].replace('"', "")
113 else:
114 gene = l
115 if gene == "":
116 line_string += "NA - No ENSG ID" + "\n"
117 string += line_string
118 elif gene == "NA":
119 line_string += "NA - No ENSG ID" + "\n"
120 else:
121 tissue = sorted(set([t.split(",")[2].replace('"', "") for t in hpa if gene in t]))
122 if all (gene not in genes for genes in tdel_dict.values()):
123 if len(tissue) != 0:
124 print("Not in del list", gene, len(tissue))
125 line_string += ",".join(tissue) + "\n"
126 string += line_string
127 else:
128 print("No tissue information", gene)
129 line_string += "NA - no tissue information" + "\n"
130 string += line_string
131 na_genes.append(gene)
132 else:
133 if all (gene not in genes for genes in tkeep_dict.values()):
134 print("In del list only", gene)
135 filtered_genes.append(gene)
136 filtered_lines.append(l)
137 else:
138 print("In both del and keep", gene, len(tissue))
139 line_string += ",".join(tissue) + "\n"
140 string += line_string
141
142 # Generate output file
143 output = open(output, "w")
144 output.write(string)
145
146 # Generate file of unknown gene name
147 na_file = open(na_file, "w")
148 na_file.write("\n".join(na_genes))
149
150 # Generate trash files
151 output_trash = open(trash_file, "w")
152 output_trash.write("\n".join(filtered_lines))
153
154 output_trash_detail = open(trash_file_detail, "w")
155 print("Deleted genes", filtered_genes)
156 for gene in filtered_genes:
157 lines = [line for line in hpa if gene in line]
158 output_trash_detail.write("".join(lines))
159
160 if __name__ == "__main__":
161 options()
162