sort_by_tissue: sort_by_tissue.py comparison

comparison sort_by_tissue.py @ 0:3155d867c056 draft default tip

planemo upload

author	lnguyen
date	Fri, 15 Sep 2017 11:04:37 -0400
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:3155d867c056
+import argparse
+import re
+def options():
+parser = argparse.ArgumentParser()
+parser.add_argument("--input",nargs="+", required=True, help="MaxQuant file")
+parser.add_argument("--hpa", required=True, help="HPA file")
+parser.add_argument("--tissues_del", required=True, help="List of tissues which expressed genes in are discarded")
+parser.add_argument("--tissues_keep", help="List of tissues to keep regardless being expressed in list tissues_del..")
+parser.add_argument("-o", "--output", default="HPA_selection.txt")
+parser.add_argument("--trash", default="Trash.txt", help="Write filtered genes into a file")
+parser.add_argument("--trash_file_detail", default="Trash_detail.txt", help="Write filtered genes with detailed information into a file")
+parser.add_argument("--na_file", default="NaN.txt", help="Write genes whose name not found in HPA file")
+args = parser.parse_args()
+#print(args.mq, args.hpa, args.tissues_del, args.tissues_keep, args.output, args.trash, args.trash_file_detail)
+filterHPA(args.input, args.hpa, args.tissues_del, args.tissues_keep, args.output, args.trash, args.trash_file_detail, args.na_file)
+def isnumber(format, n):
+# Check if an element is integer or float number
+float_format = re.compile("^[\-]?[1-9][0-9]*\.?[0-9]+$")
+int_format = re.compile("^[\-]?[1-9][0-9]*$")
+test = ""
+if format == "int":
+test = re.match(int_format, n)
+elif format == "float":
+test = re.match(float_format, n)
+if test:
+return True
+else:
+return False
+def readHPA(HPAfile, tissues_del, tissues_keep):
+# Read HPA file
+hpa = open(HPAfile, "r")
+hpa = hpa.readlines()
+# Extract lists of genes expressed in tissues to keep and in tissue to delete
+tdel_dict = {}
+if tissues_del:
+tissues_del = tissues_del.split(",")
+else:
+tissues_del = []
+#print("List of tissues to del", tissues_del)
+tkeep_dict = {}
+if tissues_keep:
+tissues_keep = tissues_keep.split(",")
+else:
+tissues_keep = []
+#print("List of tissues to keep", tissues_keep)
+for line in hpa[1:]:
+ensg = line.replace('"', "").split(",")[0]
+tissue = line.replace('"', "").split(",")[2]
+for t in tissues_del:
+if tissue == t:
+if t not in tdel_dict:
+tdel_dict[t] = [ensg]
+else:
+if ensg not in tdel_dict[t]:
+tdel_dict[t].append(ensg)
+for k in tissues_keep:
+if tissue == k:
+if k not in tkeep_dict:
+tkeep_dict[k] = [ensg]
+else:
+if ensg not in tkeep_dict[k]:
+tkeep_dict[k].append(ensg)
+return tdel_dict, tkeep_dict
+def filterHPA(input, HPAfile, tissues_del, tissues_keep, output, trash_file, trash_file_detail, na_file):
+input_type = input[1]
+if input_type == "file":
+input_file = input[0]
+header = input[2]
+ncol = input[3]
+file_content = open(input_file, "r").readlines()
+if isnumber("int", ncol.replace("c", "")):
+if header == "true":
+header = file_content[0]
+content = file_content[1:] #[x.strip() for x in [line.split("\t")[int(ncol.replace("c", ""))-1].split(";")[0] for line in file_content[1:]]]     # take only first IDs
+else:
+header = ""
+content = file_content[:] #[x.strip() for x in [line.split("\t")[int(ncol.replace("c", ""))-1].split(";")[0] for line in file_content]]     # take only first IDs
+#print(file_content[1:13])
+ncol = int(ncol.replace("c", "")) - 1
+else:
+raise ValueError("Please fill in the right format of column number")
+else:
+print(input[0])
+header = ""
+content = input[0].split()
+# Read HPA file
+hpa = open(HPAfile, "r")
+hpa = hpa.readlines()
+# Get dictionary of tissues : genes
+tdel_dict, tkeep_dict = readHPA(HPAfile, tissues_del, tissues_keep)
+#print("Dictionary of tissue:genes to del", tdel_dict)
+#print("Dictionary of tissue:genes to keep", tkeep_dict)
+# Filter
+string = header.strip() + "\t" + "Filtered" + "\n"
+filtered_genes = []
+filtered_lines = []
+na_genes = []
+#print(len(mq))
+for l in content:
+line_string = l.rstrip() + "\t" #.replace("^M", "")
+if input_type == "file":
+gene = l.split("\t")[ncol].split(";")[0].replace('"', "")
+else:
+gene = l
+if gene == "":
+line_string += "NA - No ENSG ID" + "\n"
+string += line_string
+elif gene == "NA":
+line_string += "NA - No ENSG ID" + "\n"
+else:
+tissue = sorted(set([t.split(",")[2].replace('"', "") for t in hpa if gene in t]))
+if all (gene not in genes for genes in tdel_dict.values()):
+if len(tissue) != 0:
+print("Not in del list", gene, len(tissue))
+line_string += ",".join(tissue) + "\n"
+string += line_string
+else:
+print("No tissue information", gene)
+line_string += "NA - no tissue information" + "\n"
+string += line_string
+na_genes.append(gene)
+else:
+if all (gene not in genes for genes in tkeep_dict.values()):
+print("In del list only", gene)
+filtered_genes.append(gene)
+filtered_lines.append(l)
+else:
+print("In both del and keep", gene, len(tissue))
+line_string += ",".join(tissue) + "\n"
+string += line_string
+# Generate output file
+output = open(output, "w")
+output.write(string)
+# Generate file of unknown gene name
+na_file = open(na_file, "w")
+na_file.write("\n".join(na_genes))
+# Generate trash files
+output_trash = open(trash_file, "w")
+output_trash.write("\n".join(filtered_lines))
+output_trash_detail = open(trash_file_detail, "w")
+print("Deleted genes", filtered_genes)
+for gene in filtered_genes:
+lines = [line for line in hpa if gene in line]
+output_trash_detail.write("".join(lines))
+if __name__ == "__main__":
+options()

Mercurial > repos > lnguyen > sort_by_tissue

comparison sort_by_tissue.py @ 0:3155d867c056 draft default tip