Mercurial > repos > lnguyen > sort_by_tissue
view sort_by_tissue.py @ 0:3155d867c056 draft default tip
planemo upload
author | lnguyen |
---|---|
date | Fri, 15 Sep 2017 11:04:37 -0400 |
parents | |
children |
line wrap: on
line source
import argparse import re def options(): parser = argparse.ArgumentParser() parser.add_argument("--input",nargs="+", required=True, help="MaxQuant file") parser.add_argument("--hpa", required=True, help="HPA file") parser.add_argument("--tissues_del", required=True, help="List of tissues which expressed genes in are discarded") parser.add_argument("--tissues_keep", help="List of tissues to keep regardless being expressed in list tissues_del..") parser.add_argument("-o", "--output", default="HPA_selection.txt") parser.add_argument("--trash", default="Trash.txt", help="Write filtered genes into a file") parser.add_argument("--trash_file_detail", default="Trash_detail.txt", help="Write filtered genes with detailed information into a file") parser.add_argument("--na_file", default="NaN.txt", help="Write genes whose name not found in HPA file") args = parser.parse_args() #print(args.mq, args.hpa, args.tissues_del, args.tissues_keep, args.output, args.trash, args.trash_file_detail) filterHPA(args.input, args.hpa, args.tissues_del, args.tissues_keep, args.output, args.trash, args.trash_file_detail, args.na_file) def isnumber(format, n): # Check if an element is integer or float number float_format = re.compile("^[\-]?[1-9][0-9]*\.?[0-9]+$") int_format = re.compile("^[\-]?[1-9][0-9]*$") test = "" if format == "int": test = re.match(int_format, n) elif format == "float": test = re.match(float_format, n) if test: return True else: return False def readHPA(HPAfile, tissues_del, tissues_keep): # Read HPA file hpa = open(HPAfile, "r") hpa = hpa.readlines() # Extract lists of genes expressed in tissues to keep and in tissue to delete tdel_dict = {} if tissues_del: tissues_del = tissues_del.split(",") else: tissues_del = [] #print("List of tissues to del", tissues_del) tkeep_dict = {} if tissues_keep: tissues_keep = tissues_keep.split(",") else: tissues_keep = [] #print("List of tissues to keep", tissues_keep) for line in hpa[1:]: ensg = line.replace('"', "").split(",")[0] tissue = line.replace('"', "").split(",")[2] for t in tissues_del: if tissue == t: if t not in tdel_dict: tdel_dict[t] = [ensg] else: if ensg not in tdel_dict[t]: tdel_dict[t].append(ensg) for k in tissues_keep: if tissue == k: if k not in tkeep_dict: tkeep_dict[k] = [ensg] else: if ensg not in tkeep_dict[k]: tkeep_dict[k].append(ensg) return tdel_dict, tkeep_dict def filterHPA(input, HPAfile, tissues_del, tissues_keep, output, trash_file, trash_file_detail, na_file): input_type = input[1] if input_type == "file": input_file = input[0] header = input[2] ncol = input[3] file_content = open(input_file, "r").readlines() if isnumber("int", ncol.replace("c", "")): if header == "true": header = file_content[0] content = file_content[1:] #[x.strip() for x in [line.split("\t")[int(ncol.replace("c", ""))-1].split(";")[0] for line in file_content[1:]]] # take only first IDs else: header = "" content = file_content[:] #[x.strip() for x in [line.split("\t")[int(ncol.replace("c", ""))-1].split(";")[0] for line in file_content]] # take only first IDs #print(file_content[1:13]) ncol = int(ncol.replace("c", "")) - 1 else: raise ValueError("Please fill in the right format of column number") else: print(input[0]) header = "" content = input[0].split() # Read HPA file hpa = open(HPAfile, "r") hpa = hpa.readlines() # Get dictionary of tissues : genes tdel_dict, tkeep_dict = readHPA(HPAfile, tissues_del, tissues_keep) #print("Dictionary of tissue:genes to del", tdel_dict) #print("Dictionary of tissue:genes to keep", tkeep_dict) # Filter string = header.strip() + "\t" + "Filtered" + "\n" filtered_genes = [] filtered_lines = [] na_genes = [] #print(len(mq)) for l in content: line_string = l.rstrip() + "\t" #.replace("^M", "") if input_type == "file": gene = l.split("\t")[ncol].split(";")[0].replace('"', "") else: gene = l if gene == "": line_string += "NA - No ENSG ID" + "\n" string += line_string elif gene == "NA": line_string += "NA - No ENSG ID" + "\n" else: tissue = sorted(set([t.split(",")[2].replace('"', "") for t in hpa if gene in t])) if all (gene not in genes for genes in tdel_dict.values()): if len(tissue) != 0: print("Not in del list", gene, len(tissue)) line_string += ",".join(tissue) + "\n" string += line_string else: print("No tissue information", gene) line_string += "NA - no tissue information" + "\n" string += line_string na_genes.append(gene) else: if all (gene not in genes for genes in tkeep_dict.values()): print("In del list only", gene) filtered_genes.append(gene) filtered_lines.append(l) else: print("In both del and keep", gene, len(tissue)) line_string += ",".join(tissue) + "\n" string += line_string # Generate output file output = open(output, "w") output.write(string) # Generate file of unknown gene name na_file = open(na_file, "w") na_file.write("\n".join(na_genes)) # Generate trash files output_trash = open(trash_file, "w") output_trash.write("\n".join(filtered_lines)) output_trash_detail = open(trash_file_detail, "w") print("Deleted genes", filtered_genes) for gene in filtered_genes: lines = [line for line in hpa if gene in line] output_trash_detail.write("".join(lines)) if __name__ == "__main__": options()