diff hpa_tissue_distribution.py @ 0:3155d867c056 draft default tip

planemo upload
author lnguyen
date Fri, 15 Sep 2017 11:04:37 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hpa_tissue_distribution.py	Fri Sep 15 11:04:37 2017 -0400
@@ -0,0 +1,166 @@
+import argparse
+import re
+
+def options():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--input",nargs="+", required=True, help="List of IDs")
+    parser.add_argument("--hpa", required=True, help="HPA file")
+    parser.add_argument("--tissues_del", required=True, help="List of tissues which expressed genes in are discarded")
+    parser.add_argument("--tissues_keep", help="List of tissues to keep regardless being expressed in list tissues_del..")
+    parser.add_argument("-o", "--output", default="HPA_selection.txt")
+    parser.add_argument("--trash", default="Trash.txt", help="Write filtered genes into a file")
+    parser.add_argument("--trash_file_detail", default="Trash_detail.txt", help="Write filtered genes with detailed information into a file")
+    parser.add_argument("--na_file", default="NaN.txt", help="Write genes whose name not found in HPA file")
+    parser.add_argument("--ncol", default="None", help="Number of column to filter")
+
+    args = parser.parse_args()
+    #print(args.mq, args.hpa, args.tissues_del, args.tissues_keep, args.output, args.trash, args.trash_file_detail)
+
+    filterHPA(args.input, args.hpa, args.tissues_del, args.tissues_keep, args.output, args.trash, args.trash_file_detail, args.na_file)
+    
+def isnumber(format, n):
+    float_format = re.compile("^[\-]?[1-9][0-9]*\.?[0-9]+$")
+    int_format = re.compile("^[\-]?[1-9][0-9]*$")
+    test = ""
+    if format == "int":
+        test = re.match(int_format, n)
+    elif format == "float":
+        test = re.match(float_format, n)
+    if test:
+        return True
+    else:
+        return False
+    
+def readHPA(HPAfile, tissues_del, tissues_keep):
+    # Read HPA file:
+    hpa = open(HPAfile, "r")
+    hpa = hpa.readlines()
+    # Extract tissues genes lists
+    tdel_dict = {}
+    tissues_del = tissues_del.split(",")
+    print("List of tissues to del", tissues_del)
+    tkeep_dict = {}
+    tissues_keep = tissues_keep.split(",")
+    print("List of tissues to keep", tissues_keep)
+    for line in hpa[1:]:
+        name = line.replace('"', "").split(",")[1]
+        tissue = line.replace('"', "").split(",")[2]
+        for t in tissues_del:
+            if tissue == t:
+                if t not in tdel_dict:
+                    tdel_dict[t] = [name]
+                else:
+                    if name not in tdel_dict[t]:
+                        tdel_dict[t].append(name)
+        for k in tissues_keep:
+            if tissue == k:
+                if k not in tkeep_dict:
+                    tkeep_dict[k] = [name]
+                else:
+                    if name not in tkeep_dict[k]:
+                        tkeep_dict[k].append(name)
+    
+    return tdel_dict, tkeep_dict
+
+def filterHPA(input, HPAfile, tissues_del, tissues_keep, output, trash_file, trash_file_detail, na_file, ncol):
+
+    if input[1] == "list":
+        content = input.split()
+    else if input.split(",")[1] == "file":
+        filename = input.split(",")[0]
+        file = open(filename, "r")
+        file_content = file.readlines()
+        file.close()
+        if header == "true":
+            header = file_content[0]
+            content = file_content[1:]
+        else:
+            header = ""
+            content = file_content[:]
+
+    # Remove empty lines
+    [content.remove(blank) for blank in content if blank.isspace()]
+
+    # Read HPA file
+    hpa = open(HPAfile, "r")
+    hpa = hpa.readlines()
+
+    # Get dictionary of tissues : genes
+    tdel_dict, tkeep_dict = readHPA(HPAfile, tissues_del, tissues_keep)
+    #print("Dictionary of tissue:genes to del", tdel_dict)
+    #print("Dictionary of tissue:genes to keep", tkeep_dict)
+
+    # Extract gene names and protein ids column number
+    print(ncol.replace("c", ""))
+    if isnumber("int", ncol.replace("c", "")):
+        gene_names_index = int(ncol.replace("c", "")) - 1
+        print(gene_names_index, type(gene_names_index))
+        for i in range(len(column_names)):
+            if column_names[i] == "Majority protein IDs":
+                prot_id_index = i
+        if prot_id_index == "":
+            raise ValueError("Could not find 'Majority protein IDs' column")
+    else:
+        raise ValueError("Please fill in the right format of column number")
+
+    # Filter
+    string = mq[0].rstrip()
+    string = string.replace("^M", "") + "\t" + "Filtered" + "\n"
+    filtered_genes = []
+    filtered_prots = []
+    na_genes = []
+    #print(len(mq))
+    for line in mq[1:]:
+        prot_string = line.rstrip() + "\t" 
+        line = line.split("\t")
+        name = line[gene_names_index].split(";")[0].replace('"', "")
+        prot = line[prot_id_index].split(";")[0].replace('"', "")
+
+        if name == "":
+            prot_string += "NaN - No gene name" + "\n"
+            string += prot_string
+        else:
+            tissue = sorted(set([t.split(",")[2].replace('"', "") for t in hpa if name in t]))
+
+            if all (name not in genes for genes in tdel_dict.values()):       
+                if len(tissue) != 0:
+                    print("Not in del list", name, len(tissue))
+                    prot_string += ",".join(tissue) + "\n"
+                    string += prot_string
+                else:
+                    print("No tissue information", name)
+                    prot_string += "NaN - no tissue information" + "\n"
+                    string += prot_string
+                    na_genes.append(name)
+            else:
+                if all (name not in genes for genes in tkeep_dict.values()):
+                    print("In del list only", name)
+                    filtered_genes.append(name)
+                    filtered_prots.append(prot)
+                else:
+                    print("In both del and keep", name, len(tissue))
+                    prot_string += ",".join(tissue) + "\n"
+                    string += prot_string
+
+    # Generate output file
+    output = open(output, "w")
+    output.write(string)
+
+    # Generate file of unknown gene name
+    na_file = open(na_file, "w")
+    na_file.write("\n".join(na_genes))
+        
+    # Generate trash files
+    output_trash = open(trash_file, "w")
+    output_trash.write("\n".join(filtered_prots))
+
+    output_trash_detail = open(trash_file_detail, "w")
+    print("Deleted genes", filtered_genes)
+    for gene in filtered_genes:
+        lines = [line for line in hpa if gene in line]
+        output_trash_detail.write("".join(lines))
+
+if __name__ == "__main__":
+    options()
+
+# python biofilter2.py --mq ../proteinGroups_Maud.txt --hpa /db/proteinatlas/normal_tissue.csv --tissues_del "retina" --tissues_keep "tonsil" --trash "Trash3.txt" --trash_file_detail "Trash_detail3.txt" -o test-data/output3.txt --na_file "Unknown.txt"