Repository 'saint_preprocessing'
hg clone https://toolshed.g2.bx.psu.edu/repos/bornea/saint_preprocessing

Changeset 56:18389ccc7629 (2016-08-27)
Previous changeset 55:340cc5988c31 (2016-08-27) Next changeset 57:677d224656e0 (2016-08-27)
Commit message:
Uploaded
added:
Protein_report_processing.py
b
diff -r 340cc5988c31 -r 18389ccc7629 Protein_report_processing.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Protein_report_processing.py Sat Aug 27 21:01:33 2016 -0400
[
b'@@ -0,0 +1,216 @@\n+import sys\n+import os\n+from time import sleep\n+\n+files = sys.argv[1] # read in a string of file names seperated by ", "\n+# e.g. "Default_Protein_Report.txt, Default_Protein_Report_2.txt"\n+#bait = sys.argv[2] # SAINT formatted bait file\n+# still need a way to match files to bait identifiers\n+# or they can just be required to be put in the order of the bait file\n+quant_type = sys.argv[3] # what metric to use for quantification\n+# "#Validated Peptides", "#Peptides", "#Unique", "#Validated PSMs", "#PSMs"\n+db = sys.argv[4] # fasta database used in SearchGUI and PeptideShaker\n+prey = sys.argv[5]\n+tool_path = sys.argv[7]\n+if db == "None":\n+    db = str(tool_path)  + "/SwissProt_HUMAN_2015_12.fasta"\n+make_bait = sys.argv[6]\n+bait_bool = sys.argv[8]\n+\n+def bait_create(baits, infile):\n+    # Verifies the Baits are valid in the Scaffold file and writes the Bait.txt.\n+    baits = make_bait.split()\n+    i = 0\n+    bait_file_tmp = open("bait.txt", "w")\n+    order = []\n+    bait_cache = []\n+    while i < len(baits):\n+        if baits[i+2] == "true":\n+            T_C = "C"\n+        else:\n+            T_C = "T"\n+        bait_line = baits[i] + "\\t" + baits[i+1] + "\\t" + T_C + "\\n"\n+        bait_cache.append(str(bait_line))\n+        i = i + 3\n+\n+    for cache_line in bait_cache:\n+        bait_file_tmp.write(cache_line)\n+\n+    bait_file_tmp.close()\n+\n+if bait_bool == \'false\':\n+    bait_create(make_bait, infile)\n+    bait = "bait.txt"\n+else:\n+    bait_temp_file = open(sys.argv[9], \'r\')\n+    bait_cache = bait_temp_file.readlines()\n+    bait_file_tmp = open("bait.txt", "wr")\n+    for cache_line in bait_cache:\n+        bait_file_tmp.write(cache_line)\n+    bait_file_tmp.close()\n+    bait = "bait.txt"\n+\n+class ReturnValue1(object):\n+    def __init__(self, sequence, gene):\n+        self.seqlength = sequence\n+        self.genename = gene\n+\n+def read_tab(infile):\n+    with open(infile,\'r\') as x:\n+        output = []\n+        for line in x:\n+            line = line.strip()\n+            temp = line.split(\'\\t\')\n+            output.append(temp)\n+    return output\n+def printProgress (iteration, total, prefix = \'\', suffix = \'\', decimals = 1, barLength = 100):\n+    """\n+    Call in a loop to create terminal progress bar\n+    @params:\n+        iteration   - Required  : current iteration (Int)\n+        total       - Required  : total iterations (Int)\n+        prefix      - Optional  : prefix string (Str)\n+        suffix      - Optional  : suffix string (Str)\n+        decimals    - Optional  : positive number of decimals in percent complete (Int)\n+        barLength   - Optional  : character length of bar (Int)\n+    """\n+    formatStr       = "{0:." + str(decimals) + "f}"\n+    percents        = formatStr.format(100 * (iteration / float(total)))\n+    filledLength    = int(round(barLength * iteration / float(total)))\n+    bar             = \'=\' * filledLength + \'-\' * (barLength - filledLength)\n+    sys.stdout.write(\'\\r%s |%s| %s%s %s\' % (prefix, bar, percents, \'%\', suffix)),\n+    sys.stdout.flush()\n+    if iteration == total:\n+        sys.stdout.write(\'\\n\')\n+        sys.stdout.flush()\n+def get_info(uniprot_accession_in,fasta_db): \n+    # Get aminoacid lengths and gene name.\n+    error = open(\'error proteins.txt\', \'a+\')\n+    data = open(fasta_db, \'r\')\n+    data_lines = data.readlines()\n+    db_len = len(data_lines)\n+    seqlength = 0\n+    count = 0\n+    for data_line in data_lines:\n+        if ">sp" in data_line:\n+            namer = data_line.split("|")[2]\n+            if uniprot_accession_in == data_line.split("|")[1]:\n+                match = count + 1\n+                if \'GN=\' in data_line:\n+                    lst = data_line.split(\'GN=\')\n+                    lst2 = lst[1].split(\' \')\n+                    genename = lst2[0]\n+                if \'GN=\' not in data_line:\n+                    genename = \'NA\'\n+                while ">sp" not in data_lines[match]:\n+                    if match <= db_len:\n+                        seqlength = seqlength + '..b'       if uniprot_accession_in == namer.split(" ")[0]:\n+            match = count + 1\n+            # Ensures consistent spacing throughout.\n+            if \'GN=\' in data_line:\n+                lst = data_line.split(\'GN=\')\n+                lst2 = lst[1].split(\' \')\n+                genename = lst2[0]\n+            if \'GN=\' not in data_line:\n+                genename = \'NA\'\n+            while ">sp" not in data_lines[match]:\n+                if match <= db_len:\n+                    seqlength = seqlength + len(data_lines[match].strip())\n+                    match = match + 1\n+                else:\n+                    break\n+            return ReturnValue1(seqlength, genename)\n+        count = count + 1\n+    if seqlength == 0:\n+        error.write(uniprot_accession_in + \'\\t\' + "Uniprot not in Fasta" + \'\\n\')\n+        error.close\n+        seqlength = \'NA\'\n+        genename = \'NA\'\n+        return ReturnValue1(seqlength, genename)\n+def concatenate_files(file_list_string, bait_file):\n+    file_list = file_list_string.split(",")\n+    bait = read_tab(bait_file)\n+    master_table = []\n+    header_check = 0\n+    file_cnt = 0\n+    table_cnt = 0\n+    for i in file_list:\n+        table = read_tab(i)\n+        for j in table:\n+            if table_cnt == 0:\n+                if header_check == 0:\n+                    header_check +=1\n+                    j.append("Replicate")\n+                    j.append("Bait_Grouping")\n+                    master_table.append(j)\n+            if table_cnt > 0:\n+                j.append(bait[file_cnt][0])\n+                j.append(bait[file_cnt][1])\n+                master_table.append(j)\n+            table_cnt +=1\n+        file_cnt+=1\n+        table_cnt = 0\n+    if len(master_table[0]) < len(master_table[1]):\n+        master_table[0] = ["#"] + master_table[0]\n+    with open("merged_PeptideShaker.txt","w") as x:\n+        for i in master_table:\n+            x.write("\\t".join(i))\n+            x.write("\\n")\n+    return master_table\n+def make_inter(master_table,quant_type):\n+    if len(master_table[0]) < len(master_table[1]):\n+        master_table[0] = ["#"] + master_table[0]\n+    replicate_index = master_table[0].index("Replicate")\n+    grouping_index = master_table[0].index("Bait_Grouping")\n+    accession_index = master_table[0].index("Main Accession")\n+    quant_type = quant_type.replace("_", " ")\n+    quant_type = r"#" + quant_type\n+    Quant_index = master_table[0].index(quant_type)\n+    inter_file = ""\n+    for i in master_table[1:]:\n+        line = []\n+        line.append(i[replicate_index])\n+        line.append(i[grouping_index])\n+        line.append(i[accession_index])\n+        line.append(i[Quant_index])\n+        inter_file = inter_file + "\\t".join(line) + "\\n"\n+    with open("inter.txt","w") as x:\n+        x.write(inter_file)\n+    \n+def make_prey(concat_table,fasta_db):\n+    input_data = concat_table\n+    if len(input_data[0]) < len(input_data[1]):\n+        input_data[0] = ["#"] + input_data[0]\n+    accession_index = input_data[0].index("Main Accession")\n+    proteins = []\n+    for i in input_data[1:]:\n+        proteins.append(i[accession_index])\n+    output_file = open("prey.txt", \'w\')\n+    start = 0\n+    end = len(proteins)\n+\n+    # Initial call to print 0% progress\n+    printProgress(start, end, prefix = \'Progress:\', suffix = \'Complete\', barLength = 50)\n+\n+    for protein in proteins:\n+        seq = get_info(protein,fasta_db).seqlength\n+        GN = get_info(protein,fasta_db).genename\n+        if seq != \'NA\':\n+            output_file.write(protein + "\\t" + str(seq) + "\\t" + str(GN) + "\\n")\n+        start+=1\n+        printProgress(start, end, prefix = \'Progress:\', suffix = \'Complete\', barLength = 50)\n+    output_file.close()\n+data = concatenate_files(files,bait)\n+make_inter(data, quant_type)\n+if prey == "true":\n+    make_prey(data,db)\n+\n+os.rename("bait.txt", sys.argv[2])\n+os.rename("inter.txt", sys.argv[10])\n+if str(prey) != "None": \n+    os.rename("prey.txt", sys.argv[11])\n\\ No newline at end of file\n'