Repository 'saint_preproc'
hg clone https://toolshed.g2.bx.psu.edu/repos/bornea/saint_preproc

Changeset 28:5be5c9c81bda (2016-01-28)
Previous changeset 27:305615a58155 (2016-01-28) Next changeset 29:0bb43c64defd (2016-01-28)
Commit message:
Uploaded
added:
SAINT_preprocessing_v6_mq_pep.py
b
diff -r 305615a58155 -r 5be5c9c81bda SAINT_preprocessing_v6_mq_pep.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/SAINT_preprocessing_v6_mq_pep.py Thu Jan 28 13:53:14 2016 -0500
[
b'@@ -0,0 +1,268 @@\n+#######################################################################################\r\n+# Python-code: SAINT pre-processing from MaxQuant "Samples Report" output\r\n+# Author: Brent Kuenzi\r\n+#######################################################################################\r\n+# This program reads in a raw MaxQuant "Samples Report" output and a user generated\r\n+# bait file and autoformats it into prey and interaction files for SAINTexpress\r\n+# analysis\r\n+#######################################################################################\r\n+# Copyright (C)  Brent Kuenzi.\r\n+# Permission is granted to copy, distribute and/or modify this document\r\n+# under the terms of the GNU Free Documentation License, Version 1.3\r\n+# or any later version published by the Free Software Foundation;\r\n+# with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts.\r\n+# A copy of the license is included in the section entitled "GNU\r\n+# Free Documentation License".\r\n+#######################################################################################\r\n+## REQUIRED INPUT ##\r\n+\r\n+# 1) infile: MaxQuant "Samples Report" output\r\n+# 2) baitfile: SAINT formatted bait file generated in Galaxy\r\n+# 3) fasta_db: fasta database for use (defaults to SwissProt_HUMAN_2014_08.fasta)\r\n+# 4) prey: Y or N for generating a prey file\r\n+# 5) make_bait: String of bait names, assignment, and test or control boolean\r\n+#######################################################################################\r\n+\r\n+\r\n+import sys\r\n+import os\r\n+\r\n+\r\n+mq_file = sys.argv[1]\r\n+ins_path = sys.argv[8]\r\n+names_path = str(ins_path) + r"uniprot_names.txt"\r\n+cmd = (r"Rscript "+ str(ins_path) +"pre_process_protein_name_set.R " + str(mq_file) +\r\n+       " " + str(names_path))\r\n+os.system(cmd)\r\n+\r\n+infile = "./tukeys_output.txt" \r\n+# The MaxQuant "Samples Report" output.\r\n+prey = sys.argv[2] \r\n+# Y or N boolean from Galaxy.\r\n+fasta_db = sys.argv[3]\r\n+if fasta_db == "None":\r\n+    fasta_db = str(ins_path)  + "SwissProt_HUMAN_2014_08.fasta"\r\n+make_bait = sys.argv[6]\r\n+bait_bool = sys.argv[9]\r\n+\r\n+def bait_create(baits, infile):\r\n+    # Takes the Bait specified by the user and makes them into a Bait file and includes a\r\n+    # check to make sure they are using valid baits.\r\n+    baits = make_bait.split()\r\n+    i = 0\r\n+    bait_file_tmp = open("bait.txt", "w")\r\n+    order = []\r\n+    bait_cache = []\r\n+    while i < len(baits):\r\n+        if baits[i+2] == "true":\r\n+            T_C = "C"\r\n+        else:\r\n+            T_C = "T"\r\n+        bait_line = baits[i] + "\\t" + baits[i+1] + "\\t" + T_C + "\\n"\r\n+        read_infile = open(infile, "r")\r\n+        for input_line in read_infile :\r\n+            input_line = input_line.replace("\\"", "")\r\n+            input_line = input_line.replace(r"Intensity.", "")\r\n+            # R coerces "-" into "." changes them back and remove Intensity from the Bait names.\r\n+            input_line = input_line.replace(r".", r"-")\r\n+            temp = input_line.split()\r\n+            if "mapped_protein" in str(temp):\r\n+                if baits[i] in temp:\r\n+                    number_bait = temp.index(str(baits[i]))\r\n+                    number_bait = number_bait - 9\r\n+                    bait_cache.append((number_bait, str(bait_line)))\r\n+                    # Locates the Bait names in the column names and then sets the Baits in the \r\n+                    # correct order in the cache thus the - 9 because the baits start at the 9th\r\n+                    # column.\r\n+                else:\r\n+                    print "Error: bad bait " + str(baits[i])\r\n+                    sys.exit()\r\n+            else:\r\n+                pass\r\n+        i = i + 3\r\n+    # Writes cache to Bait file.\r\n+    bait_cache.sort()\r\n+    for line in bait_cache:\r\n+        bait_file_tmp.write(line[1])\r\n+\r\n+    bait_file_tmp.close()\r\n+\r\n+\r\n+if bait_bool == \'false\':\r\n+    bait_create(make_bait, infile)\r\n+    baitfile = "bait.txt"\r\n+else:\r\n+    bait_temp_file = open(sys.argv[10]'..b'gth, genename)\r\n+        count = count + 1\r\n+\r\n+\r\n+    if seqlength == 0:\r\n+        error.write(uniprot_accession_in + \'\\t\' + "Uniprot not in Fasta" + \'\\n\')\r\n+        error.close\r\n+        seqlength = \'NA\'\r\n+        genename = \'NA\'\r\n+        return ReturnValue1(seqlength, genename)\r\n+\r\n+\r\n+def readtab(infile):\r\n+    with open(infile, \'r\') as input_file:\r\n+    # Read in tab-delim text file.\r\n+        output = []\r\n+        for input_line in input_file:\r\n+            input_line = input_line.strip()\r\n+            temp = input_line.split(\'\\t\')\r\n+            output.append(temp)\r\n+    return output\r\n+\r\n+\r\n+def read_MaxQuant(MaxQuant_input):\r\n+    # Get data, proteins and header from MaxQuant output.\r\n+    dupes = readtab(MaxQuant_input)\r\n+    header_start = 0\r\n+    header = dupes[header_start]\r\n+    for var_MQ in header:\r\n+        var_MQ = var_MQ.replace(r"\\"", "")\r\n+        var_MQ = var_MQ.replace(r"Intensity.", r"")\r\n+        var_MQ = var_MQ.replace(r".", r"-")\r\n+    data = dupes[header_start+1:len(dupes)]\r\n+    # Cut off blank line and END OF FILE.\r\n+    proteins = []\r\n+    for protein in data:\r\n+        proteins.append(protein[0])\r\n+    return ReturnValue2(data, proteins, header)\r\n+\r\n+\r\n+def make_inter(MaxQuant_input):\r\n+    bait = readtab(baitfile)\r\n+    data = read_MaxQuant(MaxQuant_input).data\r\n+    header = read_MaxQuant(MaxQuant_input).header\r\n+    proteins = read_MaxQuant(MaxQuant_input).proteins\r\n+    bait_index = []\r\n+    for bait_item in bait:\r\n+        bait_index.append(header.index("mapped_protein") + 1)\r\n+        # Find just the baits defined in bait file.\r\n+    with open(\'inter.txt\', \'w\') as y:\r\n+        a = 0; l = 0\r\n+        for bb in bait:\r\n+            for lst in data:\r\n+                y.write(header[bait_index[l]] + \'\\t\' + bb[1] + \'\\t\' + proteins[a] + \'\\t\'\r\n+                        + lst[bait_index[l]] + \'\\n\')\r\n+                a += 1\r\n+                if a == len(proteins):\r\n+                    a = 0; l += 1\r\n+\r\n+\r\n+def make_prey(MaxQuant_input):\r\n+    proteins = read_MaxQuant(MaxQuant_input).proteins\r\n+    output_file = open("prey.txt", \'w\')\r\n+    for a in proteins:\r\n+        a = a.replace("\\n", "")\r\n+        # Remove \\n for input into function.\r\n+        a = a.replace("\\r", "")\r\n+        # Ditto for \\r.\r\n+        seq = get_info(a).seqlength\r\n+        GN = get_info(a).genename\r\n+        if seq != \'NA\':\r\n+            output_file.write(a+"\\t"+str(seq)+ "\\t" + str(GN) + "\\n")\r\n+    output_file.close()\r\n+\r\n+\r\n+def no_error_inter(MaxQuant_input):\r\n+    # Remake inter file without protein errors from Uniprot.\r\n+    err = readtab("error proteins.txt")\r\n+    bait = readtab(baitfile)\r\n+    data = read_MaxQuant(MaxQuant_input).data\r\n+    header = read_MaxQuant(MaxQuant_input).header\r\n+    header = [MQ_var.replace(r"\\"", "") for MQ_var in header]\r\n+    header = [MQ_var.replace(r"Intensity.", r"") for MQ_var in header]\r\n+    header = [MQ_var.replace(r".", r"-") for MQ_var in header]\r\n+    bait_index = []\r\n+    for bait_item in bait:\r\n+        bait_index.append(header.index(bait_item[0]))\r\n+    proteins = read_MaxQuant(MaxQuant_input).proteins\r\n+    errors = []\r\n+    for e in err:\r\n+        errors.append(e[0])\r\n+    with open(\'inter.txt\', \'w\') as input_file:\r\n+        l = 0; a = 0\r\n+        for bb in bait:\r\n+            for lst in data:\r\n+                if proteins[a] not in errors:\r\n+                    input_file.write(header[bait_index[l]] + \'\\t\' + bb[1] + \'\\t\' + proteins[a] + \'\\t\' \r\n+                            + lst[bait_index[l]] + \'\\n\')\r\n+                a += 1\r\n+                if a == len(proteins):\r\n+                    l += 1; a = 0\r\n+\r\n+\r\n+def bait_check(bait, MaxQuant_input):\r\n+    # Check that bait names share header titles.\r\n+    bait_in = readtab(bait)\r\n+    header = read_MaxQuant(MaxQuant_input).header\r\n+    for bait in bait_in:\r\n+        if bait[0] not in header:\r\n+            sys.exit("Bait must share header titles with MaxQuant output")\r\n+\r\n+if __name__ == \'__main__\':\r\n+    main(infile, make_bait)\r\n'