Repository 'saint_preproc'
hg clone https://toolshed.g2.bx.psu.edu/repos/bornea/saint_preproc

Changeset 26:09612857d26a (2016-01-28)
Previous changeset 25:b98ec711623d (2016-01-12) Next changeset 27:305615a58155 (2016-01-28)
Commit message:
Uploaded
added:
SAINT_preprocessing_v6.py
b
diff -r b98ec711623d -r 09612857d26a SAINT_preprocessing_v6.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/SAINT_preprocessing_v6.py Thu Jan 28 13:52:48 2016 -0500
[
b'@@ -0,0 +1,278 @@\n+#######################################################################################\r\n+# Python-code: SAINT pre-processing from Scaffold "Samples Report" output\r\n+# Author: Brent Kuenzi\r\n+#######################################################################################\r\n+# This program reads in a raw Scaffold "Samples Report" output and a user generated\r\n+# bait file and autoformats it into prey and interaction files for SAINTexpress\r\n+# analysis\r\n+#######################################################################################\r\n+# Copyright (C)  Brent Kuenzi.\r\n+# Permission is granted to copy, distribute and/or modify this document\r\n+# under the terms of the GNU Free Documentation License, Version 1.3\r\n+# or any later version published by the Free Software Foundation;\r\n+# with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts.\r\n+# A copy of the license is included in the section entitled "GNU\r\n+# Free Documentation License".\r\n+#######################################################################################\r\n+## REQUIRED INPUT ##\r\n+\r\n+# 1) infile: Scaffold "Samples Report" output\r\n+# 2) baitfile: SAINT formatted bait file generated in Galaxy\r\n+# 3) fasta_db: fasta database for use (defaults to SwissProt_HUMAN_2014_08.fasta)\r\n+# 4) prey: Y or N for generating a prey file\r\n+# 5) make_bait: String of bait names, assignment, and test or control boolean\r\n+#######################################################################################\r\n+\r\n+import sys\r\n+import os.path\r\n+\r\n+\r\n+infile = sys.argv[1] \r\n+#Scaffold "Samples Report" output.\r\n+prey = sys.argv[2] \r\n+# Y or N boolean from Galaxy.\r\n+fasta_db = sys.argv[3]\r\n+tool_path = sys.argv[8]\r\n+if fasta_db == "None":\r\n+    fasta_db = str(tool_path)  + "/SwissProt_HUMAN_2014_08.fasta"\r\n+make_bait = sys.argv[6]\r\n+bait_bool = sys.argv[9]\r\n+\r\n+\r\n+def bait_create(baits, infile):\r\n+    # Verifies the Baits are valid in the Scaffold file and writes the Bait.txt.\r\n+    baits = make_bait.split()\r\n+    i = 0\r\n+    bait_file_tmp = open("bait.txt", "w")\r\n+    order = []\r\n+    bait_cache = []\r\n+    while i < len(baits):\r\n+        if baits[i+2] == "true":\r\n+            T_C = "C"\r\n+        else:\r\n+            T_C = "T"\r\n+        bait_line = baits[i] + "\\t" + baits[i+1] + "\\t" + T_C + "\\n"\r\n+        read_infile = open(infile, "r")\r\n+        for input_line in read_infile:\r\n+            input_line = input_line.strip()\r\n+            temp = input_line.split(\'\\t\')\r\n+            if "Quantitative Variance" in str(temp):\r\n+                if baits[i] in temp:                    \r\n+                    number_bait = temp.index(str(baits[i]))\r\n+                    number_bait = number_bait - 9\r\n+                    bait_cache.append((number_bait, str(bait_line)))\r\n+                    # Locates the Bait names in the column names and then sets the Baits in the \r\n+                    # correct order in the cache thus the - 9 because the baits start at the 9th\r\n+                    # column.\r\n+                else:\r\n+                    print "Error: bad bait " + str(baits[i])\r\n+                    sys.exit()\r\n+            else:\r\n+                pass\r\n+        i = i + 3\r\n+\r\n+    bait_cache.sort()\r\n+    for cache_line in bait_cache:\r\n+        bait_file_tmp.write(cache_line[1])\r\n+\r\n+    bait_file_tmp.close()\r\n+\r\n+if bait_bool == \'false\':\r\n+    bait_create(make_bait, infile)\r\n+    baitfile = "bait.txt"\r\n+else:\r\n+    bait_temp_file = open(sys.argv[10], \'r\')\r\n+    bait_cache = bait_temp_file.readlines()\r\n+    print bait_cache\r\n+    bait_file_tmp = open("bait.txt", "wr")\r\n+    for cache_line in bait_cache:\r\n+        bait_file_tmp.write(cache_line)\r\n+    bait_file_tmp.close()\r\n+    baitfile = "bait.txt"\r\n+\r\n+\r\n+class ReturnValue1(object):\r\n+    def __init__(self, sequence, gene):\r\n+        self.seqlength = sequence\r\n+        self.genename = gene\r\n+\r\n+\r\n+class ReturnValue2(object):\r\n+    def __init__(self, getdata, getproteins, getheader):\r\n+        self.da'..b't\' + "Uniprot not in Fasta" + \'\\n\')\r\n+        error.close\r\n+        seqlength = \'NA\'\r\n+        genename = \'NA\'\r\n+        return ReturnValue1(seqlength, genename)\r\n+\r\n+\r\n+def readtab(infile):\r\n+    with open(infile, \'r\') as input_file: \r\n+    # read in tab-delim text\r\n+        output = []\r\n+        for input_line in input_file:\r\n+            input_line = input_line.strip()\r\n+            temp = input_line.split(\'\\t\')\r\n+            output.append(temp)\r\n+    return output\r\n+\r\n+\r\n+def read_Scaffold(Scaffold_input): \r\n+    # Get data, proteins and header from Scaffold output\r\n+    dupes = readtab(Scaffold_input)\r\n+    cnt = 0\r\n+    for Scaffold_line in dupes:\r\n+        cnt += 1\r\n+        if Scaffold_line[0] == \'#\': \r\n+        # Finds the start of second header.\r\n+            header_start = cnt-1\r\n+    header = dupes[header_start]\r\n+    prot_start = header.index("Accession Number")\r\n+    data = dupes[header_start+1:len(dupes)-2] \r\n+    # Cut off blank line and END OF FILE.\r\n+    proteins = []\r\n+    for Scaffold_line in data:\r\n+        Scaffold_line[4] = Scaffold_line[4].split()[0]\r\n+        # Removes the (+##) that sometimes is attached.\r\n+    for protein in data:\r\n+        proteins.append(protein[prot_start])\r\n+    return ReturnValue2(data, proteins, header)\r\n+\r\n+\r\n+def make_inter(Scaffold_input):\r\n+    bait = readtab(baitfile)\r\n+    data = read_Scaffold(Scaffold_input).data\r\n+    header = read_Scaffold(Scaffold_input).header\r\n+    proteins = read_Scaffold(Scaffold_input).proteins\r\n+    bait_index = []\r\n+    for bait_line in bait:\r\n+        bait_index.append(header.index(bait_line[0]))\r\n+        # Find just the baits defined in bait file.\r\n+    with open(\'inter.txt\', \'w\') as inter_file:\r\n+        a = 0; l = 0\r\n+        for bb in bait:\r\n+            for lst in data:\r\n+                inter_file.write(header[bait_index[l]] + \'\\t\' + bb[1] + \'\\t\' + proteins[a] + \'\\t\'\r\n+                        + lst[bait_index[l]] + \'\\n\')\r\n+                a += 1\r\n+                if a == len(proteins):\r\n+                    a = 0; l += 1\r\n+\r\n+\r\n+def make_prey(Scaffold_input):\r\n+    proteins = read_Scaffold(Scaffold_input).proteins\r\n+    output_file = open("prey.txt", \'w\')\r\n+    for protein in proteins:\r\n+        protein = protein.replace("\\n", "")\r\n+        # Remove \\n for input into function.\r\n+        protein = protein.replace("\\r", "")\r\n+        # Ditto for \\r.\r\n+        seq = get_info(protein).seqlength\r\n+        GN = get_info(protein).genename\r\n+        if seq != \'NA\':\r\n+            output_file.write(protein + "\\t" + str(seq) + "\\t" + str(GN) + "\\n")\r\n+    output_file.close()\r\n+\r\n+\r\n+def no_error_inter(Scaffold_input):\r\n+    # Remake inter file without protein errors from Uniprot.\r\n+    err = readtab("error proteins.txt")\r\n+    bait = readtab(baitfile)\r\n+    data = read_Scaffold(Scaffold_input).data\r\n+    header = read_Scaffold(Scaffold_input).header\r\n+    bait_index = []\r\n+    for bait_line in bait:\r\n+        bait_index.append(header.index(bait_line[0]))\r\n+    proteins = read_Scaffold(Scaffold_input).proteins\r\n+    errors = []\r\n+    for e in err:\r\n+        errors.append(e[0])\r\n+    with open(\'inter.txt\', \'w\') as y:\r\n+        l = 0; a = 0\r\n+        for bb in bait:\r\n+            for lst in data:\r\n+                if proteins[a] not in errors:\r\n+                    y.write(header[bait_index[l]] + \'\\t\' + bb[1] + \'\\t\' + proteins[a] + \'\\t\'\r\n+                            + lst[bait_index[l]] + \'\\n\')\r\n+                a += 1\r\n+                if a == len(proteins):\r\n+                    l += 1; a = 0\r\n+\r\n+\r\n+def bait_check(bait, Scaffold_input): \r\n+    # Check that bait names share Scaffold header titles.\r\n+    bait_in = readtab(bait)\r\n+    header = read_Scaffold(Scaffold_input).header\r\n+    for i in bait_in:\r\n+        if i[0] not in header:\r\n+            sys.exit("Bait must share header titles with Scaffold output")\r\n+\r\n+if __name__ == \'__main__\':\r\n+    main(infile, baitfile)\r\n+\r\n+os.rename("inter.txt", sys.argv[4])\r\n+os.rename("bait.txt", sys.argv[7])\r\n'