Repository 'saint_preproc'
hg clone https://toolshed.g2.bx.psu.edu/repos/bornea/saint_preproc

Changeset 8:30d53a378141 (2015-11-10)
Previous changeset 7:c88212194b4e (2015-11-10) Next changeset 9:9c3d2b1d0c1c (2015-11-10)
Commit message:
Uploaded
added:
SAINT_preprocessing_v6.py
b
diff -r c88212194b4e -r 30d53a378141 SAINT_preprocessing_v6.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/SAINT_preprocessing_v6.py Tue Nov 10 13:15:26 2015 -0500
[
b'@@ -0,0 +1,245 @@\n+#######################################################################################\r\n+# Python-code: SAINT pre-processing from Scaffold "Samples Report" output\r\n+# Author: Brent Kuenzi\r\n+#######################################################################################\r\n+# This program reads in a raw Scaffold "Samples Report" output and a user generated\r\n+# bait file and autoformats it into prey and interaction files for SAINTexpress \r\n+# analysis\r\n+#######################################################################################\r\n+import sys\r\n+import urllib2\r\n+import os.path\r\n+#######################################################################################\r\n+## REQUIRED INPUT ##\r\n+\r\n+# 1) infile: Scaffold "Samples Report" output\r\n+# 2) baitfile: SAINT formatted bait file generated in Galaxy\r\n+# 3) prey: Y or N for generating a prey file (requires internet connection)\r\n+#######################################################################################\r\n+infile = sys.argv[1] #Scaffold "Samples Report" output\r\n+prey = sys.argv[2] # Y or N\r\n+fasta_db = sys.argv[3]\r\n+tool_path = sys.argv[8]\r\n+if fasta_db == "None":\r\n+    fasta_db = str(tool_path)  + "/SwissProt_HUMAN_2014_08.fasta"\r\n+make_bait= sys.argv[5]\r\n+\r\n+\r\n+baits = make_bait.split()\r\n+i = 0\r\n+bait_file_tmp = open("bait.txt", "wr")\r\n+order = [] \r\n+bait_cache = []\r\n+\r\n+while i < len(baits):\r\n+    if baits[i+2] == "true":\r\n+        T_C = "C"\r\n+    else:\r\n+        T_C = "T"\r\n+    line1 = baits[i] + "\\t" + baits[i+1] + "\\t" + T_C + "\\n"\r\n+    q = open(infile,"r")\r\n+    for line2 in q:\r\n+        line2 = line2.strip()\r\n+        temp = line2.split(\'\\t\')\r\n+        if "Quantitative Variance" in str(temp):\r\n+            if baits[i] in temp:\r\n+                number_bait = temp.index(str(baits[i]))\r\n+                number_bait = number_bait - 9\r\n+                bait_cache.append((number_bait, str(line1)))\r\n+            else:\r\n+                print "Error: bad bait " + str(baits[i])\r\n+                sys.exit()\r\n+        else: \r\n+            pass                    \r\n+    i = i + 3\r\n+\r\n+bait_cache.sort()\r\n+for line in bait_cache:\r\n+    bait_file_tmp.write(line[1])            \r\n+        \r\n+bait_file_tmp.close()\r\n+baitfile = "bait.txt" \r\n+\r\n+class ReturnValue1(object):\r\n+    def __init__(self, sequence, gene):\r\n+     self.seqlength = sequence\r\n+     self.genename = gene\r\n+class ReturnValue2(object):\r\n+    def __init__(self, getdata, getproteins, getheader):\r\n+        self.data = getdata\r\n+        self.proteins = getproteins\r\n+        self.header = getheader\r\n+\r\n+def main(scaffold_input, baitfile): \r\n+    bait_check(baitfile, scaffold_input)\r\n+    make_inter(scaffold_input)\r\n+    if prey == \'true\':\r\n+        make_prey(scaffold_input)\r\n+        no_error_inter(scaffold_input)\r\n+        os.rename(\'prey.txt\', sys.argv[5])\r\n+    elif prey == \'false\':\r\n+        if os.path.isfile(\'error proteins.txt\') == True:\r\n+            no_error_inter(scaffold_input)\r\n+        pass\r\n+    elif prey != \'true\' or \'false\':\r\n+        sys.exit("Invalid Prey Argument: Y or N")\r\n+\r\n+def get_info(uniprot_accession_in): # get aa lengths and gene name\r\n+    error = open(\'error proteins.txt\', \'a+\')\r\n+#    while True:\r\n+#        i = 0\r\n+#\ttry:  \r\n+#            data = urllib2.urlopen("http://www.uniprot.org/uniprot/" + uniprot_accession_in + ".fasta")\r\n+#            break\r\n+#        except urllib2.HTTPError, err:\r\n+#            i = i + 1\r\n+#            if i == 50:\r\n+#                sys.exit("More than 50 errors. Check your file or try again later.")\r\n+#            if err.code == 404:\r\n+#                error.write(uniprot_accession_in + \'\\t\' + "Invalid URL. Check protein" + \'\\n\')\r\n+#                seqlength = \'NA\'\r\n+#                genename = \'NA\'\r\n+#                return ReturnValue1(seqlength, genename)\r\n+#            elif err.code == 302:\r\n+#                sys.exit("Request timed out. Check connection and try again.")\r\n+#            else:\r\n+#                sys.exit'..b'       if match <= db_len:\r\n+                        seqlength = seqlength + len(lines[match].strip())\r\n+                        match = match + 1\r\n+                    else:\r\n+                        break\r\n+                return ReturnValue1(seqlength, genename)\r\n+        count = count + 1\r\n+        \r\n+\r\n+    if seqlength == 0:\r\n+        error.write(uniprot_accession_in + \'\\t\' + "Uniprot not in Fasta" + \'\\n\')\r\n+        error.close\r\n+        seqlength = \'NA\'\r\n+        genename = \'NA\'\r\n+        return ReturnValue1(seqlength, genename)\r\n+\r\n+def readtab(infile):\r\n+    with open(infile,\'r\') as x: # read in tab-delim text\r\n+        output = []\r\n+        for line in x:\r\n+            line = line.strip()\r\n+            temp = line.split(\'\\t\')\r\n+            output.append(temp)\r\n+    return output\r\n+def read_scaffold(scaffold_input): # Get data, proteins and header from scaffold output\r\n+    dupes = readtab(scaffold_input)\r\n+    cnt = 0\r\n+    for i in dupes:\r\n+        cnt += 1\r\n+        if i[0] == \'#\': # finds the start of second header\r\n+            header_start = cnt-1\r\n+    header = dupes[header_start]\r\n+    prot_start = header.index("Accession Number")\r\n+    data = dupes[header_start+1:len(dupes)-2] # cut off blank line and END OF FILE\r\n+    proteins = []\r\n+    for i in data:\r\n+        i[4] = i[4].split()[0] # removes the (+##) that sometimes is attached\r\n+    for protein in data:\r\n+        proteins.append(protein[prot_start])\r\n+    return ReturnValue2(data, proteins, header)\r\n+def make_inter(scaffold_input):\r\n+    bait = readtab(baitfile)\r\n+    data = read_scaffold(scaffold_input).data\r\n+    header = read_scaffold(scaffold_input).header\r\n+    proteins = read_scaffold(scaffold_input).proteins\r\n+    bait_index = []\r\n+    for i in bait:\r\n+        bait_index.append(header.index(i[0])) # Find just the baits defined in bait file\r\n+    with open(\'inter.txt\', \'w\') as y:\r\n+            a = 0; l=0\r\n+            for bb in bait:\r\n+                for lst in data:\r\n+                    y.write(header[bait_index[l]] + \'\\t\' + bb[1] + \'\\t\' + proteins[a] + \'\\t\' + lst[bait_index[l]] + \'\\n\')\r\n+                    a+=1\r\n+                    if a == len(proteins):\r\n+                        a = 0; l+=1\r\n+def make_prey(scaffold_input):\r\n+    proteins = read_scaffold(scaffold_input).proteins\r\n+    output_file = open("prey.txt",\'w\')\r\n+    for a in proteins:\r\n+        a = a.replace("\\n","") # remove \\n for input into function\r\n+        a = a.replace("\\r","") # ditto for \\r\r\n+        seq = get_info(a).seqlength\r\n+        GN = get_info(a).genename\r\n+        if seq != \'NA\':\r\n+            output_file.write(a+"\\t"+str(seq)+ "\\t" + str(GN) + "\\n")\r\n+    output_file.close()\r\n+def no_error_inter(scaffold_input): # remake inter file without protein errors from Uniprot\r\n+    err = readtab("error proteins.txt")\r\n+    bait = readtab(baitfile)\r\n+    data = read_scaffold(scaffold_input).data\r\n+    header = read_scaffold(scaffold_input).header\r\n+    bait_index = []\r\n+    for i in bait:\r\n+        bait_index.append(header.index(i[0]))\r\n+    proteins = read_scaffold(scaffold_input).proteins\r\n+    errors = []\r\n+    for e in err:\r\n+        errors.append(e[0])\r\n+    with open(\'inter.txt\', \'w\') as y:\r\n+        l = 0; a = 0\r\n+        for bb in bait:\r\n+            for lst in data:\r\n+                if proteins[a] not in errors:\r\n+                    y.write(header[bait_index[l]] + \'\\t\' + bb[1] + \'\\t\' + proteins[a] + \'\\t\' + lst[bait_index[l]] + \'\\n\')\r\n+                a+=1\r\n+                if a == len(proteins):\r\n+                    l += 1; a = 0\r\n+def bait_check(bait, scaffold_input): # check that bait names share header titles\r\n+    bait_in = readtab(bait)\r\n+    header = read_scaffold(scaffold_input).header\r\n+    for i in bait_in:\r\n+        if i[0] not in header:\r\n+            sys.exit("Bait must share header titles with Scaffold output")\r\n+\r\n+if __name__ == \'__main__\':\r\n+    main(infile, baitfile)\r\n+\r\n+os.rename(\'inter.txt\', sys.argv[4])\r\n+os.rename("bait.txt", sys.argv[7])\r\n'