Previous changeset 25:b98ec711623d (2016-01-12) Next changeset 27:305615a58155 (2016-01-28) |
Commit message:
Uploaded |
added:
SAINT_preprocessing_v6.py |
b |
diff -r b98ec711623d -r 09612857d26a SAINT_preprocessing_v6.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SAINT_preprocessing_v6.py Thu Jan 28 13:52:48 2016 -0500 |
[ |
b'@@ -0,0 +1,278 @@\n+#######################################################################################\r\n+# Python-code: SAINT pre-processing from Scaffold "Samples Report" output\r\n+# Author: Brent Kuenzi\r\n+#######################################################################################\r\n+# This program reads in a raw Scaffold "Samples Report" output and a user generated\r\n+# bait file and autoformats it into prey and interaction files for SAINTexpress\r\n+# analysis\r\n+#######################################################################################\r\n+# Copyright (C) Brent Kuenzi.\r\n+# Permission is granted to copy, distribute and/or modify this document\r\n+# under the terms of the GNU Free Documentation License, Version 1.3\r\n+# or any later version published by the Free Software Foundation;\r\n+# with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts.\r\n+# A copy of the license is included in the section entitled "GNU\r\n+# Free Documentation License".\r\n+#######################################################################################\r\n+## REQUIRED INPUT ##\r\n+\r\n+# 1) infile: Scaffold "Samples Report" output\r\n+# 2) baitfile: SAINT formatted bait file generated in Galaxy\r\n+# 3) fasta_db: fasta database for use (defaults to SwissProt_HUMAN_2014_08.fasta)\r\n+# 4) prey: Y or N for generating a prey file\r\n+# 5) make_bait: String of bait names, assignment, and test or control boolean\r\n+#######################################################################################\r\n+\r\n+import sys\r\n+import os.path\r\n+\r\n+\r\n+infile = sys.argv[1] \r\n+#Scaffold "Samples Report" output.\r\n+prey = sys.argv[2] \r\n+# Y or N boolean from Galaxy.\r\n+fasta_db = sys.argv[3]\r\n+tool_path = sys.argv[8]\r\n+if fasta_db == "None":\r\n+ fasta_db = str(tool_path) + "/SwissProt_HUMAN_2014_08.fasta"\r\n+make_bait = sys.argv[6]\r\n+bait_bool = sys.argv[9]\r\n+\r\n+\r\n+def bait_create(baits, infile):\r\n+ # Verifies the Baits are valid in the Scaffold file and writes the Bait.txt.\r\n+ baits = make_bait.split()\r\n+ i = 0\r\n+ bait_file_tmp = open("bait.txt", "w")\r\n+ order = []\r\n+ bait_cache = []\r\n+ while i < len(baits):\r\n+ if baits[i+2] == "true":\r\n+ T_C = "C"\r\n+ else:\r\n+ T_C = "T"\r\n+ bait_line = baits[i] + "\\t" + baits[i+1] + "\\t" + T_C + "\\n"\r\n+ read_infile = open(infile, "r")\r\n+ for input_line in read_infile:\r\n+ input_line = input_line.strip()\r\n+ temp = input_line.split(\'\\t\')\r\n+ if "Quantitative Variance" in str(temp):\r\n+ if baits[i] in temp: \r\n+ number_bait = temp.index(str(baits[i]))\r\n+ number_bait = number_bait - 9\r\n+ bait_cache.append((number_bait, str(bait_line)))\r\n+ # Locates the Bait names in the column names and then sets the Baits in the \r\n+ # correct order in the cache thus the - 9 because the baits start at the 9th\r\n+ # column.\r\n+ else:\r\n+ print "Error: bad bait " + str(baits[i])\r\n+ sys.exit()\r\n+ else:\r\n+ pass\r\n+ i = i + 3\r\n+\r\n+ bait_cache.sort()\r\n+ for cache_line in bait_cache:\r\n+ bait_file_tmp.write(cache_line[1])\r\n+\r\n+ bait_file_tmp.close()\r\n+\r\n+if bait_bool == \'false\':\r\n+ bait_create(make_bait, infile)\r\n+ baitfile = "bait.txt"\r\n+else:\r\n+ bait_temp_file = open(sys.argv[10], \'r\')\r\n+ bait_cache = bait_temp_file.readlines()\r\n+ print bait_cache\r\n+ bait_file_tmp = open("bait.txt", "wr")\r\n+ for cache_line in bait_cache:\r\n+ bait_file_tmp.write(cache_line)\r\n+ bait_file_tmp.close()\r\n+ baitfile = "bait.txt"\r\n+\r\n+\r\n+class ReturnValue1(object):\r\n+ def __init__(self, sequence, gene):\r\n+ self.seqlength = sequence\r\n+ self.genename = gene\r\n+\r\n+\r\n+class ReturnValue2(object):\r\n+ def __init__(self, getdata, getproteins, getheader):\r\n+ self.da'..b't\' + "Uniprot not in Fasta" + \'\\n\')\r\n+ error.close\r\n+ seqlength = \'NA\'\r\n+ genename = \'NA\'\r\n+ return ReturnValue1(seqlength, genename)\r\n+\r\n+\r\n+def readtab(infile):\r\n+ with open(infile, \'r\') as input_file: \r\n+ # read in tab-delim text\r\n+ output = []\r\n+ for input_line in input_file:\r\n+ input_line = input_line.strip()\r\n+ temp = input_line.split(\'\\t\')\r\n+ output.append(temp)\r\n+ return output\r\n+\r\n+\r\n+def read_Scaffold(Scaffold_input): \r\n+ # Get data, proteins and header from Scaffold output\r\n+ dupes = readtab(Scaffold_input)\r\n+ cnt = 0\r\n+ for Scaffold_line in dupes:\r\n+ cnt += 1\r\n+ if Scaffold_line[0] == \'#\': \r\n+ # Finds the start of second header.\r\n+ header_start = cnt-1\r\n+ header = dupes[header_start]\r\n+ prot_start = header.index("Accession Number")\r\n+ data = dupes[header_start+1:len(dupes)-2] \r\n+ # Cut off blank line and END OF FILE.\r\n+ proteins = []\r\n+ for Scaffold_line in data:\r\n+ Scaffold_line[4] = Scaffold_line[4].split()[0]\r\n+ # Removes the (+##) that sometimes is attached.\r\n+ for protein in data:\r\n+ proteins.append(protein[prot_start])\r\n+ return ReturnValue2(data, proteins, header)\r\n+\r\n+\r\n+def make_inter(Scaffold_input):\r\n+ bait = readtab(baitfile)\r\n+ data = read_Scaffold(Scaffold_input).data\r\n+ header = read_Scaffold(Scaffold_input).header\r\n+ proteins = read_Scaffold(Scaffold_input).proteins\r\n+ bait_index = []\r\n+ for bait_line in bait:\r\n+ bait_index.append(header.index(bait_line[0]))\r\n+ # Find just the baits defined in bait file.\r\n+ with open(\'inter.txt\', \'w\') as inter_file:\r\n+ a = 0; l = 0\r\n+ for bb in bait:\r\n+ for lst in data:\r\n+ inter_file.write(header[bait_index[l]] + \'\\t\' + bb[1] + \'\\t\' + proteins[a] + \'\\t\'\r\n+ + lst[bait_index[l]] + \'\\n\')\r\n+ a += 1\r\n+ if a == len(proteins):\r\n+ a = 0; l += 1\r\n+\r\n+\r\n+def make_prey(Scaffold_input):\r\n+ proteins = read_Scaffold(Scaffold_input).proteins\r\n+ output_file = open("prey.txt", \'w\')\r\n+ for protein in proteins:\r\n+ protein = protein.replace("\\n", "")\r\n+ # Remove \\n for input into function.\r\n+ protein = protein.replace("\\r", "")\r\n+ # Ditto for \\r.\r\n+ seq = get_info(protein).seqlength\r\n+ GN = get_info(protein).genename\r\n+ if seq != \'NA\':\r\n+ output_file.write(protein + "\\t" + str(seq) + "\\t" + str(GN) + "\\n")\r\n+ output_file.close()\r\n+\r\n+\r\n+def no_error_inter(Scaffold_input):\r\n+ # Remake inter file without protein errors from Uniprot.\r\n+ err = readtab("error proteins.txt")\r\n+ bait = readtab(baitfile)\r\n+ data = read_Scaffold(Scaffold_input).data\r\n+ header = read_Scaffold(Scaffold_input).header\r\n+ bait_index = []\r\n+ for bait_line in bait:\r\n+ bait_index.append(header.index(bait_line[0]))\r\n+ proteins = read_Scaffold(Scaffold_input).proteins\r\n+ errors = []\r\n+ for e in err:\r\n+ errors.append(e[0])\r\n+ with open(\'inter.txt\', \'w\') as y:\r\n+ l = 0; a = 0\r\n+ for bb in bait:\r\n+ for lst in data:\r\n+ if proteins[a] not in errors:\r\n+ y.write(header[bait_index[l]] + \'\\t\' + bb[1] + \'\\t\' + proteins[a] + \'\\t\'\r\n+ + lst[bait_index[l]] + \'\\n\')\r\n+ a += 1\r\n+ if a == len(proteins):\r\n+ l += 1; a = 0\r\n+\r\n+\r\n+def bait_check(bait, Scaffold_input): \r\n+ # Check that bait names share Scaffold header titles.\r\n+ bait_in = readtab(bait)\r\n+ header = read_Scaffold(Scaffold_input).header\r\n+ for i in bait_in:\r\n+ if i[0] not in header:\r\n+ sys.exit("Bait must share header titles with Scaffold output")\r\n+\r\n+if __name__ == \'__main__\':\r\n+ main(infile, baitfile)\r\n+\r\n+os.rename("inter.txt", sys.argv[4])\r\n+os.rename("bait.txt", sys.argv[7])\r\n' |