Previous changeset 8:30d53a378141 (2015-11-10) Next changeset 10:9136c728935f (2015-11-10) |
Commit message:
Uploaded |
added:
SAINT_preprocessing_v6_mq_pep.py |
b |
diff -r 30d53a378141 -r 9c3d2b1d0c1c SAINT_preprocessing_v6_mq_pep.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SAINT_preprocessing_v6_mq_pep.py Tue Nov 10 13:15:41 2015 -0500 |
[ |
b'@@ -0,0 +1,262 @@\n+#######################################################################################\r\n+# Python-code: SAINT pre-processing from maxquant "Samples Report" output\r\n+# Author: Brent Kuenzi\r\n+#######################################################################################\r\n+# This program reads in a raw maxquant "Samples Report" output and a user generated\r\n+# bait file and autoformats it into prey and interaction files for SAINTexpress \r\n+# analysis\r\n+#######################################################################################\r\n+import sys\r\n+import urllib2\r\n+import os\r\n+#######################################################################################\r\n+## REQUIRED INPUT ##\r\n+\r\n+# 1) infile: maxquant "Samples Report" output\r\n+# 2) baitfile: SAINT formatted bait file generated in Galaxy\r\n+# 3) prey: Y or N for generating a prey file (requires internet connection)\r\n+#######################################################################################\r\n+mq_file = sys.argv[1]\r\n+cmd = r"Rscript /home/bornea/galaxy_moffitt_dev/tools/Moffitt_Tools/bubblebeam/pre_process_protein_name_set.R " + str(mq_file) \r\n+os.system(cmd)\r\n+\r\n+infile = "./tukeys_output.txt" #maxquant "Samples Report" output\r\n+prey = sys.argv[2] # Y or N\r\n+fasta_db = sys.argv[3]\r\n+if fasta_db == "None":\r\n+ fasta_db = "/home/bornea/galaxy_moffitt_dev/tools/Moffitt_Tools/bubblebeam/SwissProt_HUMAN_2014_08.fasta"\r\n+make_bait= sys.argv[6]\r\n+\r\n+def bait_create(baits, infile):\r\n+ #Takes the Bait specified by the user and makes them into a Bait file and includes a check to make sure they are using valid baits.\r\n+ baits = make_bait.split()\r\n+ i = 0\r\n+ bait_file_tmp = open("bait.txt", "wr")\r\n+ order = [] \r\n+ bait_cache = []\r\n+ while i < len(baits):\r\n+ if baits[i+2] == "true":\r\n+ T_C = "C"\r\n+ else:\r\n+ T_C = "T"\r\n+ line1 = baits[i] + "\\t" + baits[i+1] + "\\t" + T_C + "\\n"\r\n+ q = open(infile,"r")\r\n+ for line2 in q:\r\n+ line2 = line2.replace("\\"", "")\r\n+ line2 = line2.replace(r"Intensity.", "") #R coerces "-" into "." this changes them back and remove Intensity from the Bait names.\r\n+ line2 = line2.replace(r".", r"-")\r\n+ temp = line2.split()\r\n+ if "mapped_protein" in str(temp):\r\n+ #If the bait is in the original file then write to cache it if not exit.\r\n+ if baits[i] in temp:\r\n+ number_bait = temp.index(str(baits[i]))\r\n+ number_bait = number_bait - 9\r\n+ bait_cache.append((number_bait, str(line1)))\r\n+ else:\r\n+ print "Error: bad bait " + str(baits[i])\r\n+ sys.exit()\r\n+ else: \r\n+ pass \r\n+ i = i + 3\r\n+ #Writes cache to file.\r\n+ bait_cache.sort()\r\n+ for line in bait_cache:\r\n+ bait_file_tmp.write(line[1]) \r\n+ \r\n+ bait_file_tmp.close() \r\n+\r\n+\r\n+baitfile = "bait.txt"\r\n+\r\n+class ReturnValue1(object):\r\n+ def __init__(self, sequence, gene):\r\n+ self.seqlength = sequence\r\n+ self.genename = gene\r\n+class ReturnValue2(object):\r\n+ def __init__(self, getdata, getproteins, getheader):\r\n+ self.data = getdata\r\n+ self.proteins = getproteins\r\n+ self.header = getheader\r\n+\r\n+def main(maxquant_input, make_bait): \r\n+ bait_create(make_bait, infile)\r\n+ baitfile = "bait.txt"\r\n+ #bait_check(baitfile, maxquant_input)\r\n+ make_inter(maxquant_input)\r\n+ if prey == \'true\':\r\n+ make_prey(maxquant_input)\r\n+ no_error_inter(maxquant_input)\r\n+ os.rename(\'prey.txt\', sys.argv[5])\r\n+ elif prey == \'false\':\r\n+ if os.path.isfile(\'error proteins.txt\') == True:\r\n+ no_error_inter(maxquant_input)\r\n+ pass\r\n+ elif prey != \'true\' or \'false\':\r\n+ sys.exit("Invalid Prey Argument: Y or N")\r\n+ os.rename(\'inter.txt\', sys.argv[4])\r\n+ '..b'tch]:\r\n+ if match <= db_len:\r\n+ seqlength = seqlength + len(lines[match].strip())\r\n+ match = match + 1\r\n+ else:\r\n+ break\r\n+ return ReturnValue1(seqlength, genename)\r\n+ count = count + 1\r\n+ \r\n+\r\n+ if seqlength == 0:\r\n+ error.write(uniprot_accession_in + \'\\t\' + "Uniprot not in Fasta" + \'\\n\')\r\n+ error.close\r\n+ seqlength = \'NA\'\r\n+ genename = \'NA\'\r\n+ return ReturnValue1(seqlength, genename)\r\n+\r\n+\r\n+def readtab(infile):\r\n+ with open(infile,\'r\') as x: # read in tab-delim text\r\n+ output = []\r\n+ for line in x:\r\n+ line = line.strip()\r\n+ temp = line.split(\'\\t\')\r\n+ output.append(temp)\r\n+ return output\r\n+def read_maxquant(maxquant_input): # Get data, proteins and header from maxquant output\r\n+ dupes = readtab(maxquant_input)\r\n+ header_start = 0 \r\n+ header = dupes[header_start]\r\n+ for i in header:\r\n+ i = i.replace(r"\\"", "")\r\n+ i = i.replace(r"Intensity.", r"")\r\n+ i = i.replace(r".", r"-")\r\n+ data = dupes[header_start+1:len(dupes)] #cut off blank line and END OF FILE\r\n+ proteins = []\r\n+ for protein in data:\r\n+ proteins.append(protein[0])\r\n+ return ReturnValue2(data, proteins, header)\r\n+def make_inter(maxquant_input):\r\n+ bait = readtab(baitfile)\r\n+ data = read_maxquant(maxquant_input).data\r\n+ header = read_maxquant(maxquant_input).header\r\n+ proteins = read_maxquant(maxquant_input).proteins\r\n+ bait_index = []\r\n+ for i in bait:\r\n+ bait_index.append(header.index("mapped_protein") + 1) # Find just the baits defined in bait file\r\n+ with open(\'inter.txt\', \'w\') as y:\r\n+ a = 0; l=0\r\n+ for bb in bait:\r\n+ for lst in data:\r\n+ y.write(header[bait_index[l]] + \'\\t\' + bb[1] + \'\\t\' + proteins[a] + \'\\t\' + lst[bait_index[l]] + \'\\n\')\r\n+ a+=1\r\n+ if a == len(proteins):\r\n+ a = 0; l+=1\r\n+def make_prey(maxquant_input):\r\n+ proteins = read_maxquant(maxquant_input).proteins\r\n+ output_file = open("prey.txt",\'w\')\r\n+ for a in proteins:\r\n+ a = a.replace("\\n","") # remove \\n for input into function\r\n+ a = a.replace("\\r","") # ditto for \\r\r\n+ seq = get_info(a).seqlength\r\n+ GN = get_info(a).genename\r\n+ if seq != \'NA\':\r\n+ output_file.write(a+"\\t"+str(seq)+ "\\t" + str(GN) + "\\n")\r\n+ output_file.close()\r\n+def no_error_inter(maxquant_input): # remake inter file without protein errors from Uniprot\r\n+ err = readtab("error proteins.txt")\r\n+ bait = readtab(baitfile)\r\n+ data = read_maxquant(maxquant_input).data\r\n+ header = read_maxquant(maxquant_input).header\r\n+ header = [i.replace(r"\\"", "") for i in header]\r\n+ header = [i.replace(r"Intensity.", r"") for i in header]\r\n+ header = [i.replace(r".", r"-") for i in header]\r\n+ print header\r\n+ bait_index = []\r\n+ for i in bait:\r\n+ bait_index.append(header.index(i[0]))\r\n+ proteins = read_maxquant(maxquant_input).proteins\r\n+ errors = []\r\n+ for e in err:\r\n+ errors.append(e[0])\r\n+ with open(\'inter.txt\', \'w\') as y:\r\n+ l = 0; a = 0\r\n+ for bb in bait:\r\n+ for lst in data:\r\n+ if proteins[a] not in errors:\r\n+ y.write(header[bait_index[l]] + \'\\t\' + bb[1] + \'\\t\' + proteins[a] + \'\\t\' + lst[bait_index[l]] + \'\\n\')\r\n+ a+=1\r\n+ if a == len(proteins):\r\n+ l += 1; a = 0\r\n+def bait_check(bait, maxquant_input): # check that bait names share header titles\r\n+ bait_in = readtab(bait)\r\n+ header = read_maxquant(maxquant_input).header\r\n+ for i in bait_in:\r\n+ if i[0] not in header:\r\n+ sys.exit("Bait must share header titles with maxquant output")\r\n+\r\n+if __name__ == \'__main__\':\r\n+ main(infile, make_bait)\r\n' |