Previous changeset 55:340cc5988c31 (2016-08-27) Next changeset 57:677d224656e0 (2016-08-27) |
Commit message:
Uploaded |
added:
Protein_report_processing.py |
b |
diff -r 340cc5988c31 -r 18389ccc7629 Protein_report_processing.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Protein_report_processing.py Sat Aug 27 21:01:33 2016 -0400 |
[ |
"""Turn PeptideShaker protein reports into bait / inter / prey text files.

Positional command line arguments, as read by ``main``:

    1   comma separated list of protein report files, in bait-file order
    2   destination path for the bait file
    3   quantification column name without its leading "#", with "_" in
        place of spaces (e.g. "Validated_Peptides", "PSMs")
    4   FASTA database path, or "None" to use the SwissProt file shipped in
        the tool directory
    5   "true" to also build the prey file
    6   whitespace separated bait triples: replicate group is_control
    7   tool directory
    8   "false" to build the bait file from argument 6; any other value
        copies the bait file given as argument 9
    9   existing bait file (only read when argument 8 is not "false")
    10  destination path for the inter file
    11  destination path for the prey file (only used when argument 5 is
        "true")
"""
import sys
import os
import shutil
from time import sleep  # unused here; kept in case other tooling relies on it

# Parsed FASTA indexes keyed by database path, so the database is read once
# per process instead of once per protein lookup.
_FASTA_INDEX_CACHE = {}


def bait_create(baits, infile=None):
    """Write ``bait.txt`` from a whitespace separated string of bait triples.

    baits  -- repeating "<replicate> <group> <is_control>" tokens. A third
              token of "true" is written as "C", anything else as "T"
              (presumably control vs. test -- confirm against the wrapper).
    infile -- unused; accepted only so existing two-argument calls still work.

    No validation against the report files is performed.
    Raises ValueError when the token count is not a multiple of three.
    """
    tokens = baits.split()
    if len(tokens) % 3:
        raise ValueError("bait definition must contain groups of 3 values")
    with open("bait.txt", "w") as bait_file:
        for start in range(0, len(tokens), 3):
            replicate, group, is_control = tokens[start:start + 3]
            flag = "C" if is_control == "true" else "T"
            bait_file.write(replicate + "\t" + group + "\t" + flag + "\n")


class ReturnValue1(object):
    """Result of a FASTA lookup: sequence length and gene name."""

    def __init__(self, sequence, gene):
        self.seqlength = sequence  # int, or 'NA' when the protein is unknown
        self.genename = gene       # gene symbol, or 'NA'


def read_tab(infile):
    """Read a tab delimited file into a list of rows (lists of strings).

    Every line is stripped of leading and trailing whitespace before it is
    split, so empty leading or trailing columns are dropped.
    """
    with open(infile, 'r') as handle:
        return [line.strip().split('\t') for line in handle]


def printProgress(iteration, total, prefix='', suffix='', decimals=1,
                  barLength=100):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration - Required : current iteration (Int)
        total     - Required : total iterations (Int)
        prefix    - Optional : prefix string (Str)
        suffix    - Optional : suffix string (Str)
        decimals  - Optional : positive number of decimals in percent complete (Int)
        barLength - Optional : character length of bar (Int)
    """
    # An empty workload is reported as complete instead of dividing by zero.
    fraction = iteration / float(total) if total else 1.0
    formatStr = "{0:." + str(decimals) + "f}"
    percents = formatStr.format(100 * fraction)
    filledLength = int(round(barLength * fraction))
    bar = '=' * filledLength + '-' * (barLength - filledLength)
    sys.stdout.write('\r%s |%s| %s%s %s' % (prefix, bar, percents, '%', suffix))
    sys.stdout.flush()
    if iteration == total:
        sys.stdout.write('\n')
        sys.stdout.flush()


def _load_fasta_index(fasta_db):
    """Return ``{identifier: (sequence_length, gene_name)}`` for a FASTA file.

    Only headers containing ">sp" are indexed. Each one is reachable through
    its second "|" field and through the first space-delimited token of its
    third "|" field. When an identifier occurs more than once the first
    record in file order wins. The result is cached per path.
    """
    if fasta_db in _FASTA_INDEX_CACHE:
        return _FASTA_INDEX_CACHE[fasta_db]

    index = {}
    keys, gene, length = (), 'NA', 0
    with open(fasta_db, 'r') as handle:
        for line in handle:
            if ">sp" in line or line.startswith(">"):
                # Any header ends the record being measured, so sequence
                # length never absorbs a following non-">sp" entry.
                for key in keys:
                    index.setdefault(key, (length, gene))
                keys, gene, length = (), 'NA', 0
                fields = line.split("|")
                if ">sp" in line and len(fields) > 2:
                    keys = (fields[1], fields[2].split(" ")[0])
                    if 'GN=' in line:
                        gene = line.split('GN=')[1].split(' ')[0].strip()
            elif keys:
                length += len(line.strip())
    # Flush the final record; there is no trailing header to trigger it.
    for key in keys:
        index.setdefault(key, (length, gene))

    _FASTA_INDEX_CACHE[fasta_db] = index
    return index


def get_info(uniprot_accession_in, fasta_db):
    """Look up amino acid length and gene name for one protein identifier.

    Returns a ReturnValue1. When the identifier is not found in ``fasta_db``
    a line is appended to "error proteins.txt" and both fields are 'NA'.
    """
    # Opened on every call, as before, so the log file exists even when no
    # lookup fails.
    with open('error proteins.txt', 'a+') as error:
        record = _load_fasta_index(fasta_db).get(uniprot_accession_in)
        if record is None:
            error.write(uniprot_accession_in + '\t' + "Uniprot not in Fasta" + '\n')
            return ReturnValue1('NA', 'NA')
    return ReturnValue1(record[0], record[1])


def _align_header(table):
    """Prepend "#" to the header row when it is shorter than the first data row.

    read_tab strips leading whitespace, which removes an empty first header
    cell; this restores the column alignment. Modifies ``table`` in place.
    """
    if len(table) > 1 and len(table[0]) < len(table[1]):
        table[0] = ["#"] + table[0]


def concatenate_files(file_list_string, bait_file):
    """Merge several protein reports into one table tagged with bait info.

    file_list_string -- report paths separated by "," (surrounding spaces
                        are ignored), in the same order as the bait rows
    bait_file        -- tab delimited bait file; columns 0 and 1 of row N
                        are appended to every data row of report N

    The header of the first non-empty report is kept, extended with
    "Replicate" and "Bait_Grouping"; headers of later reports are dropped.
    The merged table is written to "merged_PeptideShaker.txt" and returned.
    Raises ValueError when there are fewer bait rows than reports.
    """
    file_list = [name.strip() for name in file_list_string.split(",")
                 if name.strip()]
    bait = read_tab(bait_file)
    if len(bait) < len(file_list):
        raise ValueError("bait file has fewer rows than there are report files")

    master_table = []
    for file_cnt, report in enumerate(file_list):
        for row_cnt, row in enumerate(read_tab(report)):
            if row_cnt == 0:
                if not master_table:
                    master_table.append(row + ["Replicate", "Bait_Grouping"])
                continue
            master_table.append(row + [bait[file_cnt][0], bait[file_cnt][1]])

    _align_header(master_table)
    with open("merged_PeptideShaker.txt", "w") as merged:
        for row in master_table:
            merged.write("\t".join(row))
            merged.write("\n")
    return master_table


def make_inter(master_table, quant_type):
    """Write "inter.txt": replicate, bait grouping, accession, quant value.

    quant_type is the column name without "#" and with "_" for spaces; it is
    converted to e.g. "#Validated Peptides" before the header lookup.
    Raises ValueError (from list.index) when a required column is missing.
    """
    _align_header(master_table)
    header = master_table[0]
    replicate_index = header.index("Replicate")
    grouping_index = header.index("Bait_Grouping")
    accession_index = header.index("Main Accession")
    quant_index = header.index("#" + quant_type.replace("_", " "))

    wanted = (replicate_index, grouping_index, accession_index, quant_index)
    lines = ["\t".join(row[col] for col in wanted) + "\n"
             for row in master_table[1:]]
    with open("inter.txt", "w") as inter:
        inter.write("".join(lines))


def make_prey(concat_table, fasta_db):
    """Write "prey.txt": accession, sequence length and gene name per row.

    One line is written for every data row of ``concat_table`` whose
    "Main Accession" is found in ``fasta_db``; unknown accessions are skipped
    (get_info logs them). A progress bar is printed to stdout.
    """
    _align_header(concat_table)
    accession_index = concat_table[0].index("Main Accession")
    proteins = [row[accession_index] for row in concat_table[1:]]
    total = len(proteins)

    # Initial call to print 0% progress
    printProgress(0, total, prefix='Progress:', suffix='Complete', barLength=50)
    with open("prey.txt", 'w') as output_file:
        for done, protein in enumerate(proteins, 1):
            info = get_info(protein, fasta_db)  # one lookup serves both fields
            if info.seqlength != 'NA':
                output_file.write(protein + "\t" + str(info.seqlength) + "\t"
                                  + str(info.genename) + "\n")
            printProgress(done, total, prefix='Progress:', suffix='Complete',
                          barLength=50)


def main(argv):
    """Run the conversion; ``argv`` follows the layout in the module docstring."""
    files = argv[1]
    quant_type = argv[3]
    db = argv[4]
    prey = argv[5]
    make_bait = argv[6]
    tool_path = argv[7]
    bait_bool = argv[8]
    if db == "None":
        db = str(tool_path) + "/SwissProt_HUMAN_2015_12.fasta"

    if bait_bool == 'false':
        bait_create(make_bait)
    else:
        with open(argv[9], 'r') as source:
            bait_lines = source.readlines()
        with open("bait.txt", "w") as bait_copy:
            bait_copy.writelines(bait_lines)
    bait = "bait.txt"

    data = concatenate_files(files, bait)
    make_inter(data, quant_type)
    build_prey = prey == "true"
    if build_prey:
        make_prey(data, db)

    # shutil.move also works when the destination is on another filesystem.
    shutil.move("bait.txt", argv[2])
    shutil.move("inter.txt", argv[10])
    # prey.txt only exists when it was requested above.
    if build_prey:
        shutil.move("prey.txt", argv[11])


if __name__ == "__main__":
    main(sys.argv)