Previous changeset 74:47aa4f551c53 (2016-08-27) Next changeset 76:43b9bad147df (2016-09-02) |
Commit message:
Uploaded |
modified:
Protein_report_processing.py |
b |
diff -r 47aa4f551c53 -r 792056ff8ed5 Protein_report_processing.py --- a/Protein_report_processing.py Sat Aug 27 23:57:34 2016 -0400 +++ b/Protein_report_processing.py Fri Sep 02 16:32:26 2016 -0400 |
[ |
b'@@ -1,221 +0,0 @@\n-import sys\n-import os\n-from time import sleep\n-\n-files = sys.argv[1] # read in a string of file names seperated by ", "\n-# e.g. "Default_Protein_Report.txt, Default_Protein_Report_2.txt"\n-#bait = sys.argv[2] # SAINT formatted bait file\n-# still need a way to match files to bait identifiers\n-# or they can just be required to be put in the order of the bait file\n-quant_type = sys.argv[3] # what metric to use for quantification\n-# "#Validated Peptides", "#Peptides", "#Unique", "#Validated PSMs", "#PSMs"\n-db = sys.argv[4] # fasta database used in SearchGUI and PeptideShaker\n-prey = sys.argv[5]\n-tool_path = sys.argv[7]\n-if db == "None":\n- db = str(tool_path) + "/SwissProt_HUMAN_2015_12.fasta"\n-make_bait = sys.argv[6]\n-bait_bool = sys.argv[8]\n-\n-def bait_create(baits, infile):\n- # Verifies the Baits are valid in the Scaffold file and writes the Bait.txt.\n- baits = make_bait.split()\n- i = 0\n- bait_file_tmp = open("bait.txt", "w")\n- order = []\n- bait_cache = []\n- while i < len(baits):\n- if baits[i+2] == "true":\n- T_C = "C"\n- else:\n- T_C = "T"\n- bait_line = baits[i] + "\\t" + baits[i+1] + "\\t" + T_C + "\\n"\n- bait_cache.append(str(bait_line))\n- i = i + 3\n-\n- for cache_line in bait_cache:\n- bait_file_tmp.write(cache_line)\n-\n- bait_file_tmp.close()\n-\n-if bait_bool == \'false\':\n- bait_create(make_bait, infile)\n- bait = "bait.txt"\n-else:\n- bait_temp_file = open(sys.argv[9], \'r\')\n- bait_cache = bait_temp_file.readlines()\n- bait_file_tmp = open("bait.txt", "wr")\n- for cache_line in bait_cache:\n- bait_file_tmp.write(cache_line)\n- bait_file_tmp.close()\n- bait = "bait.txt"\n-\n-class ReturnValue1(object):\n- def __init__(self, sequence, gene):\n- self.seqlength = sequence\n- self.genename = gene\n-\n-def read_tab(infile):\n- with open(infile,\'r\') as x:\n- output = []\n- for line in x:\n- line = line.strip()\n- temp = line.split(\'\\t\')\n- output.append(temp)\n- return output\n-def printProgress (iteration, total, prefix = \'\', suffix = \'\', decimals = 1, barLength = 100):\n- """\n- Call in a loop to create terminal progress bar\n- @params:\n- iteration - Required : current iteration (Int)\n- total - Required : total iterations (Int)\n- prefix - Optional : prefix string (Str)\n- suffix - Optional : suffix string (Str)\n- decimals - Optional : positive number of decimals in percent complete (Int)\n- barLength - Optional : character length of bar (Int)\n- """\n- formatStr = "{0:." + str(decimals) + "f}"\n- percents = formatStr.format(100 * (iteration / float(total)))\n- filledLength = int(round(barLength * iteration / float(total)))\n- bar = \'=\' * filledLength + \'-\' * (barLength - filledLength)\n- sys.stdout.write(\'\\r%s |%s| %s%s %s\' % (prefix, bar, percents, \'%\', suffix)),\n- sys.stdout.flush()\n- if iteration == total:\n- sys.stdout.write(\'\\n\')\n- sys.stdout.flush()\n-def get_info(uniprot_accession_in,fasta_db): \n- # Get aminoacid lengths and gene name.\n- error = open(\'error proteins.txt\', \'a+\')\n- data = open(fasta_db, \'r\')\n- data_lines = data.readlines()\n- db_len = len(data_lines)\n- seqlength = 0\n- count = 0\n- last_line = data_lines[-1]\n- for data_line in data_lines:\n- if ">sp" in data_line:\n- namer = data_line.split("|")[2]\n- if uniprot_accession_in == data_line.split("|")[1]:\n- match = count + 1\n- if \'GN=\' in data_line:\n- lst = data_line.split(\'GN=\')\n- lst2 = lst[1].split(\' \')\n- genename = lst2[0]\n- if \'GN=\' not in data_line:\n- genename = \'NA\'\n- while ">sp" not in data_lines[match]:\n- if match <= db_len:\n- '..b' # Ensures consistent spacing throughout.\n- if \'GN=\' in data_line:\n- lst = data_line.split(\'GN=\')\n- lst2 = lst[1].split(\' \')\n- genename = lst2[0]\n- if \'GN=\' not in data_line:\n- genename = \'NA\'\n- while ">sp" not in data_lines[match]:\n- if match <= db_len:\n- seqlength = seqlength + len(data_lines[match].strip())\n- if data_lines[match] == last_line:\n- break\n- match = match + 1\n- else:\n- break\n- return ReturnValue1(seqlength, genename)\n- count = count + 1\n- if seqlength == 0:\n- error.write(uniprot_accession_in + \'\\t\' + "Uniprot not in Fasta" + \'\\n\')\n- error.close\n- seqlength = \'NA\'\n- genename = \'NA\'\n- return ReturnValue1(seqlength, genename)\n-def concatenate_files(file_list_string, bait_file):\n- file_list = file_list_string.split(",")\n- bait = read_tab(bait_file)\n- master_table = []\n- header_check = 0\n- file_cnt = 0\n- table_cnt = 0\n- for i in file_list:\n- table = read_tab(i)\n- for j in table:\n- if table_cnt == 0:\n- if header_check == 0:\n- header_check +=1\n- j.append("Replicate")\n- j.append("Bait_Grouping")\n- master_table.append(j)\n- if table_cnt > 0:\n- j.append(bait[file_cnt][0])\n- j.append(bait[file_cnt][1])\n- master_table.append(j)\n- table_cnt +=1\n- file_cnt+=1\n- table_cnt = 0\n- if len(master_table[0]) < len(master_table[1]):\n- master_table[0] = ["#"] + master_table[0]\n- with open("merged_PeptideShaker.txt","w") as x:\n- for i in master_table:\n- x.write("\\t".join(i))\n- x.write("\\n")\n- return master_table\n-def make_inter(master_table,quant_type):\n- if len(master_table[0]) < len(master_table[1]):\n- master_table[0] = ["#"] + master_table[0]\n- replicate_index = master_table[0].index("Replicate")\n- grouping_index = master_table[0].index("Bait_Grouping")\n- accession_index = master_table[0].index("Main Accession")\n- quant_type = quant_type.replace("_", " ")\n- quant_type = r"#" + quant_type\n- Quant_index = master_table[0].index(quant_type)\n- inter_file = ""\n- for i in master_table[1:]:\n- line = []\n- line.append(i[replicate_index])\n- line.append(i[grouping_index])\n- line.append(i[accession_index])\n- line.append(i[Quant_index])\n- inter_file = inter_file + "\\t".join(line) + "\\n"\n- with open("inter.txt","w") as x:\n- x.write(inter_file)\n- \n-def make_prey(concat_table,fasta_db):\n- input_data = concat_table\n- if len(input_data[0]) < len(input_data[1]):\n- input_data[0] = ["#"] + input_data[0]\n- accession_index = input_data[0].index("Main Accession")\n- proteins = []\n- for i in input_data[1:]:\n- proteins.append(i[accession_index])\n- output_file = open("prey.txt", \'w\')\n- start = 0\n- end = len(proteins)\n-\n- # Initial call to print 0% progress\n- printProgress(start, end, prefix = \'Progress:\', suffix = \'Complete\', barLength = 50)\n-\n- for protein in proteins:\n- seq = get_info(protein,fasta_db).seqlength\n- GN = get_info(protein,fasta_db).genename\n- if seq != \'NA\':\n- output_file.write(protein + "\\t" + str(seq) + "\\t" + str(GN) + "\\n")\n- start+=1\n- printProgress(start, end, prefix = \'Progress:\', suffix = \'Complete\', barLength = 50)\n- output_file.close()\n-data = concatenate_files(files,bait)\n-make_inter(data, quant_type)\n-if prey == "true":\n- make_prey(data,db)\n-\n-os.rename("bait.txt", sys.argv[2])\n-os.rename("inter.txt", sys.argv[10])\n-if str(prey) != "None": \n- os.rename("prey.txt", sys.argv[11])\n\\ No newline at end of file\n' |