Repository 'saint_preprocessing'
hg clone https://toolshed.g2.bx.psu.edu/repos/bornea/saint_preprocessing

Changeset 75:792056ff8ed5 (2016-09-02)
Previous changeset 74:47aa4f551c53 (2016-08-27) Next changeset 76:43b9bad147df (2016-09-02)
Commit message:
Uploaded
modified:
Protein_report_processing.py
b
diff -r 47aa4f551c53 -r 792056ff8ed5 Protein_report_processing.py
--- a/Protein_report_processing.py Sat Aug 27 23:57:34 2016 -0400
+++ b/Protein_report_processing.py Fri Sep 02 16:32:26 2016 -0400
[
b'@@ -1,221 +0,0 @@\n-import sys\n-import os\n-from time import sleep\n-\n-files = sys.argv[1] # read in a string of file names seperated by ", "\n-# e.g. "Default_Protein_Report.txt, Default_Protein_Report_2.txt"\n-#bait = sys.argv[2] # SAINT formatted bait file\n-# still need a way to match files to bait identifiers\n-# or they can just be required to be put in the order of the bait file\n-quant_type = sys.argv[3] # what metric to use for quantification\n-# "#Validated Peptides", "#Peptides", "#Unique", "#Validated PSMs", "#PSMs"\n-db = sys.argv[4] # fasta database used in SearchGUI and PeptideShaker\n-prey = sys.argv[5]\n-tool_path = sys.argv[7]\n-if db == "None":\n-    db = str(tool_path)  + "/SwissProt_HUMAN_2015_12.fasta"\n-make_bait = sys.argv[6]\n-bait_bool = sys.argv[8]\n-\n-def bait_create(baits, infile):\n-    # Verifies the Baits are valid in the Scaffold file and writes the Bait.txt.\n-    baits = make_bait.split()\n-    i = 0\n-    bait_file_tmp = open("bait.txt", "w")\n-    order = []\n-    bait_cache = []\n-    while i < len(baits):\n-        if baits[i+2] == "true":\n-            T_C = "C"\n-        else:\n-            T_C = "T"\n-        bait_line = baits[i] + "\\t" + baits[i+1] + "\\t" + T_C + "\\n"\n-        bait_cache.append(str(bait_line))\n-        i = i + 3\n-\n-    for cache_line in bait_cache:\n-        bait_file_tmp.write(cache_line)\n-\n-    bait_file_tmp.close()\n-\n-if bait_bool == \'false\':\n-    bait_create(make_bait, infile)\n-    bait = "bait.txt"\n-else:\n-    bait_temp_file = open(sys.argv[9], \'r\')\n-    bait_cache = bait_temp_file.readlines()\n-    bait_file_tmp = open("bait.txt", "wr")\n-    for cache_line in bait_cache:\n-        bait_file_tmp.write(cache_line)\n-    bait_file_tmp.close()\n-    bait = "bait.txt"\n-\n-class ReturnValue1(object):\n-    def __init__(self, sequence, gene):\n-        self.seqlength = sequence\n-        self.genename = gene\n-\n-def read_tab(infile):\n-    with open(infile,\'r\') as x:\n-        output = []\n-        for line in x:\n-            line = line.strip()\n-            temp = line.split(\'\\t\')\n-            output.append(temp)\n-    return output\n-def printProgress (iteration, total, prefix = \'\', suffix = \'\', decimals = 1, barLength = 100):\n-    """\n-    Call in a loop to create terminal progress bar\n-    @params:\n-        iteration   - Required  : current iteration (Int)\n-        total       - Required  : total iterations (Int)\n-        prefix      - Optional  : prefix string (Str)\n-        suffix      - Optional  : suffix string (Str)\n-        decimals    - Optional  : positive number of decimals in percent complete (Int)\n-        barLength   - Optional  : character length of bar (Int)\n-    """\n-    formatStr       = "{0:." + str(decimals) + "f}"\n-    percents        = formatStr.format(100 * (iteration / float(total)))\n-    filledLength    = int(round(barLength * iteration / float(total)))\n-    bar             = \'=\' * filledLength + \'-\' * (barLength - filledLength)\n-    sys.stdout.write(\'\\r%s |%s| %s%s %s\' % (prefix, bar, percents, \'%\', suffix)),\n-    sys.stdout.flush()\n-    if iteration == total:\n-        sys.stdout.write(\'\\n\')\n-        sys.stdout.flush()\n-def get_info(uniprot_accession_in,fasta_db): \n-    # Get aminoacid lengths and gene name.\n-    error = open(\'error proteins.txt\', \'a+\')\n-    data = open(fasta_db, \'r\')\n-    data_lines = data.readlines()\n-    db_len = len(data_lines)\n-    seqlength = 0\n-    count = 0\n-    last_line = data_lines[-1]\n-    for data_line in data_lines:\n-        if ">sp" in data_line:\n-            namer = data_line.split("|")[2]\n-            if uniprot_accession_in == data_line.split("|")[1]:\n-                match = count + 1\n-                if \'GN=\' in data_line:\n-                    lst = data_line.split(\'GN=\')\n-                    lst2 = lst[1].split(\' \')\n-                    genename = lst2[0]\n-                if \'GN=\' not in data_line:\n-                    genename = \'NA\'\n-                while ">sp" not in data_lines[match]:\n-                    if match <= db_len:\n-                '..b'            # Ensures consistent spacing throughout.\n-            if \'GN=\' in data_line:\n-                lst = data_line.split(\'GN=\')\n-                lst2 = lst[1].split(\' \')\n-                genename = lst2[0]\n-            if \'GN=\' not in data_line:\n-                genename = \'NA\'\n-            while ">sp" not in data_lines[match]:\n-                if match <= db_len:\n-                    seqlength = seqlength + len(data_lines[match].strip())\n-                    if data_lines[match] == last_line:\n-                        break\n-                    match = match + 1\n-                else:\n-                    break\n-            return ReturnValue1(seqlength, genename)\n-        count = count + 1\n-    if seqlength == 0:\n-        error.write(uniprot_accession_in + \'\\t\' + "Uniprot not in Fasta" + \'\\n\')\n-        error.close\n-        seqlength = \'NA\'\n-        genename = \'NA\'\n-        return ReturnValue1(seqlength, genename)\n-def concatenate_files(file_list_string, bait_file):\n-    file_list = file_list_string.split(",")\n-    bait = read_tab(bait_file)\n-    master_table = []\n-    header_check = 0\n-    file_cnt = 0\n-    table_cnt = 0\n-    for i in file_list:\n-        table = read_tab(i)\n-        for j in table:\n-            if table_cnt == 0:\n-                if header_check == 0:\n-                    header_check +=1\n-                    j.append("Replicate")\n-                    j.append("Bait_Grouping")\n-                    master_table.append(j)\n-            if table_cnt > 0:\n-                j.append(bait[file_cnt][0])\n-                j.append(bait[file_cnt][1])\n-                master_table.append(j)\n-            table_cnt +=1\n-        file_cnt+=1\n-        table_cnt = 0\n-    if len(master_table[0]) < len(master_table[1]):\n-        master_table[0] = ["#"] + master_table[0]\n-    with open("merged_PeptideShaker.txt","w") as x:\n-        for i in master_table:\n-            x.write("\\t".join(i))\n-            x.write("\\n")\n-    return master_table\n-def make_inter(master_table,quant_type):\n-    if len(master_table[0]) < len(master_table[1]):\n-        master_table[0] = ["#"] + master_table[0]\n-    replicate_index = master_table[0].index("Replicate")\n-    grouping_index = master_table[0].index("Bait_Grouping")\n-    accession_index = master_table[0].index("Main Accession")\n-    quant_type = quant_type.replace("_", " ")\n-    quant_type = r"#" + quant_type\n-    Quant_index = master_table[0].index(quant_type)\n-    inter_file = ""\n-    for i in master_table[1:]:\n-        line = []\n-        line.append(i[replicate_index])\n-        line.append(i[grouping_index])\n-        line.append(i[accession_index])\n-        line.append(i[Quant_index])\n-        inter_file = inter_file + "\\t".join(line) + "\\n"\n-    with open("inter.txt","w") as x:\n-        x.write(inter_file)\n-    \n-def make_prey(concat_table,fasta_db):\n-    input_data = concat_table\n-    if len(input_data[0]) < len(input_data[1]):\n-        input_data[0] = ["#"] + input_data[0]\n-    accession_index = input_data[0].index("Main Accession")\n-    proteins = []\n-    for i in input_data[1:]:\n-        proteins.append(i[accession_index])\n-    output_file = open("prey.txt", \'w\')\n-    start = 0\n-    end = len(proteins)\n-\n-    # Initial call to print 0% progress\n-    printProgress(start, end, prefix = \'Progress:\', suffix = \'Complete\', barLength = 50)\n-\n-    for protein in proteins:\n-        seq = get_info(protein,fasta_db).seqlength\n-        GN = get_info(protein,fasta_db).genename\n-        if seq != \'NA\':\n-            output_file.write(protein + "\\t" + str(seq) + "\\t" + str(GN) + "\\n")\n-        start+=1\n-        printProgress(start, end, prefix = \'Progress:\', suffix = \'Complete\', barLength = 50)\n-    output_file.close()\n-data = concatenate_files(files,bait)\n-make_inter(data, quant_type)\n-if prey == "true":\n-    make_prey(data,db)\n-\n-os.rename("bait.txt", sys.argv[2])\n-os.rename("inter.txt", sys.argv[10])\n-if str(prey) != "None": \n-    os.rename("prey.txt", sys.argv[11])\n\\ No newline at end of file\n'