Repository 'saint_preproc'
hg clone https://toolshed.g2.bx.psu.edu/repos/bornea/saint_preproc

Changeset 0:db1558c92262 (2015-11-10)
Next changeset 1:364f9b42f2ae (2015-11-10)
Commit message:
Uploaded
added:
saint_preproc/<?xml version="1.0"?>
saint_preproc/SAINT_preprocessing_v5.xml
saint_preproc/SAINT_preprocessing_v6.py
saint_preproc/SAINT_preprocessing_v6_mq_pep.py
saint_preproc/pre_process_protein_name_set.R
saint_preproc/uniprot_names.txt
b
diff -r 000000000000 -r db1558c92262 saint_preproc/<?xml version="1.0"?>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/saint_preproc/<?xml version="1.0"?> Tue Nov 10 12:59:50 2015 -0500
b
@@ -0,0 +1,6 @@
+<?xml version="1.0"?>
+<tool_dependency> <!-- exposes the repository install directory to the tool as INSTALL_RUN_PATH -->
+    <set_environment version="1.0">
+        <environment_variable name="INSTALL_RUN_PATH" action="set_to">$REPOSITORY_INSTALL_DIR</environment_variable>
+    </set_environment>
+</tool_dependency>
\ No newline at end of file
b
diff -r 000000000000 -r db1558c92262 saint_preproc/SAINT_preprocessing_v5.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/saint_preproc/SAINT_preprocessing_v5.xml Tue Nov 10 12:59:50 2015 -0500
b
@@ -0,0 +1,91 @@
+<tool id="SAINT_preprocessing_v5" name="SAINT pre-processing"> <!-- Galaxy wrapper: runs the Scaffold or MaxQuant pre-processing script depending on "type" -->
+  <description></description>
+  <command interpreter="python">
+    #if (str($type) == 'Scaffold'):
+      SAINT_preprocessing_v6.py $input $preybool $fasta_db $Inter_file $Prey_file 
+      "
+        #for $ba in $bait
+         ${ba.bait1}
+         ${ba.assign}
+         ${ba.T_C}
+        #end for
+        "
+      $Bait_file \$INSTALL_RUN_PATH/
+    #elif (str($type) == 'MaxQuant'):
+      SAINT_preprocessing_v6_mq_pep.py $input $preybool $fasta_db $Inter_file $Prey_file 
+        "
+        #for $ba in $bait
+          ${ba.bait1}
+          ${ba.assign}
+          ${ba.T_C}
+        #end for
+        "
+      $Bait_file \$INSTALL_RUN_PATH/
+    #end if
+  </command>
+  <requirements> <!-- INSTALL_RUN_PATH is the variable passed as the last command-line argument above -->
+    <requirement type="set_environment">INSTALL_RUN_PATH</requirement>
+  </requirements>
+  <inputs>
+    <param type="select" name="type" label="MaxQuant or Scaffold">
+      <option value="MaxQuant">MaxQuant</option>
+      <option value="Scaffold">Scaffold</option> 
+    </param>
+    <param format="dat" name="input" type="data" label="Scaffold or MaxQuant proteinGroup Output"/>
+    <param type="boolean" name="preybool" checked="true" label="Create Prey File"/>
+    <param type="data" name="fasta_db" format="fasta"  label="Provide Uniprot Fasta database" />
+    <repeat name="bait" title="Bait Create">
+      <param name="bait1" type="text" size="100"/>
+      <param name="assign" type="text" size="100"/>
+      <param name="T_C" type="boolean" checked="true" label="Is this a Control?"/>
+    </repeat>
+  </inputs>
+  <outputs>
+    <data format="txt" name="Inter_file" label="Inter File"/>    
+    <data format="txt" name="Prey_file" label="Prey File" />
+    <data format="txt" name="Bait_file" label="Bait File" />
+  </outputs>
+  <stdio>
+    <regex match="error"
+    source="stdout"
+           level="fatal"
+           description="Unknown error"/>
+    <regex match="Error: bad bait"
+           source="stdout"
+           level="fatal"
+           description="Error: bad bait"/>
+  </stdio> 
+
+  <tests> <!-- NOTE(review): this test names fa_gc_content_* files and an output "out_file1" that this tool does not declare; TODO replace with a real test -->
+    <test>
+      <param name="input" value="fa_gc_content_input.fa"/>
+      <output name="out_file1" file="fa_gc_content_output.txt"/>
+    </test>
+  </tests>
+  <help>
+Pre-processing:
+APOSTL is able to recognize either a Scaffold "Samples Report" file (tab-delimited
+txt file) or the "peptides.txt" file output in the maxquant "txt" output folder. No
+modifications should be made to these files. Using the "Bait Create" tool, you can
+create your "bait.txt" file. It is important that the individual bait names match the
+bait names within your scaffold or maxquant output. APOSTL uses the bait file to find
+the user's baits of interest. Additionally there is an option to make the prey file (Y/N).
+When making a prey file, APOSTL queries uniprot in order to extract protein amino acid
+lengths and gene names. This takes several minutes depending on your internet connection. 
+Some users may want to run SAINTexpress using the same data set while changing which baits 
+are considered test or control. It is useful to toggle "Make Prey" off in order to save 
+time by circumventing this step as the same prey file can be used for both SAINTexpress 
+runs. 
+
+INPUTS:
+
+Scaffold file:
+
+- Scaffold "Samples Report" output (tab-delimited txt file)
+
+
+Maxquant file:
+
+- maxquant "peptides.txt" file (tab-delimited txt file)
+  </help>
+</tool>
b
diff -r 000000000000 -r db1558c92262 saint_preproc/SAINT_preprocessing_v6.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/saint_preproc/SAINT_preprocessing_v6.py Tue Nov 10 12:59:50 2015 -0500
[
b'@@ -0,0 +1,245 @@\n+#######################################################################################\r\n+# Python-code: SAINT pre-processing from Scaffold "Samples Report" output\r\n+# Author: Brent Kuenzi\r\n+#######################################################################################\r\n+# This program reads in a raw Scaffold "Samples Report" output and a user generated\r\n+# bait file and autoformats it into prey and interaction files for SAINTexpress \r\n+# analysis\r\n+#######################################################################################\r\n+import sys\r\n+import urllib2\r\n+import os.path\r\n+#######################################################################################\r\n+## REQUIRED INPUT ##\r\n+\r\n+# 1) infile: Scaffold "Samples Report" output\r\n+# 2) baitfile: SAINT formatted bait file generated in Galaxy\r\n+# 3) prey: Y or N for generating a prey file (requires internet connection)\r\n+#######################################################################################\r\n+infile = sys.argv[1] #Scaffold "Samples Report" output\r\n+prey = sys.argv[2] # Y or N\r\n+fasta_db = sys.argv[3]\r\n+tool_path = sys.argv[8]\r\n+if fasta_db == "None":\r\n+    fasta_db = str(tool_path)  + "/SwissProt_HUMAN_2014_08.fasta"\r\n+make_bait= sys.argv[5]\r\n+\r\n+\r\n+baits = make_bait.split()\r\n+i = 0\r\n+bait_file_tmp = open("bait.txt", "wr")\r\n+order = [] \r\n+bait_cache = []\r\n+\r\n+while i < len(baits):\r\n+    if baits[i+2] == "true":\r\n+        T_C = "C"\r\n+    else:\r\n+        T_C = "T"\r\n+    line1 = baits[i] + "\\t" + baits[i+1] + "\\t" + T_C + "\\n"\r\n+    q = open(infile,"r")\r\n+    for line2 in q:\r\n+        line2 = line2.strip()\r\n+        temp = line2.split(\'\\t\')\r\n+        if "Quantitative Variance" in str(temp):\r\n+            if baits[i] in temp:\r\n+                number_bait = temp.index(str(baits[i]))\r\n+                number_bait = number_bait - 9\r\n+                
bait_cache.append((number_bait, str(line1)))\r\n+            else:\r\n+                print "Error: bad bait " + str(baits[i])\r\n+                sys.exit()\r\n+        else: \r\n+            pass                    \r\n+    i = i + 3\r\n+\r\n+bait_cache.sort()\r\n+for line in bait_cache:\r\n+    bait_file_tmp.write(line[1])            \r\n+        \r\n+bait_file_tmp.close()\r\n+baitfile = "bait.txt" \r\n+\r\n+class ReturnValue1(object):\r\n+    def __init__(self, sequence, gene):\r\n+     self.seqlength = sequence\r\n+     self.genename = gene\r\n+class ReturnValue2(object):\r\n+    def __init__(self, getdata, getproteins, getheader):\r\n+        self.data = getdata\r\n+        self.proteins = getproteins\r\n+        self.header = getheader\r\n+\r\n+def main(scaffold_input, baitfile): \r\n+    bait_check(baitfile, scaffold_input)\r\n+    make_inter(scaffold_input)\r\n+    if prey == \'true\':\r\n+        make_prey(scaffold_input)\r\n+        no_error_inter(scaffold_input)\r\n+        os.rename(\'prey.txt\', sys.argv[5])\r\n+    elif prey == \'false\':\r\n+        if os.path.isfile(\'error proteins.txt\') == True:\r\n+            no_error_inter(scaffold_input)\r\n+        pass\r\n+    elif prey != \'true\' or \'false\':\r\n+        sys.exit("Invalid Prey Argument: Y or N")\r\n+\r\n+def get_info(uniprot_accession_in): # get aa lengths and gene name\r\n+    error = open(\'error proteins.txt\', \'a+\')\r\n+#    while True:\r\n+#        i = 0\r\n+#\ttry:  \r\n+#            data = urllib2.urlopen("http://www.uniprot.org/uniprot/" + uniprot_accession_in + ".fasta")\r\n+#            break\r\n+#        except urllib2.HTTPError, err:\r\n+#            i = i + 1\r\n+#            if i == 50:\r\n+#                sys.exit("More than 50 errors. Check your file or try again later.")\r\n+#            if err.code == 404:\r\n+#                error.write(uniprot_accession_in + \'\\t\' + "Invalid URL. 
Check protein" + \'\\n\')\r\n+#                seqlength = \'NA\'\r\n+#                genename = \'NA\'\r\n+#                return ReturnValue1(seqlength, genename)\r\n+#            elif err.code == 302:\r\n+#                sys.exit("Request timed out. Check connection and try again.")\r\n+#            else:\r\n+#                sys.exit'..b'       if match <= db_len:\r\n+                        seqlength = seqlength + len(lines[match].strip())\r\n+                        match = match + 1\r\n+                    else:\r\n+                        break\r\n+                return ReturnValue1(seqlength, genename)\r\n+        count = count + 1\r\n+        \r\n+\r\n+    if seqlength == 0:\r\n+        error.write(uniprot_accession_in + \'\\t\' + "Uniprot not in Fasta" + \'\\n\')\r\n+        error.close\r\n+        seqlength = \'NA\'\r\n+        genename = \'NA\'\r\n+        return ReturnValue1(seqlength, genename)\r\n+\r\n+def readtab(infile):\r\n+    with open(infile,\'r\') as x: # read in tab-delim text\r\n+        output = []\r\n+        for line in x:\r\n+            line = line.strip()\r\n+            temp = line.split(\'\\t\')\r\n+            output.append(temp)\r\n+    return output\r\n+def read_scaffold(scaffold_input): # Get data, proteins and header from scaffold output\r\n+    dupes = readtab(scaffold_input)\r\n+    cnt = 0\r\n+    for i in dupes:\r\n+        cnt += 1\r\n+        if i[0] == \'#\': # finds the start of second header\r\n+            header_start = cnt-1\r\n+    header = dupes[header_start]\r\n+    prot_start = header.index("Accession Number")\r\n+    data = dupes[header_start+1:len(dupes)-2] # cut off blank line and END OF FILE\r\n+    proteins = []\r\n+    for i in data:\r\n+        i[4] = i[4].split()[0] # removes the (+##) that sometimes is attached\r\n+    for protein in data:\r\n+        proteins.append(protein[prot_start])\r\n+    return ReturnValue2(data, proteins, header)\r\n+def make_inter(scaffold_input):\r\n+    bait = 
readtab(baitfile)\r\n+    data = read_scaffold(scaffold_input).data\r\n+    header = read_scaffold(scaffold_input).header\r\n+    proteins = read_scaffold(scaffold_input).proteins\r\n+    bait_index = []\r\n+    for i in bait:\r\n+        bait_index.append(header.index(i[0])) # Find just the baits defined in bait file\r\n+    with open(\'inter.txt\', \'w\') as y:\r\n+            a = 0; l=0\r\n+            for bb in bait:\r\n+                for lst in data:\r\n+                    y.write(header[bait_index[l]] + \'\\t\' + bb[1] + \'\\t\' + proteins[a] + \'\\t\' + lst[bait_index[l]] + \'\\n\')\r\n+                    a+=1\r\n+                    if a == len(proteins):\r\n+                        a = 0; l+=1\r\n+def make_prey(scaffold_input):\r\n+    proteins = read_scaffold(scaffold_input).proteins\r\n+    output_file = open("prey.txt",\'w\')\r\n+    for a in proteins:\r\n+        a = a.replace("\\n","") # remove \\n for input into function\r\n+        a = a.replace("\\r","") # ditto for \\r\r\n+        seq = get_info(a).seqlength\r\n+        GN = get_info(a).genename\r\n+        if seq != \'NA\':\r\n+            output_file.write(a+"\\t"+str(seq)+ "\\t" + str(GN) + "\\n")\r\n+    output_file.close()\r\n+def no_error_inter(scaffold_input): # remake inter file without protein errors from Uniprot\r\n+    err = readtab("error proteins.txt")\r\n+    bait = readtab(baitfile)\r\n+    data = read_scaffold(scaffold_input).data\r\n+    header = read_scaffold(scaffold_input).header\r\n+    bait_index = []\r\n+    for i in bait:\r\n+        bait_index.append(header.index(i[0]))\r\n+    proteins = read_scaffold(scaffold_input).proteins\r\n+    errors = []\r\n+    for e in err:\r\n+        errors.append(e[0])\r\n+    with open(\'inter.txt\', \'w\') as y:\r\n+        l = 0; a = 0\r\n+        for bb in bait:\r\n+            for lst in data:\r\n+                if proteins[a] not in errors:\r\n+                    y.write(header[bait_index[l]] + \'\\t\' + bb[1] + \'\\t\' + 
proteins[a] + \'\\t\' + lst[bait_index[l]] + \'\\n\')\r\n+                a+=1\r\n+                if a == len(proteins):\r\n+                    l += 1; a = 0\r\n+def bait_check(bait, scaffold_input): # check that bait names share header titles\r\n+    bait_in = readtab(bait)\r\n+    header = read_scaffold(scaffold_input).header\r\n+    for i in bait_in:\r\n+        if i[0] not in header:\r\n+            sys.exit("Bait must share header titles with Scaffold output")\r\n+\r\n+if __name__ == \'__main__\':\r\n+    main(infile, baitfile)\r\n+\r\n+os.rename(\'inter.txt\', sys.argv[4])\r\n+os.rename("bait.txt", sys.argv[7])\r\n'
b
diff -r 000000000000 -r db1558c92262 saint_preproc/SAINT_preprocessing_v6_mq_pep.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/saint_preproc/SAINT_preprocessing_v6_mq_pep.py Tue Nov 10 12:59:50 2015 -0500
[
b'@@ -0,0 +1,262 @@\n+#######################################################################################\r\n+# Python-code: SAINT pre-processing from maxquant "Samples Report" output\r\n+# Author: Brent Kuenzi\r\n+#######################################################################################\r\n+# This program reads in a raw maxquant "Samples Report" output and a user generated\r\n+# bait file and autoformats it into prey and interaction files for SAINTexpress \r\n+# analysis\r\n+#######################################################################################\r\n+import sys\r\n+import urllib2\r\n+import os\r\n+#######################################################################################\r\n+## REQUIRED INPUT ##\r\n+\r\n+# 1) infile: maxquant "Samples Report" output\r\n+# 2) baitfile: SAINT formatted bait file generated in Galaxy\r\n+# 3) prey: Y or N for generating a prey file (requires internet connection)\r\n+#######################################################################################\r\n+mq_file = sys.argv[1]\r\n+cmd = r"Rscript /home/bornea/galaxy_moffitt_dev/tools/Moffitt_Tools/bubblebeam/pre_process_protein_name_set.R " + str(mq_file) \r\n+os.system(cmd)\r\n+\r\n+infile = "./tukeys_output.txt" #maxquant "Samples Report" output\r\n+prey = sys.argv[2] # Y or N\r\n+fasta_db = sys.argv[3]\r\n+if fasta_db == "None":\r\n+    fasta_db = "/home/bornea/galaxy_moffitt_dev/tools/Moffitt_Tools/bubblebeam/SwissProt_HUMAN_2014_08.fasta"\r\n+make_bait= sys.argv[6]\r\n+\r\n+def bait_create(baits, infile):\r\n+    #Takes the Bait specified by the user and makes them into a Bait file and includes a check to make sure they are using valid baits.\r\n+    baits = make_bait.split()\r\n+    i = 0\r\n+    bait_file_tmp = open("bait.txt", "wr")\r\n+    order = [] \r\n+    bait_cache = []\r\n+    while i < len(baits):\r\n+        if baits[i+2] == "true":\r\n+            T_C = "C"\r\n+        else:\r\n+            T_C = "T"\r\n+        line1 = 
baits[i] + "\\t" + baits[i+1] + "\\t" + T_C + "\\n"\r\n+        q = open(infile,"r")\r\n+        for line2 in q:\r\n+           line2 = line2.replace("\\"", "")\r\n+           line2 = line2.replace(r"Intensity.", "") #R coerces "-" into "." this changes them back and remove Intensity from the Bait names.\r\n+           line2 = line2.replace(r".", r"-")\r\n+           temp = line2.split()\r\n+           if "mapped_protein" in str(temp):\r\n+                #If the bait is in the original file then write to cache it if not exit.\r\n+                if baits[i] in temp:\r\n+                    number_bait = temp.index(str(baits[i]))\r\n+                    number_bait = number_bait - 9\r\n+                    bait_cache.append((number_bait, str(line1)))\r\n+                else:\r\n+                    print "Error: bad bait " + str(baits[i])\r\n+                    sys.exit()\r\n+           else: \r\n+                pass                    \r\n+        i = i + 3\r\n+    #Writes cache to file.\r\n+    bait_cache.sort()\r\n+    for line in bait_cache:\r\n+        bait_file_tmp.write(line[1])            \r\n+        \r\n+    bait_file_tmp.close()  \r\n+\r\n+\r\n+baitfile = "bait.txt"\r\n+\r\n+class ReturnValue1(object):\r\n+    def __init__(self, sequence, gene):\r\n+     self.seqlength = sequence\r\n+     self.genename = gene\r\n+class ReturnValue2(object):\r\n+    def __init__(self, getdata, getproteins, getheader):\r\n+        self.data = getdata\r\n+        self.proteins = getproteins\r\n+        self.header = getheader\r\n+\r\n+def main(maxquant_input, make_bait):  \r\n+    bait_create(make_bait, infile)\r\n+    baitfile = "bait.txt"\r\n+    #bait_check(baitfile, maxquant_input)\r\n+    make_inter(maxquant_input)\r\n+    if prey == \'true\':\r\n+        make_prey(maxquant_input)\r\n+        no_error_inter(maxquant_input)\r\n+        os.rename(\'prey.txt\', sys.argv[5])\r\n+    elif prey == \'false\':\r\n+        if os.path.isfile(\'error proteins.txt\') == 
True:\r\n+            no_error_inter(maxquant_input)\r\n+        pass\r\n+    elif prey != \'true\' or \'false\':\r\n+        sys.exit("Invalid Prey Argument: Y or N")\r\n+    os.rename(\'inter.txt\', sys.argv[4])\r\n+   '..b'tch]:\r\n+                    if match <= db_len:\r\n+                        seqlength = seqlength + len(lines[match].strip())\r\n+                        match = match + 1\r\n+                    else:\r\n+                        break\r\n+                return ReturnValue1(seqlength, genename)\r\n+        count = count + 1\r\n+        \r\n+\r\n+    if seqlength == 0:\r\n+        error.write(uniprot_accession_in + \'\\t\' + "Uniprot not in Fasta" + \'\\n\')\r\n+        error.close\r\n+        seqlength = \'NA\'\r\n+        genename = \'NA\'\r\n+        return ReturnValue1(seqlength, genename)\r\n+\r\n+\r\n+def readtab(infile):\r\n+    with open(infile,\'r\') as x: # read in tab-delim text\r\n+        output = []\r\n+        for line in x:\r\n+            line = line.strip()\r\n+            temp = line.split(\'\\t\')\r\n+            output.append(temp)\r\n+    return output\r\n+def read_maxquant(maxquant_input): # Get data, proteins and header from maxquant output\r\n+    dupes = readtab(maxquant_input)\r\n+    header_start = 0 \r\n+    header = dupes[header_start]\r\n+    for i in header:\r\n+        i = i.replace(r"\\"", "")\r\n+        i = i.replace(r"Intensity.", r"")\r\n+        i = i.replace(r".", r"-")\r\n+    data = dupes[header_start+1:len(dupes)] #cut off blank line and END OF FILE\r\n+    proteins = []\r\n+    for protein in data:\r\n+        proteins.append(protein[0])\r\n+    return ReturnValue2(data, proteins, header)\r\n+def make_inter(maxquant_input):\r\n+    bait = readtab(baitfile)\r\n+    data = read_maxquant(maxquant_input).data\r\n+    header = read_maxquant(maxquant_input).header\r\n+    proteins = read_maxquant(maxquant_input).proteins\r\n+    bait_index = []\r\n+    for i in bait:\r\n+        
bait_index.append(header.index("mapped_protein") + 1) # Find just the baits defined in bait file\r\n+    with open(\'inter.txt\', \'w\') as y:\r\n+            a = 0; l=0\r\n+            for bb in bait:\r\n+                for lst in data:\r\n+                    y.write(header[bait_index[l]] + \'\\t\' + bb[1] + \'\\t\' + proteins[a] + \'\\t\' + lst[bait_index[l]] + \'\\n\')\r\n+                    a+=1\r\n+                    if a == len(proteins):\r\n+                        a = 0; l+=1\r\n+def make_prey(maxquant_input):\r\n+    proteins = read_maxquant(maxquant_input).proteins\r\n+    output_file = open("prey.txt",\'w\')\r\n+    for a in proteins:\r\n+        a = a.replace("\\n","") # remove \\n for input into function\r\n+        a = a.replace("\\r","") # ditto for \\r\r\n+        seq = get_info(a).seqlength\r\n+        GN = get_info(a).genename\r\n+        if seq != \'NA\':\r\n+            output_file.write(a+"\\t"+str(seq)+ "\\t" + str(GN) + "\\n")\r\n+    output_file.close()\r\n+def no_error_inter(maxquant_input): # remake inter file without protein errors from Uniprot\r\n+    err = readtab("error proteins.txt")\r\n+    bait = readtab(baitfile)\r\n+    data = read_maxquant(maxquant_input).data\r\n+    header = read_maxquant(maxquant_input).header\r\n+    header = [i.replace(r"\\"", "") for i in header]\r\n+    header = [i.replace(r"Intensity.", r"") for i in header]\r\n+    header = [i.replace(r".", r"-") for i in header]\r\n+    print header\r\n+    bait_index = []\r\n+    for i in bait:\r\n+        bait_index.append(header.index(i[0]))\r\n+    proteins = read_maxquant(maxquant_input).proteins\r\n+    errors = []\r\n+    for e in err:\r\n+        errors.append(e[0])\r\n+    with open(\'inter.txt\', \'w\') as y:\r\n+        l = 0; a = 0\r\n+        for bb in bait:\r\n+            for lst in data:\r\n+                if proteins[a] not in errors:\r\n+                    y.write(header[bait_index[l]] + \'\\t\' + bb[1] + \'\\t\' + proteins[a] + \'\\t\' + 
lst[bait_index[l]] + \'\\n\')\r\n+                a+=1\r\n+                if a == len(proteins):\r\n+                    l += 1; a = 0\r\n+def bait_check(bait, maxquant_input): # check that bait names share header titles\r\n+    bait_in = readtab(bait)\r\n+    header = read_maxquant(maxquant_input).header\r\n+    for i in bait_in:\r\n+        if i[0] not in header:\r\n+            sys.exit("Bait must share header titles with maxquant output")\r\n+\r\n+if __name__ == \'__main__\':\r\n+    main(infile, make_bait)\r\n'
b
diff -r 000000000000 -r db1558c92262 saint_preproc/pre_process_protein_name_set.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/saint_preproc/pre_process_protein_name_set.R Tue Nov 10 12:59:50 2015 -0500
[
@@ -0,0 +1,77 @@
+library(data.table)
+library(affy)
+library(stringr)
+library(mygene)
+library(VennDiagram)
+#####
+#data
+# Reads a peptide table, maps each peptide to its UniProt proteins and writes one Tukey-biweight intensity per
+# protein and intensity column to ./tukeys_output.txt (tab-separated, with header).
+# Proteins without a recognisable UniProt accession, or absent from uniprot_names_file, are dropped.
+#   peptides_file:      path of the tab-delimited input; needs a "Proteins" column and "Intensity*" columns.
+#   uniprot_names_file: file listing the accepted UniProt accessions; defaults to the original hard-coded path.
+main <- function(peptides_file, uniprot_names_file = "/home/philip/galaxy/tools/Moffitt_Tools/uniprot_names.txt") {
+ peptides_txt = read.delim(peptides_file,header=TRUE,stringsAsFactors=FALSE,fill=TRUE)
+ intensity_columns = names(peptides_txt)[str_detect(names(peptides_txt),"Intensity\\.*")] #Pulls out all column names with Intensity in them.
+ intensity_columns = intensity_columns[-1] #Removes the first column that does not have a bait; [-1] stays correct when it is the only match.
+ peptides_txt_mapped = as.data.frame(map_peptides_proteins(peptides_txt)) #This function as below sets every line to a 1 to 1 intensity to each possible protein.
+ peptides_txt_mapped$Uniprot = str_extract(peptides_txt_mapped$mapped_protein, "[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}") #Pulls out just Uniprot id from the script.
+ peptides_txt_mapped = subset(peptides_txt_mapped,!is.na(Uniprot)) #removes reverse sequences and any that didn't match a uniprot accession
+ columns_comb = c("Uniprot", intensity_columns) 
+ peptides_mapped_intensity = subset(peptides_txt_mapped, select = columns_comb) #Subsets out only the needed columns for Tukeys (Uniprot IDS and baited intensities)
+ swissprot_fasta = scan(uniprot_names_file,what="character")
+ peptides_txt_mapped_log2 = peptides_mapped_intensity
+  # Takes the log2 of the intensities. 
+ for (i in intensity_columns) { 
+ peptides_txt_mapped_log2[,i] = log2(subset(peptides_txt_mapped_log2, select = i))
+ }
+  # log2(0) is -Inf; those cells are stored as 0 so they can be fed to the Tukey step below
+ peptides_txt_mapped_log2[peptides_txt_mapped_log2 == -Inf] <- 0
+  #uniprot accessions WITHOUT isoforms; it looks like only contaminants contain isoforms anyways
+ peptides_txt_mapped_log2$mapped_protein = str_extract(peptides_txt_mapped_log2$Uniprot,"[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}")
+  # Keeps only proteins listed in uniprot_names_file, then runs the Tukey function returning completed table
+  peptides_txt_mapped_log2 = subset(peptides_txt_mapped_log2,mapped_protein %in% swissprot_fasta)
+ protein_intensities_tukeys = get_protein_values(peptides_txt_mapped_log2,intensity_columns)
+  # get_protein_values returns 2^biweight, so a biweight of 0 comes back as exactly 1; report those cells as 0
+  protein_intensities_tukeys[protein_intensities_tukeys == 1] <- 0
+  write.table(protein_intensities_tukeys, "./tukeys_output.txt", row.names = FALSE, col.names = TRUE, quote = FALSE, sep = "\t")
+
+}
+
+# Expands peptides_in so each peptide row is repeated once per protein named in its ";"-separated Proteins field.
+# Returns the rbindlist() of the expanded rows with an extra mapped_protein column holding that single protein name;
+# rows whose Proteins field is empty are dropped first. seq_len/seq_along keep the loops from running on empty input.
+map_peptides_proteins = function(peptides_in) {
+    #reverse sequences are blank but have a razor protein indicating that they are reverse; exclude these for now
+    peptides_in = subset(peptides_in,peptides_in$Proteins != "")
+    results_list = list()
+    for (i in seq_len(nrow(peptides_in))) {
+        protein_names = peptides_in[i,"Proteins"]
+        protein_names_split = unlist(strsplit(protein_names,";"))
+        for (j in seq_along(protein_names_split)) {
+            peptides_mapped_proteins = data.frame(peptides_in[i,],mapped_protein=protein_names_split[j],stringsAsFactors=FALSE)
+            results_list[[length(results_list) + 1]] = peptides_mapped_proteins
+        }
+    }
+    return(rbindlist(results_list))
+}
+
+# Collapses peptide rows to one row per protein: for every intensity column the peptide values of a protein are
+# summarised with tukey.biweight() and transformed back with 2^x. Returns a data frame keyed by mapped_protein.
+get_protein_values = function(mapped_peptides_in,intensity_columns_list) {
+  unique_mapped_proteins_list = unique(mapped_peptides_in$mapped_protein) # distinct protein ids, one output row each
+  # Generates a blank data frame with columns of Intensities and rows of Uniprots.
+  Tukeys_df = data.frame(mapped_protein = unique_mapped_proteins_list, stringsAsFactors = FALSE ) 
+  for (q in intensity_columns_list) {Tukeys_df[[q]] = rep(NA, nrow(Tukeys_df))} # rep() also works for zero rows
+  for (i in seq_along(unique_mapped_proteins_list)) { # seq_along: no iterations when there are no proteins
+    mapped_peptides_unique_subset = subset(mapped_peptides_in, mapped_protein == unique_mapped_proteins_list[i])
+    for (j in intensity_columns_list) {
+      #calculate Tukey's Biweight from library(affy); returns a single numeric
+      Tukeys_df[i,j] = 2^(tukey.biweight(mapped_peptides_unique_subset[,j]))
+    }
+  }
+  return(Tukeys_df)
+}
+
+args <- commandArgs(trailingOnly = TRUE) # args[1] = path of the peptide table handed to main()
+if (length(args) < 1) stop("Usage: Rscript pre_process_protein_name_set.R <peptides_file>") else main(args[1])
b
diff -r 000000000000 -r db1558c92262 saint_preproc/uniprot_names.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/saint_preproc/uniprot_names.txt Tue Nov 10 12:59:50 2015 -0500
b
b'@@ -0,0 +1,218965 @@\n+"x"\n+\n+\n+\n+\n+\n+"P62258"\n+\n+\n+\n+\n+\n+"Q04917"\n+\n+\n+\n+\n+\n+"P61981"\n+\n+\n+\n+\n+\n+"P31947"\n+\n+\n+\n+\n+\n+"P27348"\n+\n+\n+\n+\n+\n+"P63104"\n+\n+\n+\n+\n+\n+"P30443"\n+\n+\n+\n+\n+\n+\n+\n+"P01892"\n+\n+\n+\n+\n+\n+\n+\n+"P04439"\n+\n+\n+\n+\n+\n+\n+\n+"P13746"\n+\n+\n+\n+\n+\n+\n+\n+"Q96QU6"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q4AC99"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"P30447"\n+\n+\n+\n+\n+\n+\n+\n+"P05534"\n+\n+\n+\n+\n+\n+\n+\n+"P18462"\n+\n+\n+\n+\n+\n+\n+\n+"P30450"\n+\n+\n+\n+\n+\n+\n+\n+"P30512"\n+\n+\n+\n+\n+\n+\n+\n+"P16188"\n+\n+\n+\n+\n+\n+\n+\n+"P16189"\n+\n+\n+\n+\n+\n+\n+\n+"P10314"\n+\n+\n+\n+\n+\n+\n+\n+"P16190"\n+\n+\n+\n+\n+\n+\n+\n+"P30453"\n+\n+\n+\n+\n+\n+\n+\n+"P30455"\n+\n+\n+\n+\n+\n+\n+\n+"P30456"\n+\n+\n+\n+\n+\n+\n+\n+"P30457"\n+\n+\n+\n+\n+\n+\n+\n+"P01891"\n+\n+\n+\n+\n+\n+\n+\n+"P10316"\n+\n+\n+\n+\n+\n+\n+\n+"P30459"\n+\n+\n+\n+\n+\n+\n+\n+"Q09160"\n+\n+\n+\n+\n+\n+\n+\n+"P01889"\n+\n+\n+\n+\n+\n+\n+\n+"P30460"\n+\n+\n+\n+\n+\n+\n+\n+"P30461"\n+\n+\n+\n+\n+\n+\n+\n+"P30462"\n+\n+\n+\n+\n+\n+\n+\n+"P30464"\n+\n+\n+\n+\n+\n+\n+\n+"P30466"\n+\n+\n+\n+\n+\n+\n+\n+"P03989"\n+\n+\n+\n+\n+\n+\n+\n+"P30685"\n+\n+\n+\n+\n+\n+\n+\n+"P18463"\n+\n+\n+\n+\n+\n+\n+\n+"Q95365"\n+\n+\n+\n+\n+\n+\n+\n+"P30475"\n+\n+\n+\n+\n+\n+\n+\n+"Q04826"\n+\n+\n+\n+\n+\n+\n+\n+"P30479"\n+\n+\n+\n+\n+\n+\n+\n+"P30480"\n+\n+\n+\n+\n+\n+\n+\n+"P30481"\n+\n+\n+\n+\n+\n+\n+\n+"P30483"\n+\n+\n+\n+\n+\n+\n+\n+"P30484"\n+\n+\n+\n+\n+\n+\n+\n+"P30486"\n+\n+\n+\n+\n+\n+\n+\n+"P30485"\n+\n+\n+\n+\n+\n+\n+\n+"P30487"\n+\n+\n+\n+\n+\n+\n+\n+"P30488"\n+\n+\n+\n+\n+\n+\n+\n+"P18464"\n+\n+\n+\n+\n+\n+\n+\n+"P30490"\n+\n+\n+\n+\n+\n+\n+\n+"P30491"\n+\n+\n+\n+\n+\n+\n+\n+"P30492"\n+\n+\n+\n+\n+\n+\n+\n+"P30493"\n+\n+\n+\n+\n+\n+\n+\n+"P30495"\n+\n+\n+\n+\n+\n+\n+\n+"P18465"\n+\n+\n+\n+\n+\n+\n+\n+"P10319"\n+\n+\n+\n+\n+\n+\n+\n+"Q29940"\n+\n+\n+\n+\n+\n+\n+\n+"Q29836"\n+\n+\n+\n+\n+\n+\n+\n+"P30498"\n+\n+\n+\n+\n+\n+\n+\n+"Q31612"\n+\n+\n+\n+\n+
\n+\n+\n+"Q31610"\n+\n+\n+\n+\n+\n+\n+\n+"Q29718"\n+\n+\n+\n+\n+\n+\n+\n+"P30499"\n+\n+\n+\n+\n+\n+\n+\n+"P30501"\n+\n+\n+\n+\n+\n+\n+\n+"P04222"\n+\n+\n+\n+\n+\n+\n+\n+"P30504"\n+\n+\n+\n+\n+\n+\n+\n+"Q9TNN7"\n+\n+\n+\n+\n+\n+\n+\n+"Q29963"\n+\n+\n+\n+\n+\n+\n+\n+"P10321"\n+\n+\n+\n+\n+\n+\n+\n+"P30505"\n+\n+\n+\n+\n+\n+\n+\n+"P30508"\n+\n+\n+\n+\n+\n+\n+\n+"P30510"\n+\n+\n+\n+\n+\n+\n+\n+"Q07000"\n+\n+\n+\n+\n+\n+\n+\n+"Q29960"\n+\n+\n+\n+\n+\n+\n+\n+"Q95604"\n+\n+\n+\n+\n+\n+\n+\n+"Q29865"\n+\n+\n+\n+\n+\n+\n+\n+"Q15172"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q15173"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q14738"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q16537"\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q13362"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"P30153"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"P30154"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"P63151"\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q00005"\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q66LE6"\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q9Y2T4"\n+\n+\n+\n+\n+\n+\n+\n+\n+"P01912"\n+\n+\n+\n+\n+\n+"P13760"\n+\n+\n+\n+\n+\n+"P04229"\n+\n+\n+\n+\n+\n+"P13761"\n+\n+\n+\n+\n+\n+"Q30134"\n+\n+\n+\n+\n+\n+"Q9TQE0"\n+\n+\n+\n+\n+\n+"Q30167"\n+\n+\n+\n+\n+\n+"P20039"\n+\n+\n+\n+\n+\n+"Q95IE3"\n+\n+\n+\n+\n+\n+"Q5Y7A7"\n+\n+\n+\n+\n+\n+"Q9GIY3"\n+\n+\n+\n+\n+\n+"P01911"\n+\n+\n+\n+\n+\n+"Q29974"\n+\n+\n+\n+\n+\n+"P14060"\n+\n+\n+\n+\n+\n+\n+\n+"P26439"\n+\n+\n+\n+\n+\n+\n+\n+"Q9H2F3"\n+\n+\n+\n+\n+\n+\n+\n+"Q9Y3L3"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"P78314"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q7L8J4"\n+\n+\n+\n+\n+\n+\n+\n+"O60239"\n+\n+\n+\n+\n+\n+\n+\n+\n+"P46952"\n+\n+\n+\n+\n+\n+"P31937"\n+\n+\n+\n+\n+\n+\n+"P29372"\n+\n+\n+\n+\n+\n+"P11171"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q13541"\n+\n+\n+"Q13542"\n+\n+\n+"O60516"\n+\n+\n+"Q9NRA8"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"P08195"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"P08908"\n+\n+\n+\n+\n+\n+\n+\n+\n+"P28222"\n+\n+\n+\n+\n+\n+\n+\n+"P28221"\n+\n+\n+\n+\n+\n+\n+\n+"P28566"\n+\n+\n+\n+\n+\n+\n+\n+"P30939"\n+\n+\n+\n+\n+\n+\n+\n+"P28223"\n+\n+\
n+\n+\n+\n+\n+\n+\n+"P41595"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"P28335"\n+\n+\n+\n+\n+\n+\n+\n+\n+"P46098"\n+\n+\n+\n+\n+\n+\n+\n+\n+"O95264"\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q8WXA8"\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q70Z44"\n+\n+\n+\n+\n+\n+\n+\n+\n+"A5X5Y0"\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q13639"\n+\n+\n+\n+\n+\n+\n+\n+"P47898"\n+\n+\n+\n+\n+\n+\n+"P50406"\n+\n+\n+\n+\n+\n+\n+\n+\n+"P34969"\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q96P26"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q9BXI3"\n+\n+\n+\n+\n+\n+\n+\n+"Q9H0P0"\n+\n+\n+\n+\n+\n+\n+"Q969T7"\n+\n+\n+\n+\n+\n+"P49902"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"P21589"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"P56378"\n+\n+"P52209"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"O95336"\n+\n+\n+\n+\n+\n+"P05408"\n+\n+\n+\n+\n+"P36639"\n+\n+\n+\n+\n+"P0DKL9"\n+\n+\n+\n+"Q8IZ83"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q676U5"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q8A4"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"P02763"\n+\n+\n+\n+\n+"P19652"\n+\n+\n+\n+\n+"P20848"\n+\n+\n+\n+\n+\n+\n+\n+"P01009"\n+\n+\n+\n+\n+\n+\n+\n+"P04217"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q9NQ94"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q8NF67"\n+\n+\n+\n+\n+\n+"Q5TYW2"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q5VUR7"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q5SQ80"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q4UJ75"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"A0PJZ0"\n+\n+\n+\n+'..b'\n+\n+\n+"P21506"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"P17017"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"P17020"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"P17021"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"P17022"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"P17014"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"P17024"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"P17026"\n+\n+\n+\n+\n+"P17023"\n+\n+\n+\n+\n+\n+\n+\n+\n+"P17028"\n+\n+\n+\n+\n+\n+\n+\n+"P17027"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"P17030"\n+\n+\n+\n+\n+\n+\n+\n+\n+"P17031"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q9BSG1"\n+\n+\n+\n+\n+\n+\n+\n+\n+"P17039"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"P17041"\n+\n+\n+\n+\n+\n+"P17035"\n+\n+\n+\n+\n+\n+\
n+\n+\n+\n+\n+\n+\n+"Q8IZ26"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"P17036"\n+\n+\n+\n+\n+\n+\n+\n+\n+"P13682"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"P51814"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"P17038"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q02386"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"P15621"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q96MX3"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q15929"\n+\n+\n+\n+"Q68EA5"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q6ZN08"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q15940"\n+\n+\n+\n+\n+"O43830"\n+\n+\n+\n+\n+\n+\n+"Q16587"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q9UC07"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"P36508"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q9NQZ8"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q15935"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"P17097"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q15937"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"P51504"\n+\n+\n+\n+\n+\n+"Q9UC06"\n+\n+\n+\n+\n+\n+\n+\n+\n+"P51508"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"P51522"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"P51523"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q03923"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"P17098"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q03938"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q05481"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"P35789"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"A6NK75"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q03936"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"A8MXY4"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q9P2E3"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"O43257"\n+\n+\n+\n+"Q0IIN9"\n+\n+\n+\n+\n+"Q15649"\n+\n+\n+\n+"Q9Y6M5"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q6XR72"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q8WWF5"\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q8ND25"\n+\n+\n+\n+\n+"Q9ULT6"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q9UHR6"\n+\n+\n+\n+\n+\n+\n+\n+"Q9BRI3"\n+\n+\n+\n+\n+\n+\n+"Q99726"\n+\n+\n+\n+\n+\n+\n+\n+"Q8TAD4"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q8NHG8"\n+\n+\n+\n+\n+\n+"Q6NXT4"\n+\n+\n+\n+\n+\n+\n+\n+\n+"O14863"\n+\n+\n+\n+\n+\n+\n+\n+\n+"
Q8NEW0"\n+\n+\n+\n+\n+\n+\n+\n+"Q8IWU4"\n+\n+\n+\n+\n+\n+\n+\n+"Q6PML9"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q07157"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q9UDY2"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"O95049"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"P60852"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q05996"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"P21754"\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q12836"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q6X784"\n+\n+\n+\n+\n+\n+\n+"Q9UK55"\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q9BS86"\n+\n+\n+\n+\n+\n+\n+"Q8TCW7"\n+\n+\n+\n+\n+\n+\n+\n+"O95218"\n+\n+\n+\n+\n+\n+\n+"O75312"\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q5FWF4"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q9UGI0"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q2KJ03"\n+\n+\n+\n+\n+"A6NJL1"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"P0CG00"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q96SZ4"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"A6NGD5"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"O43309"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q9BUG6"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q9H4T2"\n+\n+\n+\n+\n+\n+\n+"P17040"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q8TBC5"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"P10073"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q16670"\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q9Y5A6"\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q6NSZ9"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q3MJ62"\n+\n+\n+\n+\n+\n+\n+\n+"Q96LW9"\n+\n+\n+\n+\n+\n+\n+\n+"Q8IWY8"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q86W11"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q9NX65"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"O15535"\n+\n+\n+\n+\n+\n+\n+\n+"Q7Z7L9"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q8NBB4"\n+\n+\n+\n+\n+\n+\n+\n+"Q9BR11"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q8M6"\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q8NEG5"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q96MP5"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q9P217"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q9HCJ5"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"A7E2V4"\n+\n+\n+\n+\n+\n+\
n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q19AV6"\n+\n+\n+\n+"Q9H7M6"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q96AP4"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"O43264"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q9H900"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"O95229"\n+\n+\n+\n+\n+\n+"P98169"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q2QGD7"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q6WRX3"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"P98168"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q9C0D3"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q15942"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"O43149"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+"Q8IYH5"\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n'