diff search_ppep.py @ 0:dbff53e6f75f draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 3a7b3609d6e514c9e8f980ecb684960c6b2252fe
author galaxyp
date Mon, 11 Jul 2022 19:22:25 +0000
parents
children 08678c931f5d
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/search_ppep.py	Mon Jul 11 19:22:25 2022 +0000
@@ -0,0 +1,560 @@
+#!/usr/bin/env python
+# Search and memoize phosphopeptides in Swiss-Prot SQLite table UniProtKB
+
+import argparse
+import os.path
+import re
+import sqlite3
+import sys  # import the sys module for exc_info
+import time
+import traceback  # import the traceback module for format_exception
+from codecs import getreader as cx_getreader
+
+# For Aho-Corasick search for fixed set of substrings
+# - add_word
+# - make_automaton
+# - iter
+import ahocorasick
+
+
+# ref: https://stackoverflow.com/a/8915613/15509512
+#   answers: "How to handle exceptions in a list comprehensions"
+#   usage:
+#       from math import log
+#       eggs = [1,3,0,3,2]
+#       print([x for x in [catch(log, egg) for egg in eggs] if x is not None])
+#   producing:
+#       for <built-in function log>
+#         with args (0,)
+#         exception: math domain error
+#       [0.0, 1.0986122886681098, 1.0986122886681098, 0.6931471805599453]
+def catch(func, *args, handle=lambda e: e, **kwargs):
+
+    try:
+        return func(*args, **kwargs)
+    except Exception as e:
+        print("For %s" % str(func))
+        print("  with args %s" % str(args))
+        print("  caught exception: %s" % str(e))
+        (ty, va, tb) = sys.exc_info()
+        print("  stack trace: " + str(traceback.format_exception(ty, va, tb)))
+        # exit(-1)
+        return None  # was handle(e)
+
+
+def __main__():
+
+    DROP_TABLES_SQL = """
+        DROP VIEW  IF EXISTS ppep_gene_site_view;
+        DROP VIEW  IF EXISTS uniprot_view;
+        DROP VIEW  IF EXISTS uniprotkb_pep_ppep_view;
+        DROP VIEW  IF EXISTS ppep_intensity_view;
+        DROP VIEW  IF EXISTS ppep_metadata_view;
+
+        DROP TABLE IF EXISTS sample;
+        DROP TABLE IF EXISTS ppep;
+        DROP TABLE IF EXISTS site_type;
+        DROP TABLE IF EXISTS deppep_UniProtKB;
+        DROP TABLE IF EXISTS deppep;
+        DROP TABLE IF EXISTS ppep_gene_site;
+        DROP TABLE IF EXISTS ppep_metadata;
+        DROP TABLE IF EXISTS ppep_intensity;
+    """
+
+    CREATE_TABLES_SQL = """
+        CREATE TABLE deppep
+          ( id INTEGER PRIMARY KEY
+          , seq TEXT UNIQUE                            ON CONFLICT IGNORE
+          )
+          ;
+        CREATE TABLE deppep_UniProtKB
+          ( deppep_id    INTEGER REFERENCES deppep(id) ON DELETE CASCADE
+          , UniProtKB_id TEXT REFERENCES UniProtKB(id) ON DELETE CASCADE
+          , pos_start    INTEGER
+          , pos_end      INTEGER
+          , PRIMARY KEY (deppep_id, UniProtKB_id, pos_start, pos_end)
+                                                       ON CONFLICT IGNORE
+          )
+          ;
+        CREATE TABLE ppep
+          ( id        INTEGER PRIMARY KEY
+          , deppep_id INTEGER REFERENCES deppep(id)    ON DELETE CASCADE
+          , seq       TEXT UNIQUE                      ON CONFLICT IGNORE
+          , scrubbed  TEXT
+          );
+        CREATE TABLE site_type
+          ( id        INTEGER PRIMARY KEY
+          , type_name TEXT UNIQUE                      ON CONFLICT IGNORE
+          );
+        CREATE INDEX idx_ppep_scrubbed on ppep(scrubbed)
+          ;
+        CREATE TABLE sample
+          ( id        INTEGER PRIMARY KEY
+          , name      TEXT UNIQUE                      ON CONFLICT IGNORE
+          )
+          ;
+        CREATE VIEW uniprot_view AS
+          SELECT DISTINCT
+              Uniprot_ID
+            , Description
+            , Organism_Name
+            , Organism_ID
+            , Gene_Name
+            , PE
+            , SV
+            , Sequence
+            , Description ||
+                CASE WHEN Organism_Name = 'N/A'
+                     THEN ''
+                     ELSE ' OS='|| Organism_Name
+                     END ||
+                CASE WHEN Organism_ID = -1
+                     THEN ''
+                     ELSE ' OX='|| Organism_ID
+                     END ||
+                CASE WHEN Gene_Name = 'N/A'
+                     THEN ''
+                     ELSE ' GN='|| Gene_Name
+                     END ||
+                CASE WHEN PE = 'N/A'
+                     THEN ''
+                     ELSE ' PE='|| PE
+                     END ||
+                CASE WHEN SV = 'N/A'
+                     THEN ''
+                     ELSE ' SV='|| SV
+                     END AS long_description
+            , Database
+          FROM UniProtKB
+          ;
+        CREATE VIEW uniprotkb_pep_ppep_view AS
+          SELECT   deppep_UniProtKB.UniprotKB_ID       AS accession
+                 , deppep_UniProtKB.pos_start          AS pos_start
+                 , deppep_UniProtKB.pos_end            AS pos_end
+                 , deppep.seq                          AS peptide
+                 , ppep.seq                            AS phosphopeptide
+                 , ppep.scrubbed                       AS scrubbed
+                 , uniprot_view.Sequence               AS sequence
+                 , uniprot_view.Description            AS description
+                 , uniprot_view.long_description       AS long_description
+                 , ppep.id                             AS ppep_id
+          FROM     ppep, deppep, deppep_UniProtKB, uniprot_view
+          WHERE    deppep.id = ppep.deppep_id
+          AND      deppep.id = deppep_UniProtKB.deppep_id
+          AND      deppep_UniProtKB.UniprotKB_ID = uniprot_view.Uniprot_ID
+          ORDER BY UniprotKB_ID, deppep.seq, ppep.seq
+          ;
+        CREATE TABLE ppep_gene_site
+          ( ppep_id         INTEGER REFERENCES ppep(id)
+          , gene_names      TEXT
+          , site_type_id    INTEGER REFERENCES site_type(id)
+          , kinase_map      TEXT
+          , PRIMARY KEY (ppep_id, kinase_map)          ON CONFLICT IGNORE
+          )
+          ;
+        CREATE VIEW ppep_gene_site_view AS
+          SELECT DISTINCT
+            ppep.seq   AS phospho_peptide
+          , ppep_id
+          , gene_names
+          , type_name
+          , kinase_map
+          FROM
+            ppep, ppep_gene_site, site_type
+          WHERE
+              ppep_gene_site.ppep_id = ppep.id
+            AND
+              ppep_gene_site.site_type_id = site_type.id
+          ORDER BY
+            ppep.seq
+            ;
+        CREATE TABLE ppep_metadata
+          ( ppep_id             INTEGER REFERENCES ppep(id)
+          , protein_description TEXT
+          , gene_name           TEXT
+          , FASTA_name          TEXT
+          , phospho_sites       TEXT
+          , motifs_unique       TEXT
+          , accessions          TEXT
+          , motifs_all_members  TEXT
+          , domain              TEXT
+          , ON_FUNCTION         TEXT
+          , ON_PROCESS          TEXT
+          , ON_PROT_INTERACT    TEXT
+          , ON_OTHER_INTERACT   TEXT
+          , notes               TEXT
+          , PRIMARY KEY (ppep_id)                      ON CONFLICT IGNORE
+          )
+          ;
+        CREATE VIEW ppep_metadata_view AS
+          SELECT DISTINCT
+              ppep.seq             AS phospho_peptide
+            , protein_description
+            , gene_name
+            , FASTA_name
+            , phospho_sites
+            , motifs_unique
+            , accessions
+            , motifs_all_members
+            , domain
+            , ON_FUNCTION
+            , ON_PROCESS
+            , ON_PROT_INTERACT
+            , ON_OTHER_INTERACT
+            , notes
+          FROM
+            ppep, ppep_metadata
+          WHERE
+              ppep_metadata.ppep_id = ppep.id
+          ORDER BY
+            ppep.seq
+            ;
+        CREATE TABLE ppep_intensity
+          ( ppep_id    INTEGER REFERENCES ppep(id)
+          , sample_id  INTEGER
+          , intensity  INTEGER
+          , PRIMARY KEY (ppep_id, sample_id)           ON CONFLICT IGNORE
+          )
+          ;
+        CREATE VIEW ppep_intensity_view AS
+          SELECT DISTINCT
+              ppep.seq             AS phospho_peptide
+            , sample.name          AS sample
+            , intensity
+          FROM
+            ppep, sample, ppep_intensity
+          WHERE
+              ppep_intensity.sample_id = sample.id
+            AND
+              ppep_intensity.ppep_id = ppep.id
+          ;
+    """
+
+    UNIPROT_SEQ_AND_ID_SQL = """
+        select    Sequence, Uniprot_ID
+             from UniProtKB
+    """
+
+    # Parse Command Line
+    parser = argparse.ArgumentParser(
+        description="Phopsphoproteomic Enrichment phosphopeptide SwissProt search (in place in SQLite DB)."
+    )
+
+    # inputs:
+    #   Phosphopeptide data for experimental results, including the intensities
+    #   and the mapping to kinase domains, in tabular format.
+    parser.add_argument(
+        "--phosphopeptides",
+        "-p",
+        nargs=1,
+        required=True,
+        dest="phosphopeptides",
+        help="Phosphopeptide data for experimental results, generated by the Phopsphoproteomic Enrichment Localization Filter tool",
+    )
+    parser.add_argument(
+        "--uniprotkb",
+        "-u",
+        nargs=1,
+        required=True,
+        dest="uniprotkb",
+        help="UniProtKB/Swiss-Prot data, converted from FASTA format by the Phopsphoproteomic Enrichment Kinase Mapping tool",
+    )
+    parser.add_argument(
+        "--schema",
+        action="store_true",
+        dest="db_schema",
+        help="show updated database schema",
+    )
+    parser.add_argument(
+        "--warn-duplicates",
+        action="store_true",
+        dest="warn_duplicates",
+        help="show warnings for duplicated sequences",
+    )
+    parser.add_argument(
+        "--verbose",
+        action="store_true",
+        dest="verbose",
+        help="show somewhat verbose program tracing",
+    )
+    # "Make it so!" (parse the arguments)
+    options = parser.parse_args()
+    if options.verbose:
+        print("options: " + str(options) + "\n")
+
+    # path to phosphopeptide (e.g., "outputfile_STEP2.txt") input tabular file
+    if options.phosphopeptides is None:
+        exit('Argument "phosphopeptides" is required but not supplied')
+    try:
+        f_name = os.path.abspath(options.phosphopeptides[0])
+    except Exception as e:
+        exit("Error parsing phosphopeptides argument: %s" % (e))
+
+    # path to SQLite input/output tabular file
+    if options.uniprotkb is None:
+        exit('Argument "uniprotkb" is required but not supplied')
+    try:
+        db_name = os.path.abspath(options.uniprotkb[0])
+    except Exception as e:
+        exit("Error parsing uniprotkb argument: %s" % (e))
+
+    # print("options.schema is %d" % options.db_schema)
+
+    # db_name = "demo/test.sqlite"
+    # f_name  = "demo/test_input.txt"
+
+    con = sqlite3.connect(db_name)
+    cur = con.cursor()
+    ker = con.cursor()
+
+    cur.executescript(DROP_TABLES_SQL)
+
+    # if options.db_schema:
+    #     print("\nAfter dropping tables/views that are to be created, schema is:")
+    #     cur.execute("SELECT * FROM sqlite_schema")
+    #     for row in cur.fetchall():
+    #         if row[4] is not None:
+    #             print("%s;" % row[4])
+
+    cur.executescript(CREATE_TABLES_SQL)
+
+    if options.db_schema:
+        print(
+            "\nAfter creating tables/views that are to be created, schema is:"
+        )
+        cur.execute("SELECT * FROM sqlite_schema")
+        for row in cur.fetchall():
+            if row[4] is not None:
+                print("%s;" % row[4])
+
+    def generate_ppep(f):
+        # get keys from upstream tabular file using readline()
+        # ref: https://stackoverflow.com/a/16713581/15509512
+        #      answer to "Use codecs to read file with correct encoding"
+        file1_encoded = open(f, "rb")
+        file1 = cx_getreader("latin-1")(file1_encoded)
+
+        count = 0
+        re_tab = re.compile("^[^\t]*")
+        re_quote = re.compile('"')
+        while True:
+            count += 1
+            # Get next line from file
+            line = file1.readline()
+            # if line is empty
+            # end of file is reached
+            if not line:
+                break
+            if count > 1:
+                m = re_tab.match(line)
+                m = re_quote.sub("", m[0])
+                yield m
+        file1.close()
+        file1_encoded.close()
+
+    # Build an Aho-Corasick automaton from a trie
+    # - ref:
+    #   - https://pypi.org/project/pyahocorasick/
+    #   - https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm
+    #   - https://en.wikipedia.org/wiki/Trie
+    auto = ahocorasick.Automaton()
+    re_phos = re.compile("p")
+    # scrub out unsearchable characters per section
+    #   "Match the p_peptides to the @sequences array:"
+    # of the original
+    #   PhosphoPeptide Upstream Kinase Mapping.pl
+    # which originally read
+    #   $tmp_p_peptide =~ s/#//g;
+    #   $tmp_p_peptide =~ s/\d//g;
+    #   $tmp_p_peptide =~ s/\_//g;
+    #   $tmp_p_peptide =~ s/\.//g;
+    #
+    re_scrub = re.compile("0-9_.#")
+    ppep_count = 0
+    for ppep in generate_ppep(f_name):
+        ppep_count += 1
+        add_to_trie = False
+        # print(ppep)
+        scrubbed = re_scrub.sub("", ppep)
+        deppep = re_phos.sub("", scrubbed)
+        if options.verbose:
+            print("deppep: %s; scrubbed: %s" % (deppep, scrubbed))
+        # print(deppep)
+        cur.execute("SELECT id FROM deppep WHERE seq = (?)", (deppep,))
+        if cur.fetchone() is None:
+            add_to_trie = True
+        cur.execute("INSERT INTO deppep(seq) VALUES (?)", (deppep,))
+        cur.execute("SELECT id FROM deppep WHERE seq = (?)", (deppep,))
+        deppep_id = cur.fetchone()[0]
+        if add_to_trie:
+            # print((deppep_id, deppep))
+            # Build the trie
+            auto.add_word(deppep, (deppep_id, deppep))
+        cur.execute(
+            "INSERT INTO ppep(seq, scrubbed, deppep_id) VALUES (?,?,?)",
+            (ppep, scrubbed, deppep_id),
+        )
+    # def generate_deppep():
+    #     cur.execute("SELECT seq FROM deppep")
+    #     for row in cur.fetchall():
+    #         yield row[0]
+    cur.execute("SELECT count(*) FROM (SELECT seq FROM deppep GROUP BY seq)")
+    for row in cur.fetchall():
+        deppep_count = row[0]
+
+    cur.execute(
+        "SELECT count(*) FROM (SELECT Sequence FROM UniProtKB GROUP BY Sequence)"
+    )
+    for row in cur.fetchall():
+        sequence_count = row[0]
+
+    print("%d phosphopeptides were read from input" % ppep_count)
+    print(
+        "%d corresponding dephosphopeptides are represented in input"
+        % deppep_count
+    )
+    # Look for cases where both Gene_Name and Sequence are identical
+    cur.execute(
+        """
+      SELECT Uniprot_ID, Gene_Name, Sequence
+      FROM   UniProtKB
+      WHERE  Sequence IN (
+        SELECT   Sequence
+        FROM     UniProtKB
+        GROUP BY Sequence, Gene_Name
+        HAVING   count(*) > 1
+        )
+      ORDER BY Sequence
+      """
+    )
+    duplicate_count = 0
+    old_seq = ""
+    for row in cur.fetchall():
+        if duplicate_count == 0:
+            print(
+                "\nEach of the following sequences is associated with several accession IDs (which are listed in the first column) but the same gene ID (which is listed in the second column)."
+            )
+        if row[2] != old_seq:
+            old_seq = row[2]
+            duplicate_count += 1
+            if options.warn_duplicates:
+                print("\n%s\t%s\t%s" % row)
+        else:
+            if options.warn_duplicates:
+                print("%s\t%s" % (row[0], row[1]))
+    if duplicate_count > 0:
+        print(
+            "\n%d sequences have duplicated accession IDs\n" % duplicate_count
+        )
+
+    print("%s accession sequences will be searched\n" % sequence_count)
+
+    # print(auto.dump())
+
+    # Convert the trie to an automaton (a finite-state machine)
+    auto.make_automaton()
+
+    # Execute query for seqs and metadata without fetching the results yet
+    uniprot_seq_and_id = cur.execute(UNIPROT_SEQ_AND_ID_SQL)
+    while 1:
+        batch = uniprot_seq_and_id.fetchmany(size=50)
+        if not batch:
+            break
+        for Sequence, UniProtKB_id in batch:
+            if Sequence is not None:
+                for end_index, (insert_order, original_value) in auto.iter(
+                    Sequence
+                ):
+                    ker.execute(
+                        """
+                      INSERT INTO deppep_UniProtKB
+                        (deppep_id,UniProtKB_id,pos_start,pos_end)
+                      VALUES (?,?,?,?)
+                      """,
+                        (
+                            insert_order,
+                            UniProtKB_id,
+                            1 + end_index - len(original_value),
+                            end_index,
+                        ),
+                    )
+            else:
+                raise ValueError(
+                    "UniProtKB_id %s, but Sequence is None: Check whether SwissProt file is missing sequence for this ID"
+                    % (UniProtKB_id,)
+                )
+    ker.execute(
+        """
+        SELECT   count(*) || ' accession-peptide-phosphopeptide combinations were found'
+        FROM     uniprotkb_pep_ppep_view
+        """
+    )
+    for row in ker.fetchall():
+        print(row[0])
+
+    ker.execute(
+        """
+      SELECT   count(*) || ' accession matches were found', count(*) AS accession_count
+      FROM     (
+        SELECT   accession
+        FROM     uniprotkb_pep_ppep_view
+        GROUP BY accession
+        )
+      """
+    )
+    for row in ker.fetchall():
+        print(row[0])
+
+    ker.execute(
+        """
+      SELECT   count(*) || ' peptide matches were found'
+      FROM     (
+        SELECT   peptide
+        FROM     uniprotkb_pep_ppep_view
+        GROUP BY peptide
+        )
+      """
+    )
+    for row in ker.fetchall():
+        print(row[0])
+
+    ker.execute(
+        """
+      SELECT   count(*) || ' phosphopeptide matches were found', count(*) AS phosphopeptide_count
+      FROM     (
+        SELECT   phosphopeptide
+        FROM     uniprotkb_pep_ppep_view
+        GROUP BY phosphopeptide
+        )
+      """
+    )
+    for row in ker.fetchall():
+        print(row[0])
+
+    # link peptides not found in sequence database to a dummy sequence-record
+    ker.execute(
+        """
+        INSERT INTO deppep_UniProtKB(deppep_id,UniProtKB_id,pos_start,pos_end)
+          SELECT id, 'No Uniprot_ID', 0, 0
+          FROM   deppep
+          WHERE  id NOT IN (SELECT deppep_id FROM deppep_UniProtKB)
+        """
+    )
+
+    con.commit()
+    ker.execute("vacuum")
+    con.close()
+
+
+if __name__ == "__main__":
+    wrap_start_time = time.perf_counter()
+    __main__()
+    wrap_stop_time = time.perf_counter()
+    # print(wrap_start_time)
+    # print(wrap_stop_time)
+    print(
+        "\nThe matching process took %d milliseconds to run.\n"
+        % ((wrap_stop_time - wrap_start_time) * 1000),
+    )
+
+# vim: sw=4 ts=4 et ai :