comparison search_ppep.py @ 0:8dfd5d2b5903 draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 3a7b3609d6e514c9e8f980ecb684960c6b2252fe
author galaxyp
date Mon, 11 Jul 2022 19:22:54 +0000
parents
children b76c75521d91
comparison
equal deleted inserted replaced
-1:000000000000 0:8dfd5d2b5903
1 #!/usr/bin/env python
2 # Search and memoize phosphopeptides in Swiss-Prot SQLite table UniProtKB
3
4 import argparse
5 import os.path
6 import re
7 import sqlite3
8 import sys # import the sys module for exc_info
9 import time
10 import traceback # import the traceback module for format_exception
11 from codecs import getreader as cx_getreader
12
13 # For Aho-Corasick search for fixed set of substrings
14 # - add_word
15 # - make_automaton
16 # - iter
17 import ahocorasick
18
19
20 # ref: https://stackoverflow.com/a/8915613/15509512
21 # answers: "How to handle exceptions in a list comprehensions"
22 # usage:
23 # from math import log
24 # eggs = [1,3,0,3,2]
25 # print([x for x in [catch(log, egg) for egg in eggs] if x is not None])
26 # producing:
27 # for <built-in function log>
28 # with args (0,)
29 # exception: math domain error
30 # [0.0, 1.0986122886681098, 1.0986122886681098, 0.6931471805599453]
31 def catch(func, *args, handle=lambda e: e, **kwargs):
32
33 try:
34 return func(*args, **kwargs)
35 except Exception as e:
36 print("For %s" % str(func))
37 print(" with args %s" % str(args))
38 print(" caught exception: %s" % str(e))
39 (ty, va, tb) = sys.exc_info()
40 print(" stack trace: " + str(traceback.format_exception(ty, va, tb)))
41 # exit(-1)
42 return None # was handle(e)
43
44
45 def __main__():
46
47 DROP_TABLES_SQL = """
48 DROP VIEW IF EXISTS ppep_gene_site_view;
49 DROP VIEW IF EXISTS uniprot_view;
50 DROP VIEW IF EXISTS uniprotkb_pep_ppep_view;
51 DROP VIEW IF EXISTS ppep_intensity_view;
52 DROP VIEW IF EXISTS ppep_metadata_view;
53
54 DROP TABLE IF EXISTS sample;
55 DROP TABLE IF EXISTS ppep;
56 DROP TABLE IF EXISTS site_type;
57 DROP TABLE IF EXISTS deppep_UniProtKB;
58 DROP TABLE IF EXISTS deppep;
59 DROP TABLE IF EXISTS ppep_gene_site;
60 DROP TABLE IF EXISTS ppep_metadata;
61 DROP TABLE IF EXISTS ppep_intensity;
62 """
63
64 CREATE_TABLES_SQL = """
65 CREATE TABLE deppep
66 ( id INTEGER PRIMARY KEY
67 , seq TEXT UNIQUE ON CONFLICT IGNORE
68 )
69 ;
70 CREATE TABLE deppep_UniProtKB
71 ( deppep_id INTEGER REFERENCES deppep(id) ON DELETE CASCADE
72 , UniProtKB_id TEXT REFERENCES UniProtKB(id) ON DELETE CASCADE
73 , pos_start INTEGER
74 , pos_end INTEGER
75 , PRIMARY KEY (deppep_id, UniProtKB_id, pos_start, pos_end)
76 ON CONFLICT IGNORE
77 )
78 ;
79 CREATE TABLE ppep
80 ( id INTEGER PRIMARY KEY
81 , deppep_id INTEGER REFERENCES deppep(id) ON DELETE CASCADE
82 , seq TEXT UNIQUE ON CONFLICT IGNORE
83 , scrubbed TEXT
84 );
85 CREATE TABLE site_type
86 ( id INTEGER PRIMARY KEY
87 , type_name TEXT UNIQUE ON CONFLICT IGNORE
88 );
89 CREATE INDEX idx_ppep_scrubbed on ppep(scrubbed)
90 ;
91 CREATE TABLE sample
92 ( id INTEGER PRIMARY KEY
93 , name TEXT UNIQUE ON CONFLICT IGNORE
94 )
95 ;
96 CREATE VIEW uniprot_view AS
97 SELECT DISTINCT
98 Uniprot_ID
99 , Description
100 , Organism_Name
101 , Organism_ID
102 , Gene_Name
103 , PE
104 , SV
105 , Sequence
106 , Description ||
107 CASE WHEN Organism_Name = 'N/A'
108 THEN ''
109 ELSE ' OS='|| Organism_Name
110 END ||
111 CASE WHEN Organism_ID = -1
112 THEN ''
113 ELSE ' OX='|| Organism_ID
114 END ||
115 CASE WHEN Gene_Name = 'N/A'
116 THEN ''
117 ELSE ' GN='|| Gene_Name
118 END ||
119 CASE WHEN PE = 'N/A'
120 THEN ''
121 ELSE ' PE='|| PE
122 END ||
123 CASE WHEN SV = 'N/A'
124 THEN ''
125 ELSE ' SV='|| SV
126 END AS long_description
127 , Database
128 FROM UniProtKB
129 ;
130 CREATE VIEW uniprotkb_pep_ppep_view AS
131 SELECT deppep_UniProtKB.UniprotKB_ID AS accession
132 , deppep_UniProtKB.pos_start AS pos_start
133 , deppep_UniProtKB.pos_end AS pos_end
134 , deppep.seq AS peptide
135 , ppep.seq AS phosphopeptide
136 , ppep.scrubbed AS scrubbed
137 , uniprot_view.Sequence AS sequence
138 , uniprot_view.Description AS description
139 , uniprot_view.long_description AS long_description
140 , ppep.id AS ppep_id
141 FROM ppep, deppep, deppep_UniProtKB, uniprot_view
142 WHERE deppep.id = ppep.deppep_id
143 AND deppep.id = deppep_UniProtKB.deppep_id
144 AND deppep_UniProtKB.UniprotKB_ID = uniprot_view.Uniprot_ID
145 ORDER BY UniprotKB_ID, deppep.seq, ppep.seq
146 ;
147 CREATE TABLE ppep_gene_site
148 ( ppep_id INTEGER REFERENCES ppep(id)
149 , gene_names TEXT
150 , site_type_id INTEGER REFERENCES site_type(id)
151 , kinase_map TEXT
152 , PRIMARY KEY (ppep_id, kinase_map) ON CONFLICT IGNORE
153 )
154 ;
155 CREATE VIEW ppep_gene_site_view AS
156 SELECT DISTINCT
157 ppep.seq AS phospho_peptide
158 , ppep_id
159 , gene_names
160 , type_name
161 , kinase_map
162 FROM
163 ppep, ppep_gene_site, site_type
164 WHERE
165 ppep_gene_site.ppep_id = ppep.id
166 AND
167 ppep_gene_site.site_type_id = site_type.id
168 ORDER BY
169 ppep.seq
170 ;
171 CREATE TABLE ppep_metadata
172 ( ppep_id INTEGER REFERENCES ppep(id)
173 , protein_description TEXT
174 , gene_name TEXT
175 , FASTA_name TEXT
176 , phospho_sites TEXT
177 , motifs_unique TEXT
178 , accessions TEXT
179 , motifs_all_members TEXT
180 , domain TEXT
181 , ON_FUNCTION TEXT
182 , ON_PROCESS TEXT
183 , ON_PROT_INTERACT TEXT
184 , ON_OTHER_INTERACT TEXT
185 , notes TEXT
186 , PRIMARY KEY (ppep_id) ON CONFLICT IGNORE
187 )
188 ;
189 CREATE VIEW ppep_metadata_view AS
190 SELECT DISTINCT
191 ppep.seq AS phospho_peptide
192 , protein_description
193 , gene_name
194 , FASTA_name
195 , phospho_sites
196 , motifs_unique
197 , accessions
198 , motifs_all_members
199 , domain
200 , ON_FUNCTION
201 , ON_PROCESS
202 , ON_PROT_INTERACT
203 , ON_OTHER_INTERACT
204 , notes
205 FROM
206 ppep, ppep_metadata
207 WHERE
208 ppep_metadata.ppep_id = ppep.id
209 ORDER BY
210 ppep.seq
211 ;
212 CREATE TABLE ppep_intensity
213 ( ppep_id INTEGER REFERENCES ppep(id)
214 , sample_id INTEGER
215 , intensity INTEGER
216 , PRIMARY KEY (ppep_id, sample_id) ON CONFLICT IGNORE
217 )
218 ;
219 CREATE VIEW ppep_intensity_view AS
220 SELECT DISTINCT
221 ppep.seq AS phospho_peptide
222 , sample.name AS sample
223 , intensity
224 FROM
225 ppep, sample, ppep_intensity
226 WHERE
227 ppep_intensity.sample_id = sample.id
228 AND
229 ppep_intensity.ppep_id = ppep.id
230 ;
231 """
232
233 UNIPROT_SEQ_AND_ID_SQL = """
234 select Sequence, Uniprot_ID
235 from UniProtKB
236 """
237
238 # Parse Command Line
239 parser = argparse.ArgumentParser(
240 description="Phopsphoproteomic Enrichment phosphopeptide SwissProt search (in place in SQLite DB)."
241 )
242
243 # inputs:
244 # Phosphopeptide data for experimental results, including the intensities
245 # and the mapping to kinase domains, in tabular format.
246 parser.add_argument(
247 "--phosphopeptides",
248 "-p",
249 nargs=1,
250 required=True,
251 dest="phosphopeptides",
252 help="Phosphopeptide data for experimental results, generated by the Phopsphoproteomic Enrichment Localization Filter tool",
253 )
254 parser.add_argument(
255 "--uniprotkb",
256 "-u",
257 nargs=1,
258 required=True,
259 dest="uniprotkb",
260 help="UniProtKB/Swiss-Prot data, converted from FASTA format by the Phopsphoproteomic Enrichment Kinase Mapping tool",
261 )
262 parser.add_argument(
263 "--schema",
264 action="store_true",
265 dest="db_schema",
266 help="show updated database schema",
267 )
268 parser.add_argument(
269 "--warn-duplicates",
270 action="store_true",
271 dest="warn_duplicates",
272 help="show warnings for duplicated sequences",
273 )
274 parser.add_argument(
275 "--verbose",
276 action="store_true",
277 dest="verbose",
278 help="show somewhat verbose program tracing",
279 )
280 # "Make it so!" (parse the arguments)
281 options = parser.parse_args()
282 if options.verbose:
283 print("options: " + str(options) + "\n")
284
285 # path to phosphopeptide (e.g., "outputfile_STEP2.txt") input tabular file
286 if options.phosphopeptides is None:
287 exit('Argument "phosphopeptides" is required but not supplied')
288 try:
289 f_name = os.path.abspath(options.phosphopeptides[0])
290 except Exception as e:
291 exit("Error parsing phosphopeptides argument: %s" % (e))
292
293 # path to SQLite input/output tabular file
294 if options.uniprotkb is None:
295 exit('Argument "uniprotkb" is required but not supplied')
296 try:
297 db_name = os.path.abspath(options.uniprotkb[0])
298 except Exception as e:
299 exit("Error parsing uniprotkb argument: %s" % (e))
300
301 # print("options.schema is %d" % options.db_schema)
302
303 # db_name = "demo/test.sqlite"
304 # f_name = "demo/test_input.txt"
305
306 con = sqlite3.connect(db_name)
307 cur = con.cursor()
308 ker = con.cursor()
309
310 cur.executescript(DROP_TABLES_SQL)
311
312 # if options.db_schema:
313 # print("\nAfter dropping tables/views that are to be created, schema is:")
314 # cur.execute("SELECT * FROM sqlite_schema")
315 # for row in cur.fetchall():
316 # if row[4] is not None:
317 # print("%s;" % row[4])
318
319 cur.executescript(CREATE_TABLES_SQL)
320
321 if options.db_schema:
322 print(
323 "\nAfter creating tables/views that are to be created, schema is:"
324 )
325 cur.execute("SELECT * FROM sqlite_schema")
326 for row in cur.fetchall():
327 if row[4] is not None:
328 print("%s;" % row[4])
329
330 def generate_ppep(f):
331 # get keys from upstream tabular file using readline()
332 # ref: https://stackoverflow.com/a/16713581/15509512
333 # answer to "Use codecs to read file with correct encoding"
334 file1_encoded = open(f, "rb")
335 file1 = cx_getreader("latin-1")(file1_encoded)
336
337 count = 0
338 re_tab = re.compile("^[^\t]*")
339 re_quote = re.compile('"')
340 while True:
341 count += 1
342 # Get next line from file
343 line = file1.readline()
344 # if line is empty
345 # end of file is reached
346 if not line:
347 break
348 if count > 1:
349 m = re_tab.match(line)
350 m = re_quote.sub("", m[0])
351 yield m
352 file1.close()
353 file1_encoded.close()
354
355 # Build an Aho-Corasick automaton from a trie
356 # - ref:
357 # - https://pypi.org/project/pyahocorasick/
358 # - https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm
359 # - https://en.wikipedia.org/wiki/Trie
360 auto = ahocorasick.Automaton()
361 re_phos = re.compile("p")
362 # scrub out unsearchable characters per section
363 # "Match the p_peptides to the @sequences array:"
364 # of the original
365 # PhosphoPeptide Upstream Kinase Mapping.pl
366 # which originally read
367 # $tmp_p_peptide =~ s/#//g;
368 # $tmp_p_peptide =~ s/\d//g;
369 # $tmp_p_peptide =~ s/\_//g;
370 # $tmp_p_peptide =~ s/\.//g;
371 #
372 re_scrub = re.compile("0-9_.#")
373 ppep_count = 0
374 for ppep in generate_ppep(f_name):
375 ppep_count += 1
376 add_to_trie = False
377 # print(ppep)
378 scrubbed = re_scrub.sub("", ppep)
379 deppep = re_phos.sub("", scrubbed)
380 if options.verbose:
381 print("deppep: %s; scrubbed: %s" % (deppep, scrubbed))
382 # print(deppep)
383 cur.execute("SELECT id FROM deppep WHERE seq = (?)", (deppep,))
384 if cur.fetchone() is None:
385 add_to_trie = True
386 cur.execute("INSERT INTO deppep(seq) VALUES (?)", (deppep,))
387 cur.execute("SELECT id FROM deppep WHERE seq = (?)", (deppep,))
388 deppep_id = cur.fetchone()[0]
389 if add_to_trie:
390 # print((deppep_id, deppep))
391 # Build the trie
392 auto.add_word(deppep, (deppep_id, deppep))
393 cur.execute(
394 "INSERT INTO ppep(seq, scrubbed, deppep_id) VALUES (?,?,?)",
395 (ppep, scrubbed, deppep_id),
396 )
397 # def generate_deppep():
398 # cur.execute("SELECT seq FROM deppep")
399 # for row in cur.fetchall():
400 # yield row[0]
401 cur.execute("SELECT count(*) FROM (SELECT seq FROM deppep GROUP BY seq)")
402 for row in cur.fetchall():
403 deppep_count = row[0]
404
405 cur.execute(
406 "SELECT count(*) FROM (SELECT Sequence FROM UniProtKB GROUP BY Sequence)"
407 )
408 for row in cur.fetchall():
409 sequence_count = row[0]
410
411 print("%d phosphopeptides were read from input" % ppep_count)
412 print(
413 "%d corresponding dephosphopeptides are represented in input"
414 % deppep_count
415 )
416 # Look for cases where both Gene_Name and Sequence are identical
417 cur.execute(
418 """
419 SELECT Uniprot_ID, Gene_Name, Sequence
420 FROM UniProtKB
421 WHERE Sequence IN (
422 SELECT Sequence
423 FROM UniProtKB
424 GROUP BY Sequence, Gene_Name
425 HAVING count(*) > 1
426 )
427 ORDER BY Sequence
428 """
429 )
430 duplicate_count = 0
431 old_seq = ""
432 for row in cur.fetchall():
433 if duplicate_count == 0:
434 print(
435 "\nEach of the following sequences is associated with several accession IDs (which are listed in the first column) but the same gene ID (which is listed in the second column)."
436 )
437 if row[2] != old_seq:
438 old_seq = row[2]
439 duplicate_count += 1
440 if options.warn_duplicates:
441 print("\n%s\t%s\t%s" % row)
442 else:
443 if options.warn_duplicates:
444 print("%s\t%s" % (row[0], row[1]))
445 if duplicate_count > 0:
446 print(
447 "\n%d sequences have duplicated accession IDs\n" % duplicate_count
448 )
449
450 print("%s accession sequences will be searched\n" % sequence_count)
451
452 # print(auto.dump())
453
454 # Convert the trie to an automaton (a finite-state machine)
455 auto.make_automaton()
456
457 # Execute query for seqs and metadata without fetching the results yet
458 uniprot_seq_and_id = cur.execute(UNIPROT_SEQ_AND_ID_SQL)
459 while 1:
460 batch = uniprot_seq_and_id.fetchmany(size=50)
461 if not batch:
462 break
463 for Sequence, UniProtKB_id in batch:
464 if Sequence is not None:
465 for end_index, (insert_order, original_value) in auto.iter(
466 Sequence
467 ):
468 ker.execute(
469 """
470 INSERT INTO deppep_UniProtKB
471 (deppep_id,UniProtKB_id,pos_start,pos_end)
472 VALUES (?,?,?,?)
473 """,
474 (
475 insert_order,
476 UniProtKB_id,
477 1 + end_index - len(original_value),
478 end_index,
479 ),
480 )
481 else:
482 raise ValueError(
483 "UniProtKB_id %s, but Sequence is None: Check whether SwissProt file is missing sequence for this ID"
484 % (UniProtKB_id,)
485 )
486 ker.execute(
487 """
488 SELECT count(*) || ' accession-peptide-phosphopeptide combinations were found'
489 FROM uniprotkb_pep_ppep_view
490 """
491 )
492 for row in ker.fetchall():
493 print(row[0])
494
495 ker.execute(
496 """
497 SELECT count(*) || ' accession matches were found', count(*) AS accession_count
498 FROM (
499 SELECT accession
500 FROM uniprotkb_pep_ppep_view
501 GROUP BY accession
502 )
503 """
504 )
505 for row in ker.fetchall():
506 print(row[0])
507
508 ker.execute(
509 """
510 SELECT count(*) || ' peptide matches were found'
511 FROM (
512 SELECT peptide
513 FROM uniprotkb_pep_ppep_view
514 GROUP BY peptide
515 )
516 """
517 )
518 for row in ker.fetchall():
519 print(row[0])
520
521 ker.execute(
522 """
523 SELECT count(*) || ' phosphopeptide matches were found', count(*) AS phosphopeptide_count
524 FROM (
525 SELECT phosphopeptide
526 FROM uniprotkb_pep_ppep_view
527 GROUP BY phosphopeptide
528 )
529 """
530 )
531 for row in ker.fetchall():
532 print(row[0])
533
534 # link peptides not found in sequence database to a dummy sequence-record
535 ker.execute(
536 """
537 INSERT INTO deppep_UniProtKB(deppep_id,UniProtKB_id,pos_start,pos_end)
538 SELECT id, 'No Uniprot_ID', 0, 0
539 FROM deppep
540 WHERE id NOT IN (SELECT deppep_id FROM deppep_UniProtKB)
541 """
542 )
543
544 con.commit()
545 ker.execute("vacuum")
546 con.close()
547
548
549 if __name__ == "__main__":
550 wrap_start_time = time.perf_counter()
551 __main__()
552 wrap_stop_time = time.perf_counter()
553 # print(wrap_start_time)
554 # print(wrap_stop_time)
555 print(
556 "\nThe matching process took %d milliseconds to run.\n"
557 % ((wrap_stop_time - wrap_start_time) * 1000),
558 )
559
560 # vim: sw=4 ts=4 et ai :