Mercurial > repos > galaxyp > mqppep_preproc
comparison mqppep_mrgfltr.py @ 0:8dfd5d2b5903 draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 3a7b3609d6e514c9e8f980ecb684960c6b2252fe
author | galaxyp |
---|---|
date | Mon, 11 Jul 2022 19:22:54 +0000 |
parents | |
children | b76c75521d91 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:8dfd5d2b5903 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 # Import the packages needed | |
4 import argparse | |
5 import operator # for operator.itemgetter | |
6 import os.path | |
7 import re | |
8 import shutil # for shutil.copyfile(src, dest) | |
9 import sqlite3 as sql | |
10 import sys # import the sys module for exc_info | |
11 import time | |
12 import traceback # for formatting stack-trace | |
13 from codecs import getreader as cx_getreader | |
14 | |
15 import numpy as np | |
16 import pandas | |
17 | |
18 # global constants | |
19 N_A = "N/A" | |
20 | |
21 | |
22 # ref: https://stackoverflow.com/a/8915613/15509512 | |
23 # answers: "How to handle exceptions in a list comprehensions" | |
24 # usage: | |
25 # from math import log | |
26 # eggs = [1,3,0,3,2] | |
27 # print([x for x in [catch(log, egg) for egg in eggs] if x is not None]) | |
28 # producing: | |
29 # for <built-in function log> | |
30 # with args (0,) | |
31 # exception: math domain error | |
32 # [0.0, 1.0986122886681098, 1.0986122886681098, 0.6931471805599453] | |
def catch(func, *args, handle=lambda e: e, **kwargs):
    """Call ``func(*args, **kwargs)``; on any exception, report and abort.

    ref: https://stackoverflow.com/a/8915613/15509512
    answers: "How to handle exceptions in a list comprehensions"
    usage:
      from math import log
      eggs = [1,3,0,3,2]
      print([x for x in [catch(log, egg) for egg in eggs] if x is not None])
    producing:
      for <built-in function log>
        with args (0,)
        exception: math domain error
      [0.0, 1.0986122886681098, 1.0986122886681098, 0.6931471805599453]

    Args:
        func: callable to invoke.
        *args: positional arguments forwarded to ``func``.
        handle: unused; retained for backward compatibility with callers
            that pass it by keyword.
        **kwargs: keyword arguments forwarded to ``func``.

    Returns:
        Whatever ``func`` returns on success; on failure the process exits
        with status -1 after printing the exception and stack trace.
    """
    try:
        return func(*args, **kwargs)
    except Exception as e:
        print("For %s" % str(func))
        print(" with args %s" % str(args))
        print(" caught exception: %s" % str(e))
        (ty, va, tb) = sys.exc_info()
        print(" stack trace: " + str(traceback.format_exception(ty, va, tb)))
        # sys.exit raises SystemExit reliably; the builtin `exit` is a
        # site-module convenience not guaranteed in all interpreters.
        # The original's trailing `return None` was unreachable and is removed.
        sys.exit(-1)
45 | |
46 | |
def whine(func, *args, handle=lambda e: e, **kwargs):
    """Best-effort call of ``func(*args, **kwargs)``.

    On success the callee's return value is passed through unchanged.  On
    any exception a warning (with the exception text and the formatted
    stack trace) is printed and ``None`` is returned, so callers can keep
    going and filter out the failures afterwards.

    Args:
        func: callable to invoke.
        *args: positional arguments forwarded to ``func``.
        handle: unused; kept so the signature mirrors ``catch``.
        **kwargs: keyword arguments forwarded to ``func``.

    Returns:
        ``func``'s result, or ``None`` if it raised.
    """
    try:
        outcome = func(*args, **kwargs)
    except Exception as err:
        exc_type, exc_value, exc_tb = sys.exc_info()
        print("Warning: For %s" % str(func))
        print(" with args %s" % str(args))
        print(" caught exception: %s" % str(err))
        print(
            " stack trace: "
            + str(traceback.format_exception(exc_type, exc_value, exc_tb))
        )
        return None
    else:
        return outcome
58 | |
59 | |
def ppep_join(x):
    """Join the non-"N/A" entries of *x* with " | ".

    Args:
        x: iterable of strings, possibly containing the ``N_A`` sentinel.

    Returns:
        The " | "-joined string of entries that are not ``N_A``, or ``N_A``
        itself when nothing remains to join.
    """
    kept = [item for item in x if item != N_A]
    # str.join already produces a str; the former '"%s" %' wrapper was
    # redundant.  `or N_A` maps an empty join result back to the sentinel.
    return " | ".join(kept) or N_A
67 | |
68 | |
def melt_join(x):
    """Case-insensitively deduplicate *x* and join with " | ".

    Entries that differ only by case collapse to one; the spelling of the
    LAST occurrence wins (dict insertion order with overwriting keys).

    Args:
        x: iterable of strings.

    Returns:
        The " | "-joined deduplicated string (empty string for empty input).
    """
    survivors = {key.lower(): key for key in x}
    # dict preserves insertion order, so .values() reproduces the original
    # comprehension's ordering; the '"%s" %' wrapper was redundant.
    return " | ".join(survivors.values())
73 | |
74 | |
75 def __main__(): | |
76 # Parse Command Line | |
77 parser = argparse.ArgumentParser( | |
78 description="Phopsphoproteomic Enrichment Pipeline Merge and Filter." | |
79 ) | |
80 | |
81 # inputs: | |
82 # Phosphopeptide data for experimental results, including the intensities | |
83 # and the mapping to kinase domains, in tabular format. | |
84 parser.add_argument( | |
85 "--phosphopeptides", | |
86 "-p", | |
87 nargs=1, | |
88 required=True, | |
89 dest="phosphopeptides", | |
90 help="Phosphopeptide data for experimental results, including the intensities and the mapping to kinase domains, in tabular format", | |
91 ) | |
92 # UniProtKB/SwissProt DB input, SQLite | |
93 parser.add_argument( | |
94 "--ppep_mapping_db", | |
95 "-d", | |
96 nargs=1, | |
97 required=True, | |
98 dest="ppep_mapping_db", | |
99 help="UniProtKB/SwissProt SQLite Database", | |
100 ) | |
101 # species to limit records chosen from PhosphoSitePlus | |
102 parser.add_argument( | |
103 "--species", | |
104 "-x", | |
105 nargs=1, | |
106 required=False, | |
107 default=[], | |
108 dest="species", | |
109 help="limit PhosphoSitePlus records to indicated species (field may be empty)", | |
110 ) | |
111 | |
112 # outputs: | |
113 # tabular output | |
114 parser.add_argument( | |
115 "--mrgfltr_tab", | |
116 "-o", | |
117 nargs=1, | |
118 required=True, | |
119 dest="mrgfltr_tab", | |
120 help="Tabular output file for results", | |
121 ) | |
122 # CSV output | |
123 parser.add_argument( | |
124 "--mrgfltr_csv", | |
125 "-c", | |
126 nargs=1, | |
127 required=True, | |
128 dest="mrgfltr_csv", | |
129 help="CSV output file for results", | |
130 ) | |
131 # SQLite output | |
132 parser.add_argument( | |
133 "--mrgfltr_sqlite", | |
134 "-S", | |
135 nargs=1, | |
136 required=True, | |
137 dest="mrgfltr_sqlite", | |
138 help="SQLite output file for results", | |
139 ) | |
140 | |
141 # "Make it so!" (parse the arguments) | |
142 options = parser.parse_args() | |
143 print("options: " + str(options)) | |
144 | |
145 # determine phosphopeptide ("upstream map") input tabular file access | |
146 if options.phosphopeptides is None: | |
147 exit('Argument "phosphopeptides" is required but not supplied') | |
148 try: | |
149 upstream_map_filename_tab = os.path.abspath(options.phosphopeptides[0]) | |
150 input_file = open(upstream_map_filename_tab, "r") | |
151 input_file.close() | |
152 except Exception as e: | |
153 exit("Error parsing phosphopeptides argument: %s" % str(e)) | |
154 | |
155 # determine input SQLite access | |
156 if options.ppep_mapping_db is None: | |
157 exit('Argument "ppep_mapping_db" is required but not supplied') | |
158 try: | |
159 uniprot_sqlite = os.path.abspath(options.ppep_mapping_db[0]) | |
160 input_file = open(uniprot_sqlite, "rb") | |
161 input_file.close() | |
162 except Exception as e: | |
163 exit("Error parsing ppep_mapping_db argument: %s" % str(e)) | |
164 | |
165 # copy input SQLite dataset to output SQLite dataset | |
166 if options.mrgfltr_sqlite is None: | |
167 exit('Argument "mrgfltr_sqlite" is required but not supplied') | |
168 try: | |
169 output_sqlite = os.path.abspath(options.mrgfltr_sqlite[0]) | |
170 shutil.copyfile(uniprot_sqlite, output_sqlite) | |
171 except Exception as e: | |
172 exit("Error copying ppep_mapping_db to mrgfltr_sqlite: %s" % str(e)) | |
173 | |
174 # determine species to limit records from PSP_Regulatory_Sites | |
175 if options.species is None: | |
176 exit( | |
177 'Argument "species" is required (and may be empty) but not supplied' | |
178 ) | |
179 try: | |
180 if len(options.species) > 0: | |
181 species = options.species[0] | |
182 else: | |
183 species = "" | |
184 except Exception as e: | |
185 exit("Error parsing species argument: %s" % str(e)) | |
186 | |
187 # determine tabular output destination | |
188 if options.mrgfltr_tab is None: | |
189 exit('Argument "mrgfltr_tab" is required but not supplied') | |
190 try: | |
191 output_filename_tab = os.path.abspath(options.mrgfltr_tab[0]) | |
192 output_file = open(output_filename_tab, "w") | |
193 output_file.close() | |
194 except Exception as e: | |
195 exit("Error parsing mrgfltr_tab argument: %s" % str(e)) | |
196 | |
197 # determine CSV output destination | |
198 if options.mrgfltr_csv is None: | |
199 exit('Argument "mrgfltr_csv" is required but not supplied') | |
200 try: | |
201 output_filename_csv = os.path.abspath(options.mrgfltr_csv[0]) | |
202 output_file = open(output_filename_csv, "w") | |
203 output_file.close() | |
204 except Exception as e: | |
205 exit("Error parsing mrgfltr_csv argument: %s" % str(e)) | |
206 | |
207 def mqpep_getswissprot(): | |
208 | |
209 # | |
210 # copied from Excel Output Script.ipynb BEGIN # | |
211 # | |
212 | |
213 # String Constants ################# | |
214 DEPHOSPHOPEP = "DephosphoPep" | |
215 DESCRIPTION = "Description" | |
216 FUNCTION_PHOSPHORESIDUE = ( | |
217 "Function Phosphoresidue(PSP=PhosphoSitePlus.org)" | |
218 ) | |
219 GENE_NAME = "Gene_Name" # Gene Name from UniProtKB | |
220 ON_FUNCTION = ( | |
221 "ON_FUNCTION" # ON_FUNCTION column from PSP_Regulatory_Sites | |
222 ) | |
223 ON_NOTES = "NOTES" # NOTES column from PSP_Regulatory_Sites | |
224 ON_OTHER_INTERACT = "ON_OTHER_INTERACT" # ON_OTHER_INTERACT column from PSP_Regulatory_Sites | |
225 ON_PROCESS = ( | |
226 "ON_PROCESS" # ON_PROCESS column from PSP_Regulatory_Sites | |
227 ) | |
228 ON_PROT_INTERACT = "ON_PROT_INTERACT" # ON_PROT_INTERACT column from PSP_Regulatory_Sites | |
229 PHOSPHOPEPTIDE = "Phosphopeptide" | |
230 PHOSPHOPEPTIDE_MATCH = "Phosphopeptide_match" | |
231 PHOSPHORESIDUE = "Phosphoresidue" | |
232 PUTATIVE_UPSTREAM_DOMAINS = "Putative Upstream Kinases(PSP=PhosphoSitePlus.org)/Phosphatases/Binding Domains" | |
233 SEQUENCE = "Sequence" | |
234 SEQUENCE10 = "Sequence10" | |
235 SEQUENCE7 = "Sequence7" | |
236 SITE_PLUSMINUS_7AA_SQL = "SITE_PLUSMINUS_7AA" | |
237 UNIPROT_ID = "UniProt_ID" | |
238 UNIPROT_SEQ_AND_META_SQL = """ | |
239 select Uniprot_ID, Description, Gene_Name, Sequence, | |
240 Organism_Name, Organism_ID, PE, SV | |
241 from UniProtKB | |
242 order by Sequence, UniProt_ID | |
243 """ | |
244 UNIPROT_UNIQUE_SEQ_SQL = """ | |
245 select distinct Sequence | |
246 from UniProtKB | |
247 group by Sequence | |
248 """ | |
249 PPEP_PEP_UNIPROTSEQ_SQL = """ | |
250 select distinct phosphopeptide, peptide, sequence | |
251 from uniprotkb_pep_ppep_view | |
252 order by sequence | |
253 """ | |
254 PPEP_MELT_SQL = """ | |
255 SELECT DISTINCT | |
256 phospho_peptide AS 'p_peptide', | |
257 kinase_map AS 'characterization', | |
258 'X' AS 'X' | |
259 FROM ppep_gene_site_view | |
260 """ | |
261 # CREATE TABLE PSP_Regulatory_site ( | |
262 # site_plusminus_7AA TEXT PRIMARY KEY ON CONFLICT IGNORE, | |
263 # domain TEXT, | |
264 # ON_FUNCTION TEXT, | |
265 # ON_PROCESS TEXT, | |
266 # ON_PROT_INTERACT TEXT, | |
267 # ON_OTHER_INTERACT TEXT, | |
268 # notes TEXT, | |
269 # organism TEXT | |
270 # ); | |
271 PSP_REGSITE_SQL = """ | |
272 SELECT DISTINCT | |
273 SITE_PLUSMINUS_7AA , | |
274 DOMAIN , | |
275 ON_FUNCTION , | |
276 ON_PROCESS , | |
277 ON_PROT_INTERACT , | |
278 ON_OTHER_INTERACT , | |
279 NOTES , | |
280 ORGANISM | |
281 FROM PSP_Regulatory_site | |
282 """ | |
283 PPEP_ID_SQL = """ | |
284 SELECT | |
285 id AS 'ppep_id', | |
286 seq AS 'ppep_seq' | |
287 FROM ppep | |
288 """ | |
289 MRGFLTR_DDL = """ | |
290 DROP VIEW IF EXISTS mrgfltr_metadata_view; | |
291 DROP TABLE IF EXISTS mrgfltr_metadata; | |
292 CREATE TABLE mrgfltr_metadata | |
293 ( ppep_id INTEGER REFERENCES ppep(id) | |
294 , Sequence10 TEXT | |
295 , Sequence7 TEXT | |
296 , GeneName TEXT | |
297 , Phosphoresidue TEXT | |
298 , UniProtID TEXT | |
299 , Description TEXT | |
300 , FunctionPhosphoresidue TEXT | |
301 , PutativeUpstreamDomains TEXT | |
302 , PRIMARY KEY (ppep_id) ON CONFLICT IGNORE | |
303 ) | |
304 ; | |
305 CREATE VIEW mrgfltr_metadata_view AS | |
306 SELECT DISTINCT | |
307 ppep.seq AS phospho_peptide | |
308 , Sequence10 | |
309 , Sequence7 | |
310 , GeneName | |
311 , Phosphoresidue | |
312 , UniProtID | |
313 , Description | |
314 , FunctionPhosphoresidue | |
315 , PutativeUpstreamDomains | |
316 FROM | |
317 ppep, mrgfltr_metadata | |
318 WHERE | |
319 mrgfltr_metadata.ppep_id = ppep.id | |
320 ORDER BY | |
321 ppep.seq | |
322 ; | |
323 """ | |
324 | |
325 CITATION_INSERT_STMT = """ | |
326 INSERT INTO Citation ( | |
327 ObjectName, | |
328 CitationData | |
329 ) VALUES (?,?) | |
330 """ | |
331 CITATION_INSERT_PSP = 'PhosphoSitePlus(R) (PSP) was created by Cell Signaling Technology Inc. It is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. When using PSP data or analyses in printed publications or in online resources, the following acknowledgements must be included: (a) the words "PhosphoSitePlus(R), www.phosphosite.org" must be included at appropriate places in the text or webpage, and (b) the following citation must be included in the bibliography: "Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations, PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926."' | |
332 CITATION_INSERT_PSP_REF = 'Hornbeck, 2014, "PhosphoSitePlus, 2014: mutations, PTMs and recalibrations.", https://pubmed.ncbi.nlm.nih.gov/22135298, https://doi.org/10.1093/nar/gkr1122' | |
333 | |
334 MRGFLTR_METADATA_COLUMNS = [ | |
335 "ppep_id", | |
336 "Sequence10", | |
337 "Sequence7", | |
338 "GeneName", | |
339 "Phosphoresidue", | |
340 "UniProtID", | |
341 "Description", | |
342 "FunctionPhosphoresidue", | |
343 "PutativeUpstreamDomains", | |
344 ] | |
345 | |
346 # String Constants (end) ############ | |
347 | |
348 class Error(Exception): | |
349 """Base class for exceptions in this module.""" | |
350 | |
351 pass | |
352 | |
353 class PreconditionError(Error): | |
354 """Exception raised for errors in the input. | |
355 | |
356 Attributes: | |
357 expression -- input expression in which the error occurred | |
358 message -- explanation of the error | |
359 """ | |
360 | |
361 def __init__(self, expression, message): | |
362 self.expression = expression | |
363 self.message = message | |
364 | |
365 # start_time = time.clock() #timer | |
366 start_time = time.process_time() # timer | |
367 | |
368 # get keys from upstream tabular file using readline() | |
369 # ref: https://stackoverflow.com/a/16713581/15509512 | |
370 # answer to "Use codecs to read file with correct encoding" | |
371 file1_encoded = open(upstream_map_filename_tab, "rb") | |
372 file1 = cx_getreader("latin-1")(file1_encoded) | |
373 | |
374 count = 0 | |
375 upstream_map_p_peptide_list = [] | |
376 re_tab = re.compile("^[^\t]*") | |
377 while True: | |
378 count += 1 | |
379 # Get next line from file | |
380 line = file1.readline() | |
381 # if line is empty | |
382 # end of file is reached | |
383 if not line: | |
384 break | |
385 if count > 1: | |
386 m = re_tab.match(line) | |
387 upstream_map_p_peptide_list.append(m[0]) | |
388 file1.close() | |
389 file1_encoded.close() | |
390 | |
391 # Get the list of phosphopeptides with the p's that represent the phosphorylation sites removed | |
392 re_phos = re.compile("p") | |
393 | |
394 end_time = time.process_time() # timer | |
395 print( | |
396 "%0.6f pre-read-SwissProt [0.1]" % (end_time - start_time,), | |
397 file=sys.stderr, | |
398 ) | |
399 | |
400 # ----------- Get SwissProt data from SQLite database (start) ----------- | |
401 # build UniProt sequence LUT and list of unique SwissProt sequences | |
402 | |
403 # Open SwissProt SQLite database | |
404 conn = sql.connect(uniprot_sqlite) | |
405 cur = conn.cursor() | |
406 | |
407 # Set up structures to hold SwissProt data | |
408 | |
409 uniprot_Sequence_List = [] | |
410 UniProtSeqLUT = {} | |
411 | |
412 # Execute query for unique seqs without fetching the results yet | |
413 uniprot_unique_seq_cur = cur.execute(UNIPROT_UNIQUE_SEQ_SQL) | |
414 | |
415 while 1: | |
416 batch = uniprot_unique_seq_cur.fetchmany(size=50) | |
417 if not batch: | |
418 # handle case where no records are returned | |
419 break | |
420 for row in batch: | |
421 Sequence = row[0] | |
422 UniProtSeqLUT[(Sequence, DESCRIPTION)] = [] | |
423 UniProtSeqLUT[(Sequence, GENE_NAME)] = [] | |
424 UniProtSeqLUT[(Sequence, UNIPROT_ID)] = [] | |
425 UniProtSeqLUT[Sequence] = [] | |
426 | |
427 # Execute query for seqs and metadata without fetching the results yet | |
428 uniprot_seq_and_meta = cur.execute(UNIPROT_SEQ_AND_META_SQL) | |
429 | |
430 while 1: | |
431 batch = uniprot_seq_and_meta.fetchmany(size=50) | |
432 if not batch: | |
433 # handle case where no records are returned | |
434 break | |
435 for ( | |
436 UniProt_ID, | |
437 Description, | |
438 Gene_Name, | |
439 Sequence, | |
440 OS, | |
441 OX, | |
442 PE, | |
443 SV, | |
444 ) in batch: | |
445 uniprot_Sequence_List.append(Sequence) | |
446 UniProtSeqLUT[Sequence] = Sequence | |
447 UniProtSeqLUT[(Sequence, UNIPROT_ID)].append(UniProt_ID) | |
448 UniProtSeqLUT[(Sequence, GENE_NAME)].append(Gene_Name) | |
449 if OS != N_A: | |
450 Description += " OS=" + OS | |
451 if OX != -1: | |
452 Description += " OX=" + str(OX) | |
453 if Gene_Name != N_A: | |
454 Description += " GN=" + Gene_Name | |
455 if PE != N_A: | |
456 Description += " PE=" + PE | |
457 if SV != N_A: | |
458 Description += " SV=" + SV | |
459 UniProtSeqLUT[(Sequence, DESCRIPTION)].append(Description) | |
460 | |
461 # Close SwissProt SQLite database; clean up local variables | |
462 conn.close() | |
463 Sequence = "" | |
464 UniProt_ID = "" | |
465 Description = "" | |
466 Gene_Name = "" | |
467 | |
468 # ----------- Get SwissProt data from SQLite database (finish) ----------- | |
469 | |
470 end_time = time.process_time() # timer | |
471 print( | |
472 "%0.6f post-read-SwissProt [0.2]" % (end_time - start_time,), | |
473 file=sys.stderr, | |
474 ) | |
475 | |
476 # ----------- Get SwissProt data from SQLite database (start) ----------- | |
477 # Open SwissProt SQLite database | |
478 conn = sql.connect(uniprot_sqlite) | |
479 cur = conn.cursor() | |
480 | |
481 # Set up dictionary to aggregate results for phosphopeptides corresponding to dephosphopeptide | |
482 DephosphoPep_UniProtSeq_LUT = {} | |
483 | |
484 # Set up dictionary to accumulate results | |
485 PhosphoPep_UniProtSeq_LUT = {} | |
486 | |
487 # Execute query for tuples without fetching the results yet | |
488 ppep_pep_uniprotseq_cur = cur.execute(PPEP_PEP_UNIPROTSEQ_SQL) | |
489 | |
490 while 1: | |
491 batch = ppep_pep_uniprotseq_cur.fetchmany(size=50) | |
492 if not batch: | |
493 # handle case where no records are returned | |
494 break | |
495 for (phospho_pep, dephospho_pep, sequence) in batch: | |
496 # do interesting stuff here... | |
497 PhosphoPep_UniProtSeq_LUT[phospho_pep] = phospho_pep | |
498 PhosphoPep_UniProtSeq_LUT[ | |
499 (phospho_pep, DEPHOSPHOPEP) | |
500 ] = dephospho_pep | |
501 if dephospho_pep not in DephosphoPep_UniProtSeq_LUT: | |
502 DephosphoPep_UniProtSeq_LUT[dephospho_pep] = set() | |
503 DephosphoPep_UniProtSeq_LUT[ | |
504 (dephospho_pep, DESCRIPTION) | |
505 ] = [] | |
506 DephosphoPep_UniProtSeq_LUT[ | |
507 (dephospho_pep, GENE_NAME) | |
508 ] = [] | |
509 DephosphoPep_UniProtSeq_LUT[ | |
510 (dephospho_pep, UNIPROT_ID) | |
511 ] = [] | |
512 DephosphoPep_UniProtSeq_LUT[(dephospho_pep, SEQUENCE)] = [] | |
513 DephosphoPep_UniProtSeq_LUT[dephospho_pep].add(phospho_pep) | |
514 | |
515 if ( | |
516 sequence | |
517 not in DephosphoPep_UniProtSeq_LUT[ | |
518 (dephospho_pep, SEQUENCE) | |
519 ] | |
520 ): | |
521 DephosphoPep_UniProtSeq_LUT[ | |
522 (dephospho_pep, SEQUENCE) | |
523 ].append(sequence) | |
524 for phospho_pep in DephosphoPep_UniProtSeq_LUT[dephospho_pep]: | |
525 if phospho_pep != phospho_pep: | |
526 print( | |
527 "phospho_pep:'%s' phospho_pep:'%s'" | |
528 % (phospho_pep, phospho_pep) | |
529 ) | |
530 if phospho_pep not in PhosphoPep_UniProtSeq_LUT: | |
531 PhosphoPep_UniProtSeq_LUT[phospho_pep] = phospho_pep | |
532 PhosphoPep_UniProtSeq_LUT[ | |
533 (phospho_pep, DEPHOSPHOPEP) | |
534 ] = dephospho_pep | |
535 r = list( | |
536 zip( | |
537 [s for s in UniProtSeqLUT[(sequence, UNIPROT_ID)]], | |
538 [s for s in UniProtSeqLUT[(sequence, GENE_NAME)]], | |
539 [ | |
540 s | |
541 for s in UniProtSeqLUT[(sequence, DESCRIPTION)] | |
542 ], | |
543 ) | |
544 ) | |
545 # Sort by `UniProt_ID` | |
546 # ref: https://stackoverflow.com/a/4174955/15509512 | |
547 r = sorted(r, key=operator.itemgetter(0)) | |
548 # Get one tuple for each `phospho_pep` | |
549 # in DephosphoPep_UniProtSeq_LUT[dephospho_pep] | |
550 for (upid, gn, desc) in r: | |
551 # Append pseudo-tuple per UniProt_ID but only when it is not present | |
552 if ( | |
553 upid | |
554 not in DephosphoPep_UniProtSeq_LUT[ | |
555 (dephospho_pep, UNIPROT_ID) | |
556 ] | |
557 ): | |
558 DephosphoPep_UniProtSeq_LUT[ | |
559 (dephospho_pep, UNIPROT_ID) | |
560 ].append(upid) | |
561 DephosphoPep_UniProtSeq_LUT[ | |
562 (dephospho_pep, DESCRIPTION) | |
563 ].append(desc) | |
564 DephosphoPep_UniProtSeq_LUT[ | |
565 (dephospho_pep, GENE_NAME) | |
566 ].append(gn) | |
567 | |
568 # Close SwissProt SQLite database; clean up local variables | |
569 conn.close() | |
570 # wipe local variables | |
571 phospho_pep = dephospho_pep = sequence = 0 | |
572 upid = gn = desc = r = "" | |
573 | |
574 # ----------- Get SwissProt data from SQLite database (finish) ----------- | |
575 | |
576 end_time = time.process_time() # timer | |
577 print( | |
578 "%0.6f finished reading and decoding '%s' [0.4]" | |
579 % (end_time - start_time, upstream_map_filename_tab), | |
580 file=sys.stderr, | |
581 ) | |
582 | |
583 print( | |
584 "{:>10} unique upstream phosphopeptides tested".format( | |
585 str(len(upstream_map_p_peptide_list)) | |
586 ) | |
587 ) | |
588 | |
589 # Read in Upstream tabular file | |
590 # We are discarding the intensity data; so read it as text | |
591 upstream_data = pandas.read_table( | |
592 upstream_map_filename_tab, dtype="str", index_col=0 | |
593 ) | |
594 | |
595 end_time = time.process_time() # timer | |
596 print( | |
597 "%0.6f read Upstream Map from file [1g_1]" | |
598 % (end_time - start_time,), | |
599 file=sys.stderr, | |
600 ) # timer | |
601 | |
602 upstream_data.index = upstream_map_p_peptide_list | |
603 | |
604 end_time = time.process_time() # timer | |
605 print( | |
606 "%0.6f added index to Upstream Map [1g_2]" | |
607 % (end_time - start_time,), | |
608 file=sys.stderr, | |
609 ) # timer | |
610 | |
611 # ######################################################################## | |
612 # # trim upstream_data to include only the upstream map columns | |
613 # old_cols = upstream_data.columns.tolist() | |
614 # i = 0 | |
615 # first_intensity = -1 | |
616 # last_intensity = -1 | |
617 # intensity_re = re.compile("Intensity.*") | |
618 # for col_name in old_cols: | |
619 # m = intensity_re.match(col_name) | |
620 # if m: | |
621 # last_intensity = i | |
622 # if first_intensity == -1: | |
623 # first_intensity = i | |
624 # i += 1 | |
625 # # print('last intensity = %d' % last_intensity) | |
626 # col_PKCalpha = last_intensity + 2 | |
627 # | |
628 # data_in_cols = [old_cols[0]] + old_cols[ | |
629 # first_intensity: last_intensity + 1 | |
630 # ] | |
631 # | |
632 # if upstream_data.empty: | |
633 # print("upstream_data is empty") | |
634 # exit(0) | |
635 # | |
636 # data_in = upstream_data.copy(deep=True)[data_in_cols] | |
637 ######################################################################## | |
638 # trim upstream_data to include only the upstream map columns | |
639 old_cols = upstream_data.columns.tolist() | |
640 i = 0 | |
641 first_intensity = -1 | |
642 last_intensity = -1 | |
643 intensity_re = re.compile("Intensity.*") | |
644 for col_name in old_cols: | |
645 m = intensity_re.match(col_name) | |
646 if m: | |
647 last_intensity = i | |
648 if first_intensity == -1: | |
649 first_intensity = i | |
650 i += 1 | |
651 # print('last intensity = %d' % last_intensity) | |
652 col_PKCalpha = last_intensity + 2 | |
653 | |
654 data_in_cols = [old_cols[0]] + old_cols[ | |
655 first_intensity - 1: last_intensity | |
656 ] | |
657 data_col_names = [old_cols[0]] + old_cols[ | |
658 first_intensity: last_intensity + 1 | |
659 ] | |
660 | |
661 if upstream_data.empty: | |
662 print("upstream_data is empty") | |
663 exit(0) | |
664 | |
665 data_in = upstream_data.copy(deep=True)[data_in_cols] | |
666 data_in.columns = data_col_names | |
667 print("data_in") | |
668 print(data_in) | |
669 ######################################################################## | |
670 | |
671 # Convert floating-point integers to int64 integers | |
672 # ref: https://stackoverflow.com/a/68497603/15509512 | |
673 data_in[list(data_in.columns[1:])] = ( | |
674 data_in[list(data_in.columns[1:])] | |
675 .astype("float64") | |
676 .apply(np.int64) | |
677 ) | |
678 | |
679 # create another phosphopeptide column that will be used to join later; | |
680 # MAY need to change depending on Phosphopeptide column position | |
681 # data_in[PHOSPHOPEPTIDE_MATCH] = data_in[data_in.columns.tolist()[0]] | |
682 data_in[PHOSPHOPEPTIDE_MATCH] = data_in.index | |
683 | |
684 end_time = time.process_time() # timer | |
685 print( | |
686 "%0.6f set data_in[PHOSPHOPEPTIDE_MATCH] [A]" | |
687 % (end_time - start_time,), | |
688 file=sys.stderr, | |
689 ) # timer | |
690 | |
691 # Produce a dictionary of metadata for a single phosphopeptide. | |
692 # This is a replacement of `UniProtInfo_subdict` in the original code. | |
693 def pseq_to_subdict(phospho_pep): | |
694 # Strip "p" from phosphopeptide sequence | |
695 dephospho_pep = re_phos.sub("", phospho_pep) | |
696 | |
697 # Determine number of phosphoresidues in phosphopeptide | |
698 numps = len(phospho_pep) - len(dephospho_pep) | |
699 | |
700 # Determine location(s) of phosphoresidue(s) in phosphopeptide | |
701 # (used later for Phosphoresidue, Sequence7, and Sequence10) | |
702 ploc = [] # list of p locations | |
703 i = 0 | |
704 p = phospho_pep | |
705 while i < numps: | |
706 ploc.append(p.find("p")) | |
707 p = p[: p.find("p")] + p[p.find("p") + 1:] | |
708 i += 1 | |
709 | |
710 # Establish nested dictionary | |
711 result = {} | |
712 result[SEQUENCE] = [] | |
713 result[UNIPROT_ID] = [] | |
714 result[DESCRIPTION] = [] | |
715 result[GENE_NAME] = [] | |
716 result[PHOSPHORESIDUE] = [] | |
717 result[SEQUENCE7] = [] | |
718 result[SEQUENCE10] = [] | |
719 | |
720 # Add stripped sequence to dictionary | |
721 result[SEQUENCE].append(dephospho_pep) | |
722 | |
723 # Locate phospho_pep in PhosphoPep_UniProtSeq_LUT | |
724 # Caller may elect to: | |
725 # try: | |
726 # ... | |
727 # except PreconditionError as pe: | |
728 # print("'{expression}': {message}".format( | |
729 # expression = pe.expression, | |
730 # message = pe.message)) | |
731 # ) | |
732 # ) | |
733 if phospho_pep not in PhosphoPep_UniProtSeq_LUT: | |
734 raise PreconditionError( | |
735 phospho_pep, | |
736 "no matching phosphopeptide found in PhosphoPep_UniProtSeq_LUT", | |
737 ) | |
738 if dephospho_pep not in DephosphoPep_UniProtSeq_LUT: | |
739 raise PreconditionError( | |
740 dephospho_pep, | |
741 "dephosphorylated phosphopeptide not found in DephosphoPep_UniProtSeq_LUT", | |
742 ) | |
743 if ( | |
744 dephospho_pep != PhosphoPep_UniProtSeq_LUT[(phospho_pep, DEPHOSPHOPEP)] | |
745 ): | |
746 my_err_msg = "dephosphorylated phosphopeptide does not match " | |
747 my_err_msg += "PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] = " | |
748 my_err_msg += PhosphoPep_UniProtSeq_LUT[(phospho_pep, DEPHOSPHOPEP)] | |
749 raise PreconditionError(dephospho_pep, my_err_msg) | |
750 | |
751 result[SEQUENCE] = [dephospho_pep] | |
752 result[UNIPROT_ID] = DephosphoPep_UniProtSeq_LUT[ | |
753 (dephospho_pep, UNIPROT_ID) | |
754 ] | |
755 result[DESCRIPTION] = DephosphoPep_UniProtSeq_LUT[ | |
756 (dephospho_pep, DESCRIPTION) | |
757 ] | |
758 result[GENE_NAME] = DephosphoPep_UniProtSeq_LUT[ | |
759 (dephospho_pep, GENE_NAME) | |
760 ] | |
761 if (dephospho_pep, SEQUENCE) not in DephosphoPep_UniProtSeq_LUT: | |
762 raise PreconditionError( | |
763 dephospho_pep, | |
764 "no matching phosphopeptide found in DephosphoPep_UniProtSeq_LUT", | |
765 ) | |
766 UniProtSeqList = DephosphoPep_UniProtSeq_LUT[ | |
767 (dephospho_pep, SEQUENCE) | |
768 ] | |
769 if len(UniProtSeqList) < 1: | |
770 print( | |
771 "Skipping DephosphoPep_UniProtSeq_LUT[('%s',SEQUENCE)] because value has zero length" | |
772 % dephospho_pep | |
773 ) | |
774 # raise PreconditionError( | |
775 # "DephosphoPep_UniProtSeq_LUT[('" + dephospho_pep + ",SEQUENCE)", | |
776 # 'value has zero length' | |
777 # ) | |
778 for UniProtSeq in UniProtSeqList: | |
779 i = 0 | |
780 phosphoresidues = [] | |
781 seq7s_set = set() | |
782 seq7s = [] | |
783 seq10s_set = set() | |
784 seq10s = [] | |
785 while i < len(ploc): | |
786 start = UniProtSeq.find(dephospho_pep) | |
787 # handle case where no sequence was found for dep-pep | |
788 if start < 0: | |
789 i += 1 | |
790 continue | |
791 psite = ( | |
792 start + ploc[i] | |
793 ) # location of phosphoresidue on protein sequence | |
794 | |
795 # add Phosphoresidue | |
796 phosphosite = "p" + str(UniProtSeq)[psite] + str(psite + 1) | |
797 phosphoresidues.append(phosphosite) | |
798 | |
799 # Add Sequence7 | |
800 if psite < 7: # phospho_pep at N terminus | |
801 seq7 = str(UniProtSeq)[: psite + 8] | |
802 if seq7[psite] == "S": # if phosphosresidue is serine | |
803 pres = "s" | |
804 elif ( | |
805 seq7[psite] == "T" | |
806 ): # if phosphosresidue is threonine | |
807 pres = "t" | |
808 elif ( | |
809 seq7[psite] == "Y" | |
810 ): # if phosphoresidue is tyrosine | |
811 pres = "y" | |
812 else: # if not pSTY | |
813 pres = "?" | |
814 seq7 = ( | |
815 seq7[:psite] + pres + seq7[psite + 1: psite + 8] | |
816 ) | |
817 while ( | |
818 len(seq7) < 15 | |
819 ): # add appropriate number of "_" to the front | |
820 seq7 = "_" + seq7 | |
821 elif ( | |
822 len(UniProtSeq) - psite < 8 | |
823 ): # phospho_pep at C terminus | |
824 seq7 = str(UniProtSeq)[psite - 7:] | |
825 if seq7[7] == "S": | |
826 pres = "s" | |
827 elif seq7[7] == "T": | |
828 pres = "t" | |
829 elif seq7[7] == "Y": | |
830 pres = "y" | |
831 else: | |
832 pres = "?" | |
833 seq7 = seq7[:7] + pres + seq7[8:] | |
834 while ( | |
835 len(seq7) < 15 | |
836 ): # add appropriate number of "_" to the back | |
837 seq7 = seq7 + "_" | |
838 else: | |
839 seq7 = str(UniProtSeq)[psite - 7: psite + 8] | |
840 pres = "" # phosphoresidue | |
841 if seq7[7] == "S": # if phosphosresidue is serine | |
842 pres = "s" | |
843 elif seq7[7] == "T": # if phosphosresidue is threonine | |
844 pres = "t" | |
845 elif seq7[7] == "Y": # if phosphoresidue is tyrosine | |
846 pres = "y" | |
847 else: # if not pSTY | |
848 pres = "?" | |
849 seq7 = seq7[:7] + pres + seq7[8:] | |
850 if seq7 not in seq7s_set: | |
851 seq7s.append(seq7) | |
852 seq7s_set.add(seq7) | |
853 | |
854 # add Sequence10 | |
855 if psite < 10: # phospho_pep at N terminus | |
856 seq10 = ( | |
857 str(UniProtSeq)[:psite] + "p" + str(UniProtSeq)[psite: psite + 11] | |
858 ) | |
859 elif ( | |
860 len(UniProtSeq) - psite < 11 | |
861 ): # phospho_pep at C terminus | |
862 seq10 = ( | |
863 str(UniProtSeq)[psite - 10: psite] + "p" + str(UniProtSeq)[psite:] | |
864 ) | |
865 else: | |
866 seq10 = str(UniProtSeq)[psite - 10: psite + 11] | |
867 seq10 = seq10[:10] + "p" + seq10[10:] | |
868 if seq10 not in seq10s_set: | |
869 seq10s.append(seq10) | |
870 seq10s_set.add(seq10) | |
871 | |
872 i += 1 | |
873 | |
874 result[PHOSPHORESIDUE].append(phosphoresidues) | |
875 result[SEQUENCE7].append(seq7s) | |
876 # result[SEQUENCE10] is a list of lists of strings | |
877 result[SEQUENCE10].append(seq10s) | |
878 | |
879 r = list( | |
880 zip( | |
881 result[UNIPROT_ID], | |
882 result[GENE_NAME], | |
883 result[DESCRIPTION], | |
884 result[PHOSPHORESIDUE], | |
885 ) | |
886 ) | |
887 # Sort by `UniProt_ID` | |
888 # ref: https://stackoverflow.com//4174955/15509512 | |
889 s = sorted(r, key=operator.itemgetter(0)) | |
890 | |
891 result[UNIPROT_ID] = [] | |
892 result[GENE_NAME] = [] | |
893 result[DESCRIPTION] = [] | |
894 result[PHOSPHORESIDUE] = [] | |
895 | |
896 for r in s: | |
897 result[UNIPROT_ID].append(r[0]) | |
898 result[GENE_NAME].append(r[1]) | |
899 result[DESCRIPTION].append(r[2]) | |
900 result[PHOSPHORESIDUE].append(r[3]) | |
901 | |
902 # convert lists to strings in the dictionary | |
903 for key, value in result.items(): | |
904 if key not in [PHOSPHORESIDUE, SEQUENCE7, SEQUENCE10]: | |
905 result[key] = "; ".join(map(str, value)) | |
906 elif key in [SEQUENCE10]: | |
907 # result[SEQUENCE10] is a list of lists of strings | |
908 joined_value = "" | |
909 joined_set = set() | |
910 sep = "" | |
911 for valL in value: | |
912 # valL is a list of strings | |
913 for val in valL: | |
914 # val is a string | |
915 if val not in joined_set: | |
916 joined_set.add(val) | |
917 joined_value += sep + val | |
918 sep = "; " | |
919 # joined_value is a string | |
920 result[key] = joined_value | |
921 | |
922 newstring = "; ".join( | |
923 [", ".join(prez) for prez in result[PHOSPHORESIDUE]] | |
924 ) | |
925 # #separate the isoforms in PHOSPHORESIDUE column with ";" | |
926 # oldstring = result[PHOSPHORESIDUE] | |
927 # oldlist = list(oldstring) | |
928 # newstring = "" | |
929 # i = 0 | |
930 # for e in oldlist: | |
931 # if e == ";": | |
932 # if numps > 1: | |
933 # if i%numps: | |
934 # newstring = newstring + ";" | |
935 # else: | |
936 # newstring = newstring + "," | |
937 # else: | |
938 # newstring = newstring + ";" | |
939 # i +=1 | |
940 # else: | |
941 # newstring = newstring + e | |
942 result[PHOSPHORESIDUE] = newstring | |
943 | |
944 # separate sequence7's by | | |
945 oldstring = result[SEQUENCE7] | |
946 oldlist = oldstring | |
947 newstring = "" | |
948 for ol in oldlist: | |
949 for e in ol: | |
950 if e == ";": | |
951 newstring = newstring + " |" | |
952 elif len(newstring) > 0 and 1 > newstring.count(e): | |
953 newstring = newstring + " | " + e | |
954 elif 1 > newstring.count(e): | |
955 newstring = newstring + e | |
956 result[SEQUENCE7] = newstring | |
957 | |
958 return [phospho_pep, result] | |
959 | |
# Construct list of [string, dictionary] lists
# where the dictionary provides the SwissProt metadata
# for a phosphopeptide
# NOTE: `whine` logs failures and continues, so individual entries may be
#       None when annotation of a phosphopeptide fails.
result_list = [
    whine(pseq_to_subdict, psequence)
    for psequence in data_in[PHOSPHOPEPTIDE_MATCH]
]

end_time = time.process_time()  # timer
print(
    "%0.6f added SwissProt annotations to phosphopeptides [B]"
    % (end_time - start_time,),
    file=sys.stderr,
)  # timer

# Construct dictionary from list of lists
# ref: https://www.8bitavenue.com/how-to-convert-list-of-lists-to-dictionary-in-python/
# Maps phosphopeptide sequence -> metadata sub-dictionary; None entries
# (failed annotations) are dropped here.
UniProt_Info = {
    result[0]: result[1]
    for result in result_list
    if result is not None
}

end_time = time.process_time()  # timer
print(
    "%0.6f create dictionary mapping phosphopeptide to metadata dictionary [C]"
    % (end_time - start_time,),
    file=sys.stderr,
)  # timer

# cosmetic: add N_A to phosphopeptide rows with no hits
# NOTE(review): p_peptide_list is accumulated here but does not appear to be
# read again within this section — confirm before removing.
p_peptide_list = []
for key in UniProt_Info:
    p_peptide_list.append(key)
    for nestedKey in UniProt_Info[key]:
        if UniProt_Info[key][nestedKey] == "":
            UniProt_Info[key][nestedKey] = N_A

end_time = time.process_time()  # timer
print(
    "%0.6f performed cosmetic clean-up [D]" % (end_time - start_time,),
    file=sys.stderr,
)  # timer

# convert UniProt_Info dictionary to dataframe
# (transpose so that each phosphopeptide becomes a row, metadata keys columns)
uniprot_df = pandas.DataFrame.transpose(
    pandas.DataFrame.from_dict(UniProt_Info)
)

# reorder columns to match expected output file
uniprot_df[
    PHOSPHOPEPTIDE
] = uniprot_df.index  # make index a column too

cols = uniprot_df.columns.tolist()
# cols = [cols[-1]]+cols[4:6]+[cols[1]]+[cols[2]]+[cols[6]]+[cols[0]]
# uniprot_df = uniprot_df[cols]
uniprot_df = uniprot_df[
    [
        PHOSPHOPEPTIDE,
        SEQUENCE10,
        SEQUENCE7,
        GENE_NAME,
        PHOSPHORESIDUE,
        UNIPROT_ID,
        DESCRIPTION,
    ]
]

end_time = time.process_time()  # timer
print(
    "%0.6f reordered columns to match expected output file [1]"
    % (end_time - start_time,),
    file=sys.stderr,
)  # timer

# concat to split then groupby to collapse
# Each row's SEQUENCE7 field holds " | "-separated 7-mers; explode them so
# every 7-mer maps back to its phosphopeptide (one row per 7-mer).
seq7_df = pandas.concat(
    [
        pandas.Series(row[PHOSPHOPEPTIDE], row[SEQUENCE7].split(" | "))
        for _, row in uniprot_df.iterrows()
    ]
).reset_index()
seq7_df.columns = [SEQUENCE7, PHOSPHOPEPTIDE]

# --- -------------- begin read PSP_Regulatory_sites ---------------------------------
# read in PhosphoSitePlus Regulatory Sites dataset
# ----------- Get PhosphoSitePlus Regulatory Sites data from SQLite database (start) -----------
conn = sql.connect(uniprot_sqlite)
regsites_df = pandas.read_sql_query(PSP_REGSITE_SQL, conn)
# Close SwissProt SQLite database
conn.close()
# ... -------------- end read PSP_Regulatory_sites ------------------------------------

# keep only the human entries in dataframe
# (more generally: restrict to the requested species when one is given)
if len(species) > 0:
    print(
        'Limit PhosphoSitesPlus records to species "' + species + '"'
    )
    regsites_df = regsites_df[regsites_df.ORGANISM == species]

# merge the seq7 df with the regsites df based off of the sequence7
merge_df = seq7_df.merge(
    regsites_df,
    left_on=SEQUENCE7,
    right_on=SITE_PLUSMINUS_7AA_SQL,
    how="left",
)

# after merging df, select only the columns of interest;
# note that PROTEIN is absent here
merge_df = merge_df[
    [
        PHOSPHOPEPTIDE,
        SEQUENCE7,
        ON_FUNCTION,
        ON_PROCESS,
        ON_PROT_INTERACT,
        ON_OTHER_INTERACT,
        ON_NOTES,
    ]
]
# combine column values of interest
# into one FUNCTION_PHOSPHORESIDUE column"
# (na_rep="" keeps rows with missing annotations; the stray ";" runs this
# produces are cleaned up below)
merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[ON_FUNCTION].str.cat(
    merge_df[ON_PROCESS], sep="; ", na_rep=""
)
merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[
    FUNCTION_PHOSPHORESIDUE
].str.cat(merge_df[ON_PROT_INTERACT], sep="; ", na_rep="")
merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[
    FUNCTION_PHOSPHORESIDUE
].str.cat(merge_df[ON_OTHER_INTERACT], sep="; ", na_rep="")
merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[
    FUNCTION_PHOSPHORESIDUE
].str.cat(merge_df[ON_NOTES], sep="; ", na_rep="")

# remove the columns that were combined
merge_df = merge_df[
    [PHOSPHOPEPTIDE, SEQUENCE7, FUNCTION_PHOSPHORESIDUE]
]

end_time = time.process_time()  # timer
print(
    "%0.6f merge regsite metadata [1a]" % (end_time - start_time,),
    file=sys.stderr,
)  # timer

# cosmetic changes to Function Phosphoresidue column
fp_series = pandas.Series(merge_df[FUNCTION_PHOSPHORESIDUE])

end_time = time.process_time()  # timer
print(
    "%0.6f more cosmetic changes [1b]" % (end_time - start_time,),
    file=sys.stderr,
)  # timer

i = 0
while i < len(fp_series):
    # remove the extra ";" so that it looks more professional
    if fp_series[i] == "; ; ; ; ":  # remove ; from empty hits
        fp_series[i] = ""
    while fp_series[i].endswith("; "):  # remove ; from the ends
        fp_series[i] = fp_series[i][:-2]
    while fp_series[i].startswith("; "):  # remove ; from the beginning
        fp_series[i] = fp_series[i][2:]
    fp_series[i] = fp_series[i].replace("; ; ; ; ", "; ")
    fp_series[i] = fp_series[i].replace("; ; ; ", "; ")
    fp_series[i] = fp_series[i].replace("; ; ", "; ")

    # turn blanks into N_A to signify the info was searched for but cannot be found
    if fp_series[i] == "":
        fp_series[i] = N_A

    i += 1
merge_df[FUNCTION_PHOSPHORESIDUE] = fp_series

end_time = time.process_time()  # timer
print(
    "%0.6f cleaned up semicolons [1c]" % (end_time - start_time,),
    file=sys.stderr,
)  # timer

# merge uniprot df with merge df
uniprot_regsites_merged_df = uniprot_df.merge(
    merge_df,
    left_on=PHOSPHOPEPTIDE,
    right_on=PHOSPHOPEPTIDE,
    how="left",
)

# collapse the merged df
# (one row per phosphopeptide; ppep_join concatenates the per-site
# FUNCTION_PHOSPHORESIDUE strings)
uniprot_regsites_collapsed_df = pandas.DataFrame(
    uniprot_regsites_merged_df.groupby(PHOSPHOPEPTIDE)[
        FUNCTION_PHOSPHORESIDUE
    ].apply(lambda x: ppep_join(x))
)
# .apply(lambda x: "%s" % ' | '.join(x)))

end_time = time.process_time()  # timer
print(
    "%0.6f collapsed pandas dataframe [1d]" % (end_time - start_time,),
    file=sys.stderr,
)  # timer

uniprot_regsites_collapsed_df[
    PHOSPHOPEPTIDE
] = (
    uniprot_regsites_collapsed_df.index
)  # add df index as its own column

# rename columns
uniprot_regsites_collapsed_df.columns = [
    FUNCTION_PHOSPHORESIDUE,
    "ppp",
]

end_time = time.process_time()  # timer
print(
    "%0.6f selected columns to be merged to uniprot_df [1e]"
    % (end_time - start_time,),
    file=sys.stderr,
)  # timer

# add columns based on Sequence7 matching site_+/-7_AA
uniprot_regsite_df = pandas.merge(
    left=uniprot_df,
    right=uniprot_regsites_collapsed_df,
    how="left",
    left_on=PHOSPHOPEPTIDE,
    right_on="ppp",
)

end_time = time.process_time()  # timer
print(
    "%0.6f added columns based on Sequence7 matching site_+/-7_AA [1f]"
    % (end_time - start_time,),
    file=sys.stderr,
)  # timer

data_in.rename(
    {"Protein description": PHOSPHOPEPTIDE},
    axis="columns",
    inplace=True,
)

# data_in.sort_values(PHOSPHOPEPTIDE_MATCH, inplace=True, kind='mergesort')
# Case-insensitive sort of the phosphopeptide sequences, then reorder the
# rows by label.
# NOTE(review): .loc[res2] indexes by label — this assumes data_in is
# indexed by phosphopeptide sequence; confirm against the code that built
# data_in (outside this view).
res2 = sorted(
    data_in[PHOSPHOPEPTIDE_MATCH].tolist(), key=lambda s: s.casefold()
)
data_in = data_in.loc[res2]

end_time = time.process_time()  # timer
# NOTE(review): the "[1f]" progress tag duplicates the one above — probably
# intended to be a distinct tag; harmless, log-only.
print(
    "%0.6f sorting time [1f]" % (end_time - start_time,),
    file=sys.stderr,
)  # timer

# Keep the first column plus everything from the PKCalpha column onward.
# (old_cols and col_PKCalpha are computed earlier, outside this view.)
print("old_cols[:col_PKCalpha]")
print(old_cols[:col_PKCalpha])
cols = [old_cols[0]] + old_cols[col_PKCalpha - 1:]
upstream_data = upstream_data[cols]
print("upstream_data.columns")
print(upstream_data.columns)

end_time = time.process_time()  # timer
print(
    "%0.6f refactored columns for Upstream Map [1g]"
    % (end_time - start_time,),
    file=sys.stderr,
)  # timer

# #rename upstream columns in new list
# new_cols = []
# for name in cols:
#     if "_NetworKIN" in name:
#         name = name.split("_")[0]
#     if " motif" in name:
#         name = name.split(" motif")[0]
#     if " sequence " in name:
#         name = name.split(" sequence")[0]
#     if "_Phosida" in name:
#         name = name.split("_")[0]
#     if "_PhosphoSite" in name:
#         name = name.split("_")[0]
#     new_cols.append(name)
1247 # rename upstream columns in new list | |
1248 def col_rename(name): | |
1249 if "_NetworKIN" in name: | |
1250 name = name.split("_")[0] | |
1251 if " motif" in name: | |
1252 name = name.split(" motif")[0] | |
1253 if " sequence " in name: | |
1254 name = name.split(" sequence")[0] | |
1255 if "_Phosida" in name: | |
1256 name = name.split("_")[0] | |
1257 if "_PhosphoSite" in name: | |
1258 name = name.split("_")[0] | |
1259 return name | |
1260 | |
# Apply the suffix-stripping rename to every upstream-map column.
new_cols = [col_rename(col) for col in cols]
upstream_data.columns = new_cols

end_time = time.process_time()  # timer
print(
    "%0.6f renamed columns for Upstream Map [1h_1]"
    % (end_time - start_time,),
    file=sys.stderr,
)  # timer

# Create upstream_data_cast as a copy of upstream_data
# but with first column substituted by the phosphopeptide sequence
# NOTE(review): upstream_data_cast does not appear to be used again in
# this section — confirm before removing.
upstream_data_cast = upstream_data.copy()
# BUGFIX: copy the list instead of aliasing it, so mutating element 0
# below does not also mutate new_cols (as the original code did).
new_cols_cast = new_cols.copy()
new_cols_cast[0] = "p_peptide"
upstream_data_cast.columns = new_cols_cast
upstream_data_cast["p_peptide"] = upstream_data.index

# --- -------------- begin read upstream_data_melt ------------------------------------
# ----------- Get melted kinase mapping data from SQLite database (start) -----------
conn = sql.connect(uniprot_sqlite)
upstream_data_melt_df = pandas.read_sql_query(PPEP_MELT_SQL, conn)
# Close SwissProt SQLite database
conn.close()
upstream_data_melt = upstream_data_melt_df.copy()
upstream_data_melt.columns = ["p_peptide", "characterization", "X"]
upstream_data_melt["characterization"] = [
    col_rename(s) for s in upstream_data_melt["characterization"]
]

print(
    "%0.6f upstream_data_melt_df initially has %d rows"
    % (end_time - start_time, len(upstream_data_melt.axes[0])),
    file=sys.stderr,
)
# ref: https://stackoverflow.com/a/27360130/15509512
# e.g. df.drop(df[df.score < 50].index, inplace=True)
# Keep only rows flagged "X" (i.e. actual kinase/phosphopeptide matches).
upstream_data_melt.drop(
    upstream_data_melt[upstream_data_melt.X != "X"].index, inplace=True
)
print(
    "%0.6f upstream_data_melt_df pre-dedup has %d rows"
    % (end_time - start_time, len(upstream_data_melt.axes[0])),
    file=sys.stderr,
)
# ----------- Get melted kinase mapping data from SQLite database (finish) -----------
# ... -------------- end read upstream_data_melt --------------------------------------

end_time = time.process_time()  # timer
print(
    "%0.6f melted and minimized Upstream Map dataframe [1h_2]"
    % (end_time - start_time,),
    file=sys.stderr,
)  # timer
# ... end read upstream_data_melt

end_time = time.process_time()  # timer
print(
    "%0.6f indexed melted Upstream Map [1h_2a]"
    % (end_time - start_time,),
    file=sys.stderr,
)  # timer

upstream_delta_melt_LoL = upstream_data_melt.values.tolist()

# Gather, per phosphopeptide, the list of kinase/motif characterizations.
melt_dict = {}
for key in upstream_map_p_peptide_list:
    melt_dict[key] = []

for el in upstream_delta_melt_LoL:
    (p_peptide, characterization, X) = tuple(el)
    if p_peptide in melt_dict:
        melt_dict[p_peptide].append(characterization)
    else:
        # A peptide present in the melted DB but absent from the map list
        # means the two inputs came from different runs; abort.
        # (typo "phopsphopeptides" in the message corrected)
        exit(
            'Phosphopeptide %s not found in ppep_mapping_db: "phosphopeptides" and "ppep_mapping_db" must both originate from the same run of mqppep_kinase_mapping'
            % (p_peptide)
        )

end_time = time.process_time()  # timer
print(
    "%0.6f appended peptide characterizations [1h_2b]"
    % (end_time - start_time,),
    file=sys.stderr,
)  # timer

# for key in upstream_map_p_peptide_list:
#     melt_dict[key] = ' | '.join(melt_dict[key])

# Collapse each peptide's characterization list to a single string.
for key in upstream_map_p_peptide_list:
    melt_dict[key] = melt_join(melt_dict[key])

end_time = time.process_time()  # timer
print(
    "%0.6f concatenated multiple characterizations [1h_2c]"
    % (end_time - start_time,),
    file=sys.stderr,
)  # timer

# map_dict is a dictionary of dictionaries
map_dict = {}
for key in upstream_map_p_peptide_list:
    map_dict[key] = {}
    map_dict[key][PUTATIVE_UPSTREAM_DOMAINS] = melt_dict[key]

end_time = time.process_time()  # timer
print(
    "%0.6f instantiated map dictionary [2]" % (end_time - start_time,),
    file=sys.stderr,
)  # timer

# convert map_dict to dataframe
map_df = pandas.DataFrame.transpose(
    pandas.DataFrame.from_dict(map_dict)
)
map_df["p-peptide"] = map_df.index  # make index a column too
cols_map_df = map_df.columns.tolist()
cols_map_df = [cols_map_df[1]] + [cols_map_df[0]]
map_df = map_df[cols_map_df]

# join map_df to uniprot_regsite_df
output_df = uniprot_regsite_df.merge(
    map_df, how="left", left_on=PHOSPHOPEPTIDE, right_on="p-peptide"
)

output_df = output_df[
    [
        PHOSPHOPEPTIDE,
        SEQUENCE10,
        SEQUENCE7,
        GENE_NAME,
        PHOSPHORESIDUE,
        UNIPROT_ID,
        DESCRIPTION,
        FUNCTION_PHOSPHORESIDUE,
        PUTATIVE_UPSTREAM_DOMAINS,
    ]
]

# join output_df back to quantitative columns in data_in df
quant_cols = data_in.columns.tolist()
quant_cols = quant_cols[1:]
quant_data = data_in[quant_cols]

# ----------- Write merge/filter metadata to SQLite database (start) -----------
# Open SwissProt SQLite database
conn = sql.connect(output_sqlite)
cur = conn.cursor()

cur.executescript(MRGFLTR_DDL)

# Record provenance citations for both the view and the base table.
cur.execute(
    CITATION_INSERT_STMT,
    ("mrgfltr_metadata_view", CITATION_INSERT_PSP),
)
cur.execute(
    CITATION_INSERT_STMT, ("mrgfltr_metadata", CITATION_INSERT_PSP)
)
cur.execute(
    CITATION_INSERT_STMT,
    ("mrgfltr_metadata_view", CITATION_INSERT_PSP_REF),
)
cur.execute(
    CITATION_INSERT_STMT, ("mrgfltr_metadata", CITATION_INSERT_PSP_REF)
)

# Read ppep-to-sequence LUT
ppep_lut_df = pandas.read_sql_query(PPEP_ID_SQL, conn)
# write only metadata for merged/filtered records to SQLite
mrgfltr_metadata_df = output_df.copy()
# replace phosphopeptide seq with ppep.id
mrgfltr_metadata_df = ppep_lut_df.merge(
    mrgfltr_metadata_df,
    left_on="ppep_seq",
    right_on=PHOSPHOPEPTIDE,
    how="inner",
)
mrgfltr_metadata_df.drop(
    columns=[PHOSPHOPEPTIDE, "ppep_seq"], inplace=True
)
# rename columns
mrgfltr_metadata_df.columns = MRGFLTR_METADATA_COLUMNS
mrgfltr_metadata_df.to_sql(
    "mrgfltr_metadata",
    con=conn,
    if_exists="append",
    index=False,
    method="multi",
)

# Close SwissProt SQLite database
conn.close()
# ----------- Write merge/filter metadata to SQLite database (finish) -----------

# Right-join so every quantified phosphopeptide appears in the output even
# when it picked up no metadata.
output_df = output_df.merge(
    quant_data,
    how="right",
    left_on=PHOSPHOPEPTIDE,
    right_on=PHOSPHOPEPTIDE_MATCH,
)
output_cols = output_df.columns.tolist()
output_cols = output_cols[:-1]
output_df = output_df[output_cols]

# cosmetic changes to Upstream column
output_df[PUTATIVE_UPSTREAM_DOMAINS] = output_df[
    PUTATIVE_UPSTREAM_DOMAINS
].fillna(
    ""
)  # fill the NaN with "" for those Phosphopeptides that got a "WARNING: Failed match for " in the upstream mapping
us_series = pandas.Series(output_df[PUTATIVE_UPSTREAM_DOMAINS])
i = 0
while i < len(us_series):
    # turn blanks into N_A to signify the info was searched for but cannot be found
    if us_series[i] == "":
        us_series[i] = N_A
    i += 1
output_df[PUTATIVE_UPSTREAM_DOMAINS] = us_series

end_time = time.process_time()  # timer
# (typo "establisheed" in the log message corrected)
print(
    "%0.6f established output [3]" % (end_time - start_time,),
    file=sys.stderr,
)  # timer

(output_rows, output_cols) = output_df.shape

output_df = output_df.convert_dtypes(convert_integer=True)

# Output onto Final CSV file
output_df.to_csv(output_filename_csv, index=False)
output_df.to_csv(
    output_filename_tab, quoting=None, sep="\t", index=False
)

end_time = time.process_time()  # timer
print(
    "%0.6f wrote output [4]" % (end_time - start_time,),
    file=sys.stderr,
)  # timer

print(
    "{:>10} phosphopeptides written to output".format(str(output_rows))
)

end_time = time.process_time()  # timer
print(
    "%0.6f seconds of non-system CPU time were consumed"
    % (end_time - start_time,),
    file=sys.stderr,
)  # timer

# Rev. 7/1/2016
# Rev. 7/3/2016 : fill NaN in Upstream column to replace to N/A's
# Rev. 7/3/2016: renamed Upstream column to PUTATIVE_UPSTREAM_DOMAINS
# Rev. 12/2/2021: Converted to Python from ipynb; use fast Aho-Corasick searching; \
#   read from SwissProt SQLite database
# Rev. 12/9/2021: Transfer code to Galaxy tool wrapper

#
# copied from Excel Output Script.ipynb END #
#
1540 | |
# Run the whole merge/filter pipeline. `catch` already prints a stack
# trace and calls exit(-1) on any exception raised inside
# mqpep_getswissprot, so this outer try is a last-resort guard for
# failures in `catch` itself.
try:
    catch(
        mqpep_getswissprot,
    )
    exit(0)
except Exception as e:
    exit("Internal error running mqpep_getswissprot(): %s" % (e))
1548 | |
1549 | |
# Script entry point: `__main__` is the (unconventionally named)
# top-level function defined earlier in this file.
if __name__ == "__main__":
    __main__()