diff genericize_db.py @ 0:14785481da2b draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/fragpipe commit 905cc2be18669cffe9ac6c46fcd08b6857a67f4f
author galaxyp
date Wed, 10 Jul 2024 06:15:00 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/genericize_db.py	Wed Jul 10 06:15:00 2024 +0000
@@ -0,0 +1,19 @@
+#!/usr/bin/env python3
+#
+# Prefixes with '>generic|' every sequence header in the input FASTA file that is not formatted according to the UniProt, NCBI, or ENSEMBL formats, so that Philosopher does not misinterpret it.
+#
+
+import re
+import sys
+
+input_db_file = sys.argv[1]
+output_db_file = sys.argv[2]
+
+
+def sub_header(line):
+    return re.sub(r'^>(?!sp\||tr\||db\||AP_|NP_|YP_|XP_|WP_|ENSP|UniRef|nxp|generic)', '>generic|', line)
+
+
+with open(input_db_file) as in_file, open(output_db_file, 'w') as out_file:
+    for line in in_file:
+        out_file.write(sub_header(line))