diff t_coffee_to_cigar.py @ 1:b3833e5b50d4 draft

planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/t_coffee commit 011cabb2a2b3237bbbc4850ed26972816702a2ba-dirty
author earlhaminst
date Mon, 19 Dec 2016 17:47:31 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/t_coffee_to_cigar.py	Mon Dec 19 17:47:31 2016 -0500
@@ -0,0 +1,47 @@
+#!/usr/bin/env python
+""" A script to convert gapped FASTA sequences into CIGAR strings (M = non-gap character, D = gap) """
+from __future__ import print_function
+
+import re
+import sys
+
+
+# Matches any single character that is not the gap symbol '-'
+FASTA_MATCH_RE = re.compile(r'[^-]')
+
+
+def convert_and_print(header, sequence):
+    """Print ``header`` and the CIGAR string of ``sequence``, tab-separated.
+
+    Every run of gap characters ('-') becomes a D operation and every run of
+    any other characters becomes an M operation. A run longer than one is
+    prefixed with its length; a run of exactly one is written as the bare
+    letter (e.g. 'AC--GTT-' gives '2M2D3MD').
+
+    An empty sequence yields an empty CIGAR string instead of raising
+    IndexError.
+    """
+    ops = []
+    # Each regex match is one maximal run of gaps or of non-gap characters
+    for run in re.finditer(r'-+|[^-]+', sequence):
+        text = run.group()
+        if len(text) > 1:
+            ops.append(str(len(text)))
+        ops.append('D' if text[0] == '-' else 'M')
+    print("%s\t%s" % (header, ''.join(ops)))
+
+
+def main():
+    """Read the FASTA file named by sys.argv[1] and print one line per record.
+
+    Each '>' line starts a new record; the lines that follow are stripped of
+    surrounding whitespace and concatenated into that record's sequence,
+    which is then passed to convert_and_print() together with the header
+    text after the '>'.
+
+    Lines that appear before the first header (including blank lines) are
+    ignored instead of raising TypeError. Records whose header is empty are
+    skipped.
+    """
+    header = None
+    chunks = []
+    with open(sys.argv[1]) as fh:
+        for line in fh:
+            line = line.strip()
+            if line.startswith('>'):
+                # A new record begins: flush the one collected so far
+                if header:
+                    convert_and_print(header, ''.join(chunks))
+                header = line[1:]
+                chunks = []
+            elif header is not None:
+                chunks.append(line)
+    # Flush the last record, which has no following header to trigger it
+    if header:
+        convert_and_print(header, ''.join(chunks))
+
+
+# Run the conversion only when executed as a script, not when imported
+if __name__ == "__main__":
+    main()