diff reading_garnier_output.py @ 1:edd518e72c89 draft

planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author cpt
date Mon, 05 Jun 2023 02:52:09 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/reading_garnier_output.py	Mon Jun 05 02:52:09 2023 +0000
@@ -0,0 +1,133 @@
+#!/usr/bin/env python
+
+import csv
+import argparse
+
+# import sys
+
+# This function reads through the tagseq file and outputs a list of sequence names and the lengths of each sequence.
+def garnier_sequences(tagseq_file=None):
+    # open the file and create blank lists
+    f = tagseq_file  # open(tagseq_file, 'r')
+    f.seek(0)
+    sequence = []
+    lengths = []
+
+    # for each line the in file, search for the words 'Sequence' and 'to' to find the sequence name and length,
+    # respectively. Then add sequence names and lengths to the proper lists
+    for line in f:
+        words = line.split()
+        if line.startswith("# Sequence:"):
+            # if 'Sequence:' in line:
+            # if words[1] == 'Sequence:':
+            sequence += [words[words.index("Sequence:") + 1]]
+            # if words[5] == 'to:':
+            #    lengths += [int(words[6])]
+            if words.index("to:"):
+                lengths += [int(words[words.index("to:") + 1])]
+    # return the sequence names and lengths
+    return sequence, lengths
+
+
+# This function extracts the helix, sheet, turn, and coil predictions from the file. The predictions for each type of
+# secondary structure are joined together in one string.
+def garnier_secondary_struct(tagseq_file=None):
+    # opens the file and sets variables for the structural predictions
+    f = tagseq_file  # open(tagseq_file, 'r')
+    helix = ""
+    turns = ""
+    coil = ""
+    sheet = ""
+
+    # if the first work in the line indicates a structural prediction, it adds the rest of the line to the right
+    # prediction string.
+    for line in f:
+        words = line.split()
+        if len(words) > 0:
+            if words[0] in "helix":
+                helix += str(line[6:]).rstrip("\n")
+            elif words[0] in "sheet":
+                sheet += str(line[6:]).rstrip("\n")
+            elif words[0] in "turns":
+                turns += str(line[6:]).rstrip("\n")
+            elif words[0] in "coil":
+                coil += str(line[6:]).rstrip("\n")
+    # f.close()
+    # returns the four structural prediction strings
+    return helix, turns, coil, sheet
+
+
+# This functions cuts the strings based on the lengths of the original sequences. Lengths are given in a list.
+def vector_cutter(vector, lengths_to_cut):
+    # sets up iteration variables
+    start = 0
+    end = lengths_to_cut[0]
+    maximum = len(lengths_to_cut)
+    # creates output list
+    output = []
+    # loops through the number of sequences based on the number of lengths
+    for i in range(maximum):
+        # outputs list of sequence strings
+        output += [str(vector[start:end])]
+        start = end
+        if i + 1 != maximum:
+            end += lengths_to_cut[i + 1]
+    # returns list of strings. Each sequence has a string included in the list.
+    return output
+
+
+# this function takes the helix, turn, sheet, and coil predictions for each sequence and creates a single structural
+# prediction string.
+def single_prediction(helix, sheet, turns, coil):
+    # sets output list
+    secondary_structure = []
+    # checks to make sure each of the strings is the same length
+    if len(helix) == len(sheet) == len(coil) == len(turns):
+        # loops through the length of each sequence, and when the value is not a blank it is added to the output
+        # prediction list.
+        for j in range(len(helix)):
+            if helix[j] != " ":
+                secondary_structure += [str(helix[j])]
+            elif sheet[j] != " ":
+                secondary_structure += [str(sheet[j])]
+            elif coil[j] != " ":
+                secondary_structure += [str(coil[j])]
+            else:
+                secondary_structure += [str(turns[j])]
+    # returns the output prediction list for the sequence
+    return secondary_structure
+
+
+if __name__ == "__main__":
+    # Grab all of the filters from our plugin loader
+    parser = argparse.ArgumentParser(
+        description="Read Garnier Secondary Structure Prediction"
+    )
+    parser.add_argument(
+        "tagseq_file", type=argparse.FileType("r"), help="Tagseq file input"
+    )
+    args = parser.parse_args()
+
+    # opens the tagseq file and prepares for writing csv
+    # f = open(sys.stdout, 'w', newline='')
+    # writer = csv.writer(f)
+
+    # reads tagseq file for helix, turn, coil, and sheet sequences as well as for names and lengths of the sequences
+    # summarized in the tagseq file#!/usr/bin/env python\r
+    Hel, Tur, Coi, She = garnier_secondary_struct(**vars(args))
+    names, gives = garnier_sequences(**vars(args))
+
+    # cut each of the structural prediction strings so that they are individual sequences
+    Helix = vector_cutter(Hel, gives)
+    Sheet = vector_cutter(She, gives)
+    Turns = vector_cutter(Tur, gives)
+    Coil = vector_cutter(Coi, gives)
+
+    # for each sequence compile the four types of structural predictions into a single prediction, and output the final
+    # prediction in csv format and to the screen
+    for i in range(len(Helix)):
+        Final = single_prediction(Helix[i], Sheet[i], Turns[i], Coil[i])
+        # csv.writerow(['Sequence: '] + [names[i]])
+        # csv.writerow(Final)
+        print("Sequence Name: " + "\t" + names[i])
+        print("\t".join(Final))