annotate bprom_gff3_converter.py @ 0:14e64dda68c8 draft default tip

planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
author cpt
date Sun, 23 Jul 2023 01:47:35 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
1 import argparse
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
2
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
3 import pandas as pd
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
4 import re
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
5 from typing import List, Match, Dict, TextIO, Union
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
6 from datetime import date
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
7
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
8 # In this file, a "feature" refers to the collection of data between the > keys of the bprom output.
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
9 # That collection of data refers to one section of the DNA upstream of a gene
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
10
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
11
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
def read_bprom_file(bprom_file) -> List[str]:
    """Read a BPROM output file into a list of lines.

    Args:
        bprom_file: Path of the BPROM output file; passed unchanged to ``open``.

    Returns:
        One string per line of the file, in file order, with line endings kept.
    """
    with open(bprom_file) as file:
        # readlines() builds the same per-line list the manual append loop did.
        return file.readlines()
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
21
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
22
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
def concatenate_then_split(contents) -> List[str]:
    """Join the file's lines and split the result into one string per feature.

    Args:
        contents: Iterable of strings (the lines of a BPROM output file).

    Returns:
        The joined text split on '>', so each element holds the data of one
        feature. An empty input yields an empty list.
    """
    concat_contents = "".join(contents)

    # The text normally starts with '>', which would produce an empty first
    # element when splitting. Drop only that marker; never a real character.
    if concat_contents.startswith(">"):
        concat_contents = concat_contents[1:]

    # Nothing left means there are no features; avoid returning [""] which
    # downstream extractors cannot parse.
    if not concat_contents:
        return []

    return concat_contents.split(">")
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
36
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
37
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
def remove_promoterless_features(features) -> List[str]:
    """Drop every feature for which BPROM predicted zero promoters.

    Args:
        features: List of feature strings. The list is filtered IN PLACE.

    Returns:
        The same list object that was passed in, now containing only the
        features that do not include 'Number of predicted promoters - 0'.
    """
    # Slice assignment keeps the in-place semantics callers may rely on while
    # replacing the index bookkeeping and reverse-order deletes with one pass.
    features[:] = [
        feature
        for feature in features
        if "Number of predicted promoters - 0" not in feature
    ]
    return features
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
52
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
53
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
def extract_accession(feature) -> str:
    """Extract the accession from a feature string.

    The leftmost match that starts at a word character and runs up to (but not
    including) the last underscore on the same line is taken; all underscores
    inside it are then removed and surrounding whitespace is stripped.

    Args:
        feature: One feature string from the BPROM output.

    Returns:
        The accession text with underscores removed.

    Raises:
        ValueError: If no accession-like text followed by '_' is found.
    """
    # Raw string: "\w" in a normal string literal is an invalid escape.
    match = re.search(r"[\w](.*)(?=_)", feature)
    if match is None:
        raise ValueError("Could not find an accession in the BPROM feature")

    return match.group().replace("_", "").strip()
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
60
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
61
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
def extract_test_seq_position(feature) -> List[str]:
    """Extract the tested sequence's position from 'Location=[START:END]('.

    Lookbehind and lookahead assertions are used so that only the text between
    'Location=[' and the following '](' is captured.

    Args:
        feature: One feature string from the BPROM output.

    Returns:
        The captured text split on ':', e.g. ['830', '930'] (still strings).

    Raises:
        ValueError: If the feature has no 'Location=[...](' section.
    """
    match = re.search(r"(?<=Location=\[)(.*)(?=]\()", feature)
    if match is None:
        raise ValueError("Could not find 'Location=[...](' in the BPROM feature")

    return match.group().split(":")
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
70
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
71
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
def extract_strand_direction(feature) -> str:
    """Extract the strand direction of a feature, expected to be '+' or '-'.

    Args:
        feature: One feature string from the BPROM output.

    Returns:
        The single character found between the first '(' and ')' pair that
        encloses exactly one character. It is not validated here.

    Raises:
        ValueError: If no single character enclosed in parentheses is found.
    """
    # Matches the one character inside '(.)'
    match = re.search(r"(?<=\().(?=\))", feature)
    if match is None:
        raise ValueError("Could not find a strand direction '(+)' or '(-)'")

    return match.group()
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
79
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
80
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
def _parse_promoter_box(feature, box_label):
    """Parse one '<box_label> box at pos.' line into (position, sequence, score)."""
    pattern = r"(?<=" + re.escape(box_label) + r" box at pos.)(.*)(?= Score)(.*)"
    match = re.search(pattern, feature)
    if match is None:
        raise ValueError(
            "Could not find '{} box at pos. ... Score' in the BPROM feature".format(
                box_label
            )
        )

    # BPROM pads its columns with a variable number of spaces, so split on any
    # run of whitespace instead of on a single space.
    fields = match.group().split()
    return int(fields[0]), fields[1], fields[-1]


def extract_promoter_data(feature) -> Dict[str, Union[str, int]]:
    """Extract the -10 and -35 promoter box data from one feature.

    Use for one element in the output of concatenate_then_split().

    Args:
        feature: One feature string from the BPROM output.

    Returns:
        A dict with the keys minus10_pos, minus10_seq, minus10_score,
        minus35_pos, minus35_seq and minus35_score. Positions are ints;
        sequences and scores are strings.

    Raises:
        ValueError: If a box line is missing or its position is not an integer.
    """
    minus10_pos, minus10_seq, minus10_score = _parse_promoter_box(feature, "-10")
    minus35_pos, minus35_seq, minus35_score = _parse_promoter_box(feature, "-35")

    # Can change these keys to change the column 9
    return {
        "minus10_pos": minus10_pos,
        "minus10_seq": minus10_seq,
        "minus10_score": minus10_score,
        "minus35_pos": minus35_pos,
        "minus35_seq": minus35_seq,
        "minus35_score": minus35_score,
    }
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
112
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
113
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
def convert_extracted_promoter_data_to_ID_column_format(
    promoter_data, calculated_promoter_positions
) -> str:
    """Build the GFF3 attributes column (column 9) for one promoter.

    Args:
        promoter_data: Dict as returned by extract_promoter_data(). It is not
            modified.
        calculated_promoter_positions: Sequence whose items at index 2 and 3
            are the genome-context -10 and -35 positions.

    Returns:
        A string of the form
        'Description=Predicted promoter data;Note=key=value,key=value,...;'.
    """
    # Work on a copy so the caller's dict keeps the raw BPROM positions.
    attributes = dict(promoter_data)

    # Replaces the BPROM output positions with the calculated ones
    attributes["minus10_pos"] = calculated_promoter_positions[2]
    attributes["minus35_pos"] = calculated_promoter_positions[3]

    # NOTE(review): the GFF3 spec reserves '=' and ',' inside attribute values;
    # the existing output format is kept as-is for downstream compatibility.
    pairs = ["{}={}".format(key, value) for key, value in attributes.items()]
    return "Description=Predicted promoter data;" + "Note=" + ",".join(pairs) + ";"
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
131
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
132
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
def extract_LDF_score(feature) -> str:
    """Extract the LDF score of a feature.

    Args:
        feature: One feature string from the BPROM output.

    Returns:
        The text following 'LDF-' up to the end of that line, stripped of
        surrounding whitespace (kept as a string).

    Raises:
        ValueError: If the feature contains no 'LDF-' marker.
    """
    match = re.search(r"(?<=LDF-)(.*)", feature)
    if match is None:
        raise ValueError("Could not find 'LDF-' in the BPROM feature")

    return match.group().strip()
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
139
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
140
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
def calculate_promoter_position(feature) -> List[int]:
    """Calculate promoter positions in genome context from BPROM predictions.

    Args:
        feature: One feature string from the BPROM output.

    Returns:
        A list ``[start, end, minus10_pos, minus35_pos]`` where every value is
        a position in the context of the genome.

    Raises:
        ValueError: If 'Promoter Pos:' cannot be found, or if the strand
            direction is neither '+' nor '-'.
    """
    # Get 'Promoter Pos: X' data. This refers to the predicted transcriptional start site!
    promoter_match = re.search(r"(?<=Promoter Pos:)(.*)(?=LDF)", feature)
    if promoter_match is None:
        raise ValueError("Could not find 'Promoter Pos: ... LDF' in the BPROM feature")
    promoter_pos = int(promoter_match.group().strip())

    # Get start and end positions from 'Location=[XXX:YYYY]'
    test_seq_position = extract_test_seq_position(feature)
    test_cds_location_start_pos = int(test_seq_position[0])
    test_cds_location_end_pos = int(test_seq_position[1])

    promoter_data = extract_promoter_data(feature)
    minus10_pos = promoter_data["minus10_pos"]
    minus35_pos = promoter_data["minus35_pos"]

    # IMPORTANT!! Whether you add or subtract to calculate the promoter start
    # position depends on whether we're on the + or - strand!
    # The upstream workflow pulls the region upstream of the gene for both
    # strands (i.e., pulls left for +, pulls right for -).
    # THEREFORE, for a gene with a start at 930 on the + strand, it pulls 830:930
    # And for a gene with a start at 930 on the - strand, it pulls 930:1030
    direction = extract_strand_direction(feature)

    if direction == "+":
        # BPROM starts counting from the LEFT boundary for + strand test sequences
        minus10_pos_in_context_of_genome = test_cds_location_start_pos + minus10_pos
        minus35_pos_in_context_of_genome = test_cds_location_start_pos + minus35_pos
        start = test_cds_location_start_pos + minus35_pos
        end = test_cds_location_start_pos + promoter_pos
    elif direction == "-":
        # BPROM starts counting from the RIGHT boundary for - strand test sequences
        minus10_pos_in_context_of_genome = test_cds_location_end_pos - minus10_pos
        minus35_pos_in_context_of_genome = test_cds_location_end_pos - minus35_pos
        # The start and end are reversed
        start = test_cds_location_end_pos - promoter_pos
        end = test_cds_location_end_pos - minus35_pos
    else:
        # A bare `assert "message"` is always true; fail loudly instead of
        # silently returning None.
        raise ValueError("Error: Strand data neither '+' nor '-'")

    return [
        start,
        end,
        minus10_pos_in_context_of_genome,
        minus35_pos_in_context_of_genome,
    ]
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
206
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
207
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
def extract_tf_binding_elements():
    """Placeholder for extracting predicted transcription factor binding elements.

    Not implemented: takes no arguments and always returns None.
    """
    return None
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
211
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
212
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
def extract_data_for_all_features(features) -> List[List[Union[str, int]]]:
    """Turn every cleaned bprom feature into one GFF3-ordered row.

    ``features`` is a List[str] holding each feature in the original bprom
    format as a single string, already cleaned of irrelevant data.  The
    returned list holds one nine-element row per feature, ordered like the
    GFF3 columns, ready to be loaded into a dataframe.
    """

    def build_row(single_feature):
        # The promoter positions feed both the start/end columns and the
        # attributes column, so they are computed once up front.
        positions = calculate_promoter_position(single_feature)
        attributes = convert_extracted_promoter_data_to_ID_column_format(
            extract_promoter_data(single_feature), positions
        )
        return [
            extract_accession(single_feature),  # col 1: seqid
            "bprom",  # col 2: source
            "promoter",  # col 3: type
            positions[0],  # col 4: start
            positions[1],  # col 5: end
            extract_LDF_score(single_feature),  # col 6: score
            extract_strand_direction(single_feature),  # col 7: strand
            ".",  # col 8: phase
            attributes,  # col 9: attributes
        ]

    return [build_row(feature) for feature in features]
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
241
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
242
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
def convert_to_dataframe(extracted_data) -> pd.DataFrame:
    """Load the extracted rows into a pandas DataFrame labelled with the gff3 column names."""
    # Column labels, in the order the rows were assembled.
    gff3_columns = [
        "seqid",
        "source",
        "type",
        "start",
        "end",
        "score",
        "strand",
        "phase",
        "attributes",
    ]
    return pd.DataFrame(extracted_data, columns=gff3_columns)
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
261
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
262
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
def write_to_gff3(dataframe) -> None:
    """Print the DataFrame to standard output as GFF3 text.

    The frame is serialised as tab separated values (no index, no column-name
    row) and printed after a ``##gff-version 3`` header line.  No file is
    opened here; the text goes to stdout only.

    An empty DataFrame is valid input: only the header is printed.  Earlier
    versions read ``dataframe.iloc[0][0]`` for an unused file-name variable,
    which raised IndexError whenever no promoter rows were present.
    """
    # header=False is the documented way to suppress the column-name row.
    tsv = dataframe.to_csv(sep="\t", index=False, header=False)

    # Header so Galaxy can recognize as GFF3.  The embedded "\n" and the
    # newline print() appends are kept so the emitted text is unchanged.
    print("##gff-version 3\n")
    print(tsv)
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
279
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
280
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
def convert_bprom_output_to_gff3(bprom_file) -> None:
    """Master function. Run the full BPROM-to-GFF3 pipeline for one BPROM .txt file.

    The file is read, regrouped into features, stripped of promoterless
    features, converted to GFF3 rows and finally handed to ``write_to_gff3``.
    """
    bprom_contents = read_bprom_file(bprom_file)
    all_features = concatenate_then_split(bprom_contents)
    promoter_features = remove_promoterless_features(all_features)
    gff3_rows = extract_data_for_all_features(promoter_features)
    # Emit the gff3 text
    write_to_gff3(convert_to_dataframe(gff3_rows))
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
292
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
293
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
if __name__ == "__main__":
    # Command-line entry point: parse the -f option and convert that BPROM
    # output to gff3.
    #
    # Debugging tip: to inspect the intermediate DataFrame in the terminal,
    # run the pipeline steps by hand and print
    # convert_to_dataframe(extract_data_for_all_features(...)).to_string().
    cli = argparse.ArgumentParser(
        description="converts BPROM output to the gff3 file format"
    )
    cli.add_argument("-f", help="bprom file as .txt")
    options = cli.parse_args()

    # Actual conversion of the BPROM output to gff3
    convert_bprom_output_to_gff3(options.f)
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
309
14e64dda68c8 planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff changeset
310 # Upload to cpt github in the directory Galaxy-Tools/tools/external/