Mercurial > repos > cpt > cpt_bprom_converter
annotate bprom_gff3_converter.py @ 0:14e64dda68c8 draft default tip
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
author | cpt |
---|---|
date | Sun, 23 Jul 2023 01:47:35 +0000 |
parents | |
children |
rev | line source |
---|---|
0
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
1 import argparse |
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
2 |
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
3 import pandas as pd |
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
4 import re |
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
5 from typing import List, Match, Dict, TextIO, Union |
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
6 from datetime import date |
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
7 |
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
8 # In this file, a "feature" refers to the collection of data between the > keys of the bprom output. |
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
9 # That collection of data refers to one section of the DNA upstream of a gene |
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
10 |
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
11 |
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
def read_bprom_file(bprom_file) -> List[str]:
    """Read a BPROM output file into a list of its lines.

    Args:
        bprom_file: Path of the file to read (anything ``open`` accepts).

    Returns:
        One string per line, in file order. Line terminators are kept,
        exactly as produced by iterating over the open file object.
    """
    with open(bprom_file) as file:
        # list() drains the file iterator directly; no need to append each
        # line to an accumulator by hand.
        return list(file)
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
21 |
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
22 |
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
def concatenate_then_split(contents) -> List[str]:
    """Join the file's lines into one string and split it into features.

    Args:
        contents: Iterable of strings (the lines of the bprom output).

    Returns:
        The joined text split on ">", so that each element holds the data of
        one feature. Note that an empty input yields ``[""]`` because that is
        what ``"".split(">")`` returns.
    """
    concat_contents = "".join(contents)

    # The text is expected to begin with the ">" that opens the first feature.
    # Drop that single marker so the split does not produce an empty leading
    # element. Only strip it when it is actually there: slicing off the first
    # character unconditionally would eat a real character of the first
    # feature whenever the text does not start with ">".
    if concat_contents.startswith(">"):
        concat_contents = concat_contents[1:]

    return concat_contents.split(">")
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
36 |
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
37 |
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
def remove_promoterless_features(features) -> List[str]:
    """Return the features that have at least one predicted promoter.

    A feature is dropped when its text contains the marker
    "Number of predicted promoters - 0".

    Args:
        features: Iterable of feature strings, one per bprom record.

    Returns:
        A new list with the promoterless features filtered out; the relative
        order of the remaining features is preserved. The input is left
        untouched (building a new list also removes the need to delete
        indices in reverse order).
    """
    # NOTE(review): the marker is matched literally, with exactly one space
    # after the dash -- confirm this matches the spacing of the bprom output
    # that reaches this function.
    return [
        feature
        for feature in features
        if "Number of predicted promoters - 0" not in feature
    ]
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
52 |
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
53 |
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
def extract_accession(feature) -> str:
    """Extract the accession from a feature string.

    The match starts at the first word character that is followed, on the
    same line, by an underscore, and extends (greedily) up to the last
    underscore on that line. Every underscore inside the match is then
    removed and surrounding whitespace is stripped.

    Args:
        feature: One feature string from the split bprom output.

    Returns:
        The accession text with all underscores removed.

    Raises:
        AttributeError: If no underscore-terminated text is found
            (``re.search`` returns ``None``).
    """
    # Raw string: "\w" in a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    accession = re.search(r"[\w](.*)(?=_)", feature)
    # NOTE(review): replace() strips *every* underscore, so an accession such
    # as "NC_000866" comes out as "NC000866" -- confirm this is intended.
    accession = accession.group().replace("_", "").strip()

    return accession
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
60 |
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
61 |
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
def extract_test_seq_position(feature) -> List[str]:
    """Return the pieces of the 'Location=[...]' entry of a feature.

    A lookbehind anchors the match right after "Location=[" and a lookahead
    ends it right before "](", so only the text between the brackets is
    captured. That text is then split on ":".

    Returns:
        The colon-separated parts as strings, e.g. ["10", "110"].
    """
    bracket_contents = re.search(r"(?<=Location=\[)(.*)(?=]\()", feature)
    return bracket_contents.group().split(":")
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
70 |
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
71 |
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
def extract_strand_direction(feature) -> str:
    """Return the single character found between parentheses in a feature.

    For bprom features this is the strand marker, "+" or "-".
    """
    # First occurrence of exactly one character wrapped in "(" and ")"
    strand_match = re.search(r"(?<=\().(?=\))", feature)
    return strand_match.group()
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
79 |
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
80 |
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
def _parse_promoter_box(pattern, feature):
    """Return (position, sequence, score) from the box line matched by *pattern*."""
    # The match covers everything after "box at pos." up to the end of that
    # line. Splitting on runs of whitespace (rather than on a single space)
    # keeps the fields aligned no matter how many spaces bprom pads with, and
    # ignores trailing whitespace such as a stray "\r".
    fields = re.search(pattern, feature).group().split()
    return int(fields[0]), fields[1], fields[-1]


def extract_promoter_data(feature) -> Dict[str, Union[str, int]]:
    """Extract the -10 and -35 box data of one feature.

    Use on a single element of the output of concatenate_then_split().

    Args:
        feature: One feature string containing a "-10 box at pos." line and
            a "-35 box at pos." line, each also containing " Score".

    Returns:
        A dict with the keys "minus10_pos", "minus10_seq", "minus10_score",
        "minus35_pos", "minus35_seq" and "minus35_score" (in that order).
        Positions are ints; sequences and scores are strings.

    Raises:
        AttributeError: If one of the box lines is missing from the feature.
        ValueError: If a position is not an integer.
    """
    minus10_pos, minus10_seq, minus10_score = _parse_promoter_box(
        "(?<=-10 box at pos.)(.*)(?= Score)(.*)", feature
    )
    minus35_pos, minus35_seq, minus35_score = _parse_promoter_box(
        "(?<=-35 box at pos.)(.*)(?= Score)(.*)", feature
    )

    # These keys end up in column 9 of the output; change them here to change
    # what is written there.
    promoter_data = {
        "minus10_pos": minus10_pos,
        "minus10_seq": minus10_seq,
        "minus10_score": minus10_score,
        "minus35_pos": minus35_pos,
        "minus35_seq": minus35_seq,
        "minus35_score": minus35_score,
    }
    return promoter_data
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
112 |
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
113 |
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
def convert_extracted_promoter_data_to_ID_column_format(
    promoter_data, calculated_promoter_positions
) -> str:
    """Build the GFF3 attributes string (column 9) for one promoter.

    Args:
        promoter_data: Mapping of promoter attribute names to values, as
            returned by extract_promoter_data().
        calculated_promoter_positions: Sequence whose element 2 is the
            calculated -10 position and whose element 3 is the calculated
            -35 position.

    Returns:
        "Description=Predicted promoter data;Note=<key=value,...>;" where the
        key=value pairs follow the order of *promoter_data* and the
        "minus10_pos" / "minus35_pos" values are the calculated ones.
    """
    # Work on a copy so the caller's dict keeps the original BPROM positions.
    attributes = dict(promoter_data)

    # Replaces the BPROM output positions with the calculated ones
    attributes["minus10_pos"] = calculated_promoter_positions[2]
    attributes["minus35_pos"] = calculated_promoter_positions[3]

    # Creates the column 9 string (attributes)
    note = ",".join(
        "{}={}".format(key, value) for key, value in attributes.items()
    )
    return "Description=Predicted promoter data;" + "Note=" + note + ";"
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
131 |
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
132 |
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
def extract_LDF_score(feature) -> str:
    """Return the text following 'LDF-' on its line, stripped of whitespace."""
    ldf_match = re.search("(?<=LDF-)(.*)", feature)
    return ldf_match.group().strip()
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
139 |
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
140 |
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
def calculate_promoter_position(feature):
    """Calculate promoter positions (in the context of the genome) based on BPROM predictions.

    Args:
        feature: One feature string from the split bprom output.

    Returns:
        A list ``[start, end, minus10_pos, minus35_pos]`` of ints, all
        expressed as genome coordinates derived from the feature's
        'Location=[start:end]' entry.

    Raises:
        ValueError: If the strand of the feature is neither '+' nor '-'.
    """
    # Get 'Promoter Pos: X' data. This refers to the predicted transcriptional start site!
    promoter_pos = re.search("(?<=Promoter Pos:)(.*)(?=LDF)", feature)
    promoter_pos = int(promoter_pos.group().strip())

    # Get start and end positions from 'Location=[XXX:YYYY]'
    test_seq_position = extract_test_seq_position(feature)
    test_cds_location_start_pos = int(test_seq_position[0])
    test_cds_location_end_pos = int(test_seq_position[1])

    promoter_data = extract_promoter_data(feature)
    minus10_pos = promoter_data["minus10_pos"]
    minus35_pos = promoter_data["minus35_pos"]

    # IMPORTANT!! Whether you add or subtract to calculate the promoter
    # positions depends on whether we're on the + or - strand.
    # Per the original author's note, the upstream workflow pulls the region
    # upstream of the gene for both strands (left for +, right for -):
    # for a gene starting at 930 on the + strand it pulls 830:930, and for a
    # gene starting at 930 on the - strand it pulls 930:1030.
    direction = extract_strand_direction(feature)

    if direction == "+":
        # BPROM starts counting from the LEFT boundary for + strand test sequences
        return [
            test_cds_location_start_pos + minus35_pos,  # start
            test_cds_location_start_pos + promoter_pos,  # end
            test_cds_location_start_pos + minus10_pos,
            test_cds_location_start_pos + minus35_pos,
        ]

    if direction == "-":
        # BPROM starts counting from the RIGHT boundary for - strand test
        # sequences, so the roles of start and end are reversed.
        return [
            test_cds_location_end_pos - promoter_pos,  # start
            test_cds_location_end_pos - minus35_pos,  # end
            test_cds_location_end_pos - minus10_pos,
            test_cds_location_end_pos - minus35_pos,
        ]

    # A bare `assert "<message>"` never fires (a non-empty string is truthy),
    # which silently returned None here; fail loudly instead.
    raise ValueError("Error: Strand data neither '+' nor '-'")
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
206 |
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
207 |
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
def extract_tf_binding_elements():
    """Extract predicted transcription factor binding elements.

    Not implemented: takes no input and always returns None.
    """
    return None
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
211 |
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
212 |
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
def extract_data_for_all_features(features) -> List[List[Union[str, int]]]:
    """Build one GFF3-ordered row per feature of the cleaned bprom output.

    ``features`` is iterated once. Each element is one feature in the original
    bprom format, held as a single string that has already been cleaned of
    irrelevant data. The rows are returned as a list of nine-element lists,
    which is the structure loaded into a dataframe.
    """
    rows = []
    for feature in features:
        positions = calculate_promoter_position(feature)
        # The attributes column combines the raw promoter data with the
        # calculated positions.
        attributes = convert_extracted_promoter_data_to_ID_column_format(
            extract_promoter_data(feature), positions
        )

        row = [
            extract_accession(feature),  # col 1: seqid
            "bprom",  # col 2: source
            "promoter",  # col 3: type
            positions[0],  # col 4: start
            positions[1],  # col 5: end
            extract_LDF_score(feature),  # col 6: score
            extract_strand_direction(feature),  # col 7: strand
            ".",  # col 8: phase
            attributes,  # col 9: attributes
        ]
        rows.append(row)

    return rows
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
241 |
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
242 |
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
def convert_to_dataframe(extracted_data) -> pd.DataFrame:
    """Load the extracted rows into a Pandas dataframe labelled with the gff3 column names.

    ``extracted_data`` is passed straight to ``pd.DataFrame``; the nine column
    labels are applied in gff3 order.
    """
    gff3_columns = [
        "seqid",
        "source",
        "type",
        "start",
        "end",
        "score",
        "strand",
        "phase",
        "attributes",
    ]
    return pd.DataFrame(extracted_data, columns=gff3_columns)
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
261 |
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
262 |
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
def write_to_gff3(dataframe) -> None:
    """Print the DataFrame to stdout as gff3 text.

    The output is the ``##gff-version 3`` header line, printed so Galaxy can
    recognize the data as GFF3, followed by the rows of ``dataframe`` as tab
    separated values with neither the index nor the column names.

    An empty DataFrame is accepted and produces the header with no rows.
    Earlier code looked up the first accession and today's date for a file
    name that is no longer written; that dead lookup raised IndexError on an
    empty DataFrame and has been removed.
    """
    # header=False is the documented way to omit the column names.
    tsv = dataframe.to_csv(sep="\t", index=False, header=False)

    # Header so Galaxy can recognize as GFF3. print() appends its own newline,
    # so the header is followed by a blank line, as before.
    print("##gff-version 3\n")
    print(tsv)
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
279 |
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
280 |
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
def convert_bprom_output_to_gff3(bprom_file) -> None:
    """Master function. Takes a BPROM .txt output file, extracts its data and writes it as GFF3.

    Chains the module's steps in order: read the file, concatenate and split
    it into features, drop the promoterless features, extract the data,
    load it into a dataframe, and write the gff3 text.
    """
    bprom_contents = read_bprom_file(bprom_file)
    features = remove_promoterless_features(concatenate_then_split(bprom_contents))
    gff3_dataframe = convert_to_dataframe(extract_data_for_all_features(features))
    # Create the gff3 text
    write_to_gff3(gff3_dataframe)
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
292 |
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
293 |
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
if __name__ == "__main__":
    ## Shows the DataFrame output in the terminal for testing/debugging
    # bprom_file = read_bprom_file('BPROM_output.txt')
    # concatenated_bprom_file: List[str] = concatenate_then_split(bprom_file)
    # working_file = remove_promoterless_features(concatenated_bprom_file)
    # print(convert_to_dataframe(extract_data_for_all_features(working_file)).to_string())

    parser = argparse.ArgumentParser(
        description="converts BPROM output to the gff3 file format"
    )

    # -f is mandatory: without it args.f would be None and None would be
    # handed to the converter in place of a file path. Marking it required
    # makes argparse report a usage error instead.
    parser.add_argument("-f", required=True, help="bprom file as .txt")
    args = parser.parse_args()
    # Actual function for converting the BPROM output to gff3
    convert_bprom_output_to_gff3(args.f)
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
309 |
14e64dda68c8
planemo upload commit 852ac96ca53a2ffa0947e6df5e24671866b642f5
cpt
parents:
diff
changeset
|
310 # Upload to cpt github in the directory Galaxy-Tools/tools/external/ |