annotate detect_putative_ltr_wrapper.py @ 13:559940c04c44 draft

"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
author petr-novak
date Thu, 11 Aug 2022 07:29:06 +0000
parents ff01d4263391
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
12
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
1 #!/usr/bin/env python
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
2 """This wrapper is intended to be used on large genomes and large DANTE input to
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
3 minimize memory usage. It splits input files into pieces and analyzes them one by one with
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
4 detect_putative_ltr.R
13
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
5 If input does not exceed specified max-chunk_size, it will run detect_putative_ltr.R
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
6 directly
12
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
7 """
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
8
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
9 import argparse
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
10 import os
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
11 import sys
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
12 import tempfile
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
13 from itertools import cycle
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
14 import subprocess
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
15
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
16
13
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
class Gff3Feature:
    """
    Single record (one tab-separated line) of a GFF3 file.

    The nine GFF3 columns are stored as the attributes ``header``, ``source``,
    ``type``, ``start``, ``end``, ``score``, ``strand``, ``frame`` and
    ``attributes``; ``start`` and ``end`` are converted to ``int``.
    Equality and hashing use the recalculated line, while ``<`` and ``>``
    compare feature widths (``end - start``).
    """

    def __init__(self, line):
        """
        Parse one GFF3 line.
        :param line: tab-separated GFF3 record with nine columns
        :raises IndexError: if the line has fewer than nine columns
        :raises ValueError: if start/end are not integers or an attribute
            lacks the ``key=value`` form
        """
        self.line = line
        self.items = line.strip().split('\t')
        self.header = self.items[0]
        self.source = self.items[1]
        self.type = self.items[2]
        self.start = int(self.items[3])
        self.end = int(self.items[4])
        self.score = self.items[5]
        self.strand = self.items[6]
        self.frame = self.items[7]
        self.attributes = self.items[8]
        self.attributes_dict = {}
        for item in self.attributes.split(';'):
            if item != '':
                # split on the first '=' only, so values that themselves
                # contain '=' are kept intact instead of raising
                key, value = item.split('=', 1)
                self.attributes_dict[key] = value

        # normalized attribute column (no empty items, duplicate keys merged)
        self.attributes_str = ';'.join(
            ['{}={}'.format(key, value) for key, value in self.attributes_dict.items()]
        )

    def __str__(self):
        return self.line_recalculated()

    def __repr__(self):
        return self.line_recalculated()

    def __eq__(self, other):
        # let Python fall back to its default handling for foreign types
        if not isinstance(other, Gff3Feature):
            return NotImplemented
        return self.line_recalculated() == other.line_recalculated()

    def __hash__(self):
        return hash(self.line_recalculated())

    def get_line(self):
        """returns original line"""
        return self.line

    def overlap(self, other):
        """
        Check if two features overlap (closed intervals; only start/end are
        compared, the sequence name is not checked here)
        :param other: feature to compare with
        :return: True if the coordinate ranges share at least one position
        """
        return self.start <= other.end and self.end >= other.start

    def line_recalculated(self):
        """
        :return:
        string with recalculated line, built from the current attribute
        values and terminated by a newline
        """
        return '\t'.join(
            [self.header, self.source, self.type, str(self.start), str(self.end),
             self.score, self.strand, self.frame, self.attributes_str]
        ) + '\n'

    def __lt__(self, other):
        # ordering is by feature width, not by position
        width = self.end - self.start
        other_width = other.end - other.start
        return width < other_width

    def __gt__(self, other):
        # ordering is by feature width, not by position
        width = self.end - self.start
        other_width = other.end - other.start
        return width > other_width

    def identical_region(self, other):
        """
        Check if two features cover the identical region
        :param other: feature to compare with
        :return: True if start, end and sequence name (header) are all equal
        """
        return (self.start == other.start and self.end == other.end
                and self.header == other.header)
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
108
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
109
12
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
def get_arguments(argv=None):
    """
    Get arguments from command line
    :param argv: optional list of argument strings to parse; when None
        (the default) argparse reads the process command line
    :return:
    args - argparse.Namespace with gff3, reference_sequence, output, cpu,
    max_missing_domains, min_relative_length and max_chunk_size
    """
    parser = argparse.ArgumentParser(
        description="""detect_putative_ltr_wrapper.py is a wrapper for
    detect_putative_ltr.R""", formatter_class=argparse.RawTextHelpFormatter
    )
    parser.add_argument(
        '-g', '--gff3', default=None, required=True, help="gff3 file", type=str,
        action='store'
    )
    parser.add_argument(
        '-s', '--reference_sequence', default=None, required=True,
        help="reference sequence as fasta file", type=str, action='store'
    )
    parser.add_argument(
        '-o', '--output', default=None, required=True, help="output file path and prefix",
        type=str, action='store'
    )
    parser.add_argument(
        '-c', '--cpu', default=1, required=False, help="number of CPUs", type=int,
        action='store'
    )
    parser.add_argument(
        '-M', '--max_missing_domains', default=0, required=False, type=int
    )
    parser.add_argument(
        '-L', '--min_relative_length', default=0.6, required=False, type=float,
        help="Minimum relative length of protein domain to be "
             "considered for retrotransposon detection"
    )
    parser.add_argument(
        '-S', '--max_chunk_size', default=100000000, required=False, type=int,
        # sentences are joined by implicit string concatenation, so each
        # piece must carry its own separator
        help='If size of reference sequence is greater than this value, reference is '
             'analyzed in chunks of this size. default is %(default)s. '
             'Setting this value too small will slow down the analysis'
    )
    args = parser.parse_args(argv)
    return args
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
152
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
153
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
def read_fasta_sequence_size(fasta_file):
    """
    Read the length of every sequence in a fasta file.
    :param fasta_file: path to the fasta file
    :return:
    dictionary mapping sequence name (header text up to the first space,
    without the leading '>') to the number of sequence characters
    """
    sizes = {}
    with open(fasta_file, 'r') as handle:
        for record_line in handle:
            if record_line[0] != '>':
                # sequence line - add its length to the current record
                sizes[name] += len(record_line.strip())
                continue
            # header line - keep only the part of the name before the space
            name = record_line.strip().split(' ')[0][1:]
            sizes[name] = 0
    return sizes
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
165
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
166
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
def make_temp_files(number_of_files):
    """
    Generate unique temporary file paths.

    Each path is obtained by creating a named temporary file, which is then
    closed and removed right away, so the returned paths do NOT exist when
    this function returns; whoever uses a path is expected to create the file.
    :param number_of_files: how many paths to generate
    :return:
    filepaths - list of path strings
    """
    temp_files = []
    for _ in range(number_of_files):
        # close the handle explicitly before removing the file instead of
        # relying on garbage collection to release it
        with tempfile.NamedTemporaryFile(delete=False) as handle:
            path = handle.name
        os.remove(path)
        temp_files.append(path)
    return temp_files
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
179
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
180
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
def sum_up_stats_files(files):
    """
    Sum up statistics files

    Every file is a tab-separated table: a header line whose first column is
    'Classification', followed by rows of a classification name and integer
    counts. Counts of rows with the same classification are added
    column-wise across all files.
    :param files: iterable of paths to statistics files
    :return:
    list of tab-joined lines - the header (if any file contained one)
    followed by the summed rows sorted as strings; empty list when there
    is nothing to report
    """
    new_statistics = {}
    header = None  # stays None when no file provides a header line
    for path in files:
        with open(path, 'r') as fh:
            for line in fh:
                stripped = line.strip()
                if not stripped:
                    # blank lines carry no data; skipping them avoids an
                    # empty classification row in the output
                    continue
                items = stripped.split('\t')
                if items[0] == 'Classification':
                    header = items
                    continue
                counts = [int(item) for item in items[1:]]
                if items[0] in new_statistics:
                    new_statistics[items[0]] = [
                        sum(x) for x in zip(new_statistics[items[0]], counts)
                    ]
                else:
                    new_statistics[items[0]] = counts
    # convert to string, first line is header
    statistics_str = [
        classification + '\t' + '\t'.join([str(x) for x in counts])
        for classification, counts in new_statistics.items()
    ]
    header_lines = [] if header is None else ['\t'.join(header)]
    return header_lines + sorted(statistics_str)
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
207
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
208
13
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
def read_single_fasta_to_dictionary(fh):
    """
    Load fasta records from an iterable of lines into a dictionary.
    :param fh: open file handle (or any iterable yielding fasta lines)
    :return:
    fasta_dict - sequence name (header text up to the first space, without
    the leading '>') mapped to the sequence joined into one string
    """
    pieces = {}
    for row in fh:
        if row[0] != '>':
            # sequence line - collect it under the current record name
            pieces[name] += [row.strip()]
            continue
        # header line - keep only the part of the name before the space
        name = row.strip().split(' ')[0][1:]
        pieces[name] = []
    fasta_dict = {}
    for name, parts in pieces.items():
        fasta_dict[name] = ''.join(parts)
    return fasta_dict
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
225
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
226
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
def split_fasta_to_chunks(fasta_file, chunk_size=100000000, overlap=100000):
    """
    Split sequences of a fasta file into pieces and write them to a new temporary file.

    Sequences longer than twice ``chunk_size`` are cut into
    ``int(size / chunk_size)`` pieces of roughly equal length; every piece except
    the last one is extended by ``overlap`` bases so that neighbouring pieces
    overlap. All other sequences are written whole as a single piece. Every
    piece is named ``<original header>_<piece index>``.

    :param fasta_file: path to the input fasta file
    :param chunk_size: target length of one piece
    :param overlap: number of bases by which a piece extends into the next one
    :return:
        fasta_file_split: path to the temporary fasta file with the pieces
            (the file is not deleted automatically)
        matching_table: list of [header, piece index, start, end, new_header],
            where start/end are the 0-based slice bounds of the piece within
            the original sequence
    """
    min_chunk_size = chunk_size * 2
    fasta_dict = read_fasta_sequence_size(fasta_file)
    # calculates ranges for splitting of fasta files and store them in list
    matching_table = []
    for header, size in fasta_dict.items():
        if size > min_chunk_size:
            number_of_chunks = int(size / chunk_size)
            adjusted_chunk_size = int(size / number_of_chunks)
            for i in range(number_of_chunks):
                start = i * adjusted_chunk_size
                # the last piece always runs to the end of the sequence
                if i + 1 < number_of_chunks:
                    end = (i + 1) * adjusted_chunk_size + overlap
                else:
                    end = size
                new_header = header + '_' + str(i)
                matching_table.append([header, i, start, end, new_header])
        else:
            new_header = header + '_0'
            matching_table.append([header, 0, 0, size, new_header])

    # group table rows by original header once, instead of scanning the whole
    # table again for every sequence
    pieces_by_header = {}
    for row in matching_table:
        pieces_by_header.setdefault(row[0], []).append(row)

    # read sequences from fasta file and split them to chunks according to
    # matching table; both files are closed by the with statements
    with open(fasta_file, 'r') as fh_in:
        fasta_dict = read_single_fasta_to_dictionary(fh_in)
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as fh_out:
        fasta_file_split = fh_out.name
        for header, sequence in fasta_dict.items():
            for _, _, start, end, new_header in pieces_by_header.get(header, []):
                fh_out.write('>' + new_header + '\n')
                fh_out.write(sequence[start:end] + '\n')
    return fasta_file_split, matching_table
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
269
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
270
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
def get_new_header_and_coordinates(header, start, end, matching_table):
    """
    Translate a region on an original sequence to coordinates on its pieces.

    Every row of the matching table that belongs to ``header`` and whose
    range contains ``start`` (row start <= start < row end) yields one result,
    so a region starting inside an overlap is reported once per piece.

    :param header: name of the original sequence
    :param start: start of the region on the original sequence
    :param end: end of the region on the original sequence
    :param matching_table: list of [header, piece index, start, end, new_header]
    :return:
        list of [new_header, new_start, new_end, piece_length], one item per
        matching piece; empty list if no piece matches
    """
    return [
        [row[4], start - row[2], end - row[2], row[3] - row[2]]
        for row in matching_table
        if row[0] == header and row[2] <= start < row[3]
    ]
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
293
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
294
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
def get_original_header_and_coordinates(new_header, new_start, new_end, matching_table):
    """
    Translate a region on a piece back to coordinates on the original sequence.

    The first row of the matching table whose piece name equals ``new_header``
    supplies the original sequence name and the offset added to both coordinates.

    :param new_header: name of the piece
    :param new_start: start of the region on the piece
    :param new_end: end of the region on the piece
    :param matching_table: list of [header, piece index, start, end, new_header]
    :return:
        original_header
        original_start
        original_end
    :raises IndexError: if no row of the matching table has this piece name
    """
    hits = [row for row in matching_table if row[4] == new_header]
    first_hit = hits[0]
    offset = first_hit[2]
    return first_hit[0], new_start + offset, new_end + offset
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
313
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
314
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
315 # recalculate gff3 coordinates, use gff3_feature class
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
def recalculate_gff3_coordinates(gff3_file, matching_table):
    """
    Recalculate gff3 coordinates to the coordinates of the split sequences.

    Lines starting with '#' are copied unchanged. Every other line is parsed
    with Gff3Feature and translated with get_new_header_and_coordinates. A
    feature is written once for every piece it fits into completely
    (new start >= 1 and new end <= piece length); features that fit into no
    piece are dropped.

    :param gff3_file: path to the input gff3 file
    :param matching_table: list of [header, piece index, start, end, new_header]
    :return:
        gff3_file_recalculated: path to a temporary gff3 file with the
        recalculated features (the file is not deleted automatically)
    """
    # the input is opened first so a missing input does not leave an orphan
    # temporary file; the temporary handle is closed by the with statement
    with open(gff3_file, 'r') as fh_in, \
            tempfile.NamedTemporaryFile(mode='w', delete=False) as fh_out:
        gff3_file_recalculated = fh_out.name
        for line in fh_in:
            if line[0] == '#':
                fh_out.write(line)
                continue
            feature = Gff3Feature(line)
            new_coords = get_new_header_and_coordinates(
                feature.header, feature.start, feature.end, matching_table
            )
            # new_coords was computed from the original values, so reusing
            # and overwriting the same feature object for each piece is safe
            for new_header, new_start, new_end, sequence_length in new_coords:
                if new_start >= 1 and new_end <= sequence_length:
                    feature.header = new_header
                    feature.start = new_start
                    feature.end = new_end
                    fh_out.write(str(feature))
    return gff3_file_recalculated
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
343
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
344
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
345 # recalculate gff3 back to original coordinates, use gff3_feature class
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
def recalculate_gff3_back_to_original_coordinates(gff3_file, matching_table):
    """
    Recalculate gff3 coordinates from split sequences back to the original sequences.

    Lines starting with '#' are copied unchanged. Every other line is parsed
    with Gff3Feature, its header and coordinates are translated with
    get_original_header_and_coordinates and the feature is written out.

    :param gff3_file: path to a gff3 file that uses piece names and piece coordinates
    :param matching_table: list of [header, piece index, start, end, new_header]
    :return:
        gff3_file_recalculated: path to a temporary gff3 file with the
        recalculated features (the file is not deleted automatically)
    """
    # the input is opened first so a missing input does not leave an orphan
    # temporary file; the temporary handle is closed by the with statement
    with open(gff3_file, 'r') as fh_in, \
            tempfile.NamedTemporaryFile(mode='w', delete=False) as fh_out:
        gff3_file_recalculated = fh_out.name
        for line in fh_in:
            if line[0] == '#':
                fh_out.write(line)
                continue
            feature = Gff3Feature(line)
            ori_header, ori_start, ori_end = get_original_header_and_coordinates(
                feature.header, feature.start, feature.end, matching_table
            )
            feature.header = ori_header
            feature.start = ori_start
            feature.end = ori_end
            fh_out.write(str(feature))
    return gff3_file_recalculated
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
370
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
371
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
def get_feature_attributes(line):
    """
    Get attributes as dictionary from gff3 list

    The attribute string is split on ';' and every non-empty item is split
    into key and value on its first '='.

    :param line: sequence of gff3 columns; item with index 8 is the attribute string
    :return:
        attributes_dict: dictionary mapping attribute key to its value
    :raises ValueError: if a non-empty attribute item contains no '='
    """
    attributes_dict = {}
    for item in line[8].split(';'):
        item = item.strip()
        if item:
            # split on the first '=' only, so a value that itself contains
            # '=' is kept intact instead of failing to unpack
            key, value = item.split('=', 1)
            attributes_dict[key] = value
    return attributes_dict
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
385
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
386
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
def get_unique_features(gff3_file):
    """
    Return list of IDs of features selected by a pairwise scan for duplicates
    and overlaps.

    Only features of type 'transposable_element' are considered. Features are
    sorted by (start, header) and neighbouring features are compared in pairs;
    from a pair that covers an identical region or overlaps, only one ID is kept.

    :param gff3_file: path to gff3 file
    :return:
    good_id: list of 'ID' attribute values of the features that were kept
    """
    good_id = []
    feature_list = []
    with open(gff3_file, 'r') as fh:
        for line in fh:
            # skip comment / directive lines
            if line[0] == '#':
                continue
            feature = Gff3Feature(line)
            if feature.type != 'transposable_element':
                continue
            feature_list.append(feature)
    # sort by start position and header
    # NOTE(review): start is the primary key, so features of one sequence are
    # not necessarily adjacent when several sequences are present, while the
    # scan below only compares direct neighbours - confirm this is intended
    feature_list.sort(key=lambda x: (x.start, x.header))
    i = 0
    while i < len(feature_list) - 1:
        # ch is True when both neighbours are on the same sequence
        ch = feature_list[i].header == feature_list[i + 1].header
        if not ch:
            # different sequences: keep the current feature, move on by one
            good_id.append(feature_list[i].attributes_dict['ID'])
            i += 1
            continue
        if feature_list[i].identical_region(feature_list[i + 1]):
            # identical position: keep the first of the pair, skip both
            good_id.append(feature_list[i].attributes_dict['ID'])
            i += 2
            continue
        if feature_list[i].overlap(feature_list[i + 1]):
            # overlap: keep the feature that compares greater and skip both;
            # the meaning of '>' is defined by Gff3Feature (not visible here)
            if feature_list[i] > feature_list[i + 1]:
                good_id.append(feature_list[i].attributes_dict['ID'])
                i += 2
                continue
            else:
                good_id.append(feature_list[i + 1].attributes_dict['ID'])
                i += 2
                continue
        else:
            # same sequence, no overlap: keep the current feature
            good_id.append(feature_list[i].attributes_dict['ID'])
            i += 1
    # the last feature is still pending when it was not consumed as the
    # second member of a pair
    if i == len(feature_list) - 1:
        good_id.append(feature_list[i].attributes_dict['ID'])
    return good_id
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
434
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
435
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
def filter_gff3_file(gff3_file, good_id, output_file):
    """
    Filter gff3 file by good_id

    Comment lines (starting with '#') are copied unchanged. A feature line is
    kept when its ID attribute, or its Parent attribute, is one of the
    accepted IDs. Blank lines are skipped without being parsed.

    :param gff3_file: path of the GFF3 file to read
    :param good_id: iterable of feature IDs to keep
    :param output_file: path of the filtered GFF3 file to write
    :return: None, the filtered records are written to output_file
    """
    # Build the lookup once: membership in a set is constant time, while a
    # list would be rescanned for every feature line of the file.
    accepted = set(good_id)
    with open(gff3_file, 'r') as fh, open(output_file, 'w') as fout:
        for line in fh:
            if line.startswith('#'):
                fout.write(line)
                continue
            if not line.strip():
                # nothing to parse on an empty line
                continue
            attributes = Gff3Feature(line).attributes_dict
            if 'ID' in attributes and attributes['ID'] in accepted:
                fout.write(line)
            elif 'Parent' in attributes and attributes['Parent'] in accepted:
                # sub-features are kept together with their accepted parent
                fout.write(line)
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
460
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
461
12
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
def _run_detection(tool_path, fasta, gff3, output, args):
    """Run detect_putative_ltr.R on one FASTA/GFF3 pair, writing to output."""
    subprocess.check_call(
        [f'{tool_path}/detect_putative_ltr.R', '-s', fasta, '-g', gff3,
         '-o', output, '-c', str(args.cpu), '-M',
         str(args.max_missing_domains), '-L', str(args.min_relative_length)]
    )


def main():
    """
    Main function

    Splits the reference sequence and recalculates the GFF3 coordinates, then
    either runs detect_putative_ltr.R once on the original input, or
    distributes the sequences over several temporary FASTA/GFF3 pairs, runs
    the R script on every pair and merges the partial results into the
    args.output + suffix files.

    :return: None
    :raises subprocess.CalledProcessError: when detect_putative_ltr.R exits
        with a non-zero status
    """
    # local import: only this function needs it
    from contextlib import ExitStack

    args = get_arguments()
    # locate directory of current script
    tool_path = os.path.dirname(os.path.realpath(__file__))
    # split fasta file to chunks
    fasta_file_split, matching_table = split_fasta_to_chunks(
        args.reference_sequence, chunk_size=args.max_chunk_size
    )
    # recalculate gff3 coordinates
    gff3_file_recalculated = recalculate_gff3_coordinates(args.gff3, matching_table)

    fasta_seq_size = read_fasta_sequence_size(fasta_file_split)
    total_size = sum(fasta_seq_size.values())
    number_of_sequences = len(fasta_seq_size)

    if not (total_size > args.max_chunk_size and number_of_sequences > 1):
        print('running analysis on whole input')
        # No need to split sequences into chunks. The intermediate files made
        # above are not used on this path, so remove them here just like the
        # chunked path does. The realpath comparison is a safety net in case
        # a helper ever hands back the input path itself.
        for leftover, original in ((fasta_file_split, args.reference_sequence),
                                   (gff3_file_recalculated, args.gff3)):
            if os.path.realpath(leftover) != os.path.realpath(original):
                os.remove(leftover)
        _run_detection(
            tool_path, args.reference_sequence, args.gff3, args.output, args
        )
        return

    print('running analysis on chunks of ~ {} Mb'.format(args.max_chunk_size))
    # sequence ids ordered from the longest to the shortest sequence
    seq_id_size_sorted = [
        seq_id for seq_id, _ in sorted(
            fasta_seq_size.items(), key=lambda item: int(item[1]), reverse=True
        )
    ]
    # never create more files than there are sequences to put in them
    number_of_temp_files = min(
        int(total_size / args.max_chunk_size) + 1, number_of_sequences
    )

    # write sequences to temporary files; ids are dealt out round-robin so
    # every sequence id maps to exactly one file
    temp_files_fasta = make_temp_files(number_of_temp_files)
    with ExitStack() as stack:
        handles = [stack.enter_context(open(path, 'w'))
                   for path in temp_files_fasta]
        handle_by_seq_id = dict(zip(seq_id_size_sorted, cycle(handles)))
        with open(fasta_file_split, 'r') as f:
            for line in f:
                if line[0] == '>':
                    header = line.strip().split(' ')[0][1:]
                # header and sequence lines go to the file of the current id
                handle_by_seq_id[header].write(line)
    os.remove(fasta_file_split)

    # split gff3 file to temporary files -
    # each temporary file will contain gff lines matching fasta
    temp_files_gff = make_temp_files(number_of_temp_files)
    with ExitStack() as stack:
        handles = [stack.enter_context(open(path, 'w'))
                   for path in temp_files_gff]
        handle_by_seq_id = dict(zip(seq_id_size_sorted, cycle(handles)))
        with open(gff3_file_recalculated, 'r') as f:
            for line in f:
                if line[0] == '#':
                    continue
                header = line.strip().split('\t')[0]
                handle_by_seq_id[header].write(line)
    os.remove(gff3_file_recalculated)

    # run retrotransposon detection on each temporary file
    output_files = make_temp_files(number_of_temp_files)
    for i in range(number_of_temp_files):
        print('Running retrotransposon detection on file ' + str(i))
        _run_detection(
            tool_path, temp_files_fasta[i], temp_files_gff[i],
            output_files[i], args
        )

    # remove all temporary input files
    for temp_file in temp_files_fasta + temp_files_gff:
        os.remove(temp_file)

    # concatenate output files
    output_file_suffixes = ['_D.fasta', '_DL.fasta', '_DLT.fasta', '_DLTP.fasta',
                            '_DLP.fasta', '.gff3', '_statistics.csv']

    for suffix in output_file_suffixes:
        if suffix == '_statistics.csv':
            # sum up line with same word in first column
            stat_files = [output_file + suffix for output_file in output_files]
            new_statistics = sum_up_stats_files(stat_files)
            with open(args.output + suffix, 'w') as f:
                f.write("\n".join(new_statistics))
            # remove parsed temporary statistics files
            for stat_file in stat_files:
                os.remove(stat_file)
        elif suffix == '.gff3':
            # collect all partial gff3 files, translated back to the original
            # coordinates, in one temporary file
            with tempfile.NamedTemporaryFile(mode='w', delete=False) as merged:
                tmp_gff_unfiltered = merged.name
                for i in range(number_of_temp_files):
                    tmp_gff = recalculate_gff3_back_to_original_coordinates(
                        output_files[i] + suffix, matching_table
                    )
                    # remove temporary gff3 file
                    os.remove(output_files[i] + suffix)
                    with open(tmp_gff, 'r') as f_tmp:
                        for line in f_tmp:
                            merged.write(line)
                    os.remove(tmp_gff)
            # filter overlapping features
            good_id = get_unique_features(tmp_gff_unfiltered)
            filter_gff3_file(
                tmp_gff_unfiltered, good_id, args.output + suffix
            )
            # remove temporary gff3 file
            os.remove(tmp_gff_unfiltered)
        else:
            with open(args.output + suffix, 'w') as f:
                for i in range(number_of_temp_files):
                    # some file may not exist, so we need to check
                    try:
                        with open(output_files[i] + suffix, 'r') as g:
                            for line in g:
                                f.write(line)
                        # remove parsed temporary output files
                        os.remove(output_files[i] + suffix)
                    except FileNotFoundError:
                        pass
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
593
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
594
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
if __name__ == '__main__':
    # the wrapper relies on f-strings, so refuse to start on anything older
    # than Python 3.6
    minimum_version = (3, 6)
    if sys.version_info[:2] < minimum_version:
        print('Python version must be 3.6 or greater')
        sys.exit(1)
    main()