Mercurial > repos > petr-novak > dante_ltr
annotate detect_putative_ltr_wrapper.py @ 13:559940c04c44 draft
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
author | petr-novak |
---|---|
date | Thu, 11 Aug 2022 07:29:06 +0000 |
parents | ff01d4263391 |
children |
rev | line source |
---|---|
12
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
1 #!/usr/bin/env python |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
2 """This wrapper is intended to be used on large genomes and large DANTE input to |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
3 minimize memory usage. It splits the input files into pieces and analyzes them one by one with |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
4 detect_putative_ltr.R |
13
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
5 If the input does not exceed the specified max_chunk_size, it will run detect_putative_ltr.R |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
6 directly |
12
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
7 """ |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
8 |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
9 import argparse |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
10 import os |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
11 import sys |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
12 import tempfile |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
13 from itertools import cycle |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
14 import subprocess |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
15 |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
16 |
13
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
class Gff3Feature:
    """
    Class for gff3 feature

    Parses one tab-separated GFF3 record into its nine columns and keeps both
    the original text (``line``) and a normalized re-serialization
    (``line_recalculated()``).

    Equality and hashing are based on the normalized line, while ``<`` and
    ``>`` compare features by their width (``end - start``) only.
    """

    def __init__(self, line):
        """
        :param line: single GFF3 record, nine tab-separated columns
        :raises IndexError: if the record has fewer than nine columns
        :raises ValueError: if start/end are not integers or an attribute
            item contains no '='
        """
        self.line = line
        self.items = line.strip().split('\t')
        self.header = self.items[0]
        self.source = self.items[1]
        self.type = self.items[2]
        self.start = int(self.items[3])
        self.end = int(self.items[4])
        self.score = self.items[5]
        self.strand = self.items[6]
        self.frame = self.items[7]
        self.attributes = self.items[8]
        self.attributes_dict = {}
        for item in self.attributes.split(';'):
            if item != '':
                # split on the first '=' only, so values that themselves
                # contain '=' do not break the unpacking
                key, value = item.split('=', 1)
                self.attributes_dict[key] = value

        # normalized attribute column: empty items dropped, a repeated key
        # keeps its last value
        self.attributes_str = ';'.join(
            ['{}={}'.format(key, value) for key, value in self.attributes_dict.items()]
        )

    def __str__(self):
        return self.line_recalculated()

    def __repr__(self):
        return self.line_recalculated()

    def __eq__(self, other):
        if not isinstance(other, Gff3Feature):
            # let Python fall back to its default comparison instead of
            # failing with AttributeError
            return NotImplemented
        return self.line_recalculated() == other.line_recalculated()

    def __hash__(self):
        return hash(self.line_recalculated())

    def get_line(self):
        """returns original line"""
        return self.line

    def overlap(self, other):
        """
        Check if two features overlap

        Only the start/end coordinates are compared (closed intervals); the
        sequence name is not taken into account.
        :param other: feature to compare with
        :return: True if the coordinate ranges intersect, False otherwise
        """
        return self.start <= other.end and self.end >= other.start

    def line_recalculated(self):
        """
        :return:
        string with recalculated line
        """
        return '\t'.join(
            [self.header, self.source, self.type, str(self.start), str(self.end),
             self.score, self.strand, self.frame, self.attributes_str]
        ) + '\n'

    def __lt__(self, other):
        # ordering is by feature width, not by position
        width = self.end - self.start
        other_width = other.end - other.start
        return width < other_width

    def __gt__(self, other):
        # ordering is by feature width, not by position
        width = self.end - self.start
        other_width = other.end - other.start
        return width > other_width

    def identical_region(self, other):
        """
        Check if two features are identical

        Features are considered identical when start, end and the first
        column (header) are all equal.
        :param other: feature to compare with
        :return: True if both features cover the same region
        """
        return (
            self.start == other.start
            and self.end == other.end
            and self.header == other.header
        )
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
108 |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
109 |
12
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
def get_arguments():
    """
    Get arguments from command line

    Builds the argument parser for the wrapper and parses ``sys.argv``.
    :return:
    args - argparse.Namespace with attributes gff3, reference_sequence,
    output, cpu, max_missing_domains, min_relative_length and max_chunk_size
    """
    parser = argparse.ArgumentParser(
        description="""detect_putative_ltr_wrapper.py is a wrapper for
    detect_putative_ltr.R""", formatter_class=argparse.RawTextHelpFormatter
    )
    parser.add_argument(
        '-g', '--gff3', default=None, required=True, help="gff3 file", type=str,
        action='store'
    )
    parser.add_argument(
        '-s', '--reference_sequence', default=None, required=True,
        help="reference sequence as fasta file", type=str, action='store'
    )
    parser.add_argument(
        '-o', '--output', default=None, required=True, help="output file path and prefix",
        type=str, action='store'
    )
    parser.add_argument(
        '-c', '--cpu', default=1, required=False, help="number of CPUs", type=int,
        action='store'
    )
    parser.add_argument(
        '-M', '--max_missing_domains', default=0, required=False, type=int
    )
    parser.add_argument(
        '-L', '--min_relative_length', default=0.6, required=False, type=float,
        help="Minimum relative length of protein domain to be "
             "considered for retrotransposon detection"
    )
    # adjacent literals are concatenated verbatim, so each fragment must carry
    # its own separating space / punctuation
    parser.add_argument(
        '-S', '--max_chunk_size', default=100000000, required=False, type=int,
        help='If size of reference sequence is greater than this value, reference is '
             'analyzed in chunks of this size. default is %(default)s. '
             'Setting this value too small will slow down the analysis'
    )
    args = parser.parse_args()
    return args
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
152 |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
153 |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
def read_fasta_sequence_size(fasta_file):
    """
    Read size of sequence into dictionary

    :param fasta_file: path to a FASTA file
    :return:
    dictionary mapping sequence name (header text up to the first space,
    without the leading '>') to the sequence length
    :raises ValueError: if sequence data appear before the first header line
    """
    fasta_dict = {}
    header = None
    with open(fasta_file, 'r') as f:
        for line in f:
            if line.startswith('>'):
                header = line.strip().split(' ')[0][1:]  # remove part of name after space
                fasta_dict[header] = 0
                continue
            residues = line.strip()
            if not residues:
                # blank lines carry no sequence
                continue
            if header is None:
                raise ValueError(
                    'Sequence data found before the first FASTA header in '
                    '{}'.format(fasta_file)
                )
            fasta_dict[header] += len(residues)
    return fasta_dict
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
165 |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
166 |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
def make_temp_files(number_of_files):
    """
    Reserve unique temporary file paths.

    Each path is obtained by creating a named temporary file, which is then
    closed and removed again, so the returned paths do NOT exist on disk when
    this function returns.
    :param number_of_files: how many paths to generate
    :return:
    filepaths - list of temporary file paths
    """
    temp_files = []
    for _ in range(number_of_files):
        # close the handle explicitly instead of relying on garbage collection
        with tempfile.NamedTemporaryFile(delete=False) as handle:
            path = handle.name
        os.remove(path)
        temp_files.append(path)
    return temp_files
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
179 |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
180 |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
def sum_up_stats_files(files):
    """
    Sum up statistics files

    Every file is read as a tab-separated table. A line whose first field is
    'Classification' is taken as the header; every other line is a
    classification name followed by integer counts. Counts of the same
    classification are summed position by position across all files.
    :param files: iterable of paths to statistics files
    :return:
    list of tab-joined lines: the header (if any file contained one) followed
    by the summed rows sorted as strings
    :raises ValueError: if a count field is not an integer
    """
    new_statistics = {}
    header = None
    for path in files:
        with open(path, 'r') as fh:
            for line in fh:
                items = line.strip().split('\t')
                if items == ['']:
                    # blank line - would otherwise create an empty classification
                    continue
                if items[0] == 'Classification':
                    header = items
                    continue
                counts = [int(item) for item in items[1:]]
                if items[0] in new_statistics:
                    new_statistics[items[0]] = [
                        sum(x) for x in zip(new_statistics[items[0]], counts)
                    ]
                else:
                    new_statistics[items[0]] = counts
    # convert to string, first line is header
    statistics_str = [
        classification + '\t' + '\t'.join([str(x) for x in counts])
        for classification, counts in new_statistics.items()
    ]
    sorted_rows = sorted(statistics_str)
    if header is None:
        # no header line seen (e.g. no input files) - return the rows only
        return sorted_rows
    return ['\t'.join(header)] + sorted_rows
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
207 |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
208 |
13
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
def read_single_fasta_to_dictionary(fh):
    """
    Read fasta file into dictionary

    :param fh: open file handle (or any iterable of lines) in FASTA format
    :return:
    fasta_dict - sequence name (header text up to the first space, without
    the leading '>') mapped to the sequence as a single string
    :raises ValueError: if sequence data appear before the first header line
    """
    chunks = {}
    header = None
    for line in fh:
        if line.startswith('>'):
            header = line.strip().split(' ')[0][1:]  # remove part of name after space
            chunks[header] = []
            continue
        residues = line.strip()
        if not residues:
            # blank lines carry no sequence
            continue
        if header is None:
            raise ValueError('Sequence data found before the first FASTA header')
        chunks[header].append(residues)
    # join once per record instead of growing strings line by line
    fasta_dict = {k: ''.join(v) for k, v in chunks.items()}
    return fasta_dict
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
225 |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
226 |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
def split_fasta_to_chunks(fasta_file, chunk_size=100000000, overlap=100000):
    """
    Split fasta file to chunks. Sequences longer than twice the chunk size are
    split to overlapping pieces; every other sequence is written as a single
    piece with the suffix '_0'.

    Each row of the returned matching table is
    [original_header, chunk_index, start, end, new_header], where start/end
    are the Python slice bounds used to cut the piece from the sequence.

    :param fasta_file: path to the input fasta file
    :param chunk_size: target length of one piece
    :param overlap: number of extra bases appended to every piece except the
        last one of a sequence
    :return:
    fasta_file_split: path to a temporary fasta file with the pieces
        (not deleted automatically)
    matching_table: list of rows described above
    """
    min_chunk_size = chunk_size * 2
    sequence_sizes = read_fasta_sequence_size(fasta_file)
    # calculates ranges for splitting of fasta files and store them in list
    matching_table = []
    for header, size in sequence_sizes.items():
        if size > min_chunk_size:
            number_of_chunks = int(size // chunk_size)
            # spread the sequence evenly so the last piece is not a short rest
            adjusted_chunk_size = int(size // number_of_chunks)
            last_index = number_of_chunks - 1
            for i in range(number_of_chunks):
                start = i * adjusted_chunk_size
                if i < last_index:
                    end = (i + 1) * adjusted_chunk_size + overlap
                else:
                    # the last piece always runs to the end of the sequence
                    end = size
                matching_table.append([header, i, start, end, header + '_' + str(i)])
        else:
            matching_table.append([header, 0, 0, size, header + '_0'])

    # group rows per sequence once instead of rescanning the table for each header
    chunks_by_header = {}
    for row in matching_table:
        chunks_by_header.setdefault(row[0], []).append(row)

    # read sequences from fasta file and split them to chunks according to
    # matching table; both handles are closed by the with statements
    with open(fasta_file, 'r') as fh_in:
        fasta_dict = read_single_fasta_to_dictionary(fh_in)
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as fh_out:
        fasta_file_split = fh_out.name
        for header, sequence in fasta_dict.items():
            for row in chunks_by_header.get(header, []):
                start, end, new_header = row[2], row[3], row[4]
                fh_out.write('>' + new_header + '\n')
                fh_out.write(sequence[start:end] + '\n')
    return fasta_file_split, matching_table
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
269 |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
270 |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
def get_new_header_and_coordinates(header, start, end, matching_table):
    """
    Translate coordinates on an original sequence to coordinates on its chunks.

    A chunk is selected when chunk_start <= start < chunk_end. Because
    neighbouring chunks overlap, more than one chunk can match. The end
    coordinate is only shifted, it is not checked against the chunk length;
    the returned chunk length lets the caller do that check.

    :param header: name of the original sequence
    :param start: start coordinate on the original sequence
    :param end: end coordinate on the original sequence
    :param matching_table: rows of
        [original_header, chunk_index, chunk_start, chunk_end, new_header]
    :return:
    new_coords: list of [new_header, new_start, new_end, chunk_length],
        one entry per matching chunk (empty list if none matches)
    """
    new_coords = []
    for chunk in matching_table:
        if chunk[0] != header:
            continue
        chunk_start, chunk_end = chunk[2], chunk[3]
        if chunk_start <= start < chunk_end:
            new_coords.append(
                [chunk[4], start - chunk_start, end - chunk_start, chunk_end - chunk_start]
            )
    return new_coords
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
293 |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
294 |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
def get_original_header_and_coordinates(new_header, new_start, new_end, matching_table):
    """
    Translate coordinates on a chunk back to the original sequence.

    The first row of the matching table whose new_header equals the given
    name is used; its chunk start is added to both coordinates.

    :param new_header: name of the chunk
    :param new_start: start coordinate on the chunk
    :param new_end: end coordinate on the chunk
    :param matching_table: rows of
        [original_header, chunk_index, chunk_start, chunk_end, new_header]
    :return:
    original_header
    original_start
    original_end
    :raises IndexError: if no row of the matching table has this new_header
    """
    for chunk in matching_table:
        if chunk[4] == new_header:
            offset = chunk[2]
            return chunk[0], new_start + offset, new_end + offset
    # same exception type as indexing an empty match list, but with a message
    raise IndexError('chunk header not found in matching table: ' + str(new_header))
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
313 |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
314 |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
315 # recalculate gff3 coordinates, use gff3_feature class |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
def recalculate_gff3_coordinates(gff3_file, matching_table):
    """
    Recalculate gff3 coordinates to chunk coordinates, use Gff3Feature class.

    Lines starting with '#' are copied unchanged. Every other line is parsed
    as a feature and written once for each chunk that contains it completely
    (new_start >= 1 and new_end <= chunk length); features that do not fit
    into any chunk are dropped.

    :param gff3_file: path to the input gff3 file
    :param matching_table: rows of
        [original_header, chunk_index, chunk_start, chunk_end, new_header]
    :return:
    gff3_file_recalculated: path to a temporary gff3 file
        (not deleted automatically)
    """
    # write through the temporary file handle itself so it is closed
    # deterministically instead of being left to garbage collection
    with open(gff3_file, 'r') as fh_in, \
            tempfile.NamedTemporaryFile(mode='w', delete=False) as fh_out:
        gff3_file_recalculated = fh_out.name
        for line in fh_in:
            if line[0] == '#':
                fh_out.write(line)
                continue
            feature = Gff3Feature(line)
            # computed from the original coordinates before the feature is
            # modified below, so every chunk sees the untouched values
            new_coords = get_new_header_and_coordinates(
                feature.header, feature.start, feature.end, matching_table
            )
            for new_header, new_start, new_end, sequence_length in new_coords:
                if new_start >= 1 and new_end <= sequence_length:
                    feature.header = new_header
                    feature.start = new_start
                    feature.end = new_end
                    fh_out.write(str(feature))
    return gff3_file_recalculated
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
343 |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
344 |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
345 # recalculate gff3 back to original coordinates, use gff3_feature class |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
def recalculate_gff3_back_to_original_coordinates(gff3_file, matching_table):
    """
    Recalculate gff3 back to original coordinates, use Gff3Feature class.

    Lines starting with '#' are copied unchanged. Every other line is parsed
    as a feature located on a chunk and rewritten with the original sequence
    name and coordinates.

    :param gff3_file: path to a gff3 file with chunk coordinates
    :param matching_table: rows of
        [original_header, chunk_index, chunk_start, chunk_end, new_header]
    :return:
    gff3_file_recalculated: path to a temporary gff3 file
        (not deleted automatically)
    """
    # write through the temporary file handle itself so it is closed
    # deterministically instead of being left to garbage collection
    with open(gff3_file, 'r') as fh_in, \
            tempfile.NamedTemporaryFile(mode='w', delete=False) as fh_out:
        gff3_file_recalculated = fh_out.name
        for line in fh_in:
            if line[0] == '#':
                fh_out.write(line)
                continue
            feature = Gff3Feature(line)
            ori_header, ori_start, ori_end = get_original_header_and_coordinates(
                feature.header, feature.start, feature.end, matching_table
            )
            feature.header = ori_header
            feature.start = ori_start
            feature.end = ori_end
            fh_out.write(str(feature))
    return gff3_file_recalculated
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
370 |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
371 |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
def get_feature_attributes(line):
    """
    Get attributes as dictionary from gff3 list.

    The attribute column (index 8) is split on ';' and every non-empty item
    on its first '=', so a value may itself contain '='. If a key is repeated,
    the last value is kept.

    :param line: gff3 record already split into its columns
    :return:
    attributes_dict: maps attribute name to its value
    :raises ValueError: if a non-empty item contains no '='
    """
    attributes_dict = {}
    for item in line[8].split(';'):
        item = item.strip()
        if not item:
            # empty items come from a trailing or doubled ';'
            continue
        key, value = item.split('=', 1)
        attributes_dict[key] = value
    return attributes_dict
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
385 |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
386 |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
def get_unique_features(gff3_file):
    """
    Return list of IDs of non-overlapping 'transposable_element' features.

    Features are sorted by sequence name and start position and then compared
    as neighbouring pairs on the same sequence:
    - identical region: the first one is kept, the second is skipped
    - overlap: the one that compares greater (Gff3Feature '>') is kept
    - otherwise the first one is kept and the second is compared with its
      own successor
    Sorting by sequence name first keeps all features of one sequence next to
    each other; sorting by start alone would let a feature from another
    sequence separate two overlapping features so they are never compared.

    :param gff3_file: path to the gff3 file
    :return:
    good_id: list of the 'ID' attributes of the features that were kept
    """
    good_id = []
    feature_list = []
    with open(gff3_file, 'r') as fh:
        for line in fh:
            if line[0] == '#':
                continue
            feature = Gff3Feature(line)
            if feature.type != 'transposable_element':
                continue
            feature_list.append(feature)
    # sort by header and start position
    feature_list.sort(key=lambda x: (x.header, x.start))
    last_index = len(feature_list) - 1
    i = 0
    while i < last_index:
        current = feature_list[i]
        following = feature_list[i + 1]
        if current.header != following.header:
            # different sequences can not overlap
            good_id.append(current.attributes_dict['ID'])
            i += 1
        elif current.identical_region(following):
            # identical position, keep one of the pair
            good_id.append(current.attributes_dict['ID'])
            i += 2
        elif current.overlap(following):
            # overlap, keep the one that compares greater
            # NOTE(review): ordering is defined by Gff3Feature, not visible here
            kept = current if current > following else following
            good_id.append(kept.attributes_dict['ID'])
            i += 2
        else:
            good_id.append(current.attributes_dict['ID'])
            i += 1
    if i == last_index:
        # the last feature was not consumed as the second member of a pair
        good_id.append(feature_list[i].attributes_dict['ID'])
    return good_id
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
434 |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
435 |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
def filter_gff3_file(gff3_file, good_id, output_file):
    """
    Write a copy of a gff3 file restricted to selected features.

    Lines starting with '#' are copied unchanged. Any other line is parsed
    with Gff3Feature and kept when its ``ID`` attribute, or its ``Parent``
    attribute, is one of ``good_id``. All remaining lines are dropped.

    :param gff3_file: path to the input gff3 file
    :param good_id: iterable of feature IDs to keep (must be hashable)
    :param output_file: path of the filtered gff3 file to write
    :return: None, the result is written to ``output_file``
    """
    # the membership test runs for every feature line, so build a set once
    # instead of scanning a list each time
    wanted = set(good_id)

    with open(gff3_file, 'r') as fh, open(output_file, 'w') as fout:
        for line in fh:
            if line[0] == '#':
                fout.write(line)
                continue
            if not line.strip():
                # a blank line carries no feature, nothing to parse or keep
                continue
            attributes = Gff3Feature(line).attributes_dict
            if any(key in attributes and attributes[key] in wanted
                   for key in ('ID', 'Parent')):
                fout.write(line)
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
460 |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
461 |
12
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
def _run_detect_putative_ltr(tool_path, fasta, gff3, output, args):
    """Run detect_putative_ltr.R on one fasta/gff3 pair; raises on failure."""
    subprocess.check_call(
        [f'{tool_path}/detect_putative_ltr.R', '-s', fasta, '-g', gff3,
         '-o', output, '-c', str(args.cpu), '-M',
         str(args.max_missing_domains), '-L', str(args.min_relative_length)]
    )


def _fasta_lines_by_seq_id(fasta_file):
    """Yield (seq_id, line) for every line of a fasta file."""
    with open(fasta_file, 'r') as f:
        for line in f:
            if line[0] == '>':
                # id is the first word of the header, without the leading '>'
                seq_id = line.strip().split(' ')[0][1:]
            # sequence lines belong to the most recently seen header
            yield seq_id, line


def _gff3_lines_by_seq_id(gff3_file):
    """Yield (seq_id, line) for every non-comment line of a gff3 file."""
    with open(gff3_file, 'r') as f:
        for line in f:
            if line[0] != '#':
                yield line.strip().split('\t')[0], line


def _write_chunks(chunk_files, seq_ids, routed_lines):
    """
    Write (seq_id, line) pairs into chunk files.

    Sequence ids are assigned to the chunk files round-robin in the order
    given, so the same ``seq_ids`` order always gives the same assignment.
    All chunk files are closed even when writing fails.
    """
    handles = []
    try:
        for path in chunk_files:
            handles.append(open(path, 'w'))
        handle_of = dict(zip(seq_ids, cycle(handles)))
        for seq_id, line in routed_lines:
            handle_of[seq_id].write(line)
    finally:
        for handle in handles:
            handle.close()


def main():
    """
    Main function

    Splits the reference sequence with split_fasta_to_chunks and converts the
    gff3 coordinates accordingly. When the summed sequence size exceeds
    ``args.max_chunk_size`` and there is more than one sequence, the input is
    distributed over several temporary fasta/gff3 pairs, detect_putative_ltr.R
    is run on each pair and the partial outputs are merged into files named
    ``args.output`` + suffix. Otherwise detect_putative_ltr.R is run once on
    the original input files.
    """
    args = get_arguments()
    # locate directory of current script
    tool_path = os.path.dirname(os.path.realpath(__file__))
    # split fasta file to chunks
    fasta_file_split, matching_table = split_fasta_to_chunks(
        args.reference_sequence, chunk_size=args.max_chunk_size
    )
    # recalculate gff3 coordinates
    gff3_file_recalculated = recalculate_gff3_coordinates(args.gff3, matching_table)

    fasta_seq_size = read_fasta_sequence_size(fasta_file_split)
    total_size = sum(fasta_seq_size.values())
    number_of_sequences = len(fasta_seq_size)

    if not (total_size > args.max_chunk_size and number_of_sequences > 1):
        print('running analysis on whole input')
        # The split fasta and the recalculated gff3 are not used on this path;
        # remove them as the chunked path does, so they are not left behind.
        # NOTE(review): assumes both helpers return freshly created files -
        # the guard below protects the user's inputs if they ever do not.
        for leftover in (fasta_file_split, gff3_file_recalculated):
            if leftover not in (args.reference_sequence, args.gff3):
                os.remove(leftover)
        # no need to split sequences into chunks
        _run_detect_putative_ltr(
            tool_path, args.reference_sequence, args.gff3, args.output, args
        )
        return

    # NOTE(review): the value is printed as given and is compared directly
    # with the summed sequence sizes above - confirm that 'Mb' is the unit.
    print('running analysis on chunks of ~ {} Mb'.format(args.max_chunk_size))
    # sequence ids ordered from the largest to the smallest sequence
    seq_id_size_sorted = [seq_id for seq_id, _ in sorted(
        fasta_seq_size.items(), key=lambda x: int(x[1]), reverse=True
    )]
    # never create more chunk files than there are sequences
    number_of_temp_files = min(
        int(total_size / args.max_chunk_size) + 1, number_of_sequences
    )

    # write sequences to temporary files
    temp_files_fasta = make_temp_files(number_of_temp_files)
    _write_chunks(
        temp_files_fasta, seq_id_size_sorted,
        _fasta_lines_by_seq_id(fasta_file_split)
    )
    os.remove(fasta_file_split)

    # split gff3 file to temporary files -
    # each temporary file will contain gff lines matching fasta
    temp_files_gff = make_temp_files(number_of_temp_files)
    _write_chunks(
        temp_files_gff, seq_id_size_sorted,
        _gff3_lines_by_seq_id(gff3_file_recalculated)
    )
    os.remove(gff3_file_recalculated)

    # run retrotransposon detection on each temporary file
    output_files = make_temp_files(number_of_temp_files)
    for i in range(number_of_temp_files):
        print('Running retrotransposon detection on file ' + str(i))
        _run_detect_putative_ltr(
            tool_path, temp_files_fasta[i], temp_files_gff[i], output_files[i],
            args
        )

    # remove all temporary input files
    for temp_file in temp_files_fasta + temp_files_gff:
        os.remove(temp_file)

    # concatenate output files
    output_file_suffixes = ['_D.fasta', '_DL.fasta', '_DLT.fasta', '_DLTP.fasta',
                            '_DLP.fasta', '.gff3', '_statistics.csv']

    for suffix in output_file_suffixes:
        if suffix == '_statistics.csv':
            # sum up line with same word in first column
            stat_files = [output_file + suffix for output_file in output_files]
            new_statistics = sum_up_stats_files(stat_files)
            with open(args.output + suffix, 'w') as f:
                f.write("\n".join(new_statistics))
            # remove parsed temporary statistics files
            for stat_file in stat_files:
                os.remove(stat_file)
        elif suffix == '.gff3':
            tmp_gff_unfiltered = tempfile.NamedTemporaryFile(delete=False).name
            with open(tmp_gff_unfiltered, 'w') as f:
                for output_file in output_files:
                    tmp_gff = recalculate_gff3_back_to_original_coordinates(
                        output_file + suffix, matching_table
                    )
                    # remove temporary gff3 file
                    os.remove(output_file + suffix)
                    with open(tmp_gff, 'r') as f_tmp:
                        f.writelines(f_tmp)
                    os.remove(tmp_gff)
            # filter overlapping features
            good_id = get_unique_features(tmp_gff_unfiltered)
            filter_gff3_file(
                tmp_gff_unfiltered, good_id, args.output + suffix
            )
            # remove temporary gff3 file
            os.remove(tmp_gff_unfiltered)
        else:
            with open(args.output + suffix, 'w') as f:
                for output_file in output_files:
                    # some file may not exist, so we need to check
                    try:
                        with open(output_file + suffix, 'r') as g:
                            f.writelines(g)
                        # remove parsed temporary output files
                        os.remove(output_file + suffix)
                    except FileNotFoundError:
                        # this chunk has no output of this kind
                        continue
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
593 |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
594 |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
if __name__ == '__main__':
    # check version of python must be 3.6 or greater
    # NOTE: this module uses f-strings, so an older interpreter already fails
    # with a SyntaxError while compiling the file and never reaches this check.
    if sys.version_info < (3, 6):
        # a string argument is written to stderr and the exit status is 1
        sys.exit('Python version must be 3.6 or greater')
    main()