comparison detect_putative_ltr_wrapper.py @ 12:ff01d4263391 draft

"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
author petr-novak
date Thu, 21 Jul 2022 08:23:15 +0000
parents
children 559940c04c44
comparison 11:54bd36973253 → 12:ff01d4263391
#!/usr/bin/env python
"""This wrapper is intended for large genomes and large DANTE input, to
minimize memory usage. It splits the input files into pieces and analyzes
them one by one with detect_putative_ltr.R.
If the input does not exceed the size limit, detect_putative_ltr.R is run
directly.
"""

import argparse
import os
import subprocess
import sys
import tempfile
from itertools import cycle


def get_arguments():
    """
    Get arguments from the command line
    :return:
        args
    """
    parser = argparse.ArgumentParser(
        description="""detect_putative_ltr_wrapper.py is a wrapper for
        detect_putative_ltr.R""", formatter_class=argparse.RawTextHelpFormatter
    )
    parser.add_argument(
        '-g', '--gff3', default=None, required=True, help="gff3 file", type=str,
        action='store'
    )
    parser.add_argument(
        '-s', '--reference_sequence', default=None, required=True,
        help="reference sequence as fasta file", type=str, action='store'
    )
    parser.add_argument(
        '-o', '--output', default=None, required=True,
        help="output file path and prefix", type=str, action='store'
    )
    parser.add_argument(
        '-c', '--cpu', default=1, required=False, help="number of CPUs", type=int,
        action='store'
    )
    parser.add_argument(
        '-M', '--max_missing_domains', default=0, required=False, type=int
    )
    parser.add_argument(
        '-L', '--min_relative_length', default=0.6, required=False, type=float,
        help="Minimum relative length of a protein domain to be "
             "considered for retrotransposon detection"
    )
    parser.add_argument(
        '-S', '--max_chunk_size', default=100000000, required=False, type=int,
        help='If the reference sequence is larger than this value, it is '
             'analyzed in chunks of approximately this size. Sequences '
             'longer than the chunk size are not split; default is %(default)s'
    )
    args = parser.parse_args()
    return args


def read_fasta_sequence_size(fasta_file):
    """Read the size of each sequence into a dictionary keyed by sequence ID"""
    fasta_dict = {}
    with open(fasta_file, 'r') as f:
        for line in f:
            if line[0] == '>':
                # sequence ID is the header up to the first space, without '>'
                header = line.strip().split(' ')[0][1:]
                fasta_dict[header] = 0
            else:
                fasta_dict[header] += len(line.strip())
    return fasta_dict
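
# A minimal illustration (hypothetical input, not from the repository):
# for a FASTA file containing
#   >chr1 assembly_v2
#   ACGTACGT
#   >chr2
#   ACGT
# read_fasta_sequence_size() returns {'chr1': 8, 'chr2': 4}; header text
# after the first space is ignored.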


def make_temp_files(number_of_files):
    """
    Make named temporary files; files will not be deleted upon exit!
    :param number_of_files:
    :return:
        filepaths
    """
    temp_files = []
    for _ in range(number_of_files):
        temp_files.append(tempfile.NamedTemporaryFile(delete=False).name)
    return temp_files


def sum_up_stats_files(files):
    """
    Sum up classification counts across multiple statistics files
    :return:
        list of tab-delimited lines, header first, classifications sorted
    """
    new_statistics = {}
    for file in files:
        with open(file, 'r') as fh:
            for line in fh:
                items = line.strip().split('\t')
                if items[0] == 'Classification':
                    # every statistics file is expected to carry this header row
                    header = items
                    continue
                else:
                    counts = [int(item) for item in items[1:]]
                    if items[0] in new_statistics:
                        new_statistics[items[0]] = [
                            sum(x) for x in
                            zip(new_statistics[items[0]], counts)
                            ]
                    else:
                        new_statistics[items[0]] = counts
    # convert to strings, first line is the header
    statistics_str = []
    for classification, counts in new_statistics.items():
        statistics_str.append(
            classification + '\t' + '\t'.join([str(x) for x in counts])
            )
    sorted_stat_with_header = ['\t'.join(header)] + sorted(statistics_str)
    return sorted_stat_with_header
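
# A minimal illustration (hypothetical content): given two chunk statistics
# files that both contain the rows 'Classification\tCount' and 'Ty1_copia\t3',
# sum_up_stats_files() returns ['Classification\tCount', 'Ty1_copia\t6'];
# counts are summed column-wise per classification and rows come back sorted,
# header first.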


def main():
    """
    Main function
    """
    args = get_arguments()
    # locate the directory of the current script
    tool_path = os.path.dirname(os.path.realpath(__file__))
    fasta_seq_size = read_fasta_sequence_size(args.reference_sequence)
    total_size = sum(fasta_seq_size.values())
    number_of_sequences = len(fasta_seq_size)
    if total_size > args.max_chunk_size and number_of_sequences > 1:
        # sort sequence IDs by sequence size, largest first
        seq_id_size_sorted = [i[0] for i in sorted(
            fasta_seq_size.items(), key=lambda x: int(x[1]), reverse=True
        )]
        number_of_temp_files = int(total_size / args.max_chunk_size) + 1
        if number_of_temp_files > number_of_sequences:
            number_of_temp_files = number_of_sequences

        temp_files_fasta = make_temp_files(number_of_temp_files)
        file_handles = [open(temp_file, 'w') for temp_file in temp_files_fasta]
        # map each sequence ID to a file handle, round-robin
        seq_id_file_handle_dict = dict(zip(seq_id_size_sorted, cycle(file_handles)))
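        # sequences were sorted from largest to smallest, so the round-robin
        # assignment produced by cycle() keeps the chunk sizes roughly balanced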

        # write sequences to temporary files
        with open(args.reference_sequence, 'r') as f:
            for line in f:
                if line[0] == '>':
                    header = line.strip().split(' ')[0][1:]
                    print(header)
                seq_id_file_handle_dict[header].write(line)
        # close file handles
        for file_handle in file_handles:
            file_handle.close()

        # split the gff3 file into temporary files -
        # each temporary file holds the gff lines matching one fasta chunk
        temp_files_gff = make_temp_files(number_of_temp_files)
        file_handles = [open(temp_file, 'w') for temp_file in temp_files_gff]
        # reuse the same sequence-to-handle assignment so gff chunks match
        # the fasta chunks
        seq_id_file_handle_dict = dict(zip(seq_id_size_sorted, cycle(file_handles)))
        # write gff lines to chunks, skipping comment lines
        with open(args.gff3, 'r') as f:
            for line in f:
                if line[0] == '#':
                    continue
                header = line.strip().split('\t')[0]
                seq_id_file_handle_dict[header].write(line)
        # close file handles
        for file_handle in file_handles:
            file_handle.close()

        # run retrotransposon detection on each chunk
        output_files = make_temp_files(number_of_temp_files)
        for i in range(number_of_temp_files):
            print('Running retrotransposon detection on file ' + str(i))
            subprocess.check_call(
                [f'{tool_path}/detect_putative_ltr.R', '-s', temp_files_fasta[i],
                 '-g', temp_files_gff[i], '-o', output_files[i], '-c',
                 str(args.cpu), '-M', str(args.max_missing_domains), '-L',
                 str(args.min_relative_length)]
            )
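            # note: check_call() raises CalledProcessError when
            # detect_putative_ltr.R exits with a non-zero status, so a failed
            # chunk aborts the wrapper instead of being silently skipped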

        # remove all temporary input files
        for temp_file in temp_files_fasta + temp_files_gff:
            os.remove(temp_file)

        # concatenate output files
        output_file_suffixes = ['_D.fasta', '_DL.fasta', '_DLT.fasta',
                                '_DLTP.fasta', '_DLP.fasta', '.gff3',
                                '_statistics.csv']

        for suffix in output_file_suffixes:
            if suffix == '_statistics.csv':
                # sum up lines sharing the same classification in the first column
                stat_files = [output_file + suffix for output_file in output_files]
                new_statistics = sum_up_stats_files(stat_files)
                with open(args.output + suffix, 'w') as f:
                    f.write("\n".join(new_statistics))
                # remove the parsed temporary statistics files
                for file in stat_files:
                    os.remove(file)
            else:
                with open(args.output + suffix, 'w') as f:
                    for i in range(number_of_temp_files):
                        # a chunk may not have produced this output, so check
                        try:
                            with open(output_files[i] + suffix, 'r') as g:
                                for line in g:
                                    f.write(line)
                            # remove the parsed temporary output file
                            os.remove(output_files[i] + suffix)
                        except FileNotFoundError:
                            pass
    else:
        # no need to split sequences into chunks
        subprocess.check_call(
            [f'{tool_path}/detect_putative_ltr.R', '-s', args.reference_sequence,
             '-g', args.gff3, '-o', args.output, '-c', str(args.cpu), '-M',
             str(args.max_missing_domains), '-L', str(args.min_relative_length)]
        )


if __name__ == '__main__':
    # Python 3.6 or greater is required (the script uses f-strings)
    if sys.version_info < (3, 6):
        print('Python version must be 3.6 or greater')
        sys.exit(1)
    main()