annotate reAnnotate.py @ 11:5366d5ea04bc draft

planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
author petr-novak
date Fri, 04 Aug 2023 12:35:32 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
11
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
1 #!/usr/bin/env python
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
2 """
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
3 parse blast output table to gff file
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
4 """
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
5 import argparse
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
6 import itertools
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
7 import os
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
8 import re
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
9 import shutil
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
10 import subprocess
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
11 import sys
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
12 import tempfile
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
13 from collections import defaultdict
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
14
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
15 # check version of python, must be at least 3.7
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
16 if sys.version_info < (3, 10):
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
17 sys.exit("Python 3.10 or a more recent version is required.")
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
18
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
19 def make_temp_files(number_of_files):
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
20 """
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
21 Make named temporary files, file will not be deleted upon exit!
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
22 :param number_of_files:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
23 :return:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
24 filepaths
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
25 """
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
26 temp_files = []
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
27 for i in range(number_of_files):
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
28 temp_files.append(tempfile.NamedTemporaryFile(delete=False).name)
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
29 os.remove(temp_files[-1])
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
30 return temp_files
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
31
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
32
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
33 def split_fasta_to_chunks(fasta_file, chunk_size=100000000, overlap=100000):
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
34 """
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
35 Split fasta file to chunks, sequences longe than chuck size are split to overlaping
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
36 peaces. If sequences are shorter, chunck with multiple sequences are created.
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
37 :param fasta_file:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
38
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
39 :param fasta_file:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
40 :param chunk_size:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
41 :param overlap:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
42 :return:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
43 fasta_file_split
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
44 matching_table (list of lists [header,chunk_number, start, end, new_header])
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
45 """
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
46 min_chunk_size = chunk_size * 2
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
47 fasta_sizes_dict = read_fasta_sequence_size(fasta_file)
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
48 # calculate size of items in fasta_dist dictionary
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
49 fasta_size = sum(fasta_sizes_dict.values())
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
50
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
51 # calculates ranges for splitting of fasta files and store them in list
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
52 matching_table = []
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
53 fasta_file_split = tempfile.NamedTemporaryFile(delete=False).name
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
54 for header, size in fasta_sizes_dict.items():
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
55 print(header, size, min_chunk_size)
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
56
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
57 if size > min_chunk_size:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
58 number_of_chunks = int(size / chunk_size)
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
59 print("number_of_chunks", number_of_chunks)
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
60 print("size", size)
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
61 print("chunk_size", chunk_size)
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
62 print("-----------------------------------------")
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
63 adjusted_chunk_size = int(size / number_of_chunks)
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
64 for i in range(number_of_chunks):
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
65 start = i * adjusted_chunk_size
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
66 end = ((i + 1) *
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
67 adjusted_chunk_size
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
68 + overlap) if i + 1 < number_of_chunks else size
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
69 new_header = header + '_' + str(i)
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
70 matching_table.append([header, i, start, end, new_header])
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
71 else:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
72 new_header = header + '_0'
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
73 matching_table.append([header, 0, 0, size, new_header])
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
74 # read sequences from fasta files and split them to chunks according to matching table
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
75 # open output and input files, use with statement to close files
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
76 number_of_temp_files = len(matching_table)
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
77 print('number of temp files', number_of_temp_files)
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
78 fasta_dict = read_single_fasta_to_dictionary(open(fasta_file, 'r'))
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
79 with open(fasta_file_split, 'w') as fh_out:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
80 for header in fasta_dict:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
81 matching_table_part = [x for x in matching_table if x[0] == header]
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
82 for header2, i, start, end, new_header in matching_table_part:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
83 fh_out.write('>' + new_header + '\n')
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
84 fh_out.write(fasta_dict[header][start:end] + '\n')
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
85 temp_files_fasta = make_temp_files(number_of_temp_files)
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
86 fasta_seq_size = read_fasta_sequence_size(fasta_file_split)
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
87 seq_id_size_sorted = [i[0] for i in sorted(
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
88 fasta_seq_size.items(), key=lambda x: int(x[1]), reverse=True
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
89 )]
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
90 seq_id_file_dict = dict(zip(seq_id_size_sorted, itertools.cycle(temp_files_fasta)))
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
91 # write sequences to temporary files
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
92 with open(fasta_file_split, 'r') as f:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
93 first = True
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
94 for line in f:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
95 if line[0] == '>':
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
96 # close previous file if it is not the first sequence
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
97 if not first:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
98 fout.close()
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
99 first = False
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
100 header = line.strip().split(' ')[0][1:]
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
101 fout = open(seq_id_file_dict[header],'a')
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
102 fout.write(line)
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
103 else:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
104 fout.write(line)
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
105 os.remove(fasta_file_split)
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
106 return temp_files_fasta, matching_table
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
107
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
108
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
109 def read_fasta_sequence_size(fasta_file):
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
110 """Read size of sequence into dictionary"""
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
111 fasta_dict = {}
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
112 with open(fasta_file, 'r') as f:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
113 for line in f:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
114 if line[0] == '>':
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
115 header = line.strip().split(' ')[0][1:] # remove part of name after space
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
116 fasta_dict[header] = 0
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
117 else:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
118 fasta_dict[header] += len(line.strip())
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
119 return fasta_dict
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
120
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
121
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
122 def read_single_fasta_to_dictionary(fh):
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
123 """
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
124 Read fasta file into dictionary
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
125 :param fh:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
126 :return:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
127 fasta_dict
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
128 """
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
129 fasta_dict = {}
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
130 for line in fh:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
131 if line[0] == '>':
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
132 header = line.strip().split(' ')[0][1:] # remove part of name after space
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
133 fasta_dict[header] = []
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
134 else:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
135 fasta_dict[header] += [line.strip()]
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
136 fasta_dict = {k: ''.join(v) for k, v in fasta_dict.items()}
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
137 return fasta_dict
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
138
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
139
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
140 def overlap(a, b):
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
141 """
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
142 check if two intervals overlap
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
143 """
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
144 return max(a[0], b[0]) <= min(a[1], b[1])
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
145
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
146
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
147 def blast2disjoint(
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
148 blastfile, seqid_counts=None, start_column=6, end_column=7, class_column=1,
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
149 bitscore_column=11, pident_column=2, canonical_classification=True
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
150 ):
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
151 """
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
152 find all interval beginning and ends in blast file and create bed file
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
153 input blastfile is tab separated file with columns:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
154 'qaccver saccver pident length mismatch gapopen qstart qend sstart send
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
155 evalue bitscore' (default outfmt 6
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
156 blast must be sorted on qseqid and qstart
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
157 """
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
158 # assume all in one chromosome!
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
159 starts_ends = {}
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
160 intervals = {}
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
161 if canonical_classification:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
162 # make regular expression for canonical classification
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
163 # to match: Name#classification
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
164 # e.g. "Name_of_sequence#LTR/Ty1_copia/Angela"
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
165 regex = re.compile(r"(.*)[#](.*)")
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
166 group = 2
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
167 else:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
168 # make regular expression for non-canonical classification
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
169 # to match: Classification__Name
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
170 # e.g. "LTR/Ty1_copia/Angela__Name_of_sequence"
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
171 regex = re.compile(r"(.*)__(.*)")
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
172 group = 1
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
173
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
174 # identify continuous intervals
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
175 with open(blastfile, "r") as f:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
176 for seqid in sorted(seqid_counts.keys()):
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
177 n_lines = seqid_counts[seqid]
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
178 starts_ends[seqid] = set()
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
179 for i in range(n_lines):
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
180 items = f.readline().strip().split()
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
181 # note 1s and 2s labels are used to distinguish between start and end and
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
182 # guarantee that with same coordinated start will be before end when
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
183 # sorting (1s < 2e)
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
184 starts_ends[seqid].add((int(items[start_column]), '1s'))
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
185 starts_ends[seqid].add((int(items[end_column]), '2e'))
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
186 intervals[seqid] = []
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
187 for p1, p2 in itertools.pairwise(sorted(starts_ends[seqid])):
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
188 if p1[1] == '1s':
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
189 sp = 0
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
190 else:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
191 sp = 1
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
192 if p2[1] == '2e':
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
193 ep = 0
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
194 else:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
195 ep = 1
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
196 intervals[seqid].append((p1[0] + sp, p2[0] - ep))
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
197 # scan each blast hit against continuous region and record hit with best score
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
198 with open(blastfile, "r") as f:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
199 disjoint_regions = []
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
200 for seqid in sorted(seqid_counts.keys()):
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
201 n_lines = seqid_counts[seqid]
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
202 idx_of_overlaps = {}
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
203 best_pident = defaultdict(lambda: 0.0)
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
204 best_bitscore = defaultdict(lambda: 0.0)
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
205 best_hit_name = defaultdict(lambda: "")
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
206 i1 = 0
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
207 for i in range(n_lines):
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
208 items = f.readline().strip().split()
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
209 start = int(items[start_column])
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
210 end = int(items[end_column])
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
211 pident = float(items[pident_column])
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
212 bitscore = float(items[bitscore_column])
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
213 classification = items[class_column]
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
214 j = 0
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
215 done = False
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
216 while True:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
217 # beginning of searched region - does it overlap?
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
218 c_ovl = overlap(intervals[seqid][i1], (start, end))
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
219 if c_ovl:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
220 # if overlap is detected, add to dictionary
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
221 idx_of_overlaps[i] = [i1]
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
222 if best_bitscore[i1] < bitscore:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
223 best_pident[i1] = pident
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
224 best_bitscore[i1] = bitscore
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
225 best_hit_name[i1] = classification
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
226 # add search also downstream
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
227 while True:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
228 j += 1
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
229 if j + i1 >= len(intervals[seqid]):
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
230 done = True
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
231 break
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
232 c_ovl = overlap(intervals[seqid][i1 + j], (start, end))
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
233 if c_ovl:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
234 idx_of_overlaps[i].append(i1 + j)
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
235 if best_bitscore[i1 + j] < bitscore:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
236 best_pident[i1 + j] = pident
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
237 best_bitscore[i1 + j] = bitscore
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
238 best_hit_name[i1 + j] = classification
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
239 else:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
240 done = True
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
241 break
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
242
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
243 else:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
244 # does no overlap - search next interval
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
245 i1 += 1
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
246 if done or i1 >= (len(intervals[seqid]) - 1):
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
247 break
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
248
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
249 for i in sorted(best_pident.keys()):
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
250 try:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
251 classification = re.match(regex, best_hit_name[i]).group(group)
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
252 except AttributeError:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
253 classification = best_hit_name[i]
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
254 record = (
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
255 seqid, intervals[seqid][i][0], intervals[seqid][i][1], best_pident[i],
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
256 classification)
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
257 disjoint_regions.append(record)
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
258 return disjoint_regions
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
259
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
260
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
261 def remove_short_interrupting_regions(regions, min_len=10, max_gap=2):
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
262 """
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
263 remove intervals shorter than min_len which are directly adjacent to other
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
264 regions on both sides which are longer than min_len and has same classification
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
265 """
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
266 regions_to_remove = []
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
267 for i in range(1, len(regions) - 1):
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
268 if regions[i][2] - regions[i][1] < min_len:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
269 c1 = regions[i - 1][2] - regions[i - 1][1] > min_len
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
270 c2 = regions[i + 1][2] - regions[i + 1][1] > min_len
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
271 c3 = regions[i - 1][4] == regions[i + 1][4] # same classification
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
272 c4 = regions[i + 1][4] != regions[i][4] # different classification
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
273 c5 = regions[i][1] - regions[i - 1][2] < max_gap # max gap between regions
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
274 c6 = regions[i + 1][1] - regions[i][2] < max_gap # max gap between regions
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
275 if c1 and c2 and c3 & c4 and c5 and c6:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
276 regions_to_remove.append(i)
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
277 for i in sorted(regions_to_remove, reverse=True):
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
278 del regions[i]
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
279 return regions
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
280
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
281
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
282 def remove_short_regions(regions, min_l_score=600):
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
283 """
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
284 remove intervals shorter than min_len
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
285 min_l_score is the minimum score for a region to be considered
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
286 l_score = length * PID
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
287 """
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
288 regions_to_remove = []
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
289 for i in range(len(regions)):
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
290 l_score = (regions[i][3] - 50) * (regions[i][2] - regions[i][1])
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
291 if l_score < min_l_score:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
292 regions_to_remove.append(i)
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
293 for i in sorted(regions_to_remove, reverse=True):
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
294 del regions[i]
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
295 return regions
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
296
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
297
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
298 def join_disjoint_regions_by_classification(disjoint_regions, max_gap=0):
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
299 """
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
300 merge neighboring intervals with same classification and calculate mean weighted score
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
301 weight correspond to length of the interval
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
302 """
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
303 merged_regions = []
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
304 for seqid, start, end, score, classification in disjoint_regions:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
305 score_length = (end - start + 1) * score
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
306 if len(merged_regions) == 0:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
307 merged_regions.append([seqid, start, end, score_length, classification])
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
308 else:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
309 cond_same_class = merged_regions[-1][4] == classification
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
310 cond_same_seqid = merged_regions[-1][0] == seqid
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
311 cond_neighboring = start - merged_regions[-1][2] + 1 <= max_gap
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
312 if cond_same_class and cond_same_seqid and cond_neighboring:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
313 # extend region
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
314 merged_regions[-1] = [merged_regions[-1][0], merged_regions[-1][1], end,
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
315 merged_regions[-1][3] + score_length,
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
316 merged_regions[-1][4]]
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
317 else:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
318 merged_regions.append([seqid, start, end, score_length, classification])
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
319 # recalculate length weighted score
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
320 for record in merged_regions:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
321 record[3] = record[3] / (record[2] - record[1] + 1)
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
322 return merged_regions
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
323
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
324
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
325 def write_merged_regions_to_gff3(merged_regions, outfile):
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
326 """
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
327 write merged regions to gff3 file
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
328 """
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
329 with open(outfile, "w") as f:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
330 # write header
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
331 f.write("##gff-version 3\n")
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
332 for seqid, start, end, score, classification in merged_regions:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
333 attributes = "Name={};score={}".format(classification, score)
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
334 f.write(
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
335 "\t".join(
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
336 [seqid, "blast_parsed", "repeat_region", str(start), str(end),
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
337 str(round(score,2)), ".", ".", attributes]
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
338 )
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
339 )
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
340 f.write("\n")
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
341
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
342
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
343 def sort_blast_table(
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
344 blastfile, seqid_column=0, start_column=6, cpu=1
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
345 ):
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
346 """
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
347 split blast table by seqid and sort by start position
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
348 stores output in temp files
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
349 columns are indexed from 0
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
350 but cut uses 1-based indexing!
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
351 """
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
352 blast_sorted = tempfile.NamedTemporaryFile().name
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
353 # create sorted dictionary seqid counts
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
354 seq_id_counts = {}
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
355 # sort blast file on disk using sort on seqid and start (numeric) position columns
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
356 # using sort command as blast output could be very large
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
357 cmd = "sort -k {0},{0} -k {1},{1}n --parallel {4} {2} > {3}".format(
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
358 seqid_column + 1, start_column + 1, blastfile, blast_sorted, cpu
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
359 )
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
360 subprocess.check_call(cmd, shell=True)
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
361
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
362 # count seqids using uniq command
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
363 cmd = "cut -f {0} {1} | uniq -c > {2}".format(
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
364 seqid_column + 1, blast_sorted, blast_sorted + ".counts"
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
365 )
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
366 subprocess.check_call(cmd, shell=True)
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
367 # read counts file and create dictionary
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
368 with open(blast_sorted + ".counts", "r") as f:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
369 for line in f:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
370 line = line.strip().split()
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
371 seq_id_counts[line[1]] = int(line[0])
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
372 # remove counts file
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
373 subprocess.call(["rm", blast_sorted + ".counts"])
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
374 # return sorted dictionary and sorted blast file
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
375 return seq_id_counts, blast_sorted
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
376
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
377
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
378 def run_blastn(
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
379 query, db, blastfile, evalue=1e-3, max_target_seqs=999999999, gapopen=2,
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
380 gapextend=1, reward=1, penalty=-1, word_size=9, num_threads=1, outfmt="6"
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
381 ):
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
382 """
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
383 run blastn
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
384 """
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
385 # create temporary blast database:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
386 db_formated = tempfile.NamedTemporaryFile().name
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
387 cmd = "makeblastdb -in {0} -dbtype nucl -out {1}".format(db, db_formated)
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
388 subprocess.check_call(cmd, shell=True)
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
389 # if query is smaller than 1GB, run blast on single file
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
390 size = os.path.getsize(query)
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
391 print("query size: {} bytes".format(size))
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
392 max_size = 1e6
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
393 overlap = 50000
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
394 if size < max_size:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
395 cmd = ("blastn -task rmblastn -query {0} -db {1} -out {2} -evalue {3} "
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
396 "-max_target_seqs {4} "
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
397 "-gapopen {5} -gapextend {6} -word_size {7} -num_threads "
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
398 "{8} -outfmt '{9}' -reward {10} -penalty {11} -dust no").format(
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
399 query, db_formated, blastfile, evalue, max_target_seqs, gapopen, gapextend,
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
400 word_size, num_threads, outfmt, reward, penalty
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
401 )
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
402 subprocess.check_call(cmd, shell=True)
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
403 # if query is larger than 1GB, split query in chunks and run blast on each chunk
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
404 else:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
405 print(f"query is larger than {max_size}, splitting query in chunks")
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
406 query_parts, matching_table = split_fasta_to_chunks(query, max_size, overlap)
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
407 print(query_parts)
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
408 for i, part in enumerate(query_parts):
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
409 print(f"running blast on chunk {i}")
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
410 print(part)
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
411 cmd = ("blastn -task rmblastn -query {0} -db {1} -out {2} -evalue {3} "
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
412 "-max_target_seqs {4} "
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
413 "-gapopen {5} -gapextend {6} -word_size {7} -num_threads "
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
414 "{8} -outfmt '{9}' -reward {10} -penalty {11} -dust no").format(
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
415 part, db_formated, f'{blastfile}.{i}', evalue, max_target_seqs, gapopen,
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
416 gapextend,
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
417 word_size, num_threads, outfmt, reward, penalty
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
418 )
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
419 subprocess.check_call(cmd, shell=True)
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
420 print(cmd)
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
421 # remove part file
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
422 # os.unlink(part)
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
423 # merge blast results and recalculate start, end positions and header
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
424 merge_blast_results(blastfile, matching_table, n_parts=len(query_parts))
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
425
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
426 # remove temporary blast database
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
427 os.unlink(db_formated + ".nhr")
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
428 os.unlink(db_formated + ".nin")
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
429 os.unlink(db_formated + ".nsq")
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
430
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
431 def merge_blast_results(blastfile, matching_table, n_parts):
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
432 """
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
433 Merge blast tables and recalculate start, end positions based on
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
434 matching table
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
435 """
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
436 with open(blastfile, "w") as f:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
437 matching_table_dict = {i[4]: i for i in matching_table}
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
438 print(matching_table_dict)
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
439 for i in range(n_parts):
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
440 with open(f'{blastfile}.{i}', "r") as f2:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
441 for line in f2:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
442 line = line.strip().split("\t")
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
443 # seqid (header) is in column 1
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
444 seqid = line[0]
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
445 line[0] = matching_table_dict[seqid][0]
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
446 # increase coordinates by start position of chunk
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
447 line[6] = str(int(line[6]) + matching_table_dict[seqid][2])
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
448 line[7] = str(int(line[7]) + matching_table_dict[seqid][2])
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
449 f.write("\t".join(line) + "\n")
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
450 # remove temporary blast file
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
451 # os.unlink(f'{blastfile}.{i}')
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
452
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
453 def main():
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
454 """
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
455 main function
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
456 """
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
457 # get command line arguments
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
458 parser = argparse.ArgumentParser(
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
459 description="""This script is used to parse blast output table to gff file""",
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
460 formatter_class=argparse.RawTextHelpFormatter
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
461 )
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
462 parser.add_argument(
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
463 '-i', '--input', default=None, required=True, help="input file", type=str,
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
464 action='store'
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
465 )
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
466 parser.add_argument(
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
467 '-d', '--db', default=None, required=False,
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
468 help="Fasta file with repeat database", type=str, action='store'
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
469 )
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
470 parser.add_argument(
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
471 '-o', '--output', default=None, required=True, help="output file name", type=str,
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
472 action='store'
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
473 )
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
474 parser.add_argument(
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
475 '-a', '--alternative_classification_coding', default=False,
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
476 help="Use alternative classification coding", action='store_true'
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
477 )
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
478 parser.add_argument(
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
479 '-f', '--fasta_input', default=False,
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
480 help="Input is fasta file instead of blast table", action='store_true'
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
481 )
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
482 parser.add_argument(
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
483 '-c', '--cpu', default=1, help="Number of cpu to use", type=int
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
484 )
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
485
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
486 args = parser.parse_args()
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
487
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
488 if args.fasta_input:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
489 # run blast using blastn
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
490 blastfile = tempfile.NamedTemporaryFile().name
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
491 if args.db:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
492 run_blastn(args.input, args.db, blastfile, num_threads=args.cpu)
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
493 else:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
494 sys.exit("No repeat database provided")
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
495 else:
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
496 blastfile = args.input
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
497
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
498 # sort blast table
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
499 seq_id_counts, blast_sorted = sort_blast_table(blastfile, cpu=args.cpu)
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
500 disjoin_regions = blast2disjoint(
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
501 blast_sorted, seq_id_counts,
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
502 canonical_classification=not args.alternative_classification_coding
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
503 )
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
504
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
505 # remove short regions
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
506 disjoin_regions = remove_short_interrupting_regions(disjoin_regions)
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
507
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
508 # join neighboring regions with same classification
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
509 merged_regions = join_disjoint_regions_by_classification(disjoin_regions)
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
510
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
511 # remove short regions again
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
512 merged_regions = remove_short_interrupting_regions(merged_regions)
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
513
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
514 # merge again neighboring regions with same classification
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
515 merged_regions = join_disjoint_regions_by_classification(merged_regions, max_gap=10)
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
516
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
517 # remove short weak regions
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
518 merged_regions = remove_short_regions(merged_regions)
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
519
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
520 # last merge
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
521 merged_regions = join_disjoint_regions_by_classification(merged_regions, max_gap=20)
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
522 write_merged_regions_to_gff3(merged_regions, args.output)
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
523 # remove temporary files
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
524 os.remove(blast_sorted)
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
525
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
526
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
527 if __name__ == "__main__":
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff changeset
528 main()