comparison RaGOO/ragoo_utilities/utilities.py @ 13:b9a3aeb162ab draft default tip

Uploaded
author dereeper
date Mon, 26 Jul 2021 18:22:37 +0000
parents
children
comparison
equal deleted inserted replaced
12:68a9ec9ce51e 13:b9a3aeb162ab
1 import time
2 import subprocess
3 import operator
4
5 from ragoo_utilities.SeqReader import SeqReader
6
7 """ A collection of various helper functions"""
8
9 complements = str.maketrans("ACGTNURYSWKMBVDHacgtnuryswkmbvdh", "TGCANAYRSWMKVBHDtgcanayrswmkvbhd")
10
11
12 def reverse_complement(seq):
13 """
14 Reverse complement a nucleotide sequence.
15 :param seq: Sequence to be reverse complemented
16 :return: A reverse complemented sequence
17 """
18 return seq.translate(complements)[::-1]
19
20
21 def run(cmnd):
22 """ Run command and report status. """
23 log('Running : %s' % cmnd)
24 if subprocess.call(cmnd, shell=True, executable='/bin/bash') != 0:
25 raise RuntimeError('Failed : %s ' % cmnd)
26
27
28 def log(message):
29 """ Log messages to standard output. """
30 print(time.ctime() + ' --- ' + message, flush=True)
31
32
33 def read_contigs(in_file):
34 d = dict()
35 x = SeqReader(in_file)
36 for header, seq in x.parse_fasta():
37 d[header.replace('>', '').split(' ')[0]] = seq
38 return d
39
40
41 def read_gz_contigs(in_file):
42 d = dict()
43 x = SeqReader(in_file)
44 for header, seq in x.parse_gzip_fasta():
45 d[header.replace('>', '').split(' ')[0]] = seq
46 return d
47
48
49 def binary_search(query, numbers, left, right):
50 """
51 The contents of this method are either influenced by or directly copied from "Assemblytics_uniq_anchor.py"
52 written by Maria Nattestad. The original script can be found here:
53
54 https://github.com/MariaNattestad/Assemblytics
55
56 And the publication associated with Maria's work is here:
57
58 Nattestad, Maria, and Michael C. Schatz. "Assemblytics: a
59 web analytics tool for the detection of variants from an
60 assembly." Bioinformatics 32.19 (2016): 3021-3023.
61 """
62 # Returns index of the matching element or the first element to the right
63
64 if left >= right:
65 return right
66 mid = (right + left) // 2
67
68 if query == numbers[mid]:
69 return mid
70 elif query < numbers[mid]:
71 return binary_search(query, numbers, left, mid)
72 else: # if query > numbers[mid]:
73 return binary_search(query, numbers, mid + 1, right)
74
75
76 def summarize_planesweep(lines, unique_length_required, keep_small_uniques=False):
77 """
78 The contents of this method are either influenced by or directly copied from "Assemblytics_uniq_anchor.py"
79 written by Maria Nattestad. The original script can be found here:
80
81 https://github.com/MariaNattestad/Assemblytics
82
83 And the publication associated with Maria's work is here:
84
85 Nattestad, Maria, and Michael C. Schatz. "Assemblytics: a
86 web analytics tool for the detection of variants from an
87 assembly." Bioinformatics 32.19 (2016): 3021-3023.
88 """
89 alignments_to_keep = []
90 # print len(lines)
91
92 # If no alignments:
93 if len(lines) == 0:
94 return []
95
96 # If only one alignment:
97 if len(lines) == 1:
98 if keep_small_uniques == True or abs(lines[0][1] - lines[0][0]) >= unique_length_required:
99 return [0]
100 else:
101 return []
102
103 starts_and_stops = []
104 for query_min, query_max in lines:
105 # print query_min, query_max
106 starts_and_stops.append((query_min, "start"))
107 starts_and_stops.append((query_max, "stop"))
108
109 sorted_starts_and_stops = sorted(starts_and_stops, key=operator.itemgetter(0))
110
111 current_coverage = 0
112 last_position = -1
113 sorted_unique_intervals_left = []
114 sorted_unique_intervals_right = []
115 for pos, change in sorted_starts_and_stops:
116
117 if current_coverage == 1:
118 # sorted_unique_intervals.append((last_position,pos))
119 sorted_unique_intervals_left.append(last_position)
120 sorted_unique_intervals_right.append(pos)
121
122 if change == "start":
123 current_coverage += 1
124 else:
125 current_coverage -= 1
126 last_position = pos
127
128 linecounter = 0
129 for query_min, query_max in lines:
130
131 i = binary_search(query_min, sorted_unique_intervals_left, 0, len(sorted_unique_intervals_left))
132
133 exact_match = False
134 if sorted_unique_intervals_left[i] == query_min and sorted_unique_intervals_right[i] == query_max:
135 exact_match = True
136 sum_uniq = 0
137 while i < len(sorted_unique_intervals_left) and sorted_unique_intervals_left[i] >= query_min and \
138 sorted_unique_intervals_right[i] <= query_max:
139 sum_uniq += sorted_unique_intervals_right[i] - sorted_unique_intervals_left[i]
140 i += 1
141
142 # print query_min,query_max,sum_uniq
143 if sum_uniq >= unique_length_required:
144 alignments_to_keep.append(linecounter)
145 elif keep_small_uniques == True and exact_match == True:
146 alignments_to_keep.append(linecounter)
147 # print "Keeping small alignment:", query_min, query_max
148 # print sorted_unique_intervals_left[i-1],sorted_unique_intervals_right[i-1]
149
150 linecounter += 1
151
152 return alignments_to_keep