annotate RaGOO/ragoo_utilities/utilities.py @ 13:b9a3aeb162ab draft default tip

Uploaded
author dereeper
date Mon, 26 Jul 2021 18:22:37 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
13
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
1 import time
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
2 import subprocess
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
3 import operator
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
4
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
5 from ragoo_utilities.SeqReader import SeqReader
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
6
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
7 """ A collection of various helper functions"""
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
8
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
9 complements = str.maketrans("ACGTNURYSWKMBVDHacgtnuryswkmbvdh", "TGCANAYRSWMKVBHDtgcanayrswmkvbhd")
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
10
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
11
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
12 def reverse_complement(seq):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
13 """
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
14 Reverse complement a nucleotide sequence.
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
15 :param seq: Sequence to be reverse complemented
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
16 :return: A reverse complemented sequence
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
17 """
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
18 return seq.translate(complements)[::-1]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
19
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
20
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
21 def run(cmnd):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
22 """ Run command and report status. """
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
23 log('Running : %s' % cmnd)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
24 if subprocess.call(cmnd, shell=True, executable='/bin/bash') != 0:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
25 raise RuntimeError('Failed : %s ' % cmnd)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
26
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
27
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
28 def log(message):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
29 """ Log messages to standard output. """
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
30 print(time.ctime() + ' --- ' + message, flush=True)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
31
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
32
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
33 def read_contigs(in_file):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
34 d = dict()
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
35 x = SeqReader(in_file)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
36 for header, seq in x.parse_fasta():
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
37 d[header.replace('>', '').split(' ')[0]] = seq
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
38 return d
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
39
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
40
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
41 def read_gz_contigs(in_file):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
42 d = dict()
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
43 x = SeqReader(in_file)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
44 for header, seq in x.parse_gzip_fasta():
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
45 d[header.replace('>', '').split(' ')[0]] = seq
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
46 return d
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
47
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
48
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
49 def binary_search(query, numbers, left, right):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
50 """
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
51 The contents of this method are either influenced by or directly copied from "Assemblytics_uniq_anchor.py"
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
52 written by Maria Nattestad. The original script can be found here:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
53
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
54 https://github.com/MariaNattestad/Assemblytics
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
55
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
56 And the publication associated with Maria's work is here:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
57
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
58 Nattestad, Maria, and Michael C. Schatz. "Assemblytics: a
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
59 web analytics tool for the detection of variants from an
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
60 assembly." Bioinformatics 32.19 (2016): 3021-3023.
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
61 """
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
62 # Returns index of the matching element or the first element to the right
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
63
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
64 if left >= right:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
65 return right
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
66 mid = (right + left) // 2
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
67
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
68 if query == numbers[mid]:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
69 return mid
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
70 elif query < numbers[mid]:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
71 return binary_search(query, numbers, left, mid)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
72 else: # if query > numbers[mid]:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
73 return binary_search(query, numbers, mid + 1, right)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
74
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
75
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
76 def summarize_planesweep(lines, unique_length_required, keep_small_uniques=False):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
77 """
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
78 The contents of this method are either influenced by or directly copied from "Assemblytics_uniq_anchor.py"
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
79 written by Maria Nattestad. The original script can be found here:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
80
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
81 https://github.com/MariaNattestad/Assemblytics
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
82
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
83 And the publication associated with Maria's work is here:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
84
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
85 Nattestad, Maria, and Michael C. Schatz. "Assemblytics: a
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
86 web analytics tool for the detection of variants from an
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
87 assembly." Bioinformatics 32.19 (2016): 3021-3023.
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
88 """
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
89 alignments_to_keep = []
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
90 # print len(lines)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
91
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
92 # If no alignments:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
93 if len(lines) == 0:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
94 return []
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
95
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
96 # If only one alignment:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
97 if len(lines) == 1:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
98 if keep_small_uniques == True or abs(lines[0][1] - lines[0][0]) >= unique_length_required:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
99 return [0]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
100 else:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
101 return []
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
102
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
103 starts_and_stops = []
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
104 for query_min, query_max in lines:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
105 # print query_min, query_max
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
106 starts_and_stops.append((query_min, "start"))
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
107 starts_and_stops.append((query_max, "stop"))
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
108
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
109 sorted_starts_and_stops = sorted(starts_and_stops, key=operator.itemgetter(0))
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
110
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
111 current_coverage = 0
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
112 last_position = -1
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
113 sorted_unique_intervals_left = []
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
114 sorted_unique_intervals_right = []
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
115 for pos, change in sorted_starts_and_stops:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
116
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
117 if current_coverage == 1:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
118 # sorted_unique_intervals.append((last_position,pos))
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
119 sorted_unique_intervals_left.append(last_position)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
120 sorted_unique_intervals_right.append(pos)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
121
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
122 if change == "start":
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
123 current_coverage += 1
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
124 else:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
125 current_coverage -= 1
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
126 last_position = pos
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
127
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
128 linecounter = 0
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
129 for query_min, query_max in lines:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
130
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
131 i = binary_search(query_min, sorted_unique_intervals_left, 0, len(sorted_unique_intervals_left))
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
132
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
133 exact_match = False
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
134 if sorted_unique_intervals_left[i] == query_min and sorted_unique_intervals_right[i] == query_max:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
135 exact_match = True
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
136 sum_uniq = 0
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
137 while i < len(sorted_unique_intervals_left) and sorted_unique_intervals_left[i] >= query_min and \
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
138 sorted_unique_intervals_right[i] <= query_max:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
139 sum_uniq += sorted_unique_intervals_right[i] - sorted_unique_intervals_left[i]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
140 i += 1
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
141
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
142 # print query_min,query_max,sum_uniq
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
143 if sum_uniq >= unique_length_required:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
144 alignments_to_keep.append(linecounter)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
145 elif keep_small_uniques == True and exact_match == True:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
146 alignments_to_keep.append(linecounter)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
147 # print "Keeping small alignment:", query_min, query_max
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
148 # print sorted_unique_intervals_left[i-1],sorted_unique_intervals_right[i-1]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
149
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
150 linecounter += 1
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
151
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
152 return alignments_to_keep