comparison fastaregexfinder.py @ 1:9a811adb714f draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/tools/bioinformatics-cafe commit c318cd3f894f89c13d7eb257678673da70f72212
author iuc
date Wed, 25 Jan 2023 14:03:52 +0000
parents 269c627ae9f4
children
comparison
equal deleted inserted replaced
0:269c627ae9f4 1:9a811adb714f
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 2
3 import argparse
4 import operator
3 import re 5 import re
4 import sys 6 import sys
5 import string 7
6 import argparse 8 VERSION = "0.1.1"
7 import operator 9
8 10 parser = argparse.ArgumentParser(
9 VERSION='0.1.1' 11 description="""
10
11 parser = argparse.ArgumentParser(description="""
12 12
13 DESCRIPTION 13 DESCRIPTION
14 14
15 Search a fasta file for matches to a regex and return a bed file with the 15 Search a fasta file for matches to a regex and return a bed file with the
16 coordinates of the match and the matched sequence itself. 16 coordinates of the match and the matched sequence itself.
17 17
18 Output bed file has columns: 18 Output bed file has columns:
19 1. Name of fasta sequence (e.g. chromosome) 19 1. Name of fasta sequence (e.g. chromosome)
20 2. Start of the match 20 2. Start of the match
21 3. End of the match 21 3. End of the match
22 4. ID of the match 22 4. ID of the match
23 5. Length of the match 23 5. Length of the match
24 6. Strand 24 6. Strand
25 7. Matched sequence as it appears on the forward strand 25 7. Matched sequence as it appears on the forward strand
26 26
27 For matches on the reverse strand it is reported the start and end position on the 27 For matches on the reverse strand it is reported the start and end position on the
28 forward strand and the matched string on the forward strand (so the G4 'GGGAGGGT' 28 forward strand and the matched string on the forward strand (so the G4 'GGGAGGGT'
29 present on the reverse strand is reported as ACCCTCCC). 29 present on the reverse strand is reported as ACCCTCCC).
30 30
31 Note: Fasta sequences (chroms) are read in memory one at a time along with the 31 Note: Fasta sequences (chroms) are read in memory one at a time along with the
32 matches for that chromosome. 32 matches for that chromosome.
33 The order of the output is: chroms as they are found in the inut fasta, matches 33 The order of the output is: chroms as they are found in the inut fasta, matches
34 sorted within chroms by positions. 34 sorted within chroms by positions.
35 35
36 EXAMPLE: 36 EXAMPLE:
37 ## Test data: 37 ## Test data:
38 echo '>mychr' > /tmp/mychr.fa 38 echo '>mychr' > /tmp/mychr.fa
39 echo 'ACTGnACTGnACTGnTGAC' >> /tmp/mychr.fa 39 echo 'ACTGnACTGnACTGnTGAC' >> /tmp/mychr.fa
40 40
41 fastaRegexFinder.py -f /tmp/mychr.fa -r 'ACTG' 41 fastaRegexFinder.py -f /tmp/mychr.fa -r 'ACTG'
42 mychr 0 4 mychr_0_4_for 4 + ACTG 42 mychr 0 4 mychr_0_4_for 4 + ACTG
43 mychr 5 9 mychr_5_9_for 4 + ACTG 43 mychr 5 9 mychr_5_9_for 4 + ACTG
44 mychr 10 14 mychr_10_14_for 4 + ACTG 44 mychr 10 14 mychr_10_14_for 4 + ACTG
45 45
46 fastaRegexFinder.py -f /tmp/mychr.fa -r 'ACTG' --maxstr 3 46 fastaRegexFinder.py -f /tmp/mychr.fa -r 'ACTG' --maxstr 3
47 mychr 0 4 mychr_0_4_for 4 + ACT[3,4] 47 mychr 0 4 mychr_0_4_for 4 + ACT[3,4]
48 mychr 5 9 mychr_5_9_for 4 + ACT[3,4] 48 mychr 5 9 mychr_5_9_for 4 + ACT[3,4]
49 mychr 10 14 mychr_10_14_for 4 + ACT[3,4] 49 mychr 10 14 mychr_10_14_for 4 + ACT[3,4]
50 50
51 less /tmp/mychr.fa | fastaRegexFinder.py -f - -r 'A\w\wGn' 51 less /tmp/mychr.fa | fastaRegexFinder.py -f - -r 'A\\w\\wGn'
52 mychr 0 5 mychr_0_5_for 5 + ACTGn 52 mychr 0 5 mychr_0_5_for 5 + ACTGn
53 mychr 5 10 mychr_5_10_for 5 + ACTGn 53 mychr 5 10 mychr_5_10_for 5 + ACTGn
54 mychr 10 15 mychr_10_15_for 5 + ACTGn 54 mychr 10 15 mychr_10_15_for 5 + ACTGn
55 55
56 DOWNLOAD 56 DOWNLOAD
57 fastaRegexFinder.py is hosted at https://github.com/dariober/bioinformatics-cafe/tree/master/fastaRegexFinder 57 fastaRegexFinder.py is hosted at https://github.com/dariober/bioinformatics-cafe/tree/master/fastaRegexFinder
58 58
59 """, formatter_class= argparse.RawTextHelpFormatter) 59 """,
60 60 formatter_class=argparse.RawTextHelpFormatter,
61 parser.add_argument('--fasta', '-f', 61 )
62 type= str, 62
63 help='''Input fasta file to search. Use '-' to read the file from stdin. 63 parser.add_argument(
64 64 "--fasta",
65 ''', 65 "-f",
66 required= True) 66 type=str,
67 67 help="Input fasta file to search. Use '-' to read the file from stdin.",
68 parser.add_argument('--regex', '-r', 68 required=True,
69 type= str, 69 )
70 help='''Regex to be searched in the fasta input. 70
71 parser.add_argument(
72 "--regex",
73 "-r",
74 type=str,
75 help="""Regex to be searched in the fasta input.
71 Matches to the reverse complement will have - strand. 76 Matches to the reverse complement will have - strand.
72 The default regex is '([gG]{3,}\w{1,7}){3,}[gG]{3,}' which searches 77 The default regex is '([gG]{3,}\\w{1,7}){3,}[gG]{3,}' which searches
73 for G-quadruplexes. 78 for G-quadruplexes.""",
74 ''', 79 default="([gG]{3,}\\w{1,7}){3,}[gG]{3,}",
75 default= '([gG]{3,}\w{1,7}){3,}[gG]{3,}') 80 )
76 81
77 parser.add_argument('--matchcase', '-m', 82 parser.add_argument(
78 action= 'store_true', 83 "--matchcase",
79 help='''Match case while searching for matches. Default is 84 "-m",
85 action="store_true",
86 help="""Match case while searching for matches. Default is
80 to ignore case (I.e. 'ACTG' will match 'actg'). 87 to ignore case (I.e. 'ACTG' will match 'actg').
81 ''') 88 """,
82 89 )
83 parser.add_argument('--noreverse', 90
84 action= 'store_true', 91 parser.add_argument(
85 help='''Do not search the reverse complement of the input fasta. 92 "--noreverse",
86 Use this flag to search protein sequences. 93 action="store_true",
87 ''') 94 help="""Do not search the reverse complement of the input fasta.
88 95 Use this flag to search protein sequences.""",
89 parser.add_argument('--maxstr', 96 )
90 type= int, 97
91 required= False, 98 parser.add_argument(
92 default= 10000, 99 "--maxstr",
93 help='''Maximum length of the match to report in the 7th column of the output. 100 type=int,
101 required=False,
102 default=10000,
103 help="""Maximum length of the match to report in the 7th column of the output.
94 Default is to report up to 10000nt. 104 Default is to report up to 10000nt.
95 Truncated matches are reported as <ACTG...ACTG>[<maxstr>,<tot length>] 105 Truncated matches are reported as <ACTG...ACTG>[<maxstr>,<tot length>]
96 ''') 106 """,
97 107 )
98 parser.add_argument('--seqnames', '-s', 108
99 type= str, 109 parser.add_argument(
100 nargs= '+', 110 "--seqnames",
101 default= [None], 111 "-s",
102 required= False, 112 type=str,
103 help='''List of fasta sequences in --fasta to 113 nargs="+",
114 default=[None],
115 required=False,
116 help="""List of fasta sequences in --fasta to
104 search. E.g. use --seqnames chr1 chr2 chrM to search only these crhomosomes. 117 search. E.g. use --seqnames chr1 chr2 chrM to search only these crhomosomes.
105 Default is to search all the sequences in input. 118 Default is to search all the sequences in input.
106 ''') 119 """,
107 parser.add_argument('--quiet', '-q', 120 )
108 action= 'store_true', 121 parser.add_argument(
109 help='''Do not print progress report (i.e. sequence names as they are scanned). 122 "--quiet",
110 ''') 123 "-q",
111 124 action="store_true",
112 125 help="""Do not print progress report (i.e. sequence names as they are scanned).
113 126 """,
114 parser.add_argument('--version', '-v', action='version', version='%(prog)s ' + VERSION) 127 )
128
129
130 parser.add_argument("--version", "-v", action="version", version="%(prog)s " + VERSION)
115 131
116 132
117 args = parser.parse_args() 133 args = parser.parse_args()
118 134
119 " --------------------------[ Check and parse arguments ]---------------------- " 135 " --------------------------[ Check and parse arguments ]---------------------- "
120 136
121 if args.matchcase: 137 if args.matchcase:
122 flag= 0 138 flag = 0
123 else: 139 else:
124 flag= re.IGNORECASE 140 flag = re.IGNORECASE
125 141
126 " ------------------------------[ Functions ]--------------------------------- " 142 " ------------------------------[ Functions ]--------------------------------- "
127 143
144
128 def sort_table(table, cols): 145 def sort_table(table, cols):
129 """ Code to sort list of lists 146 """Code to sort list of lists
130 see http://www.saltycrane.com/blog/2007/12/how-to-sort-table-by-columns-in-python/ 147 see http://www.saltycrane.com/blog/2007/12/how-to-sort-table-by-columns-in-python/
131 148
132 sort a table by multiple columns 149 sort a table by multiple columns
133 table: a list of lists (or tuple of tuples) where each inner list 150 table: a list of lists (or tuple of tuples) where each inner list
134 represents a row 151 represents a row
135 cols: a list (or tuple) specifying the column numbers to sort by 152 cols: a list (or tuple) specifying the column numbers to sort by
136 e.g. (1,0) would sort by column 1, then by column 0 153 e.g. (1,0) would sort by column 1, then by column 0
137 """ 154 """
138 for col in reversed(cols): 155 for col in reversed(cols):
139 table = sorted(table, key=operator.itemgetter(col)) 156 table = sorted(table, key=operator.itemgetter(col))
140 return(table) 157 return table
158
141 159
142 def trimMatch(x, n): 160 def trimMatch(x, n):
143 """ Trim the string x to be at most length n. Trimmed matches will be reported 161 """Trim the string x to be at most length n. Trimmed matches will be reported
144 with the syntax ACTG[a,b] where Ns are the beginning of x, a is the length of 162 with the syntax ACTG[a,b] where Ns are the beginning of x, a is the length of
145 the trimmed strng (e.g 4 here) and b is the full length of the match 163 the trimmed strng (e.g 4 here) and b is the full length of the match
146 EXAMPLE: 164 EXAMPLE:
147 trimMatch('ACTGNNNN', 4) 165 trimMatch('ACTGNNNN', 4)
148 >>>'ACTG[4,8]' 166 >>>'ACTG[4,8]'
149 trimMatch('ACTGNNNN', 8) 167 trimMatch('ACTGNNNN', 8)
150 >>>'ACTGNNNN' 168 >>>'ACTGNNNN'
151 """ 169 """
152 if len(x) > n and n is not None: 170 if len(x) > n and n is not None:
153 m= x[0:n] + '[' + str(n) + ',' + str(len(x)) + ']' 171 m = x[0:n] + "[" + str(n) + "," + str(len(x)) + "]"
154 else: 172 else:
155 m= x 173 m = x
156 return(m) 174 return m
175
157 176
158 def revcomp(x): 177 def revcomp(x):
159 """Reverse complement string x. Ambiguity codes are handled and case conserved. 178 """Reverse complement string x. Ambiguity codes are handled and case conserved.
160 179
161 Test 180 Test
162 x= 'ACGTRYSWKMBDHVNacgtryswkmbdhvn' 181 x= 'ACGTRYSWKMBDHVNacgtryswkmbdhvn'
163 revcomp(x) 182 revcomp(x)
164 """ 183 """
165 compdict= {'A':'T', 184 compdict = {
166 'C':'G', 185 "A": "T",
167 'G':'C', 186 "C": "G",
168 'T':'A', 187 "G": "C",
169 'R':'Y', 188 "T": "A",
170 'Y':'R', 189 "R": "Y",
171 'S':'W', 190 "Y": "R",
172 'W':'S', 191 "S": "W",
173 'K':'M', 192 "W": "S",
174 'M':'K', 193 "K": "M",
175 'B':'V', 194 "M": "K",
176 'D':'H', 195 "B": "V",
177 'H':'D', 196 "D": "H",
178 'V':'B', 197 "H": "D",
179 'N':'N', 198 "V": "B",
180 'a':'t', 199 "N": "N",
181 'c':'g', 200 "a": "t",
182 'g':'c', 201 "c": "g",
183 't':'a', 202 "g": "c",
184 'r':'y', 203 "t": "a",
185 'y':'r', 204 "r": "y",
186 's':'w', 205 "y": "r",
187 'w':'s', 206 "s": "w",
188 'k':'m', 207 "w": "s",
189 'm':'k', 208 "k": "m",
190 'b':'v', 209 "m": "k",
191 'd':'h', 210 "b": "v",
192 'h':'d', 211 "d": "h",
193 'v':'b', 212 "h": "d",
194 'n':'n'} 213 "v": "b",
195 xrc= [] 214 "n": "n",
215 }
216 xrc = []
196 for n in x: 217 for n in x:
197 xrc.append(compdict[n]) 218 xrc.append(compdict[n])
198 xrc= ''.join(xrc)[::-1] 219 xrc = "".join(xrc)[::-1]
199 return(xrc) 220 return xrc
221
222
200 # ----------------------------------------------------------------------------- 223 # -----------------------------------------------------------------------------
201 224
202 psq_re_f= re.compile(args.regex, flags= flag) 225 psq_re_f = re.compile(args.regex, flags=flag)
203 ## psq_re_r= re.compile(regexrev) 226 # psq_re_r= re.compile(regexrev)
204 227
205 if args.fasta != '-': 228 if args.fasta != "-":
206 ref_seq_fh= open(args.fasta) 229 ref_seq_fh = open(args.fasta)
207 else: 230 else:
208 ref_seq_fh= sys.stdin 231 ref_seq_fh = sys.stdin
209 232
210 ref_seq=[] 233 ref_seq = []
211 line= (ref_seq_fh.readline()).strip() 234 line = (ref_seq_fh.readline()).strip()
212 chr= re.sub('^>', '', line) 235 chr = re.sub("^>", "", line)
213 line= (ref_seq_fh.readline()).strip() 236 line = (ref_seq_fh.readline()).strip()
214 gquad_list= [] 237 gquad_list = []
215 while True: 238 while True:
216 if not args.quiet: 239 if not args.quiet:
217 sys.stderr.write('Processing %s\n' %(chr)) 240 sys.stderr.write("Processing %s\n" % (chr))
218 while line.startswith('>') is False: 241 while line.startswith(">") is False:
219 ref_seq.append(line) 242 ref_seq.append(line)
220 line= (ref_seq_fh.readline()).strip() 243 line = (ref_seq_fh.readline()).strip()
221 if line == '': 244 if line == "":
222 break 245 break
223 ref_seq= ''.join(ref_seq) 246 ref_seq = "".join(ref_seq)
224 if args.seqnames == [None] or chr in args.seqnames: 247 if args.seqnames == [None] or chr in args.seqnames:
225 for m in re.finditer(psq_re_f, ref_seq): 248 for m in re.finditer(psq_re_f, ref_seq):
226 matchstr= trimMatch(m.group(0), args.maxstr) 249 matchstr = trimMatch(m.group(0), args.maxstr)
227 quad_id= str(chr) + '_' + str(m.start()) + '_' + str(m.end()) + '_for' 250 quad_id = str(chr) + "_" + str(m.start()) + "_" + str(m.end()) + "_for"
228 gquad_list.append([chr, m.start(), m.end(), quad_id, len(m.group(0)), '+', matchstr]) 251 gquad_list.append(
252 [chr, m.start(), m.end(), quad_id, len(m.group(0)), "+", matchstr]
253 )
229 if args.noreverse is False: 254 if args.noreverse is False:
230 ref_seq= revcomp(ref_seq) 255 ref_seq = revcomp(ref_seq)
231 seqlen= len(ref_seq) 256 seqlen = len(ref_seq)
232 for m in re.finditer(psq_re_f, ref_seq): 257 for m in re.finditer(psq_re_f, ref_seq):
233 matchstr= trimMatch(revcomp(m.group(0)), args.maxstr) 258 matchstr = trimMatch(revcomp(m.group(0)), args.maxstr)
234 mstart= seqlen - m.end() 259 mstart = seqlen - m.end()
235 mend= seqlen - m.start() 260 mend = seqlen - m.start()
236 quad_id= str(chr) + '_' + str(mstart) + '_' + str(mend) + '_rev' 261 quad_id = str(chr) + "_" + str(mstart) + "_" + str(mend) + "_rev"
237 gquad_list.append([chr, mstart, mend, quad_id, len(m.group(0)), '-', matchstr]) 262 gquad_list.append(
238 gquad_sorted= sort_table(gquad_list, (1,2,3)) 263 [chr, mstart, mend, quad_id, len(m.group(0)), "-", matchstr]
239 gquad_list= [] 264 )
265 gquad_sorted = sort_table(gquad_list, (1, 2, 3))
266 gquad_list = []
240 for xline in gquad_sorted: 267 for xline in gquad_sorted:
241 xline= '\t'.join([str(x) for x in xline]) 268 xline = "\t".join([str(x) for x in xline])
242 print(xline) 269 print(xline)
243 chr= re.sub('^>', '', line) 270 chr = re.sub("^>", "", line)
244 ref_seq= [] 271 ref_seq = []
245 line= (ref_seq_fh.readline()).strip() 272 line = (ref_seq_fh.readline()).strip()
246 if line == '': 273 if line == "":
247 break 274 break
248 275
249 #gquad_sorted= sort_table(gquad_list, (0,1,2,3)) 276 # gquad_sorted= sort_table(gquad_list, (0,1,2,3))
250 # 277 #
251 #for line in gquad_sorted: 278 # for line in gquad_sorted:
252 # line= '\t'.join([str(x) for x in line]) 279 # line= '\t'.join([str(x) for x in line])
253 # print(line) 280 # print(line)
254 sys.exit() 281 sys.exit()