comparison fastaregexfinder.py @ 0:269c627ae9f4 draft

planemo upload for repository https://github.com/bernt-matthias/mb-galaxy-tools/tree/master/tools/fasta_regex_finder commit 8e118a4d24047e2c62912b962e854f789d6ff559
author mbernt
date Wed, 20 Jun 2018 11:06:57 -0400
parents
children 9a811adb714f
comparison
equal deleted inserted replaced
-1:000000000000 0:269c627ae9f4
1 #!/usr/bin/env python
2
3 import re
4 import sys
5 import string
6 import argparse
7 import operator
8
9 VERSION='0.1.1'
10
11 parser = argparse.ArgumentParser(description="""
12
13 DESCRIPTION
14
15 Search a fasta file for matches to a regex and return a bed file with the
16 coordinates of the match and the matched sequence itself.
17
18 Output bed file has columns:
19 1. Name of fasta sequence (e.g. chromosome)
20 2. Start of the match
21 3. End of the match
22 4. ID of the match
23 5. Length of the match
24 6. Strand
25 7. Matched sequence as it appears on the forward strand
26
27 For matches on the reverse strand it is reported the start and end position on the
28 forward strand and the matched string on the forward strand (so the G4 'GGGAGGGT'
29 present on the reverse strand is reported as ACCCTCCC).
30
31 Note: Fasta sequences (chroms) are read in memory one at a time along with the
32 matches for that chromosome.
33 The order of the output is: chroms as they are found in the inut fasta, matches
34 sorted within chroms by positions.
35
36 EXAMPLE:
37 ## Test data:
38 echo '>mychr' > /tmp/mychr.fa
39 echo 'ACTGnACTGnACTGnTGAC' >> /tmp/mychr.fa
40
41 fastaRegexFinder.py -f /tmp/mychr.fa -r 'ACTG'
42 mychr 0 4 mychr_0_4_for 4 + ACTG
43 mychr 5 9 mychr_5_9_for 4 + ACTG
44 mychr 10 14 mychr_10_14_for 4 + ACTG
45
46 fastaRegexFinder.py -f /tmp/mychr.fa -r 'ACTG' --maxstr 3
47 mychr 0 4 mychr_0_4_for 4 + ACT[3,4]
48 mychr 5 9 mychr_5_9_for 4 + ACT[3,4]
49 mychr 10 14 mychr_10_14_for 4 + ACT[3,4]
50
51 less /tmp/mychr.fa | fastaRegexFinder.py -f - -r 'A\w\wGn'
52 mychr 0 5 mychr_0_5_for 5 + ACTGn
53 mychr 5 10 mychr_5_10_for 5 + ACTGn
54 mychr 10 15 mychr_10_15_for 5 + ACTGn
55
56 DOWNLOAD
57 fastaRegexFinder.py is hosted at https://github.com/dariober/bioinformatics-cafe/tree/master/fastaRegexFinder
58
59 """, formatter_class= argparse.RawTextHelpFormatter)
60
61 parser.add_argument('--fasta', '-f',
62 type= str,
63 help='''Input fasta file to search. Use '-' to read the file from stdin.
64
65 ''',
66 required= True)
67
68 parser.add_argument('--regex', '-r',
69 type= str,
70 help='''Regex to be searched in the fasta input.
71 Matches to the reverse complement will have - strand.
72 The default regex is '([gG]{3,}\w{1,7}){3,}[gG]{3,}' which searches
73 for G-quadruplexes.
74 ''',
75 default= '([gG]{3,}\w{1,7}){3,}[gG]{3,}')
76
77 parser.add_argument('--matchcase', '-m',
78 action= 'store_true',
79 help='''Match case while searching for matches. Default is
80 to ignore case (I.e. 'ACTG' will match 'actg').
81 ''')
82
83 parser.add_argument('--noreverse',
84 action= 'store_true',
85 help='''Do not search the reverse complement of the input fasta.
86 Use this flag to search protein sequences.
87 ''')
88
89 parser.add_argument('--maxstr',
90 type= int,
91 required= False,
92 default= 10000,
93 help='''Maximum length of the match to report in the 7th column of the output.
94 Default is to report up to 10000nt.
95 Truncated matches are reported as <ACTG...ACTG>[<maxstr>,<tot length>]
96 ''')
97
98 parser.add_argument('--seqnames', '-s',
99 type= str,
100 nargs= '+',
101 default= [None],
102 required= False,
103 help='''List of fasta sequences in --fasta to
104 search. E.g. use --seqnames chr1 chr2 chrM to search only these crhomosomes.
105 Default is to search all the sequences in input.
106 ''')
107 parser.add_argument('--quiet', '-q',
108 action= 'store_true',
109 help='''Do not print progress report (i.e. sequence names as they are scanned).
110 ''')
111
112
113
114 parser.add_argument('--version', '-v', action='version', version='%(prog)s ' + VERSION)
115
116
117 args = parser.parse_args()
118
119 " --------------------------[ Check and parse arguments ]---------------------- "
120
121 if args.matchcase:
122 flag= 0
123 else:
124 flag= re.IGNORECASE
125
126 " ------------------------------[ Functions ]--------------------------------- "
127
128 def sort_table(table, cols):
129 """ Code to sort list of lists
130 see http://www.saltycrane.com/blog/2007/12/how-to-sort-table-by-columns-in-python/
131
132 sort a table by multiple columns
133 table: a list of lists (or tuple of tuples) where each inner list
134 represents a row
135 cols: a list (or tuple) specifying the column numbers to sort by
136 e.g. (1,0) would sort by column 1, then by column 0
137 """
138 for col in reversed(cols):
139 table = sorted(table, key=operator.itemgetter(col))
140 return(table)
141
142 def trimMatch(x, n):
143 """ Trim the string x to be at most length n. Trimmed matches will be reported
144 with the syntax ACTG[a,b] where Ns are the beginning of x, a is the length of
145 the trimmed strng (e.g 4 here) and b is the full length of the match
146 EXAMPLE:
147 trimMatch('ACTGNNNN', 4)
148 >>>'ACTG[4,8]'
149 trimMatch('ACTGNNNN', 8)
150 >>>'ACTGNNNN'
151 """
152 if len(x) > n and n is not None:
153 m= x[0:n] + '[' + str(n) + ',' + str(len(x)) + ']'
154 else:
155 m= x
156 return(m)
157
158 def revcomp(x):
159 """Reverse complement string x. Ambiguity codes are handled and case conserved.
160
161 Test
162 x= 'ACGTRYSWKMBDHVNacgtryswkmbdhvn'
163 revcomp(x)
164 """
165 compdict= {'A':'T',
166 'C':'G',
167 'G':'C',
168 'T':'A',
169 'R':'Y',
170 'Y':'R',
171 'S':'W',
172 'W':'S',
173 'K':'M',
174 'M':'K',
175 'B':'V',
176 'D':'H',
177 'H':'D',
178 'V':'B',
179 'N':'N',
180 'a':'t',
181 'c':'g',
182 'g':'c',
183 't':'a',
184 'r':'y',
185 'y':'r',
186 's':'w',
187 'w':'s',
188 'k':'m',
189 'm':'k',
190 'b':'v',
191 'd':'h',
192 'h':'d',
193 'v':'b',
194 'n':'n'}
195 xrc= []
196 for n in x:
197 xrc.append(compdict[n])
198 xrc= ''.join(xrc)[::-1]
199 return(xrc)
200 # -----------------------------------------------------------------------------
201
202 psq_re_f= re.compile(args.regex, flags= flag)
203 ## psq_re_r= re.compile(regexrev)
204
205 if args.fasta != '-':
206 ref_seq_fh= open(args.fasta)
207 else:
208 ref_seq_fh= sys.stdin
209
210 ref_seq=[]
211 line= (ref_seq_fh.readline()).strip()
212 chr= re.sub('^>', '', line)
213 line= (ref_seq_fh.readline()).strip()
214 gquad_list= []
215 while True:
216 if not args.quiet:
217 sys.stderr.write('Processing %s\n' %(chr))
218 while line.startswith('>') is False:
219 ref_seq.append(line)
220 line= (ref_seq_fh.readline()).strip()
221 if line == '':
222 break
223 ref_seq= ''.join(ref_seq)
224 if args.seqnames == [None] or chr in args.seqnames:
225 for m in re.finditer(psq_re_f, ref_seq):
226 matchstr= trimMatch(m.group(0), args.maxstr)
227 quad_id= str(chr) + '_' + str(m.start()) + '_' + str(m.end()) + '_for'
228 gquad_list.append([chr, m.start(), m.end(), quad_id, len(m.group(0)), '+', matchstr])
229 if args.noreverse is False:
230 ref_seq= revcomp(ref_seq)
231 seqlen= len(ref_seq)
232 for m in re.finditer(psq_re_f, ref_seq):
233 matchstr= trimMatch(revcomp(m.group(0)), args.maxstr)
234 mstart= seqlen - m.end()
235 mend= seqlen - m.start()
236 quad_id= str(chr) + '_' + str(mstart) + '_' + str(mend) + '_rev'
237 gquad_list.append([chr, mstart, mend, quad_id, len(m.group(0)), '-', matchstr])
238 gquad_sorted= sort_table(gquad_list, (1,2,3))
239 gquad_list= []
240 for xline in gquad_sorted:
241 xline= '\t'.join([str(x) for x in xline])
242 print(xline)
243 chr= re.sub('^>', '', line)
244 ref_seq= []
245 line= (ref_seq_fh.readline()).strip()
246 if line == '':
247 break
248
249 #gquad_sorted= sort_table(gquad_list, (0,1,2,3))
250 #
251 #for line in gquad_sorted:
252 # line= '\t'.join([str(x) for x in line])
253 # print(line)
254 sys.exit()