0
|
1 #!/usr/bin/python2.6
|
|
2 ##find snps that condition CAPS
|
|
3 ##usage find_CAPS.py <reference file> <gff file>
|
|
4
|
|
5
|
|
6 #Copyright 2012 John McCallum & Leshi Chen
|
|
7 #New Zealand Institute for Plant and Food Research
|
|
8 #This program is free software: you can redistribute it and/or modify
|
|
9 # it under the terms of the GNU General Public License as published by
|
|
10 # the Free Software Foundation, either version 3 of the License, or
|
|
11 # (at your option) any later version.
|
|
12 #
|
|
13 # This program is distributed in the hope that it will be useful,
|
|
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
16 # GNU General Public License for more details.
|
|
17 #
|
|
18 # You should have received a copy of the GNU General Public License
|
|
19 # along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
20
|
|
21
|
|
22
|
|
23 import sys
|
|
24
|
|
25 from Bio import SeqIO
|
|
26 from BCBio import GFF
|
|
27 from Bio.Restriction import *
|
|
28
|
|
29 ###This list is limited to economical enzymes performing well in PCR buffer
|
|
30 rest_batch = RestrictionBatch(
|
|
31 [AluI, ApaI, BamHI, BbrPI, BfrI, ClaI, DdeI, DpnII, DraI, EcoRI,
|
|
32 HaeIII, HindII, HinfI, HpaI, PvuII, RsaI, SacI, Sau3AI, SmaI, TaqI])
|
|
33
|
|
34 in_file=sys.argv[1]
|
|
35 gff_file=sys.argv[2]
|
|
36
|
|
37 in_seq_handle = open(in_file)
|
|
38 in_gff_handle=open(gff_file)
|
|
39
|
|
40 ##use iterator
|
|
41 for myrec in SeqIO.parse(in_seq_handle, "fasta"):
|
|
42 ##create single-entry dictionary to accept gff annotations from parser
|
|
43 seq_dict = {myrec.id:myrec}
|
|
44
|
|
45 ##note that this filters out only SNP features
|
|
46 limit_info = dict(gff_id = [myrec.id] ,gff_type = ['SNP'])
|
|
47 in_gff_handle.seek(0)
|
|
48
|
|
49 ##parse annotations into
|
|
50 annotations = [r for r
|
|
51 in GFF.parse(in_gff_handle,
|
|
52 base_dict=seq_dict,
|
|
53 limit_info=limit_info)]
|
|
54
|
|
55 ##if there are any for this sequence, proceed
|
|
56 if annotations:
|
|
57 rec=annotations[0]
|
|
58 for feat in rec.features:
|
|
59 fstart=feat.location.start.position
|
|
60 fend=feat.location.end.position
|
|
61
|
|
62 if 20 < fstart < len(rec) - 20:
|
|
63 #just work with +/- 20 bp, ignoring SNPS within this
|
|
64 #distance from ends
|
|
65 fseq=rec.seq[fstart-20:fstart+20]
|
|
66 ref_seq = rec.seq[fstart-20:fstart+20]
|
|
67 variant_seq = ref_seq.tomutable()
|
|
68
|
|
69 #mutate the variant
|
|
70 variant_seq[20]= feat.qualifiers['Variant_seq'][0]
|
|
71 variant_seq = variant_seq.toseq()
|
|
72
|
|
73 #digest the sequences
|
|
74 ref_cuts = rest_batch.search(ref_seq)
|
|
75 var_cuts = rest_batch.search(variant_seq)
|
|
76
|
|
77 #print
|
|
78 for enz in ref_cuts:
|
|
79 kr = set(ref_cuts[enz])
|
|
80 km = set(var_cuts[enz])
|
|
81 outputstr=[rec.id, fstart +1,fend+1,feat.id,enz]
|
|
82 if len(kr) > len(km):
|
|
83 outputstr.append("reference")
|
|
84 print('\t'.join(map(str,outputstr)))
|
|
85 elif len(kr) < len(km):
|
|
86 outputstr.append("variant")
|
|
87 print('\t'.join(map(str,outputstr)))
|
|
88
|
|
89 in_gff_handle.close()
|
|
90 in_seq_handle.close()
|
|
91
|
|
92
|
|
93
|