comparison find_CAPS.py @ 0:21053f7f9ed1 draft

First upload of PCR Marker tools
author john-mccallum
date Thu, 14 Jun 2012 19:29:26 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:21053f7f9ed1
1 #!/usr/bin/python2.6
2 ##find snps that condition CAPS
3 ##usage find_CAPS.py <reference file> <gff file>
4
5
6 #Copyright 2012 John McCallum & Leshi Chen
7 #New Zealand Institute for Plant and Food Research
8 #This program is free software: you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation, either version 3 of the License, or
11 # (at your option) any later version.
12 #
13 # This program is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
17 #
18 # You should have received a copy of the GNU General Public License
19 # along with this program. If not, see <http://www.gnu.org/licenses/>.
20
21
22
23 import sys
24
25 from Bio import SeqIO
26 from BCBio import GFF
27 from Bio.Restriction import *
28
29 ###This list is limited to economical enzymes performing well in PCR buffer
30 rest_batch = RestrictionBatch(
31 [AluI, ApaI, BamHI, BbrPI, BfrI, ClaI, DdeI, DpnII, DraI, EcoRI,
32 HaeIII, HindII, HinfI, HpaI, PvuII, RsaI, SacI, Sau3AI, SmaI, TaqI])
33
34 in_file=sys.argv[1]
35 gff_file=sys.argv[2]
36
37 in_seq_handle = open(in_file)
38 in_gff_handle=open(gff_file)
39
40 ##use iterator
41 for myrec in SeqIO.parse(in_seq_handle, "fasta"):
42 ##create single-entry dictionary to accept gff annotations from parser
43 seq_dict = {myrec.id:myrec}
44
45 ##note that this filters out only SNP features
46 limit_info = dict(gff_id = [myrec.id] ,gff_type = ['SNP'])
47 in_gff_handle.seek(0)
48
49 ##parse annotations into
50 annotations = [r for r
51 in GFF.parse(in_gff_handle,
52 base_dict=seq_dict,
53 limit_info=limit_info)]
54
55 ##if there are any for this sequence, proceed
56 if annotations:
57 rec=annotations[0]
58 for feat in rec.features:
59 fstart=feat.location.start.position
60 fend=feat.location.end.position
61
62 if 20 < fstart < len(rec) - 20:
63 #just work with +/- 20 bp, ignoring SNPS within this
64 #distance from ends
65 fseq=rec.seq[fstart-20:fstart+20]
66 ref_seq = rec.seq[fstart-20:fstart+20]
67 variant_seq = ref_seq.tomutable()
68
69 #mutate the variant
70 variant_seq[20]= feat.qualifiers['Variant_seq'][0]
71 variant_seq = variant_seq.toseq()
72
73 #digest the sequences
74 ref_cuts = rest_batch.search(ref_seq)
75 var_cuts = rest_batch.search(variant_seq)
76
77 #print
78 for enz in ref_cuts:
79 kr = set(ref_cuts[enz])
80 km = set(var_cuts[enz])
81 outputstr=[rec.id, fstart +1,fend+1,feat.id,enz]
82 if len(kr) > len(km):
83 outputstr.append("reference")
84 print('\t'.join(map(str,outputstr)))
85 elif len(kr) < len(km):
86 outputstr.append("variant")
87 print('\t'.join(map(str,outputstr)))
88
89 in_gff_handle.close()
90 in_seq_handle.close()
91
92
93