annotate find_CAPS.py @ 0:21053f7f9ed1 draft

First upload of PCR Marker tools
author john-mccallum
date Thu, 14 Jun 2012 19:29:26 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
1 #!/usr/bin/python2.6
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
2 ##find snps that condition CAPS
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
3 ##usage find_CAPS.py <reference file> <gff file>
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
4
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
5
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
6 #Copyright 2012 John McCallum & Leshi Chen
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
7 #New Zealand Institute for Plant and Food Research
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
8 #This program is free software: you can redistribute it and/or modify
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
9 # it under the terms of the GNU General Public License as published by
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
10 # the Free Software Foundation, either version 3 of the License, or
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
11 # (at your option) any later version.
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
12 #
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
13 # This program is distributed in the hope that it will be useful,
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
16 # GNU General Public License for more details.
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
17 #
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
18 # You should have received a copy of the GNU General Public License
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
19 # along with this program. If not, see <http://www.gnu.org/licenses/>.
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
20
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
21
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
22
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
23 import sys
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
24
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
25 from Bio import SeqIO
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
26 from BCBio import GFF
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
27 from Bio.Restriction import *
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
28
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
29 ###This list is limited to economical enzymes performing well in PCR buffer
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
30 rest_batch = RestrictionBatch(
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
31 [AluI, ApaI, BamHI, BbrPI, BfrI, ClaI, DdeI, DpnII, DraI, EcoRI,
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
32 HaeIII, HindII, HinfI, HpaI, PvuII, RsaI, SacI, Sau3AI, SmaI, TaqI])
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
33
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
34 in_file=sys.argv[1]
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
35 gff_file=sys.argv[2]
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
36
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
37 in_seq_handle = open(in_file)
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
38 in_gff_handle=open(gff_file)
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
39
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
40 ##use iterator
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
41 for myrec in SeqIO.parse(in_seq_handle, "fasta"):
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
42 ##create single-entry dictionary to accept gff annotations from parser
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
43 seq_dict = {myrec.id:myrec}
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
44
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
45 ##note that this filters out only SNP features
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
46 limit_info = dict(gff_id = [myrec.id] ,gff_type = ['SNP'])
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
47 in_gff_handle.seek(0)
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
48
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
49 ##parse annotations into
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
50 annotations = [r for r
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
51 in GFF.parse(in_gff_handle,
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
52 base_dict=seq_dict,
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
53 limit_info=limit_info)]
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
54
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
55 ##if there are any for this sequence, proceed
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
56 if annotations:
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
57 rec=annotations[0]
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
58 for feat in rec.features:
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
59 fstart=feat.location.start.position
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
60 fend=feat.location.end.position
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
61
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
62 if 20 < fstart < len(rec) - 20:
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
63 #just work with +/- 20 bp, ignoring SNPS within this
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
64 #distance from ends
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
65 fseq=rec.seq[fstart-20:fstart+20]
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
66 ref_seq = rec.seq[fstart-20:fstart+20]
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
67 variant_seq = ref_seq.tomutable()
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
68
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
69 #mutate the variant
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
70 variant_seq[20]= feat.qualifiers['Variant_seq'][0]
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
71 variant_seq = variant_seq.toseq()
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
72
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
73 #digest the sequences
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
74 ref_cuts = rest_batch.search(ref_seq)
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
75 var_cuts = rest_batch.search(variant_seq)
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
76
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
77 #print
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
78 for enz in ref_cuts:
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
79 kr = set(ref_cuts[enz])
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
80 km = set(var_cuts[enz])
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
81 outputstr=[rec.id, fstart +1,fend+1,feat.id,enz]
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
82 if len(kr) > len(km):
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
83 outputstr.append("reference")
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
84 print('\t'.join(map(str,outputstr)))
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
85 elif len(kr) < len(km):
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
86 outputstr.append("variant")
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
87 print('\t'.join(map(str,outputstr)))
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
88
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
89 in_gff_handle.close()
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
90 in_seq_handle.close()
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
91
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
92
21053f7f9ed1 First upload of PCR Marker tools
john-mccallum
parents:
diff changeset
93