Mercurial > repos > john-mccallum > pcr_markers
diff find_CAPS.py @ 0:21053f7f9ed1 draft
First upload of PCR Marker tools
author | john-mccallum |
---|---|
date | Thu, 14 Jun 2012 19:29:26 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/find_CAPS.py Thu Jun 14 19:29:26 2012 -0400 @@ -0,0 +1,93 @@ +#!/usr/bin/python2.6 +##find snps that condition CAPS +##usage find_CAPS.py <reference file> <gff file> + + +#Copyright 2012 John McCallum & Leshi Chen +#New Zealand Institute for Plant and Food Research +#This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + + + +import sys + +from Bio import SeqIO +from BCBio import GFF +from Bio.Restriction import * + +###This list is limited to economical enzymes performing well in PCR buffer +rest_batch = RestrictionBatch( + [AluI, ApaI, BamHI, BbrPI, BfrI, ClaI, DdeI, DpnII, DraI, EcoRI, + HaeIII, HindII, HinfI, HpaI, PvuII, RsaI, SacI, Sau3AI, SmaI, TaqI]) + +in_file=sys.argv[1] +gff_file=sys.argv[2] + +in_seq_handle = open(in_file) +in_gff_handle=open(gff_file) + +##use iterator +for myrec in SeqIO.parse(in_seq_handle, "fasta"): + ##create single-entry dictionary to accept gff annotations from parser + seq_dict = {myrec.id:myrec} + + ##note that this filters out only SNP features + limit_info = dict(gff_id = [myrec.id] ,gff_type = ['SNP']) + in_gff_handle.seek(0) + + ##parse annotations into + annotations = [r for r + in GFF.parse(in_gff_handle, + base_dict=seq_dict, + limit_info=limit_info)] + + ##if there are any for this sequence, proceed + if annotations: + rec=annotations[0] + for feat in rec.features: + fstart=feat.location.start.position + fend=feat.location.end.position + + if 20 < fstart < len(rec) - 20: + #just work with +/- 20 bp, ignoring SNPS within this + #distance from ends + fseq=rec.seq[fstart-20:fstart+20] + ref_seq = rec.seq[fstart-20:fstart+20] + variant_seq = ref_seq.tomutable() + + #mutate the variant + variant_seq[20]= feat.qualifiers['Variant_seq'][0] + variant_seq = variant_seq.toseq() + + #digest the sequences + ref_cuts = rest_batch.search(ref_seq) + var_cuts = rest_batch.search(variant_seq) + + #print + for enz in ref_cuts: + kr = set(ref_cuts[enz]) + km = set(var_cuts[enz]) + outputstr=[rec.id, fstart +1,fend+1,feat.id,enz] + if len(kr) > len(km): + outputstr.append("reference") + print('\t'.join(map(str,outputstr))) + elif len(kr) < len(km): + outputstr.append("variant") + print('\t'.join(map(str,outputstr))) + +in_gff_handle.close() +in_seq_handle.close() + + +