pcr_markers: design_primers.py comparison

comparison design_primers.py @ 0:21053f7f9ed1 draft

First upload of PCR Marker tools

author	john-mccallum
date	Thu, 14 Jun 2012 19:29:26 -0400
parents
children	b321e0517be3

comparison

equal deleted inserted replaced

--1:000000000000
+:21053f7f9ed1
+#!/usr/local/bin/python2.6
+##design primers to features in multiple sequences
+##usage: python  design_primers.py <fasta-file> <gff file> <file of target IDs> <prod_min_size> <prod_max_size>
+#Copyright 2012 John McCallum & Leshi Chen
+#New Zealand Institute for Plant and Food Research
+#This program is free software: you can redistribute it and/or modify
+#     it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation, either version 3 of the License, or
+#    (at your option) any later version.
+#
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+#
+#    You should have received a copy of the GNU General Public License
+#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+import os
+import StringIO
+import re
+import tempfile
+import subprocess
+import copy
+import sys
+from BCBio import GFF
+from BCBio.GFF import GFFExaminer
+from Bio import SeqIO
+from Bio.Emboss.Applications import Primer3Commandline
+from Bio.Emboss import Primer3
+in_file = sys.argv[1]
+gff_file = sys.argv[2]
+target_file =  sys.argv[3]
+prod_min_size = int(sys.argv[4])
+prod_max_size = int(sys.argv[5])
+max_tm_diff=1                                        ##
+opt_GC_percent=50                                    ##
+maxpolyx=4                                           ##
+optimum_length=20
+##target is specified in start, end format
+productsizerange = str(prod_min_size) + "," + str(prod_max_size)
+#open input files
+in_seq_handle = open(in_file)
+in_gff_handle = open(gff_file)
+in_target_handle=open(target_file)
+#read  target feature IDs into list
+targets=in_target_handle.readlines()
+in_target_handle.close()
+##and create a hit list of sequences from this
+target_seq_id_list = list(set([line.split(":")[0] for line in targets]))
+##create iterator returning sequence records
+for myrec in SeqIO.parse(in_seq_handle, "fasta"):
+#check if this sequence is included in the target list
+if myrec.id in target_seq_id_list:
+##create sequence dictionary so we can add in gff annotations
+seq_dict = {myrec.id : myrec}
+##just limit to gff annotations for this sequence
+limit_info = dict(gff_id = [ myrec.id ])
+##rewind gff filehandle
+in_gff_handle.seek(0)
+##read annotations into sequence dictionary for this sequence record only
+annotations = [r for r in GFF.parse(in_gff_handle, base_dict=seq_dict,limit_info=limit_info)]
+##if there are any annotations, then  proceed.
+if annotations:
+rec=annotations[0]
+##iterate over list of target IDs
+for t in targets:
+target_ID = t.strip('\n')
+target_annotations = [f for f in rec.features if f.id == target_ID ]
+if target_annotations:
+mytarget = target_annotations[0]
+#create temporary files
+tempfastaFile = tempfile.mktemp()
+tempproutfile = tempfile.mktemp()
+#just consider slice of sequence in a window of +/- prod_max_size  bp
+##from feature UNLESS feature is close to end
+##Note that slice is zero-based
+featLocation = mytarget.location.start.position
+if featLocation > prod_max_size:
+slice_start = featLocation - prod_max_size
+featPosition = prod_max_size
+else:
+slice_start = 0
+featPosition = featLocation
+if (len(rec) - featLocation) < prod_max_size:
+slice_end = len(rec)
+else:
+slice_end = featLocation + prod_max_size
+###grab slice of sequence fom this window.
+targetRec = rec[slice_start:slice_end]
+matching_feature = [f for f in targetRec.features if f.id == mytarget.id]
+if matching_feature:
+target_feat = matching_feature[0]
+if target_feat.location.start.position == 0:
+target_feat.location.start.position = 1
+#we get the mask features by removing the target...all features are masked as just using snp and indels
+##a smarter filter could be added
+##note use of list copy to avoid possible side-effects
+exclude_feat = list(targetRec.features)
+exclude_feat.remove(target_feat)
+##print'targetRec.features',  targetRec.features ##for debug
+mask_str=map(lambda f: str(f.location.start.position+1) + "," + str(f.location.end.position + 1) ,exclude_feat)
+#mask_str=map(lambda f: str(f.location).strip('[]'),exclude_feat)
+p3_exclude_str = str(mask_str).replace('\', \'',':')
+p3_target = str(target_feat.location.start.position +1)  + "," + str(target_feat.location.end.position +1)
+#write sequence record into template file as  fasta
+t_output_handle = open(tempfastaFile, "w")
+SeqIO.write([targetRec], t_output_handle, "fasta")
+t_output_handle.close()
+#create Primer3Commandline() for emboss tool
+primer_cl = Primer3Commandline()
+#set the emboss tool to suppress  output as this will make Galaxy  think it is error message although it is a message to state run success
+primer_cl.set_parameter("-auto",'1')
+#pass  sequence file to emboss
+primer_cl.set_parameter("-sequence",tempfastaFile)
+#add target location
+primer_cl.set_parameter("-target", p3_target)
+##mask off other features...dumb masking of everything at present, beware
+if (p3_exclude_str != ""):
+primer_cl.set_parameter("-excludedregion", p3_exclude_str)
+#add temporary output file to get the result
+primer_cl.set_parameter("-outfile", tempproutfile)
+#specify maximum different of tm
+primer_cl.set_parameter("-maxdifftm",max_tm_diff )
+#other useful parameters
+primer_cl.set_parameter("-ogcpercent", opt_GC_percent)
+primer_cl.set_parameter("-opolyxmax", maxpolyx)
+primer_cl.set_parameter("-osize", optimum_length)
+#set product size range
+primer_cl.set_parameter("-prange", productsizerange)
+#using python subprocess method to run emboss command line programe with the parameters given
+fnull = open(os.devnull, 'w')
+result=subprocess.check_call(str(primer_cl),shell=True ,stdout = fnull, stderr = fnull)
+#read temporary outputfile
+handle = open(tempproutfile)
+record = Primer3.read(handle)
+##just return first set, if there is one
+if len(record.primers) > 0:
+primer= record.primers[0]
+outputstr=[mytarget.id, primer.forward_seq,primer.reverse_seq,primer.size]
+else:
+outputstr=[mytarget.id,"NONE","NONE","NONE"]
+print('\t'.join(map(str,outputstr)))
+in_gff_handle.close()
+in_seq_handle.close()

Mercurial > repos > john-mccallum > pcr_markers

comparison design_primers.py @ 0:21053f7f9ed1 draft