diff design_primers.py @ 0:21053f7f9ed1 draft

First upload of PCR Marker tools
author john-mccallum
date Thu, 14 Jun 2012 19:29:26 -0400
parents
children b321e0517be3
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/design_primers.py	Thu Jun 14 19:29:26 2012 -0400
@@ -0,0 +1,153 @@
+#!/usr/local/bin/python2.6
+##design primers to features in multiple sequences
+##usage: python  design_primers.py <fasta-file> <gff file> <file of target IDs> <prod_min_size> <prod_max_size>
+
+
+#Copyright 2012 John McCallum & Leshi Chen
+#New Zealand Institute for Plant and Food Research
+#This program is free software: you can redistribute it and/or modify
+#     it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation, either version 3 of the License, or
+#    (at your option) any later version.
+#
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+#
+#    You should have received a copy of the GNU General Public License
+#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+import os
+import StringIO
+import re
+import tempfile
+import subprocess
+import copy
+import sys
+from BCBio import GFF
+from BCBio.GFF import GFFExaminer
+from Bio import SeqIO
+from Bio.Emboss.Applications import Primer3Commandline
+from Bio.Emboss import Primer3
+
+
+in_file = sys.argv[1]
+gff_file = sys.argv[2] 
+target_file =  sys.argv[3]
+prod_min_size = int(sys.argv[4])
+prod_max_size = int(sys.argv[5])
+
+max_tm_diff=1                                        ##
+opt_GC_percent=50                                    ##
+maxpolyx=4                                           ##
+optimum_length=20
+##target is specified in start, end format 
+productsizerange = str(prod_min_size) + "," + str(prod_max_size)
+#open input files
+in_seq_handle = open(in_file)
+in_gff_handle = open(gff_file)
+in_target_handle=open(target_file)
+#read  target feature IDs into list
+targets=in_target_handle.readlines()
+in_target_handle.close()
+##and create a hit list of sequences from this
+target_seq_id_list = list(set([line.split(":")[0] for line in targets]))
+##create iterator returning sequence records
+for myrec in SeqIO.parse(in_seq_handle, "fasta"):
+    #check if this sequence is included in the target list
+    if myrec.id in target_seq_id_list:
+        ##create sequence dictionary so we can add in gff annotations
+        seq_dict = {myrec.id : myrec}
+        ##just limit to gff annotations for this sequence
+        limit_info = dict(gff_id = [ myrec.id ])
+        ##rewind gff filehandle
+        in_gff_handle.seek(0)
+        ##read annotations into sequence dictionary for this sequence record only 
+        annotations = [r for r in GFF.parse(in_gff_handle, base_dict=seq_dict,limit_info=limit_info)]
+        ##if there are any annotations, then  proceed. 
+        if annotations:
+            rec=annotations[0]
+            ##iterate over list of target IDs
+            for t in targets:
+                target_ID = t.strip('\n')
+                target_annotations = [f for f in rec.features if f.id == target_ID ]
+                if target_annotations:
+                    mytarget = target_annotations[0]
+                    #create temporary files
+                    tempfastaFile = tempfile.mktemp()
+                    tempproutfile = tempfile.mktemp()
+                    #just consider slice of sequence in a window of +/- prod_max_size  bp
+                    ##from feature UNLESS feature is close to end
+                    ##Note that slice is zero-based
+                    featLocation = mytarget.location.start.position 
+                    if featLocation > prod_max_size:
+                        slice_start = featLocation - prod_max_size
+                        featPosition = prod_max_size  
+                    else:
+                        slice_start = 0
+                        featPosition = featLocation
+                    if (len(rec) - featLocation) < prod_max_size:
+                        slice_end = len(rec)
+                    else:
+                        slice_end = featLocation + prod_max_size
+                    ###grab slice of sequence fom this window.
+                    targetRec = rec[slice_start:slice_end]
+                    matching_feature = [f for f in targetRec.features if f.id == mytarget.id]
+                    if matching_feature:
+                        target_feat = matching_feature[0]
+                        if target_feat.location.start.position == 0:
+                            target_feat.location.start.position = 1
+                        #we get the mask features by removing the target...all features are masked as just using snp and indels
+                        ##a smarter filter could be added 
+                        ##note use of list copy to avoid possible side-effects
+                        exclude_feat = list(targetRec.features)
+                        exclude_feat.remove(target_feat)
+                        ##print'targetRec.features',  targetRec.features ##for debug
+                        mask_str=map(lambda f: str(f.location.start.position+1) + "," + str(f.location.end.position + 1) ,exclude_feat)
+                        #mask_str=map(lambda f: str(f.location).strip('[]'),exclude_feat)
+                        p3_exclude_str = str(mask_str).replace('\', \'',':')
+                        p3_target = str(target_feat.location.start.position +1)  + "," + str(target_feat.location.end.position +1)
+                        #write sequence record into template file as  fasta
+                        t_output_handle = open(tempfastaFile, "w")
+                        SeqIO.write([targetRec], t_output_handle, "fasta")
+                        t_output_handle.close()
+                        #create Primer3Commandline() for emboss tool
+                        primer_cl = Primer3Commandline()
+                        #set the emboss tool to suppress  output as this will make Galaxy  think it is error message although it is a message to state run success
+                        primer_cl.set_parameter("-auto",'1')
+                        #pass  sequence file to emboss
+                        primer_cl.set_parameter("-sequence",tempfastaFile)
+                        #add target location
+                        primer_cl.set_parameter("-target", p3_target)
+                        ##mask off other features...dumb masking of everything at present, beware
+                        if (p3_exclude_str != ""):
+                            primer_cl.set_parameter("-excludedregion", p3_exclude_str)
+                        #add temporary output file to get the result
+                        primer_cl.set_parameter("-outfile", tempproutfile)
+                        #specify maximum different of tm
+                        primer_cl.set_parameter("-maxdifftm",max_tm_diff )
+                        #other useful parameters
+                        primer_cl.set_parameter("-ogcpercent", opt_GC_percent)
+                        primer_cl.set_parameter("-opolyxmax", maxpolyx)  
+                        primer_cl.set_parameter("-osize", optimum_length)
+                        #set product size range
+                        primer_cl.set_parameter("-prange", productsizerange)
+                        #using python subprocess method to run emboss command line programe with the parameters given
+                        fnull = open(os.devnull, 'w')
+                        result=subprocess.check_call(str(primer_cl),shell=True ,stdout = fnull, stderr = fnull)
+                        #read temporary outputfile
+                        handle = open(tempproutfile)
+                        record = Primer3.read(handle)
+                        ##just return first set, if there is one
+                        if len(record.primers) > 0:
+                            primer= record.primers[0]
+                            outputstr=[mytarget.id, primer.forward_seq,primer.reverse_seq,primer.size]
+                        else:
+                            outputstr=[mytarget.id,"NONE","NONE","NONE"]
+                        print('\t'.join(map(str,outputstr)))
+
+                        
+in_gff_handle.close()
+in_seq_handle.close()