comparison design_primers.py @ 0:21053f7f9ed1 draft

First upload of PCR Marker tools
author john-mccallum
date Thu, 14 Jun 2012 19:29:26 -0400
parents
children b321e0517be3
comparison
equal deleted inserted replaced
-1:000000000000 0:21053f7f9ed1
1 #!/usr/local/bin/python2.6
2 ##design primers to features in multiple sequences
3 ##usage: python design_primers.py <fasta-file> <gff file> <file of target IDs> <prod_min_size> <prod_max_size>
4
5
6 #Copyright 2012 John McCallum & Leshi Chen
7 #New Zealand Institute for Plant and Food Research
8 #This program is free software: you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation, either version 3 of the License, or
11 # (at your option) any later version.
12 #
13 # This program is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
17 #
18 # You should have received a copy of the GNU General Public License
19 # along with this program. If not, see <http://www.gnu.org/licenses/>.
20
21
22 import os
23 import StringIO
24 import re
25 import tempfile
26 import subprocess
27 import copy
28 import sys
29 from BCBio import GFF
30 from BCBio.GFF import GFFExaminer
31 from Bio import SeqIO
32 from Bio.Emboss.Applications import Primer3Commandline
33 from Bio.Emboss import Primer3
34
35
36 in_file = sys.argv[1]
37 gff_file = sys.argv[2]
38 target_file = sys.argv[3]
39 prod_min_size = int(sys.argv[4])
40 prod_max_size = int(sys.argv[5])
41
42 max_tm_diff=1 ##
43 opt_GC_percent=50 ##
44 maxpolyx=4 ##
45 optimum_length=20
46 ##target is specified in start, end format
47 productsizerange = str(prod_min_size) + "," + str(prod_max_size)
48 #open input files
49 in_seq_handle = open(in_file)
50 in_gff_handle = open(gff_file)
51 in_target_handle=open(target_file)
52 #read target feature IDs into list
53 targets=in_target_handle.readlines()
54 in_target_handle.close()
55 ##and create a hit list of sequences from this
56 target_seq_id_list = list(set([line.split(":")[0] for line in targets]))
57 ##create iterator returning sequence records
58 for myrec in SeqIO.parse(in_seq_handle, "fasta"):
59 #check if this sequence is included in the target list
60 if myrec.id in target_seq_id_list:
61 ##create sequence dictionary so we can add in gff annotations
62 seq_dict = {myrec.id : myrec}
63 ##just limit to gff annotations for this sequence
64 limit_info = dict(gff_id = [ myrec.id ])
65 ##rewind gff filehandle
66 in_gff_handle.seek(0)
67 ##read annotations into sequence dictionary for this sequence record only
68 annotations = [r for r in GFF.parse(in_gff_handle, base_dict=seq_dict,limit_info=limit_info)]
69 ##if there are any annotations, then proceed.
70 if annotations:
71 rec=annotations[0]
72 ##iterate over list of target IDs
73 for t in targets:
74 target_ID = t.strip('\n')
75 target_annotations = [f for f in rec.features if f.id == target_ID ]
76 if target_annotations:
77 mytarget = target_annotations[0]
78 #create temporary files
79 tempfastaFile = tempfile.mktemp()
80 tempproutfile = tempfile.mktemp()
81 #just consider slice of sequence in a window of +/- prod_max_size bp
82 ##from feature UNLESS feature is close to end
83 ##Note that slice is zero-based
84 featLocation = mytarget.location.start.position
85 if featLocation > prod_max_size:
86 slice_start = featLocation - prod_max_size
87 featPosition = prod_max_size
88 else:
89 slice_start = 0
90 featPosition = featLocation
91 if (len(rec) - featLocation) < prod_max_size:
92 slice_end = len(rec)
93 else:
94 slice_end = featLocation + prod_max_size
95 ###grab slice of sequence fom this window.
96 targetRec = rec[slice_start:slice_end]
97 matching_feature = [f for f in targetRec.features if f.id == mytarget.id]
98 if matching_feature:
99 target_feat = matching_feature[0]
100 if target_feat.location.start.position == 0:
101 target_feat.location.start.position = 1
102 #we get the mask features by removing the target...all features are masked as just using snp and indels
103 ##a smarter filter could be added
104 ##note use of list copy to avoid possible side-effects
105 exclude_feat = list(targetRec.features)
106 exclude_feat.remove(target_feat)
107 ##print'targetRec.features', targetRec.features ##for debug
108 mask_str=map(lambda f: str(f.location.start.position+1) + "," + str(f.location.end.position + 1) ,exclude_feat)
109 #mask_str=map(lambda f: str(f.location).strip('[]'),exclude_feat)
110 p3_exclude_str = str(mask_str).replace('\', \'',':')
111 p3_target = str(target_feat.location.start.position +1) + "," + str(target_feat.location.end.position +1)
112 #write sequence record into template file as fasta
113 t_output_handle = open(tempfastaFile, "w")
114 SeqIO.write([targetRec], t_output_handle, "fasta")
115 t_output_handle.close()
116 #create Primer3Commandline() for emboss tool
117 primer_cl = Primer3Commandline()
118 #set the emboss tool to suppress output as this will make Galaxy think it is error message although it is a message to state run success
119 primer_cl.set_parameter("-auto",'1')
120 #pass sequence file to emboss
121 primer_cl.set_parameter("-sequence",tempfastaFile)
122 #add target location
123 primer_cl.set_parameter("-target", p3_target)
124 ##mask off other features...dumb masking of everything at present, beware
125 if (p3_exclude_str != ""):
126 primer_cl.set_parameter("-excludedregion", p3_exclude_str)
127 #add temporary output file to get the result
128 primer_cl.set_parameter("-outfile", tempproutfile)
129 #specify maximum different of tm
130 primer_cl.set_parameter("-maxdifftm",max_tm_diff )
131 #other useful parameters
132 primer_cl.set_parameter("-ogcpercent", opt_GC_percent)
133 primer_cl.set_parameter("-opolyxmax", maxpolyx)
134 primer_cl.set_parameter("-osize", optimum_length)
135 #set product size range
136 primer_cl.set_parameter("-prange", productsizerange)
137 #using python subprocess method to run emboss command line programe with the parameters given
138 fnull = open(os.devnull, 'w')
139 result=subprocess.check_call(str(primer_cl),shell=True ,stdout = fnull, stderr = fnull)
140 #read temporary outputfile
141 handle = open(tempproutfile)
142 record = Primer3.read(handle)
143 ##just return first set, if there is one
144 if len(record.primers) > 0:
145 primer= record.primers[0]
146 outputstr=[mytarget.id, primer.forward_seq,primer.reverse_seq,primer.size]
147 else:
148 outputstr=[mytarget.id,"NONE","NONE","NONE"]
149 print('\t'.join(map(str,outputstr)))
150
151
152 in_gff_handle.close()
153 in_seq_handle.close()