0
|
1 #!/usr/local/bin/python2.6
|
|
2 ##design primers to features in multiple sequences
|
|
3 ##usage: python design_primers.py <fasta-file> <gff file> <file of target IDs> <prod_min_size> <prod_max_size>
|
|
4
|
|
5
|
|
6 #Copyright 2012 John McCallum & Leshi Chen
|
|
7 #New Zealand Institute for Plant and Food Research
|
|
8 #This program is free software: you can redistribute it and/or modify
|
|
9 # it under the terms of the GNU General Public License as published by
|
|
10 # the Free Software Foundation, either version 3 of the License, or
|
|
11 # (at your option) any later version.
|
|
12 #
|
|
13 # This program is distributed in the hope that it will be useful,
|
|
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
16 # GNU General Public License for more details.
|
|
17 #
|
|
18 # You should have received a copy of the GNU General Public License
|
|
19 # along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
20
|
|
21
|
|
22 import os
|
|
23 import StringIO
|
|
24 import re
|
|
25 import tempfile
|
|
26 import subprocess
|
|
27 import copy
|
|
28 import sys
|
|
29 from BCBio import GFF
|
|
30 from BCBio.GFF import GFFExaminer
|
|
31 from Bio import SeqIO
|
|
32 from Bio.Emboss.Applications import Primer3Commandline
|
|
33 from Bio.Emboss import Primer3
|
|
34
|
|
35
|
|
36 in_file = sys.argv[1]
|
|
37 gff_file = sys.argv[2]
|
|
38 target_file = sys.argv[3]
|
|
39 prod_min_size = int(sys.argv[4])
|
|
40 prod_max_size = int(sys.argv[5])
|
|
41
|
|
42 max_tm_diff=1 ##
|
|
43 opt_GC_percent=50 ##
|
|
44 maxpolyx=4 ##
|
|
45 optimum_length=20
|
|
46 ##target is specified in start, end format
|
|
47 productsizerange = str(prod_min_size) + "," + str(prod_max_size)
|
|
48 #open input files
|
|
49 in_seq_handle = open(in_file)
|
|
50 in_gff_handle = open(gff_file)
|
|
51 in_target_handle=open(target_file)
|
|
52 #read target feature IDs into list
|
|
53 targets=in_target_handle.readlines()
|
|
54 in_target_handle.close()
|
|
55 ##and create a hit list of sequences from this
|
|
56 target_seq_id_list = list(set([line.split(":")[0] for line in targets]))
|
|
57 ##create iterator returning sequence records
|
|
58 for myrec in SeqIO.parse(in_seq_handle, "fasta"):
|
|
59 #check if this sequence is included in the target list
|
|
60 if myrec.id in target_seq_id_list:
|
|
61 ##create sequence dictionary so we can add in gff annotations
|
|
62 seq_dict = {myrec.id : myrec}
|
|
63 ##just limit to gff annotations for this sequence
|
|
64 limit_info = dict(gff_id = [ myrec.id ])
|
|
65 ##rewind gff filehandle
|
|
66 in_gff_handle.seek(0)
|
|
67 ##read annotations into sequence dictionary for this sequence record only
|
|
68 annotations = [r for r in GFF.parse(in_gff_handle, base_dict=seq_dict,limit_info=limit_info)]
|
|
69 ##if there are any annotations, then proceed.
|
|
70 if annotations:
|
|
71 rec=annotations[0]
|
|
72 ##iterate over list of target IDs
|
|
73 for t in targets:
|
|
74 target_ID = t.strip('\n')
|
|
75 target_annotations = [f for f in rec.features if f.id == target_ID ]
|
|
76 if target_annotations:
|
|
77 mytarget = target_annotations[0]
|
|
78 #create temporary files
|
|
79 tempfastaFile = tempfile.mktemp()
|
|
80 tempproutfile = tempfile.mktemp()
|
|
81 #just consider slice of sequence in a window of +/- prod_max_size bp
|
|
82 ##from feature UNLESS feature is close to end
|
|
83 ##Note that slice is zero-based
|
|
84 featLocation = mytarget.location.start.position
|
|
85 if featLocation > prod_max_size:
|
|
86 slice_start = featLocation - prod_max_size
|
|
87 featPosition = prod_max_size
|
|
88 else:
|
|
89 slice_start = 0
|
|
90 featPosition = featLocation
|
|
91 if (len(rec) - featLocation) < prod_max_size:
|
|
92 slice_end = len(rec)
|
|
93 else:
|
|
94 slice_end = featLocation + prod_max_size
|
|
95 ###grab slice of sequence fom this window.
|
|
96 targetRec = rec[slice_start:slice_end]
|
|
97 matching_feature = [f for f in targetRec.features if f.id == mytarget.id]
|
|
98 if matching_feature:
|
|
99 target_feat = matching_feature[0]
|
|
100 if target_feat.location.start.position == 0:
|
|
101 target_feat.location.start.position = 1
|
|
102 #we get the mask features by removing the target...all features are masked as just using snp and indels
|
|
103 ##a smarter filter could be added
|
|
104 ##note use of list copy to avoid possible side-effects
|
|
105 exclude_feat = list(targetRec.features)
|
|
106 exclude_feat.remove(target_feat)
|
|
107 ##print'targetRec.features', targetRec.features ##for debug
|
|
108 mask_str=map(lambda f: str(f.location.start.position+1) + "," + str(f.location.end.position + 1) ,exclude_feat)
|
|
109 #mask_str=map(lambda f: str(f.location).strip('[]'),exclude_feat)
|
|
110 p3_exclude_str = str(mask_str).replace('\', \'',':')
|
|
111 p3_target = str(target_feat.location.start.position +1) + "," + str(target_feat.location.end.position +1)
|
|
112 #write sequence record into template file as fasta
|
|
113 t_output_handle = open(tempfastaFile, "w")
|
|
114 SeqIO.write([targetRec], t_output_handle, "fasta")
|
|
115 t_output_handle.close()
|
|
116 #create Primer3Commandline() for emboss tool
|
|
117 primer_cl = Primer3Commandline()
|
|
118 #set the emboss tool to suppress output as this will make Galaxy think it is error message although it is a message to state run success
|
|
119 primer_cl.set_parameter("-auto",'1')
|
|
120 #pass sequence file to emboss
|
|
121 primer_cl.set_parameter("-sequence",tempfastaFile)
|
|
122 #add target location
|
|
123 primer_cl.set_parameter("-target", p3_target)
|
|
124 ##mask off other features...dumb masking of everything at present, beware
|
|
125 if (p3_exclude_str != ""):
|
|
126 primer_cl.set_parameter("-excludedregion", p3_exclude_str)
|
|
127 #add temporary output file to get the result
|
|
128 primer_cl.set_parameter("-outfile", tempproutfile)
|
|
129 #specify maximum different of tm
|
|
130 primer_cl.set_parameter("-maxdifftm",max_tm_diff )
|
|
131 #other useful parameters
|
|
132 primer_cl.set_parameter("-ogcpercent", opt_GC_percent)
|
|
133 primer_cl.set_parameter("-opolyxmax", maxpolyx)
|
|
134 primer_cl.set_parameter("-osize", optimum_length)
|
|
135 #set product size range
|
|
136 primer_cl.set_parameter("-prange", productsizerange)
|
|
137 #using python subprocess method to run emboss command line programe with the parameters given
|
|
138 fnull = open(os.devnull, 'w')
|
|
139 result=subprocess.check_call(str(primer_cl),shell=True ,stdout = fnull, stderr = fnull)
|
|
140 #read temporary outputfile
|
|
141 handle = open(tempproutfile)
|
|
142 record = Primer3.read(handle)
|
|
143 ##just return first set, if there is one
|
|
144 if len(record.primers) > 0:
|
|
145 primer= record.primers[0]
|
|
146 outputstr=[mytarget.id, primer.forward_seq,primer.reverse_seq,primer.size]
|
|
147 else:
|
|
148 outputstr=[mytarget.id,"NONE","NONE","NONE"]
|
|
149 print('\t'.join(map(str,outputstr)))
|
|
150
|
|
151
|
|
152 in_gff_handle.close()
|
|
153 in_seq_handle.close()
|