0
|
1
|
5
|
2 #!/usr/bin/env python
|
|
3 ##design primers to features in multiple sequences, with option to predict melting
|
|
4 #usage: design_HRM_primers.py [-h] -i IN_FILE -g GFF_FILE -T TARGET_FILE [-u]
|
|
5 # [-n MAX_PRIMERS] [-p PROD_MIN_SIZE]
|
|
6 # [-P PROD_MAX_SIZE] [-l OPT_PRIMER_LENGTH]
|
|
7 # [-m MAX_TM_DIFF] [-t OPTIMUM_TM]
|
|
8 # [-G OPT_GC_PERCENT] [-x MAXPOLYX] [-c GC_CLAMP]
|
0
|
9
|
5
|
10 #Copyright 2013 John McCallum & Susan Thomson
|
0
|
11 #New Zealand Institute for Plant and Food Research
|
|
12 #This program is free software: you can redistribute it and/or modify
|
|
13 # it under the terms of the GNU General Public License as published by
|
|
14 # the Free Software Foundation, either version 3 of the License, or
|
|
15 # (at your option) any later version.
|
|
16 #
|
|
17 # This program is distributed in the hope that it will be useful,
|
|
18 # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
19 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
20 # GNU General Public License for more details.
|
|
21 #
|
|
22 # You should have received a copy of the GNU General Public License
|
|
23 # along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
24
|
|
25 import os
|
|
26 import StringIO
|
|
27 import re
|
|
28 import copy
|
|
29 import sys
|
|
30 from BCBio import GFF
|
|
31 from BCBio.GFF import GFFExaminer
|
|
32 from Bio import SeqIO
|
5
|
33 import run_p3 as P3
|
|
34 #import umelt_service as umelts
|
|
35 import argparse
|
|
36
|
|
37 ##Primer3 defaults or additional options defined as dictionary
|
|
38 def_dict={
|
|
39 'PRIMER_MIN_SIZE':'18',
|
|
40 'PRIMER_MAX_SIZE':'25',
|
|
41 'PRIMER_MAX_NS_ACCEPTED':'1'}
|
|
42
|
|
43 #parse arguments
|
|
44 parser = argparse.ArgumentParser(description='Primer set design and melt prediction parameters')
|
|
45 parser.add_argument('-i', type=argparse.FileType('r'), help="input sequence file, required", dest='in_file', required=True)
|
|
46 parser.add_argument('-g', type=argparse.FileType('r'), help="input gff file with SNP and indels, required", dest='gff_file', required=True)
|
|
47 parser.add_argument('-T', type=argparse.FileType('r'), help="input target SNP file, required", dest='target_file', required=True)
|
|
48 parser.add_argument('-u',help="do uMelt prediction, optional", dest='run_uMelt',action='store_true', default=False )
|
|
49 parser.add_argument('-n', type=int, help="maximum number of primer pairs to return, default=5", dest='max_primers', default=5) ## PRIMER_NUM_RETURN
|
|
50 parser.add_argument('-p', type=int, help="minimum product size", dest='prod_min_size', default=100) ## PRIMER_PRODUCT_SIZE_RANGE min
|
|
51 parser.add_argument('-P', type=int, help="maximum product size", dest='prod_max_size', default=300) ## PRIMER_PRODUCT_SIZE_RANGE max
|
|
52 parser.add_argument('-l', type=int, help="optimum primer length", dest='opt_primer_length', default=20) ## PRIMER_OPT_SIZE
|
|
53 parser.add_argument('-m', type=int, help="maximum tm difference between primers", dest='max_tm_diff', default=1) ## PRIMER_PAIR_MAX_DIFF_TM
|
|
54 parser.add_argument('-t', type=int, help="optimum Tm for primers, recommend range 59 to 61", dest='optimum_tm', default=59) ## PRIMER_OPT_TM
|
|
55 parser.add_argument('-G', type=int, help="optimum GC percentage of primers", dest='opt_GC_percent', default=50) ## PRIMER_OPT_GC_PERCENT
|
|
56 parser.add_argument('-x', type=int, help="maximum polyx, recommend less than 4", dest='maxpolyx', default=3) ## PRIMER_MAX_POLY_X
|
|
57 parser.add_argument('-c', type=int, help="number of C/Gs at end, recommend 2", dest='gc_clamp', default=1) ## PRIMER_GC_CLAMP
|
|
58
|
|
59 parser.add_argument('-e', type=int, help="maximum allowable 3'-anchored complementarity", dest='maxselfend', default=3) ## PRIMER_MAX_SELF_END
|
|
60 parser.add_argument('-a', type=int, help="maximum complementarity between left and right or self", dest='maxselfany', default=8) ## PRIMER_MAX_SELF_ANY
|
|
61 parser.add_argument('-maxgc', type=float, help="Maximum allowable percentage of Gs and Cs in any primer.", dest='maxgc', default=80.0) ## PRIMER_MAX_GC
|
|
62 parser.add_argument('-mingc', type=float, help="Minimum allowable percentage of Gs and Cs in any primer.", dest='mingc', default=20.0) ## PRIMER_MIN_GC
|
0
|
63
|
|
64
|
5
|
65 parser.add_argument('-d', type=str, help="variant indentifier delimiter, used to separate sequence ID from rest ", dest='target_delim', default=':')
|
|
66 try:
|
|
67 my_args = parser.parse_args()
|
|
68 except SystemExit:
|
|
69 print("\nOops, an argument is missing/invalid, exiting...\n")
|
|
70 sys.exit(0)
|
|
71
|
|
72 ##update from args. NEEDS TO BE FINISHED
|
|
73 productsizerange = str(my_args.prod_min_size) + "-" + str(my_args.prod_max_size)
|
|
74 def_dict['PRIMER_PRODUCT_SIZE_RANGE']=productsizerange
|
|
75 def_dict['PRIMER_NUM_RETURN']=str(my_args.max_primers +1)
|
|
76 def_dict['PRIMER_OPT_SIZE']=str(my_args.opt_primer_length)
|
|
77 def_dict['PRIMER_PAIR_MAX_DIFF_TM']=str(my_args.max_tm_diff)
|
|
78 def_dict['PRIMER_OPT_TM']=str(my_args.optimum_tm)
|
|
79 def_dict['PRIMER_OPT_GC_PERCENT']=str(my_args.opt_GC_percent)
|
|
80 def_dict['PRIMER_MAX_POLY_X']=str(my_args.maxpolyx)
|
|
81 def_dict['PRIMER_GC_CLAMP']=str(my_args.gc_clamp)
|
0
|
82
|
5
|
83 def_dict['PRIMER_MAX_SELF_END']=str(my_args.maxselfend)
|
|
84 def_dict['PRIMER_MAX_SELF_ANY']=str(my_args.maxselfany)
|
|
85 def_dict['PRIMER_MAX_GC']=str(my_args.maxgc)
|
|
86 def_dict['PRIMER_MIN_GC']=str(my_args.mingc)
|
|
87
|
|
88
|
|
89
|
|
90 ##conditional import of umelt
|
|
91 if my_args.run_uMelt:
|
|
92 import umelt_service as umelts
|
|
93
|
0
|
94 #open input files
|
5
|
95
|
|
96 targets=my_args.target_file.readlines()
|
|
97 my_args.target_file.close()
|
0
|
98 ##and create a hit list of sequences from this
|
5
|
99 target_seq_id_list = [re.split(my_args.target_delim,X)[0] for X in targets] ## target_delimiter defaults to ':' e.g. ABC:SNP:SAMTOOL:1234
|
|
100
|
|
101 ##print header
|
|
102 print "SNP_Target_ID", "Position","Ref_base","Variant_base" ,"Amplicon_bp","PRIMER_LEFT_SEQUENCE",'PRIMER_RIGHT_SEQUENCE', "ref_melt_Tm","var_melt_Tm","Tm_difference"
|
0
|
103 ##create iterator returning sequence records
|
5
|
104 for myrec in SeqIO.parse(my_args.in_file, "fasta"):
|
0
|
105 #check if this sequence is included in the target list
|
|
106 if myrec.id in target_seq_id_list:
|
|
107 ##create sequence dictionary so we can add in gff annotations
|
|
108 seq_dict = {myrec.id : myrec}
|
|
109 ##just limit to gff annotations for this sequence
|
|
110 limit_info = dict(gff_id = [ myrec.id ])
|
|
111 ##rewind gff filehandle
|
5
|
112 my_args.gff_file.seek(0)
|
0
|
113 ##read annotations into sequence dictionary for this sequence record only
|
5
|
114 annotations = [r for r in GFF.parse(my_args.gff_file, base_dict=seq_dict,limit_info=limit_info)]
|
0
|
115 ##if there are any annotations, then proceed.
|
|
116 if annotations:
|
|
117 rec=annotations[0]
|
|
118 ##iterate over list of target IDs
|
|
119 for t in targets:
|
|
120 target_ID = t.strip('\n')
|
|
121 target_annotations = [f for f in rec.features if f.id == target_ID ]
|
|
122 if target_annotations:
|
|
123 mytarget = target_annotations[0]
|
|
124 #just consider slice of sequence in a window of +/- prod_max_size bp
|
|
125 ##from feature UNLESS feature is close to end
|
|
126 ##Note that slice is zero-based
|
|
127 featLocation = mytarget.location.start.position
|
5
|
128 if featLocation > my_args.prod_max_size:
|
|
129 slice_start = featLocation - my_args.prod_max_size
|
|
130 featPosition = my_args.prod_max_size
|
0
|
131 else:
|
|
132 slice_start = 0
|
|
133 featPosition = featLocation
|
5
|
134 if (len(rec) - featLocation) < my_args.prod_max_size:
|
0
|
135 slice_end = len(rec)
|
|
136 else:
|
5
|
137 slice_end = featLocation + my_args.prod_max_size
|
0
|
138 ###grab slice of sequence fom this window.
|
|
139 targetRec = rec[slice_start:slice_end]
|
|
140 matching_feature = [f for f in targetRec.features if f.id == mytarget.id]
|
|
141 if matching_feature:
|
|
142 target_feat = matching_feature[0]
|
5
|
143 my_target_dict={} # re-initialize target dictionary
|
0
|
144 if target_feat.location.start.position == 0:
|
|
145 target_feat.location.start.position = 1
|
5
|
146 #get the mask features by removing target...all features are masked as just using snp and indels, a smarter filter could be added
|
|
147 exclude_feat = list(targetRec.features) ##list copy to avoid possible side-effects
|
0
|
148 exclude_feat.remove(target_feat)
|
5
|
149 excludes_str=' '.join([str(x.location.start.position)+','+str(x.location.end.position -x.location.start.position) for x in exclude_feat])
|
|
150 my_target_dict={'SEQUENCE_ID' : rec.name, 'SEQUENCE_TEMPLATE': targetRec.seq.tostring(),'SEQUENCE_TARGET': str(target_feat.location.start.position) + ',1','SEQUENCE_INTERNAL_EXCLUDED_REGION': excludes_str}
|
|
151 my_target_dict.update(def_dict) ##add in defaults
|
|
152 result=P3.run_P3(target_dict=my_target_dict)
|
|
153 if my_args.run_uMelt:
|
|
154 amp_seq=targetRec.seq ##need to make this conditional on getting a result >0 and melt=True
|
|
155 mutamp_seq=targetRec.seq.tomutable()
|
|
156 mutamp_seq[target_feat.location.start:target_feat.location.end]=target_feat.qualifiers['Variant_seq'][0] #mutate to variant
|
|
157 for primerset in result:
|
|
158 amp_start=int(primerset['PRIMER_LEFT'].split(',')[0])
|
|
159 amp_end=int(primerset['PRIMER_RIGHT'].split(',')[0])
|
|
160 ref_melt_Tm=0
|
|
161 var_melt_Tm=0
|
6
|
162 diff_melt=0
|
5
|
163 if my_args.run_uMelt:
|
|
164 try:
|
|
165 ref_melt_Tm=umelts.getTm(umelts.getmelt(amp_seq.tostring()[amp_start:amp_end+1]))
|
|
166 var_melt_Tm=umelts.getTm(umelts.getmelt(mutamp_seq.tostring()[amp_start:amp_end+1]))
|
6
|
167 diff_melt=abs(ref_melt_Tm - var_melt_Tm)
|
5
|
168 except:
|
6
|
169 ref_melt_Tm="NA" ##preferably something more informative?
|
|
170 var_melt_Tm="NA" ##exception handling to be added
|
|
171 diff_melt="NA"
|
5
|
172 reference_seq=target_feat.qualifiers['Reference_seq'][0]
|
|
173 if target_feat.qualifiers.has_key('Variant_seq'):
|
|
174 variant_seq=target_feat.qualifiers['Variant_seq'][0]
|
|
175 else:
|
|
176 variant_seq="NA"
|
6
|
177 print mytarget.id, featLocation + 1 ,reference_seq, variant_seq,amp_end-amp_start,primerset['PRIMER_LEFT_SEQUENCE'],primerset['PRIMER_RIGHT_SEQUENCE'], ref_melt_Tm,var_melt_Tm,diff_melt#, amp_seq.tostring()[amp_start:amp_end+1], mutamp_seq.tostring()[amp_start:amp_end+1]
|
0
|
178
|
5
|
179 my_args.gff_file.close()
|
|
180 my_args.in_file.close()
|
|
181
|