0
+ − 1
5
+ − 2 #!/usr/bin/env python
+ − 3 ##design primers to features in multiple sequences, with option to predict melting
+ − 4 #usage: design_HRM_primers.py [-h] -i IN_FILE -g GFF_FILE -T TARGET_FILE [-u]
+ − 5 # [-n MAX_PRIMERS] [-p PROD_MIN_SIZE]
+ − 6 # [-P PROD_MAX_SIZE] [-l OPT_PRIMER_LENGTH]
+ − 7 # [-m MAX_TM_DIFF] [-t OPTIMUM_TM]
+ − 8 # [-G OPT_GC_PERCENT] [-x MAXPOLYX] [-c GC_CLAMP]
0
+ − 9
5
+ − 10 #Copyright 2013 John McCallum & Susan Thomson
0
+ − 11 #New Zealand Institute for Plant and Food Research
+ − 12 #This program is free software: you can redistribute it and/or modify
+ − 13 # it under the terms of the GNU General Public License as published by
+ − 14 # the Free Software Foundation, either version 3 of the License, or
+ − 15 # (at your option) any later version.
+ − 16 #
+ − 17 # This program is distributed in the hope that it will be useful,
+ − 18 # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ − 19 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ − 20 # GNU General Public License for more details.
+ − 21 #
+ − 22 # You should have received a copy of the GNU General Public License
+ − 23 # along with this program. If not, see <http://www.gnu.org/licenses/>.
+ − 24
+ − 25 import os
+ − 26 import StringIO
+ − 27 import re
+ − 28 import copy
+ − 29 import sys
+ − 30 from BCBio import GFF
+ − 31 from BCBio.GFF import GFFExaminer
+ − 32 from Bio import SeqIO
5
+ − 33 import run_p3 as P3
+ − 34 #import umelt_service as umelts
+ − 35 import argparse
+ − 36
+ − 37 ##Primer3 defaults or additional options defined as dictionary
+ − 38 def_dict={
+ − 39 'PRIMER_MIN_SIZE':'18',
+ − 40 'PRIMER_MAX_SIZE':'25',
+ − 41 'PRIMER_MAX_NS_ACCEPTED':'1'}
+ − 42
+ − 43 #parse arguments
+ − 44 parser = argparse.ArgumentParser(description='Primer set design and melt prediction parameters')
+ − 45 parser.add_argument('-i', type=argparse.FileType('r'), help="input sequence file, required", dest='in_file', required=True)
+ − 46 parser.add_argument('-g', type=argparse.FileType('r'), help="input gff file with SNP and indels, required", dest='gff_file', required=True)
+ − 47 parser.add_argument('-T', type=argparse.FileType('r'), help="input target SNP file, required", dest='target_file', required=True)
+ − 48 parser.add_argument('-u',help="do uMelt prediction, optional", dest='run_uMelt',action='store_true', default=False )
+ − 49 parser.add_argument('-n', type=int, help="maximum number of primer pairs to return, default=5", dest='max_primers', default=5) ## PRIMER_NUM_RETURN
+ − 50 parser.add_argument('-p', type=int, help="minimum product size", dest='prod_min_size', default=100) ## PRIMER_PRODUCT_SIZE_RANGE min
+ − 51 parser.add_argument('-P', type=int, help="maximum product size", dest='prod_max_size', default=300) ## PRIMER_PRODUCT_SIZE_RANGE max
+ − 52 parser.add_argument('-l', type=int, help="optimum primer length", dest='opt_primer_length', default=20) ## PRIMER_OPT_SIZE
+ − 53 parser.add_argument('-m', type=int, help="maximum tm difference between primers", dest='max_tm_diff', default=1) ## PRIMER_PAIR_MAX_DIFF_TM
+ − 54 parser.add_argument('-t', type=int, help="optimum Tm for primers, recommend range 59 to 61", dest='optimum_tm', default=59) ## PRIMER_OPT_TM
+ − 55 parser.add_argument('-G', type=int, help="optimum GC percentage of primers", dest='opt_GC_percent', default=50) ## PRIMER_OPT_GC_PERCENT
+ − 56 parser.add_argument('-x', type=int, help="maximum polyx, recommend less than 4", dest='maxpolyx', default=3) ## PRIMER_MAX_POLY_X
+ − 57 parser.add_argument('-c', type=int, help="number of C/Gs at end, recommend 2", dest='gc_clamp', default=1) ## PRIMER_GC_CLAMP
+ − 58
+ − 59 parser.add_argument('-e', type=int, help="maximum allowable 3'-anchored complementarity", dest='maxselfend', default=3) ## PRIMER_MAX_SELF_END
+ − 60 parser.add_argument('-a', type=int, help="maximum complementarity between left and right or self", dest='maxselfany', default=8) ## PRIMER_MAX_SELF_ANY
+ − 61 parser.add_argument('-maxgc', type=float, help="Maximum allowable percentage of Gs and Cs in any primer.", dest='maxgc', default=80.0) ## PRIMER_MAX_GC
+ − 62 parser.add_argument('-mingc', type=float, help="Minimum allowable percentage of Gs and Cs in any primer.", dest='mingc', default=20.0) ## PRIMER_MIN_GC
0
+ − 63
+ − 64
5
+ − 65 parser.add_argument('-d', type=str, help="variant indentifier delimiter, used to separate sequence ID from rest ", dest='target_delim', default=':')
+ − 66 try:
+ − 67 my_args = parser.parse_args()
+ − 68 except SystemExit:
+ − 69 print("\nOops, an argument is missing/invalid, exiting...\n")
+ − 70 sys.exit(0)
+ − 71
+ − 72 ##update from args. NEEDS TO BE FINISHED
+ − 73 productsizerange = str(my_args.prod_min_size) + "-" + str(my_args.prod_max_size)
+ − 74 def_dict['PRIMER_PRODUCT_SIZE_RANGE']=productsizerange
+ − 75 def_dict['PRIMER_NUM_RETURN']=str(my_args.max_primers +1)
+ − 76 def_dict['PRIMER_OPT_SIZE']=str(my_args.opt_primer_length)
+ − 77 def_dict['PRIMER_PAIR_MAX_DIFF_TM']=str(my_args.max_tm_diff)
+ − 78 def_dict['PRIMER_OPT_TM']=str(my_args.optimum_tm)
+ − 79 def_dict['PRIMER_OPT_GC_PERCENT']=str(my_args.opt_GC_percent)
+ − 80 def_dict['PRIMER_MAX_POLY_X']=str(my_args.maxpolyx)
+ − 81 def_dict['PRIMER_GC_CLAMP']=str(my_args.gc_clamp)
0
+ − 82
5
+ − 83 def_dict['PRIMER_MAX_SELF_END']=str(my_args.maxselfend)
+ − 84 def_dict['PRIMER_MAX_SELF_ANY']=str(my_args.maxselfany)
+ − 85 def_dict['PRIMER_MAX_GC']=str(my_args.maxgc)
+ − 86 def_dict['PRIMER_MIN_GC']=str(my_args.mingc)
+ − 87
+ − 88
+ − 89
+ − 90 ##conditional import of umelt
+ − 91 if my_args.run_uMelt:
+ − 92 import umelt_service as umelts
+ − 93
0
+ − 94 #open input files
5
+ − 95
+ − 96 targets=my_args.target_file.readlines()
+ − 97 my_args.target_file.close()
0
+ − 98 ##and create a hit list of sequences from this
5
+ − 99 target_seq_id_list = [re.split(my_args.target_delim,X)[0] for X in targets] ## target_delimiter defaults to ':' e.g. ABC:SNP:SAMTOOL:1234
+ − 100
+ − 101 ##print header
+ − 102 print "SNP_Target_ID", "Position","Ref_base","Variant_base" ,"Amplicon_bp","PRIMER_LEFT_SEQUENCE",'PRIMER_RIGHT_SEQUENCE', "ref_melt_Tm","var_melt_Tm","Tm_difference"
0
+ − 103 ##create iterator returning sequence records
5
+ − 104 for myrec in SeqIO.parse(my_args.in_file, "fasta"):
0
+ − 105 #check if this sequence is included in the target list
+ − 106 if myrec.id in target_seq_id_list:
+ − 107 ##create sequence dictionary so we can add in gff annotations
+ − 108 seq_dict = {myrec.id : myrec}
+ − 109 ##just limit to gff annotations for this sequence
+ − 110 limit_info = dict(gff_id = [ myrec.id ])
+ − 111 ##rewind gff filehandle
5
+ − 112 my_args.gff_file.seek(0)
0
+ − 113 ##read annotations into sequence dictionary for this sequence record only
5
+ − 114 annotations = [r for r in GFF.parse(my_args.gff_file, base_dict=seq_dict,limit_info=limit_info)]
0
+ − 115 ##if there are any annotations, then proceed.
+ − 116 if annotations:
+ − 117 rec=annotations[0]
+ − 118 ##iterate over list of target IDs
+ − 119 for t in targets:
+ − 120 target_ID = t.strip('\n')
+ − 121 target_annotations = [f for f in rec.features if f.id == target_ID ]
+ − 122 if target_annotations:
+ − 123 mytarget = target_annotations[0]
+ − 124 #just consider slice of sequence in a window of +/- prod_max_size bp
+ − 125 ##from feature UNLESS feature is close to end
+ − 126 ##Note that slice is zero-based
+ − 127 featLocation = mytarget.location.start.position
5
+ − 128 if featLocation > my_args.prod_max_size:
+ − 129 slice_start = featLocation - my_args.prod_max_size
+ − 130 featPosition = my_args.prod_max_size
0
+ − 131 else:
+ − 132 slice_start = 0
+ − 133 featPosition = featLocation
5
+ − 134 if (len(rec) - featLocation) < my_args.prod_max_size:
0
+ − 135 slice_end = len(rec)
+ − 136 else:
5
+ − 137 slice_end = featLocation + my_args.prod_max_size
0
+ − 138 ###grab slice of sequence fom this window.
+ − 139 targetRec = rec[slice_start:slice_end]
+ − 140 matching_feature = [f for f in targetRec.features if f.id == mytarget.id]
+ − 141 if matching_feature:
+ − 142 target_feat = matching_feature[0]
5
+ − 143 my_target_dict={} # re-initialize target dictionary
0
+ − 144 if target_feat.location.start.position == 0:
+ − 145 target_feat.location.start.position = 1
5
+ − 146 #get the mask features by removing target...all features are masked as just using snp and indels, a smarter filter could be added
+ − 147 exclude_feat = list(targetRec.features) ##list copy to avoid possible side-effects
0
+ − 148 exclude_feat.remove(target_feat)
5
+ − 149 excludes_str=' '.join([str(x.location.start.position)+','+str(x.location.end.position -x.location.start.position) for x in exclude_feat])
+ − 150 my_target_dict={'SEQUENCE_ID' : rec.name, 'SEQUENCE_TEMPLATE': targetRec.seq.tostring(),'SEQUENCE_TARGET': str(target_feat.location.start.position) + ',1','SEQUENCE_INTERNAL_EXCLUDED_REGION': excludes_str}
+ − 151 my_target_dict.update(def_dict) ##add in defaults
+ − 152 result=P3.run_P3(target_dict=my_target_dict)
+ − 153 if my_args.run_uMelt:
+ − 154 amp_seq=targetRec.seq ##need to make this conditional on getting a result >0 and melt=True
+ − 155 mutamp_seq=targetRec.seq.tomutable()
+ − 156 mutamp_seq[target_feat.location.start:target_feat.location.end]=target_feat.qualifiers['Variant_seq'][0] #mutate to variant
+ − 157 for primerset in result:
+ − 158 amp_start=int(primerset['PRIMER_LEFT'].split(',')[0])
+ − 159 amp_end=int(primerset['PRIMER_RIGHT'].split(',')[0])
+ − 160 ref_melt_Tm=0
+ − 161 var_melt_Tm=0
6
+ − 162 diff_melt=0
5
+ − 163 if my_args.run_uMelt:
+ − 164 try:
+ − 165 ref_melt_Tm=umelts.getTm(umelts.getmelt(amp_seq.tostring()[amp_start:amp_end+1]))
+ − 166 var_melt_Tm=umelts.getTm(umelts.getmelt(mutamp_seq.tostring()[amp_start:amp_end+1]))
6
+ − 167 diff_melt=abs(ref_melt_Tm - var_melt_Tm)
5
+ − 168 except:
6
+ − 169 ref_melt_Tm="NA" ##preferably something more informative?
+ − 170 var_melt_Tm="NA" ##exception handling to be added
+ − 171 diff_melt="NA"
5
+ − 172 reference_seq=target_feat.qualifiers['Reference_seq'][0]
+ − 173 if target_feat.qualifiers.has_key('Variant_seq'):
+ − 174 variant_seq=target_feat.qualifiers['Variant_seq'][0]
+ − 175 else:
+ − 176 variant_seq="NA"
6
+ − 177 print mytarget.id, featLocation + 1 ,reference_seq, variant_seq,amp_end-amp_start,primerset['PRIMER_LEFT_SEQUENCE'],primerset['PRIMER_RIGHT_SEQUENCE'], ref_melt_Tm,var_melt_Tm,diff_melt#, amp_seq.tostring()[amp_start:amp_end+1], mutamp_seq.tostring()[amp_start:amp_end+1]
0
+ − 178
5
+ − 179 my_args.gff_file.close()
+ − 180 my_args.in_file.close()
+ − 181