#!/usr/bin/env python
##design primers to features in multiple sequences, with an option to predict amplicon melting via uMelt
#usage: design_primers.py [-h] -i IN_FILE -g GFF_FILE -T TARGET_FILE [-u]
#                         [-n MAX_PRIMERS] [-p PROD_MIN_SIZE] [-P PROD_MAX_SIZE]
#                         [-l OPT_PRIMER_LENGTH] [-m MAX_TM_DIFF] [-t OPTIMUM_TM]
#                         [-G OPT_GC_PERCENT] [-x MAXPOLYX] [-c GC_CLAMP]
#                         [-e MAXSELFEND] [-a MAXSELFANY] [-maxgc MAXGC]
#                         [-mingc MINGC] [-d TARGET_DELIM]
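#example invocation (file names here are illustrative only):
#   python design_primers.py -i contigs.fasta -g variants.gff3 -T target_snps.txt -u > primer_report.txt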

#Copyright 2013 John McCallum & Susan Thomson
#New Zealand Institute for Plant and Food Research
#This program is free software: you can redistribute it and/or modify
#     it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.

import re
import sys
import argparse
from BCBio import GFF
from Bio import SeqIO
import run_p3 as P3
#import umelt_service as umelts

##Primer3 defaults and additional options, given as a dictionary of tag/value strings
def_dict = {
    'PRIMER_MIN_SIZE': '18',
    'PRIMER_MAX_SIZE': '25',
    'PRIMER_MAX_NS_ACCEPTED': '1'}
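##any other Primer3 tags can be added to this dictionary; it is updated from the
##command-line arguments below and merged into each per-target dictionary passed to run_p3.run_P3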

#parse arguments
parser = argparse.ArgumentParser(description='Primer set design and melt prediction parameters')
parser.add_argument('-i', type=argparse.FileType('r'), help="input sequence file, required", dest='in_file', required=True)
parser.add_argument('-g', type=argparse.FileType('r'), help="input gff file with SNP and indels, required", dest='gff_file', required=True)
parser.add_argument('-T', type=argparse.FileType('r'), help="input target SNP file, required", dest='target_file', required=True)
parser.add_argument('-u',help="do uMelt prediction, optional", dest='run_uMelt',action='store_true', default=False )
parser.add_argument('-n', type=int, help="maximum number of primer pairs to return, default=5", dest='max_primers', default=5) ## PRIMER_NUM_RETURN
parser.add_argument('-p', type=int, help="minimum product size", dest='prod_min_size', default=100)                             ## PRIMER_PRODUCT_SIZE_RANGE min
parser.add_argument('-P', type=int, help="maximum product size", dest='prod_max_size', default=300)                            ## PRIMER_PRODUCT_SIZE_RANGE max
parser.add_argument('-l', type=int, help="optimum primer length", dest='opt_primer_length', default=20)                        ## PRIMER_OPT_SIZE
parser.add_argument('-m', type=int, help="maximum Tm difference between primers", dest='max_tm_diff', default=1)               ## PRIMER_PAIR_MAX_DIFF_TM
parser.add_argument('-t', type=int, help="optimum Tm for primers, recommend range 59 to 61", dest='optimum_tm', default=59)    ## PRIMER_OPT_TM
parser.add_argument('-G', type=int, help="optimum GC percentage of primers", dest='opt_GC_percent', default=50)                ## PRIMER_OPT_GC_PERCENT
parser.add_argument('-x', type=int, help="maximum polyx, recommend less than 4", dest='maxpolyx', default=3)                   ## PRIMER_MAX_POLY_X
parser.add_argument('-c', type=int, help="number of C/Gs at end, recommend 2", dest='gc_clamp', default=1)                     ## PRIMER_GC_CLAMP

parser.add_argument('-e', type=int, help="maximum allowable 3'-anchored complementarity", dest='maxselfend', default=3)                     ## PRIMER_MAX_SELF_END
parser.add_argument('-a', type=int, help="maximum complementarity between left and right or self", dest='maxselfany', default=8)                     ## PRIMER_MAX_SELF_ANY
parser.add_argument('-maxgc', type=float, help="Maximum allowable percentage of Gs and Cs in any primer.", dest='maxgc', default=80.0)                     ## PRIMER_MAX_GC
parser.add_argument('-mingc', type=float, help="Minimum allowable percentage of Gs and Cs in any primer.", dest='mingc', default=20.0)                     ## PRIMER_MIN_GC


parser.add_argument('-d', type=str, help="variant identifier delimiter, used to separate the sequence ID from the rest of the variant name", dest='target_delim', default=':')
try:
    my_args = parser.parse_args()
except SystemExit:
    print("\nOops, an argument is missing/invalid, exiting...\n")
    sys.exit(1)

##update from args. NEEDS TO BE FINISHED
productsizerange = str(my_args.prod_min_size) + "-" + str(my_args.prod_max_size)
def_dict['PRIMER_PRODUCT_SIZE_RANGE']=productsizerange
def_dict['PRIMER_NUM_RETURN']=str(my_args.max_primers +1)
def_dict['PRIMER_OPT_SIZE']=str(my_args.opt_primer_length)
def_dict['PRIMER_PAIR_MAX_DIFF_TM']=str(my_args.max_tm_diff)
def_dict['PRIMER_OPT_TM']=str(my_args.optimum_tm)
def_dict['PRIMER_OPT_GC_PERCENT']=str(my_args.opt_GC_percent)
def_dict['PRIMER_MAX_POLY_X']=str(my_args.maxpolyx)
def_dict['PRIMER_GC_CLAMP']=str(my_args.gc_clamp)

def_dict['PRIMER_MAX_SELF_END']=str(my_args.maxselfend)
def_dict['PRIMER_MAX_SELF_ANY']=str(my_args.maxselfany)
def_dict['PRIMER_MAX_GC']=str(my_args.maxgc)
def_dict['PRIMER_MIN_GC']=str(my_args.mingc)



##conditional import of umelt
if my_args.run_uMelt:
    import umelt_service as umelts

#open input files

targets=my_args.target_file.readlines()
my_args.target_file.close()
##and create a hit list of sequences from this
target_seq_id_list = [re.split(my_args.target_delim,X)[0] for X in targets] ## target_delimiter defaults to ':'  e.g. ABC:SNP:SAMTOOL:1234 
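##e.g. with the default ':' delimiter, the target "ABC:SNP:SAMTOOL:1234" contributes sequence ID "ABC"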

##print header
print "SNP_Target_ID", "Position","Ref_base","Variant_base" ,"Amplicon_bp","PRIMER_LEFT_SEQUENCE",'PRIMER_RIGHT_SEQUENCE', "ref_melt_Tm","var_melt_Tm","Tm_difference"
##create iterator returning sequence records
for myrec in SeqIO.parse(my_args.in_file, "fasta"):
    #check if this sequence is included in the target list
    if myrec.id in target_seq_id_list:
        ##create sequence dictionary so we can add in gff annotations
        seq_dict = {myrec.id : myrec}
        ##just limit to gff annotations for this sequence
        limit_info = dict(gff_id = [ myrec.id ])
        ##rewind gff filehandle
        my_args.gff_file.seek(0)
        ##read annotations into sequence dictionary for this sequence record only 
        annotations = [r for r in GFF.parse(my_args.gff_file, base_dict=seq_dict,limit_info=limit_info)]
        ##if there are any annotations, then  proceed. 
        if annotations:
            rec=annotations[0]
            ##iterate over list of target IDs
            for t in targets:
                target_ID = t.strip('\n')
                target_annotations = [f for f in rec.features if f.id == target_ID ]
                if target_annotations:
                    mytarget = target_annotations[0]
                    #just consider slice of sequence in a window of +/- prod_max_size  bp
                    ##from feature UNLESS feature is close to end
                    ##Note that slice is zero-based
                    featLocation = mytarget.location.start.position 
                    if featLocation > my_args.prod_max_size:
                        slice_start = featLocation -  my_args.prod_max_size
                        featPosition =  my_args.prod_max_size  
                    else:
                        slice_start = 0
                        featPosition = featLocation
                    if (len(rec) - featLocation) <  my_args.prod_max_size:
                        slice_end = len(rec)
                    else:
                        slice_end = featLocation +  my_args.prod_max_size
                    ###grab slice of sequence from this window
                    targetRec = rec[slice_start:slice_end]
                    matching_feature = [f for f in targetRec.features if f.id == mytarget.id]
                    if matching_feature:
                        target_feat = matching_feature[0]
                        my_target_dict={} # re-initialize target dictionary
                        if target_feat.location.start.position == 0:
                            target_feat.location.start.position = 1
                        ##build the mask list by removing the target itself; all remaining features (SNPs and indels)
                        ##are excluded from primer placement. A smarter filter could be added.
                        exclude_feat = list(targetRec.features) ##list copy to avoid possible side-effects
                        exclude_feat.remove(target_feat)
                        ##excluded regions are given as space-separated 'start,length' pairs, the format Primer3 expects
                        excludes_str = ' '.join([str(x.location.start.position) + ',' + str(x.location.end.position - x.location.start.position) for x in exclude_feat])
                        my_target_dict = {'SEQUENCE_ID': rec.name,
                                          'SEQUENCE_TEMPLATE': targetRec.seq.tostring(),
                                          'SEQUENCE_TARGET': str(target_feat.location.start.position) + ',1',
                                          'SEQUENCE_INTERNAL_EXCLUDED_REGION': excludes_str}
                        my_target_dict.update(def_dict) ##add in defaults
                        result = P3.run_P3(target_dict=my_target_dict)
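                        ##run_P3 is expected to return a list of per-primer-pair dictionaries keyed by
                        ##Primer3 output tags (e.g. PRIMER_LEFT as 'start,length', PRIMER_LEFT_SEQUENCE), as used below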
                        if my_args.run_uMelt:
                            amp_seq=targetRec.seq ##need to make this conditional on getting a result >0 and melt=True
                            mutamp_seq=targetRec.seq.tomutable()
                            mutamp_seq[target_feat.location.start:target_feat.location.end]=target_feat.qualifiers['Variant_seq'][0] #mutate to variant
                        for primerset in result:
                                amp_start = int(primerset['PRIMER_LEFT'].split(',')[0])
                                amp_end = int(primerset['PRIMER_RIGHT'].split(',')[0])
                                ref_melt_Tm=0
                                var_melt_Tm=0
                                diff_melt=0
                                if my_args.run_uMelt:
                                    try:
                                        ref_melt_Tm = umelts.getTm(umelts.getmelt(amp_seq.tostring()[amp_start:amp_end+1]))
                                        var_melt_Tm = umelts.getTm(umelts.getmelt(mutamp_seq.tostring()[amp_start:amp_end+1]))
                                        diff_melt = abs(ref_melt_Tm - var_melt_Tm)
                                    except Exception:
                                        ref_melt_Tm = "NA" ##preferably something more informative?
                                        var_melt_Tm = "NA" ##exception handling to be added
                                        diff_melt = "NA"
                                reference_seq=target_feat.qualifiers['Reference_seq'][0]
                                if 'Variant_seq' in target_feat.qualifiers:
                                    variant_seq=target_feat.qualifiers['Variant_seq'][0]
                                else:
                                    variant_seq="NA"
                                print mytarget.id, featLocation + 1, reference_seq, variant_seq, amp_end - amp_start, primerset['PRIMER_LEFT_SEQUENCE'], primerset['PRIMER_RIGHT_SEQUENCE'], ref_melt_Tm, var_melt_Tm, diff_melt #, amp_seq.tostring()[amp_start:amp_end+1], mutamp_seq.tostring()[amp_start:amp_end+1]

my_args.gff_file.close()
my_args.in_file.close()