Mercurial > repos > john-mccallum > pcr_markers
diff design_primers.py @ 5:b321e0517be3 draft
Uploaded
author | ben-warren |
---|---|
date | Thu, 22 May 2014 20:30:19 -0400 |
parents | 21053f7f9ed1 |
children | f201e8c6e004 |
line wrap: on
line diff
--- a/design_primers.py Thu Oct 18 19:47:07 2012 -0400 +++ b/design_primers.py Thu May 22 20:30:19 2014 -0400 @@ -1,9 +1,13 @@ -#!/usr/local/bin/python2.6 -##design primers to features in multiple sequences -##usage: python design_primers.py <fasta-file> <gff file> <file of target IDs> <prod_min_size> <prod_max_size> +#!/usr/bin/env python +##design primers to features in multiple sequences, with option to predict melting +#usage: design_HRM_primers.py [-h] -i IN_FILE -g GFF_FILE -T TARGET_FILE [-u] +# [-n MAX_PRIMERS] [-p PROD_MIN_SIZE] +# [-P PROD_MAX_SIZE] [-l OPT_PRIMER_LENGTH] +# [-m MAX_TM_DIFF] [-t OPTIMUM_TM] +# [-G OPT_GC_PERCENT] [-x MAXPOLYX] [-c GC_CLAMP] -#Copyright 2012 John McCallum & Leshi Chen +#Copyright 2013 John McCallum & Susan Thomson #New Zealand Institute for Plant and Food Research #This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -18,44 +22,86 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see <http://www.gnu.org/licenses/>. - import os import StringIO import re -import tempfile -import subprocess import copy import sys from BCBio import GFF from BCBio.GFF import GFFExaminer from Bio import SeqIO -from Bio.Emboss.Applications import Primer3Commandline -from Bio.Emboss import Primer3 +import run_p3 as P3 +#import umelt_service as umelts +import argparse + +##Primer3 defaults or additional options defined as dictionary +def_dict={ +'PRIMER_MIN_SIZE':'18', +'PRIMER_MAX_SIZE':'25', +'PRIMER_MAX_NS_ACCEPTED':'1'} + +#parse arguments +parser = argparse.ArgumentParser(description='Primer set design and melt prediction parameters') +parser.add_argument('-i', type=argparse.FileType('r'), help="input sequence file, required", dest='in_file', required=True) +parser.add_argument('-g', type=argparse.FileType('r'), help="input gff file with SNP and indels, required", dest='gff_file', required=True) +parser.add_argument('-T', type=argparse.FileType('r'), help="input target SNP file, required", dest='target_file', required=True) +parser.add_argument('-u',help="do uMelt prediction, optional", dest='run_uMelt',action='store_true', default=False ) +parser.add_argument('-n', type=int, help="maximum number of primer pairs to return, default=5", dest='max_primers', default=5) ## PRIMER_NUM_RETURN +parser.add_argument('-p', type=int, help="minimum product size", dest='prod_min_size', default=100) ## PRIMER_PRODUCT_SIZE_RANGE min +parser.add_argument('-P', type=int, help="maximum product size", dest='prod_max_size', default=300) ## PRIMER_PRODUCT_SIZE_RANGE max +parser.add_argument('-l', type=int, help="optimum primer length", dest='opt_primer_length', default=20) ## PRIMER_OPT_SIZE +parser.add_argument('-m', type=int, help="maximum tm difference between primers", dest='max_tm_diff', default=1) ## PRIMER_PAIR_MAX_DIFF_TM +parser.add_argument('-t', type=int, help="optimum Tm for primers, recommend range 59 to 61", dest='optimum_tm', default=59) ## PRIMER_OPT_TM +parser.add_argument('-G', type=int, help="optimum GC percentage of primers", dest='opt_GC_percent', default=50) ## PRIMER_OPT_GC_PERCENT +parser.add_argument('-x', type=int, help="maximum polyx, recommend less than 4", dest='maxpolyx', default=3) ## PRIMER_MAX_POLY_X +parser.add_argument('-c', type=int, help="number of C/Gs at end, recommend 2", dest='gc_clamp', default=1) ## PRIMER_GC_CLAMP + +parser.add_argument('-e', type=int, help="maximum allowable 3'-anchored complementarity", dest='maxselfend', default=3) ## PRIMER_MAX_SELF_END +parser.add_argument('-a', type=int, help="maximum complementarity between left and right or self", dest='maxselfany', default=8) ## PRIMER_MAX_SELF_ANY +parser.add_argument('-maxgc', type=float, help="Maximum allowable percentage of Gs and Cs in any primer.", dest='maxgc', default=80.0) ## PRIMER_MAX_GC +parser.add_argument('-mingc', type=float, help="Minimum allowable percentage of Gs and Cs in any primer.", dest='mingc', default=20.0) ## PRIMER_MIN_GC -in_file = sys.argv[1] -gff_file = sys.argv[2] -target_file = sys.argv[3] -prod_min_size = int(sys.argv[4]) -prod_max_size = int(sys.argv[5]) +parser.add_argument('-d', type=str, help="variant indentifier delimiter, used to separate sequence ID from rest ", dest='target_delim', default=':') +try: + my_args = parser.parse_args() +except SystemExit: + print("\nOops, an argument is missing/invalid, exiting...\n") + sys.exit(0) + +##update from args. NEEDS TO BE FINISHED +productsizerange = str(my_args.prod_min_size) + "-" + str(my_args.prod_max_size) +def_dict['PRIMER_PRODUCT_SIZE_RANGE']=productsizerange +def_dict['PRIMER_NUM_RETURN']=str(my_args.max_primers +1) +def_dict['PRIMER_OPT_SIZE']=str(my_args.opt_primer_length) +def_dict['PRIMER_PAIR_MAX_DIFF_TM']=str(my_args.max_tm_diff) +def_dict['PRIMER_OPT_TM']=str(my_args.optimum_tm) +def_dict['PRIMER_OPT_GC_PERCENT']=str(my_args.opt_GC_percent) +def_dict['PRIMER_MAX_POLY_X']=str(my_args.maxpolyx) +def_dict['PRIMER_GC_CLAMP']=str(my_args.gc_clamp) -max_tm_diff=1 ## -opt_GC_percent=50 ## -maxpolyx=4 ## -optimum_length=20 -##target is specified in start, end format -productsizerange = str(prod_min_size) + "," + str(prod_max_size) +def_dict['PRIMER_MAX_SELF_END']=str(my_args.maxselfend) +def_dict['PRIMER_MAX_SELF_ANY']=str(my_args.maxselfany) +def_dict['PRIMER_MAX_GC']=str(my_args.maxgc) +def_dict['PRIMER_MIN_GC']=str(my_args.mingc) + + + +##conditional import of umelt +if my_args.run_uMelt: + import umelt_service as umelts + #open input files -in_seq_handle = open(in_file) -in_gff_handle = open(gff_file) -in_target_handle=open(target_file) -#read target feature IDs into list -targets=in_target_handle.readlines() -in_target_handle.close() + +targets=my_args.target_file.readlines() +my_args.target_file.close() ##and create a hit list of sequences from this -target_seq_id_list = list(set([line.split(":")[0] for line in targets])) +target_seq_id_list = [re.split(my_args.target_delim,X)[0] for X in targets] ## target_delimiter defaults to ':' e.g. ABC:SNP:SAMTOOL:1234 + +##print header +print "SNP_Target_ID", "Position","Ref_base","Variant_base" ,"Amplicon_bp","PRIMER_LEFT_SEQUENCE",'PRIMER_RIGHT_SEQUENCE', "ref_melt_Tm","var_melt_Tm","Tm_difference" ##create iterator returning sequence records -for myrec in SeqIO.parse(in_seq_handle, "fasta"): +for myrec in SeqIO.parse(my_args.in_file, "fasta"): #check if this sequence is included in the target list if myrec.id in target_seq_id_list: ##create sequence dictionary so we can add in gff annotations @@ -63,9 +109,9 @@ ##just limit to gff annotations for this sequence limit_info = dict(gff_id = [ myrec.id ]) ##rewind gff filehandle - in_gff_handle.seek(0) + my_args.gff_file.seek(0) ##read annotations into sequence dictionary for this sequence record only - annotations = [r for r in GFF.parse(in_gff_handle, base_dict=seq_dict,limit_info=limit_info)] + annotations = [r for r in GFF.parse(my_args.gff_file, base_dict=seq_dict,limit_info=limit_info)] ##if there are any annotations, then proceed. if annotations: rec=annotations[0] @@ -75,79 +121,58 @@ target_annotations = [f for f in rec.features if f.id == target_ID ] if target_annotations: mytarget = target_annotations[0] - #create temporary files - tempfastaFile = tempfile.mktemp() - tempproutfile = tempfile.mktemp() #just consider slice of sequence in a window of +/- prod_max_size bp ##from feature UNLESS feature is close to end ##Note that slice is zero-based featLocation = mytarget.location.start.position - if featLocation > prod_max_size: - slice_start = featLocation - prod_max_size - featPosition = prod_max_size + if featLocation > my_args.prod_max_size: + slice_start = featLocation - my_args.prod_max_size + featPosition = my_args.prod_max_size else: slice_start = 0 featPosition = featLocation - if (len(rec) - featLocation) < prod_max_size: + if (len(rec) - featLocation) < my_args.prod_max_size: slice_end = len(rec) else: - slice_end = featLocation + prod_max_size + slice_end = featLocation + my_args.prod_max_size ###grab slice of sequence fom this window. targetRec = rec[slice_start:slice_end] matching_feature = [f for f in targetRec.features if f.id == mytarget.id] if matching_feature: target_feat = matching_feature[0] + my_target_dict={} # re-initialize target dictionary if target_feat.location.start.position == 0: target_feat.location.start.position = 1 - #we get the mask features by removing the target...all features are masked as just using snp and indels - ##a smarter filter could be added - ##note use of list copy to avoid possible side-effects - exclude_feat = list(targetRec.features) + #get the mask features by removing target...all features are masked as just using snp and indels, a smarter filter could be added + exclude_feat = list(targetRec.features) ##list copy to avoid possible side-effects exclude_feat.remove(target_feat) - ##print'targetRec.features', targetRec.features ##for debug - mask_str=map(lambda f: str(f.location.start.position+1) + "," + str(f.location.end.position + 1) ,exclude_feat) - #mask_str=map(lambda f: str(f.location).strip('[]'),exclude_feat) - p3_exclude_str = str(mask_str).replace('\', \'',':') - p3_target = str(target_feat.location.start.position +1) + "," + str(target_feat.location.end.position +1) - #write sequence record into template file as fasta - t_output_handle = open(tempfastaFile, "w") - SeqIO.write([targetRec], t_output_handle, "fasta") - t_output_handle.close() - #create Primer3Commandline() for emboss tool - primer_cl = Primer3Commandline() - #set the emboss tool to suppress output as this will make Galaxy think it is error message although it is a message to state run success - primer_cl.set_parameter("-auto",'1') - #pass sequence file to emboss - primer_cl.set_parameter("-sequence",tempfastaFile) - #add target location - primer_cl.set_parameter("-target", p3_target) - ##mask off other features...dumb masking of everything at present, beware - if (p3_exclude_str != ""): - primer_cl.set_parameter("-excludedregion", p3_exclude_str) - #add temporary output file to get the result - primer_cl.set_parameter("-outfile", tempproutfile) - #specify maximum different of tm - primer_cl.set_parameter("-maxdifftm",max_tm_diff ) - #other useful parameters - primer_cl.set_parameter("-ogcpercent", opt_GC_percent) - primer_cl.set_parameter("-opolyxmax", maxpolyx) - primer_cl.set_parameter("-osize", optimum_length) - #set product size range - primer_cl.set_parameter("-prange", productsizerange) - #using python subprocess method to run emboss command line programe with the parameters given - fnull = open(os.devnull, 'w') - result=subprocess.check_call(str(primer_cl),shell=True ,stdout = fnull, stderr = fnull) - #read temporary outputfile - handle = open(tempproutfile) - record = Primer3.read(handle) - ##just return first set, if there is one - if len(record.primers) > 0: - primer= record.primers[0] - outputstr=[mytarget.id, primer.forward_seq,primer.reverse_seq,primer.size] - else: - outputstr=[mytarget.id,"NONE","NONE","NONE"] - print('\t'.join(map(str,outputstr))) + excludes_str=' '.join([str(x.location.start.position)+','+str(x.location.end.position -x.location.start.position) for x in exclude_feat]) + my_target_dict={'SEQUENCE_ID' : rec.name, 'SEQUENCE_TEMPLATE': targetRec.seq.tostring(),'SEQUENCE_TARGET': str(target_feat.location.start.position) + ',1','SEQUENCE_INTERNAL_EXCLUDED_REGION': excludes_str} + my_target_dict.update(def_dict) ##add in defaults + result=P3.run_P3(target_dict=my_target_dict) + if my_args.run_uMelt: + amp_seq=targetRec.seq ##need to make this conditional on getting a result >0 and melt=True + mutamp_seq=targetRec.seq.tomutable() + mutamp_seq[target_feat.location.start:target_feat.location.end]=target_feat.qualifiers['Variant_seq'][0] #mutate to variant + for primerset in result: + amp_start=int(primerset['PRIMER_LEFT'].split(',')[0]) + amp_end=int(primerset['PRIMER_RIGHT'].split(',')[0]) + ref_melt_Tm=0 + var_melt_Tm=0 + if my_args.run_uMelt: + try: + ref_melt_Tm=umelts.getTm(umelts.getmelt(amp_seq.tostring()[amp_start:amp_end+1])) + var_melt_Tm=umelts.getTm(umelts.getmelt(mutamp_seq.tostring()[amp_start:amp_end+1])) + except: + ref_melt_Tm=0 ##preferably something more informative? + var_melt_Tm=0 ##exception handling to be added + reference_seq=target_feat.qualifiers['Reference_seq'][0] + if target_feat.qualifiers.has_key('Variant_seq'): + variant_seq=target_feat.qualifiers['Variant_seq'][0] + else: + variant_seq="NA" + print mytarget.id, featLocation + 1 ,reference_seq, variant_seq,amp_end-amp_start,primerset['PRIMER_LEFT_SEQUENCE'],primerset['PRIMER_RIGHT_SEQUENCE'], ref_melt_Tm,var_melt_Tm,abs(ref_melt_Tm-var_melt_Tm)#, amp_seq.tostring()[amp_start:amp_end+1], mutamp_seq.tostring()[amp_start:amp_end+1] - -in_gff_handle.close() -in_seq_handle.close() +my_args.gff_file.close() +my_args.in_file.close() +