Mercurial > repos > john-mccallum > pcr_markers
comparison design_primers.py @ 0:21053f7f9ed1 draft
First upload of PCR Marker tools
author | john-mccallum |
---|---|
date | Thu, 14 Jun 2012 19:29:26 -0400 |
parents | |
children | b321e0517be3 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:21053f7f9ed1 |
---|---|
1 #!/usr/local/bin/python2.6 | |
2 ##design primers to features in multiple sequences | |
3 ##usage: python design_primers.py <fasta-file> <gff file> <file of target IDs> <prod_min_size> <prod_max_size> | |
4 | |
5 | |
6 #Copyright 2012 John McCallum & Leshi Chen | |
7 #New Zealand Institute for Plant and Food Research | |
8 #This program is free software: you can redistribute it and/or modify | |
9 # it under the terms of the GNU General Public License as published by | |
10 # the Free Software Foundation, either version 3 of the License, or | |
11 # (at your option) any later version. | |
12 # | |
13 # This program is distributed in the hope that it will be useful, | |
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
16 # GNU General Public License for more details. | |
17 # | |
18 # You should have received a copy of the GNU General Public License | |
19 # along with this program. If not, see <http://www.gnu.org/licenses/>. | |
20 | |
21 | |
22 import os | |
23 import StringIO | |
24 import re | |
25 import tempfile | |
26 import subprocess | |
27 import copy | |
28 import sys | |
29 from BCBio import GFF | |
30 from BCBio.GFF import GFFExaminer | |
31 from Bio import SeqIO | |
32 from Bio.Emboss.Applications import Primer3Commandline | |
33 from Bio.Emboss import Primer3 | |
34 | |
35 | |
36 in_file = sys.argv[1] | |
37 gff_file = sys.argv[2] | |
38 target_file = sys.argv[3] | |
39 prod_min_size = int(sys.argv[4]) | |
40 prod_max_size = int(sys.argv[5]) | |
41 | |
42 max_tm_diff=1 ## | |
43 opt_GC_percent=50 ## | |
44 maxpolyx=4 ## | |
45 optimum_length=20 | |
46 ##target is specified in start, end format | |
47 productsizerange = str(prod_min_size) + "," + str(prod_max_size) | |
48 #open input files | |
49 in_seq_handle = open(in_file) | |
50 in_gff_handle = open(gff_file) | |
51 in_target_handle=open(target_file) | |
52 #read target feature IDs into list | |
53 targets=in_target_handle.readlines() | |
54 in_target_handle.close() | |
55 ##and create a hit list of sequences from this | |
56 target_seq_id_list = list(set([line.split(":")[0] for line in targets])) | |
57 ##create iterator returning sequence records | |
58 for myrec in SeqIO.parse(in_seq_handle, "fasta"): | |
59 #check if this sequence is included in the target list | |
60 if myrec.id in target_seq_id_list: | |
61 ##create sequence dictionary so we can add in gff annotations | |
62 seq_dict = {myrec.id : myrec} | |
63 ##just limit to gff annotations for this sequence | |
64 limit_info = dict(gff_id = [ myrec.id ]) | |
65 ##rewind gff filehandle | |
66 in_gff_handle.seek(0) | |
67 ##read annotations into sequence dictionary for this sequence record only | |
68 annotations = [r for r in GFF.parse(in_gff_handle, base_dict=seq_dict,limit_info=limit_info)] | |
69 ##if there are any annotations, then proceed. | |
70 if annotations: | |
71 rec=annotations[0] | |
72 ##iterate over list of target IDs | |
73 for t in targets: | |
74 target_ID = t.strip('\n') | |
75 target_annotations = [f for f in rec.features if f.id == target_ID ] | |
76 if target_annotations: | |
77 mytarget = target_annotations[0] | |
78 #create temporary files | |
79 tempfastaFile = tempfile.mktemp() | |
80 tempproutfile = tempfile.mktemp() | |
81 #just consider slice of sequence in a window of +/- prod_max_size bp | |
82 ##from feature UNLESS feature is close to end | |
83 ##Note that slice is zero-based | |
84 featLocation = mytarget.location.start.position | |
85 if featLocation > prod_max_size: | |
86 slice_start = featLocation - prod_max_size | |
87 featPosition = prod_max_size | |
88 else: | |
89 slice_start = 0 | |
90 featPosition = featLocation | |
91 if (len(rec) - featLocation) < prod_max_size: | |
92 slice_end = len(rec) | |
93 else: | |
94 slice_end = featLocation + prod_max_size | |
95 ###grab slice of sequence fom this window. | |
96 targetRec = rec[slice_start:slice_end] | |
97 matching_feature = [f for f in targetRec.features if f.id == mytarget.id] | |
98 if matching_feature: | |
99 target_feat = matching_feature[0] | |
100 if target_feat.location.start.position == 0: | |
101 target_feat.location.start.position = 1 | |
102 #we get the mask features by removing the target...all features are masked as just using snp and indels | |
103 ##a smarter filter could be added | |
104 ##note use of list copy to avoid possible side-effects | |
105 exclude_feat = list(targetRec.features) | |
106 exclude_feat.remove(target_feat) | |
107 ##print'targetRec.features', targetRec.features ##for debug | |
108 mask_str=map(lambda f: str(f.location.start.position+1) + "," + str(f.location.end.position + 1) ,exclude_feat) | |
109 #mask_str=map(lambda f: str(f.location).strip('[]'),exclude_feat) | |
110 p3_exclude_str = str(mask_str).replace('\', \'',':') | |
111 p3_target = str(target_feat.location.start.position +1) + "," + str(target_feat.location.end.position +1) | |
112 #write sequence record into template file as fasta | |
113 t_output_handle = open(tempfastaFile, "w") | |
114 SeqIO.write([targetRec], t_output_handle, "fasta") | |
115 t_output_handle.close() | |
116 #create Primer3Commandline() for emboss tool | |
117 primer_cl = Primer3Commandline() | |
118 #set the emboss tool to suppress output as this will make Galaxy think it is error message although it is a message to state run success | |
119 primer_cl.set_parameter("-auto",'1') | |
120 #pass sequence file to emboss | |
121 primer_cl.set_parameter("-sequence",tempfastaFile) | |
122 #add target location | |
123 primer_cl.set_parameter("-target", p3_target) | |
124 ##mask off other features...dumb masking of everything at present, beware | |
125 if (p3_exclude_str != ""): | |
126 primer_cl.set_parameter("-excludedregion", p3_exclude_str) | |
127 #add temporary output file to get the result | |
128 primer_cl.set_parameter("-outfile", tempproutfile) | |
129 #specify maximum different of tm | |
130 primer_cl.set_parameter("-maxdifftm",max_tm_diff ) | |
131 #other useful parameters | |
132 primer_cl.set_parameter("-ogcpercent", opt_GC_percent) | |
133 primer_cl.set_parameter("-opolyxmax", maxpolyx) | |
134 primer_cl.set_parameter("-osize", optimum_length) | |
135 #set product size range | |
136 primer_cl.set_parameter("-prange", productsizerange) | |
137 #using python subprocess method to run emboss command line programe with the parameters given | |
138 fnull = open(os.devnull, 'w') | |
139 result=subprocess.check_call(str(primer_cl),shell=True ,stdout = fnull, stderr = fnull) | |
140 #read temporary outputfile | |
141 handle = open(tempproutfile) | |
142 record = Primer3.read(handle) | |
143 ##just return first set, if there is one | |
144 if len(record.primers) > 0: | |
145 primer= record.primers[0] | |
146 outputstr=[mytarget.id, primer.forward_seq,primer.reverse_seq,primer.size] | |
147 else: | |
148 outputstr=[mytarget.id,"NONE","NONE","NONE"] | |
149 print('\t'.join(map(str,outputstr))) | |
150 | |
151 | |
152 in_gff_handle.close() | |
153 in_seq_handle.close() |