Mercurial > repos > siyuan > prada
comparison pyPRADA_1.2/prada-guess-if @ 0:acc2ca1a3ba4
Uploaded
author | siyuan |
---|---|
date | Thu, 20 Feb 2014 00:44:58 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:acc2ca1a3ba4 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 #GUESS-if is to find abnormal intragenic fusions. | |
4 #It is an extension of the GUESS-ft method. | |
5 | |
6 import pysam | |
7 import subprocess | |
8 import os | |
9 import time | |
10 import sys | |
11 import bioclass | |
12 import ioprada | |
13 | |
14 args=sys.argv | |
15 | |
16 #Get all parameters | |
17 help_menu='''\nUsage: prada-guess-if Gene -conf xx.txt -inputbam X -mm 1 -minmapq 30 -junL X -outdir ./ -unmap X\n | |
18 **Parameters**: | |
19 -conf the configure file. see prada-fusion -conf for details | |
20 -inputbam the input bam file | |
21 -mm number of mismatch allowed | |
22 -minmapq mininum mapping quality for reads to be used in fusion finding | |
23 -junL length of exons to be used for junctions. see prada-fusion | |
24 -outdir output directory | |
25 -unmap the numapped reads. useful if user need to run guess-ft multiple times | |
26 ''' | |
27 | |
28 if '-h' in args or '-help' in args or len(args)==1: | |
29 print help_menu | |
30 sys.exit(0) | |
31 | |
32 ######################################################################### | |
33 target=args[1] | |
34 if '-inputbam' not in args: | |
35 sys.exit('Input BAM needed') | |
36 else: | |
37 i=args.index('-inputbam') | |
38 sourcefile=args[i+1] | |
39 if '-outdir' not in args: | |
40 outdir='./' | |
41 else: | |
42 i=args.index('-outdir') | |
43 outdir=args[i+1] | |
44 if not os.path.exists(outdir): | |
45 os.mkdir(outdir) | |
46 if '-unmap' not in args: | |
47 unmapbam='%s/one.end.unmapped.bam'%outdir | |
48 extract_mask=1 | |
49 else: | |
50 i=args.index('-unmap') | |
51 unmapbam=args[i+1] | |
52 extract_mask=0 | |
53 | |
54 if '-mm' not in args: | |
55 mm=1 | |
56 else: | |
57 i=args.index('-mm') | |
58 mm=int(args[i+1]) | |
59 | |
60 #minimum mapping quality for reads as fusion evidences | |
61 if '-minmapq' not in args: | |
62 minmapq=30 | |
63 else: | |
64 i=args.index('-minmapq') | |
65 minmapq=int(args[i+1]) | |
66 | |
67 if '-junL' not in args: | |
68 sys.exit('-junL needed') | |
69 else: | |
70 i=args.index('-junL') | |
71 junL=int(args[i+1]) | |
72 | |
73 prada_path=os.path.dirname(os.path.abspath(__file__)) #### | |
74 ref_search_path=[prada_path,os.getcwd()] #search path for ref file if not specified in command line | |
75 | |
76 if '-conf' in args: | |
77 i=args.index('-conf') | |
78 reffile=args[i+1] | |
79 if os.path.exists(reffile): | |
80 pass | |
81 else: | |
82 for pth in ref_search_path: | |
83 new_reffile='%s/%s'%(pth, os.path.basename(reffile)) | |
84 if os.path.exists(new_reffile): | |
85 reffile=new_reffile | |
86 break | |
87 else: | |
88 sys.exit('ERROR: ref file %s not found'%reffile) | |
89 else: | |
90 reffile='%s/conf.txt'%prada_path | |
91 if not os.path.exists(reffile): | |
92 sys.exit('ERROR: No default conf.txt found and none specified') | |
93 | |
94 #reference files | |
95 refdict=ioprada.read_conf(reffile) | |
96 ref_anno=refdict['--REF--']['ref_anno'] | |
97 ref_map=refdict['--REF--']['ref_map'] | |
98 ref_fasta=refdict['--REF--']['ref_fasta'] | |
99 featurefile=refdict['--REF--']['feature_file'] | |
100 | |
101 samtools='%s/tools/samtools-0.1.16/samtools'%prada_path | |
102 bwa='%s/tools/bwa-0.5.7-mh/bwa'%prada_path | |
103 | |
104 ######################################################################### | |
105 print 'GUESS-if start: %s'%time.ctime() | |
106 print 'CMD: %s'%('\t'.join(args)) | |
107 | |
108 ##get gene position information | |
109 gobj=ioprada.read_feature_genes(featurefile,target)[0] | |
110 if gobj is None: | |
111 sys.exit('%s not found'%target) | |
112 | |
113 gchr=gobj.strand | |
114 gchr_rev=True if gchr=='-1' else False | |
115 | |
116 #Generate unmapped reads. | |
117 if extract_mask: | |
118 cmd='%s view -b -f 4 -F 8 %s > %s'%(samtools,sourcefile,unmapbam) | |
119 cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True) | |
120 while True: | |
121 if cmdout.poll() is None: | |
122 print 'Extracting unmapped reads...' | |
123 time.sleep(120) | |
124 pass | |
125 if cmdout.poll()==0: | |
126 print 'Extracted unmapped reads' | |
127 break | |
128 if cmdout.poll() is not None and cmdout.poll() != 0: | |
129 raise Exception('Error extracting unmapped reads') | |
130 | |
131 | |
132 #Generate junction db | |
133 juncfile='%s.junction.fasta'%target | |
134 cmd='perl %s/make_intragenic_junctions.pl %s %s %s %s %s %d > %s/%s'%(prada_path,target,target,ref_anno,ref_map,ref_fasta,junL,outdir,juncfile) | |
135 cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True) | |
136 while True: | |
137 if cmdout.poll() is None: | |
138 print 'Generating junction db. Waiting...' | |
139 time.sleep(20) | |
140 pass | |
141 if cmdout.poll()==0: | |
142 print 'Generated Junction DB.' | |
143 break | |
144 if cmdout.poll() is not None and cmdout.poll() != 0: | |
145 raise Exception('Error generated Junction DB.') | |
146 | |
147 #scan BAM file for mapping reads. | |
148 samfile=pysam.Samfile(sourcefile,'rb') | |
149 reads_se=[] | |
150 reads_as=[] | |
151 for alignedread in samfile.fetch(gobj.chr,gobj.start-1,gobj.end): | |
152 if alignedread.mapq >= minmapq: | |
153 mmf=[x[1] for x in alignedread.tags if x[0]=='NM'][0] | |
154 if mmf <= mm: | |
155 if alignedread.is_reverse==gchr_rev: #sense | |
156 reads_se.append(alignedread) | |
157 else: #anti-sense | |
158 reads_as.append(alignedread) | |
159 samfile.close() | |
160 | |
161 seonly,asonly=[],[] | |
162 for rd in reads_se: | |
163 if rd.mate_is_unmapped: | |
164 seonly.append(rd) | |
165 for rd in reads_as: | |
166 if rd.mate_is_unmapped: | |
167 asonly.append(rd) | |
168 seunmap=[x.qname for x in seonly] | |
169 asunmap=[x.qname for x in asonly] | |
170 | |
171 #find read sequences | |
172 print 'Extracting unmapped read sequences.' | |
173 print 'mate unmapped reads for sense strand:',len(seonly) | |
174 print 'mate unmapped reads for antisense strand:',len(asonly) | |
175 samfile=pysam.Samfile(unmapbam,'rb') | |
176 resfq_a=open('%s/%s_se_unmap.fq'%(outdir,target),'w') | |
177 resfq_b=open('%s/%s_as_unmap.fq'%(outdir,target),'w') | |
178 for item in samfile: | |
179 if item.qname in seunmap: | |
180 resfq_a.write('@%s\n'%item.qname) | |
181 resfq_a.write('%s\n'%item.seq) | |
182 resfq_a.write('+\n') | |
183 resfq_a.write('%s\n'%item.qual) | |
184 if item.qname in asunmap: | |
185 resfq_b.write('@%s\n'%item.qname) | |
186 resfq_b.write('%s\n'%item.seq) | |
187 resfq_b.write('+\n') | |
188 resfq_b.write('%s\n'%item.qual) | |
189 resfq_a.close() | |
190 resfq_b.close() | |
191 samfile.close() | |
192 | |
193 ##indexing junction db | |
194 print 'Aligning reads to junction db' | |
195 cmd='%s index %s/%s'%(bwa,outdir,juncfile) | |
196 cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True) | |
197 while True: | |
198 if cmdout.poll() is None: | |
199 time.sleep(3) | |
200 pass | |
201 if cmdout.poll()==0: | |
202 print 'Junction DB indexed.' | |
203 break | |
204 if cmdout.poll() is not None and cmdout.poll() != 0: | |
205 raise Exception('Error building junction db index') | |
206 | |
207 taga='%s_se'%target | |
208 tagb='%s_as'%target | |
209 for rs in [taga,tagb]: | |
210 cmd='%s aln -n %d -R 100 %s/%s %s/%s_unmap.fq > %s/%s_unmap.sai'%(bwa,mm,outdir,juncfile,outdir,rs,outdir,rs) | |
211 cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True) | |
212 while True: | |
213 if cmdout.poll() is None: | |
214 time.sleep(5) | |
215 pass | |
216 if cmdout.poll()==0: | |
217 print 'Aligned unmapped reads group %s'%rs | |
218 break | |
219 if cmdout.poll() is not None and cmdout.poll() != 0: | |
220 raise Exception('Error aligning unmapped reads for group %s'%rs) | |
221 | |
222 cmd='%s samse -n 1000 %s/%s %s/%s_unmap.sai %s/%s_unmap.fq > %s/%s_unmap.sam'%(bwa,outdir,juncfile,outdir,rs,outdir,rs,outdir,rs) | |
223 cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True) | |
224 while True: | |
225 if cmdout.poll() is None: | |
226 time.sleep(2) | |
227 pass | |
228 if cmdout.poll()==0: | |
229 print 'Converting to sam for group %s'%rs | |
230 break | |
231 if cmdout.poll() is not None and cmdout.poll() != 0: | |
232 raise Exception('Error converting to sam for group %s'%rs) | |
233 | |
234 qualrd_a=[] | |
235 junc_a=[] | |
236 samfile=pysam.Samfile('%s/%s_unmap.sam'%(outdir,taga),'r') | |
237 for rd in samfile: | |
238 if not rd.is_unmapped and rd.is_reverse: | |
239 qualrd_a.append(rd) | |
240 junc_a.append(samfile.getrname(rd.tid)) | |
241 samfile.close() | |
242 qualrd_b=[] | |
243 junc_b=[] | |
244 samfile=pysam.Samfile('%s/%s_unmap.sam'%(outdir,tagb),'r') | |
245 for rd in samfile: | |
246 if not rd.is_unmapped and not rd.is_reverse: | |
247 qualrd_b.append(rd) | |
248 junc_b.append(samfile.getrname(rd.tid)) | |
249 samfile.close() | |
250 | |
251 junc_span=[] | |
252 junc_span.extend(qualrd_a) | |
253 junc_span.extend(qualrd_b) | |
254 | |
255 junc_name=[] | |
256 junc_name.extend(junc_a) | |
257 junc_name.extend(junc_b) | |
258 | |
259 #Generate a summary report | |
260 sumfile=open('%s/%s.GUESS-IF.summary.txt'%(outdir,target),'w') | |
261 sumfile.write('%s\n'%(target)) | |
262 sumfile.write('\n') | |
263 sumfile.write('>spanning\n') | |
264 for i in range(len(junc_span)): | |
265 rd=junc_span[i] | |
266 jname=junc_name[i] | |
267 mm_j=[x[1] for x in rd.tags if x[0]=='NM'][0] | |
268 ss='%s\t%s.mm%d'%(rd.qname,jname,mm_j) | |
269 sumfile.write('%s\n'%ss) | |
270 | |
271 sumfile.write('\n') | |
272 sumfile.write('>junction\n') | |
273 juncol=[] | |
274 for item in set(junc_name): | |
275 nn=junc_name.count(item) | |
276 juncol.append([item,nn]) | |
277 juncol=sorted(juncol,key=lambda x:x[1],reverse=True) | |
278 for item in juncol: | |
279 sumfile.write('%s\t%s\n'%(item[0],item[1])) | |
280 | |
281 sumfile.write('\n') | |
282 sumfile.write('>summary\n') | |
283 sumfile.write('Number of Fusion Reads = %d\n'%len(junc_span)) | |
284 sumfile.write('Number of Distinct Junctions = %d\n'%len(set(junc_name))) | |
285 | |
286 sumfile.close() | |
287 print 'Done: %s'%time.ctime() | |
288 |