Mercurial > repos > siyuan > prada
comparison pyPRADA_1.2/prada-guess-if @ 0:acc2ca1a3ba4
Uploaded
| author | siyuan |
|---|---|
| date | Thu, 20 Feb 2014 00:44:58 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:acc2ca1a3ba4 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 #GUESS-if is to find abnormal intragenic fusions. | |
| 4 #It is an extension of the GUESS-ft method. | |
| 5 | |
| 6 import pysam | |
| 7 import subprocess | |
| 8 import os | |
| 9 import time | |
| 10 import sys | |
| 11 import bioclass | |
| 12 import ioprada | |
| 13 | |
| 14 args=sys.argv | |
| 15 | |
| 16 #Get all parameters | |
| 17 help_menu='''\nUsage: prada-guess-if Gene -conf xx.txt -inputbam X -mm 1 -minmapq 30 -junL X -outdir ./ -unmap X\n | |
| 18 **Parameters**: | |
| 19 -conf the configure file. see prada-fusion -conf for details | |
| 20 -inputbam the input bam file | |
| 21 -mm number of mismatch allowed | |
| 22 -minmapq mininum mapping quality for reads to be used in fusion finding | |
| 23 -junL length of exons to be used for junctions. see prada-fusion | |
| 24 -outdir output directory | |
| 25 -unmap the numapped reads. useful if user need to run guess-ft multiple times | |
| 26 ''' | |
| 27 | |
| 28 if '-h' in args or '-help' in args or len(args)==1: | |
| 29 print help_menu | |
| 30 sys.exit(0) | |
| 31 | |
| 32 ######################################################################### | |
| 33 target=args[1] | |
| 34 if '-inputbam' not in args: | |
| 35 sys.exit('Input BAM needed') | |
| 36 else: | |
| 37 i=args.index('-inputbam') | |
| 38 sourcefile=args[i+1] | |
| 39 if '-outdir' not in args: | |
| 40 outdir='./' | |
| 41 else: | |
| 42 i=args.index('-outdir') | |
| 43 outdir=args[i+1] | |
| 44 if not os.path.exists(outdir): | |
| 45 os.mkdir(outdir) | |
| 46 if '-unmap' not in args: | |
| 47 unmapbam='%s/one.end.unmapped.bam'%outdir | |
| 48 extract_mask=1 | |
| 49 else: | |
| 50 i=args.index('-unmap') | |
| 51 unmapbam=args[i+1] | |
| 52 extract_mask=0 | |
| 53 | |
| 54 if '-mm' not in args: | |
| 55 mm=1 | |
| 56 else: | |
| 57 i=args.index('-mm') | |
| 58 mm=int(args[i+1]) | |
| 59 | |
| 60 #minimum mapping quality for reads as fusion evidences | |
| 61 if '-minmapq' not in args: | |
| 62 minmapq=30 | |
| 63 else: | |
| 64 i=args.index('-minmapq') | |
| 65 minmapq=int(args[i+1]) | |
| 66 | |
| 67 if '-junL' not in args: | |
| 68 sys.exit('-junL needed') | |
| 69 else: | |
| 70 i=args.index('-junL') | |
| 71 junL=int(args[i+1]) | |
| 72 | |
| 73 prada_path=os.path.dirname(os.path.abspath(__file__)) #### | |
| 74 ref_search_path=[prada_path,os.getcwd()] #search path for ref file if not specified in command line | |
| 75 | |
| 76 if '-conf' in args: | |
| 77 i=args.index('-conf') | |
| 78 reffile=args[i+1] | |
| 79 if os.path.exists(reffile): | |
| 80 pass | |
| 81 else: | |
| 82 for pth in ref_search_path: | |
| 83 new_reffile='%s/%s'%(pth, os.path.basename(reffile)) | |
| 84 if os.path.exists(new_reffile): | |
| 85 reffile=new_reffile | |
| 86 break | |
| 87 else: | |
| 88 sys.exit('ERROR: ref file %s not found'%reffile) | |
| 89 else: | |
| 90 reffile='%s/conf.txt'%prada_path | |
| 91 if not os.path.exists(reffile): | |
| 92 sys.exit('ERROR: No default conf.txt found and none specified') | |
| 93 | |
| 94 #reference files | |
| 95 refdict=ioprada.read_conf(reffile) | |
| 96 ref_anno=refdict['--REF--']['ref_anno'] | |
| 97 ref_map=refdict['--REF--']['ref_map'] | |
| 98 ref_fasta=refdict['--REF--']['ref_fasta'] | |
| 99 featurefile=refdict['--REF--']['feature_file'] | |
| 100 | |
| 101 samtools='%s/tools/samtools-0.1.16/samtools'%prada_path | |
| 102 bwa='%s/tools/bwa-0.5.7-mh/bwa'%prada_path | |
| 103 | |
| 104 ######################################################################### | |
| 105 print 'GUESS-if start: %s'%time.ctime() | |
| 106 print 'CMD: %s'%('\t'.join(args)) | |
| 107 | |
| 108 ##get gene position information | |
| 109 gobj=ioprada.read_feature_genes(featurefile,target)[0] | |
| 110 if gobj is None: | |
| 111 sys.exit('%s not found'%target) | |
| 112 | |
| 113 gchr=gobj.strand | |
| 114 gchr_rev=True if gchr=='-1' else False | |
| 115 | |
| 116 #Generate unmapped reads. | |
| 117 if extract_mask: | |
| 118 cmd='%s view -b -f 4 -F 8 %s > %s'%(samtools,sourcefile,unmapbam) | |
| 119 cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True) | |
| 120 while True: | |
| 121 if cmdout.poll() is None: | |
| 122 print 'Extracting unmapped reads...' | |
| 123 time.sleep(120) | |
| 124 pass | |
| 125 if cmdout.poll()==0: | |
| 126 print 'Extracted unmapped reads' | |
| 127 break | |
| 128 if cmdout.poll() is not None and cmdout.poll() != 0: | |
| 129 raise Exception('Error extracting unmapped reads') | |
| 130 | |
| 131 | |
| 132 #Generate junction db | |
| 133 juncfile='%s.junction.fasta'%target | |
| 134 cmd='perl %s/make_intragenic_junctions.pl %s %s %s %s %s %d > %s/%s'%(prada_path,target,target,ref_anno,ref_map,ref_fasta,junL,outdir,juncfile) | |
| 135 cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True) | |
| 136 while True: | |
| 137 if cmdout.poll() is None: | |
| 138 print 'Generating junction db. Waiting...' | |
| 139 time.sleep(20) | |
| 140 pass | |
| 141 if cmdout.poll()==0: | |
| 142 print 'Generated Junction DB.' | |
| 143 break | |
| 144 if cmdout.poll() is not None and cmdout.poll() != 0: | |
| 145 raise Exception('Error generated Junction DB.') | |
| 146 | |
| 147 #scan BAM file for mapping reads. | |
| 148 samfile=pysam.Samfile(sourcefile,'rb') | |
| 149 reads_se=[] | |
| 150 reads_as=[] | |
| 151 for alignedread in samfile.fetch(gobj.chr,gobj.start-1,gobj.end): | |
| 152 if alignedread.mapq >= minmapq: | |
| 153 mmf=[x[1] for x in alignedread.tags if x[0]=='NM'][0] | |
| 154 if mmf <= mm: | |
| 155 if alignedread.is_reverse==gchr_rev: #sense | |
| 156 reads_se.append(alignedread) | |
| 157 else: #anti-sense | |
| 158 reads_as.append(alignedread) | |
| 159 samfile.close() | |
| 160 | |
| 161 seonly,asonly=[],[] | |
| 162 for rd in reads_se: | |
| 163 if rd.mate_is_unmapped: | |
| 164 seonly.append(rd) | |
| 165 for rd in reads_as: | |
| 166 if rd.mate_is_unmapped: | |
| 167 asonly.append(rd) | |
| 168 seunmap=[x.qname for x in seonly] | |
| 169 asunmap=[x.qname for x in asonly] | |
| 170 | |
| 171 #find read sequences | |
| 172 print 'Extracting unmapped read sequences.' | |
| 173 print 'mate unmapped reads for sense strand:',len(seonly) | |
| 174 print 'mate unmapped reads for antisense strand:',len(asonly) | |
| 175 samfile=pysam.Samfile(unmapbam,'rb') | |
| 176 resfq_a=open('%s/%s_se_unmap.fq'%(outdir,target),'w') | |
| 177 resfq_b=open('%s/%s_as_unmap.fq'%(outdir,target),'w') | |
| 178 for item in samfile: | |
| 179 if item.qname in seunmap: | |
| 180 resfq_a.write('@%s\n'%item.qname) | |
| 181 resfq_a.write('%s\n'%item.seq) | |
| 182 resfq_a.write('+\n') | |
| 183 resfq_a.write('%s\n'%item.qual) | |
| 184 if item.qname in asunmap: | |
| 185 resfq_b.write('@%s\n'%item.qname) | |
| 186 resfq_b.write('%s\n'%item.seq) | |
| 187 resfq_b.write('+\n') | |
| 188 resfq_b.write('%s\n'%item.qual) | |
| 189 resfq_a.close() | |
| 190 resfq_b.close() | |
| 191 samfile.close() | |
| 192 | |
| 193 ##indexing junction db | |
| 194 print 'Aligning reads to junction db' | |
| 195 cmd='%s index %s/%s'%(bwa,outdir,juncfile) | |
| 196 cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True) | |
| 197 while True: | |
| 198 if cmdout.poll() is None: | |
| 199 time.sleep(3) | |
| 200 pass | |
| 201 if cmdout.poll()==0: | |
| 202 print 'Junction DB indexed.' | |
| 203 break | |
| 204 if cmdout.poll() is not None and cmdout.poll() != 0: | |
| 205 raise Exception('Error building junction db index') | |
| 206 | |
| 207 taga='%s_se'%target | |
| 208 tagb='%s_as'%target | |
| 209 for rs in [taga,tagb]: | |
| 210 cmd='%s aln -n %d -R 100 %s/%s %s/%s_unmap.fq > %s/%s_unmap.sai'%(bwa,mm,outdir,juncfile,outdir,rs,outdir,rs) | |
| 211 cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True) | |
| 212 while True: | |
| 213 if cmdout.poll() is None: | |
| 214 time.sleep(5) | |
| 215 pass | |
| 216 if cmdout.poll()==0: | |
| 217 print 'Aligned unmapped reads group %s'%rs | |
| 218 break | |
| 219 if cmdout.poll() is not None and cmdout.poll() != 0: | |
| 220 raise Exception('Error aligning unmapped reads for group %s'%rs) | |
| 221 | |
| 222 cmd='%s samse -n 1000 %s/%s %s/%s_unmap.sai %s/%s_unmap.fq > %s/%s_unmap.sam'%(bwa,outdir,juncfile,outdir,rs,outdir,rs,outdir,rs) | |
| 223 cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True) | |
| 224 while True: | |
| 225 if cmdout.poll() is None: | |
| 226 time.sleep(2) | |
| 227 pass | |
| 228 if cmdout.poll()==0: | |
| 229 print 'Converting to sam for group %s'%rs | |
| 230 break | |
| 231 if cmdout.poll() is not None and cmdout.poll() != 0: | |
| 232 raise Exception('Error converting to sam for group %s'%rs) | |
| 233 | |
| 234 qualrd_a=[] | |
| 235 junc_a=[] | |
| 236 samfile=pysam.Samfile('%s/%s_unmap.sam'%(outdir,taga),'r') | |
| 237 for rd in samfile: | |
| 238 if not rd.is_unmapped and rd.is_reverse: | |
| 239 qualrd_a.append(rd) | |
| 240 junc_a.append(samfile.getrname(rd.tid)) | |
| 241 samfile.close() | |
| 242 qualrd_b=[] | |
| 243 junc_b=[] | |
| 244 samfile=pysam.Samfile('%s/%s_unmap.sam'%(outdir,tagb),'r') | |
| 245 for rd in samfile: | |
| 246 if not rd.is_unmapped and not rd.is_reverse: | |
| 247 qualrd_b.append(rd) | |
| 248 junc_b.append(samfile.getrname(rd.tid)) | |
| 249 samfile.close() | |
| 250 | |
| 251 junc_span=[] | |
| 252 junc_span.extend(qualrd_a) | |
| 253 junc_span.extend(qualrd_b) | |
| 254 | |
| 255 junc_name=[] | |
| 256 junc_name.extend(junc_a) | |
| 257 junc_name.extend(junc_b) | |
| 258 | |
| 259 #Generate a summary report | |
| 260 sumfile=open('%s/%s.GUESS-IF.summary.txt'%(outdir,target),'w') | |
| 261 sumfile.write('%s\n'%(target)) | |
| 262 sumfile.write('\n') | |
| 263 sumfile.write('>spanning\n') | |
| 264 for i in range(len(junc_span)): | |
| 265 rd=junc_span[i] | |
| 266 jname=junc_name[i] | |
| 267 mm_j=[x[1] for x in rd.tags if x[0]=='NM'][0] | |
| 268 ss='%s\t%s.mm%d'%(rd.qname,jname,mm_j) | |
| 269 sumfile.write('%s\n'%ss) | |
| 270 | |
| 271 sumfile.write('\n') | |
| 272 sumfile.write('>junction\n') | |
| 273 juncol=[] | |
| 274 for item in set(junc_name): | |
| 275 nn=junc_name.count(item) | |
| 276 juncol.append([item,nn]) | |
| 277 juncol=sorted(juncol,key=lambda x:x[1],reverse=True) | |
| 278 for item in juncol: | |
| 279 sumfile.write('%s\t%s\n'%(item[0],item[1])) | |
| 280 | |
| 281 sumfile.write('\n') | |
| 282 sumfile.write('>summary\n') | |
| 283 sumfile.write('Number of Fusion Reads = %d\n'%len(junc_span)) | |
| 284 sumfile.write('Number of Distinct Junctions = %d\n'%len(set(junc_name))) | |
| 285 | |
| 286 sumfile.close() | |
| 287 print 'Done: %s'%time.ctime() | |
| 288 |
