Mercurial > repos > siyuan > prada
comparison pyPRADA_1.2/prada-homology @ 0:acc2ca1a3ba4
Uploaded
author | siyuan |
---|---|
date | Thu, 20 Feb 2014 00:44:58 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:acc2ca1a3ba4 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 #This script is part of the pyPRADA package. It is used to calculate sequence similarity of genes. | |
4 #In practice, it finds longest transcripts and uses blastn to compare the sequences. | |
5 #It is used to filter out fusion false positives like family genes etc. | |
6 #A temp folder is generated to save transcript fasta files. | |
7 | |
8 import os | |
9 import re | |
10 import sys | |
11 import time | |
12 import subprocess | |
13 import bioclass | |
14 from Bio import SeqIO | |
15 import ioprada | |
16 | |
17 ###################################################################################### | |
18 help_menu='''\nUsage: prada-homology -conf xx.txt -i inputfile -o outputfile -tmpdir XXX | |
19 **parameters** | |
20 -conf see prada-fusion -conf for details | |
21 -i a tab/space delimited two-column file --- gene1 gene2 | |
22 -o output file | |
23 -tmpdir tmporary directory that saves fasta files | |
24 ''' | |
25 | |
26 args=sys.argv | |
27 | |
28 if '-h' in args or '-help' in args or len(args)==1: | |
29 print help_menu | |
30 sys.exit(0) | |
31 | |
32 if '-i' not in sys.argv: | |
33 sys.exit('Input file needed') | |
34 else: | |
35 i=sys.argv.index('-i') | |
36 datafilename=sys.argv[i+1] | |
37 if '-o' not in sys.argv: | |
38 sys.exit('Output file needed') | |
39 else: | |
40 i=sys.argv.index('-o') | |
41 outfilename=sys.argv[i+1] | |
42 if '-tmpdir' not in sys.argv: | |
43 sys.exit('TMP dir needed') | |
44 else: | |
45 i=sys.argv.index('-tmpdir') | |
46 blastseq_tmp_dir=sys.argv[i+1] | |
47 if os.path.exists(blastseq_tmp_dir): | |
48 print 'Warning: tmpdir %s exists!'%blastseq_tmp_dir | |
49 else: | |
50 cmd='mkdir %s'%blastseq_tmp_dir | |
51 cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True) | |
52 | |
53 prada_path=os.path.dirname(os.path.abspath(__file__)) #### | |
54 ref_search_path=[prada_path,os.getcwd()] #search path for ref file if not specified in command line | |
55 | |
56 if '-conf' in args: | |
57 i=args.index('-conf') | |
58 reffile=args[i+1] | |
59 if os.path.exists(reffile): | |
60 pass | |
61 else: | |
62 for pth in ref_search_path: | |
63 new_reffile='%s/%s'%(pth, os.path.basename(reffile)) | |
64 if os.path.exists(new_reffile): | |
65 reffile=new_reffile | |
66 break | |
67 else: | |
68 sys.exit('ERROR: conf file %s not found'%reffile) | |
69 else: | |
70 reffile='%s/conf.txt'%prada_path | |
71 if not os.path.exists(reffile): | |
72 sys.exit('ERROR: No default conf.txt found and none specified') | |
73 | |
74 #Now print all input parameters | |
75 print 'CMD: %s'%('\t'.join(args)) | |
76 | |
77 #reference files | |
78 refdict=ioprada.read_conf(reffile) | |
79 featurefile=refdict['--REF--']['feature_file'] | |
80 refdb=refdict['--REF--']['tx_seq_file'] | |
81 | |
82 blastn='%s/tools/blastn'%prada_path | |
83 | |
84 #################Here We Go############################################# | |
85 ##Read information from raw file | |
86 print 'Collecting gene/Transcript info ...' | |
87 #call functions in ioprada module // | |
88 txdb,genedb=ioprada.read_feature(featurefile,verbose=False) | |
89 | |
90 ################################################################## | |
91 ##Get all candidate gene pairs | |
92 infile=open(datafilename) | |
93 candidates={} | |
94 for line in infile: | |
95 if line.strip(): | |
96 info=line.split() | |
97 geneA,geneB=info[0],info[1] | |
98 key='%s_%s'%(geneA,geneB) | |
99 candidates[key]='' | |
100 infile.close() | |
101 print '%d unique gene pairs collected'%len(candidates) | |
102 | |
103 ##Select the transcript for each gene | |
104 selecttranscript={} | |
105 for gene in genedb: | |
106 txs=genedb[gene].transcript | |
107 stx=txs[0] | |
108 initlen=stx.length | |
109 for tx in txs: | |
110 if tx.length >= initlen: | |
111 stx=tx | |
112 initlen=stx.length | |
113 selecttranscript[gene]=stx.name | |
114 print 'Selected longest transcript for each gene.' | |
115 | |
116 ##Get all needed sequences | |
117 allpartners=set() | |
118 for item in candidates: | |
119 sset=set(item.split('_')) | |
120 allpartners=allpartners.union(sset) | |
121 | |
122 #################################################################### | |
123 def seqblast(seqA,seqB): | |
124 '''seqA,seqB:fasta files''' | |
125 global blastn | |
126 cmdstr='%s -task=blastn -subject %s -query %s -outfmt 6'%(blastn,seqA,seqB) | |
127 cmdout=subprocess.Popen(cmdstr.split(),stdout=subprocess.PIPE,stderr=subprocess.STDOUT) | |
128 result=cmdout.stdout.read() | |
129 best_align=result.split('\n')[0] | |
130 if best_align=='': | |
131 return None | |
132 else: | |
133 info=best_align.split('\t') | |
134 identity=info[2].strip() | |
135 align_len=info[3].strip() | |
136 evalue=info[10].strip() | |
137 bit_score=info[11].strip() | |
138 return [identity,align_len,evalue,bit_score] | |
139 | |
140 presenttxs=[] #tx that is present in our annotation | |
141 absent=[] #tx that is not in our annotation | |
142 for gene in allpartners: | |
143 if selecttranscript.has_key(gene): | |
144 presenttxs.append(selecttranscript[gene]) | |
145 else: | |
146 absent.append(gene) | |
147 | |
148 for seq_record in SeqIO.parse(refdb,'fasta'): | |
149 sid=seq_record.id | |
150 seq=seq_record.seq | |
151 if sid in presenttxs: | |
152 g=txdb[sid].gene | |
153 fastafile=open('%s/%s.fasta'%(blastseq_tmp_dir,g),'w') | |
154 SeqIO.write(seq_record,fastafile,'fasta') | |
155 fastafile.close() | |
156 | |
157 for gp in candidates: | |
158 geneA,geneB=gp.split('_') | |
159 if geneA in absent or geneB in absent: | |
160 candidates[gp]=['NA']*6 | |
161 else: | |
162 gaseq='%s/%s.fasta'%(blastseq_tmp_dir,geneA) | |
163 gaobj=SeqIO.parse(gaseq,'fasta').next() | |
164 gbseq='%s/%s.fasta'%(blastseq_tmp_dir,geneB) | |
165 gbobj=SeqIO.parse(gbseq,'fasta').next() | |
166 ga_len,gb_len=str(len(gaobj.seq)),str(len(gbobj.seq)) | |
167 print 'Comparing %s %s'%(geneA,geneB) | |
168 a=seqblast(gaseq,gbseq) | |
169 if a==None: | |
170 candidates[gp]=[ga_len,gb_len,'NA','NA','100','0'] | |
171 else: | |
172 a[0:0]=[ga_len,gb_len] | |
173 candidates[gp]=a | |
174 | |
175 | |
176 infile=open(datafilename) | |
177 outfile=open(outfilename,'w') | |
178 header=['#Gene1','Gene2','Transcript1_Len','Transcript2_Len','Identity','Align_Len','Evalue','BitScore'] | |
179 outfile.write('%s\n'%('\t'.join(header))) | |
180 for line in infile: | |
181 if line.strip(): | |
182 info=line.split() | |
183 geneA,geneB=info[0],info[1] | |
184 key='%s_%s'%(geneA,geneB) | |
185 vv=candidates[key] | |
186 outfile.write('%s\t%s\t%s\n'%(geneA,geneB,'\t'.join(vv))) | |
187 outfile.close() | |
188 infile.close() | |
189 | |
190 print 'Homolgy test done!' |