Mercurial > repos > vipints > rdiff
comparison rDiff/src/rdiff_caller.m @ 0:0f80a5141704
version 0.3 uploaded
author | vipints |
---|---|
date | Thu, 14 Feb 2013 23:38:36 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:0f80a5141704 |
---|---|
1 function STAT = rDiff_caller(PAR) | |
2 % genes = opt_transcripts_caller(PAR) | |
3 % | |
4 % -- input -- | |
5 % PAR contains | |
6 % CFG: configuration struct | |
7 % genes: struct defining genes with start, stops, exons etc. | |
8 % profile_weights: weights of profile functions | |
9 % intron_dists: distances to closest intron | |
10 % | |
11 % -- output -- | |
12 % genes: struct with additional fields of eg. estimated transcript weights | |
13 addpath ../../tests | |
14 addpath ../../variance/final | |
15 addpath ../../experimental | |
16 | |
17 samtools='/fml/ag-raetsch/home/drewe/svn/projects/RNASeq/difftest/samtools/'; | |
18 CFG = PAR.CFG; | |
19 SUFF_NAME=CFG.SUFF_NAME; | |
20 %load(CFG.genes_path , 'genes'); | |
21 %[genes]=mask_dubl(genes,0); | |
22 | |
23 load(CFG.genes_path , 'genes'); | |
24 [genes]=mask_dubl(genes,0); | |
25 | |
26 %[genes]=remove_transcripts(genes); | |
27 genes=genes(PAR.genes_idx); | |
28 BAM_FILES=CFG.BAM_FILES; | |
29 JB_NR=CFG.JB_NR; | |
30 | |
31 | |
32 | |
33 %clear PAR; | |
34 | |
35 %%%% paths | |
36 addpath(CFG.paths); | |
37 addpath('/fml/ag-raetsch/home/drewe/svn/projects/RNASeq/difftest/variance/nbin'); | |
38 addpath('/fml/ag-raetsch/home/drewe/svn/projects/RNASeq/difftest/experiments/'); | |
39 addpath('/fml/ag-raetsch/home/drewe/svn/projects/RNASeq/difftest/tests/'); | |
40 addpath('/fml/ag-raetsch/home/drewe/svn/projects/RNASeq/difftest/read_utils/'); | |
41 addpath('/fml/ag-raetsch/home/drewe/svn/projects/RNASeq/difftest/tests/sequence_tools/'); | |
42 addpath(genpath('/fml/ag-raetsch/home/drewe/svn/tools/chronux/')) | |
43 addpath('/fml/ag-raetsch/home/drewe/svn/projects/RNASeq/difftest/tests/functions/'); | |
44 | |
45 out_base=CFG.out_base; | |
46 %sim_data_base=CFG.sim_data_base; | |
47 real_data_base=CFG.real_data_base; | |
48 | |
49 expr12_bam=CFG.expr12_bam; | |
50 expr11_bam=CFG.expr11_bam; | |
51 expr22_bam=CFG.expr22_bam; | |
52 expr21_bam=CFG.expr21_bam; | |
53 | |
54 CFG.SEQUENCED_LENGTH=80; | |
55 SEQUENCED_LENGTH=80; | |
56 %make map: gene->file | |
57 | |
58 IDX1=CFG.IDX11; | |
59 IDX2=CFG.IDX22; | |
60 | |
61 %BAM_FILES{IDX1} | |
62 %BAM_FILES{IDX2} | |
63 | |
64 | |
65 %genes=PAR.genes; | |
66 clear PAR | |
67 | |
68 STAT=cell(size(genes)); | |
69 do_save = 0; | |
70 out_base = '/fml/ag-raetsch/home/drewe/svn/projects/RNASeq/difftest/out'; | |
71 save_dir=fullfile(out_base,'cache'); | |
72 | |
73 for i=1:size(genes,2) | |
74 %try | |
75 RESULT=cell(1,7) | |
76 gene = genes(i); | |
77 | |
78 RESULT{7}=JB_NR; | |
79 RESULT{1}=gene.name | |
80 load_only = false; | |
81 | |
82 if or(isempty(gene.exons),gene.stop-gene.start<=SEQUENCED_LENGTH) | |
83 PV=1; | |
84 RESULT{2}={PV,''}; | |
85 RESULT{3}={PV,''}; | |
86 RESULT{4}={PV,''}; | |
87 RESULT{5}=[Inf,Inf]; | |
88 RESULT{6}=[Inf,Inf]; | |
89 RESULT{7}={'empty gene exons'}; | |
90 RESULT{8}={PV,''}; | |
91 continue; | |
92 end | |
93 | |
94 PV1=1; | |
95 for l=1:size(gene.transcripts,2) | |
96 EXONS=gene.exons{l} | |
97 if sum(EXONS(:,2)-EXONS(:,1))<=SEQUENCED_LENGTH | |
98 PV=1; | |
99 RESULT{2}={PV,''}; | |
100 RESULT{3}={PV,''}; | |
101 RESULT{4}={PV,''}; | |
102 RESULT{5}=[Inf,Inf]; | |
103 RESULT{6}=[Inf,Inf]; | |
104 RESULT{7}={'empty gene exons'}; | |
105 RESULT{8}={PV,''}; | |
106 PV1=2; | |
107 | |
108 continue; | |
109 end | |
110 end | |
111 if PV1==2 | |
112 continue; | |
113 end | |
114 gene.name | |
115 | |
116 | |
117 | |
118 %adjust start and stop positions on the negative strand | |
119 if strcmp(gene.strand,'-') | |
120 gene.start=gene.start+1; | |
121 gene.stop=gene.stop+1; | |
122 [reads11,reads12,intron11,intron12] = get_reads_dual2_intron(expr11_bam,expr12_bam,gene,samtools,false,10); | |
123 [reads11,FLAG]=remove_reads_from_other_genes(reads11,gene); | |
124 [reads12,FLAG]=remove_reads_from_other_genes(reads12,gene); | |
125 | |
126 | |
127 [reads21,reads22,intron21,intron22] = get_reads_dual2_intron(expr21_bam,expr22_bam,gene,samtools,false,10); | |
128 [reads21,FLAG]=remove_reads_from_other_genes(reads21,gene); | |
129 [reads22,FLAG]=remove_reads_from_other_genes(reads22,gene); | |
130 gene.start=gene.start-1; | |
131 gene.stop=gene.stop-1; | |
132 else | |
133 [reads11,reads12,intron11,intron12] = get_reads_dual2_intron(expr11_bam,expr12_bam,gene,samtools,false,10); | |
134 [reads11,FLAG]=remove_reads_from_other_genes(reads11,gene); | |
135 [reads12,FLAG]=remove_reads_from_other_genes(reads12,gene); | |
136 | |
137 [reads21,reads22,intron21,intron22] = get_reads_dual2_intron(expr21_bam,expr22_bam,gene,samtools,false,10); | |
138 [reads21,FLAG]=remove_reads_from_other_genes(reads21,gene); | |
139 [reads22,FLAG]=remove_reads_from_other_genes(reads22,gene); | |
140 end | |
141 | |
142 reads11=reads11(sum(reads11,2)>70,:); | |
143 reads12=reads12(sum(reads12,2)>70,:); | |
144 reads21=reads21(sum(reads21,2)>70,:); | |
145 reads22=reads22(sum(reads22,2)>70,:); | |
146 %% TEST with read cliping | |
147 | |
148 READS1={reads11,reads12}; | |
149 READS2={reads21,reads22}; | |
150 reads1=[reads11;reads12]; | |
151 reads2=[reads21;reads22]; | |
152 | |
153 TOTAL_SIZE=(size(reads12,1)+size(reads12,1)+(size(reads21,1)+size(reads22,1))); | |
154 MIN_SIZE=min(size(reads12,1)+size(reads12,1),(size(reads21,1)+size(reads22,1))); | |
155 %keyboard | |
156 | |
157 CLIP=0; | |
158 if 1==0 | |
159 if (size(reads1,1)>0) | |
160 for i=1:size(reads1,1) | |
161 reads1(i,find(reads1(i,:),2,'first'))=0; | |
162 end | |
163 for i=1:size(reads1,1) | |
164 reads1(i,find(reads1(i,:),2,'last'))=0; | |
165 end | |
166 end | |
167 if (size(reads2,1)>0) | |
168 for i=1:size(reads2,1) | |
169 reads2(i,find(reads2(i,:),2,'first'))=0; | |
170 end | |
171 for i=1:size(reads2,1) | |
172 reads2(i,find(reads2(i,:),2,'last'))=0; | |
173 end | |
174 end | |
175 CLIP=4; | |
176 end | |
177 | |
178 % keybloard | |
179 %% END TEST | |
180 | |
181 COUNTER=3; | |
182 FLAG=0; | |
183 for k=1:size(gene.transcripts,2) | |
184 if sum(gene.exons{k}(:,2)-gene.exons{k}(:,1))<75 | |
185 FLAG=1; | |
186 end | |
187 end | |
188 | |
189 if FLAG==1 | |
190 RESULT{2}={'gene shorter than 75 bp'}; | |
191 STAT{i}=RESULT; | |
192 continue | |
193 end | |
194 | |
195 if 1==0 | |
196 | |
197 for V_COUNTER=2%:length(CFG.VARIANCES1) | |
198 if (TOTAL_SIZE>0) | |
199 if(MIN_SIZE>0) | |
200 PV=1; | |
201 INFO=''; | |
202 [PV,INFO] =diff_mmd([reads11;reads12],[reads21;reads22],gene); | |
203 PV | |
204 RESULT{COUNTER}={PV,INFO}; | |
205 COUNTER=COUNTER+1; | |
206 else | |
207 PV=1; | |
208 RESULT{COUNTER}={PV,2}; | |
209 COUNTER=COUNTER+1; | |
210 end | |
211 else | |
212 PV=1; | |
213 RESULT{COUNTER}={PV,1}; | |
214 COUNTER=COUNTER+1; | |
215 end | |
216 end | |
217 end | |
218 % keyboard | |
219 | |
220 VAR_FACT=1; | |
221 COV_POS=1; | |
222 REWEIGHT=0; | |
223 if 1==1 | |
224 for V_COUNTER=2%:length(CFG.VARIANCES1) | |
225 if (TOTAL_SIZE>0) | |
226 if(MIN_SIZE>0) | |
227 cen_ARR=0.1:0.1:1; | |
228 REWEIGHT_WEIGHTS=zeros(length( cen_ARR),size(reads1,2)); | |
229 cen_count=1; | |
230 for censor_frac= cen_ARR | |
231 %if strcmp(gene.name,'AT1G01073') | |
232 % keyboard | |
233 %end | |
234 censor_frac | |
235 temp_reads1=reads1; | |
236 temp_reads2=reads2; | |
237 %cut to relevant positions | |
238 COVERAGE=sum(temp_reads1,1)+sum(temp_reads2,1); | |
239 NONZERO=COVERAGE>0; | |
240 SORTED_COVERAGES=sort(COVERAGE(NONZERO)); | |
241 NR_OF_NONZERO=sum(NONZERO); | |
242 CHOSEN_POSITIONS=COVERAGE<=SORTED_COVERAGES(ceil(NR_OF_NONZERO*censor_frac)); | |
243 REWEIGHT_WEIGHTS(cen_count,CHOSEN_POSITIONS)=1; | |
244 cen_count=cen_count+1; | |
245 %temp_reads1=temp_reads1(:,CHOSEN_POSITIONS); | |
246 %temp_reads2=temp_reads2(:,CHOSEN_POSITIONS); | |
247 %remove reads which have no coverage; | |
248 %temp_reads1=temp_reads1(sum(temp_reads1,2)>0,:); | |
249 %temp_reads2=temp_reads2(sum(temp_reads2,2)>0,:); | |
250 %SIZE1=size(temp_reads1,1); | |
251 %SIZE2=size(temp_reads2,1); | |
252 %MIN_SIZE=min(SIZE1,SIZE2); | |
253 %TOTAL_SIZE=SIZE1+SIZE2; | |
254 end | |
255 if (TOTAL_SIZE>0) | |
256 if(MIN_SIZE>0) | |
257 PV=1; | |
258 INFO=''; | |
259 REWEIGHT=1; | |
260 [PV,INFO] =diff_mmd_variance_subsample3({reads11,reads12},{reads21,reads22},gene,CFG.VARIANCES1{V_COUNTER},CFG.VARIANCES2{V_COUNTER},1,1,REWEIGHT, REWEIGHT_WEIGHTS) | |
261 RESULT{COUNTER}={PV,INFO}; | |
262 COUNTER=COUNTER+1; | |
263 else | |
264 PV=1; | |
265 RESULT{COUNTER}={PV,2}; | |
266 COUNTER=COUNTER+1; | |
267 end | |
268 else | |
269 PV=1; | |
270 RESULT{COUNTER}={PV,1}; | |
271 COUNTER=COUNTER+1; | |
272 end | |
273 end | |
274 end | |
275 end | |
276 end | |
277 | |
278 if 1==0 | |
279 VAR_FACT=1; | |
280 COV_POS=1; | |
281 REWEIGHT=0; | |
282 for V_COUNTER=2%:length(CFG.VARIANCES1) | |
283 for cov_fact_ix=1:length(COV_POS) | |
284 | |
285 if (TOTAL_SIZE>0) | |
286 if(MIN_SIZE>0) | |
287 PV=1; | |
288 INFO=''; | |
289 [PV,INFO] =diff_mmd_variance_subsample2({reads11,reads12},{reads21,reads22},gene,CFG.VARIANCES1{V_COUNTER},CFG.VARIANCES2{V_COUNTER},VAR_FACT(var_fact_ix),COV_POS,REWEIGHT); | |
290 RESULT{COUNTER}={PV,INFO}; | |
291 COUNTER=COUNTER+1; | |
292 else | |
293 PV=1; | |
294 RESULT{COUNTER}={PV,2}; | |
295 COUNTER=COUNTER+1; | |
296 end | |
297 else | |
298 PV=1; | |
299 RESULT{COUNTER}={PV,1}; | |
300 COUNTER=COUNTER+1; | |
301 end | |
302 end | |
303 end | |
304 | |
305 if 1==1 | |
306 VAR_FACT=1; | |
307 COV_POS=1; | |
308 REWEIGHT=1; | |
309 for V_COUNTER=2%:length(CFG.VARIANCES1) | |
310 for var_fact_ix=1:length(VAR_FACT) | |
311 | |
312 if (TOTAL_SIZE>0) | |
313 if(MIN_SIZE>0) | |
314 PV=1; | |
315 INFO=''; | |
316 [PV,INFO] =diff_mmd_variance_subsample2({reads11,reads12},{reads21,reads22},gene,CFG.VARIANCES1{V_COUNTER},CFG.VARIANCES2{V_COUNTER},VAR_FACT(var_fact_ix),COV_POS,REWEIGHT) | |
317 RESULT{COUNTER}={PV,INFO}; | |
318 COUNTER=COUNTER+1; | |
319 else | |
320 PV=1; | |
321 RESULT{COUNTER}={PV,2}; | |
322 COUNTER=COUNTER+1; | |
323 end | |
324 else | |
325 PV=1; | |
326 RESULT{COUNTER}={PV,1}; | |
327 COUNTER=COUNTER+1; | |
328 end | |
329 end | |
330 end | |
331 end | |
332 end | |
333 | |
334 %keyboard | |
335 TOTAL_SIZE=(size(reads12,1)+size(reads12,1)+(size(reads21,1)+size(reads22,1))); | |
336 MIN_SIZE=min(size(reads12,1)+size(reads12,1),(size(reads21,1)+size(reads22,1))); | |
337 if 1==0 | |
338 for V_COUNTER=1 | |
339 if (TOTAL_SIZE>0) | |
340 if(MIN_SIZE>0) | |
341 PV=1; | |
342 INFO=''; | |
343 if isempty([intron11,intron12])|isempty([intron21,intron22]) | |
344 [PV,INFO] =diff_mmd_variance_NB_NB_simple([reads11;reads12],[reads21;reads22],gene,CFG.VARIANCES1{V_COUNTER},CFG.VARIANCES2{V_COUNTER}); | |
345 else | |
346 [PV,INFO] =diff_mmd_variance_splice([reads11;reads12],[reads21;reads22],0.5,[intron11,intron12],[intron21,intron22],gene,CFG.VARIANCES1{V_COUNTER},CFG.VARIANCES2{V_COUNTER}); | |
347 end | |
348 RESULT{COUNTER}={PV,INFO}; | |
349 COUNTER=COUNTER+1; | |
350 else | |
351 PV=1; | |
352 RESULT{COUNTER}={PV,2}; | |
353 COUNTER=COUNTER+1; | |
354 end | |
355 else | |
356 PV=1; | |
357 RESULT{COUNTER}={PV,1}; | |
358 COUNTER=COUNTER+1; | |
359 end | |
360 end | |
361 end | |
362 PV=1; | |
363 %keyboard | |
364 for V_COUNTER=length(CFG.VARIANCES1) | |
365 if (TOTAL_SIZE>0) | |
366 | |
367 if(MIN_SIZE>0) | |
368 [P_VALUE, INFO]= diff_nbin7(READS1,READS2,gene,SEQUENCED_LENGTH,CFG.VARIANCES1{V_COUNTER},CFG.VARIANCES2{V_COUNTER}); | |
369 RESULT{COUNTER}={P_VALUE,INFO}; | |
370 if not(isempty(INFO)) | |
371 if iscell(INFO) | |
372 RESULT{COUNTER}=INFO{5}; | |
373 COUNTER=COUNTER+1; | |
374 end | |
375 end | |
376 else | |
377 PV=1; | |
378 RESULT{COUNTER}={PV,2}; | |
379 COUNTER=COUNTER+1; | |
380 end | |
381 else | |
382 PV=1; | |
383 RESULT{COUNTER}={PV,1}; | |
384 COUNTER=COUNTER+1; | |
385 end | |
386 end | |
387 | |
388 | |
389 [SPLICINGEVENTS,SEQUENCE,EXONSEQUENCE]=splicingsequence(gene); | |
390 [UNIQUE_NEW_EXONS,GRAPHNODES,ORDER_OF_GRAPHNODE,EIRS_IN_SEQ]=transform_single_end_reads(SPLICINGEVENTS,SEQUENCE,EXONSEQUENCE,CFG.SEQUENCED_LENGTH-CLIP); | |
391 [NEW_READS1,UNEXPLAINED_READS1,UNEXPLAINED_INDEX1]= convert_reads_to_region_indicators([reads11;reads12],UNIQUE_NEW_EXONS,GRAPHNODES,ORDER_OF_GRAPHNODE,EIRS_IN_SEQ,gene); | |
392 [NEW_READS2,UNEXPLAINED_READS2,UNEXPLAINED_INDEX2]= convert_reads_to_region_indicators([reads21;reads22],UNIQUE_NEW_EXONS,GRAPHNODES,ORDER_OF_GRAPHNODE,EIRS_IN_SEQ,gene); | |
393 % keyboard | |
394 TOTAL_SIZE=(size(NEW_READS1,1)+(size(NEW_READS2,1))); | |
395 MIN_SIZE=min(size(NEW_READS1,1),(size(NEW_READS2,1))); | |
396 | |
397 if (TOTAL_SIZE>0) | |
398 if(MIN_SIZE>0) | |
399 [PV,INFO] = diff_poisson_bonf_3_unequal_segment(NEW_READS1,NEW_READS2,gene,CFG.SEQUENCED_LENGTH-CLIP); | |
400 RESULT{COUNTER}={PV,INFO}; | |
401 COUNTER=COUNTER+1; | |
402 else | |
403 PV=1; | |
404 RESULT{COUNTER}={PV,2}; | |
405 COUNTER=COUNTER+1; | |
406 end | |
407 else | |
408 PV=1; | |
409 RESULT{COUNTER}={PV,1}; | |
410 COUNTER=COUNTER+1; | |
411 end | |
412 PV=1 | |
413 if 1==0 | |
414 if (TOTAL_SIZE>0) | |
415 if(MIN_SIZE>0) | |
416 [PV,INFO] = diff_poisson_bonf_4_unequal_segment(NEW_READS1,NEW_READS2,gene,CFG.SEQUENCED_LENGTH-CLIP); | |
417 RESULT{COUNTER}={PV,INFO}; | |
418 COUNTER=COUNTER+1; | |
419 else | |
420 PV=1; | |
421 RESULT{COUNTER}={PV,2}; | |
422 COUNTER=COUNTER+1; | |
423 end | |
424 else | |
425 PV=1; | |
426 RESULT{COUNTER}={PV,1}; | |
427 COUNTER=COUNTER+1; | |
428 end | |
429 %keyboard | |
430 end | |
431 STAT{i}=RESULT; | |
432 | |
433 end; | |
434 %keyboard | |
435 OUT_FILENAME=['/fml/ag-raetsch/home/drewe/svn/projects/RNASeq/difftest/out/analysis_artificial_variance_03_08_2012/' SUFF_NAME '_rep_mmd_07_07_2012_' int2str(JB_NR) '.mat']; | |
436 save(OUT_FILENAME,'STAT') |