0
|
1 function STAT = rDiff_caller(PAR)
|
|
2 % genes = opt_transcripts_caller(PAR)
|
|
3 %
|
|
4 % -- input --
|
|
5 % PAR contains
|
|
6 % CFG: configuration struct
|
|
7 % genes: struct defining genes with start, stops, exons etc.
|
|
8 % profile_weights: weights of profile functions
|
|
9 % intron_dists: distances to closest intron
|
|
10 %
|
|
11 % -- output --
|
|
12 % genes: struct with additional fields of eg. estimated transcript weights
|
|
13 addpath ../../tests
|
|
14 addpath ../../variance/final
|
|
15 addpath ../../experimental
|
|
16
|
|
17 samtools='/fml/ag-raetsch/home/drewe/svn/projects/RNASeq/difftest/samtools/';
|
|
18 CFG = PAR.CFG;
|
|
19 SUFF_NAME=CFG.SUFF_NAME;
|
|
20 %load(CFG.genes_path , 'genes');
|
|
21 %[genes]=mask_dubl(genes,0);
|
|
22
|
|
23 load(CFG.genes_path , 'genes');
|
|
24 [genes]=mask_dubl(genes,0);
|
|
25
|
|
26 %[genes]=remove_transcripts(genes);
|
|
27 genes=genes(PAR.genes_idx);
|
|
28 BAM_FILES=CFG.BAM_FILES;
|
|
29 JB_NR=CFG.JB_NR;
|
|
30
|
|
31
|
|
32
|
|
33 %clear PAR;
|
|
34
|
|
35 %%%% paths
|
|
36 addpath(CFG.paths);
|
|
37 addpath('/fml/ag-raetsch/home/drewe/svn/projects/RNASeq/difftest/variance/nbin');
|
|
38 addpath('/fml/ag-raetsch/home/drewe/svn/projects/RNASeq/difftest/experiments/');
|
|
39 addpath('/fml/ag-raetsch/home/drewe/svn/projects/RNASeq/difftest/tests/');
|
|
40 addpath('/fml/ag-raetsch/home/drewe/svn/projects/RNASeq/difftest/read_utils/');
|
|
41 addpath('/fml/ag-raetsch/home/drewe/svn/projects/RNASeq/difftest/tests/sequence_tools/');
|
|
42 addpath(genpath('/fml/ag-raetsch/home/drewe/svn/tools/chronux/'))
|
|
43 addpath('/fml/ag-raetsch/home/drewe/svn/projects/RNASeq/difftest/tests/functions/');
|
|
44
|
|
45 out_base=CFG.out_base;
|
|
46 %sim_data_base=CFG.sim_data_base;
|
|
47 real_data_base=CFG.real_data_base;
|
|
48
|
|
49 expr12_bam=CFG.expr12_bam;
|
|
50 expr11_bam=CFG.expr11_bam;
|
|
51 expr22_bam=CFG.expr22_bam;
|
|
52 expr21_bam=CFG.expr21_bam;
|
|
53
|
|
54 CFG.SEQUENCED_LENGTH=80;
|
|
55 SEQUENCED_LENGTH=80;
|
|
56 %make map: gene->file
|
|
57
|
|
58 IDX1=CFG.IDX11;
|
|
59 IDX2=CFG.IDX22;
|
|
60
|
|
61 %BAM_FILES{IDX1}
|
|
62 %BAM_FILES{IDX2}
|
|
63
|
|
64
|
|
65 %genes=PAR.genes;
|
|
66 clear PAR
|
|
67
|
|
68 STAT=cell(size(genes));
|
|
69 do_save = 0;
|
|
70 out_base = '/fml/ag-raetsch/home/drewe/svn/projects/RNASeq/difftest/out';
|
|
71 save_dir=fullfile(out_base,'cache');
|
|
72
|
|
73 for i=1:size(genes,2)
|
|
74 %try
|
|
75 RESULT=cell(1,7)
|
|
76 gene = genes(i);
|
|
77
|
|
78 RESULT{7}=JB_NR;
|
|
79 RESULT{1}=gene.name
|
|
80 load_only = false;
|
|
81
|
|
82 if or(isempty(gene.exons),gene.stop-gene.start<=SEQUENCED_LENGTH)
|
|
83 PV=1;
|
|
84 RESULT{2}={PV,''};
|
|
85 RESULT{3}={PV,''};
|
|
86 RESULT{4}={PV,''};
|
|
87 RESULT{5}=[Inf,Inf];
|
|
88 RESULT{6}=[Inf,Inf];
|
|
89 RESULT{7}={'empty gene exons'};
|
|
90 RESULT{8}={PV,''};
|
|
91 continue;
|
|
92 end
|
|
93
|
|
94 PV1=1;
|
|
95 for l=1:size(gene.transcripts,2)
|
|
96 EXONS=gene.exons{l}
|
|
97 if sum(EXONS(:,2)-EXONS(:,1))<=SEQUENCED_LENGTH
|
|
98 PV=1;
|
|
99 RESULT{2}={PV,''};
|
|
100 RESULT{3}={PV,''};
|
|
101 RESULT{4}={PV,''};
|
|
102 RESULT{5}=[Inf,Inf];
|
|
103 RESULT{6}=[Inf,Inf];
|
|
104 RESULT{7}={'empty gene exons'};
|
|
105 RESULT{8}={PV,''};
|
|
106 PV1=2;
|
|
107
|
|
108 continue;
|
|
109 end
|
|
110 end
|
|
111 if PV1==2
|
|
112 continue;
|
|
113 end
|
|
114 gene.name
|
|
115
|
|
116
|
|
117
|
|
118 %adjust start and stop positions on the negative strand
|
|
119 if strcmp(gene.strand,'-')
|
|
120 gene.start=gene.start+1;
|
|
121 gene.stop=gene.stop+1;
|
|
122 [reads11,reads12,intron11,intron12] = get_reads_dual2_intron(expr11_bam,expr12_bam,gene,samtools,false,10);
|
|
123 [reads11,FLAG]=remove_reads_from_other_genes(reads11,gene);
|
|
124 [reads12,FLAG]=remove_reads_from_other_genes(reads12,gene);
|
|
125
|
|
126
|
|
127 [reads21,reads22,intron21,intron22] = get_reads_dual2_intron(expr21_bam,expr22_bam,gene,samtools,false,10);
|
|
128 [reads21,FLAG]=remove_reads_from_other_genes(reads21,gene);
|
|
129 [reads22,FLAG]=remove_reads_from_other_genes(reads22,gene);
|
|
130 gene.start=gene.start-1;
|
|
131 gene.stop=gene.stop-1;
|
|
132 else
|
|
133 [reads11,reads12,intron11,intron12] = get_reads_dual2_intron(expr11_bam,expr12_bam,gene,samtools,false,10);
|
|
134 [reads11,FLAG]=remove_reads_from_other_genes(reads11,gene);
|
|
135 [reads12,FLAG]=remove_reads_from_other_genes(reads12,gene);
|
|
136
|
|
137 [reads21,reads22,intron21,intron22] = get_reads_dual2_intron(expr21_bam,expr22_bam,gene,samtools,false,10);
|
|
138 [reads21,FLAG]=remove_reads_from_other_genes(reads21,gene);
|
|
139 [reads22,FLAG]=remove_reads_from_other_genes(reads22,gene);
|
|
140 end
|
|
141
|
|
142 reads11=reads11(sum(reads11,2)>70,:);
|
|
143 reads12=reads12(sum(reads12,2)>70,:);
|
|
144 reads21=reads21(sum(reads21,2)>70,:);
|
|
145 reads22=reads22(sum(reads22,2)>70,:);
|
|
146 %% TEST with read cliping
|
|
147
|
|
148 READS1={reads11,reads12};
|
|
149 READS2={reads21,reads22};
|
|
150 reads1=[reads11;reads12];
|
|
151 reads2=[reads21;reads22];
|
|
152
|
|
153 TOTAL_SIZE=(size(reads12,1)+size(reads12,1)+(size(reads21,1)+size(reads22,1)));
|
|
154 MIN_SIZE=min(size(reads12,1)+size(reads12,1),(size(reads21,1)+size(reads22,1)));
|
|
155 %keyboard
|
|
156
|
|
157 CLIP=0;
|
|
158 if 1==0
|
|
159 if (size(reads1,1)>0)
|
|
160 for i=1:size(reads1,1)
|
|
161 reads1(i,find(reads1(i,:),2,'first'))=0;
|
|
162 end
|
|
163 for i=1:size(reads1,1)
|
|
164 reads1(i,find(reads1(i,:),2,'last'))=0;
|
|
165 end
|
|
166 end
|
|
167 if (size(reads2,1)>0)
|
|
168 for i=1:size(reads2,1)
|
|
169 reads2(i,find(reads2(i,:),2,'first'))=0;
|
|
170 end
|
|
171 for i=1:size(reads2,1)
|
|
172 reads2(i,find(reads2(i,:),2,'last'))=0;
|
|
173 end
|
|
174 end
|
|
175 CLIP=4;
|
|
176 end
|
|
177
|
|
178 % keybloard
|
|
179 %% END TEST
|
|
180
|
|
181 COUNTER=3;
|
|
182 FLAG=0;
|
|
183 for k=1:size(gene.transcripts,2)
|
|
184 if sum(gene.exons{k}(:,2)-gene.exons{k}(:,1))<75
|
|
185 FLAG=1;
|
|
186 end
|
|
187 end
|
|
188
|
|
189 if FLAG==1
|
|
190 RESULT{2}={'gene shorter than 75 bp'};
|
|
191 STAT{i}=RESULT;
|
|
192 continue
|
|
193 end
|
|
194
|
|
195 if 1==0
|
|
196
|
|
197 for V_COUNTER=2%:length(CFG.VARIANCES1)
|
|
198 if (TOTAL_SIZE>0)
|
|
199 if(MIN_SIZE>0)
|
|
200 PV=1;
|
|
201 INFO='';
|
|
202 [PV,INFO] =diff_mmd([reads11;reads12],[reads21;reads22],gene);
|
|
203 PV
|
|
204 RESULT{COUNTER}={PV,INFO};
|
|
205 COUNTER=COUNTER+1;
|
|
206 else
|
|
207 PV=1;
|
|
208 RESULT{COUNTER}={PV,2};
|
|
209 COUNTER=COUNTER+1;
|
|
210 end
|
|
211 else
|
|
212 PV=1;
|
|
213 RESULT{COUNTER}={PV,1};
|
|
214 COUNTER=COUNTER+1;
|
|
215 end
|
|
216 end
|
|
217 end
|
|
218 % keyboard
|
|
219
|
|
220 VAR_FACT=1;
|
|
221 COV_POS=1;
|
|
222 REWEIGHT=0;
|
|
223 if 1==1
|
|
224 for V_COUNTER=2%:length(CFG.VARIANCES1)
|
|
225 if (TOTAL_SIZE>0)
|
|
226 if(MIN_SIZE>0)
|
|
227 cen_ARR=0.1:0.1:1;
|
|
228 REWEIGHT_WEIGHTS=zeros(length( cen_ARR),size(reads1,2));
|
|
229 cen_count=1;
|
|
230 for censor_frac= cen_ARR
|
|
231 %if strcmp(gene.name,'AT1G01073')
|
|
232 % keyboard
|
|
233 %end
|
|
234 censor_frac
|
|
235 temp_reads1=reads1;
|
|
236 temp_reads2=reads2;
|
|
237 %cut to relevant positions
|
|
238 COVERAGE=sum(temp_reads1,1)+sum(temp_reads2,1);
|
|
239 NONZERO=COVERAGE>0;
|
|
240 SORTED_COVERAGES=sort(COVERAGE(NONZERO));
|
|
241 NR_OF_NONZERO=sum(NONZERO);
|
|
242 CHOSEN_POSITIONS=COVERAGE<=SORTED_COVERAGES(ceil(NR_OF_NONZERO*censor_frac));
|
|
243 REWEIGHT_WEIGHTS(cen_count,CHOSEN_POSITIONS)=1;
|
|
244 cen_count=cen_count+1;
|
|
245 %temp_reads1=temp_reads1(:,CHOSEN_POSITIONS);
|
|
246 %temp_reads2=temp_reads2(:,CHOSEN_POSITIONS);
|
|
247 %remove reads which have no coverage;
|
|
248 %temp_reads1=temp_reads1(sum(temp_reads1,2)>0,:);
|
|
249 %temp_reads2=temp_reads2(sum(temp_reads2,2)>0,:);
|
|
250 %SIZE1=size(temp_reads1,1);
|
|
251 %SIZE2=size(temp_reads2,1);
|
|
252 %MIN_SIZE=min(SIZE1,SIZE2);
|
|
253 %TOTAL_SIZE=SIZE1+SIZE2;
|
|
254 end
|
|
255 if (TOTAL_SIZE>0)
|
|
256 if(MIN_SIZE>0)
|
|
257 PV=1;
|
|
258 INFO='';
|
|
259 REWEIGHT=1;
|
|
260 [PV,INFO] =diff_mmd_variance_subsample3({reads11,reads12},{reads21,reads22},gene,CFG.VARIANCES1{V_COUNTER},CFG.VARIANCES2{V_COUNTER},1,1,REWEIGHT, REWEIGHT_WEIGHTS)
|
|
261 RESULT{COUNTER}={PV,INFO};
|
|
262 COUNTER=COUNTER+1;
|
|
263 else
|
|
264 PV=1;
|
|
265 RESULT{COUNTER}={PV,2};
|
|
266 COUNTER=COUNTER+1;
|
|
267 end
|
|
268 else
|
|
269 PV=1;
|
|
270 RESULT{COUNTER}={PV,1};
|
|
271 COUNTER=COUNTER+1;
|
|
272 end
|
|
273 end
|
|
274 end
|
|
275 end
|
|
276 end
|
|
277
|
|
278 if 1==0
|
|
279 VAR_FACT=1;
|
|
280 COV_POS=1;
|
|
281 REWEIGHT=0;
|
|
282 for V_COUNTER=2%:length(CFG.VARIANCES1)
|
|
283 for cov_fact_ix=1:length(COV_POS)
|
|
284
|
|
285 if (TOTAL_SIZE>0)
|
|
286 if(MIN_SIZE>0)
|
|
287 PV=1;
|
|
288 INFO='';
|
|
289 [PV,INFO] =diff_mmd_variance_subsample2({reads11,reads12},{reads21,reads22},gene,CFG.VARIANCES1{V_COUNTER},CFG.VARIANCES2{V_COUNTER},VAR_FACT(var_fact_ix),COV_POS,REWEIGHT);
|
|
290 RESULT{COUNTER}={PV,INFO};
|
|
291 COUNTER=COUNTER+1;
|
|
292 else
|
|
293 PV=1;
|
|
294 RESULT{COUNTER}={PV,2};
|
|
295 COUNTER=COUNTER+1;
|
|
296 end
|
|
297 else
|
|
298 PV=1;
|
|
299 RESULT{COUNTER}={PV,1};
|
|
300 COUNTER=COUNTER+1;
|
|
301 end
|
|
302 end
|
|
303 end
|
|
304
|
|
305 if 1==1
|
|
306 VAR_FACT=1;
|
|
307 COV_POS=1;
|
|
308 REWEIGHT=1;
|
|
309 for V_COUNTER=2%:length(CFG.VARIANCES1)
|
|
310 for var_fact_ix=1:length(VAR_FACT)
|
|
311
|
|
312 if (TOTAL_SIZE>0)
|
|
313 if(MIN_SIZE>0)
|
|
314 PV=1;
|
|
315 INFO='';
|
|
316 [PV,INFO] =diff_mmd_variance_subsample2({reads11,reads12},{reads21,reads22},gene,CFG.VARIANCES1{V_COUNTER},CFG.VARIANCES2{V_COUNTER},VAR_FACT(var_fact_ix),COV_POS,REWEIGHT)
|
|
317 RESULT{COUNTER}={PV,INFO};
|
|
318 COUNTER=COUNTER+1;
|
|
319 else
|
|
320 PV=1;
|
|
321 RESULT{COUNTER}={PV,2};
|
|
322 COUNTER=COUNTER+1;
|
|
323 end
|
|
324 else
|
|
325 PV=1;
|
|
326 RESULT{COUNTER}={PV,1};
|
|
327 COUNTER=COUNTER+1;
|
|
328 end
|
|
329 end
|
|
330 end
|
|
331 end
|
|
332 end
|
|
333
|
|
334 %keyboard
|
|
335 TOTAL_SIZE=(size(reads12,1)+size(reads12,1)+(size(reads21,1)+size(reads22,1)));
|
|
336 MIN_SIZE=min(size(reads12,1)+size(reads12,1),(size(reads21,1)+size(reads22,1)));
|
|
337 if 1==0
|
|
338 for V_COUNTER=1
|
|
339 if (TOTAL_SIZE>0)
|
|
340 if(MIN_SIZE>0)
|
|
341 PV=1;
|
|
342 INFO='';
|
|
343 if isempty([intron11,intron12])|isempty([intron21,intron22])
|
|
344 [PV,INFO] =diff_mmd_variance_NB_NB_simple([reads11;reads12],[reads21;reads22],gene,CFG.VARIANCES1{V_COUNTER},CFG.VARIANCES2{V_COUNTER});
|
|
345 else
|
|
346 [PV,INFO] =diff_mmd_variance_splice([reads11;reads12],[reads21;reads22],0.5,[intron11,intron12],[intron21,intron22],gene,CFG.VARIANCES1{V_COUNTER},CFG.VARIANCES2{V_COUNTER});
|
|
347 end
|
|
348 RESULT{COUNTER}={PV,INFO};
|
|
349 COUNTER=COUNTER+1;
|
|
350 else
|
|
351 PV=1;
|
|
352 RESULT{COUNTER}={PV,2};
|
|
353 COUNTER=COUNTER+1;
|
|
354 end
|
|
355 else
|
|
356 PV=1;
|
|
357 RESULT{COUNTER}={PV,1};
|
|
358 COUNTER=COUNTER+1;
|
|
359 end
|
|
360 end
|
|
361 end
|
|
362 PV=1;
|
|
363 %keyboard
|
|
364 for V_COUNTER=length(CFG.VARIANCES1)
|
|
365 if (TOTAL_SIZE>0)
|
|
366
|
|
367 if(MIN_SIZE>0)
|
|
368 [P_VALUE, INFO]= diff_nbin7(READS1,READS2,gene,SEQUENCED_LENGTH,CFG.VARIANCES1{V_COUNTER},CFG.VARIANCES2{V_COUNTER});
|
|
369 RESULT{COUNTER}={P_VALUE,INFO};
|
|
370 if not(isempty(INFO))
|
|
371 if iscell(INFO)
|
|
372 RESULT{COUNTER}=INFO{5};
|
|
373 COUNTER=COUNTER+1;
|
|
374 end
|
|
375 end
|
|
376 else
|
|
377 PV=1;
|
|
378 RESULT{COUNTER}={PV,2};
|
|
379 COUNTER=COUNTER+1;
|
|
380 end
|
|
381 else
|
|
382 PV=1;
|
|
383 RESULT{COUNTER}={PV,1};
|
|
384 COUNTER=COUNTER+1;
|
|
385 end
|
|
386 end
|
|
387
|
|
388
|
|
389 [SPLICINGEVENTS,SEQUENCE,EXONSEQUENCE]=splicingsequence(gene);
|
|
390 [UNIQUE_NEW_EXONS,GRAPHNODES,ORDER_OF_GRAPHNODE,EIRS_IN_SEQ]=transform_single_end_reads(SPLICINGEVENTS,SEQUENCE,EXONSEQUENCE,CFG.SEQUENCED_LENGTH-CLIP);
|
|
391 [NEW_READS1,UNEXPLAINED_READS1,UNEXPLAINED_INDEX1]= convert_reads_to_region_indicators([reads11;reads12],UNIQUE_NEW_EXONS,GRAPHNODES,ORDER_OF_GRAPHNODE,EIRS_IN_SEQ,gene);
|
|
392 [NEW_READS2,UNEXPLAINED_READS2,UNEXPLAINED_INDEX2]= convert_reads_to_region_indicators([reads21;reads22],UNIQUE_NEW_EXONS,GRAPHNODES,ORDER_OF_GRAPHNODE,EIRS_IN_SEQ,gene);
|
|
393 % keyboard
|
|
394 TOTAL_SIZE=(size(NEW_READS1,1)+(size(NEW_READS2,1)));
|
|
395 MIN_SIZE=min(size(NEW_READS1,1),(size(NEW_READS2,1)));
|
|
396
|
|
397 if (TOTAL_SIZE>0)
|
|
398 if(MIN_SIZE>0)
|
|
399 [PV,INFO] = diff_poisson_bonf_3_unequal_segment(NEW_READS1,NEW_READS2,gene,CFG.SEQUENCED_LENGTH-CLIP);
|
|
400 RESULT{COUNTER}={PV,INFO};
|
|
401 COUNTER=COUNTER+1;
|
|
402 else
|
|
403 PV=1;
|
|
404 RESULT{COUNTER}={PV,2};
|
|
405 COUNTER=COUNTER+1;
|
|
406 end
|
|
407 else
|
|
408 PV=1;
|
|
409 RESULT{COUNTER}={PV,1};
|
|
410 COUNTER=COUNTER+1;
|
|
411 end
|
|
412 PV=1
|
|
413 if 1==0
|
|
414 if (TOTAL_SIZE>0)
|
|
415 if(MIN_SIZE>0)
|
|
416 [PV,INFO] = diff_poisson_bonf_4_unequal_segment(NEW_READS1,NEW_READS2,gene,CFG.SEQUENCED_LENGTH-CLIP);
|
|
417 RESULT{COUNTER}={PV,INFO};
|
|
418 COUNTER=COUNTER+1;
|
|
419 else
|
|
420 PV=1;
|
|
421 RESULT{COUNTER}={PV,2};
|
|
422 COUNTER=COUNTER+1;
|
|
423 end
|
|
424 else
|
|
425 PV=1;
|
|
426 RESULT{COUNTER}={PV,1};
|
|
427 COUNTER=COUNTER+1;
|
|
428 end
|
|
429 %keyboard
|
|
430 end
|
|
431 STAT{i}=RESULT;
|
|
432
|
|
433 end;
|
|
434 %keyboard
|
|
435 OUT_FILENAME=['/fml/ag-raetsch/home/drewe/svn/projects/RNASeq/difftest/out/analysis_artificial_variance_03_08_2012/' SUFF_NAME '_rep_mmd_07_07_2012_' int2str(JB_NR) '.mat'];
|
|
436 save(OUT_FILENAME,'STAT')
|