function [pval,info] = rDiff_nonparametric(CFG,READS1,READS2,variance_function_nonparametric_1, variance_function_nonparametric_2)
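% RDIFF_NONPARAMETRIC  Non-parametric permutation test for differential read coverage.
%
% The interface below is inferred from how the arguments are used in this file
% (a best-effort summary, not authoritative documentation):
%   CFG            - config struct; only CFG.bootstraps and CFG.nr_of_slices are read here.
%   READS1/READS2  - cell arrays with one entry per replicate; each entry is a
%                    (reads x positions) coverage matrix for sample 1 / sample 2.
%   variance_function_nonparametric_1/2 - variance functions handed to
%                    predict_variance to estimate the per-position variance.
% Returns:
%   pval - p-value of the permutation test (minimum over masks, scaled).
%   info - cell array {bootstrap_results, statistic, pval}.
%
% Hypothetical call (only the two CFG fields used here are shown; the variable
% names are placeholders):
%   CFG.bootstraps = 1000; CFG.nr_of_slices = 10;
%   [p, info] = rDiff_nonparametric(CFG, READS1, READS2, var_fun1, var_fun2);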

bootstraps=CFG.bootstraps;

%Initialize the reads
SUMS1=[]; %the read coverage for sample 1 (one row per replicate)
SUMS2=[]; %the read coverage for sample 2 (one row per replicate)
reads1=[]; %the pooled reads of sample 1 (one row per read)
reads2=[]; %the pooled reads of sample 2 (one row per read)
NON_EMPTY=[];
for i=1:length(READS1)
    if not(isempty(READS1{i}))
        NON_EMPTY=[NON_EMPTY,i];
    end
end
READS1={READS1{NON_EMPTY}};
NON_EMPTY=[];
for i=1:length(READS2)
    if not(isempty(READS2{i}))
        NON_EMPTY=[NON_EMPTY,i];
    end
end
READS2={READS2{NON_EMPTY}};
for i=1:length(READS1)
    SUMS1=[SUMS1;sum(READS1{i},1)];
    reads1=[reads1;READS1{i}];
end
for i=1:length(READS2)
    SUMS2=[SUMS2;sum(READS2{i},1)];
    reads2=[reads2;READS2{i}];
end
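%At this point empty replicates have been dropped, SUMS1/SUMS2 hold one row of
%per-position coverage per replicate, and reads1/reads2 contain all reads of
%each sample pooled across its replicates (one row per read).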

if size(reads1,1)==0 || size(reads2,1)==0
    pval=1;
    info=1;
    bootstrap_results=1;
    statistic=1;
    return
end

%Get masks for the highly covered positions
[MASKS]=get_nonparametric_masks(CFG,reads1,reads2);
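%MASKS is used below as a (masks x positions) matrix: each row supplies the
%per-position weights (in practice a mask over positions) for one variant of
%the test statistic.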

%Determine number of masks
NR_OF_MASKS=size(MASKS,1);

% Predict variance
VARIANCE1=(1e-8)*ones(size(reads1,2),1);
for i=1:length(READS1)
    TEMP_VARIANCE1=predict_variance(sum(READS1{i},1)',variance_function_nonparametric_1);
    TEMP_VARIANCE1(isnan(TEMP_VARIANCE1))=0;
    TEMP_VARIANCE1(isinf(TEMP_VARIANCE1))=0;
    VARIANCE1=VARIANCE1+TEMP_VARIANCE1;
end

VARIANCE2=(1e-8)*ones(size(reads1,2),1);
for i=1:length(READS2)
    TEMP_VARIANCE2=predict_variance(sum(READS2{i},1)',variance_function_nonparametric_2);
    TEMP_VARIANCE2(isnan(TEMP_VARIANCE2))=0;
    TEMP_VARIANCE2(isinf(TEMP_VARIANCE2))=0;
    VARIANCE2=VARIANCE2+TEMP_VARIANCE2;
end

%Get the mean variance
VARIANCE1=VARIANCE1/length(READS1);
VARIANCE2=VARIANCE2/length(READS2);
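%VARIANCE1/VARIANCE2 now hold, for every position, the variance predicted from
%the replicate coverage profiles, averaged over the replicates of each sample;
%NaN/Inf predictions have been set to zero above.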

%Concatenation of all reads
allreads = [reads1;reads2];

%Determine the subsampling rate
p=sum(allreads,1)/size(allreads,1);
R=size(reads1,1);
c=(p.*(1-p))/(size(allreads,1)-1);
N1s=size(allreads,1)*c./(c+(VARIANCE1'/(R^2)));

p=sum(allreads,1)/size(allreads,1);
R=size(reads2,1);
c=(p.*(1-p))/(size(allreads,1)-1);
N2s=size(allreads,1)*c./(c+(VARIANCE2'/(R^2)));

%Round to get integer values
N1s=ceil(full(N1s));
N2s=ceil(full(N2s));
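%Reading of the per-position subsample sizes computed above (interpretation,
%not taken from the original comments): with n=size(allreads,1) pooled reads,
%p is the mean per-position coverage and c=p.*(1-p)/(n-1) the sampling
%variance of that mean under pure resampling. The formula
%   Ns = n*c./(c + VARIANCE/R^2)
%shrinks the subsample size wherever the predicted between-replicate variance
%(scaled by the per-sample read count R) exceeds the resampling variance, so
%that drawing Ns reads roughly reproduces the observed variability.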

%Total number of reads in each sample (pooled over replicates)
N1 = size(reads1,1);
N2 = size(reads2,1);
N = N1 + N2;

bootstrap_results=ones(NR_OF_MASKS,bootstraps);
COUNTER=0;
R1s=[];
R2s=[];
statistic=ones(NR_OF_MASKS,bootstraps);
nr_of_slices=CFG.nr_of_slices;

TOTAL_SUM1=sum(reads1,1);
TOTAL_SUM2=sum(reads2,1);
TOTAL_SUM=TOTAL_SUM1+TOTAL_SUM2;

clear reads1
clear reads2
%Use the transpose to make the selection of columns faster
all_reads_trans=allreads';
clear allreads
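%If there are too few distinct coverage values to build nr_of_slices coverage
%bins, skip the test and report a p-value of 1.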
if length(unique(TOTAL_SUM))<nr_of_slices+5
    pval=1;
    bootstrap_results=[];
    statistic=[];
    info = 1;
    return
end

[SUM_SORTED,SUM_POS]=sort(TOTAL_SUM);
NR_OF_ZEROS=sum(TOTAL_SUM==0);

%Precompute this sum once
all_reads_trans_row_sum=sum(all_reads_trans,1);

%These hold the test statistic per mask and slice: STAT_DIST for the observed
%data, TEMP_DIST for the current permutation
STAT_DIST=zeros(NR_OF_MASKS,nr_of_slices);
TEMP_DIST=zeros(NR_OF_MASKS,nr_of_slices);

%Precompute normalizing constants
SUM_TOTAL_SUM1=sum(TOTAL_SUM1);
SUM_TOTAL_SUM2=sum(TOTAL_SUM2);
SUM_SUM_TOTAL_SUM=SUM_TOTAL_SUM1+SUM_TOTAL_SUM2;

%Precompute some of the values for the slices
SLICES=zeros(nr_of_slices,size(TOTAL_SUM,2))==1;
N1_arr=zeros(nr_of_slices,1);
N2_arr=zeros(nr_of_slices,1);
FACT_arr=zeros(nr_of_slices,1);
V1_cell=cell(nr_of_slices,1);
V2_cell=cell(nr_of_slices,1);
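%The loop below partitions the non-zero-coverage positions into nr_of_slices
%bins of roughly equal size by total coverage. For each bin it stores the
%selected positions (SLICES), the per-sample subsample sizes (N1_arr/N2_arr,
%the median of N1s/N2s in the bin, capped at N1/N2), the bin's share of the
%total coverage (FACT_arr), and the observed normalized coverage profiles
%(V1_cell/V2_cell), from which the observed statistic STAT_DIST is computed
%for every mask.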

%Determine regions where to match the variances
for s=nr_of_slices:(-1):1
    LOWER_POS=min(NR_OF_ZEROS+ceil(((nr_of_slices-s)/nr_of_slices)*(length(TOTAL_SUM)-NR_OF_ZEROS))+1,length(TOTAL_SUM));
    UPPER_POS=min(NR_OF_ZEROS+ceil(((nr_of_slices-s+1)/nr_of_slices)*(length(TOTAL_SUM)-NR_OF_ZEROS))+1,length(TOTAL_SUM));

    SLICE_LOWER_BOUND=SUM_SORTED(LOWER_POS);
    SLICE_UPPER_BOUND=SUM_SORTED(UPPER_POS);
    if s==nr_of_slices
        SLICE=and(TOTAL_SUM>=SLICE_LOWER_BOUND,TOTAL_SUM<=SLICE_UPPER_BOUND);
    else
        SLICE=and(TOTAL_SUM>SLICE_LOWER_BOUND,TOTAL_SUM<=SLICE_UPPER_BOUND);
    end
    SLICES(s,:)=SLICE;

    N1s_temp=ceil(median(N1s(SLICE)));
    N2s_temp=ceil(median(N2s(SLICE)));
    N1s_temp=min(N1s_temp,N1);
    N2s_temp=min(N2s_temp,N2);

    N1_arr(s)=N1s_temp;
    N2_arr(s)=N2s_temp;

    FACT_arr(s)=sum(TOTAL_SUM1(SLICE)+TOTAL_SUM2(SLICE))/(SUM_SUM_TOTAL_SUM);

    V1_cell{s}=TOTAL_SUM1(SLICE)/SUM_TOTAL_SUM1; %temporary variable to save time
    V2_cell{s}=TOTAL_SUM2(SLICE)/SUM_TOTAL_SUM2; %temporary variable to save time
    for mask_ix=1:NR_OF_MASKS
        STAT_DIST(mask_ix,s)=eucl_dist_weigthed(V1_cell{s},V2_cell{s},MASKS(mask_ix,SLICES(s,:)));
    end
end
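%STAT_DIST now contains the observed weighted distance between the coverage
%profiles of the two samples for every mask and every coverage slice.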
SUM_SLICES=sum(SLICES,2);
STAT_DIST(isnan(STAT_DIST))=0;
all_reads_trans_slices=cell(nr_of_slices,1);
for s=nr_of_slices:(-1):1
    all_reads_trans_slices{s}=all_reads_trans(SLICES(s,:),:);
end

for i = 1:bootstraps
    % permutation of the reads
    read_per = randperm(N);

    %Perform the computation for each region where the variances are matched
    for s=nr_of_slices:(-1):1
        if SUM_SLICES(s)==0
            continue;
        end
        %Create random samples 1 and 2
        sample1 = sum(all_reads_trans_slices{s}(:,read_per(1:N1_arr(s))),2);
        sample2 = sum(all_reads_trans_slices{s}(:,read_per((N1_arr(s)+1):(N1_arr(s)+N2_arr(s)))),2);

        W1=sample1/sum(sample1)*FACT_arr(s); %temporary variable to save time
        W2=sample2/sum(sample2)*FACT_arr(s); %temporary variable to save time

        for mask_ix=1:NR_OF_MASKS
            TEMP_DIST(mask_ix,s)=eucl_dist_weigthed(W1,W2,MASKS(mask_ix,SLICES(s,:))');
        end

    end

    %Make sure the normalisation does not introduce NaNs
    TEMP_DIST(isnan(TEMP_DIST))=0;

    COUNTER=COUNTER+1;
    %Compute the average over the different matching regions
    statistic(:,COUNTER)=mean(STAT_DIST,2);
    bootstrap_results(:,COUNTER)=mean(TEMP_DIST,2);
end
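%Each column of bootstrap_results holds the per-mask statistic of one random
%split of the pooled reads, while each column of statistic repeats the
%observed per-mask value, so the two can be compared column-wise below.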

bootstrap_results=bootstrap_results(:,1:COUNTER);
statistic=statistic(:,1:COUNTER);

pval=double(sum(bootstrap_results>=statistic,2)) / COUNTER;

info = {bootstrap_results,statistic,pval};
pval=min(pval)*10;
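%For each mask, pval is the fraction of permutations whose statistic is at
%least as large as the observed one; the returned value is the minimum over
%the masks, scaled by 10 (presumably to correct for taking that minimum).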

function result = eucl_dist_weigthed(A,B,W)
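%Weighted Euclidean distance between the vectors A and B with per-position
%weights W: sqrt(sum(W.*(A-B).^2)).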

result = sqrt(sum( W.*((A - B) .^ 2) ));