Mercurial > repos > vipints > rdiff
comparison rDiff/src/tests/rDiff_nonparametric.m @ 0:0f80a5141704
version 0.3 uploaded
| author | vipints |
|---|---|
| date | Thu, 14 Feb 2013 23:38:36 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:0f80a5141704 |
|---|---|
| 1 function [pval,info] = rDiff_nonparametric(CFG,READS1,READS2,variance_function_nonparametric_1, variance_function_nonparametric_2) | |
| 2 | |
| 3 bootstraps=CFG.bootstraps; | |
| 4 | |
| 5 %Initizialize the reads | |
| 6 SUMS1=[]; %the readcoverage for sample 1 | |
| 7 SUMS2=[]; %the readcoverage for sample 2 | |
| 8 reads1=[]; %the total of reads in sample 1 | |
| 9 reads2=[]; %the total of reads in sample 2 | |
| 10 NON_EMPTY=[]; | |
| 11 for i=1:length(READS1) | |
| 12 if not(isempty(READS1{i})) | |
| 13 NON_EMPTY=[NON_EMPTY,i]; | |
| 14 end | |
| 15 end | |
| 16 READS1={READS1{ NON_EMPTY}}; | |
| 17 NON_EMPTY=[]; | |
| 18 for i=1:length(READS2) | |
| 19 if not(isempty(READS2{i})) | |
| 20 NON_EMPTY=[NON_EMPTY,i]; | |
| 21 end | |
| 22 end | |
| 23 READS2={READS2{NON_EMPTY}}; | |
| 24 for i=1:length(READS1) | |
| 25 SUMS1=[SUMS1;sum(READS1{i},1)]; | |
| 26 reads1=[reads1;READS1{i}]; | |
| 27 end | |
| 28 for i=1:length(READS2) | |
| 29 SUMS2=[SUMS2;sum(READS2{i},1)]; | |
| 30 reads2=[reads2;READS2{i}]; | |
| 31 end | |
| 32 | |
| 33 if size(reads1,1)==0 || size(reads2,1)==0 | |
| 34 pval=1; | |
| 35 info=1; | |
| 36 bootstrap_results=1; | |
| 37 statistic=1; | |
| 38 return | |
| 39 end | |
| 40 | |
| 41 | |
| 42 %Get masks for the highly covered positions | |
| 43 [MASKS]=get_nonparametric_masks(CFG,reads1,reads2); | |
| 44 | |
| 45 | |
| 46 %Determine number of masks | |
| 47 NR_OF_MASKS=size(MASKS,1); | |
| 48 | |
| 49 % Predict variance | |
| 50 VARIANCE1=(1e-8)*ones(size(reads1,2),1); | |
| 51 for i=1:length(READS1) | |
| 52 TEMP_VARIANCE1=predict_variance(sum(READS1{i},1)',variance_function_nonparametric_1); | |
| 53 TEMP_VARIANCE1(isnan(TEMP_VARIANCE1))=0; | |
| 54 TEMP_VARIANCE1(isinf(TEMP_VARIANCE1))=0; | |
| 55 VARIANCE1=VARIANCE1+TEMP_VARIANCE1; | |
| 56 end | |
| 57 | |
| 58 VARIANCE2=(1e-8)*ones(size(reads1,2),1); | |
| 59 for i=1:length(READS2) | |
| 60 TEMP_VARIANCE2=predict_variance(sum(READS2{i},1)',variance_function_nonparametric_2); | |
| 61 TEMP_VARIANCE2(isnan(TEMP_VARIANCE2))=0; | |
| 62 TEMP_VARIANCE2(isinf(TEMP_VARIANCE2))=0; | |
| 63 VARIANCE2=VARIANCE2+TEMP_VARIANCE2; | |
| 64 end | |
| 65 | |
| 66 %Get the mean variance | |
| 67 VARIANCE1=VARIANCE1/length(READS1); | |
| 68 VARIANCE2=VARIANCE2/length(READS2); | |
| 69 | |
| 70 %Concatenation of all reads | |
| 71 allreads = [reads1;reads2]; | |
| 72 | |
| 73 | |
| 74 %Determine the subsampling rate | |
| 75 p=sum(allreads,1)/size(allreads,1); | |
| 76 R=size(reads1,1); | |
| 77 c=(p.*(1-p))/(size(allreads,1)-1); | |
| 78 N1s=size(allreads,1)*c./(c+(VARIANCE1'/(R^2))); | |
| 79 | |
| 80 p=sum(allreads,1)/size(allreads,1); | |
| 81 R=size(reads2,1); | |
| 82 c=(p.*(1-p))/(size(allreads,1)-1); | |
| 83 N2s=size(allreads,1)*c./(c+(VARIANCE2'/(R^2))); | |
| 84 | |
| 85 %Round to get integer values | |
| 86 N1s=ceil(full(N1s)); | |
| 87 N2s=ceil(full(N2s)); | |
| 88 | |
| 89 | |
| 90 %Total number of reads in each replicate | |
| 91 N1 = size(reads1,1); | |
| 92 N2 = size(reads2,1); | |
| 93 N = N1 + N2; | |
| 94 | |
| 95 | |
| 96 bootstrap_results=ones(NR_OF_MASKS,bootstraps); | |
| 97 COUNTER=0; | |
| 98 R1s=[]; | |
| 99 R2s=[]; | |
| 100 statistic=ones(NR_OF_MASKS,bootstraps); | |
| 101 nr_of_slices=CFG.nr_of_slices; | |
| 102 | |
| 103 TOTAL_SUM1=sum(reads1,1); | |
| 104 TOTAL_SUM2=sum(reads2,1); | |
| 105 TOTAL_SUM=TOTAL_SUM1+TOTAL_SUM2; | |
| 106 | |
| 107 clear reads1 | |
| 108 clear reads2 | |
| 109 %Use the transpose to make the selection of columms faster | |
| 110 all_reads_trans=allreads'; | |
| 111 clear allreads | |
| 112 if length(unique(TOTAL_SUM))<nr_of_slices+5 | |
| 113 pval=1; | |
| 114 bootstrap_results=[]; | |
| 115 statistic=[]; | |
| 116 pval=1; | |
| 117 info = 1; | |
| 118 return | |
| 119 end | |
| 120 | |
| 121 [SUM_SORTED,SUM_POS]=sort(TOTAL_SUM); | |
| 122 NR_OF_ZEROS=sum(TOTAL_SUM==0); | |
| 123 | |
| 124 %precompute this sum once | |
| 125 all_reads_trans_row_sum=sum(all_reads_trans,1); | |
| 126 | |
| 127 %These field contain | |
| 128 STAT_DIST=zeros(NR_OF_MASKS,nr_of_slices); | |
| 129 TEMP_DIST=zeros(1,nr_of_slices); | |
| 130 | |
| 131 %Precompute normalizing constants | |
| 132 SUM_TOTAL_SUM1=sum(TOTAL_SUM1); | |
| 133 SUM_TOTAL_SUM2=sum(TOTAL_SUM2); | |
| 134 SUM_SUM_TOTAL_SUM=SUM_TOTAL_SUM1+SUM_TOTAL_SUM2; | |
| 135 | |
| 136 %Precompute the some of the values or the slices | |
| 137 SLICES=zeros(nr_of_slices,size(TOTAL_SUM,2))==1; | |
| 138 N1_arr=zeros(nr_of_slices,1); | |
| 139 N2_arr=zeros(nr_of_slices,1); | |
| 140 FACT_arr=zeros(nr_of_slices,1); | |
| 141 V1_cell=cell(nr_of_slices,1); | |
| 142 V2_cell=cell(nr_of_slices,1); | |
| 143 | |
| 144 %Detemine regions wher to match the variances | |
| 145 for s=nr_of_slices:(-1):1 | |
| 146 LOWER_POS=min(NR_OF_ZEROS+ceil(((nr_of_slices-s)/nr_of_slices)*(length(TOTAL_SUM)-NR_OF_ZEROS))+1,length(TOTAL_SUM)); | |
| 147 UPPER_POS=min(NR_OF_ZEROS+ceil(((nr_of_slices-s+1)/nr_of_slices)*(length(TOTAL_SUM)-NR_OF_ZEROS))+1,length(TOTAL_SUM)); | |
| 148 | |
| 149 SLICE_LOWER_BOUND=SUM_SORTED(LOWER_POS); | |
| 150 SLICE_UPPER_BOUND=SUM_SORTED(UPPER_POS); | |
| 151 if s==nr_of_slices | |
| 152 SLICE=and(TOTAL_SUM>=SLICE_LOWER_BOUND,TOTAL_SUM<=SLICE_UPPER_BOUND); | |
| 153 else | |
| 154 SLICE=and(TOTAL_SUM>SLICE_LOWER_BOUND,TOTAL_SUM<=SLICE_UPPER_BOUND); | |
| 155 end | |
| 156 SLICES(s,:)=SLICE; | |
| 157 | |
| 158 N1s_temp=ceil(median(N1s(SLICE))); | |
| 159 N2s_temp=ceil(median(N2s(SLICE))); | |
| 160 N1s_temp=min(N1s_temp,N1); | |
| 161 N2s_temp=min(N2s_temp,N2); | |
| 162 | |
| 163 N1_arr(s)=N1s_temp; | |
| 164 N2_arr(s)=N2s_temp; | |
| 165 | |
| 166 FACT_arr(s)=sum(TOTAL_SUM1(SLICE)+TOTAL_SUM2(SLICE))/(SUM_SUM_TOTAL_SUM); | |
| 167 | |
| 168 V1_cell{s}=TOTAL_SUM1(SLICE)/SUM_TOTAL_SUM1;%temporary variable to safe time | |
| 169 V2_cell{s}=TOTAL_SUM2(SLICE)/SUM_TOTAL_SUM2;%temporary variable to safe time | |
| 170 for mask_ix=1:NR_OF_MASKS | |
| 171 STAT_DIST(mask_ix,s)=eucl_dist_weigthed(V1_cell{s},V2_cell{s},MASKS(mask_ix,SLICES(s,:))); | |
| 172 end | |
| 173 end | |
| 174 SUM_SLICES=sum(SLICES,2); | |
| 175 STAT_DIST(isnan(TEMP_DIST))=0; | |
| 176 all_reads_trans_slices=cell(nr_of_slices,1); | |
| 177 for s=nr_of_slices:(-1):1 | |
| 178 all_reads_trans_slices{s}=all_reads_trans(SLICES(s,:),:); | |
| 179 end | |
| 180 | |
| 181 for i = 1:bootstraps | |
| 182 % permutation of the reads | |
| 183 read_per = randperm(N); | |
| 184 | |
| 185 %Peform the computation for each region where the variances are matched | |
| 186 for s=nr_of_slices:(-1):1 | |
| 187 if SUM_SLICES(s)==0; | |
| 188 continue; | |
| 189 end | |
| 190 %Create random samples 1 and 2 | |
| 191 sample1 = sum(all_reads_trans_slices{s}(:,read_per(1:N1_arr(s))),2); | |
| 192 sample2 = sum(all_reads_trans_slices{s}(:,read_per((N1_arr(s)+1):(N1_arr(s)+N2_arr(s)))),2); | |
| 193 | |
| 194 W1=sample1/sum(sample1)*FACT_arr(s);%temporary variable to safe time | |
| 195 W2=sample2/sum(sample2)*FACT_arr(s);%temporary variable to safe time | |
| 196 | |
| 197 for mask_ix=1:NR_OF_MASKS | |
| 198 TEMP_DIST(mask_ix,s)=eucl_dist_weigthed(W1,W2,MASKS(mask_ix,SLICES(s,:))'); | |
| 199 end | |
| 200 | |
| 201 end | |
| 202 | |
| 203 %make sure the normalisation doe not intruduces nan's | |
| 204 TEMP_DIST(isnan(TEMP_DIST))=0; | |
| 205 | |
| 206 COUNTER=COUNTER+1; | |
| 207 %Compute the average from the different matching regionon | |
| 208 statistic(:,COUNTER)=mean(STAT_DIST,2); | |
| 209 bootstrap_results(:,COUNTER)=mean(TEMP_DIST,2); | |
| 210 end | |
| 211 | |
| 212 bootstrap_results=bootstrap_results(:,1:COUNTER); | |
| 213 statistic=statistic(:,1:COUNTER); | |
| 214 | |
| 215 pval=double(sum(bootstrap_results>=statistic,2)) / COUNTER; | |
| 216 | |
| 217 info = {bootstrap_results,statistic,pval}; | |
| 218 pval=min(pval)*10; | |
| 219 | |
| 220 function result = eucl_dist_weigthed(A,B,W) | |
| 221 | |
| 222 result = sqrt(sum( W.*((A - B) .^ 2) )); |
