comparison rDiff/src/get_reads_caller.m @ 0:0f80a5141704

version 0.3 uploaded
author vipints
date Thu, 14 Feb 2013 23:38:36 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:0f80a5141704
1 function [COUNTS]=get_reads_caller(PAR)
2
3 CFG = PAR.CFG;
4 genes = PAR.genes;
5 clear PAR;
6
7 % add paths
8 addpath(CFG.paths);
9
10 data_dir=CFG.data_dir;
11 OUT_STR=[];
12
13 COUNTS=cell(size(genes,2),1);
14
15
16 % NON_PARAM_SAMPLE contains the read start density
17 if CFG.perform_nonparametric
18 NON_PARAM_SAMPLE=sparse([],[],[],10000,1,5000);
19 end
20
21
22 for i=1:size(genes,2)
23
24
25
26 %TEMP_COUNT contins the counts for the current gene
27 TEMP_COUNT=cell(1,7);
28 gene = genes(i);
29
30 %set default return values
31 TEMP_COUNT{1}=gene.name;
32 TEMP_COUNT{2}=[];
33 TEMP_COUNT{3}=[];
34 TEMP_COUNT{4}=[];
35 TEMP_COUNT{5}=[];
36 TEMP_COUNT{6}=[];
37
38 %check that the gene has exons defined
39 if isempty(gene.exons)
40 STAT{i}=TEMP_COUNT;
41 continue;
42 end
43
44 %check that the gene is longer than the Reads. Otherwise the
45 %definition of regions does not makes sense
46 if gene.stop-gene.start<CFG.sequenced_length+3
47 STAT{i}=TEMP_COUNT;
48 continue;
49 end
50
51 %Get the reads from gene
52 [reads] = get_reads_for_gene(CFG,gene);
53
54
55 %Get total number of reads
56 NR_OF_READS=size(reads,1);
57 TEMP_COUNT{2}=NR_OF_READS;
58
59
60 NR_OF_TRANS=size(gene.transcripts,2);
61 %check that the gene has more than one isoform
62 if NR_OF_TRANS<=1
63 STAT{i}=TEMP_COUNT;
64 continue;
65 end
66
67
68 EXON_IDX=sum(gene.exonsequence,1)<NR_OF_TRANS;
69
70 %Transform the reads in to counts per region
71 [NEW_READS,UNEXPLAINED_READS,UNEXPLAINED_INDEX]= convert_reads_to_region_indicators(reads,gene);
72
73 if length(unique(sum(NEW_READS,2)))>1
74 warning(['Assignment of reads to regions is not unique for gene:' gene.name '\n']);
75 end
76
77
78 %Calulate gene expression
79 %Get the non_unique_regions
80 NON_ALT_EIRS=sum(gene.eirs_in_seq,1)==NR_OF_TRANS;
81 TEMP_COUNT{3}=sum(sum(NEW_READS(:,NON_ALT_EIRS),2)>0);
82
83
84 %Get Counts for nonparametric variance function
85 if CFG.perform_nonparametric
86 %Get the read starts
87 [TEMP,START]=max(reads,[],2);
88 read_starts=sparse((1:NR_OF_READS)',START,ones(NR_OF_READS,1),NR_OF_READS,size(reads,2),NR_OF_READS);
89
90 %Get the index of the alternative reads
91 ALT_READ_IX=zeros(size(reads,1),1);
92 ALT_EIRS=and(sum(gene.eirs_in_seq,1)<NR_OF_TRANS,sum(gene.eirs_in_seq,1)>0);
93 ALT_READ_IX(UNEXPLAINED_INDEX==0)=sum(NEW_READS(:,ALT_EIRS),2)>0;
94
95 %Get the coverage of the read starts
96 %COVERAGE=sum(read_starts(find(ALT_READ_IX>0),:),1);
97 if CFG.only_gene_start
98 COVERAGE=sum(reads,1);
99 else
100 COVERAGE=sum(reads(find(ALT_READ_IX>0),:),1);
101 end
102 if max(COVERAGE)>0
103 TEMP_COUNT{4}=COVERAGE;
104 else
105 TEMP_COUNT{4}=[];
106 end
107 end
108
109
110 %Get counts for parametric settting
111 if or(CFG.perform_parametric,CFG.perform_poisson)
112 %Get the alternative reads
113 ALT_EIRS=and(sum(gene.eirs_in_seq,1)<NR_OF_TRANS,sum(gene.eirs_in_seq,1)>0);
114
115 %Return the Counts in the alternative EIRS
116
117 COUNTS_PER_EIR=sum(NEW_READS(:,ALT_EIRS),1);
118 EXS_SEQ=gene.eirs_in_seq(:,ALT_EIRS);
119 [NEWCOLS,IDX2,POS]=unique(EXS_SEQ','rows');
120 NEWCOLS=NEWCOLS';
121 EIR_COUNTS=zeros(1,length(IDX2));
122 for j=1:max(POS)
123 EIR_COUNTS(j)=sum(COUNTS_PER_EIR(POS==j));
124 end
125 TEMP_COUNT{6}=EIR_COUNTS;
126 end
127
128 clear NEW_READS
129 clear reads;
130 COUNTS{i}=TEMP_COUNT;
131
132 OLD_OUT_STR=OUT_STR;
133 OUT_STR=['Finished ' num2str(i) ' out of ' num2str(size(genes,2)) ' genes'];
134 %print progress
135 if CFG.use_rproc
136 fprintf([OUT_STR '\n'])
137 else
138 % Erase old progress
139 fprintf(repmat('\b',1,length(OLD_OUT_STR)));
140 fprintf([OUT_STR])
141 end
142
143
144 end
145 fprintf('\n')
146 %Save the counts
147 OUT_FILENAME=[CFG.outfile_prefix];
148 save(OUT_FILENAME,'COUNTS')
149 %Save the nonparametric histogram
150