0
|
1 function [COUNTS]=get_reads_caller(PAR)
|
|
2
|
|
3 CFG = PAR.CFG;
|
|
4 genes = PAR.genes;
|
|
5 clear PAR;
|
|
6
|
|
7 % add paths
|
|
8 addpath(CFG.paths);
|
|
9
|
|
10 data_dir=CFG.data_dir;
|
|
11 OUT_STR=[];
|
|
12
|
|
13 COUNTS=cell(size(genes,2),1);
|
|
14
|
|
15
|
|
16 % NON_PARAM_SAMPLE contains the read start density
|
|
17 if CFG.perform_nonparametric
|
|
18 NON_PARAM_SAMPLE=sparse([],[],[],10000,1,5000);
|
|
19 end
|
|
20
|
|
21
|
|
22 for i=1:size(genes,2)
|
|
23
|
|
24
|
|
25
|
|
26 %TEMP_COUNT contins the counts for the current gene
|
|
27 TEMP_COUNT=cell(1,7);
|
|
28 gene = genes(i);
|
|
29
|
|
30 %set default return values
|
|
31 TEMP_COUNT{1}=gene.name;
|
|
32 TEMP_COUNT{2}=[];
|
|
33 TEMP_COUNT{3}=[];
|
|
34 TEMP_COUNT{4}=[];
|
|
35 TEMP_COUNT{5}=[];
|
|
36 TEMP_COUNT{6}=[];
|
|
37
|
|
38 %check that the gene has exons defined
|
|
39 if isempty(gene.exons)
|
|
40 STAT{i}=TEMP_COUNT;
|
|
41 continue;
|
|
42 end
|
|
43
|
|
44 %check that the gene is longer than the Reads. Otherwise the
|
|
45 %definition of regions does not makes sense
|
|
46 if gene.stop-gene.start<CFG.sequenced_length+3
|
|
47 STAT{i}=TEMP_COUNT;
|
|
48 continue;
|
|
49 end
|
|
50
|
|
51 %Get the reads from gene
|
|
52 [reads] = get_reads_for_gene(CFG,gene);
|
|
53
|
|
54
|
|
55 %Get total number of reads
|
|
56 NR_OF_READS=size(reads,1);
|
|
57 TEMP_COUNT{2}=NR_OF_READS;
|
|
58
|
|
59
|
|
60 NR_OF_TRANS=size(gene.transcripts,2);
|
|
61 %check that the gene has more than one isoform
|
|
62 if NR_OF_TRANS<=1
|
|
63 STAT{i}=TEMP_COUNT;
|
|
64 continue;
|
|
65 end
|
|
66
|
|
67
|
|
68 EXON_IDX=sum(gene.exonsequence,1)<NR_OF_TRANS;
|
|
69
|
|
70 %Transform the reads in to counts per region
|
|
71 [NEW_READS,UNEXPLAINED_READS,UNEXPLAINED_INDEX]= convert_reads_to_region_indicators(reads,gene);
|
|
72
|
|
73 if length(unique(sum(NEW_READS,2)))>1
|
|
74 warning(['Assignment of reads to regions is not unique for gene:' gene.name '\n']);
|
|
75 end
|
|
76
|
|
77
|
|
78 %Calulate gene expression
|
|
79 %Get the non_unique_regions
|
|
80 NON_ALT_EIRS=sum(gene.eirs_in_seq,1)==NR_OF_TRANS;
|
|
81 TEMP_COUNT{3}=sum(sum(NEW_READS(:,NON_ALT_EIRS),2)>0);
|
|
82
|
|
83
|
|
84 %Get Counts for nonparametric variance function
|
|
85 if CFG.perform_nonparametric
|
|
86 %Get the read starts
|
|
87 [TEMP,START]=max(reads,[],2);
|
|
88 read_starts=sparse((1:NR_OF_READS)',START,ones(NR_OF_READS,1),NR_OF_READS,size(reads,2),NR_OF_READS);
|
|
89
|
|
90 %Get the index of the alternative reads
|
|
91 ALT_READ_IX=zeros(size(reads,1),1);
|
|
92 ALT_EIRS=and(sum(gene.eirs_in_seq,1)<NR_OF_TRANS,sum(gene.eirs_in_seq,1)>0);
|
|
93 ALT_READ_IX(UNEXPLAINED_INDEX==0)=sum(NEW_READS(:,ALT_EIRS),2)>0;
|
|
94
|
|
95 %Get the coverage of the read starts
|
|
96 %COVERAGE=sum(read_starts(find(ALT_READ_IX>0),:),1);
|
|
97 if CFG.only_gene_start
|
|
98 COVERAGE=sum(reads,1);
|
|
99 else
|
|
100 COVERAGE=sum(reads(find(ALT_READ_IX>0),:),1);
|
|
101 end
|
|
102 if max(COVERAGE)>0
|
|
103 TEMP_COUNT{4}=COVERAGE;
|
|
104 else
|
|
105 TEMP_COUNT{4}=[];
|
|
106 end
|
|
107 end
|
|
108
|
|
109
|
|
110 %Get counts for parametric settting
|
|
111 if or(CFG.perform_parametric,CFG.perform_poisson)
|
|
112 %Get the alternative reads
|
|
113 ALT_EIRS=and(sum(gene.eirs_in_seq,1)<NR_OF_TRANS,sum(gene.eirs_in_seq,1)>0);
|
|
114
|
|
115 %Return the Counts in the alternative EIRS
|
|
116
|
|
117 COUNTS_PER_EIR=sum(NEW_READS(:,ALT_EIRS),1);
|
|
118 EXS_SEQ=gene.eirs_in_seq(:,ALT_EIRS);
|
|
119 [NEWCOLS,IDX2,POS]=unique(EXS_SEQ','rows');
|
|
120 NEWCOLS=NEWCOLS';
|
|
121 EIR_COUNTS=zeros(1,length(IDX2));
|
|
122 for j=1:max(POS)
|
|
123 EIR_COUNTS(j)=sum(COUNTS_PER_EIR(POS==j));
|
|
124 end
|
|
125 TEMP_COUNT{6}=EIR_COUNTS;
|
|
126 end
|
|
127
|
|
128 clear NEW_READS
|
|
129 clear reads;
|
|
130 COUNTS{i}=TEMP_COUNT;
|
|
131
|
|
132 OLD_OUT_STR=OUT_STR;
|
|
133 OUT_STR=['Finished ' num2str(i) ' out of ' num2str(size(genes,2)) ' genes'];
|
|
134 %print progress
|
|
135 if CFG.use_rproc
|
|
136 fprintf([OUT_STR '\n'])
|
|
137 else
|
|
138 % Erase old progress
|
|
139 fprintf(repmat('\b',1,length(OLD_OUT_STR)));
|
|
140 fprintf([OUT_STR])
|
|
141 end
|
|
142
|
|
143
|
|
144 end
|
|
145 fprintf('\n')
|
|
146 %Save the counts
|
|
147 OUT_FILENAME=[CFG.outfile_prefix];
|
|
148 save(OUT_FILENAME,'COUNTS')
|
|
149 %Save the nonparametric histogram
|
|
150
|