0
|
1 function []=get_read_counts(CFG,genes)
|
|
2
|
|
3 if CFG.use_rproc
|
|
4 JB_NR=1;
|
|
5 JOB_INFO = rproc_empty();
|
|
6 end
|
|
7
|
|
8 %%% Get the read counts
|
|
9 if CFG.estimate_gene_expression==1
|
|
10 for RUN=1:size(CFG.BAM_FILES,2)
|
|
11 % configuration
|
|
12 CFG.curr_bamfile = CFG.BAM_FILES{RUN};
|
|
13 if not(CFG.use_rproc)
|
|
14 fprintf('Getting gene expression for: %s\n', CFG.curr_bamfile);
|
|
15 end
|
|
16 tic
|
|
17 %define the splits of the genes for the jobs
|
|
18 idx=[(1:size(genes,2))',ceil((1:size(genes,2))*CFG.rproc_num_jobs/size(genes,2))'];
|
|
19 % submit jobs to cluster
|
|
20 for i = 1:CFG.rproc_num_jobs
|
|
21 PAR.genes = genes(idx(idx(:,2)==i,1));
|
|
22 CFG.rproc_memreq = 5000;
|
|
23 CFG.rproc_par.mem_req_resubmit = [10000 20000 32000];
|
|
24 CFG.rproc_par.identifier = sprintf('Exp.%i-',i);
|
|
25 CFG.outfile_prefix=[CFG.out_base_temp CFG.NAMES{RUN} '_' num2str(i) '_of_' num2str(CFG.rproc_num_jobs) '.mat'];
|
|
26 PAR.CFG=CFG;
|
|
27 if CFG.use_rproc
|
|
28 fprintf(1, 'Submitting job %i to cluster\n',i);
|
|
29 JOB_INFO(JB_NR) = rproc('get_reads_caller', PAR,CFG.rproc_memreq, CFG.rproc_par, CFG.rproc_time);
|
|
30 JB_NR=JB_NR+1;
|
|
31 else
|
|
32 get_reads_caller(PAR);
|
|
33 end
|
|
34 end
|
|
35 toc
|
|
36 end
|
|
37 if CFG.use_rproc
|
|
38 [JOB_INFO num_crashed] = rproc_wait(JOB_INFO, 60, 1, -1);
|
|
39 end
|
|
40 end
|
|
41
|
|
42 %%% Generate the output files
|
|
43 %load the count files
|
|
44 %load the count files
|
|
45 READS_PER_GENE=zeros(size(genes,2),size(CFG.BAM_FILES,2));
|
|
46 GENE_EXPR=zeros(size(genes,2),size(CFG.BAM_FILES,2));
|
|
47
|
|
48 Counts_rDiff_parametric=cell(size(genes,2),size(CFG.BAM_FILES,2));
|
|
49 Counts_rDiff_nonparametric=cell(size(genes,2),size(CFG.BAM_FILES,2));
|
|
50
|
|
51
|
|
52 %Field containing the errors
|
|
53 ERRORS_NR=[];
|
|
54 idx=[(1:size(genes,2))',ceil((1:size(genes,2))*CFG.rproc_num_jobs/size(genes,2))'];
|
|
55 % Iterate over the result files to load the data from the count files
|
|
56 for RUN=1:size(CFG.BAM_FILES,2)
|
|
57 for j = 1:CFG.rproc_num_jobs
|
|
58 IN_FILENAME=[CFG.out_base_temp CFG.NAMES{RUN} '_' num2str(j) '_of_' num2str(CFG.rproc_num_jobs) '.mat'];
|
|
59 IDX=idx(idx(:,2)==j,1);
|
|
60 try
|
|
61 load(IN_FILENAME)
|
|
62 for k=1:length(IDX)
|
|
63 %Check wether COUNTS is empty
|
|
64 if isempty(COUNTS{k})
|
|
65 continue
|
|
66 end
|
|
67 %Get the number of reads mapping to a gene
|
|
68 if not(isempty(COUNTS{k}{2}))
|
|
69 READS_PER_GENE(IDX(k),RUN)=COUNTS{k}{2};
|
|
70 end
|
|
71
|
|
72 %get the number of nonalternative reads if
|
|
73 %possible. Otherwise use the number of reads
|
|
74 %mapping to a gene as gene expression
|
|
75 if not(isempty(COUNTS{k}{3}))
|
|
76 GENE_EXPE(IDX(k),RUN)=COUNTS{k}{3};
|
|
77 else
|
|
78 GENE_EXPE(IDX(k),RUN)=READS_PER_GENE(IDX(k),RUN);
|
|
79 end
|
|
80
|
|
81 %get the Counts for rDiff.parametric
|
|
82 if not(isempty(COUNTS{k}{6}))
|
|
83 Counts_rDiff_parametric{IDX(k),RUN}=COUNTS{k}{6};
|
|
84 end
|
|
85
|
|
86 %get the counts for rDiff.nonparametric
|
|
87 if not(isempty(COUNTS{k}{4}))
|
|
88 Counts_rDiff_nonparametric{IDX(k),RUN}=COUNTS{k}{4};
|
|
89 end
|
|
90 end
|
|
91 catch
|
|
92 warning(['There was a problem loading: ' IN_FILENAME ])
|
|
93 % If something went wrong
|
|
94 for k=1:length(IDX)
|
|
95 READS_PER_GENE(IDX(k),RUN)=0;
|
|
96 GENE_EXPR(IDX(k),RUN)=0;
|
|
97 Counts_rDiff_parametric{IDX(k),RUN}={};
|
|
98 Counts_rDiff_nonparametric{IDX(k),RUN}={};
|
|
99 end
|
|
100 ERRORS_NR=[ERRORS_NR; [RUN,i]];
|
|
101 end
|
|
102 end
|
|
103 end
|
|
104 if not(isempty(ERRORS_NR))
|
|
105 warning('There have been problems loading some of the raw count files');
|
|
106 end
|
|
107
|
|
108 %If less than 10 reads use abulute number of reads
|
|
109 GENE_EXPR(GENE_EXPR<10)=READS_PER_GENE(GENE_EXPR<10);
|
|
110
|
|
111 %Generate gene expression tables
|
|
112
|
|
113 %Open file handler for the gene expression table
|
|
114 EXPR_TAB_FILENAME=[CFG.out_base 'Gene_expression.tab'];
|
|
115 fid=fopen(EXPR_TAB_FILENAME,'w');
|
|
116
|
|
117 %print header
|
|
118 fprintf(fid,'gene');
|
|
119 for i=1:length(CFG.NAMES)
|
|
120 fprintf(fid,'\t%s',CFG.NAMES{i});
|
|
121 end
|
|
122 fprintf(fid,'\n');
|
|
123
|
|
124 for j=1:size(genes,2)
|
|
125 fprintf(fid,'%s',genes(j).name);
|
|
126 for i=1:length(CFG.NAMES)
|
|
127 fprintf(fid,'\t%i',GENE_EXPR(j,i));
|
|
128 end
|
|
129 fprintf(fid,'\n');
|
|
130 end
|
|
131
|
|
132 fclose(fid)
|
|
133
|
|
134 %Determine interpreter
|
|
135 if size(ver('Octave'),1)
|
|
136 INTERPR = 1;
|
|
137 else
|
|
138 INTERPR = 0;
|
|
139 end
|
|
140
|
|
141
|
|
142 %Save alternative region count file for rDiff.parametric
|
|
143 OUT_FILENAME=[CFG.out_base 'Alternative_region_counts.mat'];
|
|
144 if INTERPR
|
|
145 save('-mat7-binary',OUT_FILENAME,'Counts_rDiff_parametric')
|
|
146 else
|
|
147 save(OUT_FILENAME,'Counts_rDiff_parametric','-v7.3')
|
|
148 end
|
|
149
|
|
150 %Save alternative region count file for rDiff.nonparametric
|
|
151 OUT_FILENAME=[CFG.out_base 'Nonparametric_region_counts.mat'];
|
|
152 if INTERPR
|
|
153 save('-mat7-binary',OUT_FILENAME,'Counts_rDiff_nonparametric')
|
|
154 else
|
|
155 save(OUT_FILENAME,'Counts_rDiff_nonparametric','-v7.3')
|
|
156 end
|
|
157
|
|
158 return
|