Mercurial > repos > vipints > rdiff
comparison rDiff/src/get_read_counts.m @ 0:0f80a5141704
version 0.3 uploaded
author | vipints |
---|---|
date | Thu, 14 Feb 2013 23:38:36 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:0f80a5141704 |
---|---|
1 function []=get_read_counts(CFG,genes) | |
2 | |
3 if CFG.use_rproc | |
4 JB_NR=1; | |
5 JOB_INFO = rproc_empty(); | |
6 end | |
7 | |
8 %%% Get the read counts | |
9 if CFG.estimate_gene_expression==1 | |
10 for RUN=1:size(CFG.BAM_FILES,2) | |
11 % configuration | |
12 CFG.curr_bamfile = CFG.BAM_FILES{RUN}; | |
13 if not(CFG.use_rproc) | |
14 fprintf('Getting gene expression for: %s\n', CFG.curr_bamfile); | |
15 end | |
16 tic | |
17 %define the splits of the genes for the jobs | |
18 idx=[(1:size(genes,2))',ceil((1:size(genes,2))*CFG.rproc_num_jobs/size(genes,2))']; | |
19 % submit jobs to cluster | |
20 for i = 1:CFG.rproc_num_jobs | |
21 PAR.genes = genes(idx(idx(:,2)==i,1)); | |
22 CFG.rproc_memreq = 5000; | |
23 CFG.rproc_par.mem_req_resubmit = [10000 20000 32000]; | |
24 CFG.rproc_par.identifier = sprintf('Exp.%i-',i); | |
25 CFG.outfile_prefix=[CFG.out_base_temp CFG.NAMES{RUN} '_' num2str(i) '_of_' num2str(CFG.rproc_num_jobs) '.mat']; | |
26 PAR.CFG=CFG; | |
27 if CFG.use_rproc | |
28 fprintf(1, 'Submitting job %i to cluster\n',i); | |
29 JOB_INFO(JB_NR) = rproc('get_reads_caller', PAR,CFG.rproc_memreq, CFG.rproc_par, CFG.rproc_time); | |
30 JB_NR=JB_NR+1; | |
31 else | |
32 get_reads_caller(PAR); | |
33 end | |
34 end | |
35 toc | |
36 end | |
37 if CFG.use_rproc | |
38 [JOB_INFO num_crashed] = rproc_wait(JOB_INFO, 60, 1, -1); | |
39 end | |
40 end | |
41 | |
42 %%% Generate the output files | |
43 %load the count files | |
44 %load the count files | |
45 READS_PER_GENE=zeros(size(genes,2),size(CFG.BAM_FILES,2)); | |
46 GENE_EXPR=zeros(size(genes,2),size(CFG.BAM_FILES,2)); | |
47 | |
48 Counts_rDiff_parametric=cell(size(genes,2),size(CFG.BAM_FILES,2)); | |
49 Counts_rDiff_nonparametric=cell(size(genes,2),size(CFG.BAM_FILES,2)); | |
50 | |
51 | |
52 %Field containing the errors | |
53 ERRORS_NR=[]; | |
54 idx=[(1:size(genes,2))',ceil((1:size(genes,2))*CFG.rproc_num_jobs/size(genes,2))']; | |
55 % Iterate over the result files to load the data from the count files | |
56 for RUN=1:size(CFG.BAM_FILES,2) | |
57 for j = 1:CFG.rproc_num_jobs | |
58 IN_FILENAME=[CFG.out_base_temp CFG.NAMES{RUN} '_' num2str(j) '_of_' num2str(CFG.rproc_num_jobs) '.mat']; | |
59 IDX=idx(idx(:,2)==j,1); | |
60 try | |
61 load(IN_FILENAME) | |
62 for k=1:length(IDX) | |
63 %Check wether COUNTS is empty | |
64 if isempty(COUNTS{k}) | |
65 continue | |
66 end | |
67 %Get the number of reads mapping to a gene | |
68 if not(isempty(COUNTS{k}{2})) | |
69 READS_PER_GENE(IDX(k),RUN)=COUNTS{k}{2}; | |
70 end | |
71 | |
72 %get the number of nonalternative reads if | |
73 %possible. Otherwise use the number of reads | |
74 %mapping to a gene as gene expression | |
75 if not(isempty(COUNTS{k}{3})) | |
76 GENE_EXPE(IDX(k),RUN)=COUNTS{k}{3}; | |
77 else | |
78 GENE_EXPE(IDX(k),RUN)=READS_PER_GENE(IDX(k),RUN); | |
79 end | |
80 | |
81 %get the Counts for rDiff.parametric | |
82 if not(isempty(COUNTS{k}{6})) | |
83 Counts_rDiff_parametric{IDX(k),RUN}=COUNTS{k}{6}; | |
84 end | |
85 | |
86 %get the counts for rDiff.nonparametric | |
87 if not(isempty(COUNTS{k}{4})) | |
88 Counts_rDiff_nonparametric{IDX(k),RUN}=COUNTS{k}{4}; | |
89 end | |
90 end | |
91 catch | |
92 warning(['There was a problem loading: ' IN_FILENAME ]) | |
93 % If something went wrong | |
94 for k=1:length(IDX) | |
95 READS_PER_GENE(IDX(k),RUN)=0; | |
96 GENE_EXPR(IDX(k),RUN)=0; | |
97 Counts_rDiff_parametric{IDX(k),RUN}={}; | |
98 Counts_rDiff_nonparametric{IDX(k),RUN}={}; | |
99 end | |
100 ERRORS_NR=[ERRORS_NR; [RUN,i]]; | |
101 end | |
102 end | |
103 end | |
104 if not(isempty(ERRORS_NR)) | |
105 warning('There have been problems loading some of the raw count files'); | |
106 end | |
107 | |
108 %If less than 10 reads use abulute number of reads | |
109 GENE_EXPR(GENE_EXPR<10)=READS_PER_GENE(GENE_EXPR<10); | |
110 | |
111 %Generate gene expression tables | |
112 | |
113 %Open file handler for the gene expression table | |
114 EXPR_TAB_FILENAME=[CFG.out_base 'Gene_expression.tab']; | |
115 fid=fopen(EXPR_TAB_FILENAME,'w'); | |
116 | |
117 %print header | |
118 fprintf(fid,'gene'); | |
119 for i=1:length(CFG.NAMES) | |
120 fprintf(fid,'\t%s',CFG.NAMES{i}); | |
121 end | |
122 fprintf(fid,'\n'); | |
123 | |
124 for j=1:size(genes,2) | |
125 fprintf(fid,'%s',genes(j).name); | |
126 for i=1:length(CFG.NAMES) | |
127 fprintf(fid,'\t%i',GENE_EXPR(j,i)); | |
128 end | |
129 fprintf(fid,'\n'); | |
130 end | |
131 | |
132 fclose(fid) | |
133 | |
134 %Determine interpreter | |
135 if size(ver('Octave'),1) | |
136 INTERPR = 1; | |
137 else | |
138 INTERPR = 0; | |
139 end | |
140 | |
141 | |
142 %Save alternative region count file for rDiff.parametric | |
143 OUT_FILENAME=[CFG.out_base 'Alternative_region_counts.mat']; | |
144 if INTERPR | |
145 save('-mat7-binary',OUT_FILENAME,'Counts_rDiff_parametric') | |
146 else | |
147 save(OUT_FILENAME,'Counts_rDiff_parametric','-v7.3') | |
148 end | |
149 | |
150 %Save alternative region count file for rDiff.nonparametric | |
151 OUT_FILENAME=[CFG.out_base 'Nonparametric_region_counts.mat']; | |
152 if INTERPR | |
153 save('-mat7-binary',OUT_FILENAME,'Counts_rDiff_nonparametric') | |
154 else | |
155 save(OUT_FILENAME,'Counts_rDiff_nonparametric','-v7.3') | |
156 end | |
157 | |
158 return |