Mercurial > repos > vipints > rdiff
diff rDiff/src/get_read_counts.m @ 0:0f80a5141704
version 0.3 uploaded
author | vipints |
---|---|
date | Thu, 14 Feb 2013 23:38:36 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rDiff/src/get_read_counts.m Thu Feb 14 23:38:36 2013 -0500 @@ -0,0 +1,158 @@ +function []=get_read_counts(CFG,genes) + +if CFG.use_rproc + JB_NR=1; + JOB_INFO = rproc_empty(); +end + +%%% Get the read counts +if CFG.estimate_gene_expression==1 + for RUN=1:size(CFG.BAM_FILES,2) + % configuration + CFG.curr_bamfile = CFG.BAM_FILES{RUN}; + if not(CFG.use_rproc) + fprintf('Getting gene expression for: %s\n', CFG.curr_bamfile); + end + tic + %define the splits of the genes for the jobs + idx=[(1:size(genes,2))',ceil((1:size(genes,2))*CFG.rproc_num_jobs/size(genes,2))']; + % submit jobs to cluster + for i = 1:CFG.rproc_num_jobs + PAR.genes = genes(idx(idx(:,2)==i,1)); + CFG.rproc_memreq = 5000; + CFG.rproc_par.mem_req_resubmit = [10000 20000 32000]; + CFG.rproc_par.identifier = sprintf('Exp.%i-',i); + CFG.outfile_prefix=[CFG.out_base_temp CFG.NAMES{RUN} '_' num2str(i) '_of_' num2str(CFG.rproc_num_jobs) '.mat']; + PAR.CFG=CFG; + if CFG.use_rproc + fprintf(1, 'Submitting job %i to cluster\n',i); + JOB_INFO(JB_NR) = rproc('get_reads_caller', PAR,CFG.rproc_memreq, CFG.rproc_par, CFG.rproc_time); + JB_NR=JB_NR+1; + else + get_reads_caller(PAR); + end + end + toc + end + if CFG.use_rproc + [JOB_INFO num_crashed] = rproc_wait(JOB_INFO, 60, 1, -1); + end +end + +%%% Generate the output files +%load the count files +%load the count files +READS_PER_GENE=zeros(size(genes,2),size(CFG.BAM_FILES,2)); +GENE_EXPR=zeros(size(genes,2),size(CFG.BAM_FILES,2)); + +Counts_rDiff_parametric=cell(size(genes,2),size(CFG.BAM_FILES,2)); +Counts_rDiff_nonparametric=cell(size(genes,2),size(CFG.BAM_FILES,2)); + + +%Field containing the errors +ERRORS_NR=[]; +idx=[(1:size(genes,2))',ceil((1:size(genes,2))*CFG.rproc_num_jobs/size(genes,2))']; +% Iterate over the result files to load the data from the count files +for RUN=1:size(CFG.BAM_FILES,2) + for j = 1:CFG.rproc_num_jobs + IN_FILENAME=[CFG.out_base_temp CFG.NAMES{RUN} '_' num2str(j) '_of_' num2str(CFG.rproc_num_jobs) '.mat']; + IDX=idx(idx(:,2)==j,1); + try + load(IN_FILENAME) + for k=1:length(IDX) + %Check wether COUNTS is empty + if isempty(COUNTS{k}) + continue + end + %Get the number of reads mapping to a gene + if not(isempty(COUNTS{k}{2})) + READS_PER_GENE(IDX(k),RUN)=COUNTS{k}{2}; + end + + %get the number of nonalternative reads if + %possible. Otherwise use the number of reads + %mapping to a gene as gene expression + if not(isempty(COUNTS{k}{3})) + GENE_EXPE(IDX(k),RUN)=COUNTS{k}{3}; + else + GENE_EXPE(IDX(k),RUN)=READS_PER_GENE(IDX(k),RUN); + end + + %get the Counts for rDiff.parametric + if not(isempty(COUNTS{k}{6})) + Counts_rDiff_parametric{IDX(k),RUN}=COUNTS{k}{6}; + end + + %get the counts for rDiff.nonparametric + if not(isempty(COUNTS{k}{4})) + Counts_rDiff_nonparametric{IDX(k),RUN}=COUNTS{k}{4}; + end + end + catch + warning(['There was a problem loading: ' IN_FILENAME ]) + % If something went wrong + for k=1:length(IDX) + READS_PER_GENE(IDX(k),RUN)=0; + GENE_EXPR(IDX(k),RUN)=0; + Counts_rDiff_parametric{IDX(k),RUN}={}; + Counts_rDiff_nonparametric{IDX(k),RUN}={}; + end + ERRORS_NR=[ERRORS_NR; [RUN,i]]; + end + end +end +if not(isempty(ERRORS_NR)) + warning('There have been problems loading some of the raw count files'); +end + +%If less than 10 reads use abulute number of reads +GENE_EXPR(GENE_EXPR<10)=READS_PER_GENE(GENE_EXPR<10); + +%Generate gene expression tables + +%Open file handler for the gene expression table +EXPR_TAB_FILENAME=[CFG.out_base 'Gene_expression.tab']; +fid=fopen(EXPR_TAB_FILENAME,'w'); + +%print header +fprintf(fid,'gene'); +for i=1:length(CFG.NAMES) + fprintf(fid,'\t%s',CFG.NAMES{i}); +end +fprintf(fid,'\n'); + +for j=1:size(genes,2) + fprintf(fid,'%s',genes(j).name); + for i=1:length(CFG.NAMES) + fprintf(fid,'\t%i',GENE_EXPR(j,i)); + end + fprintf(fid,'\n'); +end + +fclose(fid) + +%Determine interpreter +if size(ver('Octave'),1) + INTERPR = 1; +else + INTERPR = 0; +end + + +%Save alternative region count file for rDiff.parametric +OUT_FILENAME=[CFG.out_base 'Alternative_region_counts.mat']; +if INTERPR + save('-mat7-binary',OUT_FILENAME,'Counts_rDiff_parametric') +else + save(OUT_FILENAME,'Counts_rDiff_parametric','-v7.3') +end + +%Save alternative region count file for rDiff.nonparametric +OUT_FILENAME=[CFG.out_base 'Nonparametric_region_counts.mat']; +if INTERPR + save('-mat7-binary',OUT_FILENAME,'Counts_rDiff_nonparametric') +else + save(OUT_FILENAME,'Counts_rDiff_nonparametric','-v7.3') +end + +return