Mercurial > repos > vipints > rdiff
view rDiff/src/get_read_counts.m @ 2:233c30f91d66
updated python based GFF parsing module which will handle GTF/GFF/GFF3 file types
author | vipints <vipin@cbio.mskcc.org> |
---|---|
date | Tue, 08 Oct 2013 07:15:44 -0400 |
parents | 0f80a5141704 |
children |
line wrap: on
line source
function []=get_read_counts(CFG,genes) if CFG.use_rproc JB_NR=1; JOB_INFO = rproc_empty(); end %%% Get the read counts if CFG.estimate_gene_expression==1 for RUN=1:size(CFG.BAM_FILES,2) % configuration CFG.curr_bamfile = CFG.BAM_FILES{RUN}; if not(CFG.use_rproc) fprintf('Getting gene expression for: %s\n', CFG.curr_bamfile); end tic %define the splits of the genes for the jobs idx=[(1:size(genes,2))',ceil((1:size(genes,2))*CFG.rproc_num_jobs/size(genes,2))']; % submit jobs to cluster for i = 1:CFG.rproc_num_jobs PAR.genes = genes(idx(idx(:,2)==i,1)); CFG.rproc_memreq = 5000; CFG.rproc_par.mem_req_resubmit = [10000 20000 32000]; CFG.rproc_par.identifier = sprintf('Exp.%i-',i); CFG.outfile_prefix=[CFG.out_base_temp CFG.NAMES{RUN} '_' num2str(i) '_of_' num2str(CFG.rproc_num_jobs) '.mat']; PAR.CFG=CFG; if CFG.use_rproc fprintf(1, 'Submitting job %i to cluster\n',i); JOB_INFO(JB_NR) = rproc('get_reads_caller', PAR,CFG.rproc_memreq, CFG.rproc_par, CFG.rproc_time); JB_NR=JB_NR+1; else get_reads_caller(PAR); end end toc end if CFG.use_rproc [JOB_INFO num_crashed] = rproc_wait(JOB_INFO, 60, 1, -1); end end %%% Generate the output files %load the count files %load the count files READS_PER_GENE=zeros(size(genes,2),size(CFG.BAM_FILES,2)); GENE_EXPR=zeros(size(genes,2),size(CFG.BAM_FILES,2)); Counts_rDiff_parametric=cell(size(genes,2),size(CFG.BAM_FILES,2)); Counts_rDiff_nonparametric=cell(size(genes,2),size(CFG.BAM_FILES,2)); %Field containing the errors ERRORS_NR=[]; idx=[(1:size(genes,2))',ceil((1:size(genes,2))*CFG.rproc_num_jobs/size(genes,2))']; % Iterate over the result files to load the data from the count files for RUN=1:size(CFG.BAM_FILES,2) for j = 1:CFG.rproc_num_jobs IN_FILENAME=[CFG.out_base_temp CFG.NAMES{RUN} '_' num2str(j) '_of_' num2str(CFG.rproc_num_jobs) '.mat']; IDX=idx(idx(:,2)==j,1); try load(IN_FILENAME) for k=1:length(IDX) %Check wether COUNTS is empty if isempty(COUNTS{k}) continue end %Get the number of reads mapping to a gene if not(isempty(COUNTS{k}{2})) READS_PER_GENE(IDX(k),RUN)=COUNTS{k}{2}; end %get the number of nonalternative reads if %possible. Otherwise use the number of reads %mapping to a gene as gene expression if not(isempty(COUNTS{k}{3})) GENE_EXPE(IDX(k),RUN)=COUNTS{k}{3}; else GENE_EXPE(IDX(k),RUN)=READS_PER_GENE(IDX(k),RUN); end %get the Counts for rDiff.parametric if not(isempty(COUNTS{k}{6})) Counts_rDiff_parametric{IDX(k),RUN}=COUNTS{k}{6}; end %get the counts for rDiff.nonparametric if not(isempty(COUNTS{k}{4})) Counts_rDiff_nonparametric{IDX(k),RUN}=COUNTS{k}{4}; end end catch warning(['There was a problem loading: ' IN_FILENAME ]) % If something went wrong for k=1:length(IDX) READS_PER_GENE(IDX(k),RUN)=0; GENE_EXPR(IDX(k),RUN)=0; Counts_rDiff_parametric{IDX(k),RUN}={}; Counts_rDiff_nonparametric{IDX(k),RUN}={}; end ERRORS_NR=[ERRORS_NR; [RUN,i]]; end end end if not(isempty(ERRORS_NR)) warning('There have been problems loading some of the raw count files'); end %If less than 10 reads use abulute number of reads GENE_EXPR(GENE_EXPR<10)=READS_PER_GENE(GENE_EXPR<10); %Generate gene expression tables %Open file handler for the gene expression table EXPR_TAB_FILENAME=[CFG.out_base 'Gene_expression.tab']; fid=fopen(EXPR_TAB_FILENAME,'w'); %print header fprintf(fid,'gene'); for i=1:length(CFG.NAMES) fprintf(fid,'\t%s',CFG.NAMES{i}); end fprintf(fid,'\n'); for j=1:size(genes,2) fprintf(fid,'%s',genes(j).name); for i=1:length(CFG.NAMES) fprintf(fid,'\t%i',GENE_EXPR(j,i)); end fprintf(fid,'\n'); end fclose(fid) %Determine interpreter if size(ver('Octave'),1) INTERPR = 1; else INTERPR = 0; end %Save alternative region count file for rDiff.parametric OUT_FILENAME=[CFG.out_base 'Alternative_region_counts.mat']; if INTERPR save('-mat7-binary',OUT_FILENAME,'Counts_rDiff_parametric') else save(OUT_FILENAME,'Counts_rDiff_parametric','-v7.3') end %Save alternative region count file for rDiff.nonparametric OUT_FILENAME=[CFG.out_base 'Nonparametric_region_counts.mat']; if INTERPR save('-mat7-binary',OUT_FILENAME,'Counts_rDiff_nonparametric') else save(OUT_FILENAME,'Counts_rDiff_nonparametric','-v7.3') end return