Mercurial > repos > vipints > deseq_hts
diff deseq-hts_1.0/src/get_read_counts.m @ 0:94a108763d9e draft
deseq-hts version 1.0 wraps the DESeq 1.6.0
author | vipints |
---|---|
date | Wed, 09 May 2012 20:43:47 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deseq-hts_1.0/src/get_read_counts.m Wed May 09 20:43:47 2012 -0400 @@ -0,0 +1,172 @@ +function get_read_counts(anno_dir, outfile, varargin) +% +% -- input -- +% anno_dir: directory of genes +% outfile: output file +% varargin: list of BAM files (at least two) + +% DESeq paths +global DESEQ_PATH DESEQ_SRC_PATH + +% interpreter paths +global INTERPRETER MATLAB_BIN_PATH OCTAVE_BIN_PATH + +% SAMTools path +global SAMTOOLS_DIR + +%%%% paths +addpath(sprintf('%s/tools', DESEQ_PATH)); +addpath(sprintf('%s/mex', DESEQ_PATH)); +addpath(sprintf('%s', DESEQ_SRC_PATH)); + +deseq_config; + +%%% read list of replicate groups from variable length argument list +rg_list = cell(1,size(varargin, 2)); +file_list = cell(); +file_cond_ids = []; +file_rep_ids = []; +for idx = 1:size(varargin, 2) + rg_list(idx) = varargin(idx); +end +idx = strmatch('', rg_list, 'exact'); +rg_list(idx) = []; +for idx = 1:length(rg_list), + items = separate(rg_list{idx}, ':'); + for idx2 = 1:length(items) + if isempty(deblank(items{idx2})), + continue; + end; + file_list{end + 1} = items{idx2}; + file_cond_ids(end + 1) = idx; + file_rep_ids(end + 1) = idx2; + end; +end; +clear idx idx2; + +%%%%% adapt to number of input arguments +file_num = length(file_list); +RESULTS = cell(1, file_num); + +%%%% get annotation file +load(sprintf('%s', anno_dir)); + +%%%%% mask overlapping gene regions -> later not counted +[genes] = mask_dubl(genes,0); + +%%%% remove genes with no annotated exons or where no +idx = find(arrayfun(@(x)(~isempty(x.exons)*~isempty(x.start)*~isempty(x.stop)), genes)); +fprintf('removed %i of %i genes, which had either no exons annotated or lacked a start or stop position\n', size(genes, 2) - size(idx, 2), size(genes, 2)) +genes = genes(idx); +clear idx; + +%%%% check if genes have field chr_num +if ~isfield(genes, 'chr_num') + chrms = unique({genes(:).chr}); + for i = 1:length(genes) + genes(i).chr_num = strmatch(genes(i).chr, chrms, 'exact'); + end; +end; + +%%%% iterate over all given bam files +for f_idx = 1:file_num + expr1_bam = fullfile('', file_list{f_idx}); + STAT = cell(size(genes, 2),1); + for i=1:size(genes,2) + RESULT = cell(1,7); + gene = genes(i); + RESULT{4} = f_idx; + RESULT{1} = gene.name; + if isempty(gene.exons) + RESULT{2} = inf; + RESULT{3} = inf; + RESULT{5} = [inf,inf]; + STAT{i} = RESULT; + continue; + elseif or(isempty(gene.start),isempty(gene.stop)) + RESULT{2} = inf; + RESULT{3} = inf; + RESULT{5} = [inf,inf]; + STAT{i} = RESULT; + continue; + end + if ~isempty(gene.chr_num), + [mask1, read_intron_list] = get_reads(expr1_bam, gene.chr, gene.start, gene.stop, '0'); + clear read_intron_list; + else + mask1 = []; + end; + + if isempty(mask1) + reads1 = zeros(0,gene.stop-gene.start+1); + else + reads1 = sparse(mask1(1,:)',mask1(2,:)',ones(size(mask1,2),1),max(mask1(1,:)),gene.stop-gene.start+1); + end + if ~isempty(reads1); + [reads1,FLAG] = remove_reads_from_other_genes(reads1,gene); + end + L = size(reads1); + RESULT{2}=[size(reads1,1)]; % number of all reads falling in that gene + EXON_IDX=zeros(1,gene.stop-gene.start+1); + for t=1:size(gene.transcripts,2) + for e=1:size(gene.exons{t},1) + EXON_IDX((gene.exons{t}(e,1)-gene.start+1):(gene.exons{t}(e,2)-gene.start+1))=1; + end + end + reads1 = reads1(sum(reads1(:,find(EXON_IDX)),2)>0,:); + L1 = sum(EXON_IDX); + RESULT{3}=[size(reads1,1)]; % number of reads overlapping to exons + RESULT{5}=[L, L1]; % size of reads1, number of exonic positions + % old and weighted poisson new ,weighted regions reads and + % unexplained reads + clear reads1; + STAT{i} = RESULT; + end; + RESULTS{f_idx} = STAT; +end; + +S=size(genes,2); +READCOUNTS_ALL=zeros(S, file_num); +READCOUNTS_EXON=zeros(S, file_num); +LENGTH_ALL=zeros(S,file_num); +LEN_EXON=zeros(S, file_num); + +for j=1:file_num, + for i=1:S + T=RESULTS{j}{i}; + if isempty(T) + continue + else + READCOUNTS_ALL(i,j)=T{2}; + READCOUNTS_EXON(i,j)=T{3}; + LENGTH_ALL(i,j)=T{5}(1); + LEN_EXON(i,j)=T{5}(2); + end + end +end + +%%%%% write results for all bam files +fid_conditions = fopen(sprintf('%s_CONDITIONS.tab', outfile), 'w'); +fid_counts = fopen(sprintf('%s_COUNTS.tab', outfile) ,'w'); +fprintf(fid_counts,'gene'); +fprintf(fid_conditions, 'file\tcondition\treplicate\n'); +for j = 1:length(file_list) + fname = file_list{j} ; + fname = separate(fname, '/'); + fname = fname{end}; + fname = strrep(fname, '.bam', '') ; + fprintf(fid_counts,'\t%s', fname); + fprintf(fid_conditions, '%s\t%i\t%i\n', fname, file_cond_ids(j), file_rep_ids(j)); +end; +fprintf(fid_counts,'\n') ; + +for i = 1:size(genes,2) + fprintf(fid_counts,'%s',genes(i).name); + for j = 1:length(file_list), + fprintf(fid_counts,'\t%i', READCOUNTS_EXON(i,j)); + end + fprintf(fid_counts,'\n'); +end +fclose(fid_counts); +fclose(fid_conditions); +exit;