deseq_hts: deseq-hts_1.0/src/get_read

comparison deseq-hts_1.0/src/get_read_counts.m @ 0:94a108763d9e draft

deseq-hts version 1.0 wraps the DESeq 1.6.0

author	vipints
date	Wed, 09 May 2012 20:43:47 -0400
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:94a108763d9e
+function get_read_counts(anno_dir, outfile, varargin)
+%
+% -- input --
+% anno_dir: directory of genes
+% outfile: output file
+% varargin: list of BAM files (at least two)
+% DESeq paths
+global DESEQ_PATH DESEQ_SRC_PATH
+% interpreter paths
+global INTERPRETER MATLAB_BIN_PATH OCTAVE_BIN_PATH
+% SAMTools path
+global SAMTOOLS_DIR
+%%%% paths
+addpath(sprintf('%s/tools', DESEQ_PATH));
+addpath(sprintf('%s/mex', DESEQ_PATH));
+addpath(sprintf('%s', DESEQ_SRC_PATH));
+deseq_config;
+%%% read list of replicate groups from variable length argument list
+rg_list = cell(1,size(varargin, 2));
+file_list = cell();
+file_cond_ids = [];
+file_rep_ids = [];
+for idx = 1:size(varargin, 2)
+rg_list(idx) = varargin(idx);
+end
+idx = strmatch('', rg_list, 'exact');
+rg_list(idx) = [];
+for idx = 1:length(rg_list),
+items = separate(rg_list{idx}, ':');
+for idx2 = 1:length(items)
+		if isempty(deblank(items{idx2})),
+			continue;
+	    end;
+file_list{end + 1} = items{idx2};
+file_cond_ids(end + 1) = idx;
+file_rep_ids(end + 1) = idx2;
+end;
+end;
+clear idx idx2;
+%%%%% adapt to number of input arguments
+file_num = length(file_list);
+RESULTS = cell(1, file_num);
+%%%% get annotation file
+load(sprintf('%s', anno_dir));
+%%%%% mask overlapping gene regions -> later not counted
+[genes] = mask_dubl(genes,0);
+%%%% remove genes with no annotated exons or where no
+idx = find(arrayfun(@(x)(~isempty(x.exons)*~isempty(x.start)*~isempty(x.stop)), genes));
+fprintf('removed %i of %i genes, which had either no exons annotated or lacked a start or stop position\n', size(genes, 2) - size(idx, 2), size(genes, 2))
+genes = genes(idx);
+clear idx;
+%%%% check if genes have field chr_num
+if ~isfield(genes, 'chr_num')
+	chrms = unique({genes(:).chr});
+	for i = 1:length(genes)
+		genes(i).chr_num = strmatch(genes(i).chr, chrms, 'exact');
+	end;
+end;
+%%%% iterate over all given bam files
+for f_idx = 1:file_num
+expr1_bam = fullfile('', file_list{f_idx});
+STAT = cell(size(genes, 2),1);
+for i=1:size(genes,2)
+RESULT = cell(1,7);
+gene = genes(i);
+RESULT{4} = f_idx;
+RESULT{1} = gene.name;
+if isempty(gene.exons)
+RESULT{2} = inf;
+RESULT{3} = inf;
+RESULT{5} = [inf,inf];
+STAT{i} = RESULT;
+continue;
+elseif or(isempty(gene.start),isempty(gene.stop))
+		    RESULT{2} = inf;
+		    RESULT{3} = inf;
+		    RESULT{5} = [inf,inf];
+		    STAT{i} = RESULT;
+		    continue;
+end
+if ~isempty(gene.chr_num),
+		    [mask1, read_intron_list] = get_reads(expr1_bam, gene.chr, gene.start, gene.stop, '0');
+		    clear read_intron_list;
+	    else
+		    mask1 = [];
+end;
+if isempty(mask1)
+reads1 = zeros(0,gene.stop-gene.start+1);
+else
+reads1 = sparse(mask1(1,:)',mask1(2,:)',ones(size(mask1,2),1),max(mask1(1,:)),gene.stop-gene.start+1);
+end
+	    if ~isempty(reads1);
+		    [reads1,FLAG] = remove_reads_from_other_genes(reads1,gene);
+	    end
+L = size(reads1);
+RESULT{2}=[size(reads1,1)]; % number of all reads falling in that gene
+EXON_IDX=zeros(1,gene.stop-gene.start+1);
+for t=1:size(gene.transcripts,2)
+for e=1:size(gene.exons{t},1)
+EXON_IDX((gene.exons{t}(e,1)-gene.start+1):(gene.exons{t}(e,2)-gene.start+1))=1;
+end
+end
+reads1 = reads1(sum(reads1(:,find(EXON_IDX)),2)>0,:);
+L1 = sum(EXON_IDX);
+RESULT{3}=[size(reads1,1)]; % number of reads overlapping to exons
+RESULT{5}=[L, L1]; % size of reads1, number of exonic positions
+% old and weighted poisson new ,weighted regions reads and
+% unexplained reads
+clear reads1;
+STAT{i} = RESULT;
+end;
+RESULTS{f_idx} = STAT;
+end;
+S=size(genes,2);
+READCOUNTS_ALL=zeros(S, file_num);
+READCOUNTS_EXON=zeros(S, file_num);
+LENGTH_ALL=zeros(S,file_num);
+LEN_EXON=zeros(S, file_num);
+for j=1:file_num,
+for i=1:S
+T=RESULTS{j}{i};
+if isempty(T)
+continue
+else
+READCOUNTS_ALL(i,j)=T{2};
+READCOUNTS_EXON(i,j)=T{3};
+LENGTH_ALL(i,j)=T{5}(1);
+LEN_EXON(i,j)=T{5}(2);
+end
+end
+end
+%%%%% write results for all bam files
+fid_conditions = fopen(sprintf('%s_CONDITIONS.tab', outfile), 'w');
+fid_counts = fopen(sprintf('%s_COUNTS.tab', outfile) ,'w');
+fprintf(fid_counts,'gene');
+fprintf(fid_conditions, 'file\tcondition\treplicate\n');
+for j = 1:length(file_list)
+fname = file_list{j} ;
+fname = separate(fname, '/');
+fname = fname{end};
+fname = strrep(fname, '.bam', '') ;
+fprintf(fid_counts,'\t%s', fname);
+fprintf(fid_conditions, '%s\t%i\t%i\n', fname, file_cond_ids(j), file_rep_ids(j));
+end;
+fprintf(fid_counts,'\n') ;
+for i = 1:size(genes,2)
+fprintf(fid_counts,'%s',genes(i).name);
+for j = 1:length(file_list),
+fprintf(fid_counts,'\t%i', READCOUNTS_EXON(i,j));
+end
+fprintf(fid_counts,'\n');
+end
+fclose(fid_counts);
+fclose(fid_conditions);
+exit;

Mercurial > repos > vipints > deseq_hts

comparison deseq-hts_1.0/src/get_read_counts.m @ 0:94a108763d9e draft