Mercurial > repos > vipints > deseq_hts
comparison deseq-hts_1.0/src/get_read_counts.m @ 0:94a108763d9e draft
deseq-hts version 1.0 wraps the DESeq 1.6.0
author | vipints |
---|---|
date | Wed, 09 May 2012 20:43:47 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:94a108763d9e |
---|---|
1 function get_read_counts(anno_dir, outfile, varargin) | |
2 % | |
3 % -- input -- | |
4 % anno_dir: directory of genes | |
5 % outfile: output file | |
6 % varargin: list of BAM files (at least two) | |
7 | |
8 % DESeq paths | |
9 global DESEQ_PATH DESEQ_SRC_PATH | |
10 | |
11 % interpreter paths | |
12 global INTERPRETER MATLAB_BIN_PATH OCTAVE_BIN_PATH | |
13 | |
14 % SAMTools path | |
15 global SAMTOOLS_DIR | |
16 | |
17 %%%% paths | |
18 addpath(sprintf('%s/tools', DESEQ_PATH)); | |
19 addpath(sprintf('%s/mex', DESEQ_PATH)); | |
20 addpath(sprintf('%s', DESEQ_SRC_PATH)); | |
21 | |
22 deseq_config; | |
23 | |
24 %%% read list of replicate groups from variable length argument list | |
25 rg_list = cell(1,size(varargin, 2)); | |
26 file_list = cell(); | |
27 file_cond_ids = []; | |
28 file_rep_ids = []; | |
29 for idx = 1:size(varargin, 2) | |
30 rg_list(idx) = varargin(idx); | |
31 end | |
32 idx = strmatch('', rg_list, 'exact'); | |
33 rg_list(idx) = []; | |
34 for idx = 1:length(rg_list), | |
35 items = separate(rg_list{idx}, ':'); | |
36 for idx2 = 1:length(items) | |
37 if isempty(deblank(items{idx2})), | |
38 continue; | |
39 end; | |
40 file_list{end + 1} = items{idx2}; | |
41 file_cond_ids(end + 1) = idx; | |
42 file_rep_ids(end + 1) = idx2; | |
43 end; | |
44 end; | |
45 clear idx idx2; | |
46 | |
47 %%%%% adapt to number of input arguments | |
48 file_num = length(file_list); | |
49 RESULTS = cell(1, file_num); | |
50 | |
51 %%%% get annotation file | |
52 load(sprintf('%s', anno_dir)); | |
53 | |
54 %%%%% mask overlapping gene regions -> later not counted | |
55 [genes] = mask_dubl(genes,0); | |
56 | |
57 %%%% remove genes with no annotated exons or where no | |
58 idx = find(arrayfun(@(x)(~isempty(x.exons)*~isempty(x.start)*~isempty(x.stop)), genes)); | |
59 fprintf('removed %i of %i genes, which had either no exons annotated or lacked a start or stop position\n', size(genes, 2) - size(idx, 2), size(genes, 2)) | |
60 genes = genes(idx); | |
61 clear idx; | |
62 | |
63 %%%% check if genes have field chr_num | |
64 if ~isfield(genes, 'chr_num') | |
65 chrms = unique({genes(:).chr}); | |
66 for i = 1:length(genes) | |
67 genes(i).chr_num = strmatch(genes(i).chr, chrms, 'exact'); | |
68 end; | |
69 end; | |
70 | |
71 %%%% iterate over all given bam files | |
72 for f_idx = 1:file_num | |
73 expr1_bam = fullfile('', file_list{f_idx}); | |
74 STAT = cell(size(genes, 2),1); | |
75 for i=1:size(genes,2) | |
76 RESULT = cell(1,7); | |
77 gene = genes(i); | |
78 RESULT{4} = f_idx; | |
79 RESULT{1} = gene.name; | |
80 if isempty(gene.exons) | |
81 RESULT{2} = inf; | |
82 RESULT{3} = inf; | |
83 RESULT{5} = [inf,inf]; | |
84 STAT{i} = RESULT; | |
85 continue; | |
86 elseif or(isempty(gene.start),isempty(gene.stop)) | |
87 RESULT{2} = inf; | |
88 RESULT{3} = inf; | |
89 RESULT{5} = [inf,inf]; | |
90 STAT{i} = RESULT; | |
91 continue; | |
92 end | |
93 if ~isempty(gene.chr_num), | |
94 [mask1, read_intron_list] = get_reads(expr1_bam, gene.chr, gene.start, gene.stop, '0'); | |
95 clear read_intron_list; | |
96 else | |
97 mask1 = []; | |
98 end; | |
99 | |
100 if isempty(mask1) | |
101 reads1 = zeros(0,gene.stop-gene.start+1); | |
102 else | |
103 reads1 = sparse(mask1(1,:)',mask1(2,:)',ones(size(mask1,2),1),max(mask1(1,:)),gene.stop-gene.start+1); | |
104 end | |
105 if ~isempty(reads1); | |
106 [reads1,FLAG] = remove_reads_from_other_genes(reads1,gene); | |
107 end | |
108 L = size(reads1); | |
109 RESULT{2}=[size(reads1,1)]; % number of all reads falling in that gene | |
110 EXON_IDX=zeros(1,gene.stop-gene.start+1); | |
111 for t=1:size(gene.transcripts,2) | |
112 for e=1:size(gene.exons{t},1) | |
113 EXON_IDX((gene.exons{t}(e,1)-gene.start+1):(gene.exons{t}(e,2)-gene.start+1))=1; | |
114 end | |
115 end | |
116 reads1 = reads1(sum(reads1(:,find(EXON_IDX)),2)>0,:); | |
117 L1 = sum(EXON_IDX); | |
118 RESULT{3}=[size(reads1,1)]; % number of reads overlapping to exons | |
119 RESULT{5}=[L, L1]; % size of reads1, number of exonic positions | |
120 % old and weighted poisson new ,weighted regions reads and | |
121 % unexplained reads | |
122 clear reads1; | |
123 STAT{i} = RESULT; | |
124 end; | |
125 RESULTS{f_idx} = STAT; | |
126 end; | |
127 | |
128 S=size(genes,2); | |
129 READCOUNTS_ALL=zeros(S, file_num); | |
130 READCOUNTS_EXON=zeros(S, file_num); | |
131 LENGTH_ALL=zeros(S,file_num); | |
132 LEN_EXON=zeros(S, file_num); | |
133 | |
134 for j=1:file_num, | |
135 for i=1:S | |
136 T=RESULTS{j}{i}; | |
137 if isempty(T) | |
138 continue | |
139 else | |
140 READCOUNTS_ALL(i,j)=T{2}; | |
141 READCOUNTS_EXON(i,j)=T{3}; | |
142 LENGTH_ALL(i,j)=T{5}(1); | |
143 LEN_EXON(i,j)=T{5}(2); | |
144 end | |
145 end | |
146 end | |
147 | |
148 %%%%% write results for all bam files | |
149 fid_conditions = fopen(sprintf('%s_CONDITIONS.tab', outfile), 'w'); | |
150 fid_counts = fopen(sprintf('%s_COUNTS.tab', outfile) ,'w'); | |
151 fprintf(fid_counts,'gene'); | |
152 fprintf(fid_conditions, 'file\tcondition\treplicate\n'); | |
153 for j = 1:length(file_list) | |
154 fname = file_list{j} ; | |
155 fname = separate(fname, '/'); | |
156 fname = fname{end}; | |
157 fname = strrep(fname, '.bam', '') ; | |
158 fprintf(fid_counts,'\t%s', fname); | |
159 fprintf(fid_conditions, '%s\t%i\t%i\n', fname, file_cond_ids(j), file_rep_ids(j)); | |
160 end; | |
161 fprintf(fid_counts,'\n') ; | |
162 | |
163 for i = 1:size(genes,2) | |
164 fprintf(fid_counts,'%s',genes(i).name); | |
165 for j = 1:length(file_list), | |
166 fprintf(fid_counts,'\t%i', READCOUNTS_EXON(i,j)); | |
167 end | |
168 fprintf(fid_counts,'\n'); | |
169 end | |
170 fclose(fid_counts); | |
171 fclose(fid_conditions); | |
172 exit; |