view rDiff/src/tools/detect_overlapping_regions.m @ 2:233c30f91d66

updated python based GFF parsing module which will handle GTF/GFF/GFF3 file types
author vipints <vipin@cbio.mskcc.org>
date Tue, 08 Oct 2013 07:15:44 -0400
parents 0f80a5141704
children
line wrap: on
line source

function [new_genes]=detect_overlapping_regions(genes);
% this function determines regions in a gene which overlapp with
% other genes. Those regons are then saved in the field "non_unique_regions"

CHROMOSOMES={};
COUNTER=1;
for i=1:size(genes,2)
  CHROMOSOMES{COUNTER}=genes(i).chr;
  COUNTER=COUNTER+1;
end
CHROMOSOMES=unique(CHROMOSOMES);


INFO=zeros(size(genes,2),4);
for i=1:size(genes,2)
  CHR_VAL=0;
  for chr= 1:length(CHROMOSOMES)	
    if  strcmp(genes(i).chr,CHROMOSOMES(chr))
      CHR_VAL=chr;
    end
  end	
  INFO(i,:)=[i,genes(i).start,genes(i).stop, CHR_VAL];
end

COUNTER=1;
new_genes=genes;
for chr= 1:length(CHROMOSOMES)	
  GENES_ON_CHR=INFO(INFO(:,4)==chr,:);
  [TEMP,POS]=sort(GENES_ON_CHR(:,2));
  GENES_ON_CHR=GENES_ON_CHR(POS,:);
  STARTS=GENES_ON_CHR(:,2);
  STOPS=GENES_ON_CHR(:,3);
  for i=1:(size(GENES_ON_CHR,1))
    MIN_START=find(STOPS>=STARTS(i),1,'first');
    MAX_STOP=find(STARTS<=STOPS(i),1,'last');
    if MIN_START==i
      MIN_START=[];
    end
    if MAX_STOP==i
      MAX_STOP=[];
    end
    EXONS=[];
    if not (isempty(MIN_START))
      for CURR=MIN_START:(i-1)
	if(not(isempty(genes(GENES_ON_CHR(CURR,1)).transcripts)))
	  for tra=1:size(genes(GENES_ON_CHR(CURR,1)).transcripts,2)
	    if(not(isempty(genes(GENES_ON_CHR(CURR,1)).exons)))
	      EXONS=[EXONS;genes(GENES_ON_CHR(CURR,1)).exons{tra}];
	    else
	      EXONS=[EXONS;genes(GENES_ON_CHR(CURR,1)).start,genes(GENES_ON_CHR(CURR,1)).stop];
	    end	  
	  end
	else
	  EXONS=[EXONS;genes(GENES_ON_CHR(CURR,1)).start,genes(GENES_ON_CHR(CURR,1)).stop];
	end	  
      end
    end
    if not (isempty(MAX_STOP))
      for CURR=(i+1):MAX_STOP
	if(not(isempty(genes(GENES_ON_CHR(CURR,1)).transcripts)))
	  for tra=1:size(genes(GENES_ON_CHR(CURR,1)).transcripts,2)
	    if(not(isempty(genes(GENES_ON_CHR(CURR,1)).exons)))
	      EXONS=[EXONS;genes(GENES_ON_CHR(CURR,1)).exons{tra}];
	    else
	      EXONS=[EXONS;genes(GENES_ON_CHR(CURR,1)).start,genes(GENES_ON_CHR(CURR,1)).stop];	    
	    end
	  end
	else
	  EXONS=[EXONS;genes(GENES_ON_CHR(CURR,1)).start,genes(GENES_ON_CHR(CURR,1)).stop];
	end
	
      end
    end
    if  not (isempty([MAX_STOP,MIN_START]))
      EXONS=EXONS(EXONS(:,2)>=STARTS(i),:);
      EXONS=EXONS(EXONS(:,1)<=STOPS(i),:);
      new_genes(GENES_ON_CHR(i,1)).non_unique_regions=EXONS;    
    else
        new_genes(GENES_ON_CHR(i,1)).non_unique_regions=[];    
    end
  end
  COUNTER=COUNTER+1;
end