Mercurial > repos > youngkim > ezbamqc
view ezBAMQC/src/ezBAMQC/GeneFeatures.cpp @ 19:caed0168f704
wrapper
author | youngkim |
---|---|
date | Wed, 30 Mar 2016 13:46:41 -0400 |
parents | dfa3745e5fd8 |
children |
line wrap: on
line source
// // GeneFeatures.cpp // BAMQC-0.5 // // Created by Ying Jin on 9/15/15. // Copyright (c) 2015 Ying Jin. All rights reserved. // #include "GeneFeatures.h" #include <cmath> #include <fstream> #include <sstream> //#include <regex> #include "stdlib.h" #include <algorithm> #include <iostream> #include <math.h> #include <iterator> //#include <boost/tokenizer.hpp> //template <class T1, class T2, class Pred = std::less<T2> > //struct sort_pair_second { //(const std::pair<T1,T2>&left, const std::pair<T1,T2>&right) { // Pred p; // return p(left.second, right.second); // } //}; //bool sort_pair_second(std::pair<int,int> first, std::pair<int,int> second) //{ // return first.second > second.second ; //} bool itv_comp(Interval first, Interval second){ return first.start < second.start ; } int pivot(std::vector<Interval> &intervals, int first, int last) { int p = first; int pivotElement = intervals[first].start; for(int i = first+1 ; i <= last ; i++) { /* If you want to sort the list in the other order, change "<=" to ">" */ if(intervals[i].start <= pivotElement) { std::swap(intervals[i],intervals[p]); p++; } } return p; } void quick_sort(std::vector<Interval> &intervals, int first, int last){ int pivotElement; if(first < last) { pivotElement = pivot(intervals, first, last); quick_sort(intervals, first, pivotElement-1); quick_sort(intervals, pivotElement+1, last); } } bool reverse_ord_func (int i,int j) { return (j<i); } Gene::Gene(std::string gid, std::string ss){ id = gid; strand = ss ; min_start = std::numeric_limits<int>::max(); max_stop = std::numeric_limits<int>::min(); gene_actual_len = 0; stop_codon_st = -1; stop_codon_end = -1; } Gene::~Gene(){} void Gene::add_cds(int st, int end){ std::pair<int,int> cds_interval (st,end); cds.push_back(cds_interval); } void Gene::add_exons(int st, int end){ if (this->min_start > st) { this->min_start = st; } if (this->max_stop<end) { this->max_stop = end; } std::pair<int,int> exon_interval (st,end); gene_actual_len += (end - st+1); exons.push_back(exon_interval); } void Gene::set_stop_codon(int st,int end) { stop_codon_st = st; stop_codon_end = end; } void Gene::get_others(){ std::vector<std::pair<int,int> > left_cds; std::vector<std::pair<int,int> > left_exons; //int idx[exons.size()]; size_t i; //,j; int itgUp1k_st, itgUp1k_end,itgDn1k_st,itgDn1k_end ; sort(exons.begin(),exons.end()); sort(cds.begin(),cds.end()); for (i = 1; i < exons.size(); i++) { intron.push_back(std::make_pair(exons[i-1].second +1, exons[i].first -1)); } if(strand == "+") { itgUp1k_st = std::max(int(0),exons[0].first-1000); itgUp1k_end = std::max(int(0),exons[0].first-1 ); itgDn1k_st = exons[exons.size()-1].second + 1; itgDn1k_end = exons[exons.size()-1].second + 1000 ; itg1k.push_back(std::make_pair(itgUp1k_st,itgUp1k_end)) ; itg1k.push_back(std::make_pair(itgDn1k_st,itgDn1k_end)) ; if (stop_codon_st == -1) { utr5 = exons; } else { cds[cds.size()-1].second = stop_codon_end; for( i=0;i < exons.size(); i++) { if (exons[i].second < cds[0].first) { utr5.push_back(exons[i]); } if (exons[i].first < cds[0].first && exons[i].second > cds[0].first ) { utr5.push_back(std::make_pair(exons[i].first,cds[0].first - 1)); } if (exons[i].first <= stop_codon_st && exons[i].second > stop_codon_end ) { utr3.push_back(std::make_pair(stop_codon_end + 1,exons[i].second)); } if (exons[i].first > stop_codon_end ) { utr3.push_back(exons[i]) ; } } } } else { itgDn1k_st = std::max(int(0),exons[0].first - 1000); itgDn1k_end = std::max(int(0),exons[0].first -1); itgUp1k_st = exons[exons.size()-1].second + 1 ; itgUp1k_end = exons[exons.size()-1].second + 1000 ; itg1k.push_back(std::make_pair(itgUp1k_st,itgUp1k_end)); itg1k.push_back(std::make_pair(itgDn1k_st,itgDn1k_end)); if (stop_codon_st == -1 ) { utr3 = exons; } else { //if (left_cds.size() == 1) { cds[0].first = stop_codon_st; for( i=0;i < exons.size(); i++) { if (exons[i].second < stop_codon_st) { utr3.push_back(exons[i]); } if (exons[i].first < stop_codon_st && exons[i].second >= stop_codon_end ) { utr3.push_back(std::make_pair(exons[i].first,stop_codon_st - 1)); } if (exons[i].first <= cds[cds.size()-1].first && exons[i].second > cds[cds.size()-1].second ) { utr5.push_back(std::make_pair(cds[cds.size()-1].second + 1,exons[i].second)); } if (exons[i].first > cds[cds.size()-1].second ) { utr5.push_back(exons[i]) ; } } } } } GeneFeatures::GeneFeatures(std::string GTFfilename,std::string id_attribute) { this->total_exon = 0; read_features(GTFfilename,id_attribute); } GeneFeatures::~GeneFeatures(){ chrom_itvTree_Dict_itr it; for (it=cds_exon_idx_plus.begin(); it != cds_exon_idx_plus.end(); it++) { std::map<int,IntervalTree *> tmp = it->second; std::map<int,IntervalTree *>::iterator tmp_itr ; //std::cout << tmp.size() << std::endl; for (tmp_itr = tmp.begin(); tmp_itr != tmp.end(); tmp_itr ++) { delete tmp_itr->second; } } for (it=cds_exon_idx_minus.begin(); it != cds_exon_idx_minus.end(); it++) { std::map<int,IntervalTree *> tmp = it->second; std::map<int,IntervalTree *>::iterator tmp_itr ; for (tmp_itr = tmp.begin(); tmp_itr != tmp.end(); tmp_itr ++) { delete tmp_itr->second; } } } void GeneFeatures::build_tree(std::map<std::string, std::map<std::string,Gene> > temp_plus, std::map<std::string, std::map<std::string,Gene> > temp_minus) { std::vector<Interval> itemlist; std::vector<Interval> sublist; std::map<std::string, std::map<std::string,Gene> >::iterator it; std::map<std::string,Gene> tmp; std::map<std::string,Gene>::iterator tmp_itr; gene_exon_Dict_It exon_str_itr; std::string chr,gid;//,start_ss,end_ss; int g_idx =-1; int e_idx =0; size_t i ; int cur_bin_id,start_bin_id, js, je, k;//, buket_size; for (it = temp_plus.begin(); it != temp_plus.end(); it++) { chr = it->first; tmp = it->second; itemlist.clear(); for (tmp_itr = tmp.begin(); tmp_itr != tmp.end(); tmp_itr++) { gid = tmp_itr->first; features.push_back(gid); //save gene name g_idx +=1; Gene tmp_gene = tmp_itr->second; tmp_gene.get_others(); //int gene_len = (int) tmp_gene.max_stop - tmp_gene.min_start + 1; //std::cout << gene_len << std::endl; std::vector<int> gene_base_pos ; for (i=0; i< tmp_gene.exons.size(); i++) { int st = (int) tmp_gene.exons[i].first - tmp_gene.min_start; int end = (int) tmp_gene.exons[i].second - tmp_gene.min_start; for (int j=st; j<=end; j++) { gene_base_pos.push_back(j); } } gene_lengths.push_back(tmp_gene.gene_actual_len); gene_starts.push_back(tmp_gene.min_start); gene_ends.push_back(tmp_gene.max_stop); std::vector<int> percentile_list; std::sort(gene_base_pos.begin(),gene_base_pos.end()); for (int j=0; j<=100;j++) { float kk = (tmp_gene.gene_actual_len - 1) * j/100.0; float f = floor(kk); float c = ceil(kk); if (f == c){ percentile_list.push_back( gene_base_pos[kk]); } else{ float d0 = gene_base_pos[int(f)] * (c-kk); float d1 = gene_base_pos[int(c)] * (kk-f); percentile_list.push_back(int(round(d0+d1))); } } gene_percentile_list.push_back(percentile_list); for (i=0; i < tmp_gene.cds.size(); i++) { this->total_exon ++ ; //std::cout<< this->total_exon << std::endl; itemlist.push_back(Interval(g_idx,e_idx,tmp_gene.cds[i].first,tmp_gene.cds[i].second,CDS)); e_idx ++; } for (i=0; i < tmp_gene.utr5.size(); i++) { this->total_exon ++ ; itemlist.push_back(Interval(g_idx,e_idx,tmp_gene.utr5[i].first,tmp_gene.utr5[i].second,UTR5)); e_idx ++; } for (i=0; i < tmp_gene.utr3.size(); i++) { this->total_exon ++ ; //std::cout << chr << "\t" << tmp_gene.utr3[i].first << "\t" << tmp_gene.utr3[i].second << "\t" << gid << std::endl; itemlist.push_back(Interval(g_idx,e_idx,tmp_gene.utr3[i].first,tmp_gene.utr3[i].second,UTR3)); e_idx ++; } for (i=0; i < tmp_gene.intron.size(); i++) { itemlist.push_back(Interval(g_idx,-1,tmp_gene.intron[i].first,tmp_gene.intron[i].second,INTRON)); } itemlist.push_back(Interval(g_idx,-1,tmp_gene.itg1k[0].first,tmp_gene.itg1k[0].second,ITGUP1K)); itemlist.push_back(Interval(g_idx,-1,tmp_gene.itg1k[1].first,tmp_gene.itg1k[1].second,ITGDN1K)) ; } std::sort(itemlist.begin(),itemlist.end(),itv_comp) ; //key=operator.attrgetter('start')); //quick_sort(itemlist,0,itemlist.size()-1); start_bin_id = itemlist[0].start/BIN_SIZE; js = 0 ; je = 0 ; k = 0 ; for (i=0; i < itemlist.size(); i++) { cur_bin_id = itemlist[i].start/BIN_SIZE; //std::cout << cur_bin_id << std::endl; if (cur_bin_id == start_bin_id) { je += 1; } else { //buket_size = (int)sqrt(je - js) + 1; //std::cout << buket_size << std::endl; sublist = std::vector<Interval>(itemlist.begin()+js,itemlist.begin() + je); cds_exon_idx_plus[chr][start_bin_id] = new IntervalTree(sublist); // std::cout << start_bin_id << " built one tree." << std::endl; k ++; start_bin_id = cur_bin_id ; js = je ; je ++; } } if (js != je) { //buket_size = (int) sqrt(je - js) + 1; sublist = std::vector<Interval>(itemlist.begin()+js,itemlist.begin() + je); cds_exon_idx_plus[chr][start_bin_id] = new IntervalTree(sublist); //print("tree depth = " + str(cds_exon_idx_plus[chr][start_bin_id].get_depth())+ "\n") k+=1; } } //std::cout << " build minus strand." << std::endl; for (it = temp_minus.begin(); it != temp_minus.end(); it++) { //std::cout << "negative strand " << std::endl; chr = it->first; tmp = it->second; itemlist.clear(); for (tmp_itr = tmp.begin(); tmp_itr != tmp.end(); tmp_itr++) { gid = tmp_itr->first; //std::cout << "GID " << gid << std::endl; Gene tmp_gene = tmp_itr->second; tmp_gene.get_others(); features.push_back(gid); //save gene name g_idx +=1; //std::string bs_string = ""; //int gene_len = (int) tmp_gene.max_stop - tmp_gene.min_start + 1; //std::vector<std::bitset<100> > bs_list; std::vector<int> gene_base_pos ; /*for (int j=0; j< gene_len; j+=100) { std::bitset<100> bs; bs_list.push_back(bs); }*/ for (i=0; i< tmp_gene.exons.size(); i++) { int st = (int) tmp_gene.exons[i].first - tmp_gene.min_start; int end = (int) tmp_gene.exons[i].second - tmp_gene.min_start; //int size = end - st; for (int j=st; j<=end; j++) { gene_base_pos.push_back(j); } } gene_lengths.push_back(tmp_gene.gene_actual_len); gene_starts.push_back(tmp_gene.min_start); gene_ends.push_back(tmp_gene.max_stop); std::vector<int> percentile_list; std::sort(gene_base_pos.begin(),gene_base_pos.end(),reverse_ord_func); for (int j=0; j<=100;j++) { float kk = (tmp_gene.gene_actual_len - 1) * j/100.0; float f = floor(kk); float c = ceil(kk); if (f == c){ percentile_list.push_back( gene_base_pos[kk]); } else{ float d0 = gene_base_pos[int(f)] * (c-kk); float d1 = gene_base_pos[int(c)] * (kk-f); percentile_list.push_back(int(round(d0+d1))); } } gene_percentile_list.push_back(percentile_list); for (i=0; i < tmp_gene.cds.size(); i++) { total_exon += 1; itemlist.push_back(Interval(g_idx,e_idx,tmp_gene.cds[i].first,tmp_gene.cds[i].second,CDS)); e_idx ++; } for (i=0; i < tmp_gene.utr5.size(); i++) { total_exon += 1; itemlist.push_back(Interval(g_idx,e_idx,tmp_gene.utr5[i].first,tmp_gene.utr5[i].second,UTR5)); e_idx ++; } for (i=0; i < tmp_gene.utr3.size(); i++) { total_exon += 1; itemlist.push_back(Interval(g_idx,e_idx,tmp_gene.utr3[i].first,tmp_gene.utr3[i].second,UTR3)); e_idx ++; } for (i=0; i < tmp_gene.intron.size(); i++) { itemlist.push_back(Interval(g_idx,-1,tmp_gene.intron[i].first,tmp_gene.intron[i].second,INTRON)); } itemlist.push_back(Interval(g_idx,-1,tmp_gene.itg1k[0].first,tmp_gene.itg1k[0].second,ITGUP1K)); itemlist.push_back(Interval(g_idx,-1,tmp_gene.itg1k[1].first,tmp_gene.itg1k[1].second,ITGDN1K)) ; } //std::sort(key=operator.attrgetter('start')); //quick_sort(itemlist,0,itemlist.size()-1); std::sort(itemlist.begin(),itemlist.end(),itv_comp) ; start_bin_id = itemlist[0].start/BIN_SIZE; js = 0 ; je = 0 ; k = 0 ; for (i=0; i < itemlist.size(); i++) { cur_bin_id = itemlist[i].start/BIN_SIZE; if (cur_bin_id == start_bin_id) { je += 1; } else { //buket_size = (int)sqrt(je - js) + 1; // std::cout << buket_size << std::endl; sublist = std::vector<Interval>(itemlist.begin()+js,itemlist.begin() + je); //cds_exon_idx_minus[chr][start_bin_id] = new IntervalTree(sublist,16,buket_size,-1,-1,buket_size); cds_exon_idx_minus[chr][start_bin_id] = new IntervalTree(sublist); k ++; start_bin_id = cur_bin_id ; js = je ; je ++; } } if (js != je) { //buket_size = (int) sqrt(je - js) + 1; //std::cout << buket_size << std::endl; sublist = std::vector<Interval>(itemlist.begin()+js,itemlist.begin() + je); //cds_exon_idx_minus[chr][start_bin_id] = new IntervalTree(sublist, 16, buket_size,-1,-1,buket_size ); cds_exon_idx_minus[chr][start_bin_id] = new IntervalTree(sublist); //print("tree depth = " + str(cds_exon_idx_plus[chr][start_bin_id].get_depth())+ "\n") k+=1; } } } //Reading & processing annotation files void GeneFeatures::read_features(std::string gff_filename, std::string id_attribute) { //dict of dicts since the builtin type doesn't support it for some reason std::map<std::string, std::map<std::string, Gene> > temp_plus ; std::map<std::string, std::map<std::string, Gene> > temp_minus ; std::map<std::string, std::map<std::string, Gene> >::iterator tmp_itr; std::map<std::string, Gene>::iterator id_itr; bool matched = false; //int k = 0; int i = 0; int counts = 0 ; int line_no = 0; int start = -1; int end = -1; std::size_t pos,cur_pos; //std::string left_str,sub_str; std::ifstream input; //(gff_filename); try{ input.open (gff_filename, std::ifstream::in); while(! input.eof()){ std::string line,chrom,source,feature,start_ss,end_ss,score,strand,frame,attributeStr; std::stringstream ss; std::string id = ""; if (! std::getline(input,line)){ break; } line_no ++; if (line == "\n" || !line.compare(0,1,"#")) { continue; } ss << line; std::getline(ss,chrom,'\t'); std::getline(ss,source,'\t'); std::getline(ss,feature,'\t'); std::getline(ss,start_ss,'\t'); std::getline(ss,end_ss,'\t'); std::getline(ss,score,'\t'); std::getline(ss,strand,'\t'); std::getline(ss,frame,'\t'); std::getline(ss,attributeStr,'\t'); //std::cout << strand << std::endl; try{ start = std::stol(start_ss); end = std::stol(end_ss); } catch (const std::invalid_argument& ia) { std::cerr << "Invalid argument: " << ia.what() << '\n'; std::exit(1); } cur_pos = 0; //std::cout <<attributeStr << std::endl; while (cur_pos < attributeStr.length()) { std::size_t next_pos = attributeStr.find(";",cur_pos); if (next_pos !=std::string::npos) { std::string tok = attributeStr.substr(cur_pos,(next_pos - cur_pos)); //std::cout << tok << std::endl; tok = tok.substr(tok.find_first_not_of(' ')); pos = tok.find('='); if (pos == std::string::npos) { pos = tok.find(' '); } std::string key = tok.substr(0,pos); key = key.substr(0,key.find(' ')); std::size_t pos_stop = 1 + pos + tok.substr(pos+1).find_first_not_of(' '); std::string val = (tok[pos_stop] == '"') ? tok.substr(pos_stop+1, (tok.length() - (pos_stop+2))) : tok.substr(pos_stop, (tok.length() - (pos_stop+1))); if (key == id_attribute) { id = val; matched = true; break; } cur_pos = next_pos + 1; } else { break; } } if (!matched) { std::cout << "Failure parsing GFF attribute line." << std::endl; exit (EXIT_FAILURE); } if (id == "") { continue; } if (feature == "stop_codon") { if (strand == "+" ){ tmp_itr = temp_plus.find(chrom); if (tmp_itr != temp_plus.end()) { id_itr = temp_plus[chrom].find(id); if (id_itr != temp_plus[chrom].end()) { Gene *g = &(id_itr->second); (*g).set_stop_codon(start,end); } else{ Gene g (id,strand); g.set_stop_codon(start,end); temp_plus[chrom].insert(std::pair<std::string,Gene>(id,g)); } } else{ Gene g(id,strand); g.set_stop_codon(start,end); std::map<std::string, Gene> gene_id_map ; gene_id_map.insert(std::pair<std::string,Gene> (id,g)); temp_plus.insert(std::pair<std::string,std::map<std::string, Gene> > (chrom,gene_id_map)); } } if (strand == "-" ) { tmp_itr = temp_minus.find(chrom); if (tmp_itr != temp_minus.end()) { id_itr = temp_minus[chrom].find(id); if (id_itr != temp_minus[chrom].end()) { Gene *g = &(id_itr->second); (*g).set_stop_codon(start,end); } else{ Gene g(id,strand); g.set_stop_codon(start,end); std::map<std::string, Gene> gene_id_map ; temp_minus[chrom].insert(std::pair<std::string,Gene>(id,g)); } } else{ Gene g(id,strand); g.set_stop_codon(start,end); std::map<std::string, Gene> gene_id_map ; gene_id_map.insert(std::pair<std::string,Gene> (id,g)); temp_minus.insert(std::pair<std::string,std::map<std::string, Gene> > (chrom,gene_id_map)); } } } if (feature == "CDS" ){ if (strand == "+" ){ tmp_itr = temp_plus.find(chrom); if (tmp_itr != temp_plus.end()) { id_itr = temp_plus[chrom].find(id); if (id_itr != temp_plus[chrom].end()) { Gene *g = &(id_itr->second); //(id_itr->second).add_cds(start,end); (*g).add_cds(start,end); } else{ Gene g (id,strand); g.add_cds(start,end); temp_plus[chrom].insert(std::pair<std::string,Gene>(id,g)); } } else{ Gene g(id,strand); g.add_cds(start,end); std::map<std::string, Gene> gene_id_map ; gene_id_map.insert(std::pair<std::string,Gene> (id,g)); temp_plus.insert(std::pair<std::string,std::map<std::string, Gene> > (chrom,gene_id_map)); } } if (strand == "-" ) { tmp_itr = temp_minus.find(chrom); if (tmp_itr != temp_minus.end()) { id_itr = temp_minus[chrom].find(id); if (id_itr != temp_minus[chrom].end()) { Gene *g = &(id_itr->second); (*g).add_cds(start,end); } else{ Gene g(id,strand); g.add_cds(start,end); std::map<std::string, Gene> gene_id_map ; temp_minus[chrom].insert(std::pair<std::string,Gene>(id,g)); } } else{ Gene g(id,strand); g.add_cds(start,end); std::map<std::string, Gene> gene_id_map ; gene_id_map.insert(std::pair<std::string,Gene> (id,g)); temp_minus.insert(std::pair<std::string,std::map<std::string, Gene> > (chrom,gene_id_map)); } } } if (feature == "exon" ){ counts += 1 ; if (strand == "+" ){ tmp_itr = temp_plus.find(chrom); if (tmp_itr != temp_plus.end()) { id_itr = temp_plus[chrom].find(id); if (id_itr != temp_plus[chrom].end()) { Gene *g = &(id_itr->second); (*g).add_exons(start,end); } else{ Gene g (id,strand); g.add_exons(start,end); temp_plus[chrom].insert(std::pair<std::string,Gene>(id,g)); } } else{ Gene g (id,strand); g.add_exons(start,end); std::map<std::string, Gene> gene_id_map ; gene_id_map.insert(std::pair<std::string,Gene> (id,g)); temp_plus.insert(std::pair<std::string,std::map<std::string, Gene> > (chrom,gene_id_map)); } } if (strand == "-" ) { tmp_itr = temp_minus.find(chrom); if (tmp_itr != temp_minus.end()) { id_itr = temp_minus[chrom].find(id); if (id_itr != temp_minus[chrom].end()) { Gene *g = &(id_itr->second); (*g).add_exons(start,end); } else{ Gene g(id,strand); g.add_exons(start,end); temp_minus[chrom].insert(std::pair<std::string,Gene>(id,g)); } } else{ Gene g (id,strand); g.add_exons(start,end); std::map<std::string, Gene> gene_id_map ; gene_id_map.insert(std::pair<std::string,Gene> (id,g)); temp_minus.insert(std::pair<std::string,std::map<std::string, Gene> > (chrom,gene_id_map)); } } } i += 1 ; //if (i % 100000 == 0 ) //{ //sys.stderr.write("%d GTF lines processed.\n" % i); //std::cout << i << " GTF lines processed." << std::endl; //} } input.close(); } catch(std::ifstream::failure e){ std::cout << "error in read file " << gff_filename << std::endl; } if (counts == 0 ){ std::cout << "Warning: No features of type 'exon' or 'CDS' found in gene GTF file." << std::endl; } build_tree(temp_plus,temp_minus); } //find exons of given gene that overlap with the given intervals //return list of tuples std::vector<std::pair<int,int> > GeneFeatures::get_exons(std::string chrom,int st,int end,std::string strand) { std::vector<std::pair<int,int> > exons; chrom_itvTree_Dict::iterator chrom_it; std::map<int,IntervalTree *>::iterator bin_iter; std::vector<Interval> fs ; std::vector<Interval> temp ; size_t i; int bin_id ; //try: bin_id = st/BIN_SIZE ; if (strand == "+" || strand == "."){ chrom_it = cds_exon_idx_plus.find(chrom); if (chrom_it != cds_exon_idx_plus.end()) { bin_iter = cds_exon_idx_plus[chrom].find(bin_id); if (bin_iter != cds_exon_idx_plus[chrom].end()) { fs = (*cds_exon_idx_plus[chrom][bin_id]).find(st,end); } } } if (strand == "-" || strand == "."){ chrom_it = cds_exon_idx_minus.find(chrom); if (chrom_it != cds_exon_idx_minus.end()) { bin_iter = cds_exon_idx_minus[chrom].find(bin_id); if (bin_iter != cds_exon_idx_minus[chrom].end()) { temp = (*cds_exon_idx_minus[chrom][bin_id]).find(st,end); fs.insert(fs.end(),temp.begin(),temp.end()); } } } for(i =0 ; i < fs.size(); i++){ if (fs[i].type != INTRON && fs[i].type != ITGUP1K && fs[i].type != ITGDN1K){ int s = fs[i].start; int e = fs[i].stop; if (s < st) { s = st; } if (e > end) { e = end; } exons.push_back(std::make_pair(s,e)); } } //std::sort(exons.begin(),exons.end()); return exons; } std::vector<std::string> GeneFeatures::getFeatures() { return features ; } int GeneFeatures::get_start(int g) { if ((size_t) g < gene_starts.size() && g >=0) { return gene_starts[g]; } else{ return -1; } } int GeneFeatures::get_stop(int g) { if ((size_t)g < gene_ends.size() && g >=0) { return gene_ends[g]; } else{ return -1; } } std::string GeneFeatures::get_name(int g) { if ((size_t)g < features.size() && g >=0) { return features[g]; } else{ return ""; } } int GeneFeatures::get_numofgenes(){ return features.size(); } int GeneFeatures::exist_in_percentile_list(int gene,int pos){ int idx = (int)pos - gene_starts[gene]; for (int i=0; i<=100; i++) { if (idx == gene_percentile_list[gene][i]) { return i; } } return -1; } std::map<int,int> GeneFeatures::Gene_annotation(std::string chrom, std::vector<std::pair<int,int> > itv_list,std::string strand,std::vector<int> * mapped_exons ) { std::vector<Interval> fs ; chrom_itvTree_Dict::iterator chrom_it; std::map<int,IntervalTree *>::iterator bin_iter; size_t i,j; int bin_id_s, bin_id_e; for (i=0; i < itv_list.size(); i ++) { bin_id_s = itv_list[i].first/BIN_SIZE; bin_id_e = itv_list[i].second/BIN_SIZE; if (strand == "+" || strand == ".") { chrom_it = cds_exon_idx_plus.find(chrom); if (chrom_it != cds_exon_idx_plus.end()) { bin_iter = cds_exon_idx_plus[chrom].find(bin_id_s); if (bin_iter != cds_exon_idx_plus[chrom].end() ) { fs = (*cds_exon_idx_plus[chrom][bin_id_s]).find(itv_list[i].first,itv_list[i].second); if (bin_id_s != bin_id_e) { bin_iter = cds_exon_idx_plus[chrom].find(bin_id_e); if (bin_iter != cds_exon_idx_plus[chrom].end() ){ std::vector<Interval> tmp = (*cds_exon_idx_plus[chrom][bin_id_e]).find(itv_list[i].first,itv_list[i].second); fs.insert(fs.end(),tmp.begin(),tmp.end()); } } } } } if (strand == "-" or strand == ".") { chrom_it = cds_exon_idx_minus.find(chrom); if (chrom_it != cds_exon_idx_minus.end()) { bin_iter = cds_exon_idx_minus[chrom].find(bin_id_s); if (bin_iter != cds_exon_idx_minus[chrom].end() ) { std::vector<Interval> tmp = (*cds_exon_idx_minus[chrom][bin_id_s]).find(itv_list[i].first,itv_list[i].second); fs.insert(fs.end(),tmp.begin(),tmp.end()); if (bin_id_s != bin_id_e) { bin_iter = cds_exon_idx_minus[chrom].find(bin_id_e); if (bin_iter != cds_exon_idx_minus[chrom].end() ){ std::vector<Interval> tmp = (*cds_exon_idx_minus[chrom][bin_id_e]).find(itv_list[i].first,itv_list[i].second); fs.insert(fs.end(),tmp.begin(),tmp.end()); } } } } } } std::map<int,int> gene_type_map; std::map<int,int> gene_ovp_len_map; std::map<int,int> None_gene_type_map; std::map<int,int>::iterator gene_type_map_itr; int min_type = 6; for (i=0; i < fs.size(); i++) { if (fs[i].type <= UTR3) { int ovp_len = 0; for(j=0; j < itv_list.size(); j ++) { if (itv_list[j].first < fs[i].stop && itv_list[j].second > fs[i].start) { int ovp_st = itv_list[j].first > fs[i].start ? itv_list[j].first : fs[i].start; int ovp_end = itv_list[j].second > fs[i].stop ? fs[i].stop : itv_list[j].second; ovp_len += ovp_end - ovp_st+1; } } if (gene_ovp_len_map.find(fs[i].gene) != gene_ovp_len_map.end()) { gene_ovp_len_map[fs[i].gene] += ovp_len; if (fs[i].type <= min_type) { gene_type_map[fs[i].gene] = fs[i].type; min_type = fs[i].type; } } else { gene_ovp_len_map.insert(std::pair<int,int> (fs[i].gene,ovp_len)); gene_type_map.insert(std::pair<int,int>(fs[i].gene,fs[i].type)); min_type = fs[i].type; } } else { //not CDS or UTR if (None_gene_type_map.find(fs[i].gene) != None_gene_type_map.end()) { if(fs[i].type < None_gene_type_map[fs[i].gene]) { None_gene_type_map[fs[i].gene] = fs[i].type; } } else { None_gene_type_map.insert(std::pair<int, int> (fs[i].gene,fs[i].type)); } } //genes.push_back(std::pair<int,int>(fs[i].type,fs[i].gene)); if (fs[i].exon != -1) { mapped_exons->push_back(fs[i].exon); } } if (gene_type_map.size() == 0) { return None_gene_type_map; } else { std::map<int,int> res_gene_type_map; int max_ovp_len = 0; for (auto& kv : gene_ovp_len_map) { if( max_ovp_len < kv.second) { max_ovp_len = kv.second; } } for (auto& kv : gene_ovp_len_map) { if (gene_ovp_len_map[kv.first] == max_ovp_len) { res_gene_type_map.insert(std::pair<int,int> (kv.first,gene_type_map[kv.first])); } } return res_gene_type_map; } return gene_type_map; } /* int main() { std::string filename = "test.gtf"; std::string id_attr = "gene_id"; std::vector<std::pair<int,int> > itv_list ; chr_ITV exp ; exp.chrom = "chr4"; int start = 1048489; int end = 1049900; std::pair<int,int> p (start,end); itv_list.push_back(p); std::cout << "start to build tree " << std::endl; GeneFeatures gIdx (filename,id_attr); std::cout << "after build tree " << std::endl; std::cout << "total exon " << gIdx.total_exon << std::endl; std::cout << "total gene " << gIdx.features.size() << std::endl; // for (int i=0; i < itv_list.size(); i++) { // std::cout << itv_list[i].start << std::endl; //} std::vector<int> *exons = new std::vector<int>(); std::vector<int> res = gIdx.Gene_annotation("chr4",itv_list,".",exons); std::vector<std::pair<int,int> > exon_list = gIdx.get_exons("chr4",start,end,"."); for (auto& p : exon_list){ std::cout << p.first << "\t" << p.second << std::endl; } std::vector<std::vector<int> > gene_percentile_list = gIdx.gene_percentile_list; for (int i=0; i<gene_percentile_list.size(); i++) { std::cout << i << std::endl; std::vector<int> perc_list = gene_percentile_list[i]; for (int j=0; j<perc_list.size(); j++) { std::cout << perc_list[j] << "\t"; } std::cout << "\n"; } std::cout << exons->size() << std::endl; for (int i=0;i<res.size(); i++) { std::cout << res[i] << std::endl; //std::cout << exons->operator[](i) << std::endl; } delete exons; bool test_bool = false; std::cout << test_bool << std::endl; }*/