view match.cpp @ 15:0e221dbd17b2 default tip

Uploaded
author xuebing
date Sat, 31 Mar 2012 08:53:06 -0400
parents
children
line wrap: on
line source

#include <string>
#include <sstream>
#include <iostream>
#include <fstream>
#include <vector>
#include <set>
#include <map>

#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

using namespace std;

string current_time()
{// get current time, in the format:
 // Thu Mar 15 21:06:57 2012
  time_t rawtime;
  struct tm * timeinfo;
  time ( &rawtime );
  timeinfo = localtime ( &rawtime );
  string str = asctime (timeinfo);
  return str.substr(0,str.size()-1);
}  
    
template<class key,class val>
void PrintMap(map<key,val> m)
{// print out a map
  for (typename map<key,val>::iterator it = m.begin();it!=m.end(); it++)
  {
    cout << (*it).first << "\t=>\t" <<(*it).second<<endl;
  }
}

char complement(char ch){
        switch (ch)
        {
        case 'A':return 'T';
        case 'C':return 'G';
        case 'G':return 'C';
        case 'T':return 'A';
        default:return 'N';
        }
}

string reverseComplement(string seq){
        int L = seq.length();
        string rc (L,'0');
        for (int i=0;i<L; i++)
        {
                rc[L-i-1] = complement(seq[i]);
        }
        return rc;
}

void ReadOneSeqFromFasta(ifstream& infile, string& name, string& seq)
{
  // read one sequence from fasta file
  getline(infile,name); // read identifier line
  name = name.substr(1);// remove leading '>'
  seq = "";		// initialize sequence
  string str;		
  while(infile.peek() != '>' && infile.good())
  {// before next '>' and before hitting the end of the file
        getline(infile,str);
        seq.append(str);
    }
} 

vector<int> findall(string seq, string motif)
{
  // find all match positions of motif in seq
  vector<int> allpos;
  size_t pos = 0;
  while(pos != string::npos)
  {
    pos = seq.find(motif,pos+1);
    allpos.push_back(pos);
  }
  return allpos;
}

string int2str(int number)
{
   stringstream ss;//create a stringstream
   ss << number;//add number to the stream
   return ss.str();//return a string with the contents of the stream
}



map<string,string> ReadFasta(string filename)
{//read all sequences in a fasta file
  ifstream fin(filename.c_str());
  
  map<string,string> seqs;
  string name,seq;
  while(fin.good())
  {
    ReadOneSeqFromFasta(fin,name,seq);
    seqs[name] = seq;
  }
  fin.close();
  return seqs;
}

void mismatches(map<string,string>& mutant,map<string,int>& dist, string motif, int n, set<char> alphabet)
{
  set<char>::iterator it;
  if (mutant.count(motif) == 0)
  {
    mutant[motif] = "";
    dist[motif]=n;
  }
  if(n==0){return;}
  for (int i=0;i<motif.length();i++)
  {
      string str=motif;
      set<char> ab = alphabet;
      ab.erase(str[i]);
      for (it = ab.begin(); it!=ab.end(); it++)
      {
         str[i] = *it;
         //cout << "mutate "<<motif<<" to "<<str<<endl;
         if (mutant.count(str) >0)
         {
           if(dist[str] >= n)
           {
             //cout << mutant[str] <<endl;
             continue;
           }
         }
         
         //mutated to a new sequence
           //cout <<"new mutation"<<endl;
           mutant[str] = mutant[motif];
           mutant[str].push_back(','); 
           mutant[str].push_back(motif[i]);
           mutant[str].append(int2str(i+1));
           mutant[str].push_back(str[i]);
           dist[str]=n;
           //cout << "tag="<<mutant[str]<<" dist="<<n<<endl;
   
         if (n>1)
         {
           //cout << "subproc" <<endl;
           mismatches(mutant,dist,str,n-1,alphabet);
         }
      }

  }
}

map<string,string> ExpandMotifs(map<string,string>& motifs, int nmismatch, bool rc, set<char> alphabet) 
{ 
  map<string,string> expandedmotifs;
  // generate mismatched motifs
  map<string,string> mutants;
  map<string,int> dist;
  map<string,string>::iterator it;   // iterator for motifs
  map<string,int>::iterator it2; // iterator for mutants
  string name,seq,tmp;
  //cout<<"input motifs"<<endl;
  //PrintMap(motifs);
  for(it=motifs.begin();it!=motifs.end();it++)
  {
    name = (*it).first;
    seq = (*it).second;
   
    mismatches(mutants,dist,seq,nmismatch,alphabet);
    //cout << mutants.size()<<" mutants identified" <<endl;
    //PrintMap(mutants);
    // add mutants to motifs
    for(it2=dist.begin();it2!=dist.end();it2++)
    {
      string tmp = name;
      tmp.append(",").append((*it2).first);
      tmp.append(mutants[(*it2).first]);
      expandedmotifs[tmp] = (*it2).first;
      //cout << name <<","<<tmp<<","<<expandedmotifs[tmp]<<endl;
    }
    // clear the mutants list
    mutants.clear();
    dist.clear();
  }
  //PrintMap(expandedmotifs);
  //cout << expandedmotifs.size() <<" expanded motifs"<<endl;
  //cout <<"add reverse complement"<<endl;
  map<string,string> expandedmotifs_rc = expandedmotifs;
  if (rc)
  {
    for(it=expandedmotifs.begin();it!=expandedmotifs.end();it++)
    {
      name = (*it).first;
      expandedmotifs_rc[name.append(",rc")] = reverseComplement((*it).second);
    }
  }   
 
  return expandedmotifs_rc;
 
}



int* match(string motiffile, string seqfile, string outfile, int nmismatch, bool rc, set<char> alphabet)
{
  int nsite = 0; // total number of matches to report
  int nseq = 0;
  ifstream fmotif, fseq;
  ofstream fout;
  
  // load motifs
  map<string,string> motifs = ReadFasta(motiffile);
  cout <<"["<<current_time()<<"] "<<motifs.size()<< " motifs loaded from "<<motiffile<<endl;
  
  // expand motifs
  map<string,string> expandedmotifs = ExpandMotifs(motifs,nmismatch,rc,alphabet);
  cout <<"["<<current_time()<<"] "<<expandedmotifs.size()<< " motifs after expanding (mismatch/reverse-complement)"<<endl;
  
  //PrintMap(expandedmotifs);
  
  // searching motifs in each sequence
  fseq.open(seqfile.c_str());
  fout.open(outfile.c_str());
  
  string seqname,seq,motifname,motif;
  while(fseq.good())
  {
    // read one sequence
    ReadOneSeqFromFasta(fseq,seqname,seq);
    nseq = nseq + 1;

    cout.flush();
    // iterate over motifs
    map<string,string>::iterator it;
    for(it=expandedmotifs.begin();it!=expandedmotifs.end();it++)
    {
      motifname = (*it).first;
      motif = (*it).second;
      //cout << "searching for "<<motifname<<":"<< motif <<endl;
      vector<int> found = findall(seq,motif);
      for (int i =0;i<found.size()-1;i++)
      {
        fout <<seqname<<"\t"<<found[i]<< "\t"<< motifname <<"\t"<<motif<<endl; 
      }
      nsite = nsite + found.size()-1;
    }
    cout <<"\r["<<current_time()<<"] " << nsite << " sites found in "<< nseq << " sequences             "  ;
  }
  
  cout << endl; 
  fseq.close();
  fout.close();
  
  int* res = new int[2];
  res[0] = nsite;
  res[1] = nseq;
  
  return res;
}

vector<string>  StringExplode(string str, string separator){
    int found;
    vector<string> results;
    found = str.find_first_of(separator);
    while(found != string::npos){
        if(found > 0){
            results.push_back(str.substr(0,found));
        }
        str = str.substr(found+1);
        found = str.find_first_of(separator);
    }
    if(str.length() > 0){
        results.push_back(str);
    }
    return results;
}

int tab2bed(string infile, string outfile)
{
  //hg18_chr6_122208322_122209078_+	635	5ss,A7C,G8T-rc	AAGTACCTG
  //hg18_chr6_122208322_122209078_+	553	5ss,C1G,G3A	GAAGTAAGT
  ifstream fin;
  ofstream fout;
  fin.open(infile.c_str());
  fout.open(outfile.c_str());
  string line;
  vector<string> flds;
  vector<string> pos;
  vector<string> nm;
  while(fin)
  {
    getline(fin,line);
    if (line.length() == 0)
      continue;
    flds = StringExplode(line,"\t");
    pos = StringExplode(flds[0],"_");
    if (pos.size() < 5)
    {
      cout << "\n!! incorrect sequence name format!\n make sure sequence name looks like: hg18_chr6_122208322_122209078_+\n\n";
      return 0;
    }
    string chr = pos[1];
    int start = atoi(pos[2].c_str());
    int end = atoi(pos[3].c_str());
    int match_start = atoi(flds[1].c_str());
    int motifLen = flds[3].length();
    // check if match on the other strand
    string strandness = "sense";
    if (flds[2].find("rc") !=string::npos)
      strandness = "antisense";
    string strand = pos[4];
    
    if (strand== "+")
    {
      start = start + match_start;
      if (strandness == "antisense") 
      {
        strand = "-";
      }    
    }
    else//sequence on the - strand of the genome
    {
      start = end - match_start - motifLen;
      if (strandness == "antisense") 
      {
        strand = "+";
      }
    }
    end = start + motifLen;
    // number of mismatches
    nm = StringExplode(flds[2],",");
    int score = nm.size()-1; 
   
    fout << chr <<"\t"<<start<<"\t" <<end<<"\t"<<flds[3]<< "\t"<<score <<"\t"<<strand<< "\t"<<strandness<<"\t"<<flds[2]<<"\t"<<flds[0]<<"\t"<<flds[1]<<endl;
    
  }
  fin.close();
  fout.close();
  return 1;//return 1 if successfully created bed file
}

int main(int argc, char* argv[]) { 

  if (argc < 7)
  {
    cout << "usage\n";
    cout << "  match motif.fa seq.fa outfile nmismatch rc/norc bed/nobed" << endl;
    cout << "==========\n";
    cout << "arguments:" <<endl;
    cout << "==========\n";
    cout << "  1. motif file, fasta format, could include multiple sequences\n";
    cout << "  2. sequence file, fasta format, could include multiple sequences\n";
    cout << "  3. output file name\n";
    cout << "  4. to search on reverse complement strand or not. use 'rc' or 'norc'\n";
    cout << "  5. to output BED format or not. use 'bed' only if sequence have id like: hg18_chr12_52642148_52644699_+\n";
    cout << "==========\n";
    cout << "output format\n";
    cout << "==========\n";
    cout << " tabular:\n";
    cout << "==========\n";
    cout << "  hg18_chr10_20048197_20048851_+	21	5ss,T5A:CAGGAAAGT-rc	ACTTTCCTG\n";
    cout << "  column 1: sequence id\n";
    cout << "  column 2: start of the match in sequence\n";
    cout << "  column 3: motif variant id, format: motif_id,mismatch:motif_sequence[-reverse_complement]\n";
    cout << "  column 4: actual match in target sequence\n";
    cout << "==========\n";
    cout << " BED format:\n";
    cout << "==========\n";
    cout << "  column 1-3: genomic coordinates of the match, calculated using sequence ID: hg18_chr12_52642148_52644699_+ and match start in the sequence\n";
    cout << "  column 4: number of mismatches\n";
    cout << "  column 6: strand of the match sequence\n";
    cout << "  column 7: strandness of the match relative to the target sequence\n";
    return 1;
  }
  
  
  string motiffile = argv[1]; // motif file
  string seqfile = argv[2]; // sequence file
  string outfile = argv[3]; // output file
  int nmismatch = atoi(argv[4]);// # of mismatches allowed
  string _rc = argv[5]; // rc or norc: to search reverse complement strand or not
  string _bed = argv[6]; // to make bed output or not. only valid if sequence have id like: hg18_chr12_52642148_52644699_+
  bool rc = false;
  if (_rc == "rc") {rc=true;}
  bool bed = false;
  if (_bed == "bed") {bed=true;}
  
    // alphabet
  string ACGT_str = "ACGT";
  set<char> alphabet;
  for (int i=0;i<4;i++)
  {
    alphabet.insert(ACGT_str[i]);
  }

  int *res  = match(motiffile, seqfile, outfile, nmismatch, rc, alphabet);
  
  if (_bed == "bed" && res[0]>0)
  {
    cout <<"["<<current_time()<<"] creating BED format output..."<<endl;
    string bedfile = outfile;
    bedfile += ".bed";
    if (tab2bed(outfile,bedfile))
    {
      rename(bedfile.c_str(),outfile.c_str());
    }
  }
  cout <<"["<<current_time()<<"] Done"<<endl;
  return 0;
}