diff match.cpp @ 15:0e221dbd17b2 default tip

Uploaded
author xuebing
date Sat, 31 Mar 2012 08:53:06 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/match.cpp	Sat Mar 31 08:53:06 2012 -0400
@@ -0,0 +1,413 @@
+#include <string>
+#include <sstream>
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <set>
+#include <map>
+
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+using namespace std;
+
+string current_time()
+{// get current time, in the format:
+ // Thu Mar 15 21:06:57 2012
+  time_t rawtime;
+  struct tm * timeinfo;
+  time ( &rawtime );
+  timeinfo = localtime ( &rawtime );
+  string str = asctime (timeinfo);
+  return str.substr(0,str.size()-1);
+}  
+    
+template<class key,class val>
+void PrintMap(map<key,val> m)
+{// print out a map
+  for (typename map<key,val>::iterator it = m.begin();it!=m.end(); it++)
+  {
+    cout << (*it).first << "\t=>\t" <<(*it).second<<endl;
+  }
+}
+
+char complement(char ch){
+        switch (ch)
+        {
+        case 'A':return 'T';
+        case 'C':return 'G';
+        case 'G':return 'C';
+        case 'T':return 'A';
+        default:return 'N';
+        }
+}
+
+string reverseComplement(string seq){
+        int L = seq.length();
+        string rc (L,'0');
+        for (int i=0;i<L; i++)
+        {
+                rc[L-i-1] = complement(seq[i]);
+        }
+        return rc;
+}
+
+void ReadOneSeqFromFasta(ifstream& infile, string& name, string& seq)
+{
+  // read one sequence from fasta file
+  getline(infile,name); // read identifier line
+  name = name.substr(1);// remove leading '>'
+  seq = "";		// initialize sequence
+  string str;		
+  while(infile.peek() != '>' && infile.good())
+  {// before next '>' and before hitting the end of the file
+        getline(infile,str);
+        seq.append(str);
+    }
+} 
+
+vector<int> findall(string seq, string motif)
+{
+  // find all match positions of motif in seq
+  vector<int> allpos;
+  size_t pos = 0;
+  while(pos != string::npos)
+  {
+    pos = seq.find(motif,pos+1);
+    allpos.push_back(pos);
+  }
+  return allpos;
+}
+
+string int2str(int number)
+{
+   stringstream ss;//create a stringstream
+   ss << number;//add number to the stream
+   return ss.str();//return a string with the contents of the stream
+}
+
+
+
+map<string,string> ReadFasta(string filename)
+{//read all sequences in a fasta file
+  ifstream fin(filename.c_str());
+  
+  map<string,string> seqs;
+  string name,seq;
+  while(fin.good())
+  {
+    ReadOneSeqFromFasta(fin,name,seq);
+    seqs[name] = seq;
+  }
+  fin.close();
+  return seqs;
+}
+
+void mismatches(map<string,string>& mutant,map<string,int>& dist, string motif, int n, set<char> alphabet)
+{
+  set<char>::iterator it;
+  if (mutant.count(motif) == 0)
+  {
+    mutant[motif] = "";
+    dist[motif]=n;
+  }
+  if(n==0){return;}
+  for (int i=0;i<motif.length();i++)
+  {
+      string str=motif;
+      set<char> ab = alphabet;
+      ab.erase(str[i]);
+      for (it = ab.begin(); it!=ab.end(); it++)
+      {
+         str[i] = *it;
+         //cout << "mutate "<<motif<<" to "<<str<<endl;
+         if (mutant.count(str) >0)
+         {
+           if(dist[str] >= n)
+           {
+             //cout << mutant[str] <<endl;
+             continue;
+           }
+         }
+         
+         //mutated to a new sequence
+           //cout <<"new mutation"<<endl;
+           mutant[str] = mutant[motif];
+           mutant[str].push_back(','); 
+           mutant[str].push_back(motif[i]);
+           mutant[str].append(int2str(i+1));
+           mutant[str].push_back(str[i]);
+           dist[str]=n;
+           //cout << "tag="<<mutant[str]<<" dist="<<n<<endl;
+   
+         if (n>1)
+         {
+           //cout << "subproc" <<endl;
+           mismatches(mutant,dist,str,n-1,alphabet);
+         }
+      }
+
+  }
+}
+
+map<string,string> ExpandMotifs(map<string,string>& motifs, int nmismatch, bool rc, set<char> alphabet) 
+{ 
+  map<string,string> expandedmotifs;
+  // generate mismatched motifs
+  map<string,string> mutants;
+  map<string,int> dist;
+  map<string,string>::iterator it;   // iterator for motifs
+  map<string,int>::iterator it2; // iterator for mutants
+  string name,seq,tmp;
+  //cout<<"input motifs"<<endl;
+  //PrintMap(motifs);
+  for(it=motifs.begin();it!=motifs.end();it++)
+  {
+    name = (*it).first;
+    seq = (*it).second;
+   
+    mismatches(mutants,dist,seq,nmismatch,alphabet);
+    //cout << mutants.size()<<" mutants identified" <<endl;
+    //PrintMap(mutants);
+    // add mutants to motifs
+    for(it2=dist.begin();it2!=dist.end();it2++)
+    {
+      string tmp = name;
+      tmp.append(",").append((*it2).first);
+      tmp.append(mutants[(*it2).first]);
+      expandedmotifs[tmp] = (*it2).first;
+      //cout << name <<","<<tmp<<","<<expandedmotifs[tmp]<<endl;
+    }
+    // clear the mutants list
+    mutants.clear();
+    dist.clear();
+  }
+  //PrintMap(expandedmotifs);
+  //cout << expandedmotifs.size() <<" expanded motifs"<<endl;
+  //cout <<"add reverse complement"<<endl;
+  map<string,string> expandedmotifs_rc = expandedmotifs;
+  if (rc)
+  {
+    for(it=expandedmotifs.begin();it!=expandedmotifs.end();it++)
+    {
+      name = (*it).first;
+      expandedmotifs_rc[name.append(",rc")] = reverseComplement((*it).second);
+    }
+  }   
+ 
+  return expandedmotifs_rc;
+ 
+}
+
+
+
+int* match(string motiffile, string seqfile, string outfile, int nmismatch, bool rc, set<char> alphabet)
+{
+  int nsite = 0; // total number of matches to report
+  int nseq = 0;
+  ifstream fmotif, fseq;
+  ofstream fout;
+  
+  // load motifs
+  map<string,string> motifs = ReadFasta(motiffile);
+  cout <<"["<<current_time()<<"] "<<motifs.size()<< " motifs loaded from "<<motiffile<<endl;
+  
+  // expand motifs
+  map<string,string> expandedmotifs = ExpandMotifs(motifs,nmismatch,rc,alphabet);
+  cout <<"["<<current_time()<<"] "<<expandedmotifs.size()<< " motifs after expanding (mismatch/reverse-complement)"<<endl;
+  
+  //PrintMap(expandedmotifs);
+  
+  // searching motifs in each sequence
+  fseq.open(seqfile.c_str());
+  fout.open(outfile.c_str());
+  
+  string seqname,seq,motifname,motif;
+  while(fseq.good())
+  {
+    // read one sequence
+    ReadOneSeqFromFasta(fseq,seqname,seq);
+    nseq = nseq + 1;
+
+    cout.flush();
+    // iterate over motifs
+    map<string,string>::iterator it;
+    for(it=expandedmotifs.begin();it!=expandedmotifs.end();it++)
+    {
+      motifname = (*it).first;
+      motif = (*it).second;
+      //cout << "searching for "<<motifname<<":"<< motif <<endl;
+      vector<int> found = findall(seq,motif);
+      for (int i =0;i<found.size()-1;i++)
+      {
+        fout <<seqname<<"\t"<<found[i]<< "\t"<< motifname <<"\t"<<motif<<endl; 
+      }
+      nsite = nsite + found.size()-1;
+    }
+    cout <<"\r["<<current_time()<<"] " << nsite << " sites found in "<< nseq << " sequences             "  ;
+  }
+  
+  cout << endl; 
+  fseq.close();
+  fout.close();
+  
+  int* res = new int[2];
+  res[0] = nsite;
+  res[1] = nseq;
+  
+  return res;
+}
+
+vector<string>  StringExplode(string str, string separator){
+    int found;
+    vector<string> results;
+    found = str.find_first_of(separator);
+    while(found != string::npos){
+        if(found > 0){
+            results.push_back(str.substr(0,found));
+        }
+        str = str.substr(found+1);
+        found = str.find_first_of(separator);
+    }
+    if(str.length() > 0){
+        results.push_back(str);
+    }
+    return results;
+}
+
+int tab2bed(string infile, string outfile)
+{
+  //hg18_chr6_122208322_122209078_+	635	5ss,A7C,G8T-rc	AAGTACCTG
+  //hg18_chr6_122208322_122209078_+	553	5ss,C1G,G3A	GAAGTAAGT
+  ifstream fin;
+  ofstream fout;
+  fin.open(infile.c_str());
+  fout.open(outfile.c_str());
+  string line;
+  vector<string> flds;
+  vector<string> pos;
+  vector<string> nm;
+  while(fin)
+  {
+    getline(fin,line);
+    if (line.length() == 0)
+      continue;
+    flds = StringExplode(line,"\t");
+    pos = StringExplode(flds[0],"_");
+    if (pos.size() < 5)
+    {
+      cout << "\n!! incorrect sequence name format!\n make sure sequence name looks like: hg18_chr6_122208322_122209078_+\n\n";
+      return 0;
+    }
+    string chr = pos[1];
+    int start = atoi(pos[2].c_str());
+    int end = atoi(pos[3].c_str());
+    int match_start = atoi(flds[1].c_str());
+    int motifLen = flds[3].length();
+    // check if match on the other strand
+    string strandness = "sense";
+    if (flds[2].find("rc") !=string::npos)
+      strandness = "antisense";
+    string strand = pos[4];
+    
+    if (strand== "+")
+    {
+      start = start + match_start;
+      if (strandness == "antisense") 
+      {
+        strand = "-";
+      }    
+    }
+    else//sequence on the - strand of the genome
+    {
+      start = end - match_start - motifLen;
+      if (strandness == "antisense") 
+      {
+        strand = "+";
+      }
+    }
+    end = start + motifLen;
+    // number of mismatches
+    nm = StringExplode(flds[2],",");
+    int score = nm.size()-1; 
+   
+    fout << chr <<"\t"<<start<<"\t" <<end<<"\t"<<flds[3]<< "\t"<<score <<"\t"<<strand<< "\t"<<strandness<<"\t"<<flds[2]<<"\t"<<flds[0]<<"\t"<<flds[1]<<endl;
+    
+  }
+  fin.close();
+  fout.close();
+  return 1;//return 1 if successfully created bed file
+}
+
+int main(int argc, char* argv[]) { 
+
+  if (argc < 7)
+  {
+    cout << "usage\n";
+    cout << "  match motif.fa seq.fa outfile nmismatch rc/norc bed/nobed" << endl;
+    cout << "==========\n";
+    cout << "arguments:" <<endl;
+    cout << "==========\n";
+    cout << "  1. motif file, fasta format, could include multiple sequences\n";
+    cout << "  2. sequence file, fasta format, could include multiple sequences\n";
+    cout << "  3. output file name\n";
+    cout << "  4. to search on reverse complement strand or not. use 'rc' or 'norc'\n";
+    cout << "  5. to output BED format or not. use 'bed' only if sequence have id like: hg18_chr12_52642148_52644699_+\n";
+    cout << "==========\n";
+    cout << "output format\n";
+    cout << "==========\n";
+    cout << " tabular:\n";
+    cout << "==========\n";
+    cout << "  hg18_chr10_20048197_20048851_+	21	5ss,T5A:CAGGAAAGT-rc	ACTTTCCTG\n";
+    cout << "  column 1: sequence id\n";
+    cout << "  column 2: start of the match in sequence\n";
+    cout << "  column 3: motif variant id, format: motif_id,mismatch:motif_sequence[-reverse_complement]\n";
+    cout << "  column 4: actual match in target sequence\n";
+    cout << "==========\n";
+    cout << " BED format:\n";
+    cout << "==========\n";
+    cout << "  column 1-3: genomic coordinates of the match, calculated using sequence ID: hg18_chr12_52642148_52644699_+ and match start in the sequence\n";
+    cout << "  column 4: number of mismatches\n";
+    cout << "  column 6: strand of the match sequence\n";
+    cout << "  column 7: strandness of the match relative to the target sequence\n";
+    return 1;
+  }
+  
+  
+  string motiffile = argv[1]; // motif file
+  string seqfile = argv[2]; // sequence file
+  string outfile = argv[3]; // output file
+  int nmismatch = atoi(argv[4]);// # of mismatches allowed
+  string _rc = argv[5]; // rc or norc: to search reverse complement strand or not
+  string _bed = argv[6]; // to make bed output or not. only valid if sequence have id like: hg18_chr12_52642148_52644699_+
+  bool rc = false;
+  if (_rc == "rc") {rc=true;}
+  bool bed = false;
+  if (_bed == "bed") {bed=true;}
+  
+    // alphabet
+  string ACGT_str = "ACGT";
+  set<char> alphabet;
+  for (int i=0;i<4;i++)
+  {
+    alphabet.insert(ACGT_str[i]);
+  }
+
+  int *res  = match(motiffile, seqfile, outfile, nmismatch, rc, alphabet);
+  
+  if (_bed == "bed" && res[0]>0)
+  {
+    cout <<"["<<current_time()<<"] creating BED format output..."<<endl;
+    string bedfile = outfile;
+    bedfile += ".bed";
+    if (tab2bed(outfile,bedfile))
+    {
+      rename(bedfile.c_str(),outfile.c_str());
+    }
+  }
+  cout <<"["<<current_time()<<"] Done"<<endl;
+  return 0;
+}
+