Mercurial > repos > xuebing > sharplab_seq_motif
view match.cpp @ 15:0e221dbd17b2 default tip
Uploaded
author | xuebing |
---|---|
date | Sat, 31 Mar 2012 08:53:06 -0400 |
parents | |
children |
line wrap: on
line source
#include <string> #include <sstream> #include <iostream> #include <fstream> #include <vector> #include <set> #include <map> #include <string.h> #include <stdio.h> #include <stdlib.h> #include <time.h> using namespace std; string current_time() {// get current time, in the format: // Thu Mar 15 21:06:57 2012 time_t rawtime; struct tm * timeinfo; time ( &rawtime ); timeinfo = localtime ( &rawtime ); string str = asctime (timeinfo); return str.substr(0,str.size()-1); } template<class key,class val> void PrintMap(map<key,val> m) {// print out a map for (typename map<key,val>::iterator it = m.begin();it!=m.end(); it++) { cout << (*it).first << "\t=>\t" <<(*it).second<<endl; } } char complement(char ch){ switch (ch) { case 'A':return 'T'; case 'C':return 'G'; case 'G':return 'C'; case 'T':return 'A'; default:return 'N'; } } string reverseComplement(string seq){ int L = seq.length(); string rc (L,'0'); for (int i=0;i<L; i++) { rc[L-i-1] = complement(seq[i]); } return rc; } void ReadOneSeqFromFasta(ifstream& infile, string& name, string& seq) { // read one sequence from fasta file getline(infile,name); // read identifier line name = name.substr(1);// remove leading '>' seq = ""; // initialize sequence string str; while(infile.peek() != '>' && infile.good()) {// before next '>' and before hitting the end of the file getline(infile,str); seq.append(str); } } vector<int> findall(string seq, string motif) { // find all match positions of motif in seq vector<int> allpos; size_t pos = 0; while(pos != string::npos) { pos = seq.find(motif,pos+1); allpos.push_back(pos); } return allpos; } string int2str(int number) { stringstream ss;//create a stringstream ss << number;//add number to the stream return ss.str();//return a string with the contents of the stream } map<string,string> ReadFasta(string filename) {//read all sequences in a fasta file ifstream fin(filename.c_str()); map<string,string> seqs; string name,seq; while(fin.good()) { ReadOneSeqFromFasta(fin,name,seq); seqs[name] = seq; } fin.close(); return seqs; } void mismatches(map<string,string>& mutant,map<string,int>& dist, string motif, int n, set<char> alphabet) { set<char>::iterator it; if (mutant.count(motif) == 0) { mutant[motif] = ""; dist[motif]=n; } if(n==0){return;} for (int i=0;i<motif.length();i++) { string str=motif; set<char> ab = alphabet; ab.erase(str[i]); for (it = ab.begin(); it!=ab.end(); it++) { str[i] = *it; //cout << "mutate "<<motif<<" to "<<str<<endl; if (mutant.count(str) >0) { if(dist[str] >= n) { //cout << mutant[str] <<endl; continue; } } //mutated to a new sequence //cout <<"new mutation"<<endl; mutant[str] = mutant[motif]; mutant[str].push_back(','); mutant[str].push_back(motif[i]); mutant[str].append(int2str(i+1)); mutant[str].push_back(str[i]); dist[str]=n; //cout << "tag="<<mutant[str]<<" dist="<<n<<endl; if (n>1) { //cout << "subproc" <<endl; mismatches(mutant,dist,str,n-1,alphabet); } } } } map<string,string> ExpandMotifs(map<string,string>& motifs, int nmismatch, bool rc, set<char> alphabet) { map<string,string> expandedmotifs; // generate mismatched motifs map<string,string> mutants; map<string,int> dist; map<string,string>::iterator it; // iterator for motifs map<string,int>::iterator it2; // iterator for mutants string name,seq,tmp; //cout<<"input motifs"<<endl; //PrintMap(motifs); for(it=motifs.begin();it!=motifs.end();it++) { name = (*it).first; seq = (*it).second; mismatches(mutants,dist,seq,nmismatch,alphabet); //cout << mutants.size()<<" mutants identified" <<endl; //PrintMap(mutants); // add mutants to motifs for(it2=dist.begin();it2!=dist.end();it2++) { string tmp = name; tmp.append(",").append((*it2).first); tmp.append(mutants[(*it2).first]); expandedmotifs[tmp] = (*it2).first; //cout << name <<","<<tmp<<","<<expandedmotifs[tmp]<<endl; } // clear the mutants list mutants.clear(); dist.clear(); } //PrintMap(expandedmotifs); //cout << expandedmotifs.size() <<" expanded motifs"<<endl; //cout <<"add reverse complement"<<endl; map<string,string> expandedmotifs_rc = expandedmotifs; if (rc) { for(it=expandedmotifs.begin();it!=expandedmotifs.end();it++) { name = (*it).first; expandedmotifs_rc[name.append(",rc")] = reverseComplement((*it).second); } } return expandedmotifs_rc; } int* match(string motiffile, string seqfile, string outfile, int nmismatch, bool rc, set<char> alphabet) { int nsite = 0; // total number of matches to report int nseq = 0; ifstream fmotif, fseq; ofstream fout; // load motifs map<string,string> motifs = ReadFasta(motiffile); cout <<"["<<current_time()<<"] "<<motifs.size()<< " motifs loaded from "<<motiffile<<endl; // expand motifs map<string,string> expandedmotifs = ExpandMotifs(motifs,nmismatch,rc,alphabet); cout <<"["<<current_time()<<"] "<<expandedmotifs.size()<< " motifs after expanding (mismatch/reverse-complement)"<<endl; //PrintMap(expandedmotifs); // searching motifs in each sequence fseq.open(seqfile.c_str()); fout.open(outfile.c_str()); string seqname,seq,motifname,motif; while(fseq.good()) { // read one sequence ReadOneSeqFromFasta(fseq,seqname,seq); nseq = nseq + 1; cout.flush(); // iterate over motifs map<string,string>::iterator it; for(it=expandedmotifs.begin();it!=expandedmotifs.end();it++) { motifname = (*it).first; motif = (*it).second; //cout << "searching for "<<motifname<<":"<< motif <<endl; vector<int> found = findall(seq,motif); for (int i =0;i<found.size()-1;i++) { fout <<seqname<<"\t"<<found[i]<< "\t"<< motifname <<"\t"<<motif<<endl; } nsite = nsite + found.size()-1; } cout <<"\r["<<current_time()<<"] " << nsite << " sites found in "<< nseq << " sequences " ; } cout << endl; fseq.close(); fout.close(); int* res = new int[2]; res[0] = nsite; res[1] = nseq; return res; } vector<string> StringExplode(string str, string separator){ int found; vector<string> results; found = str.find_first_of(separator); while(found != string::npos){ if(found > 0){ results.push_back(str.substr(0,found)); } str = str.substr(found+1); found = str.find_first_of(separator); } if(str.length() > 0){ results.push_back(str); } return results; } int tab2bed(string infile, string outfile) { //hg18_chr6_122208322_122209078_+ 635 5ss,A7C,G8T-rc AAGTACCTG //hg18_chr6_122208322_122209078_+ 553 5ss,C1G,G3A GAAGTAAGT ifstream fin; ofstream fout; fin.open(infile.c_str()); fout.open(outfile.c_str()); string line; vector<string> flds; vector<string> pos; vector<string> nm; while(fin) { getline(fin,line); if (line.length() == 0) continue; flds = StringExplode(line,"\t"); pos = StringExplode(flds[0],"_"); if (pos.size() < 5) { cout << "\n!! incorrect sequence name format!\n make sure sequence name looks like: hg18_chr6_122208322_122209078_+\n\n"; return 0; } string chr = pos[1]; int start = atoi(pos[2].c_str()); int end = atoi(pos[3].c_str()); int match_start = atoi(flds[1].c_str()); int motifLen = flds[3].length(); // check if match on the other strand string strandness = "sense"; if (flds[2].find("rc") !=string::npos) strandness = "antisense"; string strand = pos[4]; if (strand== "+") { start = start + match_start; if (strandness == "antisense") { strand = "-"; } } else//sequence on the - strand of the genome { start = end - match_start - motifLen; if (strandness == "antisense") { strand = "+"; } } end = start + motifLen; // number of mismatches nm = StringExplode(flds[2],","); int score = nm.size()-1; fout << chr <<"\t"<<start<<"\t" <<end<<"\t"<<flds[3]<< "\t"<<score <<"\t"<<strand<< "\t"<<strandness<<"\t"<<flds[2]<<"\t"<<flds[0]<<"\t"<<flds[1]<<endl; } fin.close(); fout.close(); return 1;//return 1 if successfully created bed file } int main(int argc, char* argv[]) { if (argc < 7) { cout << "usage\n"; cout << " match motif.fa seq.fa outfile nmismatch rc/norc bed/nobed" << endl; cout << "==========\n"; cout << "arguments:" <<endl; cout << "==========\n"; cout << " 1. motif file, fasta format, could include multiple sequences\n"; cout << " 2. sequence file, fasta format, could include multiple sequences\n"; cout << " 3. output file name\n"; cout << " 4. to search on reverse complement strand or not. use 'rc' or 'norc'\n"; cout << " 5. to output BED format or not. use 'bed' only if sequence have id like: hg18_chr12_52642148_52644699_+\n"; cout << "==========\n"; cout << "output format\n"; cout << "==========\n"; cout << " tabular:\n"; cout << "==========\n"; cout << " hg18_chr10_20048197_20048851_+ 21 5ss,T5A:CAGGAAAGT-rc ACTTTCCTG\n"; cout << " column 1: sequence id\n"; cout << " column 2: start of the match in sequence\n"; cout << " column 3: motif variant id, format: motif_id,mismatch:motif_sequence[-reverse_complement]\n"; cout << " column 4: actual match in target sequence\n"; cout << "==========\n"; cout << " BED format:\n"; cout << "==========\n"; cout << " column 1-3: genomic coordinates of the match, calculated using sequence ID: hg18_chr12_52642148_52644699_+ and match start in the sequence\n"; cout << " column 4: number of mismatches\n"; cout << " column 6: strand of the match sequence\n"; cout << " column 7: strandness of the match relative to the target sequence\n"; return 1; } string motiffile = argv[1]; // motif file string seqfile = argv[2]; // sequence file string outfile = argv[3]; // output file int nmismatch = atoi(argv[4]);// # of mismatches allowed string _rc = argv[5]; // rc or norc: to search reverse complement strand or not string _bed = argv[6]; // to make bed output or not. only valid if sequence have id like: hg18_chr12_52642148_52644699_+ bool rc = false; if (_rc == "rc") {rc=true;} bool bed = false; if (_bed == "bed") {bed=true;} // alphabet string ACGT_str = "ACGT"; set<char> alphabet; for (int i=0;i<4;i++) { alphabet.insert(ACGT_str[i]); } int *res = match(motiffile, seqfile, outfile, nmismatch, rc, alphabet); if (_bed == "bed" && res[0]>0) { cout <<"["<<current_time()<<"] creating BED format output..."<<endl; string bedfile = outfile; bedfile += ".bed"; if (tab2bed(outfile,bedfile)) { rename(bedfile.c_str(),outfile.c_str()); } } cout <<"["<<current_time()<<"] Done"<<endl; return 0; }