Mercurial > repos > xuebing > sharplab_seq_motif
diff match.cpp @ 15:0e221dbd17b2 default tip
Uploaded
author | xuebing |
---|---|
date | Sat, 31 Mar 2012 08:53:06 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/match.cpp Sat Mar 31 08:53:06 2012 -0400 @@ -0,0 +1,413 @@ +#include <string> +#include <sstream> +#include <iostream> +#include <fstream> +#include <vector> +#include <set> +#include <map> + +#include <string.h> +#include <stdio.h> +#include <stdlib.h> +#include <time.h> + +using namespace std; + +string current_time() +{// get current time, in the format: + // Thu Mar 15 21:06:57 2012 + time_t rawtime; + struct tm * timeinfo; + time ( &rawtime ); + timeinfo = localtime ( &rawtime ); + string str = asctime (timeinfo); + return str.substr(0,str.size()-1); +} + +template<class key,class val> +void PrintMap(map<key,val> m) +{// print out a map + for (typename map<key,val>::iterator it = m.begin();it!=m.end(); it++) + { + cout << (*it).first << "\t=>\t" <<(*it).second<<endl; + } +} + +char complement(char ch){ + switch (ch) + { + case 'A':return 'T'; + case 'C':return 'G'; + case 'G':return 'C'; + case 'T':return 'A'; + default:return 'N'; + } +} + +string reverseComplement(string seq){ + int L = seq.length(); + string rc (L,'0'); + for (int i=0;i<L; i++) + { + rc[L-i-1] = complement(seq[i]); + } + return rc; +} + +void ReadOneSeqFromFasta(ifstream& infile, string& name, string& seq) +{ + // read one sequence from fasta file + getline(infile,name); // read identifier line + name = name.substr(1);// remove leading '>' + seq = ""; // initialize sequence + string str; + while(infile.peek() != '>' && infile.good()) + {// before next '>' and before hitting the end of the file + getline(infile,str); + seq.append(str); + } +} + +vector<int> findall(string seq, string motif) +{ + // find all match positions of motif in seq + vector<int> allpos; + size_t pos = 0; + while(pos != string::npos) + { + pos = seq.find(motif,pos+1); + allpos.push_back(pos); + } + return allpos; +} + +string int2str(int number) +{ + stringstream ss;//create a stringstream + ss << number;//add number to the stream + return ss.str();//return a string with the contents of the stream +} + + + +map<string,string> ReadFasta(string filename) +{//read all sequences in a fasta file + ifstream fin(filename.c_str()); + + map<string,string> seqs; + string name,seq; + while(fin.good()) + { + ReadOneSeqFromFasta(fin,name,seq); + seqs[name] = seq; + } + fin.close(); + return seqs; +} + +void mismatches(map<string,string>& mutant,map<string,int>& dist, string motif, int n, set<char> alphabet) +{ + set<char>::iterator it; + if (mutant.count(motif) == 0) + { + mutant[motif] = ""; + dist[motif]=n; + } + if(n==0){return;} + for (int i=0;i<motif.length();i++) + { + string str=motif; + set<char> ab = alphabet; + ab.erase(str[i]); + for (it = ab.begin(); it!=ab.end(); it++) + { + str[i] = *it; + //cout << "mutate "<<motif<<" to "<<str<<endl; + if (mutant.count(str) >0) + { + if(dist[str] >= n) + { + //cout << mutant[str] <<endl; + continue; + } + } + + //mutated to a new sequence + //cout <<"new mutation"<<endl; + mutant[str] = mutant[motif]; + mutant[str].push_back(','); + mutant[str].push_back(motif[i]); + mutant[str].append(int2str(i+1)); + mutant[str].push_back(str[i]); + dist[str]=n; + //cout << "tag="<<mutant[str]<<" dist="<<n<<endl; + + if (n>1) + { + //cout << "subproc" <<endl; + mismatches(mutant,dist,str,n-1,alphabet); + } + } + + } +} + +map<string,string> ExpandMotifs(map<string,string>& motifs, int nmismatch, bool rc, set<char> alphabet) +{ + map<string,string> expandedmotifs; + // generate mismatched motifs + map<string,string> mutants; + map<string,int> dist; + map<string,string>::iterator it; // iterator for motifs + map<string,int>::iterator it2; // iterator for mutants + string name,seq,tmp; + //cout<<"input motifs"<<endl; + //PrintMap(motifs); + for(it=motifs.begin();it!=motifs.end();it++) + { + name = (*it).first; + seq = (*it).second; + + mismatches(mutants,dist,seq,nmismatch,alphabet); + //cout << mutants.size()<<" mutants identified" <<endl; + //PrintMap(mutants); + // add mutants to motifs + for(it2=dist.begin();it2!=dist.end();it2++) + { + string tmp = name; + tmp.append(",").append((*it2).first); + tmp.append(mutants[(*it2).first]); + expandedmotifs[tmp] = (*it2).first; + //cout << name <<","<<tmp<<","<<expandedmotifs[tmp]<<endl; + } + // clear the mutants list + mutants.clear(); + dist.clear(); + } + //PrintMap(expandedmotifs); + //cout << expandedmotifs.size() <<" expanded motifs"<<endl; + //cout <<"add reverse complement"<<endl; + map<string,string> expandedmotifs_rc = expandedmotifs; + if (rc) + { + for(it=expandedmotifs.begin();it!=expandedmotifs.end();it++) + { + name = (*it).first; + expandedmotifs_rc[name.append(",rc")] = reverseComplement((*it).second); + } + } + + return expandedmotifs_rc; + +} + + + +int* match(string motiffile, string seqfile, string outfile, int nmismatch, bool rc, set<char> alphabet) +{ + int nsite = 0; // total number of matches to report + int nseq = 0; + ifstream fmotif, fseq; + ofstream fout; + + // load motifs + map<string,string> motifs = ReadFasta(motiffile); + cout <<"["<<current_time()<<"] "<<motifs.size()<< " motifs loaded from "<<motiffile<<endl; + + // expand motifs + map<string,string> expandedmotifs = ExpandMotifs(motifs,nmismatch,rc,alphabet); + cout <<"["<<current_time()<<"] "<<expandedmotifs.size()<< " motifs after expanding (mismatch/reverse-complement)"<<endl; + + //PrintMap(expandedmotifs); + + // searching motifs in each sequence + fseq.open(seqfile.c_str()); + fout.open(outfile.c_str()); + + string seqname,seq,motifname,motif; + while(fseq.good()) + { + // read one sequence + ReadOneSeqFromFasta(fseq,seqname,seq); + nseq = nseq + 1; + + cout.flush(); + // iterate over motifs + map<string,string>::iterator it; + for(it=expandedmotifs.begin();it!=expandedmotifs.end();it++) + { + motifname = (*it).first; + motif = (*it).second; + //cout << "searching for "<<motifname<<":"<< motif <<endl; + vector<int> found = findall(seq,motif); + for (int i =0;i<found.size()-1;i++) + { + fout <<seqname<<"\t"<<found[i]<< "\t"<< motifname <<"\t"<<motif<<endl; + } + nsite = nsite + found.size()-1; + } + cout <<"\r["<<current_time()<<"] " << nsite << " sites found in "<< nseq << " sequences " ; + } + + cout << endl; + fseq.close(); + fout.close(); + + int* res = new int[2]; + res[0] = nsite; + res[1] = nseq; + + return res; +} + +vector<string> StringExplode(string str, string separator){ + int found; + vector<string> results; + found = str.find_first_of(separator); + while(found != string::npos){ + if(found > 0){ + results.push_back(str.substr(0,found)); + } + str = str.substr(found+1); + found = str.find_first_of(separator); + } + if(str.length() > 0){ + results.push_back(str); + } + return results; +} + +int tab2bed(string infile, string outfile) +{ + //hg18_chr6_122208322_122209078_+ 635 5ss,A7C,G8T-rc AAGTACCTG + //hg18_chr6_122208322_122209078_+ 553 5ss,C1G,G3A GAAGTAAGT + ifstream fin; + ofstream fout; + fin.open(infile.c_str()); + fout.open(outfile.c_str()); + string line; + vector<string> flds; + vector<string> pos; + vector<string> nm; + while(fin) + { + getline(fin,line); + if (line.length() == 0) + continue; + flds = StringExplode(line,"\t"); + pos = StringExplode(flds[0],"_"); + if (pos.size() < 5) + { + cout << "\n!! incorrect sequence name format!\n make sure sequence name looks like: hg18_chr6_122208322_122209078_+\n\n"; + return 0; + } + string chr = pos[1]; + int start = atoi(pos[2].c_str()); + int end = atoi(pos[3].c_str()); + int match_start = atoi(flds[1].c_str()); + int motifLen = flds[3].length(); + // check if match on the other strand + string strandness = "sense"; + if (flds[2].find("rc") !=string::npos) + strandness = "antisense"; + string strand = pos[4]; + + if (strand== "+") + { + start = start + match_start; + if (strandness == "antisense") + { + strand = "-"; + } + } + else//sequence on the - strand of the genome + { + start = end - match_start - motifLen; + if (strandness == "antisense") + { + strand = "+"; + } + } + end = start + motifLen; + // number of mismatches + nm = StringExplode(flds[2],","); + int score = nm.size()-1; + + fout << chr <<"\t"<<start<<"\t" <<end<<"\t"<<flds[3]<< "\t"<<score <<"\t"<<strand<< "\t"<<strandness<<"\t"<<flds[2]<<"\t"<<flds[0]<<"\t"<<flds[1]<<endl; + + } + fin.close(); + fout.close(); + return 1;//return 1 if successfully created bed file +} + +int main(int argc, char* argv[]) { + + if (argc < 7) + { + cout << "usage\n"; + cout << " match motif.fa seq.fa outfile nmismatch rc/norc bed/nobed" << endl; + cout << "==========\n"; + cout << "arguments:" <<endl; + cout << "==========\n"; + cout << " 1. motif file, fasta format, could include multiple sequences\n"; + cout << " 2. sequence file, fasta format, could include multiple sequences\n"; + cout << " 3. output file name\n"; + cout << " 4. to search on reverse complement strand or not. use 'rc' or 'norc'\n"; + cout << " 5. to output BED format or not. use 'bed' only if sequence have id like: hg18_chr12_52642148_52644699_+\n"; + cout << "==========\n"; + cout << "output format\n"; + cout << "==========\n"; + cout << " tabular:\n"; + cout << "==========\n"; + cout << " hg18_chr10_20048197_20048851_+ 21 5ss,T5A:CAGGAAAGT-rc ACTTTCCTG\n"; + cout << " column 1: sequence id\n"; + cout << " column 2: start of the match in sequence\n"; + cout << " column 3: motif variant id, format: motif_id,mismatch:motif_sequence[-reverse_complement]\n"; + cout << " column 4: actual match in target sequence\n"; + cout << "==========\n"; + cout << " BED format:\n"; + cout << "==========\n"; + cout << " column 1-3: genomic coordinates of the match, calculated using sequence ID: hg18_chr12_52642148_52644699_+ and match start in the sequence\n"; + cout << " column 4: number of mismatches\n"; + cout << " column 6: strand of the match sequence\n"; + cout << " column 7: strandness of the match relative to the target sequence\n"; + return 1; + } + + + string motiffile = argv[1]; // motif file + string seqfile = argv[2]; // sequence file + string outfile = argv[3]; // output file + int nmismatch = atoi(argv[4]);// # of mismatches allowed + string _rc = argv[5]; // rc or norc: to search reverse complement strand or not + string _bed = argv[6]; // to make bed output or not. only valid if sequence have id like: hg18_chr12_52642148_52644699_+ + bool rc = false; + if (_rc == "rc") {rc=true;} + bool bed = false; + if (_bed == "bed") {bed=true;} + + // alphabet + string ACGT_str = "ACGT"; + set<char> alphabet; + for (int i=0;i<4;i++) + { + alphabet.insert(ACGT_str[i]); + } + + int *res = match(motiffile, seqfile, outfile, nmismatch, rc, alphabet); + + if (_bed == "bed" && res[0]>0) + { + cout <<"["<<current_time()<<"] creating BED format output..."<<endl; + string bedfile = outfile; + bedfile += ".bed"; + if (tab2bed(outfile,bedfile)) + { + rename(bedfile.c_str(),outfile.c_str()); + } + } + cout <<"["<<current_time()<<"] Done"<<endl; + return 0; +} +