Mercurial > repos > cstrittmatter > mitokmer
view kmer_read_m3.cpp @ 0:1472b4f4fbfe draft
planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
author | cstrittmatter |
---|---|
date | Fri, 12 Apr 2019 14:58:18 -0400 |
parents | |
children |
line wrap: on
line source
// newkmer_5.cpp load k-mers for all species // test reads by SmithWaterman // MKM 1 APR 2016 //-std=c++0x -D__NO_INLINE__ ... at end: -lz //#include "stdafx.h" #include <fstream> #include <stdlib.h> #include <stdio.h> #include <math.h> #include <iostream> #include <sstream> #include <iomanip> #include <ostream> #include <string> #include <vector> #include <unordered_set> #include <time.h> //#include <sys/time.h> #include <cstddef> #include <cstring> #include <algorithm> #include <set> #include <zlib.h> #include <dirent.h> using namespace std; const int minalign = 0; //120; unordered_set<int> myset = { 0}; typedef int ptype; typedef unsigned long long ui64; typedef unsigned long long ktype; typedef unsigned int vtype; typedef unsigned short int otype; typedef unsigned long long itype; const int KSIZE = 30; //kmer size const int MAXREP = 2048; const int REPSHIFT = 11; //(log of maxrep) const int SAVENUM = 12; //save this many reads const itype MAXHASH = (1 << 30); //pow2 size of hashtable const int MAXREPROBE = 16; const int GAPO = 11; const int GAPX = 1; const int MATCH = 5; const int MISMATCH = -4; const bool INIGAPPEN = false; const int MAXALIGN = 1024; const int beam = 8; vector<string> accession; vector<int> targno; int mcount, savepos, tct, num_orgs=0, save_target=0; vector<int> gcount, ucount; ofstream outread; set<ktype> kmer_seen; string fdir = ""; //directory for fasta.gz files const char bases[4] = { 'A', 'C', 'G', 'T' }; const ui64 one = 1; const ui64 two = 2; const ui64 three = 3; const ui64 mask = (one << (KSIZE * 2)) - 1; const ui64 hiC = one << (KSIZE - 1) * 2; const ui64 hiG = two << (KSIZE - 1) * 2; const ui64 hiT = three << (KSIZE - 1) * 2; const ui64 hi1 = one << 62; const ui64 hi2 = two << 62; const ui64 hi3 = three << 62; const ui64 himask = hi1 - one; const unsigned BUFLEN = 0x4000; void error(const char* const msg) { cerr << msg << "\n"; exit(255); } string int2str(ktype key) { string sequence = ""; for (int i = 0; i < KSIZE; ++i) { sequence = bases[key & 3] + sequence; key >>= 2; } return sequence; } class Tree1 { public: vector<vector<int>> children; vector<int> parent; int root; Tree1(int nt) { vector<int> blank; root = 1; for (int i = 0; i < nt; i++) { parent.push_back(root); children.push_back(blank); } } ~Tree1() { } void add_edge(int x, int y) { parent[y] = x; children[x].push_back(y); } int msca(int x, int y) { //return most specific common ancestor of nodes x and y set<int> ancestors; int z; ancestors.insert(root); z = x; //cout << x << " " << y << " " << z << endl; while (z != root) { ancestors.insert(z); z = get_parent(z); //cout << x << " " << y << " " << z << endl; } z = y; if (ancestors.find(y) != ancestors.end()) return x; //x is descendent of y while (ancestors.find(z) == ancestors.end()) { z = get_parent(z); if (z==x) //y is descendent of x return y; //cout << x << " " << y << " " << z << endl; } return z; //common ancesctor } int get_parent(int x) { if (x != root && x > 0) return parent[x]; else return root; } }; Tree1 *taxonomy; class Hashtable { public: itype size; struct Cell { ktype key; vtype value; int org; ptype position; bool fstrand; } *pHash; Hashtable() { size = 0; pHash = new Cell[MAXHASH]; HashClear(); //for (int i = 1; i<MAXREPROBE; i++) // reprobeVal[i] = reprobeVal[i - 1] + i; //i*i/2+i/2 } ~Hashtable() { delete[] pHash; size = 0; } // from code.google.com/p/smhasher/wiki/MurmurHash3 itype integerHash(ui64 k) { k ^= k >> 33; k *= 0xff51afd7ed558ccd; k ^= k >> 33; k *= 0xc4ceb9fe1a85ec53; k ^= k >> 33; return (itype) k; } void HashClear() { memset(pHash, 0, MAXHASH * sizeof(Cell)); } int getHash(ktype key, otype &org, ptype &position, bool &fstrand) { //0 target=bad //returns target if probe present in table //sets position and fstrand itype index, reprobe, i, hash; position = 0; hash = integerHash(key); reprobe = 0; i = 0; do { index = ((hash + reprobe) & (MAXHASH-1)); //for power of 2 size reprobe += ++i; if (pHash[index].value == 0) { //empty cell return 0; } else if (pHash[index].key == key) { position = pHash[index].position; fstrand = pHash[index].fstrand; org = pHash[index].org; return (pHash[index].value); } } while (reprobe < MAXHASH && i < MAXREPROBE); return 0; } void add_kmer(ktype key, vtype target, otype org, ptype position, bool fstrand) { //add kmer to table itype index, reprobe, i, hash; bool notdone=true; hash = integerHash(key); reprobe = 0; i = 0; do { index = ((hash + reprobe) & (MAXHASH-1)); //for power of 2 size reprobe += ++i; if (pHash[index].value == 0) { //empty cell notdone = false; pHash[index].key = key; pHash[index].value = target; pHash[index].org = org; pHash[index].position = position; pHash[index].fstrand = fstrand; if (++size > MAXHASH - 32) { cout << "out of memory in table " << endl; exit(1); } } } while (notdone); } }; //class Hashtable Hashtable *ht; string process_seq(string seq) { //format nucleotide sequence to all caps ACTG, all else is N string seq2; string::iterator it1; char base; seq2 = ""; for (it1 = seq.begin(); it1 != seq.end(); ++it1) { base = *it1; switch (base) { case 'a': seq2 += "A"; break; case 'c': seq2 += "C"; break; case 'g': seq2 += "G"; break; case 't': seq2 += "T"; break; case 'A': seq2 += "A"; break; case 'C': seq2 += "C"; break; case 'G': seq2 += "G"; break; case 'T': seq2 += "T"; break; default: seq2 += "N"; break; } } return seq2; } string load_data3(gzFile in) { //read gzip fasta file and extract nt sequence char buf[BUFLEN]; char* offset = buf; string line; string sequence = ""; while (true) { int err, len = sizeof(buf)-(offset-buf); if (len == 0) error("Buffer to small for input line lengths"); len = gzread(in, offset, len); if (len == 0) break; if (len < 0) error(gzerror(in, &err)); char* cur = buf; char* end = offset+len; for (char* eol; (cur<end) && (eol = find(cur, end, '\n')) < end; cur = eol + 1) { line = string(cur, eol); if (line.back() == '\r') line.pop_back(); if (line.length() > 0) { if (line.at(0) == '>') { sequence += "N"; //contig separator } else { sequence += process_seq(line); } } //cout << line << endl; } // any trailing data in [eol, end) now is a partial line offset = copy(cur, end, buf); } // trailing data without eol line = string(buf, offset); if (gzclose(in) != Z_OK) error("failed gzclose"); return sequence; } int *tableM, *tableI, *tableD; void tableinit() { int i,j; const int NINF = -2000000000; tableM[0] = 0; tableI[0] = NINF; tableD[0] = NINF; for (i = 1; i<MAXALIGN; i++) //initialize upper row { j = (0 > i - beam - 1) ? 0 : i - beam - 1; tableI[j*MAXALIGN + i] = NINF; if (INIGAPPEN) { tableD[j*MAXALIGN + i] = -GAPO - (i - 1)*GAPX; tableM[j*MAXALIGN + i] = -GAPO - (i - 1)*GAPX; } else { tableD[0 + i] = 0; tableM[0 + i] = 0; } } for (j = 1; j<MAXALIGN; j++) //init left column { i = (0 > j - beam - 1) ? 0 : j - beam - 1; tableD[j*MAXALIGN + i] = NINF; if (INIGAPPEN) { tableI[j*MAXALIGN + i] = -GAPO - (j - 1)*GAPX; tableM[j*MAXALIGN + i] = -GAPO - (j - 1)*GAPX; } else { tableI[j*MAXALIGN + i] = 0; tableM[j*MAXALIGN + i] = 0; } } } int align(string dna1, string dna2) { //smith waterman alignment with beam //returns alignment score int b1, b2, len1, len2, tindex; int b,bb,c,cc; int i, j, maxval, startx, starty; int i1, i2; len1 = dna1.length() + 1; len2 = dna2.length() + 1; j=1; do { i = (j <= beam) ? j : j - beam; i2 = (j + beam > len1) ? len1 : j + beam; do { maxval = max(max(tableM[(j-1)*MAXALIGN + i-1], tableI[(j-1)*MAXALIGN + i-1]), tableD[(j-1)*MAXALIGN + i-1]); //copy/replace if (dna1[i-1] != dna2[j-1]) maxval += MISMATCH; else maxval += MATCH; tableM[j*MAXALIGN + i] = maxval; b=tableM[(j-1)*MAXALIGN + i] - GAPO; //new gap bb = tableI[(j-1)*MAXALIGN + i] -GAPX; //extend tableI[j*MAXALIGN + i] = (bb<b) ? b : bb; c=tableM[j*MAXALIGN + i-1] - GAPO; //new gap cc = tableD[j*MAXALIGN + i-1] -GAPX; //extend tableD[j*MAXALIGN + i] = (cc<c) ? c : cc; } while(++i < i2); } while(++j < len2); i = len1 - 1; j = len2 - 1; //process last entry maxval = max(max(tableM[j*MAXALIGN + i], tableI[j*MAXALIGN + i]), tableD[j*MAXALIGN + i]); return maxval; } int process_read(string &sequence, string acc, int start, int stop) { //returns id of read, 0 = no match, updates gcount, ucount //will do align for first minalign of each target //will save first SAVENUM reads of each target char base; int it1, it2; string kmer, sequence2, dna1, dna1r, dna2, fname, finalkeystr; ktype keyF, keyR, key; int i, cpos, minscr, gpos; bool fstrand1, fstrand2; int readlength = stop - start + 1, readlen2; int st2, scr, stlen2; ktype target, final_targ=0; ptype position; otype org; int maxct, totct, rtarg=0; bool reject = false; cpos = 0; keyF = 0; keyR = 0; gpos = 0; minscr = 5 * sequence.length() / 2; for (it1 = start; it1 <= stop; ++it1) { base = sequence.at(it1); switch (base) { case 'A': keyF = (keyF << 2) & mask; keyR = (keyR >> 2) | hiT; cpos++; break; case 'C': keyF = ((keyF << 2) & mask) | 1; keyR = (keyR >> 2) | hiG; cpos++; break; case 'G': keyF = ((keyF << 2) & mask) | 2; keyR = (keyR >> 2) | hiC; cpos++; break; case 'T': keyF = ((keyF << 2) & mask) | 3; keyR = keyR >> 2; cpos++; break; case 'a': keyF = (keyF << 2) & mask; keyR = (keyR >> 2) | hiT; cpos++; break; case 'c': keyF = ((keyF << 2) & mask) | 1; keyR = (keyR >> 2) | hiG; cpos++; break; case 'g': keyF = ((keyF << 2) & mask) | 2; keyR = (keyR >> 2) | hiC; cpos++; break; case 't': keyF = ((keyF << 2) & mask) | 3; keyR = keyR >> 2; cpos++; break; default: cpos = 0; //ambiguous character keyF = 0; keyR = 0; break; } gpos++; if (cpos == KSIZE) { key = keyF < keyR ? keyF : keyR; target = ht->getHash(key, org, position, fstrand2); //position is ending base, 1-based if (target > 0 && gcount[target] < minalign && target != final_targ) { //do alignment on initial reads of a species fname = fdir + accession[org] + ".fasta.gz"; sequence2 = load_data3(gzopen(fname.c_str(), "rb")); stlen2 = sequence2.length(); fstrand1 = (keyF < keyR); readlen2 = readlength; if (fstrand1 == fstrand2) { st2 = position - it1; if (st2 < 0) st2 = 0; if (st2 + readlen2 > stlen2) readlen2 = stlen2 - st2; dna1 = sequence.substr(start, readlength); dna2 = sequence2.substr(st2, readlen2); } else { //reverse complement dna1 st2 = position - KSIZE + 2 + it1 - readlength; if (st2 < 0) st2 = 0; if (st2 + readlen2 > stlen2) readlen2 = stlen2 - st2; //dna1 = sequence.substr(start, readlength); dna1 = ""; for (it2 = stop; it2 >= start; --it2) { base = sequence.at(it2); switch (base) { case 'A': dna1.push_back('T'); break; case 'C': dna1.push_back('G'); break; case 'G': dna1.push_back('C'); break; case 'T': dna1.push_back('A'); break; default: dna1.push_back('N'); break; } } dna2 = sequence2.substr(st2, readlen2); } scr = align(dna1, dna2); if (scr < minscr) { rtarg = target; target = 0; //fail reject = true; //cout << rtarg << ": failed " << scr << endl; } } if (final_targ > 0 && target > 0) { final_targ = taxonomy->msca(target, final_targ); finalkeystr = int2str(key); } else if (target > 0) { final_targ = target; finalkeystr = int2str(key); } if (target > 1) { if (kmer_seen.find(key) == kmer_seen.end()) { ucount[target]++; kmer_seen.insert(key); } } cpos--; } } //it /* if (final_targ > 1 && gcount[final_targ] < SAVENUM && save_target == 0) { outread << ">" << final_targ << ":" << acc << endl << sequence.substr(start, stop-start+1) << endl; outread << "[" << finalkeystr << "]" << endl; } if (final_targ > 1 && final_targ == save_target) { outread << ">" << final_targ << ":" << acc << endl << sequence.substr(start, stop-start+1) << endl; } */ gcount[final_targ]++; tct++; return final_targ; } void process_kmer(string sequence, vtype target, otype org, ptype position, bool fstrand) { //adds kmer from datafile into hashtable char base; string::iterator it1; int cpos, gpos; ui64 keyF; cpos = 0; gpos = 0; keyF = 0; for (it1 = sequence.begin(); it1 != sequence.end(); ++it1) { base = *it1; switch (base) { case 'A': keyF = (keyF << 2) & mask; cpos++; break; case 'C': keyF = ((keyF << 2) & mask) | 1; cpos++; break; case 'G': keyF = ((keyF << 2) & mask) | 2; cpos++; break; case 'T': keyF = ((keyF << 2) & mask) | 3; cpos++; break; default: cpos = 0; //ambiguous character keyF = 0; break; } if (cpos == KSIZE) { ht->add_kmer(keyF, target, org, position, fstrand); cpos--; } gpos++; } //it } void process_qual(string acc, string &seq, string &qual) { //uses PHRED 33 scores //to trim read and pass it to process_read const int cutoff_qual = 17; const char cutoff_char = 32 + cutoff_qual; int start, stop, target, i; int window_size = 4; int window_val, window_cut = cutoff_qual * window_size; string trim_seq; stop = seq.length()-1; start = 0; //trim all low qual bases from end while (qual.at(start) < cutoff_char && start < stop) start++; while (qual.at(stop) < cutoff_char && stop > start) stop--; //trim by window if (start < stop - window_size) { window_val = 0; for (i=0; i<window_size; i++) window_val += qual.at(start+i) - 32; while (window_val < window_cut && start < stop - window_size) { window_val += qual.at(start+window_size) - qual.at(start); start++; } } if (start < stop - window_size) { window_val = 0; for (i=0; i<window_size; i++) window_val += qual.at(stop-i) - 32; while (window_val < window_cut && start < stop - window_size) { window_val += qual.at(stop-window_size) - qual.at(stop); stop--; } } //trim_seq = seq.substr(start,stop-start+1); if (stop - start >= KSIZE) { target = process_read(seq,acc,start,stop); } } void process_fqgz(gzFile in) { //read gzip fastq file and extract sequence and quality lines //passes them to process_qual char buf[BUFLEN]; char* offset = buf; string line, seq, acc; int mod4=0; while (true) { int err, len = sizeof(buf)-(offset-buf); if (len == 0) error("Buffer to small for input line lengths"); len = gzread(in, offset, len); if (len == 0) break; if (len < 0) error(gzerror(in, &err)); char* cur = buf; char* end = offset+len; for (char* eol; (cur<end) && (eol = find(cur, end, '\n')) < end; cur = eol + 1) { line = string(cur, eol); if (line.back() == '\r') //windows line.pop_back(); if (line.length() > 0) { if (mod4 == 1) { seq = line; } else if (mod4 == 0) { acc = line; } else if (mod4 == 3) { process_qual(acc, seq, line); } mod4 = (mod4 + 1) % 4; } //cout << line << endl; } // any trailing data in [eol, end) now is a partial line offset = copy(cur, end, buf); } // trailing data without eol line = string(buf, offset); if (gzclose(in) != Z_OK) error("failed gzclose"); } void process_fagz(gzFile in) { //read gzip fasta file and extract sequence //passes read to process_read char buf[BUFLEN]; char* offset = buf; string line; string sequence = "", acc = ""; int c1, c2, target; cout << "true" << endl; while (true) { int err, len = sizeof(buf)-(offset-buf); if (len == 0) error("Buffer to small for input line lengths"); len = gzread(in, offset, len); if (len == 0) break; if (len < 0) error(gzerror(in, &err)); char* cur = buf; char* end = offset+len; for (char* eol; (cur<end) && (eol = find(cur, end, '\n')) < end; cur = eol + 1) { line = string(cur, eol); if (line.back() == '\r') line.pop_back(); if (line.length() > 0) { if (line.at(0) == '>') { if (sequence.length() > KSIZE) { // process previous sequence target = process_read(sequence, acc, 0, sequence.length()-1); } sequence = ""; acc = line.substr(1,line.length()-1); } else { sequence += line; } } //cout << line << endl; } // any trailing data in [eol, end) now is a partial line offset = copy(cur, end, buf); } if (sequence.length() > KSIZE) { //last one target = process_read(sequence, acc, 0, sequence.length()-1); } // trailing data without eol line = string(buf, offset); if (gzclose(in) != Z_OK) error("failed gzclose"); } int process_gzkmers(gzFile in) { char buf[BUFLEN]; char* offset = buf; string line; int target, count, org, position, tct=0; string sequence; char strand; bool fstrand; while (true) { int err, len = sizeof(buf)-(offset-buf); if (len == 0) error("Buffer to small for input line lengths"); len = gzread(in, offset, len); if (len == 0) break; if (len < 0) error(gzerror(in, &err)); char* cur = buf; char* end = offset+len; for (char* eol; (cur<end) && (eol = find(cur, end, '\n')) < end; cur = eol + 1) { line = string(cur, eol); if (line.back() == '\r') line.pop_back(); if (line.length() > 0) { replace(line.begin(), line.end(), ',', ' '); istringstream ss(line); if (ss >> sequence >> target >> org >> position >> strand >> count) { fstrand = (strand == 'F'); process_kmer(sequence, target, org, position, fstrand); tct++; } } //cout << line << endl; } // any trailing data in [eol, end) now is a partial line offset = copy(cur, end, buf); } // trailing data without eol line = string(buf, offset); if (gzclose(in) != Z_OK) error("failed gzclose"); return tct; } void process_fq(string rname) { //read fastq file and extract sequence and quality lines //passes them to process_qual string line, seq, acc= "", lseq; ifstream fin; int target; int mod4=0; fin.open(rname); if (fin) { while (getline(fin, line)) { if (line.back() == '\r') line.pop_back(); stringstream linestream(line); linestream >> lseq; if (lseq.length() > 0) { if (mod4 == 1) { seq = lseq; } else if (mod4 == 0) { acc = lseq; } else if (mod4 == 3) { process_qual(acc, seq, lseq); } mod4 = (mod4 + 1) % 4; } } fin.close(); } } void process_fa(string rname) { //read unzipped fasta file and extract sequence //passes read to process_read string sequence = "", line, lseq, acc= ""; ifstream fin; int target; fin.open(rname); if (fin) { while (getline(fin, line)) { if (line.back() == '\r') line.pop_back(); stringstream linestream(line); linestream >> lseq; if (lseq[0] == '>') { if (sequence.length() > KSIZE) { target = process_read(sequence, acc, 0, sequence.length()-1); } sequence = ""; acc = lseq.substr(1,lseq.length()-1); } else sequence += lseq; } fin.close(); if (sequence.length() > KSIZE) { //last one target = process_read(sequence, acc, 0, sequence.length()-1); } } else { cout << "nark " << rname << endl; } } int main(int argc, char* argv[]) { int i,j,num_jobs; vtype target; ptype position; char strand; bool fstrand; string header, jstr, fname, line, r1name, r2name = "", oname1, oname2; int refi, targi, strni, count, org, num_targ; ifstream fin; string genus, species, acc, sequence, lseq, remainder; vector<vector<string>> fnames; vector<string> jnames; vector<int> jcounts; vector<string>::iterator it2; string trname, s1, s2, s3,s4; string argst, wdir = "", dname = "", jname = "", jfile = "", jdir = ""; string iname = "mitochondria_data.txt"; //text file of groups for probe design string tname = "mitochondria_tree.txt"; //edges in taxonomy tree string pname = "mitochondria_probes.txt.gz"; if (argc > 1) { for (i = 1; i < argc; i++) { argst = argv[i]; if (argst == "-wdir") { wdir = argv[i+1]; } if (argst == "-f1") { r1name =argv[i+1]; } if (argst == "-f2") { r2name =argv[i+1]; } } } iname = wdir + iname; tname = wdir + tname; pname = wdir + pname; ht = new Hashtable(); //load strain list cout << "r1 " << r1name << endl; cout << "r2 " << r2name << endl; cout << "wd " << wdir << endl; fin.open(iname); if (fin) { while (getline(fin, line)) { if (line.back() == '\r') line.pop_back(); if (line.length() > 1) { stringstream linestream(line); linestream >> targi >> acc; accession.push_back(acc); targno.push_back(targi); if (targi > num_targ) num_targ = targi; num_orgs++; } } fin.close(); cout << num_orgs << " strains" << endl; num_targ++; } taxonomy = new Tree1(num_targ); //load taxonomy tree fin.open(tname); if (fin) { while (getline(fin, line)) { if (line.back() == '\r') line.pop_back(); stringstream linestream(line); linestream >> i >> j; taxonomy->add_edge(i,j); } } else exit(1); fin.close(); cout << "tree loaded" << endl; //load kmer database tct = process_gzkmers(gzopen(pname.c_str(), "rb")); cout << tct << " kmers loaded" << endl; if (tct < 2) exit(1); //process reads gcount.resize(num_targ,0); ucount.resize(num_targ,0); fill(gcount.begin(), gcount.end(), 0); fill(ucount.begin(), ucount.end(), 0); kmer_seen.clear(); tct = 0; s1 = ".fastq.gz"; s2 = ".fasta"; s3 = ".fastq"; s4 = ".fasta.gz"; int slen = r1name.length(); char sch = r1name.at(slen-1); cout << slen << " : " << sch << endl; if (equal(s1.rbegin(), s1.rend(), r1name.rbegin())) { process_fqgz(gzopen(r1name.c_str(), "rb")); } else if (equal(s2.rbegin(), s2.rend(), r1name.rbegin())) { process_fa(r1name.c_str()); } else if (equal(s3.rbegin(), s3.rend(), r1name.rbegin())) { process_fq(r1name.c_str()); } else if (equal(s4.rbegin(), s4.rend(), r1name.rbegin())) { process_fagz(gzopen(r1name.c_str(), "rb")); } cout << tct << " reads loaded" << endl; if (r2name.length() > 1 && r2name != "none") { //second file? if (equal(s1.rbegin(), s1.rend(), r2name.rbegin())) { process_fqgz(gzopen(r2name.c_str(), "rb")); cout << tct << " reads loaded" << endl; } else if (equal(s2.rbegin(), s2.rend(), r2name.rbegin())) { process_fa(r2name.c_str()); } else if (equal(s3.rbegin(), s3.rend(), r2name.rbegin())) { process_fq(r2name.c_str()); } else if (equal(s4.rbegin(), s4.rend(), r2name.rbegin())) { process_fagz(gzopen(r2name.c_str(), "rb")); } cout << tct << " reads loaded" << endl; } oname2 = wdir + "result.txt"; ofstream out2(oname2); for (i=0; i<num_targ; i++) out2 << i << "," << gcount[i] << "," << ucount[i] << endl; out2.close(); delete taxonomy; delete ht; return 0; }