Mercurial > repos > matteoc > agame_custom_tools
view mytrimmer/trim.seqs.C.cpp @ 0:68a3648c7d91 draft default tip
Uploaded
author | matteoc |
---|---|
date | Thu, 22 Dec 2016 04:45:31 -0500 |
parents | |
children |
line wrap: on
line source
#include <iostream> #include <fstream> #include <algorithm> #include <vector> #include <map> #include <math.h> #include <string> using namespace std; int main (int argc, char *argv[]); int eval_quality(string & qstring,int lencutoff,int errors); int main (int argc, char *argv[]) { if (argc==9) { unsigned long inseq=0; unsigned long outseq=0; unsigned long pfile=0; ifstream infile; ifstream infileP; string file=argv[1]; string filep=argv[2]; ofstream outfile; ofstream outfilep; ofstream outfileunm; string outname=(argv[6]); string outnamep=(argv[7]); string outunm=(argv[8]); outfile.open(outname.c_str()); outfilep.open(outnamep.c_str()); outfileunm.open(outunm.c_str()); int cutoff=atoi(argv[3]); int errors=atoi(argv[4]); int discard=atoi(argv[5]); infile.open(file.c_str()); infileP.open(filep.c_str()); if (!infile) { cerr << "Couldn't open "<< infile << "\n"; exit(1); } if (!infileP) { cerr << "Couldn't open "<< outfile << "\n"; exit(1); } map <int,int> Min; map <int,int> Max; if (infile.is_open() && infileP.is_open()){ string header; string seq; string seqp; string qscore; string qscorep; while (!infile.eof() && !infileP.eof()) { getline(infile,header); if (header!="") { //read headers + sequences getline(infile,seq); // getline(infileP,seqp); getline(infileP,seqp);// // //cout <<"A:" << seq << "\n" << "B:" << seqp << "\n"; inseq+=seq.length(); inseq+=seqp.length(); //read Qscores getline(infile,qscore); getline(infile,qscore);// getline(infileP,qscorep); getline(infileP,qscorep);// if (discard >0 && discard<=seq.length()) { seq=seq.substr(discard-1,seq.length()-discard); seqp=seqp.substr(discard-1,seqp.length()-discard); qscore=qscore.substr(discard-1,qscore.length()-discard); qscorep=qscorep.substr(discard-1,qscorep.length()-discard); } if (qscore.length()!=seq.length()) { cerr << "Invalid fastq\n" << seq << "\n" << qscore << "\n"; exit(1); } if (qscorep.length()!=seqp.length()) { cerr << "Invalid fastq\n" << seqp << "\n" << qscorep << "\n"; exit(1); } // //cout << qscore << "\n" << qscorep << "\n"; //eval Qscores int p=eval_quality(qscore,cutoff,errors); int pp=eval_quality(qscorep,cutoff,errors); string Qheader=header; if (*(Qheader.end()-2)=='/') // togli gli slash { Qheader.replace(Qheader.end()-2,Qheader.end(),""); } string Oheader=Qheader; Oheader[0]='+'; if (p>0) { seq=seq.substr(0,p); qscore=qscore.substr(0,p); if (pp>0) { seqp=seqp.substr(0,pp); qscorep=qscorep.substr(0,pp); outseq+=seqp.length(); outseq+=seq.length(); outfile << Qheader <<"/1" << "\n" << seq << "\n" << Oheader <<"/1" << "\n" << qscore << "\n"; outfilep << Qheader <<"/2" << "\n" << seqp << "\n" << Oheader<<"/2" << "\n" << qscorep << "\n"; }else{ outseq+=seq.length(); outfileunm << Qheader <<"/1" << "\n" << seq << "\n" << Oheader<<"/1" << "\n" << qscore << "\n"; } }else if(p==0 && pp>0){ seqp=seqp.substr(0,pp); qscorep=qscorep.substr(0,pp); outseq+=seqp.length(); outfileunm << Qheader <<"/2" << "\n" << seqp << "\n" << Oheader <<"/2" << "\n" << qscorep << "\n"; } } } }else{ cerr << "could not open files\n"; } //cerr << "Input "<< inseq << " bases.\nOutput " << outseq << " bases.\n"; }else{ cout << "input: <first_file> <second_file> <len_cutoff> <number of errors> <low qual base> <ofile1 <ofile2> <ofile3>\n"; } } int eval_quality(string & qstring,int lencutoff,int errors) { int Nminori10=0; int Nminori20=0; int Nmaggiori25=0; int l10=0; int l20=0; int p=0; double total_perr=0; string::iterator pos; for (pos=qstring.begin();pos!=qstring.end();pos++) { int punteggio=static_cast<int> (*pos)-33; if (punteggio>=1 && punteggio <=41) { double exp=(double)punteggio/-10; total_perr+=pow(10,exp); if (p>0) { if (punteggio<=10) //count qscores <=10 { l10++; Nminori20++; Nminori10++; }else if (punteggio<=20){ // count Qscores <=20 l20++; Nminori10=0; Nminori20++; }else if (punteggio>20){ Nminori20=0; Nminori10=0; if (punteggio>=25) { Nmaggiori25++; } } } if (Nminori10>=10) // 3 or more consecutives very low quality bases { p-=Nminori10; break; }else if (Nminori20>=15){ // 5 or more consecutives low quality bases p-=Nminori20; break; } if (total_perr>=(double)errors) // sum of per base error probability when 5e-2 5 wrong base calls in 100 { //cout << p << " " << total_perr << " " << errors << "\n"; break; } p++; }else{ cerr << "Invalid Qscore" << *pos << "=" << punteggio << "\n"; exit(1); } } double prop_gr_25=(double)Nmaggiori25/(double)(p); if (prop_gr_25>=0.35 && p>=lencutoff && l20 <= p*0.2 && l10 <= p*0.1) // if 50% of Qscores are >= 25,size is >= cutoff a { return p; }else{ return 0; } }