Mercurial > repos > matteoc > agame_custom_tools
diff mytrimmer/trim.seqs.C.cpp @ 0:68a3648c7d91 draft default tip
Uploaded
author | matteoc |
---|---|
date | Thu, 22 Dec 2016 04:45:31 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mytrimmer/trim.seqs.C.cpp Thu Dec 22 04:45:31 2016 -0500 @@ -0,0 +1,208 @@ +#include <iostream> +#include <fstream> +#include <algorithm> +#include <vector> +#include <map> +#include <math.h> +#include <string> + +using namespace std; +int main (int argc, char *argv[]); +int eval_quality(string & qstring,int lencutoff,int errors); + +int main (int argc, char *argv[]) +{ + if (argc==9) + { + unsigned long inseq=0; + unsigned long outseq=0; + unsigned long pfile=0; + ifstream infile; + ifstream infileP; + string file=argv[1]; + string filep=argv[2]; + ofstream outfile; + ofstream outfilep; + ofstream outfileunm; + string outname=(argv[6]); + string outnamep=(argv[7]); + string outunm=(argv[8]); + outfile.open(outname.c_str()); + outfilep.open(outnamep.c_str()); + outfileunm.open(outunm.c_str()); + int cutoff=atoi(argv[3]); + int errors=atoi(argv[4]); + int discard=atoi(argv[5]); + infile.open(file.c_str()); + infileP.open(filep.c_str()); + if (!infile) + { + cerr << "Couldn't open "<< infile << "\n"; + exit(1); + } + if (!infileP) + { + cerr << "Couldn't open "<< outfile << "\n"; + exit(1); + } + map <int,int> Min; + map <int,int> Max; + if (infile.is_open() && infileP.is_open()){ + string header; + string seq; + string seqp; + string qscore; + string qscorep; + while (!infile.eof() && !infileP.eof()) + { + getline(infile,header); + if (header!="") + { + //read headers + sequences + getline(infile,seq); // + getline(infileP,seqp); + getline(infileP,seqp);// + // + //cout <<"A:" << seq << "\n" << "B:" << seqp << "\n"; + inseq+=seq.length(); + inseq+=seqp.length(); + + + //read Qscores + getline(infile,qscore); + getline(infile,qscore);// + getline(infileP,qscorep); + getline(infileP,qscorep);// + if (discard >0 && discard<=seq.length()) + { + seq=seq.substr(discard-1,seq.length()-discard); + seqp=seqp.substr(discard-1,seqp.length()-discard); + qscore=qscore.substr(discard-1,qscore.length()-discard); + qscorep=qscorep.substr(discard-1,qscorep.length()-discard); + } + if (qscore.length()!=seq.length()) + { + cerr << "Invalid fastq\n" << seq << "\n" << qscore << "\n"; + exit(1); + } + + if (qscorep.length()!=seqp.length()) + { + cerr << "Invalid fastq\n" << seqp << "\n" << qscorep << "\n"; + exit(1); + } + + + // + //cout << qscore << "\n" << qscorep << "\n"; + + //eval Qscores + int p=eval_quality(qscore,cutoff,errors); + int pp=eval_quality(qscorep,cutoff,errors); + + string Qheader=header; + if (*(Qheader.end()-2)=='/') // togli gli slash + { + Qheader.replace(Qheader.end()-2,Qheader.end(),""); + } + string Oheader=Qheader; + Oheader[0]='+'; + if (p>0) + { + seq=seq.substr(0,p); + qscore=qscore.substr(0,p); + if (pp>0) + { + seqp=seqp.substr(0,pp); + qscorep=qscorep.substr(0,pp); + outseq+=seqp.length(); + outseq+=seq.length(); + outfile << Qheader <<"/1" << "\n" << seq << "\n" << Oheader <<"/1" << "\n" << qscore << "\n"; + outfilep << Qheader <<"/2" << "\n" << seqp << "\n" << Oheader<<"/2" << "\n" << qscorep << "\n"; + }else{ + outseq+=seq.length(); + outfileunm << Qheader <<"/1" << "\n" << seq << "\n" << Oheader<<"/1" << "\n" << qscore << "\n"; + } + }else if(p==0 && pp>0){ + seqp=seqp.substr(0,pp); + qscorep=qscorep.substr(0,pp); + outseq+=seqp.length(); + outfileunm << Qheader <<"/2" << "\n" << seqp << "\n" << Oheader <<"/2" << "\n" << qscorep << "\n"; + } + } + } + + }else{ + cerr << "could not open files\n"; + } + //cerr << "Input "<< inseq << " bases.\nOutput " << outseq << " bases.\n"; + }else{ + + cout << "input: <first_file> <second_file> <len_cutoff> <number of errors> <low qual base> <ofile1 <ofile2> <ofile3>\n"; + } +} + +int eval_quality(string & qstring,int lencutoff,int errors) +{ + int Nminori10=0; + int Nminori20=0; + int Nmaggiori25=0; + int l10=0; + int l20=0; + int p=0; + double total_perr=0; + string::iterator pos; + for (pos=qstring.begin();pos!=qstring.end();pos++) + { + int punteggio=static_cast<int> (*pos)-33; + if (punteggio>=1 && punteggio <=41) + { + double exp=(double)punteggio/-10; + total_perr+=pow(10,exp); + if (p>0) + { + if (punteggio<=10) //count qscores <=10 + { + l10++; + Nminori20++; + Nminori10++; + }else if (punteggio<=20){ // count Qscores <=20 + l20++; + Nminori10=0; + Nminori20++; + }else if (punteggio>20){ + Nminori20=0; + Nminori10=0; + if (punteggio>=25) + { + Nmaggiori25++; + } + } + } + if (Nminori10>=10) // 3 or more consecutives very low quality bases + { + p-=Nminori10; + break; + }else if (Nminori20>=15){ // 5 or more consecutives low quality bases + p-=Nminori20; + break; + } + if (total_perr>=(double)errors) // sum of per base error probability when 5e-2 5 wrong base calls in 100 + { + //cout << p << " " << total_perr << " " << errors << "\n"; + break; + } + p++; + }else{ + cerr << "Invalid Qscore" << *pos << "=" << punteggio << "\n"; + exit(1); + } + } + double prop_gr_25=(double)Nmaggiori25/(double)(p); + if (prop_gr_25>=0.35 && p>=lencutoff && l20 <= p*0.2 && l10 <= p*0.1) // if 50% of Qscores are >= 25,size is >= cutoff a + { + return p; + }else{ + return 0; + } +}