view mytrimmer/trim.seqs.C.cpp @ 0:68a3648c7d91 draft default tip

Uploaded
author matteoc
date Thu, 22 Dec 2016 04:45:31 -0500
parents
children
line wrap: on
line source

#include <iostream>
#include <fstream>
#include <algorithm>
#include <vector>
#include <map>
#include <math.h>
#include <string>

using namespace std;
int main (int argc, char *argv[]);
int eval_quality(string & qstring,int lencutoff,int errors);

int main (int argc, char *argv[])
{
	if (argc==9)
	{
		unsigned long inseq=0;
		unsigned long outseq=0;
		unsigned long pfile=0; 
		ifstream infile;
		ifstream infileP;
		string file=argv[1];
		string filep=argv[2];
		ofstream outfile;
		ofstream outfilep;
		ofstream outfileunm;
		string outname=(argv[6]);
		string outnamep=(argv[7]);
		string outunm=(argv[8]);
		outfile.open(outname.c_str());
		outfilep.open(outnamep.c_str());
		outfileunm.open(outunm.c_str());
		int cutoff=atoi(argv[3]);
		int errors=atoi(argv[4]);
		int discard=atoi(argv[5]);
		infile.open(file.c_str());
		infileP.open(filep.c_str());
		if (!infile)
		{
			cerr << "Couldn't open "<< infile << "\n"; 
			exit(1);
		}
		if (!infileP)
		{
			cerr << "Couldn't open "<< outfile << "\n"; 
			exit(1);
		}
		map <int,int> Min;
		map <int,int> Max;
		if (infile.is_open() && infileP.is_open()){
			string header;
			string seq;
			string seqp;
			string qscore;
			string qscorep;
			while (!infile.eof() && !infileP.eof())
			{
				getline(infile,header);
				if (header!="")
				{
				//read headers + sequences
				getline(infile,seq); //
				getline(infileP,seqp);
				getline(infileP,seqp);//
				//
				//cout <<"A:" << seq << "\n" << "B:" << seqp << "\n";
				inseq+=seq.length();
				inseq+=seqp.length();


				//read Qscores
				getline(infile,qscore);
				getline(infile,qscore);//
				getline(infileP,qscorep);
				getline(infileP,qscorep);//
				if (discard >0 && discard<=seq.length())
				{
					seq=seq.substr(discard-1,seq.length()-discard);
					seqp=seqp.substr(discard-1,seqp.length()-discard);
					qscore=qscore.substr(discard-1,qscore.length()-discard);
					qscorep=qscorep.substr(discard-1,qscorep.length()-discard);
				}
				if (qscore.length()!=seq.length())
				{
					cerr << "Invalid fastq\n" << seq << "\n" << qscore  << "\n";
					exit(1);
				}
				
				if (qscorep.length()!=seqp.length())
                                {
                                        cerr << "Invalid fastq\n" << seqp << "\n" << qscorep  << "\n";
					exit(1);
                                }

	
				//
				//cout << qscore << "\n" << qscorep << "\n";	

				//eval Qscores
				int p=eval_quality(qscore,cutoff,errors);
				int pp=eval_quality(qscorep,cutoff,errors);
	
				string Qheader=header;
                                if (*(Qheader.end()-2)=='/') // togli gli slash
                                {
                                	Qheader.replace(Qheader.end()-2,Qheader.end(),"");
                                }
                     		string Oheader=Qheader;
                     		Oheader[0]='+';                   
				if (p>0)
				{
					seq=seq.substr(0,p);
					qscore=qscore.substr(0,p);
					if (pp>0)
					{
						seqp=seqp.substr(0,pp);
						qscorep=qscorep.substr(0,pp);
						outseq+=seqp.length();
						outseq+=seq.length();
						outfile << Qheader <<"/1" << "\n" << seq << "\n" << Oheader <<"/1" << "\n" << qscore << "\n";
						outfilep << Qheader <<"/2" << "\n" << seqp << "\n" << Oheader<<"/2"  << "\n" << qscorep << "\n";
					}else{
						outseq+=seq.length();
						outfileunm << Qheader <<"/1"  << "\n" << seq << "\n" << Oheader<<"/1"  << "\n" << qscore << "\n";
					}
				}else if(p==0 && pp>0){
					seqp=seqp.substr(0,pp);
					qscorep=qscorep.substr(0,pp);
					outseq+=seqp.length();
					outfileunm << Qheader <<"/2"  << "\n" << seqp << "\n" << Oheader <<"/2"  << "\n" << qscorep << "\n";
				}	
				}
			}
			
		}else{
			cerr << "could not open files\n";
		}
		//cerr << "Input "<< inseq << " bases.\nOutput " << outseq << " bases.\n";
	}else{

		cout << "input: <first_file> <second_file> <len_cutoff> <number of errors> <low qual base> <ofile1 <ofile2> <ofile3>\n";
	}
}

int eval_quality(string & qstring,int lencutoff,int errors)
{
	int Nminori10=0;
	int Nminori20=0;
	int Nmaggiori25=0;
	int l10=0;
	int l20=0;
	int p=0;
	double total_perr=0;
	string::iterator pos;
	for (pos=qstring.begin();pos!=qstring.end();pos++)
	{
		int punteggio=static_cast<int> (*pos)-33;
		if (punteggio>=1 && punteggio <=41)
		{
			double exp=(double)punteggio/-10;
			total_perr+=pow(10,exp);
			if (p>0)
                	{
                		if (punteggio<=10) //count qscores <=10
                        	{
                         		l10++;
                                	Nminori20++;
                                	Nminori10++;
                        	}else if (punteggio<=20){ // count Qscores <=20
                        		l20++;
                        		Nminori10=0;
                        		Nminori20++;
                        	}else if (punteggio>20){
                        		Nminori20=0;
                        		Nminori10=0;
					if (punteggio>=25)
					{
						Nmaggiori25++;
					}
                        	}
			}
			if (Nminori10>=10) // 3 or more consecutives very low quality bases
                	{
                		p-=Nminori10;
                       		break;
                	}else if (Nminori20>=15){ // 5 or more consecutives low quality bases 
                		p-=Nminori20;
                        	break;
			}
                	if (total_perr>=(double)errors) // sum of per base error probability when 5e-2 5 wrong base calls in 100 
                	{
				//cout << p << " " << total_perr << " " << errors << "\n";
                		break;
                	}
			p++;
		}else{
			cerr << "Invalid Qscore" << *pos << "=" << punteggio << "\n";
			exit(1);
		}
	}
	double prop_gr_25=(double)Nmaggiori25/(double)(p);
        if (prop_gr_25>=0.35 && p>=lencutoff && l20 <= p*0.2 && l10 <= p*0.1) // if 50% of Qscores are >= 25,size is >= cutoff a 
	{
		return p;
	}else{
		return 0;
	}
}