diff mytrimmer/trim.seqs.C.cpp @ 0:68a3648c7d91 draft default tip

Uploaded
author matteoc
date Thu, 22 Dec 2016 04:45:31 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mytrimmer/trim.seqs.C.cpp	Thu Dec 22 04:45:31 2016 -0500
@@ -0,0 +1,208 @@
+#include <iostream>
+#include <fstream>
+#include <algorithm>
+#include <vector>
+#include <map>
+#include <math.h>
+#include <string>
+
+using namespace std;
+int main (int argc, char *argv[]);
+int eval_quality(string & qstring,int lencutoff,int errors);
+
+int main (int argc, char *argv[])
+{
+	if (argc==9)
+	{
+		unsigned long inseq=0;
+		unsigned long outseq=0;
+		unsigned long pfile=0; 
+		ifstream infile;
+		ifstream infileP;
+		string file=argv[1];
+		string filep=argv[2];
+		ofstream outfile;
+		ofstream outfilep;
+		ofstream outfileunm;
+		string outname=(argv[6]);
+		string outnamep=(argv[7]);
+		string outunm=(argv[8]);
+		outfile.open(outname.c_str());
+		outfilep.open(outnamep.c_str());
+		outfileunm.open(outunm.c_str());
+		int cutoff=atoi(argv[3]);
+		int errors=atoi(argv[4]);
+		int discard=atoi(argv[5]);
+		infile.open(file.c_str());
+		infileP.open(filep.c_str());
+		if (!infile)
+		{
+			cerr << "Couldn't open "<< infile << "\n"; 
+			exit(1);
+		}
+		if (!infileP)
+		{
+			cerr << "Couldn't open "<< outfile << "\n"; 
+			exit(1);
+		}
+		map <int,int> Min;
+		map <int,int> Max;
+		if (infile.is_open() && infileP.is_open()){
+			string header;
+			string seq;
+			string seqp;
+			string qscore;
+			string qscorep;
+			while (!infile.eof() && !infileP.eof())
+			{
+				getline(infile,header);
+				if (header!="")
+				{
+				//read headers + sequences
+				getline(infile,seq); //
+				getline(infileP,seqp);
+				getline(infileP,seqp);//
+				//
+				//cout <<"A:" << seq << "\n" << "B:" << seqp << "\n";
+				inseq+=seq.length();
+				inseq+=seqp.length();
+
+
+				//read Qscores
+				getline(infile,qscore);
+				getline(infile,qscore);//
+				getline(infileP,qscorep);
+				getline(infileP,qscorep);//
+				if (discard >0 && discard<=seq.length())
+				{
+					seq=seq.substr(discard-1,seq.length()-discard);
+					seqp=seqp.substr(discard-1,seqp.length()-discard);
+					qscore=qscore.substr(discard-1,qscore.length()-discard);
+					qscorep=qscorep.substr(discard-1,qscorep.length()-discard);
+				}
+				if (qscore.length()!=seq.length())
+				{
+					cerr << "Invalid fastq\n" << seq << "\n" << qscore  << "\n";
+					exit(1);
+				}
+				
+				if (qscorep.length()!=seqp.length())
+                                {
+                                        cerr << "Invalid fastq\n" << seqp << "\n" << qscorep  << "\n";
+					exit(1);
+                                }
+
+	
+				//
+				//cout << qscore << "\n" << qscorep << "\n";	
+
+				//eval Qscores
+				int p=eval_quality(qscore,cutoff,errors);
+				int pp=eval_quality(qscorep,cutoff,errors);
+	
+				string Qheader=header;
+                                if (*(Qheader.end()-2)=='/') // togli gli slash
+                                {
+                                	Qheader.replace(Qheader.end()-2,Qheader.end(),"");
+                                }
+                     		string Oheader=Qheader;
+                     		Oheader[0]='+';                   
+				if (p>0)
+				{
+					seq=seq.substr(0,p);
+					qscore=qscore.substr(0,p);
+					if (pp>0)
+					{
+						seqp=seqp.substr(0,pp);
+						qscorep=qscorep.substr(0,pp);
+						outseq+=seqp.length();
+						outseq+=seq.length();
+						outfile << Qheader <<"/1" << "\n" << seq << "\n" << Oheader <<"/1" << "\n" << qscore << "\n";
+						outfilep << Qheader <<"/2" << "\n" << seqp << "\n" << Oheader<<"/2"  << "\n" << qscorep << "\n";
+					}else{
+						outseq+=seq.length();
+						outfileunm << Qheader <<"/1"  << "\n" << seq << "\n" << Oheader<<"/1"  << "\n" << qscore << "\n";
+					}
+				}else if(p==0 && pp>0){
+					seqp=seqp.substr(0,pp);
+					qscorep=qscorep.substr(0,pp);
+					outseq+=seqp.length();
+					outfileunm << Qheader <<"/2"  << "\n" << seqp << "\n" << Oheader <<"/2"  << "\n" << qscorep << "\n";
+				}	
+				}
+			}
+			
+		}else{
+			cerr << "could not open files\n";
+		}
+		//cerr << "Input "<< inseq << " bases.\nOutput " << outseq << " bases.\n";
+	}else{
+
+		cout << "input: <first_file> <second_file> <len_cutoff> <number of errors> <low qual base> <ofile1 <ofile2> <ofile3>\n";
+	}
+}
+
+int eval_quality(string & qstring,int lencutoff,int errors)
+{
+	int Nminori10=0;
+	int Nminori20=0;
+	int Nmaggiori25=0;
+	int l10=0;
+	int l20=0;
+	int p=0;
+	double total_perr=0;
+	string::iterator pos;
+	for (pos=qstring.begin();pos!=qstring.end();pos++)
+	{
+		int punteggio=static_cast<int> (*pos)-33;
+		if (punteggio>=1 && punteggio <=41)
+		{
+			double exp=(double)punteggio/-10;
+			total_perr+=pow(10,exp);
+			if (p>0)
+                	{
+                		if (punteggio<=10) //count qscores <=10
+                        	{
+                         		l10++;
+                                	Nminori20++;
+                                	Nminori10++;
+                        	}else if (punteggio<=20){ // count Qscores <=20
+                        		l20++;
+                        		Nminori10=0;
+                        		Nminori20++;
+                        	}else if (punteggio>20){
+                        		Nminori20=0;
+                        		Nminori10=0;
+					if (punteggio>=25)
+					{
+						Nmaggiori25++;
+					}
+                        	}
+			}
+			if (Nminori10>=10) // 3 or more consecutives very low quality bases
+                	{
+                		p-=Nminori10;
+                       		break;
+                	}else if (Nminori20>=15){ // 5 or more consecutives low quality bases 
+                		p-=Nminori20;
+                        	break;
+			}
+                	if (total_perr>=(double)errors) // sum of per base error probability when 5e-2 5 wrong base calls in 100 
+                	{
+				//cout << p << " " << total_perr << " " << errors << "\n";
+                		break;
+                	}
+			p++;
+		}else{
+			cerr << "Invalid Qscore" << *pos << "=" << punteggio << "\n";
+			exit(1);
+		}
+	}
+	double prop_gr_25=(double)Nmaggiori25/(double)(p);
+        if (prop_gr_25>=0.35 && p>=lencutoff && l20 <= p*0.2 && l10 <= p*0.1) // if 50% of Qscores are >= 25,size is >= cutoff a 
+	{
+		return p;
+	}else{
+		return 0;
+	}
+}