| 
0
 | 
     1 #include <iostream>
 | 
| 
 | 
     2 #include <fstream>
 | 
| 
 | 
     3 #include <algorithm>
 | 
| 
 | 
     4 #include <vector>
 | 
| 
 | 
     5 #include <map>
 | 
| 
 | 
     6 #include <math.h>
 | 
| 
 | 
     7 #include <string>
 | 
| 
 | 
     8 
 | 
| 
 | 
     9 using namespace std;
 | 
| 
 | 
    10 int main (int argc, char *argv[]);
 | 
| 
 | 
    11 int eval_quality(string & qstring,int lencutoff,int errors);
 | 
| 
 | 
    12 
 | 
| 
 | 
    13 int main (int argc, char *argv[])
 | 
| 
 | 
    14 {
 | 
| 
 | 
    15 	if (argc==9)
 | 
| 
 | 
    16 	{
 | 
| 
 | 
    17 		unsigned long inseq=0;
 | 
| 
 | 
    18 		unsigned long outseq=0;
 | 
| 
 | 
    19 		unsigned long pfile=0; 
 | 
| 
 | 
    20 		ifstream infile;
 | 
| 
 | 
    21 		ifstream infileP;
 | 
| 
 | 
    22 		string file=argv[1];
 | 
| 
 | 
    23 		string filep=argv[2];
 | 
| 
 | 
    24 		ofstream outfile;
 | 
| 
 | 
    25 		ofstream outfilep;
 | 
| 
 | 
    26 		ofstream outfileunm;
 | 
| 
 | 
    27 		string outname=(argv[6]);
 | 
| 
 | 
    28 		string outnamep=(argv[7]);
 | 
| 
 | 
    29 		string outunm=(argv[8]);
 | 
| 
 | 
    30 		outfile.open(outname.c_str());
 | 
| 
 | 
    31 		outfilep.open(outnamep.c_str());
 | 
| 
 | 
    32 		outfileunm.open(outunm.c_str());
 | 
| 
 | 
    33 		int cutoff=atoi(argv[3]);
 | 
| 
 | 
    34 		int errors=atoi(argv[4]);
 | 
| 
 | 
    35 		int discard=atoi(argv[5]);
 | 
| 
 | 
    36 		infile.open(file.c_str());
 | 
| 
 | 
    37 		infileP.open(filep.c_str());
 | 
| 
 | 
    38 		if (!infile)
 | 
| 
 | 
    39 		{
 | 
| 
 | 
    40 			cerr << "Couldn't open "<< infile << "\n"; 
 | 
| 
 | 
    41 			exit(1);
 | 
| 
 | 
    42 		}
 | 
| 
 | 
    43 		if (!infileP)
 | 
| 
 | 
    44 		{
 | 
| 
 | 
    45 			cerr << "Couldn't open "<< outfile << "\n"; 
 | 
| 
 | 
    46 			exit(1);
 | 
| 
 | 
    47 		}
 | 
| 
 | 
    48 		map <int,int> Min;
 | 
| 
 | 
    49 		map <int,int> Max;
 | 
| 
 | 
    50 		if (infile.is_open() && infileP.is_open()){
 | 
| 
 | 
    51 			string header;
 | 
| 
 | 
    52 			string seq;
 | 
| 
 | 
    53 			string seqp;
 | 
| 
 | 
    54 			string qscore;
 | 
| 
 | 
    55 			string qscorep;
 | 
| 
 | 
    56 			while (!infile.eof() && !infileP.eof())
 | 
| 
 | 
    57 			{
 | 
| 
 | 
    58 				getline(infile,header);
 | 
| 
 | 
    59 				if (header!="")
 | 
| 
 | 
    60 				{
 | 
| 
 | 
    61 				//read headers + sequences
 | 
| 
 | 
    62 				getline(infile,seq); //
 | 
| 
 | 
    63 				getline(infileP,seqp);
 | 
| 
 | 
    64 				getline(infileP,seqp);//
 | 
| 
 | 
    65 				//
 | 
| 
 | 
    66 				//cout <<"A:" << seq << "\n" << "B:" << seqp << "\n";
 | 
| 
 | 
    67 				inseq+=seq.length();
 | 
| 
 | 
    68 				inseq+=seqp.length();
 | 
| 
 | 
    69 
 | 
| 
 | 
    70 
 | 
| 
 | 
    71 				//read Qscores
 | 
| 
 | 
    72 				getline(infile,qscore);
 | 
| 
 | 
    73 				getline(infile,qscore);//
 | 
| 
 | 
    74 				getline(infileP,qscorep);
 | 
| 
 | 
    75 				getline(infileP,qscorep);//
 | 
| 
 | 
    76 				if (discard >0 && discard<=seq.length())
 | 
| 
 | 
    77 				{
 | 
| 
 | 
    78 					seq=seq.substr(discard-1,seq.length()-discard);
 | 
| 
 | 
    79 					seqp=seqp.substr(discard-1,seqp.length()-discard);
 | 
| 
 | 
    80 					qscore=qscore.substr(discard-1,qscore.length()-discard);
 | 
| 
 | 
    81 					qscorep=qscorep.substr(discard-1,qscorep.length()-discard);
 | 
| 
 | 
    82 				}
 | 
| 
 | 
    83 				if (qscore.length()!=seq.length())
 | 
| 
 | 
    84 				{
 | 
| 
 | 
    85 					cerr << "Invalid fastq\n" << seq << "\n" << qscore  << "\n";
 | 
| 
 | 
    86 					exit(1);
 | 
| 
 | 
    87 				}
 | 
| 
 | 
    88 				
 | 
| 
 | 
    89 				if (qscorep.length()!=seqp.length())
 | 
| 
 | 
    90                                 {
 | 
| 
 | 
    91                                         cerr << "Invalid fastq\n" << seqp << "\n" << qscorep  << "\n";
 | 
| 
 | 
    92 					exit(1);
 | 
| 
 | 
    93                                 }
 | 
| 
 | 
    94 
 | 
| 
 | 
    95 	
 | 
| 
 | 
    96 				//
 | 
| 
 | 
    97 				//cout << qscore << "\n" << qscorep << "\n";	
 | 
| 
 | 
    98 
 | 
| 
 | 
    99 				//eval Qscores
 | 
| 
 | 
   100 				int p=eval_quality(qscore,cutoff,errors);
 | 
| 
 | 
   101 				int pp=eval_quality(qscorep,cutoff,errors);
 | 
| 
 | 
   102 	
 | 
| 
 | 
   103 				string Qheader=header;
 | 
| 
 | 
   104                                 if (*(Qheader.end()-2)=='/') // togli gli slash
 | 
| 
 | 
   105                                 {
 | 
| 
 | 
   106                                 	Qheader.replace(Qheader.end()-2,Qheader.end(),"");
 | 
| 
 | 
   107                                 }
 | 
| 
 | 
   108                      		string Oheader=Qheader;
 | 
| 
 | 
   109                      		Oheader[0]='+';                   
 | 
| 
 | 
   110 				if (p>0)
 | 
| 
 | 
   111 				{
 | 
| 
 | 
   112 					seq=seq.substr(0,p);
 | 
| 
 | 
   113 					qscore=qscore.substr(0,p);
 | 
| 
 | 
   114 					if (pp>0)
 | 
| 
 | 
   115 					{
 | 
| 
 | 
   116 						seqp=seqp.substr(0,pp);
 | 
| 
 | 
   117 						qscorep=qscorep.substr(0,pp);
 | 
| 
 | 
   118 						outseq+=seqp.length();
 | 
| 
 | 
   119 						outseq+=seq.length();
 | 
| 
 | 
   120 						outfile << Qheader <<"/1" << "\n" << seq << "\n" << Oheader <<"/1" << "\n" << qscore << "\n";
 | 
| 
 | 
   121 						outfilep << Qheader <<"/2" << "\n" << seqp << "\n" << Oheader<<"/2"  << "\n" << qscorep << "\n";
 | 
| 
 | 
   122 					}else{
 | 
| 
 | 
   123 						outseq+=seq.length();
 | 
| 
 | 
   124 						outfileunm << Qheader <<"/1"  << "\n" << seq << "\n" << Oheader<<"/1"  << "\n" << qscore << "\n";
 | 
| 
 | 
   125 					}
 | 
| 
 | 
   126 				}else if(p==0 && pp>0){
 | 
| 
 | 
   127 					seqp=seqp.substr(0,pp);
 | 
| 
 | 
   128 					qscorep=qscorep.substr(0,pp);
 | 
| 
 | 
   129 					outseq+=seqp.length();
 | 
| 
 | 
   130 					outfileunm << Qheader <<"/2"  << "\n" << seqp << "\n" << Oheader <<"/2"  << "\n" << qscorep << "\n";
 | 
| 
 | 
   131 				}	
 | 
| 
 | 
   132 				}
 | 
| 
 | 
   133 			}
 | 
| 
 | 
   134 			
 | 
| 
 | 
   135 		}else{
 | 
| 
 | 
   136 			cerr << "could not open files\n";
 | 
| 
 | 
   137 		}
 | 
| 
 | 
   138 		//cerr << "Input "<< inseq << " bases.\nOutput " << outseq << " bases.\n";
 | 
| 
 | 
   139 	}else{
 | 
| 
 | 
   140 
 | 
| 
 | 
   141 		cout << "input: <first_file> <second_file> <len_cutoff> <number of errors> <low qual base> <ofile1 <ofile2> <ofile3>\n";
 | 
| 
 | 
   142 	}
 | 
| 
 | 
   143 }
 | 
| 
 | 
   144 
 | 
| 
 | 
   145 int eval_quality(string & qstring,int lencutoff,int errors)
 | 
| 
 | 
   146 {
 | 
| 
 | 
   147 	int Nminori10=0;
 | 
| 
 | 
   148 	int Nminori20=0;
 | 
| 
 | 
   149 	int Nmaggiori25=0;
 | 
| 
 | 
   150 	int l10=0;
 | 
| 
 | 
   151 	int l20=0;
 | 
| 
 | 
   152 	int p=0;
 | 
| 
 | 
   153 	double total_perr=0;
 | 
| 
 | 
   154 	string::iterator pos;
 | 
| 
 | 
   155 	for (pos=qstring.begin();pos!=qstring.end();pos++)
 | 
| 
 | 
   156 	{
 | 
| 
 | 
   157 		int punteggio=static_cast<int> (*pos)-33;
 | 
| 
 | 
   158 		if (punteggio>=1 && punteggio <=41)
 | 
| 
 | 
   159 		{
 | 
| 
 | 
   160 			double exp=(double)punteggio/-10;
 | 
| 
 | 
   161 			total_perr+=pow(10,exp);
 | 
| 
 | 
   162 			if (p>0)
 | 
| 
 | 
   163                 	{
 | 
| 
 | 
   164                 		if (punteggio<=10) //count qscores <=10
 | 
| 
 | 
   165                         	{
 | 
| 
 | 
   166                          		l10++;
 | 
| 
 | 
   167                                 	Nminori20++;
 | 
| 
 | 
   168                                 	Nminori10++;
 | 
| 
 | 
   169                         	}else if (punteggio<=20){ // count Qscores <=20
 | 
| 
 | 
   170                         		l20++;
 | 
| 
 | 
   171                         		Nminori10=0;
 | 
| 
 | 
   172                         		Nminori20++;
 | 
| 
 | 
   173                         	}else if (punteggio>20){
 | 
| 
 | 
   174                         		Nminori20=0;
 | 
| 
 | 
   175                         		Nminori10=0;
 | 
| 
 | 
   176 					if (punteggio>=25)
 | 
| 
 | 
   177 					{
 | 
| 
 | 
   178 						Nmaggiori25++;
 | 
| 
 | 
   179 					}
 | 
| 
 | 
   180                         	}
 | 
| 
 | 
   181 			}
 | 
| 
 | 
   182 			if (Nminori10>=10) // 3 or more consecutives very low quality bases
 | 
| 
 | 
   183                 	{
 | 
| 
 | 
   184                 		p-=Nminori10;
 | 
| 
 | 
   185                        		break;
 | 
| 
 | 
   186                 	}else if (Nminori20>=15){ // 5 or more consecutives low quality bases 
 | 
| 
 | 
   187                 		p-=Nminori20;
 | 
| 
 | 
   188                         	break;
 | 
| 
 | 
   189 			}
 | 
| 
 | 
   190                 	if (total_perr>=(double)errors) // sum of per base error probability when 5e-2 5 wrong base calls in 100 
 | 
| 
 | 
   191                 	{
 | 
| 
 | 
   192 				//cout << p << " " << total_perr << " " << errors << "\n";
 | 
| 
 | 
   193                 		break;
 | 
| 
 | 
   194                 	}
 | 
| 
 | 
   195 			p++;
 | 
| 
 | 
   196 		}else{
 | 
| 
 | 
   197 			cerr << "Invalid Qscore" << *pos << "=" << punteggio << "\n";
 | 
| 
 | 
   198 			exit(1);
 | 
| 
 | 
   199 		}
 | 
| 
 | 
   200 	}
 | 
| 
 | 
   201 	double prop_gr_25=(double)Nmaggiori25/(double)(p);
 | 
| 
 | 
   202         if (prop_gr_25>=0.35 && p>=lencutoff && l20 <= p*0.2 && l10 <= p*0.1) // if 50% of Qscores are >= 25,size is >= cutoff a 
 | 
| 
 | 
   203 	{
 | 
| 
 | 
   204 		return p;
 | 
| 
 | 
   205 	}else{
 | 
| 
 | 
   206 		return 0;
 | 
| 
 | 
   207 	}
 | 
| 
 | 
   208 }
 |