0
|
1 #include <iostream>
|
|
2 #include <fstream>
|
|
3 #include <algorithm>
|
|
4 #include <vector>
|
|
5 #include <map>
|
|
6 #include <math.h>
|
|
7 #include <string>
|
|
8
|
|
9 using namespace std;
|
|
10 int main (int argc, char *argv[]);
|
|
11 int eval_quality(string & qstring,int lencutoff,int errors);
|
|
12
|
|
13 int main (int argc, char *argv[])
|
|
14 {
|
|
15 if (argc==9)
|
|
16 {
|
|
17 unsigned long inseq=0;
|
|
18 unsigned long outseq=0;
|
|
19 unsigned long pfile=0;
|
|
20 ifstream infile;
|
|
21 ifstream infileP;
|
|
22 string file=argv[1];
|
|
23 string filep=argv[2];
|
|
24 ofstream outfile;
|
|
25 ofstream outfilep;
|
|
26 ofstream outfileunm;
|
|
27 string outname=(argv[6]);
|
|
28 string outnamep=(argv[7]);
|
|
29 string outunm=(argv[8]);
|
|
30 outfile.open(outname.c_str());
|
|
31 outfilep.open(outnamep.c_str());
|
|
32 outfileunm.open(outunm.c_str());
|
|
33 int cutoff=atoi(argv[3]);
|
|
34 int errors=atoi(argv[4]);
|
|
35 int discard=atoi(argv[5]);
|
|
36 infile.open(file.c_str());
|
|
37 infileP.open(filep.c_str());
|
|
38 if (!infile)
|
|
39 {
|
|
40 cerr << "Couldn't open "<< infile << "\n";
|
|
41 exit(1);
|
|
42 }
|
|
43 if (!infileP)
|
|
44 {
|
|
45 cerr << "Couldn't open "<< outfile << "\n";
|
|
46 exit(1);
|
|
47 }
|
|
48 map <int,int> Min;
|
|
49 map <int,int> Max;
|
|
50 if (infile.is_open() && infileP.is_open()){
|
|
51 string header;
|
|
52 string seq;
|
|
53 string seqp;
|
|
54 string qscore;
|
|
55 string qscorep;
|
|
56 while (!infile.eof() && !infileP.eof())
|
|
57 {
|
|
58 getline(infile,header);
|
|
59 if (header!="")
|
|
60 {
|
|
61 //read headers + sequences
|
|
62 getline(infile,seq); //
|
|
63 getline(infileP,seqp);
|
|
64 getline(infileP,seqp);//
|
|
65 //
|
|
66 //cout <<"A:" << seq << "\n" << "B:" << seqp << "\n";
|
|
67 inseq+=seq.length();
|
|
68 inseq+=seqp.length();
|
|
69
|
|
70
|
|
71 //read Qscores
|
|
72 getline(infile,qscore);
|
|
73 getline(infile,qscore);//
|
|
74 getline(infileP,qscorep);
|
|
75 getline(infileP,qscorep);//
|
|
76 if (discard >0 && discard<=seq.length())
|
|
77 {
|
|
78 seq=seq.substr(discard-1,seq.length()-discard);
|
|
79 seqp=seqp.substr(discard-1,seqp.length()-discard);
|
|
80 qscore=qscore.substr(discard-1,qscore.length()-discard);
|
|
81 qscorep=qscorep.substr(discard-1,qscorep.length()-discard);
|
|
82 }
|
|
83 if (qscore.length()!=seq.length())
|
|
84 {
|
|
85 cerr << "Invalid fastq\n" << seq << "\n" << qscore << "\n";
|
|
86 exit(1);
|
|
87 }
|
|
88
|
|
89 if (qscorep.length()!=seqp.length())
|
|
90 {
|
|
91 cerr << "Invalid fastq\n" << seqp << "\n" << qscorep << "\n";
|
|
92 exit(1);
|
|
93 }
|
|
94
|
|
95
|
|
96 //
|
|
97 //cout << qscore << "\n" << qscorep << "\n";
|
|
98
|
|
99 //eval Qscores
|
|
100 int p=eval_quality(qscore,cutoff,errors);
|
|
101 int pp=eval_quality(qscorep,cutoff,errors);
|
|
102
|
|
103 string Qheader=header;
|
|
104 if (*(Qheader.end()-2)=='/') // togli gli slash
|
|
105 {
|
|
106 Qheader.replace(Qheader.end()-2,Qheader.end(),"");
|
|
107 }
|
|
108 string Oheader=Qheader;
|
|
109 Oheader[0]='+';
|
|
110 if (p>0)
|
|
111 {
|
|
112 seq=seq.substr(0,p);
|
|
113 qscore=qscore.substr(0,p);
|
|
114 if (pp>0)
|
|
115 {
|
|
116 seqp=seqp.substr(0,pp);
|
|
117 qscorep=qscorep.substr(0,pp);
|
|
118 outseq+=seqp.length();
|
|
119 outseq+=seq.length();
|
|
120 outfile << Qheader <<"/1" << "\n" << seq << "\n" << Oheader <<"/1" << "\n" << qscore << "\n";
|
|
121 outfilep << Qheader <<"/2" << "\n" << seqp << "\n" << Oheader<<"/2" << "\n" << qscorep << "\n";
|
|
122 }else{
|
|
123 outseq+=seq.length();
|
|
124 outfileunm << Qheader <<"/1" << "\n" << seq << "\n" << Oheader<<"/1" << "\n" << qscore << "\n";
|
|
125 }
|
|
126 }else if(p==0 && pp>0){
|
|
127 seqp=seqp.substr(0,pp);
|
|
128 qscorep=qscorep.substr(0,pp);
|
|
129 outseq+=seqp.length();
|
|
130 outfileunm << Qheader <<"/2" << "\n" << seqp << "\n" << Oheader <<"/2" << "\n" << qscorep << "\n";
|
|
131 }
|
|
132 }
|
|
133 }
|
|
134
|
|
135 }else{
|
|
136 cerr << "could not open files\n";
|
|
137 }
|
|
138 //cerr << "Input "<< inseq << " bases.\nOutput " << outseq << " bases.\n";
|
|
139 }else{
|
|
140
|
|
141 cout << "input: <first_file> <second_file> <len_cutoff> <number of errors> <low qual base> <ofile1 <ofile2> <ofile3>\n";
|
|
142 }
|
|
143 }
|
|
144
|
|
145 int eval_quality(string & qstring,int lencutoff,int errors)
|
|
146 {
|
|
147 int Nminori10=0;
|
|
148 int Nminori20=0;
|
|
149 int Nmaggiori25=0;
|
|
150 int l10=0;
|
|
151 int l20=0;
|
|
152 int p=0;
|
|
153 double total_perr=0;
|
|
154 string::iterator pos;
|
|
155 for (pos=qstring.begin();pos!=qstring.end();pos++)
|
|
156 {
|
|
157 int punteggio=static_cast<int> (*pos)-33;
|
|
158 if (punteggio>=1 && punteggio <=41)
|
|
159 {
|
|
160 double exp=(double)punteggio/-10;
|
|
161 total_perr+=pow(10,exp);
|
|
162 if (p>0)
|
|
163 {
|
|
164 if (punteggio<=10) //count qscores <=10
|
|
165 {
|
|
166 l10++;
|
|
167 Nminori20++;
|
|
168 Nminori10++;
|
|
169 }else if (punteggio<=20){ // count Qscores <=20
|
|
170 l20++;
|
|
171 Nminori10=0;
|
|
172 Nminori20++;
|
|
173 }else if (punteggio>20){
|
|
174 Nminori20=0;
|
|
175 Nminori10=0;
|
|
176 if (punteggio>=25)
|
|
177 {
|
|
178 Nmaggiori25++;
|
|
179 }
|
|
180 }
|
|
181 }
|
|
182 if (Nminori10>=10) // 3 or more consecutives very low quality bases
|
|
183 {
|
|
184 p-=Nminori10;
|
|
185 break;
|
|
186 }else if (Nminori20>=15){ // 5 or more consecutives low quality bases
|
|
187 p-=Nminori20;
|
|
188 break;
|
|
189 }
|
|
190 if (total_perr>=(double)errors) // sum of per base error probability when 5e-2 5 wrong base calls in 100
|
|
191 {
|
|
192 //cout << p << " " << total_perr << " " << errors << "\n";
|
|
193 break;
|
|
194 }
|
|
195 p++;
|
|
196 }else{
|
|
197 cerr << "Invalid Qscore" << *pos << "=" << punteggio << "\n";
|
|
198 exit(1);
|
|
199 }
|
|
200 }
|
|
201 double prop_gr_25=(double)Nmaggiori25/(double)(p);
|
|
202 if (prop_gr_25>=0.35 && p>=lencutoff && l20 <= p*0.2 && l10 <= p*0.1) // if 50% of Qscores are >= 25,size is >= cutoff a
|
|
203 {
|
|
204 return p;
|
|
205 }else{
|
|
206 return 0;
|
|
207 }
|
|
208 }
|