annotate mytrimmer/trim.seqs.C.cpp @ 0:68a3648c7d91 draft default tip

Uploaded
author matteoc
date Thu, 22 Dec 2016 04:45:31 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
#include <math.h>

#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <map>
#include <string>
#include <vector>
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
8
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
9 using namespace std;
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
10 int main (int argc, char *argv[]);
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
11 int eval_quality(string & qstring,int lencutoff,int errors);
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
12
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
// Re-declaration of the quality evaluator that is defined later in this file.
// Repeating a prototype is legal and keeps main() independent of declaration order.
int eval_quality(std::string & qstring, int lencutoff, int errors);

// Reads the sequence line, the separator line and the quality line that follow
// a FASTQ header. Returns false when the stream ends before all three were read.
static bool read_record_body(std::istream & in, std::string & seq, std::string & qual)
{
    std::string separator;
    if (!std::getline(in, seq)) return false;
    if (!std::getline(in, separator)) return false;
    if (!std::getline(in, qual)) return false;
    return true;
}

// Applies the fixed trim selected by the <low qual base> argument: keeps
// length-discard characters starting at 0-based index discard-1.
// A string shorter than `discard` yields an empty string instead of letting
// substr() throw std::out_of_range.
// NOTE(review): this drops the first discard-1 characters AND the last one;
// confirm that losing the final character is intended.
static std::string clip_read(const std::string & text, std::string::size_type discard)
{
    if (text.length() < discard)
    {
        return std::string();
    }
    return text.substr(discard - 1, text.length() - discard);
}

// Writes one four-line FASTQ record; `mate` is the "/1" or "/2" suffix that is
// appended to both the header line and the '+' line.
static void write_record(std::ostream & out, const std::string & name,
                         const std::string & plus_line, const char * mate,
                         const std::string & seq, const std::string & qual)
{
    out << name << mate << "\n" << seq << "\n" << plus_line << mate << "\n" << qual << "\n";
}

// Quality-trims two FASTQ files that are read record by record in parallel.
//
// Arguments (argc must be 9):
//   argv[1], argv[2]  first and second input file
//   argv[3]           length cutoff handed to eval_quality()
//   argv[4]           error budget handed to eval_quality()
//   argv[5]           fixed trim position (0 disables it), see clip_read()
//   argv[6], argv[7]  outputs for pairs in which both reads passed
//   argv[8]           output for reads whose mate did not pass
//
// Returns 0; exits with status 1 when a file cannot be opened or written or
// when a record is malformed. With a wrong argument count only the usage line
// is printed.
int main (int argc, char *argv[])
{
    if (argc != 9)
    {
        std::cout << "input: <first_file> <second_file> <len_cutoff> <number of errors> <low qual base> <ofile1 <ofile2> <ofile3>\n";
        return 0;
    }

    const std::string file = argv[1];
    const std::string filep = argv[2];
    const int cutoff = std::atoi(argv[3]);
    const int errors = std::atoi(argv[4]);
    const int discard = std::atoi(argv[5]);
    const std::string outname = argv[6];
    const std::string outnamep = argv[7];
    const std::string outunm = argv[8];

    std::ofstream outfile(outname.c_str());
    std::ofstream outfilep(outnamep.c_str());
    std::ofstream outfileunm(outunm.c_str());
    if (!outfile)
    {
        std::cerr << "Couldn't open " << outname << "\n";
        std::exit(1);
    }
    if (!outfilep)
    {
        std::cerr << "Couldn't open " << outnamep << "\n";
        std::exit(1);
    }
    if (!outfileunm)
    {
        std::cerr << "Couldn't open " << outunm << "\n";
        std::exit(1);
    }

    std::ifstream infile(file.c_str());
    std::ifstream infileP(filep.c_str());
    if (!infile)
    {
        std::cerr << "Couldn't open " << file << "\n";
        std::exit(1);
    }
    if (!infileP)
    {
        std::cerr << "Couldn't open " << filep << "\n";
        std::exit(1);
    }

    std::string header;
    std::string headerp;
    std::string seq;
    std::string seqp;
    std::string qscore;
    std::string qscorep;

    // Using getline() as the loop condition ends the loop on any read failure,
    // not only on end-of-file.
    while (std::getline(infile, header))
    {
        if (header.empty())
        {
            continue; // blank lines in the first file are skipped
        }
        if (!std::getline(infileP, headerp))
        {
            break; // the second file has no further record: stop processing
        }
        // The second file's header is read only to advance the stream; output
        // names are derived from the first file's header.
        if (!read_record_body(infile, seq, qscore) || !read_record_body(infileP, seqp, qscorep))
        {
            std::cerr << "Invalid fastq\n" << header << "\n";
            std::exit(1);
        }

        // Validate before trimming so that a length mismatch cannot be hidden
        // by the trim.
        if (qscore.length() != seq.length())
        {
            std::cerr << "Invalid fastq\n" << seq << "\n" << qscore << "\n";
            std::exit(1);
        }
        if (qscorep.length() != seqp.length())
        {
            std::cerr << "Invalid fastq\n" << seqp << "\n" << qscorep << "\n";
            std::exit(1);
        }

        // The fixed trim is applied to both mates whenever the FIRST read is
        // long enough.
        if (discard > 0 && static_cast<std::string::size_type>(discard) <= seq.length())
        {
            const std::string::size_type fixed = static_cast<std::string::size_type>(discard);
            seq = clip_read(seq, fixed);
            seqp = clip_read(seqp, fixed);
            qscore = clip_read(qscore, fixed);
            qscorep = clip_read(qscorep, fixed);
        }

        // Number of leading bases to keep for each mate; 0 means the read failed.
        const int p = eval_quality(qscore, cutoff, errors);
        const int pp = eval_quality(qscorep, cutoff, errors);

        // Strip a trailing "/x" mate suffix from the name.
        std::string Qheader = header;
        if (Qheader.size() >= 2 && Qheader[Qheader.size() - 2] == '/')
        {
            Qheader.erase(Qheader.size() - 2);
        }
        // The '+' line repeats the name with its first character replaced by '+'.
        std::string Oheader = Qheader;
        if (!Oheader.empty())
        {
            Oheader[0] = '+';
        }

        if (p > 0)
        {
            seq = seq.substr(0, p);
            qscore = qscore.substr(0, p);
            if (pp > 0)
            {
                seqp = seqp.substr(0, pp);
                qscorep = qscorep.substr(0, pp);
                write_record(outfile, Qheader, Oheader, "/1", seq, qscore);
                write_record(outfilep, Qheader, Oheader, "/2", seqp, qscorep);
            }
            else
            {
                write_record(outfileunm, Qheader, Oheader, "/1", seq, qscore);
            }
        }
        else if (pp > 0)
        {
            seqp = seqp.substr(0, pp);
            qscorep = qscorep.substr(0, pp);
            write_record(outfileunm, Qheader, Oheader, "/2", seqp, qscorep);
        }
    }

    outfile.flush();
    outfilep.flush();
    outfileunm.flush();
    if (!outfile || !outfilep || !outfileunm)
    {
        std::cerr << "Error writing output files\n";
        std::exit(1);
    }
    return 0;
}
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
144
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
// Decides how many leading bases of a read to keep, based on its quality string.
//
// Each character is turned into a score by subtracting 33 and must fall in
// 1..41, otherwise the program prints a message and exits with status 1. The
// scan walks the string from the start and stops at the first of:
//   * 10 consecutive scores <= 10  (the run length is subtracted from the count)
//   * 15 consecutive scores <= 20  (the run length is subtracted from the count)
//   * the running sum of 10^(-score/10) reaching `errors`
//
// The count of kept bases is returned only if all of these hold, else 0:
//   * at least 35% of the kept count are scores >= 25
//   * the kept count is >= lencutoff
//   * scores in 11..20 number at most 20% of the kept count
//   * scores <= 10 number at most 10% of the kept count
//
// NOTE(review): the first base is never classified (the counters are only
// updated once one base has been accepted) - confirm this is intended.
// NOTE(review): when a low-quality run ends the scan, the counters still
// include the bases of that run although the kept count excludes them, and the
// subtraction removes one base more than the run itself - confirm before
// changing, since it affects which reads pass.
//
// qstring   quality string; it is only read
// lencutoff minimum number of kept bases for the read to pass
// errors    budget for the summed per-base error probability
int eval_quality(std::string & qstring, int lencutoff, int errors)
{
    const int kScoreOffset = 33;
    const int kMinScore = 1;
    const int kMaxScore = 41;
    const int kVeryLowScore = 10;
    const int kLowScore = 20;
    const int kHighScore = 25;
    const int kMaxVeryLowRun = 10;
    const int kMaxLowRun = 15;
    const double kMinHighFraction = 0.35;
    const double kMaxLowFraction = 0.2;
    const double kMaxVeryLowFraction = 0.1;

    int very_low_run = 0;    // current run of consecutive scores <= 10
    int low_run = 0;         // current run of consecutive scores <= 20
    int high_count = 0;      // scores >= 25 seen so far
    int very_low_count = 0;  // scores <= 10 seen so far
    int low_count = 0;       // scores in 11..20 seen so far
    int kept = 0;            // bases accepted so far
    double expected_errors = 0.0;

    const std::string & scores = qstring;
    for (std::string::const_iterator pos = scores.begin(); pos != scores.end(); ++pos)
    {
        const int score = static_cast<int>(*pos) - kScoreOffset;
        if (score < kMinScore || score > kMaxScore)
        {
            std::cerr << "Invalid Qscore" << *pos << "=" << score << "\n";
            std::exit(1);
        }

        expected_errors += std::pow(10.0, static_cast<double>(score) / -10);

        if (kept > 0)
        {
            if (score <= kVeryLowScore)
            {
                ++very_low_count;
                ++low_run;
                ++very_low_run;
            }
            else if (score <= kLowScore)
            {
                ++low_count;
                very_low_run = 0;
                ++low_run;
            }
            else
            {
                low_run = 0;
                very_low_run = 0;
                if (score >= kHighScore)
                {
                    ++high_count;
                }
            }
        }

        if (very_low_run >= kMaxVeryLowRun) // 10 consecutive scores <= 10
        {
            kept -= very_low_run;
            break;
        }
        if (low_run >= kMaxLowRun) // 15 consecutive scores <= 20
        {
            kept -= low_run;
            break;
        }
        if (expected_errors >= static_cast<double>(errors)) // error budget used up
        {
            break;
        }
        ++kept;
    }

    // Nothing kept: fail the read explicitly instead of evaluating 0/0 below.
    if (kept <= 0)
    {
        return 0;
    }

    const double high_fraction = static_cast<double>(high_count) / static_cast<double>(kept);
    const bool passes = high_fraction >= kMinHighFraction
                     && kept >= lencutoff
                     && low_count <= kept * kMaxLowFraction
                     && very_low_count <= kept * kMaxVeryLowFraction;
    return passes ? kept : 0;
}