Mercurial > repos > matteoc > agame_custom_tools
comparison mytrimmer/trim.seqs.C.cpp @ 0:68a3648c7d91 draft default tip
Uploaded
author | matteoc |
---|---|
date | Thu, 22 Dec 2016 04:45:31 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:68a3648c7d91 |
---|---|
1 #include <iostream> | |
2 #include <fstream> | |
3 #include <algorithm> | |
4 #include <vector> | |
5 #include <map> | |
6 #include <math.h> | |
7 #include <string> | |
8 | |
9 using namespace std; | |
10 int main (int argc, char *argv[]); | |
11 int eval_quality(string & qstring,int lencutoff,int errors); | |
12 | |
13 int main (int argc, char *argv[]) | |
14 { | |
15 if (argc==9) | |
16 { | |
17 unsigned long inseq=0; | |
18 unsigned long outseq=0; | |
19 unsigned long pfile=0; | |
20 ifstream infile; | |
21 ifstream infileP; | |
22 string file=argv[1]; | |
23 string filep=argv[2]; | |
24 ofstream outfile; | |
25 ofstream outfilep; | |
26 ofstream outfileunm; | |
27 string outname=(argv[6]); | |
28 string outnamep=(argv[7]); | |
29 string outunm=(argv[8]); | |
30 outfile.open(outname.c_str()); | |
31 outfilep.open(outnamep.c_str()); | |
32 outfileunm.open(outunm.c_str()); | |
33 int cutoff=atoi(argv[3]); | |
34 int errors=atoi(argv[4]); | |
35 int discard=atoi(argv[5]); | |
36 infile.open(file.c_str()); | |
37 infileP.open(filep.c_str()); | |
38 if (!infile) | |
39 { | |
40 cerr << "Couldn't open "<< infile << "\n"; | |
41 exit(1); | |
42 } | |
43 if (!infileP) | |
44 { | |
45 cerr << "Couldn't open "<< outfile << "\n"; | |
46 exit(1); | |
47 } | |
48 map <int,int> Min; | |
49 map <int,int> Max; | |
50 if (infile.is_open() && infileP.is_open()){ | |
51 string header; | |
52 string seq; | |
53 string seqp; | |
54 string qscore; | |
55 string qscorep; | |
56 while (!infile.eof() && !infileP.eof()) | |
57 { | |
58 getline(infile,header); | |
59 if (header!="") | |
60 { | |
61 //read headers + sequences | |
62 getline(infile,seq); // | |
63 getline(infileP,seqp); | |
64 getline(infileP,seqp);// | |
65 // | |
66 //cout <<"A:" << seq << "\n" << "B:" << seqp << "\n"; | |
67 inseq+=seq.length(); | |
68 inseq+=seqp.length(); | |
69 | |
70 | |
71 //read Qscores | |
72 getline(infile,qscore); | |
73 getline(infile,qscore);// | |
74 getline(infileP,qscorep); | |
75 getline(infileP,qscorep);// | |
76 if (discard >0 && discard<=seq.length()) | |
77 { | |
78 seq=seq.substr(discard-1,seq.length()-discard); | |
79 seqp=seqp.substr(discard-1,seqp.length()-discard); | |
80 qscore=qscore.substr(discard-1,qscore.length()-discard); | |
81 qscorep=qscorep.substr(discard-1,qscorep.length()-discard); | |
82 } | |
83 if (qscore.length()!=seq.length()) | |
84 { | |
85 cerr << "Invalid fastq\n" << seq << "\n" << qscore << "\n"; | |
86 exit(1); | |
87 } | |
88 | |
89 if (qscorep.length()!=seqp.length()) | |
90 { | |
91 cerr << "Invalid fastq\n" << seqp << "\n" << qscorep << "\n"; | |
92 exit(1); | |
93 } | |
94 | |
95 | |
96 // | |
97 //cout << qscore << "\n" << qscorep << "\n"; | |
98 | |
99 //eval Qscores | |
100 int p=eval_quality(qscore,cutoff,errors); | |
101 int pp=eval_quality(qscorep,cutoff,errors); | |
102 | |
103 string Qheader=header; | |
104 if (*(Qheader.end()-2)=='/') // togli gli slash | |
105 { | |
106 Qheader.replace(Qheader.end()-2,Qheader.end(),""); | |
107 } | |
108 string Oheader=Qheader; | |
109 Oheader[0]='+'; | |
110 if (p>0) | |
111 { | |
112 seq=seq.substr(0,p); | |
113 qscore=qscore.substr(0,p); | |
114 if (pp>0) | |
115 { | |
116 seqp=seqp.substr(0,pp); | |
117 qscorep=qscorep.substr(0,pp); | |
118 outseq+=seqp.length(); | |
119 outseq+=seq.length(); | |
120 outfile << Qheader <<"/1" << "\n" << seq << "\n" << Oheader <<"/1" << "\n" << qscore << "\n"; | |
121 outfilep << Qheader <<"/2" << "\n" << seqp << "\n" << Oheader<<"/2" << "\n" << qscorep << "\n"; | |
122 }else{ | |
123 outseq+=seq.length(); | |
124 outfileunm << Qheader <<"/1" << "\n" << seq << "\n" << Oheader<<"/1" << "\n" << qscore << "\n"; | |
125 } | |
126 }else if(p==0 && pp>0){ | |
127 seqp=seqp.substr(0,pp); | |
128 qscorep=qscorep.substr(0,pp); | |
129 outseq+=seqp.length(); | |
130 outfileunm << Qheader <<"/2" << "\n" << seqp << "\n" << Oheader <<"/2" << "\n" << qscorep << "\n"; | |
131 } | |
132 } | |
133 } | |
134 | |
135 }else{ | |
136 cerr << "could not open files\n"; | |
137 } | |
138 //cerr << "Input "<< inseq << " bases.\nOutput " << outseq << " bases.\n"; | |
139 }else{ | |
140 | |
141 cout << "input: <first_file> <second_file> <len_cutoff> <number of errors> <low qual base> <ofile1 <ofile2> <ofile3>\n"; | |
142 } | |
143 } | |
144 | |
145 int eval_quality(string & qstring,int lencutoff,int errors) | |
146 { | |
147 int Nminori10=0; | |
148 int Nminori20=0; | |
149 int Nmaggiori25=0; | |
150 int l10=0; | |
151 int l20=0; | |
152 int p=0; | |
153 double total_perr=0; | |
154 string::iterator pos; | |
155 for (pos=qstring.begin();pos!=qstring.end();pos++) | |
156 { | |
157 int punteggio=static_cast<int> (*pos)-33; | |
158 if (punteggio>=1 && punteggio <=41) | |
159 { | |
160 double exp=(double)punteggio/-10; | |
161 total_perr+=pow(10,exp); | |
162 if (p>0) | |
163 { | |
164 if (punteggio<=10) //count qscores <=10 | |
165 { | |
166 l10++; | |
167 Nminori20++; | |
168 Nminori10++; | |
169 }else if (punteggio<=20){ // count Qscores <=20 | |
170 l20++; | |
171 Nminori10=0; | |
172 Nminori20++; | |
173 }else if (punteggio>20){ | |
174 Nminori20=0; | |
175 Nminori10=0; | |
176 if (punteggio>=25) | |
177 { | |
178 Nmaggiori25++; | |
179 } | |
180 } | |
181 } | |
182 if (Nminori10>=10) // 3 or more consecutives very low quality bases | |
183 { | |
184 p-=Nminori10; | |
185 break; | |
186 }else if (Nminori20>=15){ // 5 or more consecutives low quality bases | |
187 p-=Nminori20; | |
188 break; | |
189 } | |
190 if (total_perr>=(double)errors) // sum of per base error probability when 5e-2 5 wrong base calls in 100 | |
191 { | |
192 //cout << p << " " << total_perr << " " << errors << "\n"; | |
193 break; | |
194 } | |
195 p++; | |
196 }else{ | |
197 cerr << "Invalid Qscore" << *pos << "=" << punteggio << "\n"; | |
198 exit(1); | |
199 } | |
200 } | |
201 double prop_gr_25=(double)Nmaggiori25/(double)(p); | |
202 if (prop_gr_25>=0.35 && p>=lencutoff && l20 <= p*0.2 && l10 <= p*0.1) // if 50% of Qscores are >= 25,size is >= cutoff a | |
203 { | |
204 return p; | |
205 }else{ | |
206 return 0; | |
207 } | |
208 } |