Mercurial > repos > matteoc > agame_custom_tools
comparison mytrimmer/trim.seqs.C.cpp @ 0:68a3648c7d91 draft default tip
Uploaded
| author | matteoc |
|---|---|
| date | Thu, 22 Dec 2016 04:45:31 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:68a3648c7d91 |
|---|---|
| 1 #include <iostream> | |
| 2 #include <fstream> | |
| 3 #include <algorithm> | |
| 4 #include <vector> | |
| 5 #include <map> | |
| 6 #include <math.h> | |
| 7 #include <string> | |
| 8 | |
| 9 using namespace std; | |
| 10 int main (int argc, char *argv[]); | |
| 11 int eval_quality(string & qstring,int lencutoff,int errors); | |
| 12 | |
| 13 int main (int argc, char *argv[]) | |
| 14 { | |
| 15 if (argc==9) | |
| 16 { | |
| 17 unsigned long inseq=0; | |
| 18 unsigned long outseq=0; | |
| 19 unsigned long pfile=0; | |
| 20 ifstream infile; | |
| 21 ifstream infileP; | |
| 22 string file=argv[1]; | |
| 23 string filep=argv[2]; | |
| 24 ofstream outfile; | |
| 25 ofstream outfilep; | |
| 26 ofstream outfileunm; | |
| 27 string outname=(argv[6]); | |
| 28 string outnamep=(argv[7]); | |
| 29 string outunm=(argv[8]); | |
| 30 outfile.open(outname.c_str()); | |
| 31 outfilep.open(outnamep.c_str()); | |
| 32 outfileunm.open(outunm.c_str()); | |
| 33 int cutoff=atoi(argv[3]); | |
| 34 int errors=atoi(argv[4]); | |
| 35 int discard=atoi(argv[5]); | |
| 36 infile.open(file.c_str()); | |
| 37 infileP.open(filep.c_str()); | |
| 38 if (!infile) | |
| 39 { | |
| 40 cerr << "Couldn't open "<< infile << "\n"; | |
| 41 exit(1); | |
| 42 } | |
| 43 if (!infileP) | |
| 44 { | |
| 45 cerr << "Couldn't open "<< outfile << "\n"; | |
| 46 exit(1); | |
| 47 } | |
| 48 map <int,int> Min; | |
| 49 map <int,int> Max; | |
| 50 if (infile.is_open() && infileP.is_open()){ | |
| 51 string header; | |
| 52 string seq; | |
| 53 string seqp; | |
| 54 string qscore; | |
| 55 string qscorep; | |
| 56 while (!infile.eof() && !infileP.eof()) | |
| 57 { | |
| 58 getline(infile,header); | |
| 59 if (header!="") | |
| 60 { | |
| 61 //read headers + sequences | |
| 62 getline(infile,seq); // | |
| 63 getline(infileP,seqp); | |
| 64 getline(infileP,seqp);// | |
| 65 // | |
| 66 //cout <<"A:" << seq << "\n" << "B:" << seqp << "\n"; | |
| 67 inseq+=seq.length(); | |
| 68 inseq+=seqp.length(); | |
| 69 | |
| 70 | |
| 71 //read Qscores | |
| 72 getline(infile,qscore); | |
| 73 getline(infile,qscore);// | |
| 74 getline(infileP,qscorep); | |
| 75 getline(infileP,qscorep);// | |
| 76 if (discard >0 && discard<=seq.length()) | |
| 77 { | |
| 78 seq=seq.substr(discard-1,seq.length()-discard); | |
| 79 seqp=seqp.substr(discard-1,seqp.length()-discard); | |
| 80 qscore=qscore.substr(discard-1,qscore.length()-discard); | |
| 81 qscorep=qscorep.substr(discard-1,qscorep.length()-discard); | |
| 82 } | |
| 83 if (qscore.length()!=seq.length()) | |
| 84 { | |
| 85 cerr << "Invalid fastq\n" << seq << "\n" << qscore << "\n"; | |
| 86 exit(1); | |
| 87 } | |
| 88 | |
| 89 if (qscorep.length()!=seqp.length()) | |
| 90 { | |
| 91 cerr << "Invalid fastq\n" << seqp << "\n" << qscorep << "\n"; | |
| 92 exit(1); | |
| 93 } | |
| 94 | |
| 95 | |
| 96 // | |
| 97 //cout << qscore << "\n" << qscorep << "\n"; | |
| 98 | |
| 99 //eval Qscores | |
| 100 int p=eval_quality(qscore,cutoff,errors); | |
| 101 int pp=eval_quality(qscorep,cutoff,errors); | |
| 102 | |
| 103 string Qheader=header; | |
| 104 if (*(Qheader.end()-2)=='/') // togli gli slash | |
| 105 { | |
| 106 Qheader.replace(Qheader.end()-2,Qheader.end(),""); | |
| 107 } | |
| 108 string Oheader=Qheader; | |
| 109 Oheader[0]='+'; | |
| 110 if (p>0) | |
| 111 { | |
| 112 seq=seq.substr(0,p); | |
| 113 qscore=qscore.substr(0,p); | |
| 114 if (pp>0) | |
| 115 { | |
| 116 seqp=seqp.substr(0,pp); | |
| 117 qscorep=qscorep.substr(0,pp); | |
| 118 outseq+=seqp.length(); | |
| 119 outseq+=seq.length(); | |
| 120 outfile << Qheader <<"/1" << "\n" << seq << "\n" << Oheader <<"/1" << "\n" << qscore << "\n"; | |
| 121 outfilep << Qheader <<"/2" << "\n" << seqp << "\n" << Oheader<<"/2" << "\n" << qscorep << "\n"; | |
| 122 }else{ | |
| 123 outseq+=seq.length(); | |
| 124 outfileunm << Qheader <<"/1" << "\n" << seq << "\n" << Oheader<<"/1" << "\n" << qscore << "\n"; | |
| 125 } | |
| 126 }else if(p==0 && pp>0){ | |
| 127 seqp=seqp.substr(0,pp); | |
| 128 qscorep=qscorep.substr(0,pp); | |
| 129 outseq+=seqp.length(); | |
| 130 outfileunm << Qheader <<"/2" << "\n" << seqp << "\n" << Oheader <<"/2" << "\n" << qscorep << "\n"; | |
| 131 } | |
| 132 } | |
| 133 } | |
| 134 | |
| 135 }else{ | |
| 136 cerr << "could not open files\n"; | |
| 137 } | |
| 138 //cerr << "Input "<< inseq << " bases.\nOutput " << outseq << " bases.\n"; | |
| 139 }else{ | |
| 140 | |
| 141 cout << "input: <first_file> <second_file> <len_cutoff> <number of errors> <low qual base> <ofile1 <ofile2> <ofile3>\n"; | |
| 142 } | |
| 143 } | |
| 144 | |
| 145 int eval_quality(string & qstring,int lencutoff,int errors) | |
| 146 { | |
| 147 int Nminori10=0; | |
| 148 int Nminori20=0; | |
| 149 int Nmaggiori25=0; | |
| 150 int l10=0; | |
| 151 int l20=0; | |
| 152 int p=0; | |
| 153 double total_perr=0; | |
| 154 string::iterator pos; | |
| 155 for (pos=qstring.begin();pos!=qstring.end();pos++) | |
| 156 { | |
| 157 int punteggio=static_cast<int> (*pos)-33; | |
| 158 if (punteggio>=1 && punteggio <=41) | |
| 159 { | |
| 160 double exp=(double)punteggio/-10; | |
| 161 total_perr+=pow(10,exp); | |
| 162 if (p>0) | |
| 163 { | |
| 164 if (punteggio<=10) //count qscores <=10 | |
| 165 { | |
| 166 l10++; | |
| 167 Nminori20++; | |
| 168 Nminori10++; | |
| 169 }else if (punteggio<=20){ // count Qscores <=20 | |
| 170 l20++; | |
| 171 Nminori10=0; | |
| 172 Nminori20++; | |
| 173 }else if (punteggio>20){ | |
| 174 Nminori20=0; | |
| 175 Nminori10=0; | |
| 176 if (punteggio>=25) | |
| 177 { | |
| 178 Nmaggiori25++; | |
| 179 } | |
| 180 } | |
| 181 } | |
| 182 if (Nminori10>=10) // 3 or more consecutives very low quality bases | |
| 183 { | |
| 184 p-=Nminori10; | |
| 185 break; | |
| 186 }else if (Nminori20>=15){ // 5 or more consecutives low quality bases | |
| 187 p-=Nminori20; | |
| 188 break; | |
| 189 } | |
| 190 if (total_perr>=(double)errors) // sum of per base error probability when 5e-2 5 wrong base calls in 100 | |
| 191 { | |
| 192 //cout << p << " " << total_perr << " " << errors << "\n"; | |
| 193 break; | |
| 194 } | |
| 195 p++; | |
| 196 }else{ | |
| 197 cerr << "Invalid Qscore" << *pos << "=" << punteggio << "\n"; | |
| 198 exit(1); | |
| 199 } | |
| 200 } | |
| 201 double prop_gr_25=(double)Nmaggiori25/(double)(p); | |
| 202 if (prop_gr_25>=0.35 && p>=lencutoff && l20 <= p*0.2 && l10 <= p*0.1) // if 50% of Qscores are >= 25,size is >= cutoff a | |
| 203 { | |
| 204 return p; | |
| 205 }else{ | |
| 206 return 0; | |
| 207 } | |
| 208 } |
