Mercurial > repos > ryanmorin > nextgen_variant_identification
view SNV/SNVMix2_source/SNVMix2-v0.12.1-rc1/SNVMix2.c @ 0:74f5ea818cea
Uploaded
author | ryanmorin |
---|---|
date | Wed, 12 Oct 2011 19:50:38 -0400 |
parents | |
children |
line wrap: on
line source
/* The MIT License Copyright (c) 2009, by Sohrab Shah <sshah@bccrc.ca> and Rodrigo Goya <rgoya@bcgsc.ca> Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /* C Implementation of SNVMix2 */ #define VERSION "0.12.1-rc1" #include <stdio.h> #include <stdlib.h> #include <string.h> #include <math.h> #include "SNVMix2.h" #include "bam.h" #define START_QNUM 1000 int main(int argc, char **argv) { param_struct params;// = {NULL, NULL, NULL, NULL, NULL, 0, 0, 0, 0, 0}; initSNVMix(argc, argv, ¶ms); if(params.classify || params.filter) { if(params.input_type == Q_PILEUP) snvmixClassify_qualities(¶ms); else if(params.input_type == M_PILEUP || params.input_type == S_PILEUP) snvmixClassify_pileup(¶ms); else if(params.input_type == BAM_FILE) { fprintf(stderr,"Output is:\n"); fprintf(stderr,"\tchr:pos ref nref ref:ref_count,nref:nref_count,pAA,pAB,pBB,maxP"); fprintf(stderr,"\tref_fwd ref_rev"); fprintf(stderr,"\tnref_fwd nref_rev"); fprintf(stderr,"\tnref_center nref_edges"); fprintf(stderr,"\tindel_pos"); fprintf(stderr,"\tref_clean nref_clean"); fprintf(stderr,"\tref_bad_pair nref_bad_pair"); fprintf(stderr,"\tref_ins nref_ins"); fprintf(stderr,"\tref_del nref_del"); fprintf(stderr,"\tref_junc nref_junc"); fprintf(stderr,"\tnref_in_unique_pos"); fprintf(stderr,"\traw[A,C,G,T,N]"); fprintf(stderr,"\tthr[A,C,G,T,N]\n"); snvmixClassify_bamfile(¶ms); } } else if(params.train) { if(params.input_type == M_PILEUP || params.input_type == S_PILEUP) snvmixTrain_pileup(¶ms); else if(params.input_type == Q_PILEUP) { fprintf(stderr,"Sorry, Training with quality-type input is not yet implemented\n"); exit(1); } } return(0); } /* void snvmixClassify_qualities Arguments: *params : pointer to model parameters and command line options Function: Reads a pilep-style file that has been preprocessed to include the quality of calls being the reference allele and the maping quality both in decimal values. For each line it evaluates de model according to the parameters in *params and outputs the corresponding SNV prediction values. This function may come in handy when filtering of calls is done by a different program or the base-calling and maping quality information is not in a pileup/phred format. The file read is a tab-separated file where the columns are: - target sequence (e.g. chromosome) - sequence position - reference base - non-reference base - comma separated probabilities of the call being the reference - comma separated mapping qualities, one for each base call */ void snvmixClassify_qualities(param_struct *params) { FILE *fptrIN, *fptrOUT; char *line = NULL; size_t line_len = 0, prev_len = 0; int read_len = 0; int qual_num = 0, col_num = 0; char *col, *qual; char *col_sep = "\t", *col_str, *col_ptr; char *qual_sep = ",", *qual_str, *qual_ptr; char *chr, *pos, *ref, *nref; long double *bQ, *mQ, *tmpQ; int pos_int, depth = 0, maxp; long int depth_allocated = 0; long double bsum = 0, msum = 0, *b, *mu, *notmu, *pi, *log_pi, z = 0; int i, k, states; states = params->states; // Initialize local values of parameters mu = params->mu; pi = params->pi; if( ( b = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_qualities] Could not allocate b\n"); exit(1); } if( ( notmu = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_qualities] Could not allocate notmu\n"); exit(1); } if( ( log_pi = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_qualities] Could not allocate log_pi\n"); exit(1); } for(k = 0; k < states; k++) { notmu[k] = 1 - mu[k]; log_pi[k] = logl(pi[k]); } fptrIN = params->input; fptrOUT = params->output; if( (bQ = malloc(START_QNUM * sizeof (long double))) == NULL) { perror("ERROR allocating initial space for bQ"); exit(1); } if( (mQ = malloc(START_QNUM * sizeof (long double))) == NULL) { perror("ERROR allocating initial space for mQ"); exit(1); } depth_allocated = START_QNUM; #if defined __linux__ || defined _GNU_SOURCE while( (read_len = getline(&line,&line_len,fptrIN)) != -1 ) #endif #ifdef __APPLE__ while( (line = fgetln(fptrIN,&line_len)) != NULL ) #endif { line[line_len-1] = '\0'; depth = 0; col_ptr = NULL; for(col_num = 0, col_str = line; ; col_num++, col_str = NULL) { col = strtok_r(col_str, "\t", &col_ptr); if(col == NULL) { break; } if(col_num == 0) { chr = col; } else if(col_num == 1) { pos = col; pos_int = strtol(pos, NULL, 10); } else if(col_num == 2) { ref = col; } else if(col_num == 3) { nref = col; } else if(col_num == 4) { for(qual_num = 0, qual_str = col; ; qual_num++, qual_str = NULL) { qual = strtok_r(qual_str, ",", &qual_ptr); if(qual == NULL) { break; } if(qual_num >= depth_allocated) { if( (bQ = realloc(bQ, sizeof(long double) * (depth_allocated + START_QNUM))) == NULL) { perror("ERROR allocating bQ"); exit(1); } if( (mQ = realloc(mQ, sizeof(long double) * (depth_allocated + START_QNUM))) == NULL) { perror("ERROR allocating mQ"); exit(1); } depth_allocated = depth_allocated + START_QNUM; } bQ[qual_num] = atof(qual); } depth = qual_num; } else if(col_num == 5) { for(qual_num = 0, qual_str = col; ; qual_num++, qual_str = NULL) { qual = strtok_r(qual_str, ",", &qual_ptr); if(qual == NULL) { break; } if(qual_num >= depth_allocated) { fprintf(stderr, "FATAL ERROR: should not have more mapping qualities than base qualities\n"); exit(1); } mQ[qual_num] = atof(qual); } if(depth != qual_num) { fprintf(stderr, "FATAL ERROR: should not have more base qualities than mapping qualities\n"); exit(1); } } } for(k = 0; k < states; k++) b[k] = 0; //b = sum(log(notm*0.5 + m.*(yr.*mur+notyr.*notmur)),2); for(qual_num = 0; qual_num < depth; qual_num++) for(k = 0; k < states; k++) b[k] = b[k] + logl((1- mQ[qual_num])*0.5 + mQ[qual_num] * (bQ[qual_num]*mu[k]+(1-bQ[qual_num])*notmu[k])); z = 0; for(k = 0; k < states; k++) { b[k] = expl(b[k] + log_pi[k]); z += b[k]; } if(!z) { z = 1; } for(k = 0; k < states; k++) b[k] = b[k] / z; maxp = 0; z = b[0]; for(k = 0; k < states; k++) if( b[k] > z) { maxp = k; z = b[k]; } fprintf(fptrOUT,"%s\t%s\t%s\t%s",chr, pos, ref, nref); for(k = 0; k , states; k++) fprintf(fptrOUT,"%0.10Lf,",b[k]); fprintf(fptrOUT,"%d\n",maxp+1); } fclose(fptrIN); fclose(fptrOUT); free(line); free(b); free(notmu); free(log_pi); } /* void snvmixClassify_pileup Arguments: *params : pointer to model parameters and command line options Function: Reads a MAQ or Samtools pileup format. For required formatting we refer the user to the corresponding websites: http://maq.sourceforge.net/ http://samtools.sourceforge.net/ In general the format of both files can be described as a tab separated file where columns are: - target sequence (e.g. chromosome) - sequence position - reference base - depth - stream of base calls - stream of base call PHRED-scaled qualities - stream of mapping PHRED-scaled qualities Note: when handling pileup files, care should be taken when doing copy-paste operations not to transform 'tabs' into streams of spaces. For each line it evaluates de model according to the parameters in *params and outputs the corresponding SNV prediction values. Predictions can be output either only for positions where when non-reference values are observed or, when run with '-f' flag, for all positions. The -f flag is useful when the resulting calls are being compared or joined accros different datasets and all predictions need to be present. */ void snvmixClassify_pileup(param_struct *params) { //MAQ: // 1 554484 C 1752 @,,.,.,, @xxx @xxx xxxxx //SAM: // 1 554484 C 1752 ,,.,.,, xxx xxxx FILE *fptrIN, *fptrOUT; char *line = NULL; size_t line_len = 0, prev_len = 0; int read_len = 0; int col_num = 0; long int line_num = 0; char *col, *qual; char *col_sep = "\t", *col_str, *col_ptr; char *qual_sep = ",", *qual_str, *qual_ptr; char *chr, *pos, *ref, nref, call, nref_skip = 'N'; char *bQ, *mQ; int *calls, *tmpQ; int pos_int, depth = 0, qual_num, maxp; int depth_allocated = 0; long double bsum = 0, msum = 0, *b, *mu, *notmu, *pi, *log_pi, z = 0; int nref_count = 0, ref_count = 0; long double Y, not_Y; long double phred[PHRED_MAX + 1]; int i, call_i, k, states; //initPhred(phred, PHRED_MAX+1); initPhred(phred, PHRED_MAX); states = params->states; // Initialize local values of parameters mu = params->mu; pi = params->pi; if( ( b = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_pileup] Could not allocate b\n"); exit(1); } if( ( notmu = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_pileup] Could not allocate notmu\n"); exit(1); } if( ( log_pi = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_pileup] Could not allocate log_pi\n"); exit(1); } for(k = 0; k < states; k++) { notmu[k] = 1 - mu[k]; log_pi[k] = logl(pi[k]); } fptrIN = params->input; fptrOUT = params->output; if(params->full) nref_skip = -2; if( (calls = malloc(START_QNUM * sizeof (int))) == NULL) { perror("ERROR allocating initial space for calls"); exit(1); } depth_allocated = START_QNUM; #if defined __linux__ || defined _GNU_SOURCE while( (read_len = getline(&line,&line_len,fptrIN)) != -1 ) #endif #ifdef __APPLE__ while( (line = fgetln(fptrIN,&line_len)) != NULL ) #endif { line[line_len-1] = '\0'; line_num++; depth = 0; nref = 0; col_ptr = NULL; for(col_num = 0, col_str = line; nref != nref_skip ; col_num++, col_str = NULL) { col = strtok_r(col_str, "\t", &col_ptr); if(col == NULL) { break; } if(col_num == 0) { chr = col; } else if(col_num == 1) { pos = col; } else if(col_num == 2) { ref = col; } else if(col_num == 3) { depth = atoi(col); if(depth > depth_allocated) { if( (calls = realloc(calls, sizeof (int) * (depth + START_QNUM))) == NULL) { perror("ERROR allocating space for calls"); exit(1); } depth_allocated = depth + START_QNUM; } } else if(col_num == 4) { if(params->input_type == M_PILEUP) col = col+sizeof(char); snvmixGetCallString(col, calls, depth, &nref); } else if(col_num == 5) { bQ = col; // If it's a MAQ pileup, we need to skip the @ sign if(params->input_type == M_PILEUP) bQ = bQ + sizeof(char); for(call_i = 0; call_i < depth; call_i++) bQ[call_i] = bQ[call_i]-33; } else if(col_num == 6) { mQ = col; // If it's a MAQ pileup, we need to skip the @ sign if(params->input_type == M_PILEUP) mQ = mQ + sizeof(char); for(call_i = 0; call_i < depth; call_i++) mQ[call_i] = mQ[call_i]-33; } } // If we quit the for because no nref was found, skip this too, next line nref = snvmixFilterCalls(calls,depth,bQ,mQ,params); nref_count = 0; ref_count = 0; for(qual_num = 0; qual_num < depth; qual_num++) { //if( snvmixSkipCall(calls,qual_num,params,bQ,mQ) ) if(calls[qual_num] == -1) continue; if(calls[qual_num] == 1) ref_count++; if(calls[qual_num] == nref) nref_count++; } if(params->filter) { fprintf(fptrOUT,"%s:%s\t%s:%d\t%c:%d\t", chr, pos, ref, ref_count, nref, nref_count); for(qual_num = 0; qual_num < ref_count; qual_num++) fprintf(fptrOUT,","); for(qual_num = 0; qual_num < nref_count; qual_num++) fprintf(fptrOUT,"%c",nref); fprintf(fptrOUT,"\n"); } else { if(nref == nref_skip) continue; //nref = snvmixFilterCalls(calls,depth,bQ,mQ,params); //if(nref == nref_skip) // continue; for(k = 0; k < states; k++) b[k] = 0; nref_count = 0; for(qual_num = 0; qual_num < depth; qual_num++) { //if( snvmixSkipCall(calls,qual_num,params,bQ,mQ) ) if(calls[qual_num] == -1) continue; // from matlab: // b = sum(log(notm*0.5 + m.*(yr.*mur+notyr.*notmur)),2); if(calls[qual_num] == 1) { // If REF then Y = phred[(unsigned char) bQ[qual_num]]; not_Y = 1-phred[(unsigned char) bQ[qual_num]]; } else { nref_count++; // If NREF then Y = (1-phred[(unsigned char) bQ[qual_num]])/3; not_Y = phred[(unsigned char) bQ[qual_num]] + 2*(1-phred[(unsigned char)bQ[qual_num]])/3; } for(k = 0; k < states; k++) b[k] = b[k] + logl((1-phred[(unsigned char) mQ[qual_num]])*0.5 + phred[(unsigned char) mQ[qual_num]] * (Y * mu[k] + not_Y * notmu[k])); } // Check if any non-references actually passed the filter //if(!nref_count) // continue; z = 0; for(k = 0; k < states; k++) { b[k] = expl(b[k] + log_pi[k]); z += b[k]; } if(!z) z = 1; for(k = 0; k < states; k++) b[k] = b[k] / z; maxp = 0; z = b[0]; for(k = 0; k < states; k++) if( b[k] > z) { maxp = k; z = b[k]; } fprintf(fptrOUT,"%s:%s\t%s\t%c\t%s:%d,%c:%d,",chr,pos, ref, nref, ref, ref_count, nref, nref_count); for(k = 0; k < states; k++) fprintf(fptrOUT,"%0.10Lf,",b[k]); fprintf(fptrOUT,"%d\n",maxp+1); } } fclose(fptrIN); fclose(fptrOUT); free(line); free(b); free(notmu); free(log_pi); } int snvmixClassify_bamfile(param_struct *params) { snvmix_pl_t snvmix_pl; int states, i, k, bam_flag_mask; snvmix_pl.params = params; snvmix_pl.tid = -1; // TODO: change this to read range snvmix_pl.begin = 0; snvmix_pl.end = 0x7fffffff; snvmix_pl.in = samopen(params->inputfile, "rb", 0); if (snvmix_pl.in == 0) { fprintf(stderr, "ERROR: Fail to open BAM file %s\n", params->bamfile); return 1; } snvmix_pl.fai = fai_load(params->fastafile); if (snvmix_pl.fai == 0) { fprintf(stderr, "Fail to open BAM file %s\n", params->fastafile); return 1; } if(params->listfile) snvmix_pl.hash = load_pos(params->listfile, snvmix_pl.in->header); snvmix_pl.depth_allocated = 0; snvmix_pl.ref = NULL; snvmix_pl.calls = NULL; //snvmix_pl.bQ = NULL; //snvmix_pl.mQ = NULL; /* This looks ugly... but we don't want to be allocating new memory for EVERY location. */ states = params->states; if( ( snvmix_pl.notmu = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_bamfile] Could not allocate snvmix_pl.notmu\n"); exit(1); } if( ( snvmix_pl.log_pi = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_bamfile] Could not allocate snvmix_pl.log_pi\n"); exit(1); } if( ( snvmix_pl.b = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_bamfile] Could not allocate snvmix_pl.b\n"); exit(1); } for(k = 0; k < states; k++) { snvmix_pl.notmu[k] = 1 - params->mu[k]; snvmix_pl.log_pi[k] = logl(params->pi[k]); } initPhred(snvmix_pl.phred, PHRED_MAX); /* Init call buffer */ snvmix_pl.n_buffer = params->window; if( (snvmix_pl.buffer = malloc(snvmix_pl.n_buffer * sizeof(snvmix_call_t)) ) == NULL) { perror("[snvmixClassify_bamfile] Could not allocate space for buffer\n"); exit(1); } snvmix_pl.b_start = 0; snvmix_pl.b_end = 0; for(i = 0; i < snvmix_pl.n_buffer; i++) { snvmix_pl.buffer[i].tid = 0; snvmix_pl.buffer[i].pos = 0; snvmix_pl.buffer[i].in_use = 0; if( (snvmix_pl.buffer[i].p = malloc(states * sizeof(long double)) ) == NULL) {perror("[snvmixClassify_bamfile] Could not allocate space for buffer.p\n"); exit(1);} } bam_flag_mask = (BAM_FUNMAP | BAM_FSECONDARY); // | BAM_FQCFAIL | BAM_FDUP); if(params->filter_chast) bam_flag_mask |= BAM_FQCFAIL; if(params->filter_dup) bam_flag_mask |= BAM_FDUP; /* Call pileup with our function and bam_flag_mask */ sampileup(snvmix_pl.in, bam_flag_mask, snvmixClassify_pileup_func, &snvmix_pl); free(snvmix_pl.notmu); free(snvmix_pl.log_pi); free(snvmix_pl.b); } int flush_buffer(uint32_t tid, uint32_t pos, snvmix_pl_t *snvmix_pl) { snvmix_call_t *s; int i, k, cnt, buff_id; cnt = 0; #define _fptrOUT snvmix_pl->params->output //#define _fptrOUT stderr //fprintf(_fptrOUT, "ENTERING flush_buffer, b_start = %d\tb_end = %d\n",snvmix_pl->b_start, snvmix_pl->b_end); for(i = 0, buff_id = snvmix_pl->b_start; i < snvmix_pl->n_buffer; i++, buff_id=(buff_id+1)%snvmix_pl->params->window) { //fprintf(stderr, "ITERATING in flush_buffer, buff_id = %d\n", buff_id); s = snvmix_pl->buffer + buff_id; if(pos+1 >= (s->pos + snvmix_pl->params->window) || tid != s->tid) { //fprintf(stderr, "PASSED if in flush_buffer, buff_id = %d\n", buff_id); if(s->in_use) { //fprintf(_fptrOUT,"%d\t",buff_id); fprintf(_fptrOUT,"%s:%d\t%c\t%c\t%c:%d,%c:%d", snvmix_pl->in->header->target_name[s->tid], s->pos, s->ref, s->nref, s->ref, s->ref_count, s->nref, s->nref_count); if(!snvmix_pl->params->filter) { fprintf(_fptrOUT,","); for(k = 0; k < snvmix_pl->params->states; k++) fprintf(_fptrOUT,"%0.10Lf,",s->p[k]); fprintf(_fptrOUT,"%d",s->maxP); } fprintf(_fptrOUT,"\t%d %d", s->forward[0], s->reverse[0]); fprintf(_fptrOUT,"\t%d %d", s->forward[1], s->reverse[1]); fprintf(_fptrOUT,"\t%d %d", s->nref_center, s->nref_edges); fprintf(_fptrOUT,"\t%d", s->indel_pos); fprintf(_fptrOUT,"\t%d %d", s->c_clean[0], s->c_clean[1]); fprintf(_fptrOUT,"\t%d %d", s->bad_pair[0], s->bad_pair[1]); fprintf(_fptrOUT,"\t%d %d", s->c_ins[0], s->c_ins[1]); fprintf(_fptrOUT,"\t%d %d", s->c_del[0], s->c_del[1]); fprintf(_fptrOUT,"\t%d %d", s->c_junc[0], s->c_junc[1]); fprintf(_fptrOUT,"\t%d", s->aln_unique_pos); fprintf(_fptrOUT,"\t[%d,%d,%d,%d,%d]", s->raw_cvg[0],s->raw_cvg[1],s->raw_cvg[2],s->raw_cvg[3],s->raw_cvg[4]); fprintf(_fptrOUT,"\t[%d,%d,%d,%d,%d]", s->thr_cvg[0],s->thr_cvg[1],s->thr_cvg[2],s->thr_cvg[3],s->thr_cvg[4]); if(snvmix_pl->params->filter) { fprintf(_fptrOUT,"\t"); for(k = 0; k < s->ref_count; k++) fprintf(_fptrOUT,","); for(k = 0; k < s->nref_count; k++) fprintf(_fptrOUT,"%c",s->nref); } fprintf(_fptrOUT,"\n"); cnt++; s->in_use = 0; snvmix_pl->b_start = (buff_id+1)%snvmix_pl->params->window; } } else break; if(buff_id == snvmix_pl->b_end) break; } //fprintf(_fptrOUT, "EXITING flush_buffer, b_start = %d\tb_end = %d\n",snvmix_pl->b_start, snvmix_pl->b_end); return cnt; } static int snvmixClassify_pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data) { #define _bQ(p, x) bam1_qual(p[x].b)[p[x].qpos] #define _mQ(p, x) p[x].b->core.qual //FILE *fptrOUT; char nref_skip = 'N'; //int *calls, ; int depth = 0; //long double *b, *notmu, *log_pi, *mu, *pi; long double Y, not_Y, z; //int states; int i, k, qual_num, maxp; //long double *phred; int b_end_tmp = 0; snvmix_call_t *s; snvmix_pl_t *snvmix_pl = (snvmix_pl_t *)data; param_struct *params = snvmix_pl->params; // Avoid allocating new variables, use defines to make things readable #define _calls snvmix_pl->calls #define _b snvmix_pl->b #define _notmu snvmix_pl->notmu #define _log_pi snvmix_pl->log_pi #define _mu params->mu #define _pi params->pi #define _phred snvmix_pl->phred #define _states params->states //fprintf(stderr,"DEBUG: snvmix_pl->buffer[snvmix_pl->b_start].in_use = %d\n", snvmix_pl->buffer[snvmix_pl->b_start].in_use); //if(pos+1 == 148971) // printf("DEBUG: test 148971, start.in_use = %d\tb_start=%d\tb_end=%d\n",snvmix_pl->buffer[snvmix_pl->b_start].in_use, snvmix_pl->b_start,snvmix_pl->b_end); //if(pos == 148971) // printf("DEBUG: test 148972, start.in_use = %d\tb_start=%d\tb_end=%d (pos+1 >= (snvmix_pl->buffer[snvmix_pl->b_start].pos + snvmix_pl->params->window) = %d >= %d\n", snvmix_pl->buffer[snvmix_pl->b_start].in_use, snvmix_pl->b_start,snvmix_pl->b_end, pos+1, snvmix_pl->buffer[snvmix_pl->b_start].pos + snvmix_pl->params->window); // If we have gone further than window indicates, or changed tid, then flush buffer if(snvmix_pl->buffer[snvmix_pl->b_start].in_use && (pos+1 >= (snvmix_pl->buffer[snvmix_pl->b_start].pos + snvmix_pl->params->window) || tid != snvmix_pl->buffer[snvmix_pl->b_start].tid )) flush_buffer(tid, pos, snvmix_pl); b_end_tmp = snvmix_pl->b_end; s = snvmix_pl->buffer + snvmix_pl->b_end; snvmix_pl->b_end = (snvmix_pl->b_end+1)%snvmix_pl->params->window; if(snvmix_pl->hash) { khint_t k_h = kh_get(64, snvmix_pl->hash, (uint64_t)tid<<32|pos); if (k_h == kh_end(snvmix_pl->hash)) return 0; } s->in_use = 1; if(params->full) nref_skip = -2; if (snvmix_pl->fai && (int)tid != snvmix_pl->tid) { free(snvmix_pl->ref); snvmix_pl->ref = fai_fetch(snvmix_pl->fai, snvmix_pl->in->header->target_name[tid], &snvmix_pl->len); snvmix_pl->tid = tid; } //chr = snvmix_pl->in->header->target_name[tid], s->tid = tid; s->ref = toupper((snvmix_pl->ref && (int)pos < snvmix_pl->len)? snvmix_pl->ref[pos] : 'N'); s->pos = pos + 1; depth = n; if(depth > snvmix_pl->depth_allocated) { if( (snvmix_pl->calls = realloc(snvmix_pl->calls, sizeof (int) * (depth + START_QNUM))) == NULL) { perror("ERROR allocating space for snvmix_pl->calls"); exit(1); } //calls = snvmix_pl->calls; snvmix_pl->depth_allocated = depth + START_QNUM; } //for(i = 0; i < depth; i++) { // FIXME: watch out for deletions, do they get automatically an "n"? if so, ok //fprintf(stderr, "DEBUG: len = %d\tref = %c\tcalls = ", snvmix_pl->len, ref); s->raw_cvg[0] = s->raw_cvg[1] = s->raw_cvg[2] = s->raw_cvg[3] = s->raw_cvg[4] = 0; s->thr_cvg[0] = s->thr_cvg[1] = s->thr_cvg[2] = s->thr_cvg[3] = s->thr_cvg[4] = 0; for(i = depth; i--; ) { if(pl[i].is_del) { _calls[i] = -1; continue; } _calls[i] = "nACnGnnnTnnnnnnn"[bam1_seqi(bam1_seq(pl[i].b), pl[i].qpos)]; //fprintf(stderr, "%c", _calls[i]); switch(_calls[i]) { case 'A': s->raw_cvg[0]++; break; case 'C': s->raw_cvg[1]++; break; case 'G': s->raw_cvg[2]++; break; case 'T': s->raw_cvg[3]++; break; case 'n': s->raw_cvg[4]++; break; default: break; } // Mask calls according to quality if( snvmixSkipCall_alt(params, _calls[i], (char) bam1_qual(pl[i].b)[pl[i].qpos], (char) pl[i].b->core.qual) ) _calls[i] = -1; else { switch(_calls[i]) { case 'A': s->thr_cvg[0]++; break; case 'C': s->thr_cvg[1]++; break; case 'G': s->thr_cvg[2]++; break; case 'T': s->thr_cvg[3]++; break; case 'n': s->thr_cvg[4]++; break; default: break; } if(_calls[i] == s->ref) _calls[i] = 1; else if(_calls[i] == 'n') _calls[i] = -1; } } s->nref = snvmixFilterCalls(_calls,depth,NULL,NULL,params); s->nref_count = 0; s->ref_count = 0; // FIXME: review this section, extra flags for(i = 0; i < 2; i++) { s->forward[i] = 0; s->reverse[i] = 0; s->good_pair[i] = 0; s->bad_pair[i] = 0; s->c_clean[i] = 0; s->c_ins[i] = 0; s->c_del[i] = 0; s->c_junc[i] = 0; } s->nref_edges = 0; // FIXME: fixed 5 bp edges s->nref_center = 0; // FIXME: fixed 5 bp edges s->indel_pos = 0; // How many reads have a deletion at that site s->indel_near = 0; // How many reasd have a deletion overall s->aln_unique_pos = 0; uint8_t cigar_ops; // 0x1 INS, 0x2 DEL , 0x4 JUNCTION //for(qual_num = 0; qual_num < depth; qual_num++) { uint32_t debug_sort = 0x7fffffff; if(s->nref != nref_skip) { k = -1; for(qual_num = depth; qual_num--; ) { if(_calls[qual_num] == -1) { if(pl[qual_num].is_del) s->indel_pos++; continue; } cigar_ops = 0; for(i=0; i<pl[qual_num].b->core.n_cigar; i++) { int op = bam1_cigar(pl[qual_num].b)[i] & BAM_CIGAR_MASK; if(op == BAM_CINS) cigar_ops |= 0x1; if(op == BAM_CDEL) cigar_ops |= 0x2; if(op == BAM_CREF_SKIP) cigar_ops |= 0x4; } if(pl[qual_num].indel) { s->indel_pos++; } if(_calls[qual_num] == 1) { s->ref_count++; k = 0; } else if(_calls[qual_num] == s->nref) { s->nref_count++; k = 1; if(pl[qual_num].qpos > (pl[qual_num].b->core.l_qseq - 5) || pl[qual_num].qpos < 5) s->nref_edges++; else s->nref_center++; if(pl[qual_num].b->core.pos < debug_sort) { s->aln_unique_pos++; debug_sort = pl[qual_num].b->core.pos; } else if(pl[qual_num].b->core.pos > debug_sort) {fprintf(stderr, "ERROR checking sort in read-position, prev = %Ld, new = %Ld\n", debug_sort, pl[qual_num].b->core.pos); exit(1);} } if(k != -1) { if(cigar_ops) { if(cigar_ops & 0x1) s->c_ins[k]++; if(cigar_ops & 0x2) s->c_del[k]++; if(cigar_ops & 0x4) s->c_junc[k]++; } else s->c_clean[k]++; if(pl[qual_num].b->core.tid != pl[qual_num].b->core.mtid) s->bad_pair[k]++; if(pl[qual_num].b->core.flag & 0x10) s->reverse[k]++; else s->forward[k]++; } } } if(!params->filter) { if(s->nref == nref_skip) { snvmix_pl->b_end = b_end_tmp; s->in_use = 0; return 0; } int block = (_states / 3) * 3; //for(k = 0; k < _states; k++) //for(k = _states; k--; ) // _b[k] = 0; k = 0; while( k < block ) { _b[k] = 0; _b[k+1] = 0; _b[k+2] = 0; k += 3; } if( k < _states) { switch( _states - k ) { case 2: _b[k] = 0; k++; case 1: _b[k] = 0; } } //for(qual_num = 0; qual_num < depth; qual_num++) { for(qual_num = depth; qual_num--; ) { //if( snvmixSkipCall(calls,qual_num,params,bQ,mQ) ) if(_calls[qual_num] == -1) continue; // from matlab: // b = sum(log(notm*0.5 + m.*(yr.*mur+notyr.*notmur)),2); if(_calls[qual_num] == 1) { // If REF then Y = _phred[(unsigned char) _bQ(pl, qual_num)]; not_Y = 1-_phred[(unsigned char) _bQ(pl, qual_num)]; } else { // If NREF then Y = (1-_phred[(unsigned char) _bQ(pl, qual_num)])/3; not_Y = _phred[(unsigned char) _bQ(pl, qual_num)] + 2*(1-_phred[(unsigned char)_bQ(pl, qual_num)])/3; } //for(k = 0; k < _states; k++) //for(k = _states; k--; ) // _b[k] = _b[k] + logl((1-_phred[(unsigned char) _mQ(pl, qual_num)])*0.5 + _phred[(unsigned char) _mQ(pl, qual_num)] * (Y * _mu[k] + not_Y * _notmu[k])); k = 0; while( k < block ) { _b[k] = _b[k] + logl((1-_phred[(unsigned char) _mQ(pl, qual_num)])*0.5 + _phred[(unsigned char) _mQ(pl, qual_num)] * (Y * _mu[k] + not_Y * _notmu[k])); _b[k+1] = _b[k+1] + logl((1-_phred[(unsigned char) _mQ(pl, qual_num)])*0.5 + _phred[(unsigned char) _mQ(pl, qual_num)] * (Y * _mu[k+1] + not_Y * _notmu[k+1])); _b[k+2] = _b[k+2] + logl((1-_phred[(unsigned char) _mQ(pl, qual_num)])*0.5 + _phred[(unsigned char) _mQ(pl, qual_num)] * (Y * _mu[k+2] + not_Y * _notmu[k+2])); k += 3; } if( k < _states) { switch( _states - k ) { case 2: _b[k] = _b[k] + logl((1-_phred[(unsigned char) _mQ(pl, qual_num)])*0.5 + _phred[(unsigned char) _mQ(pl, qual_num)] * (Y * _mu[k] + not_Y * _notmu[k])); k++; case 1: _b[k] = _b[k] + logl((1-_phred[(unsigned char) _mQ(pl, qual_num)])*0.5 + _phred[(unsigned char) _mQ(pl, qual_num)] * (Y * _mu[k] + not_Y * _notmu[k])); } } } z = 0; //for(k = 0; k < _states; k++) { //for(k = _states; k--; ) { // _b[k] = expl(_b[k] + _log_pi[k]); // z += _b[k]; //} k = 0; while( k < block ) { _b[k] = expl(_b[k] + _log_pi[k]); z += _b[k]; _b[k+1] = expl(_b[k+1] + _log_pi[k+1]); z += _b[k+1]; _b[k+2] = expl(_b[k+2] + _log_pi[k+2]); z += _b[k+2]; k += 3; } if( k < _states ) { switch( _states - k ) { case 2: _b[k] = expl(_b[k] + _log_pi[k]); z += _b[k]; k++; case 1: _b[k] = expl(_b[k] + _log_pi[k]); z += _b[k]; } } if(!z) z = 1; //for(k = 0; k < _states; k++) //for(k = _states; k--; ) // _b[k] = _b[k] / z; k = 0; while( k < block ) { _b[k] = _b[k] / z; _b[k+1] = _b[k+1] / z; _b[k+2] = _b[k+2] / z; k += 3; } if( k < _states ) { switch( _states - k ) { case 2: _b[k] = _b[k] / z; k++; case 1: _b[k] = _b[k] / z; } } maxp = _states - 1; z = _b[maxp]; //for(k = 0; k < _states; k++) for(k = _states; k--; ) if( _b[k] > z) { maxp = k; z = _b[k]; } for(k = 0; k < _states; k++) s->p[k] = _b[k]; s->maxP = maxp+1; } } /* void snvmixGetCallString Arguments: *col : pointer to the current file-line in memory *calls : pointer to array where we will store the final base-calls depth : number of base reads for this position *nref : the observed NREF value will be placed here (-1 if none was found) Function: This will parse column 5 of the pileup file, which contains the base calls and will fill the array "calls" with: 1 : when a reference call was made -1 : when an invalid value is seen ('N', 'n', '*') [ACTG] : when a non-reference call was made */ inline void snvmixGetCallString(char *col, int *calls, int depth, char *nref) { int i; int call_i = 0, call_skip_num = 0; char call_skip_char[10]; for(i = 0 ; call_i < depth; i++) { switch(col[i]){ case ',': case '.': calls[call_i] = 1; call_i++; break; case 'a': case 'A': case 't': case 'T': case 'g': case 'G': case 'c': case 'C': //calls[call_i] = 0; calls[call_i] = toupper(col[i]); call_i++; //if(*nref == 0) //*nref = toupper(*(col+sizeof(char)*i)); if(*nref == 0) *nref = toupper(col[i]); break; case 'N': case 'n': case '*': // Not comparable values, we ignore them, but need to // keep count to compare against the number of mapping qualities calls[call_i] = -1; call_i++; break; case '$': // End of a read, just ignore it break; case '^': // Start of read, ignore it and skip the next value (not base-related) i++; break; case '+': case '-': // Eliminate: // +2AA // -3AAA // Start skiping the sign i++; for(call_skip_num = 0; col[i] <= '9' && col[i] >= '0'; call_skip_num++, i++) { //call_skip_char[call_skip_num] = call; call_skip_char[call_skip_num] = col[i]; //i++; } // we have the number string in call_skip_char, lets terminate it call_skip_char[call_skip_num] = '\0'; // i has been updated to first letter, just need to jump the rest of them i = i+atoi(call_skip_char)-1; break; default: fprintf(stderr,"ERROR: problem reading pileup file calls (%c)\n",col[i]); exit(1); break; } } // If no nref was found, don't bother parsing the other columns, make the for fail with -1 if(!*nref) *nref = -1; } /* int snvmixFilterCalls Arguments: *calls : array built by snvmixGetCallString containing 1 : when a reference call was made -1 : when an invalid value is seen ('N', 'n', '*') [ACTG] : when a non-reference call was made depth : number of calls for the current position *bQ : base qualities observed for each call *mQ : map quality for each call *params : parameter structure, get thresholding data Function: For each valid call read in the position this function will apply thresholding according to the type selected (-t flag) and the bQ (-q) and mQ (-Q) thresholds provided. Any base-call that does not pass thresholds will be switched from it's current value in *calls to -1; The most prevalent NREF after filtering will be determined and returned. */ inline int snvmixFilterCalls(int *calls, int depth, char *bQ, char *mQ, param_struct *params) { int qual_num, nref_counts[5]; nref_counts[0] = 0; nref_counts[1] = 0; nref_counts[2] = 0; nref_counts[3] = 0; nref_counts[4] = 0; int max_nref = N; char nref = 0; //for(qual_num = 0; qual_num < depth; qual_num++) { for(qual_num = depth; qual_num--; ) { //if( snvmixSkipCall(calls,qual_num,params,bQ,mQ) ) { if( bQ != NULL && mQ != NULL && snvmixSkipCall(calls,qual_num,params,bQ,mQ) ) { calls[qual_num] = -1; } else { nref_counts[0]++; switch(calls[qual_num]) { case 'A': nref_counts[A]++; break; case 'C': nref_counts[C]++; break; case 'G': nref_counts[G]++; break; case 'T': nref_counts[T]++; break; case 1: case -1: nref_counts[0]--; break; default: fprintf(stderr,"ERROR: unknown call base\n"); exit(1); } } } if(nref_counts[0]) { max_nref = A; if(nref_counts[max_nref] < nref_counts[C]) max_nref = C; if(nref_counts[max_nref] < nref_counts[G]) max_nref = G; if(nref_counts[max_nref] < nref_counts[T]) max_nref = T; } else { //return -1; } //for(qual_num = 0; qual_num < depth; qual_num++) { for(qual_num = depth; qual_num--; ) { if(calls[qual_num] == 1) continue; if(calls[qual_num] != base_code[max_nref]) calls[qual_num] = -1; } return base_code[max_nref]; } /* int snvmixSkipCall Arguments: *calls : array built by snvmixGetCallString containing 1 : when a reference call was made -1 : when an invalid value is seen ('N', 'n', '*') [ACTG] : when a non-reference call was made qual_num : call number that is being evaluated *params : parameter structure, get thresholding data *bQ : base qualities observed for each call *mQ : map quality for each call Function: Evalates quality values in each of the filtering models Returns 1 if the calls[qual_num] needs to be filtered out Returns 0 otherwise */ inline int snvmixSkipCall(int *calls, int qual_num, param_struct *params, char *bQ, char *mQ) { if(calls[qual_num] == -1) return(1); if(params->filter_type == TYPE_mb) { if(bQ[qual_num] <= params->bQ || mQ[qual_num] <= params->mQ) return(1); } else if(params->filter_type == TYPE_b) { if(bQ[qual_num] <= params->bQ) return(1); } else { if(mQ[qual_num] <= params->mQ) return(1); if(params->filter_type == TYPE_m) { // Use mapping as surrogate bQ[qual_num] = mQ[qual_num]; // Make mapping one mQ[qual_num] = (char) PHRED_MAX; } else if(params->filter_type == TYPE_M) { // Mapping passed, make it one mQ[qual_num] = (char) PHRED_MAX; } else if(params->filter_type == TYPE_Mb) { // Nothing special here } else if(params->filter_type == TYPE_MB || params->filter_type == TYPE_SNVMix1) { if(bQ[qual_num] <= params->bQ) return(1); if(params->filter_type == TYPE_SNVMix1) { bQ[qual_num] = (char) PHRED_MAX; mQ[qual_num] = (char) PHRED_MAX; } } } return(0); } inline int snvmixSkipCall_alt(param_struct *params, int call, char bQ, char mQ) { if(call == -1) return(1); if(params->filter_type != TYPE_b && mQ <= params->mQ) return(1); switch(params->filter_type) { case TYPE_mb: if(bQ <= params->bQ || mQ <= params->mQ) return(1); break; case TYPE_b: if(bQ <= params->bQ) return(1); break; case TYPE_m: // Use mapping as surrogate bQ = mQ; // Make mapping one mQ = (char) PHRED_MAX; break; case TYPE_M: // Mapping passed, make it one mQ = (char) PHRED_MAX; break; case TYPE_Mb: // Nothing special here break; case TYPE_MB: case TYPE_SNVMix1: if(bQ <= params->bQ) return(1); if(params->filter_type == TYPE_SNVMix1) { bQ = (char) PHRED_MAX; mQ = (char) PHRED_MAX; } break; } return(0); } void resetParams(param_struct *params) { params->input = stdin; params->output = stdout; params->inputfile = NULL; params->outputfile = NULL; params->modelfile = NULL; params->fastafile = NULL; params->listfile = NULL; params->window = 1; params->filter_type = 0; params->filter_dup = 1; params->filter_chast = 1; params->train = 0; params->classify = 0; params->filter = 0; params->full = 0; params->input_type = S_PILEUP; // 0 = processed, 1 = maq pileup, 2 = sam pileup, 3 = bam file params->mu = NULL; params->pi = NULL; params->max_iter = 1000; params->bQ = 19; params->mQ = 19; params->debug = 0; params->states = 3; params->trainP.alpha = NULL; params->trainP.beta = NULL; params->trainP.delta = NULL; params->trainP.param_file = NULL; params->trainP.max_iter = 100; params->trainP.bQ = NULL; params->trainP.mQ = NULL; params->trainP.calls = NULL; params->window = 1; } void initSNVMix(int argc, char **argv, param_struct *params) { char c; int i; resetParams(params); while ((c = getopt (argc, argv, "hDTCFfi:o:m:t:p:q:Q:a:b:d:M:S:r:l:w:cR")) != -1) { switch (c) { case 'm': params->modelfile = optarg; break; case 'i': params->inputfile = optarg; break; case 'o': params->outputfile = optarg; break; // Bam file opts case 'r': params->fastafile = optarg; break; case 'l': params->listfile = optarg; break; case 'T': params->train = 1; break; case 'C': params->classify = 1; break; case 'F': params->filter = 1; break; case 'f': params->full = 1; break; case 'p': if(*optarg == 'q') { params->input_type = Q_PILEUP; } else if(*optarg == 'm') { params->input_type = M_PILEUP; } else if(*optarg == 's') { params->input_type = S_PILEUP; } else if(*optarg == 'b') { params->input_type = BAM_FILE; } else { fprintf(stderr,"ERROR: unknown pileup format %c\n",*optarg); exit(1); } break; case 't': if(strcmp("mb",optarg) == 0) params->filter_type = TYPE_mb; else if(strcmp("m",optarg) == 0) params->filter_type = TYPE_m; else if(strcmp("b",optarg) == 0) params->filter_type = TYPE_b; else if(strcmp("M",optarg) == 0) params->filter_type = TYPE_M; else if(strcmp("Mb",optarg) == 0) params->filter_type = TYPE_Mb; else if(strcmp("MB",optarg) == 0) params->filter_type = TYPE_MB; else if(strcmp("SNVMix1",optarg) == 0) params->filter_type = TYPE_SNVMix1; else { fprintf(stderr,"ERROR: filter type '%s' not recognized\n",optarg); exit(1); } break; case 'q': params->bQ = atoi(optarg); if( params->bQ < 0 || params->bQ >= PHRED_MAX ) { fprintf(stderr,"ERROR: quality threshold value Q%d out of range\n",params->bQ); exit(1); } break; case 'Q': params->mQ = atoi(optarg); if( params->mQ < 0 || params->mQ >= PHRED_MAX ) { fprintf(stderr,"ERROR: mapping threshold value Q%d out of range\n",params->mQ); exit(1); } break; case 'h': usage(argv[0]); break; case 'D': params->debug = 1; break; case 'M': params->trainP.param_file = optarg; break; case 'S': params->states = atoi(optarg); if( params->states < 3 ) { fprintf(stderr,"ERROR: state minimum is 3\n"); exit(1); } break; case 'w': params->window = atoi(optarg); if(params->window < 1) params->window = 1; break; case 'c': params->filter_chast = 0; break; case 'R': params->filter_dup = 0; break; default: fprintf(stderr,"Unknown option\n"); usage(argv[0]); break; } } /* Decide if we're going to train or classify */ if(params->filter) { params->train = 0; params->classify = 0; } if(params->train && params->classify) { fprintf(stderr,"ERROR: choose either train or classify\n"); exit(1); } else if(!params->train && !params->classify && !params->filter) { fprintf(stderr,"Train/Classify/Filter not selected, picking default: Classify\n"); params->classify = 1; } /* Set parameters */ allocateParameters(params); if( params->train ) setTrainingParameters(params); if( params->classify ) setClassificationParameters(params); /* Open input and output streams */ if( params->input_type == BAM_FILE ) { if( params->train ) { fprintf(stderr, "BAM input for training not yet implemented\n"); exit(1); } if( params->inputfile == NULL || strcmp(params->inputfile, "-") == 0) { fprintf(stderr, "ERROR: if '-p b' is chosen, input file has to be a bam file\n"); exit(1); } if( params->fastafile == NULL ) { fprintf(stderr, "ERROR: if '-p b' is chosen, reference fasta file is required (-r <ref.fa>)\n"); exit(1); } } else if( params->inputfile != NULL) { params->input = fopen(params->inputfile, "r"); if(params->input == NULL) { perror("ERROR: could not open input file"); exit(1); } } else { params->input = stdin; } if( params->outputfile != NULL ) { params->output = fopen(params->outputfile, "w"); if(params->output == NULL) { perror("ERROR: could not open output file"); exit(1); } } else { params->output = stdout; } } void allocateParameters(param_struct *params) { /* Allocate alpha */ if( (params->trainP.alpha = malloc(params->states * sizeof(long double))) == NULL) { perror("ERROR: could not allocate space for alpha\n"); exit(1); } /* Allocate beta */ if( (params->trainP.beta = malloc(params->states * sizeof(long double))) == NULL) { perror("ERROR: could not allocate space for beta\n"); exit(1); } /* Allocate delta*/ if( (params->trainP.delta = malloc(params->states * sizeof(long double))) == NULL) { perror("ERROR: could not allocate space for delta\n"); exit(1); } /* Allocate mu*/ if( (params->mu = malloc(params->states * sizeof(long double))) == NULL) { perror("ERROR: could not allocate space for mu\n"); exit(1); } /* Allocate pi*/ if( (params->pi = malloc(params->states * sizeof(long double))) == NULL) { perror("ERROR: could not allocate space for pi\n"); exit(1); } } void setTrainingParameters(param_struct *params) { if( params->trainP.param_file != NULL ) { readParamFile(params,'t'); } else { if(params->states != 3) { fprintf(stderr, "ERROR: if state space larger than 3 requested, parameters must be provided\n"); exit(1); } fprintf(stderr,"Training parameter file not given, using defaults\n"); params->trainP.alpha[0] = 1000; params->trainP.alpha[1] = 5000; params->trainP.alpha[2] = 1; params->trainP.beta[0] = 1; params->trainP.beta[1] = 5000; params->trainP.beta[2] = 1000; params->trainP.delta[0] = 10; params->trainP.delta[1] = 1; params->trainP.delta[2] = 1; } } void setClassificationParameters(param_struct *params) { int k; if( params->modelfile != NULL ) { readParamFile(params,'c'); } else { if(params->states != 3) { fprintf(stderr, "ERROR: if state space larger than 3 requested, model file must be provided\n"); exit(1); } params->mu[0] = 0.995905287891696078261816182930; params->mu[1] = 0.499569401000925172873223800707; params->mu[2] = 0.001002216846753116409260431219; params->pi[0] = 0.672519580755555956841362785781; params->pi[1] = 0.139342327124010650907237618412; params->pi[2] = 0.188138092120433392251399595807; fprintf(stderr,"Classification parameter file not given, using for mQ35 bQ10\n"); } fprintf(stderr,"Mu and pi values:\n"); for(k = 0; k < params->states; k++) fprintf(stderr,"\tmu[%d] = %Lf\tpi[%d] = %Lf\n", k, params->mu[k], k, params->pi[k]); fprintf(stderr,"\n"); } void readParamFile(param_struct *params, char type) { FILE *fptrPARAM; char *line = NULL, *param_name, *param, *param_ptr, *filename; size_t line_len = 0, read_len = 0; long double *target; int i; if(type == 't') { filename=params->trainP.param_file; } else if(type == 'c') { filename=params->modelfile; } fptrPARAM=fopen(filename, "r"); if(fptrPARAM == NULL) { fprintf(stderr, "ERROR: could not open parameter file %s\n", filename); exit(1); } #if defined __linux__ || defined _GNU_SOURCE while( (read_len = getline(&line,&line_len,fptrPARAM)) != -1 ) #endif #ifdef __APPLE__ while( (line = fgetln(fptrPARAM,&line_len)) != NULL ) #endif { line[line_len-1] = '\0'; target = NULL; param_name = strtok_r(line, " ", ¶m_ptr); if(type == 't') { if(strcmp("alpha", param_name) == 0) target = params->trainP.alpha; else if(strcmp("beta", param_name) == 0) target = params->trainP.beta; else if(strcmp("delta", param_name) == 0) target = params->trainP.delta; } else if(type == 'c') { if(strcmp("mu", param_name) == 0) target = params->mu; else if(strcmp("pi", param_name) == 0) target = params->pi; } if(target == NULL) { fprintf(stderr, "Unknown parameter %s, skipping\n", param_name); continue; } for(i = 0; i < params->states; i++) { param = strtok_r(NULL, " ", ¶m_ptr); if(param == NULL) { fprintf(stderr,"ERROR: missing value #%d for %s\n", i, param_name); exit(1); } target[i] = atof(param); fprintf(stderr, "DEBUG: %s[%d] = %Lf\n", param_name, i, target[i]); } } fclose(fptrPARAM); free(line); } void initPhred(long double *phredTable, int elem) { int i; for(i = 0; i < elem; i++) { phredTable[i] = 1-powl(10,(-(long double)i/10)); } phredTable[i] = 1; } void usage(char *selfname) { param_struct default_params; resetParams(&default_params); fprintf(stderr,"Syntax:\n\t%s\t-m <modelfile> [-i <infile>] [-o <outfile>] [-T | -C | -F] [-p < q | m | s >] [-t < mb | m | b | M | Mb | MB | SNVMix1>] [-q <#>] [-Q <#>] [-M <trainP file>] [-h]\n",selfname); fprintf(stderr,"\tRequired:\n"); fprintf(stderr,"\t-m <modelfile>\tmodel file, a line for mu and a line for pi, three\n"); fprintf(stderr,"\t\t\tspace-separated values each, like:\n"); fprintf(stderr,"\t\t\tmu 0.xxxxxxxx 0.xxxxxxxx 0.xxxxxxxx\n"); fprintf(stderr,"\t\t\tpi 0.xxxxxxxx 0.xxxxxxxx 0.xxxxxxxx\n"); fprintf(stderr,"\tOptional:\n"); fprintf(stderr,"\t-i <infile>\tquality pileup, from pileup2matlab.pl script (def: STDIN)\n"); fprintf(stderr,"\t-o <outfile>\twhere to put the results (def: STDOUT)\n"); fprintf(stderr,"\t-T | -C | -F\tTrain, Classify or Filter (def: Classify)\n"); fprintf(stderr,"\t-p q|m|s\tInput pileup format (def: s)\n\t\t\tq = quality\n\t\t\tm = MAQ\n\t\t\ts = SAMtools\n"); fprintf(stderr,"\t-t mb|m|b|M|Mb|MB|SNVMix1\n\t\t\tFilter type (def: mb)\n"); fprintf(stderr,"\t\t\tmb\tLowest between map and base quality\n"); fprintf(stderr,"\t\t\tm\tFilter on map, and use as surrogate for base quality\n"); fprintf(stderr,"\t\t\tb\tFilter on base quality, take map quality as 1\n"); fprintf(stderr,"\t\t\tM\tFilter on map quality but use only base quality\n"); fprintf(stderr,"\t\t\tMb\tFilter on map quality and use both map and base qualities\n"); fprintf(stderr,"\t\t\tMB\tFilter on map quality AND base quality\n"); fprintf(stderr,"\t\t\tSNVMix1\tFilter on base quality and map quality, afterwards consider them perfect\n"); fprintf(stderr,"\t-q #\t\tCutoff Phred value for Base Quality, anything <= this value is ignored (def: Q%d)\n",default_params.bQ); fprintf(stderr,"\t-Q #\t\tCutoff Phred value for Map Quality, anything <= this value is ignored (def: Q%d)\n",default_params.mQ); fprintf(stderr,"\n\tTraining parameters:\n"); fprintf(stderr,"\t-M <file>\tProvide a file containing training parameters\n"); fprintf(stderr,"\t\t\tValues are space-separated:\n"); fprintf(stderr,"\t\t\talpha # # #\n"); fprintf(stderr,"\t\t\tbeta # # #\n"); fprintf(stderr,"\t\t\tdelta # # #\n"); fprintf(stderr,"\n\t-h\t\tthis message\n"); fprintf(stderr,"\nImplementation: Rodrigo Goya, 2009. Version %s\n",VERSION); exit(0); } // Based on classify pileup, but reads the entire file to memory for training purposes. void snvmixGetTrainSet_pileup(param_struct *params) { //MAQ: // 1 554484 C 1752 @,,.,.,, @xxx @xxx xxxxx //SAM: // 1 554484 C 1752 ,,.,.,, xxx xxxx FILE *fptrIN; char *line = NULL; size_t line_len = 0, prev_len = 0; int read_len = 0; int col_num = 0; long int line_num = 0; char *col, *qual; char *col_sep = "\t", *col_str, *col_ptr; char *qual_sep = ",", *qual_str, *qual_ptr; char *chr, *pos, *ref, nref, call; char *bQ, *mQ; int *calls, *tmpQ, pos_int; int depth = 0, qual_num, maxp; int depth_allocated = 0; int trainset = 0; int trainset_allocated = 0; int nref_count = 0, ref_count = 0; int i, call_i; fptrIN = params->input; if( (params->trainP.bQ = malloc(START_QNUM * sizeof (unsigned char **))) == NULL) { perror("ERROR allocating initial space for train.bQ"); exit(1); } if( (params->trainP.mQ = malloc(START_QNUM * sizeof (unsigned char **))) == NULL) { perror("ERROR allocating initial space for train.mQ"); exit(1); } if( (params->trainP.calls = malloc(START_QNUM * sizeof (signed char **))) == NULL) { perror("ERROR allocating initial space for train.calls"); exit(1); } if( (params->trainP.depth = malloc(START_QNUM * sizeof (int *))) == NULL) { perror("ERROR allocating initial space for train.depth"); exit(1); } trainset_allocated = START_QNUM; if( (calls = malloc(START_QNUM * sizeof (int))) == NULL) { perror("ERROR allocating initial space for train.depth"); exit(1); } depth_allocated = START_QNUM; #if defined __linux__ || defined _GNU_SOURCE while( (read_len = getline(&line,&line_len,fptrIN)) != -1 ) #endif #ifdef __APPLE__ while( (line = fgetln(fptrIN,&line_len)) != NULL ) #endif { line[line_len-1] = '\0'; depth = 0; nref = 0; col_ptr = NULL; for(col_num = 0, col_str = line; ; col_num++, col_str = NULL) { col = strtok_r(col_str, "\t", &col_ptr); if(col == NULL) { break; } if(col_num == 0) { chr = col; } else if(col_num == 1) { pos = col; pos_int = strtol(pos, NULL, 10); } else if(col_num == 2) { ref = col; } else if(col_num == 3) { depth = atoi(col); if(depth > depth_allocated) { if( (calls = realloc(calls, sizeof (int) * (depth + START_QNUM))) == NULL) { perror("ERROR allocating space for calls"); exit(1); } depth_allocated = depth + START_QNUM; } } else if(col_num == 4) { if(params->input_type == M_PILEUP) col = col+sizeof(char); snvmixGetCallString(col, calls, depth, &nref); } else if(col_num == 5) { bQ = col; // If it's a MAQ pileup, we need to skip the @ sign if(params->input_type == M_PILEUP) bQ = bQ + sizeof(char); for(call_i = 0; call_i < depth; call_i++) bQ[call_i] = bQ[call_i]-33; } else if(col_num == 6) { mQ = col; // If it's a MAQ pileup, we need to skip the @ sign if(params->input_type == M_PILEUP) mQ = mQ + sizeof(char); for(call_i = 0; call_i < depth; call_i++) mQ[call_i] = mQ[call_i]-33; } } if(line_num >= trainset_allocated) { if( ( params->trainP.bQ = realloc(params->trainP.bQ, (line_num + START_QNUM) * sizeof (unsigned char *)) ) == NULL ) { perror("ERROR reallocating space for trainP.bQ"); exit(1); } if( ( params->trainP.mQ = realloc(params->trainP.mQ, (line_num + START_QNUM) * sizeof (unsigned char *)) ) == NULL ) { perror("ERROR reallocating space for trainP.mQ"); exit(1); } if( ( params->trainP.calls = realloc(params->trainP.calls, (line_num + START_QNUM) * sizeof (signed char *)) ) == NULL) { perror("ERROR reallocating space for trainP.calls"); exit(1); } if( ( params->trainP.depth = realloc(params->trainP.depth, (line_num + START_QNUM) * sizeof (int)) ) == NULL) { perror("ERROR reallocating space for trainP.depth"); exit(1); } trainset_allocated = line_num + START_QNUM; } nref = snvmixFilterCalls(calls,depth,bQ,mQ,params); nref_count = 0; ref_count = 0; for(qual_num = 0; qual_num < depth; qual_num++) { if(calls[qual_num] == -1) continue; if(calls[qual_num] == 1) ref_count++; if(calls[qual_num] == nref) nref_count++; } params->trainP.depth[line_num] = ref_count + nref_count; if( (params->trainP.bQ[line_num] = malloc(sizeof(unsigned char) * params->trainP.depth[line_num])) == NULL ) { perror("ERROR allocating space for trainP.bQ"); exit(1); } if( (params->trainP.mQ[line_num] = malloc(sizeof(unsigned char) * params->trainP.depth[line_num])) == NULL ) { perror("ERROR allocating space for trainP.mQ"); exit(1); } if( (params->trainP.calls[line_num] = malloc(sizeof(signed char) * params->trainP.depth[line_num])) == NULL ) { perror("ERROR allocating space for trainP.calls"); exit(1); } call_i = 0; for(qual_num = 0; qual_num < depth; qual_num++) { if(calls[qual_num] == -1) continue; params->trainP.calls[line_num][call_i] = (signed char) calls[qual_num]; params->trainP.bQ[line_num][call_i] = (unsigned char) bQ[qual_num]; params->trainP.mQ[line_num][call_i] = (unsigned char) mQ[qual_num]; call_i++; } if( params->trainP.depth[line_num] != call_i ) { fprintf(stderr, "ERROR: mismatch between trainP.depth and call_i\n"); exit(1); } line_num++; } params->trainP.len = line_num; params->trainP.len = line_num; fclose(fptrIN); free(line); free(calls); } // Use EM algorithm on a memory-located data set to train the parameters void snvmixTrain_pileup(param_struct *params) { int line_num = 0, call_i = 0, k = 0, states; int iter = 0; FILE *fptrOUT; long double phred[PHRED_MAX + 1]; long double bsum = 0, msum = 0, *b, *mu, *notmu, *pi, *log_pi, z = 0; long double **pG, **xr; long double *xrhat, *sum_pG; long double Y, not_Y, M; int N_sum; int strength; long double ll, prev_ll = 0; //initPhred(phred, PHRED_MAX+1); initPhred(phred, PHRED_MAX); states = params->states; // Initialize local values of parameters mu = params->mu; pi = params->pi; if( ( b = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_pileup] Could not allocate b\n"); exit(1); } if( ( notmu = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_pileup] Could not allocate notmu\n"); exit(1); } if( ( log_pi = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_pileup] Could not allocate log_pi\n"); exit(1); } if( ( xrhat = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_pileup] Could not allocate xrhat\n"); exit(1); } if( ( sum_pG = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_pileup] Could not allocate sum_pG\n"); exit(1); } snvmixGetTrainSet_pileup(params); if(params->debug) { for(line_num = 0; line_num < params->trainP.len; line_num++) { fprintf(stderr, "line_num: %d", line_num); for(call_i = 0; call_i < params->trainP.depth[line_num]; call_i++) { fprintf(stderr, "\t(%d,bQ%d,mQ%d)", params->trainP.calls[line_num][call_i], params->trainP.bQ[line_num][call_i], params->trainP.mQ[line_num][call_i]); } fprintf(stderr, "\n\n"); } } // Initialize mu and pi fprintf(stderr, "Initializing mu\n"); for(k = 0; k < states; k++) { params->mu[k] = (long double) params->trainP.alpha[k] / (params->trainP.alpha[k] + params->trainP.beta[k]); fprintf(stderr,"\talpha[%d] = %Lf,\tbeta[%d] = %Lf,\tmu[%d] = %Lf\n", k, params->trainP.alpha[k], k, params->trainP.beta[k], k, params->mu[k]); } fprintf(stderr, "Initializing pi\n"); z = (long double) params->trainP.delta[0] + params->trainP.delta[1] + params->trainP.delta[2]; if(!z) { z = 1; } for(k = 0; k < states; k++) { params->pi[k] = (long double) params->trainP.delta[k] / z; fprintf(stderr,"\tdelta[%d] = %Lf,\tpi[%d] = %Lf\n", k, params->trainP.delta[k], k, params->pi[k]); } strength = params->trainP.len; // Initialize matrices if( (pG = malloc(sizeof(long double *) * params->trainP.len)) == NULL) { perror("ERROR allocating initial space for pG"); exit(1); } if( (xr = malloc(sizeof(long double *) * params->trainP.len)) == NULL) { perror("ERROR allocating initial space for xr"); exit(1); } for(line_num = 0; line_num < params->trainP.len; line_num++) { // [states] cells for each line_num if( (pG[line_num] = malloc(sizeof(long double) * states)) == NULL) { perror("ERROR allocating space for pG"); exit(1); } if( (xr[line_num] = malloc(sizeof(long double) * states)) == NULL) { perror("ERROR allocating space for xr"); exit(1); } } for(iter = 0; iter < params->trainP.max_iter; iter++) { // Reset values for this iteration for(k = 0; k < states; k++) { notmu[k] = 1 - mu[k]; log_pi[k] = logl(pi[k]); sum_pG[k] = 0; xrhat[k] = 0; } N_sum = 0; ll = 0; // E step for(line_num = 0; line_num < params->trainP.len; line_num++) { if(params->trainP.depth == 0) continue; for(k = 0; k < states; k++) { xr[line_num][k] = 0; b[k] = 0; } for(call_i = 0; call_i < params->trainP.depth[line_num]; call_i++) { if(params->trainP.calls[line_num][call_i] == 1) { Y = phred[params->trainP.bQ[line_num][call_i]]; not_Y = 1-phred[params->trainP.bQ[line_num][call_i]]; } else { Y = (1-phred[params->trainP.bQ[line_num][call_i]])/3; not_Y = phred[params->trainP.bQ[line_num][call_i]] + 2*(1-phred[params->trainP.bQ[line_num][call_i]])/3; } M = phred[params->trainP.mQ[line_num][call_i]]; for(k = 0; k < states; k++) { b[k] = b[k] + logl( (1 - M) * 0.5 + M * (Y * mu[k] + not_Y * notmu[k]) ); xr[line_num][k] = xr[line_num][k] + Y; } } z = 0; for(k = 0; k < states; k++) { b[k] = expl(b[k] + log_pi[k]); z = z + b[k]; } if(!z) { z=1; } for(k = 0; k < states; k++) { pG[line_num][k] = b[k] / z; } ll = ll + logl(z); N_sum = N_sum + params->trainP.depth[line_num]; for(k = 0; k < states; k++) { sum_pG[k] = sum_pG[k] + pG[line_num][k]; xrhat[k] = xrhat[k] + xr[line_num][k] * pG[line_num][k]; } } // Check log likelihood if(iter == 0) prev_ll = ll; else if(ll <= prev_ll) break; prev_ll = ll; // M step z = 0; for(k = 0; k < states; k++) { z = z + sum_pG[k] + params->trainP.delta[k]; } if(!z) { z=1; } for(k = 0; k < states; k++) { // Update pi params->pi[k] = (sum_pG[k] + params->trainP.delta[k]) / z; // Update mu params->mu[k] = (xrhat[k] + params->trainP.alpha[k]*strength-1) / (N_sum + params->trainP.alpha[k]*strength + params->trainP.beta[k]*strength-2); } } if(params->modelfile == NULL) { fptrOUT = stdout; } else { fptrOUT = fopen(params->modelfile, "w"); if(fptrOUT == NULL) { perror("ERROR: could not open modelfile for writing, using STDOUT"); fptrOUT = stdout; } } fprintf(fptrOUT,"mu"); for(k = 0; k < states; k++) { fprintf(fptrOUT, " %0.30Lf", params->mu[k]); } fprintf(fptrOUT,"\n"); fprintf(fptrOUT,"pi"); for(k = 0; k < states; k++) { fprintf(fptrOUT, " %0.30Lf", params->pi[k]); } fprintf(fptrOUT,"\n"); /* free unused memory */ free(b); free(notmu); free(log_pi); free(xrhat); free(sum_pG); for(line_num = 0; line_num < params->trainP.len; line_num++) { // free [states] cells for each line_num free(pG[line_num]); free(xr[line_num]); } free(pG); free(xr); }