Previous changeset 1:abdbc8fe98dd (2014-08-14) Next changeset 3:997f5136985f (2014-08-14) |
Commit message:
Deleted selected files |
removed:
bwa-0.7.9a/Makefile bwa-0.7.9a/bwamem.c bwa-0.7.9a/bwamem.h bwa-0.7.9a/bwamem_extra.c bwa-0.7.9a/bwamem_pair.c |
b |
diff -r abdbc8fe98dd -r dfe9332138cf bwa-0.7.9a/Makefile --- a/bwa-0.7.9a/Makefile Thu Aug 14 04:44:51 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,79 +0,0 @@ -CC= gcc -#CC= clang --analyze -CFLAGS= -g -Wall -Wno-unused-function -O2 -WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS -AR= ar -DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC) -LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o -AOBJS= QSufSort.o bwt_gen.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ - is.o bwtindex.o bwape.o kopen.o pemerge.o \ - bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ - bwtsw2_chain.o fastmap.o bwtsw2_pair.o -PROG= bwa -INCLUDES= -LIBS= -lm -lz -lpthread -SUBDIRS= . - -.SUFFIXES:.c .o .cc - -.c.o: - $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@ - -all:$(PROG) - -bwa:libbwa.a $(AOBJS) main.o - $(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ -L. -lbwa $(LIBS) - -bwamem-lite:libbwa.a example.o - $(CC) $(CFLAGS) $(DFLAGS) example.o -o $@ -L. -lbwa $(LIBS) - -libbwa.a:$(LOBJS) - $(AR) -csru $@ $(LOBJS) - -clean: - rm -f gmon.out *.o a.out $(PROG) *~ *.a - -depend: - ( LC_ALL=C ; export LC_ALL; makedepend -Y -- $(CFLAGS) $(DFLAGS) -- *.c ) - -# DO NOT DELETE THIS LINE -- make depend depends on it. - -QSufSort.o: QSufSort.h -bamlite.o: bamlite.h malloc_wrap.h -bntseq.o: bntseq.h utils.h kseq.h malloc_wrap.h -bwa.o: bntseq.h bwa.h bwt.h ksw.h utils.h kstring.h malloc_wrap.h kseq.h -bwamem.o: kstring.h malloc_wrap.h bwamem.h bwt.h bntseq.h bwa.h ksw.h kvec.h -bwamem.o: ksort.h utils.h kbtree.h -bwamem_extra.o: bwa.h bntseq.h bwt.h bwamem.h kstring.h malloc_wrap.h -bwamem_pair.o: kstring.h malloc_wrap.h bwamem.h bwt.h bntseq.h bwa.h kvec.h -bwamem_pair.o: utils.h ksw.h -bwape.o: bwtaln.h bwt.h kvec.h malloc_wrap.h bntseq.h utils.h bwase.h bwa.h -bwape.o: ksw.h khash.h -bwase.o: bwase.h bntseq.h bwt.h bwtaln.h utils.h kstring.h malloc_wrap.h -bwase.o: bwa.h ksw.h -bwaseqio.o: bwtaln.h bwt.h utils.h bamlite.h malloc_wrap.h kseq.h -bwt.o: utils.h bwt.h kvec.h malloc_wrap.h -bwt_gen.o: QSufSort.h malloc_wrap.h -bwt_lite.o: bwt_lite.h malloc_wrap.h -bwtaln.o: bwtaln.h bwt.h bwtgap.h utils.h bwa.h bntseq.h malloc_wrap.h -bwtgap.o: bwtgap.h bwt.h bwtaln.h malloc_wrap.h -bwtindex.o: bntseq.h bwt.h utils.h malloc_wrap.h -bwtsw2_aux.o: bntseq.h bwt_lite.h utils.h bwtsw2.h bwt.h kstring.h -bwtsw2_aux.o: malloc_wrap.h bwa.h ksw.h kseq.h ksort.h -bwtsw2_chain.o: bwtsw2.h bntseq.h bwt_lite.h bwt.h malloc_wrap.h ksort.h -bwtsw2_core.o: bwt_lite.h bwtsw2.h bntseq.h bwt.h kvec.h malloc_wrap.h -bwtsw2_core.o: khash.h ksort.h -bwtsw2_main.o: bwt.h bwtsw2.h bntseq.h bwt_lite.h utils.h bwa.h -bwtsw2_pair.o: utils.h bwt.h bntseq.h bwtsw2.h bwt_lite.h kstring.h -bwtsw2_pair.o: malloc_wrap.h ksw.h -example.o: bwamem.h bwt.h bntseq.h bwa.h kseq.h malloc_wrap.h -fastmap.o: bwa.h bntseq.h bwt.h bwamem.h kvec.h malloc_wrap.h utils.h kseq.h -is.o: malloc_wrap.h -kopen.o: malloc_wrap.h -kstring.o: kstring.h malloc_wrap.h -ksw.o: ksw.h malloc_wrap.h -main.o: kstring.h malloc_wrap.h utils.h -malloc_wrap.o: malloc_wrap.h -pemerge.o: ksw.h kseq.h malloc_wrap.h kstring.h bwa.h bntseq.h bwt.h utils.h -test-extend.o: ksw.h -utils.o: utils.h ksort.h malloc_wrap.h kseq.h |
b |
diff -r abdbc8fe98dd -r dfe9332138cf bwa-0.7.9a/bwamem.c --- a/bwa-0.7.9a/bwamem.c Thu Aug 14 04:44:51 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
b'@@ -1,1190 +0,0 @@\n-#include <stdlib.h>\n-#include <string.h>\n-#include <stdio.h>\n-#include <assert.h>\n-#include <math.h>\n-#ifdef HAVE_PTHREAD\n-#include <pthread.h>\n-#endif\n-\n-#include "kstring.h"\n-#include "bwamem.h"\n-#include "bntseq.h"\n-#include "ksw.h"\n-#include "kvec.h"\n-#include "ksort.h"\n-#include "utils.h"\n-\n-#ifdef USE_MALLOC_WRAPPERS\n-# include "malloc_wrap.h"\n-#endif\n-\n-/* Theory on probability and scoring *ungapped* alignment\n- *\n- * s\'(a,b) = log[P(b|a)/P(b)] = log[4P(b|a)], assuming uniform base distribution\n- * s\'(a,a) = log(4), s\'(a,b) = log(4e/3), where e is the error rate\n- *\n- * Scale s\'(a,b) to s(a,a) s.t. s(a,a)=x. Then s(a,b) = x*s\'(a,b)/log(4), or conversely: s\'(a,b)=s(a,b)*log(4)/x\n- *\n- * If the matching score is x and mismatch penalty is -y, we can compute error rate e:\n- * e = .75 * exp[-log(4) * y/x]\n- *\n- * log P(seq) = \\sum_i log P(b_i|a_i) = \\sum_i {s\'(a,b) - log(4)}\n- * = \\sum_i { s(a,b)*log(4)/x - log(4) } = log(4) * (S/x - l)\n- *\n- * where S=\\sum_i s(a,b) is the alignment score. Converting to the phred scale:\n- * Q(seq) = -10/log(10) * log P(seq) = 10*log(4)/log(10) * (l - S/x) = 6.02 * (l - S/x)\n- *\n- *\n- * Gap open (zero gap): q\' = log[P(gap-open)], r\' = log[P(gap-ext)] (see Durbin et al. (1998) Section 4.1)\n- * Then q = x*log[P(gap-open)]/log(4), r = x*log[P(gap-ext)]/log(4)\n- *\n- * When there are gaps, l should be the length of alignment matches (i.e. the M operator in CIGAR)\n- */\n-\n-static const bntseq_t *global_bns = 0; // for debugging only\n-\n-mem_opt_t *mem_opt_init()\n-{\n-\tmem_opt_t *o;\n-\to = calloc(1, sizeof(mem_opt_t));\n-\to->flag = 0;\n-\to->a = 1; o->b = 4;\n-\to->o_del = o->o_ins = 6;\n-\to->e_del = o->e_ins = 1;\n-\to->w = 100;\n-\to->T = 30;\n-\to->zdrop = 100;\n-\to->pen_unpaired = 17;\n-\to->pen_clip5 = o->pen_clip3 = 5;\n-\to->min_seed_len = 19;\n-\to->split_width = 10;\n-\to->max_occ = 500;\n-\to->max_chain_gap = 10000;\n-\to->max_ins = 10000;\n-\to->mask_level = 0.50;\n-\to->drop_ratio = 0.50;\n-\to->XA_drop_ratio = 0.80;\n-\to->split_factor = 1.5;\n-\to->chunk_size = 10000000;\n-\to->n_threads = 1;\n-\to->max_hits = 5;\n-\to->max_matesw = 50;\n-\to->mask_level_redun = 0.95;\n-\to->min_chain_weight = 0;\n-\to->max_chain_extend = 1<<30;\n-\to->mapQ_coef_len = 50; o->mapQ_coef_fac = log(o->mapQ_coef_len);\n-\tbwa_fill_scmat(o->a, o->b, o->mat);\n-\treturn o;\n-}\n-\n-/***************************\n- * Collection SA invervals *\n- ***************************/\n-\n-#define intv_lt(a, b) ((a).info < (b).info)\n-KSORT_INIT(mem_intv, bwtintv_t, intv_lt)\n-\n-typedef struct {\n-\tbwtintv_v mem, mem1, *tmpv[2];\n-} smem_aux_t;\n-\n-static smem_aux_t *smem_aux_init()\n-{\n-\tsmem_aux_t *a;\n-\ta = calloc(1, sizeof(smem_aux_t));\n-\ta->tmpv[0] = calloc(1, sizeof(bwtintv_v));\n-\ta->tmpv[1] = calloc(1, sizeof(bwtintv_v));\n-\treturn a;\n-}\n-\n-static void smem_aux_destroy(smem_aux_t *a)\n-{\t\n-\tfree(a->tmpv[0]->a); free(a->tmpv[0]);\n-\tfree(a->tmpv[1]->a); free(a->tmpv[1]);\n-\tfree(a->mem.a); free(a->mem1.a);\n-\tfree(a);\n-}\n-\n-static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq, smem_aux_t *a)\n-{\n-\tint i, k, x = 0, old_n;\n-\tint start_width = (opt->flag & MEM_F_SELF_OVLP)? 2 : 1;\n-\tint split_len = (int)(opt->min_seed_len * opt->split_factor + .499);\n-\ta->mem.n = 0;\n-\t// first pass: find all SMEMs\n-\twhile (x < len) {\n-\t\tif (seq[x] < 4) {\n-\t\t\tx = bwt_smem1(bwt, len, seq, x, start_width, &a->mem1, a->tmpv);\n-\t\t\tfor (i = 0; i < a->mem1.n; ++i) {\n-\t\t\t\tbwtintv_t *p = &a->mem1.a[i];\n-\t\t\t\tint slen = (uint32_t)p->info - (p->info>>32); // seed length\n-\t\t\t\tif (slen >= opt->min_seed_len)\n-\t\t\t\t\tkv_push(bwtintv_t, a->mem, *p);\n-\t\t\t}\n-\t\t} else ++x;\n-\t}\n-\t// second pass: find MEMs inside a long SMEM\n-\told_n = a->mem.n;\n-\tfor (k = 0; k < old_n; ++k) {\n-\t\tbwtintv_t *p = &a->mem.a[k];\n-\t\tint start = p->info>>32, end = (int32_t)p->info;\n-\t\tif (end - start < split_len || p->x[2] > opt->split_width) continue;\n-\t\tbwt_smem1(bwt, len, seq, (start + end)>>1, p->x[2]+1, &a->mem1, a->tmpv);\n-\t\tfor (i = 0; i < a->mem1.n; ++i)\n-\t\t\tif ((a->mem1.a[i].inf'..b'or 5\'-end clipping\n-\t\t\ta.cigar[0] = clip5<<4 | 3;\n-\t\t\t++a.n_cigar;\n-\t\t}\n-\t\tif (clip3) {\n-\t\t\tmemmove(a.cigar + a.n_cigar + 1, a.cigar + a.n_cigar, l_MD); // make room for 3\'-end clipping\n-\t\t\ta.cigar[a.n_cigar++] = clip3<<4 | 3;\n-\t\t}\n-\t}\n-\ta.rid = bns_pos2rid(bns, pos);\n-\tassert(a.rid == ar->rid);\n-\ta.pos = pos - bns->anns[a.rid].offset;\n-\ta.score = ar->score; a.sub = ar->sub > ar->csub? ar->sub : ar->csub;\n-\tfree(query);\n-\treturn a;\n-}\n-\n-typedef struct {\n-\tconst mem_opt_t *opt;\n-\tconst bwt_t *bwt;\n-\tconst bntseq_t *bns;\n-\tconst uint8_t *pac;\n-\tconst mem_pestat_t *pes;\n-\tsmem_aux_t **aux;\n-\tbseq1_t *seqs;\n-\tmem_alnreg_v *regs;\n-\tint64_t n_processed;\n-} worker_t;\n-\n-static void worker1(void *data, int i, int tid)\n-{\n-\tworker_t *w = (worker_t*)data;\n-\tif (!(w->opt->flag&MEM_F_PE)) {\n-\t\tif (bwa_verbose >= 4) printf("=====> Processing read \'%s\' <=====\\n", w->seqs[i].name);\n-\t\tw->regs[i] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i].l_seq, w->seqs[i].seq, w->aux[tid]);\n-\t} else {\n-\t\tif (bwa_verbose >= 4) printf("=====> Processing read \'%s\'/1 <=====\\n", w->seqs[i<<1|0].name);\n-\t\tw->regs[i<<1|0] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|0].l_seq, w->seqs[i<<1|0].seq, w->aux[tid]);\n-\t\tif (bwa_verbose >= 4) printf("=====> Processing read \'%s\'/2 <=====\\n", w->seqs[i<<1|1].name);\n-\t\tw->regs[i<<1|1] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|1].l_seq, w->seqs[i<<1|1].seq, w->aux[tid]);\n-\t}\n-}\n-\n-static void worker2(void *data, int i, int tid)\n-{\n-\textern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]);\n-\textern void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a);\n-\tworker_t *w = (worker_t*)data;\n-\tif (!(w->opt->flag&MEM_F_PE)) {\n-\t\tif (bwa_verbose >= 4) printf("=====> Finalizing read \'%s\' <=====\\n", w->seqs[i].name);\n-\t\tif (w->opt->flag & MEM_F_ALN_REG) {\n-\t\t\tmem_reg2ovlp(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i]);\n-\t\t} else {\n-\t\t\tmem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a, w->n_processed + i);\n-\t\t\tmem_reg2sam_se(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0);\n-\t\t}\n-\t\tfree(w->regs[i].a);\n-\t} else {\n-\t\tif (bwa_verbose >= 4) printf("=====> Finalizing read pair \'%s\' <=====\\n", w->seqs[i<<1|0].name);\n-\t\tmem_sam_pe(w->opt, w->bns, w->pac, w->pes, (w->n_processed>>1) + i, &w->seqs[i<<1], &w->regs[i<<1]);\n-\t\tfree(w->regs[i<<1|0].a); free(w->regs[i<<1|1].a);\n-\t}\n-}\n-\n-void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0)\n-{\n-\textern void kt_for(int n_threads, void (*func)(void*,int,int), void *data, int n);\n-\tworker_t w;\n-\tmem_alnreg_v *regs;\n-\tmem_pestat_t pes[4];\n-\tdouble ctime, rtime;\n-\tint i;\n-\n-\tctime = cputime(); rtime = realtime();\n-\tglobal_bns = bns;\n-\tregs = malloc(n * sizeof(mem_alnreg_v));\n-\tw.opt = opt; w.bwt = bwt; w.bns = bns; w.pac = pac;\n-\tw.seqs = seqs; w.regs = regs; w.n_processed = n_processed;\n-\tw.pes = &pes[0];\n-\tw.aux = malloc(opt->n_threads * sizeof(smem_aux_t));\n-\tfor (i = 0; i < opt->n_threads; ++i)\n-\t\tw.aux[i] = smem_aux_init();\n-\tkt_for(opt->n_threads, worker1, &w, (opt->flag&MEM_F_PE)? n>>1 : n); // find mapping positions\n-\tfor (i = 0; i < opt->n_threads; ++i)\n-\t\tsmem_aux_destroy(w.aux[i]);\n-\tfree(w.aux);\n-\tif (opt->flag&MEM_F_PE) { // infer insert sizes if not provided\n-\t\tif (pes0) memcpy(pes, pes0, 4 * sizeof(mem_pestat_t)); // if pes0 != NULL, set the insert-size distribution as pes0\n-\t\telse mem_pestat(opt, bns->l_pac, n, regs, pes); // otherwise, infer the insert size distribution from data\n-\t}\n-\tkt_for(opt->n_threads, worker2, &w, (opt->flag&MEM_F_PE)? n>>1 : n); // generate alignment\n-\tfree(regs);\n-\tif (bwa_verbose >= 3)\n-\t\tfprintf(stderr, "[M::%s] Processed %d reads in %.3f CPU sec, %.3f real sec\\n", __func__, n, cputime() - ctime, realtime() - rtime);\n-}\n' |
b |
diff -r abdbc8fe98dd -r dfe9332138cf bwa-0.7.9a/bwamem.h --- a/bwa-0.7.9a/bwamem.h Thu Aug 14 04:44:51 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
@@ -1,179 +0,0 @@ -#ifndef BWAMEM_H_ -#define BWAMEM_H_ - -#include "bwt.h" -#include "bntseq.h" -#include "bwa.h" - -#define MEM_MAPQ_COEF 30.0 -#define MEM_MAPQ_MAX 60 - -struct __smem_i; -typedef struct __smem_i smem_i; - -#define MEM_F_PE 0x2 -#define MEM_F_NOPAIRING 0x4 -#define MEM_F_ALL 0x8 -#define MEM_F_NO_MULTI 0x10 -#define MEM_F_NO_RESCUE 0x20 -#define MEM_F_SELF_OVLP 0x40 -#define MEM_F_ALN_REG 0x80 -#define MEM_F_SOFTCLIP 0x200 - -typedef struct { - int a, b; // match score and mismatch penalty - int o_del, e_del; - int o_ins, e_ins; - int pen_unpaired; // phred-scaled penalty for unpaired reads - int pen_clip5,pen_clip3;// clipping penalty. This score is not deducted from the DP score. - int w; // band width - int zdrop; // Z-dropoff - - int T; // output score threshold; only affecting output - int flag; // see MEM_F_* macros - int min_seed_len; // minimum seed length - int min_chain_weight; - int max_chain_extend; - float split_factor; // split into a seed if MEM is longer than min_seed_len*split_factor - int split_width; // split into a seed if its occurence is smaller than this value - int max_occ; // skip a seed if its occurence is larger than this value - int max_chain_gap; // do not chain seed if it is max_chain_gap-bp away from the closest seed - int n_threads; // number of threads - int chunk_size; // process chunk_size-bp sequences in a batch - float mask_level; // regard a hit as redundant if the overlap with another better hit is over mask_level times the min length of the two hits - float drop_ratio; // drop a chain if its seed coverage is below drop_ratio times the seed coverage of a better chain overlapping with the small chain - float XA_drop_ratio; // when counting hits for the XA tag, ignore alignments with score < XA_drop_ratio * max_score; only effective for the XA tag - float mask_level_redun; - float mapQ_coef_len; - int mapQ_coef_fac; - int max_ins; // when estimating insert size distribution, skip pairs with insert longer than this value - int max_matesw; // perform maximally max_matesw rounds of mate-SW for each end - int max_hits; // if there are max_hits or fewer, output them all - int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset -} mem_opt_t; - -typedef struct { - int64_t rb, re; // [rb,re): reference sequence in the alignment - int qb, qe; // [qb,qe): query sequence in the alignment - int rid; // reference seq ID - int score; // best local SW score - int truesc; // actual score corresponding to the aligned region; possibly smaller than $score - int sub; // 2nd best SW score - int csub; // SW score of a tandem hit - int sub_n; // approximate number of suboptimal hits - int w; // actual band width used in extension - int seedcov; // length of regions coverged by seeds - int secondary; // index of the parent hit shadowing the current hit; <0 if primary - int seedlen0; // length of the starting seed - int n_comp; // number of sub-alignments chained together - float frac_rep; - uint64_t hash; -} mem_alnreg_t; - -typedef struct { size_t n, m; mem_alnreg_t *a; } mem_alnreg_v; - -typedef struct { - int low, high; // lower and upper bounds within which a read pair is considered to be properly paired - int failed; // non-zero if the orientation is not supported by sufficient data - double avg, std; // mean and stddev of the insert size distribution -} mem_pestat_t; - -typedef struct { // This struct is only used for the convenience of API. - int64_t pos; // forward strand 5'-end mapping position - int rid; // reference sequence index in bntseq_t; <0 for unmapped - int flag; // extra flag - uint32_t is_rev:1, mapq:8, NM:23; // is_rev: whether on the reverse strand; mapq: mapping quality; NM: edit distance - int n_cigar; // number of CIGAR operations - uint32_t *cigar; // CIGAR in the BAM encoding: opLen<<4|op; op to integer mapping: MIDSH=>01234 - char *XA; // alternative mappings - - int score, sub; -} mem_aln_t; - -#ifdef __cplusplus -extern "C" { -#endif - - smem_i *smem_itr_init(const bwt_t *bwt); - void smem_itr_destroy(smem_i *itr); - void smem_set_query(smem_i *itr, int len, const uint8_t *query); - const bwtintv_v *smem_next(smem_i *itr); - - mem_opt_t *mem_opt_init(void); - void mem_fill_scmat(int a, int b, int8_t mat[25]); - - /** - * Align a batch of sequences and generate the alignments in the SAM format - * - * This routine requires $seqs[i].{l_seq,seq,name} and write $seqs[i].sam. - * Note that $seqs[i].sam may consist of several SAM lines if the - * corresponding sequence has multiple primary hits. - * - * In the paired-end mode (i.e. MEM_F_PE is set in $opt->flag), query - * sequences must be interleaved: $n must be an even number and the 2i-th - * sequence and the (2i+1)-th sequence constitute a read pair. In this - * mode, there should be enough (typically >50) unique pairs for the - * routine to infer the orientation and insert size. - * - * @param opt alignment parameters - * @param bwt FM-index of the reference sequence - * @param bns Information of the reference - * @param pac 2-bit encoded reference - * @param n number of query sequences - * @param seqs query sequences; $seqs[i].seq/sam to be modified after the call - * @param pes0 insert-size info; if NULL, infer from data; if not NULL, it should be an array with 4 elements, - * corresponding to each FF, FR, RF and RR orientation. See mem_pestat() for more info. - */ - void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0); - - /** - * Find the aligned regions for one query sequence - * - * Note that this routine does not generate CIGAR. CIGAR should be - * generated later by mem_reg2aln() below. - * - * @param opt alignment parameters - * @param bwt FM-index of the reference sequence - * @param bns Information of the reference - * @param pac 2-bit encoded reference - * @param l_seq length of query sequence - * @param seq query sequence - * - * @return list of aligned regions. - */ - mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq); - - /** - * Generate CIGAR and forward-strand position from alignment region - * - * @param opt alignment parameters - * @param bns Information of the reference - * @param pac 2-bit encoded reference - * @param l_seq length of query sequence - * @param seq query sequence - * @param ar one alignment region - * - * @return CIGAR, strand, mapping quality and forward-strand position - */ - mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq, const mem_alnreg_t *ar); - mem_aln_t mem_reg2aln2(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq, const mem_alnreg_t *ar, const char *name); - - /** - * Infer the insert size distribution from interleaved alignment regions - * - * This function can be called after mem_align1(), as long as paired-end - * reads are properly interleaved. - * - * @param opt alignment parameters - * @param l_pac length of concatenated reference sequence - * @param n number of query sequences; must be an even number - * @param regs region array of size $n; 2i-th and (2i+1)-th elements constitute a pair - * @param pes inferred insert size distribution (output) - */ - void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4]); - -#ifdef __cplusplus -} -#endif - -#endif |
b |
diff -r abdbc8fe98dd -r dfe9332138cf bwa-0.7.9a/bwamem_extra.c --- a/bwa-0.7.9a/bwamem_extra.c Thu Aug 14 04:44:51 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
@@ -1,147 +0,0 @@ -#include "bwa.h" -#include "bwamem.h" -#include "bntseq.h" -#include "kstring.h" - -/*************************** - * SMEM iterator interface * - ***************************/ - -struct __smem_i { - const bwt_t *bwt; - const uint8_t *query; - int start, len; - bwtintv_v *matches; // matches; to be returned by smem_next() - bwtintv_v *sub; // sub-matches inside the longest match; temporary - bwtintv_v *tmpvec[2]; // temporary arrays -}; - -smem_i *smem_itr_init(const bwt_t *bwt) -{ - smem_i *itr; - itr = calloc(1, sizeof(smem_i)); - itr->bwt = bwt; - itr->tmpvec[0] = calloc(1, sizeof(bwtintv_v)); - itr->tmpvec[1] = calloc(1, sizeof(bwtintv_v)); - itr->matches = calloc(1, sizeof(bwtintv_v)); - itr->sub = calloc(1, sizeof(bwtintv_v)); - return itr; -} - -void smem_itr_destroy(smem_i *itr) -{ - free(itr->tmpvec[0]->a); free(itr->tmpvec[0]); - free(itr->tmpvec[1]->a); free(itr->tmpvec[1]); - free(itr->matches->a); free(itr->matches); - free(itr->sub->a); free(itr->sub); - free(itr); -} - -void smem_set_query(smem_i *itr, int len, const uint8_t *query) -{ - itr->query = query; - itr->start = 0; - itr->len = len; -} - -const bwtintv_v *smem_next(smem_i *itr) -{ - int i, max, max_i, ori_start; - itr->tmpvec[0]->n = itr->tmpvec[1]->n = itr->matches->n = itr->sub->n = 0; - if (itr->start >= itr->len || itr->start < 0) return 0; - while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases - if (itr->start == itr->len) return 0; - ori_start = itr->start; - itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, ori_start, 1, itr->matches, itr->tmpvec); // search for SMEM - if (itr->matches->n == 0) return itr->matches; // well, in theory, we should never come here - for (i = max = 0, max_i = 0; i < itr->matches->n; ++i) { // look for the longest match - bwtintv_t *p = &itr->matches->a[i]; - int len = (uint32_t)p->info - (p->info>>32); - if (max < len) max = len, max_i = i; - } - return itr->matches; -} - -/*********************** - *** Extra functions *** - ***********************/ - -mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq_) -{ // the difference from mem_align1_core() is that this routine: 1) calls mem_mark_primary_se(); 2) does not modify the input sequence - extern mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq, void *buf); - extern void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id); - mem_alnreg_v ar; - char *seq; - seq = malloc(l_seq); - memcpy(seq, seq_, l_seq); // makes a copy of seq_ - ar = mem_align1_core(opt, bwt, bns, pac, l_seq, seq, 0); - mem_mark_primary_se(opt, ar.n, ar.a, lrand48()); - free(seq); - return ar; -} - -void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a) -{ - int i; - kstring_t str = {0,0,0}; - for (i = 0; i < a->n; ++i) { - const mem_alnreg_t *p = &a->a[i]; - int is_rev, rid, qb = p->qb, qe = p->qe; - int64_t pos, rb = p->rb, re = p->re; - pos = bns_depos(bns, rb < bns->l_pac? rb : re - 1, &is_rev); - rid = bns_pos2rid(bns, pos); - assert(rid == p->rid); - pos -= bns->anns[rid].offset; - kputs(s->name, &str); kputc('\t', &str); - kputw(s->l_seq, &str); kputc('\t', &str); - if (is_rev) qb ^= qe, qe ^= qb, qb ^= qe; // swap - kputw(qb, &str); kputc('\t', &str); kputw(qe, &str); kputc('\t', &str); - kputs(bns->anns[rid].name, &str); kputc('\t', &str); - kputw(bns->anns[rid].len, &str); kputc('\t', &str); - kputw(pos, &str); kputc('\t', &str); kputw(pos + (re - rb), &str); kputc('\t', &str); - ksprintf(&str, "%.3f", (double)p->truesc / opt->a / (qe - qb > re - rb? qe - qb : re - rb)); - kputc('\n', &str); - } - s->sam = str.s; -} - -// Okay, returning strings is bad, but this has happened a lot elsewhere. If I have time, I need serious code cleanup. -char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_alnreg_v *a, int l_query, const char *query) // ONLY work after mem_mark_primary_se() -{ - int i, k, *cnt, tot; - kstring_t *aln = 0; - char **XA = 0; - - cnt = calloc(a->n, sizeof(int)); - for (i = 0, tot = 0; i < a->n; ++i) { - int j = a->a[i].secondary; - if (j >= 0 && a->a[i].score >= a->a[j].score * opt->XA_drop_ratio) - ++cnt[j], ++tot; - } - if (tot == 0) goto end_gen_alt; - aln = calloc(a->n, sizeof(kstring_t)); - for (i = 0; i < a->n; ++i) { - mem_aln_t t; - int j = a->a[i].secondary; - if (j < 0 || a->a[i].score < a->a[j].score * opt->XA_drop_ratio) continue; // we don't process the primary alignments as they will be converted to SAM later - if (cnt[j] > opt->max_hits) continue; - t = mem_reg2aln(opt, bns, pac, l_query, query, &a->a[i]); - kputs(bns->anns[t.rid].name, &aln[j]); - kputc(',', &aln[j]); kputc("+-"[t.is_rev], &aln[j]); kputl(t.pos + 1, &aln[j]); - kputc(',', &aln[j]); - for (k = 0; k < t.n_cigar; ++k) { - kputw(t.cigar[k]>>4, &aln[j]); - kputc("MIDSHN"[t.cigar[k]&0xf], &aln[j]); - } - kputc(',', &aln[j]); kputw(t.NM, &aln[j]); - kputc(';', &aln[j]); - free(t.cigar); - } - XA = calloc(a->n, sizeof(char*)); - for (k = 0; k < a->n; ++k) - XA[k] = aln[k].s; - -end_gen_alt: - free(cnt); free(aln); - return XA; -} |
b |
diff -r abdbc8fe98dd -r dfe9332138cf bwa-0.7.9a/bwamem_pair.c --- a/bwa-0.7.9a/bwamem_pair.c Thu Aug 14 04:44:51 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
b'@@ -1,362 +0,0 @@\n-#include <stdlib.h>\n-#include <string.h>\n-#include <stdio.h>\n-#include <math.h>\n-#include "kstring.h"\n-#include "bwamem.h"\n-#include "kvec.h"\n-#include "utils.h"\n-#include "ksw.h"\n-\n-#ifdef USE_MALLOC_WRAPPERS\n-# include "malloc_wrap.h"\n-#endif\n-\n-\n-#define MIN_RATIO 0.8\n-#define MIN_DIR_CNT 10\n-#define MIN_DIR_RATIO 0.05\n-#define OUTLIER_BOUND 2.0\n-#define MAPPING_BOUND 3.0\n-#define MAX_STDDEV 4.0\n-\n-static inline int mem_infer_dir(int64_t l_pac, int64_t b1, int64_t b2, int64_t *dist)\n-{\n-\tint64_t p2;\n-\tint r1 = (b1 >= l_pac), r2 = (b2 >= l_pac);\n-\tp2 = r1 == r2? b2 : (l_pac<<1) - 1 - b2; // p2 is the coordinate of read 2 on the read 1 strand\n-\t*dist = p2 > b1? p2 - b1 : b1 - p2;\n-\treturn (r1 == r2? 0 : 1) ^ (p2 > b1? 0 : 3);\n-}\n-\n-static int cal_sub(const mem_opt_t *opt, mem_alnreg_v *r)\n-{\n-\tint j;\n-\tfor (j = 1; j < r->n; ++j) { // choose unique alignment\n-\t\tint b_max = r->a[j].qb > r->a[0].qb? r->a[j].qb : r->a[0].qb;\n-\t\tint e_min = r->a[j].qe < r->a[0].qe? r->a[j].qe : r->a[0].qe;\n-\t\tif (e_min > b_max) { // have overlap\n-\t\t\tint min_l = r->a[j].qe - r->a[j].qb < r->a[0].qe - r->a[0].qb? r->a[j].qe - r->a[j].qb : r->a[0].qe - r->a[0].qb;\n-\t\t\tif (e_min - b_max >= min_l * opt->mask_level) break; // significant overlap\n-\t\t}\n-\t}\n-\treturn j < r->n? r->a[j].score : opt->min_seed_len * opt->a;\n-}\n-\n-void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4])\n-{\n-\tint i, d, max;\n-\tuint64_v isize[4];\n-\tmemset(pes, 0, 4 * sizeof(mem_pestat_t));\n-\tmemset(isize, 0, sizeof(kvec_t(int)) * 4);\n-\tfor (i = 0; i < n>>1; ++i) {\n-\t\tint dir;\n-\t\tint64_t is;\n-\t\tmem_alnreg_v *r[2];\n-\t\tr[0] = (mem_alnreg_v*)®s[i<<1|0];\n-\t\tr[1] = (mem_alnreg_v*)®s[i<<1|1];\n-\t\tif (r[0]->n == 0 || r[1]->n == 0) continue;\n-\t\tif (cal_sub(opt, r[0]) > MIN_RATIO * r[0]->a[0].score) continue;\n-\t\tif (cal_sub(opt, r[1]) > MIN_RATIO * r[1]->a[0].score) continue;\n-\t\tif (r[0]->a[0].rid != r[1]->a[0].rid) continue; // not on the same chr\n-\t\tdir = mem_infer_dir(l_pac, r[0]->a[0].rb, r[1]->a[0].rb, &is);\n-\t\tif (is && is <= opt->max_ins) kv_push(uint64_t, isize[dir], is);\n-\t}\n-\tif (bwa_verbose >= 3) fprintf(stderr, "[M::%s] # candidate unique pairs for (FF, FR, RF, RR): (%ld, %ld, %ld, %ld)\\n", __func__, isize[0].n, isize[1].n, isize[2].n, isize[3].n);\n-\tfor (d = 0; d < 4; ++d) { // TODO: this block is nearly identical to the one in bwtsw2_pair.c. It would be better to merge these two.\n-\t\tmem_pestat_t *r = &pes[d];\n-\t\tuint64_v *q = &isize[d];\n-\t\tint p25, p50, p75, x;\n-\t\tif (q->n < MIN_DIR_CNT) {\n-\t\t\tfprintf(stderr, "[M::%s] skip orientation %c%c as there are not enough pairs\\n", __func__, "FR"[d>>1&1], "FR"[d&1]);\n-\t\t\tr->failed = 1;\n-\t\t\tcontinue;\n-\t\t} else fprintf(stderr, "[M::%s] analyzing insert size distribution for orientation %c%c...\\n", __func__, "FR"[d>>1&1], "FR"[d&1]);\n-\t\tks_introsort_64(q->n, q->a);\n-\t\tp25 = q->a[(int)(.25 * q->n + .499)];\n-\t\tp50 = q->a[(int)(.50 * q->n + .499)];\n-\t\tp75 = q->a[(int)(.75 * q->n + .499)];\n-\t\tr->low = (int)(p25 - OUTLIER_BOUND * (p75 - p25) + .499);\n-\t\tif (r->low < 1) r->low = 1;\n-\t\tr->high = (int)(p75 + OUTLIER_BOUND * (p75 - p25) + .499);\n-\t\tfprintf(stderr, "[M::%s] (25, 50, 75) percentile: (%d, %d, %d)\\n", __func__, p25, p50, p75);\n-\t\tfprintf(stderr, "[M::%s] low and high boundaries for computing mean and std.dev: (%d, %d)\\n", __func__, r->low, r->high);\n-\t\tfor (i = x = 0, r->avg = 0; i < q->n; ++i)\n-\t\t\tif (q->a[i] >= r->low && q->a[i] <= r->high)\n-\t\t\t\tr->avg += q->a[i], ++x;\n-\t\tr->avg /= x;\n-\t\tfor (i = 0, r->std = 0; i < q->n; ++i)\n-\t\t\tif (q->a[i] >= r->low && q->a[i] <= r->high)\n-\t\t\t\tr->std += (q->a[i] - r->avg) * (q->a[i] - r->avg);\n-\t\tr->std = sqrt(r->std / x);\n-\t\tfprintf(stderr, "[M::%s] mean and std.dev: (%.2f, %.2f)\\n", __func__, r->avg, r->std);\n-\t\tr->low = (int)(p25 - MAPPING_BOUND * (p75 - p25) + .499);\n-\t\tr->high = (int)(p75 + MAPPING_BOUND * (p75 - p25) + .499);\n-\t\tif (r->low > r->avg - MAX_STDDEV * r->std) r->low = (int)(r->avg - MAX_STDDEV'..b'\t}\n-\t\tif (is_multi[0] || is_multi[1]) goto no_pairing; // TODO: in rare cases, the true hit may be long but with low score\n-\t\t// compute mapQ for the best SE hit\n-\t\tscore_un = a[0].a[0].score + a[1].a[0].score - opt->pen_unpaired;\n-\t\t//q_pe = o && subo < o? (int)(MEM_MAPQ_COEF * (1. - (double)subo / o) * log(a[0].a[z[0]].seedcov + a[1].a[z[1]].seedcov) + .499) : 0;\n-\t\tsubo = subo > score_un? subo : score_un;\n-\t\tq_pe = raw_mapq(o - subo, opt->a);\n-\t\tif (n_sub > 0) q_pe -= (int)(4.343 * log(n_sub+1) + .499);\n-\t\tif (q_pe < 0) q_pe = 0;\n-\t\tif (q_pe > 60) q_pe = 60;\n-\t\tq_pe = (int)(q_pe * (1. - .5 * (a[0].a[0].frac_rep + a[1].a[0].frac_rep)) + .499);\n-\t\t// the following assumes no split hits\n-\t\tif (o > score_un) { // paired alignment is preferred\n-\t\t\tmem_alnreg_t *c[2];\n-\t\t\tc[0] = &a[0].a[z[0]]; c[1] = &a[1].a[z[1]];\n-\t\t\tfor (i = 0; i < 2; ++i) {\n-\t\t\t\tif (c[i]->secondary >= 0)\n-\t\t\t\t\tc[i]->sub = a[i].a[c[i]->secondary].score, c[i]->secondary = -2;\n-\t\t\t\tq_se[i] = mem_approx_mapq_se(opt, c[i]);\n-\t\t\t}\n-\t\t\tq_se[0] = q_se[0] > q_pe? q_se[0] : q_pe < q_se[0] + 40? q_pe : q_se[0] + 40;\n-\t\t\tq_se[1] = q_se[1] > q_pe? q_se[1] : q_pe < q_se[1] + 40? q_pe : q_se[1] + 40;\n-\t\t\textra_flag |= 2;\n-\t\t\t// cap at the tandem repeat score\n-\t\t\tq_se[0] = q_se[0] < raw_mapq(c[0]->score - c[0]->csub, opt->a)? q_se[0] : raw_mapq(c[0]->score - c[0]->csub, opt->a);\n-\t\t\tq_se[1] = q_se[1] < raw_mapq(c[1]->score - c[1]->csub, opt->a)? q_se[1] : raw_mapq(c[1]->score - c[1]->csub, opt->a);\n-\t\t} else { // the unpaired alignment is preferred\n-\t\t\tz[0] = z[1] = 0;\n-\t\t\tq_se[0] = mem_approx_mapq_se(opt, &a[0].a[0]);\n-\t\t\tq_se[1] = mem_approx_mapq_se(opt, &a[1].a[0]);\n-\t\t}\n-\t\t// suboptimal hits\n-\t\tif (!(opt->flag & MEM_F_ALL)) {\n-\t\t\tfor (i = 0; i < 2; ++i) {\n-\t\t\t\tint k = a[i].a[z[i]].secondary;\n-\t\t\t\tif (k >= 0) { // switch secondary and primary\n-\t\t\t\t\tassert(a[i].a[k].secondary < 0);\n-\t\t\t\t\tfor (j = 0; j < a[i].n; ++j)\n-\t\t\t\t\t\tif (a[i].a[j].secondary == k || j == k)\n-\t\t\t\t\t\t\ta[i].a[j].secondary = z[i];\n-\t\t\t\t\ta[i].a[z[i]].secondary = -1;\n-\t\t\t\t}\n-\t\t\t\tXA[i] = mem_gen_alt(opt, bns, pac, &a[i], s[i].l_seq, s[i].seq);\n-\t\t\t}\n-\t\t} else XA[0] = XA[1] = 0;\n-\t\t// write SAM\n-\t\th[0] = mem_reg2aln(opt, bns, pac, s[0].l_seq, s[0].seq, &a[0].a[z[0]]); h[0].mapq = q_se[0]; h[0].flag |= 0x40 | extra_flag; h[0].XA = XA[0]? XA[0][z[0]] : 0;\n-\t\th[1] = mem_reg2aln(opt, bns, pac, s[1].l_seq, s[1].seq, &a[1].a[z[1]]); h[1].mapq = q_se[1]; h[1].flag |= 0x80 | extra_flag; h[1].XA = XA[1]? XA[1][z[1]] : 0;\n-\t\tmem_aln2sam(bns, &str, &s[0], 1, &h[0], 0, &h[1], opt->flag&MEM_F_SOFTCLIP); s[0].sam = strdup(str.s); str.l = 0;\n-\t\tmem_aln2sam(bns, &str, &s[1], 1, &h[1], 0, &h[0], opt->flag&MEM_F_SOFTCLIP); s[1].sam = str.s;\n-\t\tif (strcmp(s[0].name, s[1].name) != 0) err_fatal(__func__, "paired reads have different names: \\"%s\\", \\"%s\\"\\n", s[0].name, s[1].name);\n-\t\tfree(h[0].cigar); // h[0].XA will be freed in the following block\n-\t\tfree(h[1].cigar);\n-\t\t// free XA\n-\t\tfor (i = 0; i < 2; ++i) {\n-\t\t\tif (XA[i]) {\n-\t\t\t\tfor (j = 0; j < a[i].n; ++j) free(XA[i][j]);\n-\t\t\t\tfree(XA[i]);\n-\t\t\t}\n-\t\t}\n-\t} else goto no_pairing;\n-\treturn n;\n-\n-no_pairing:\n-\tfor (i = 0; i < 2; ++i) {\n-\t\tif (a[i].n && a[i].a[0].score >= opt->T)\n-\t\t\th[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, &a[i].a[0]);\n-\t\telse h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, 0);\n-\t}\n-\tif (!(opt->flag & MEM_F_NOPAIRING) && h[0].rid == h[1].rid && h[0].rid >= 0) { // if the top hits from the two ends constitute a proper pair, flag it.\n-\t\tint64_t dist;\n-\t\tint d;\n-\t\td = mem_infer_dir(bns->l_pac, a[0].a[0].rb, a[1].a[0].rb, &dist);\n-\t\tif (!pes[d].failed && dist >= pes[d].low && dist <= pes[d].high) extra_flag |= 2;\n-\t}\n-\tmem_reg2sam_se(opt, bns, pac, &s[0], &a[0], 0x41|extra_flag, &h[1]);\n-\tmem_reg2sam_se(opt, bns, pac, &s[1], &a[1], 0x81|extra_flag, &h[0]);\n-\tif (strcmp(s[0].name, s[1].name) != 0) err_fatal(__func__, "paired reads have different names: \\"%s\\", \\"%s\\"\\n", s[0].name, s[1].name);\n-\tfree(h[0].cigar); free(h[1].cigar);\n-\treturn n;\n-}\n' |