Galaxy |

Changeset 2:dfe9332138cf (2014-08-14)

Previous changeset 1:abdbc8fe98dd (2014-08-14) Next changeset 3:997f5136985f (2014-08-14)

Commit message:
Deleted selected files

removed:
bwa-0.7.9a/Makefile
bwa-0.7.9a/bwamem.c
bwa-0.7.9a/bwamem.h
bwa-0.7.9a/bwamem_extra.c
bwa-0.7.9a/bwamem_pair.c

diff -r abdbc8fe98dd -r dfe9332138cf bwa-0.7.9a/Makefile
--- a/bwa-0.7.9a/Makefile Thu Aug 14 04:44:51 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

@@ -1,79 +0,0 @@
-CC= gcc
-#CC= clang --analyze
-CFLAGS= -g -Wall -Wno-unused-function -O2
-WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
-AR= ar
-DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC)
-LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o
-AOBJS= QSufSort.o bwt_gen.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
- is.o bwtindex.o bwape.o kopen.o pemerge.o \
- bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
- bwtsw2_chain.o fastmap.o bwtsw2_pair.o
-PROG= bwa
-INCLUDES=
-LIBS= -lm -lz -lpthread
-SUBDIRS= .
-
-.SUFFIXES:.c .o .cc
-
-.c.o:
- $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@
-
-all:$(PROG)
-
-bwa:libbwa.a $(AOBJS) main.o
- $(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ -L. -lbwa $(LIBS)
-
-bwamem-lite:libbwa.a example.o
- $(CC) $(CFLAGS) $(DFLAGS) example.o -o $@ -L. -lbwa $(LIBS)
-
-libbwa.a:$(LOBJS)
- $(AR) -csru $@ $(LOBJS)
-
-clean:
- rm -f gmon.out *.o a.out $(PROG) *~ *.a
-
-depend:
- ( LC_ALL=C ; export LC_ALL; makedepend -Y -- $(CFLAGS) $(DFLAGS) -- *.c )
-
-# DO NOT DELETE THIS LINE -- make depend depends on it.
-
-QSufSort.o: QSufSort.h
-bamlite.o: bamlite.h malloc_wrap.h
-bntseq.o: bntseq.h utils.h kseq.h malloc_wrap.h
-bwa.o: bntseq.h bwa.h bwt.h ksw.h utils.h kstring.h malloc_wrap.h kseq.h
-bwamem.o: kstring.h malloc_wrap.h bwamem.h bwt.h bntseq.h bwa.h ksw.h kvec.h
-bwamem.o: ksort.h utils.h kbtree.h
-bwamem_extra.o: bwa.h bntseq.h bwt.h bwamem.h kstring.h malloc_wrap.h
-bwamem_pair.o: kstring.h malloc_wrap.h bwamem.h bwt.h bntseq.h bwa.h kvec.h
-bwamem_pair.o: utils.h ksw.h
-bwape.o: bwtaln.h bwt.h kvec.h malloc_wrap.h bntseq.h utils.h bwase.h bwa.h
-bwape.o: ksw.h khash.h
-bwase.o: bwase.h bntseq.h bwt.h bwtaln.h utils.h kstring.h malloc_wrap.h
-bwase.o: bwa.h ksw.h
-bwaseqio.o: bwtaln.h bwt.h utils.h bamlite.h malloc_wrap.h kseq.h
-bwt.o: utils.h bwt.h kvec.h malloc_wrap.h
-bwt_gen.o: QSufSort.h malloc_wrap.h
-bwt_lite.o: bwt_lite.h malloc_wrap.h
-bwtaln.o: bwtaln.h bwt.h bwtgap.h utils.h bwa.h bntseq.h malloc_wrap.h
-bwtgap.o: bwtgap.h bwt.h bwtaln.h malloc_wrap.h
-bwtindex.o: bntseq.h bwt.h utils.h malloc_wrap.h
-bwtsw2_aux.o: bntseq.h bwt_lite.h utils.h bwtsw2.h bwt.h kstring.h
-bwtsw2_aux.o: malloc_wrap.h bwa.h ksw.h kseq.h ksort.h
-bwtsw2_chain.o: bwtsw2.h bntseq.h bwt_lite.h bwt.h malloc_wrap.h ksort.h
-bwtsw2_core.o: bwt_lite.h bwtsw2.h bntseq.h bwt.h kvec.h malloc_wrap.h
-bwtsw2_core.o: khash.h ksort.h
-bwtsw2_main.o: bwt.h bwtsw2.h bntseq.h bwt_lite.h utils.h bwa.h
-bwtsw2_pair.o: utils.h bwt.h bntseq.h bwtsw2.h bwt_lite.h kstring.h
-bwtsw2_pair.o: malloc_wrap.h ksw.h
-example.o: bwamem.h bwt.h bntseq.h bwa.h kseq.h malloc_wrap.h
-fastmap.o: bwa.h bntseq.h bwt.h bwamem.h kvec.h malloc_wrap.h utils.h kseq.h
-is.o: malloc_wrap.h
-kopen.o: malloc_wrap.h
-kstring.o: kstring.h malloc_wrap.h
-ksw.o: ksw.h malloc_wrap.h
-main.o: kstring.h malloc_wrap.h utils.h
-malloc_wrap.o: malloc_wrap.h
-pemerge.o: ksw.h kseq.h malloc_wrap.h kstring.h bwa.h bntseq.h bwt.h utils.h
-test-extend.o: ksw.h
-utils.o: utils.h ksort.h malloc_wrap.h kseq.h

diff -r abdbc8fe98dd -r dfe9332138cf bwa-0.7.9a/bwamem.c
--- a/bwa-0.7.9a/bwamem.c Thu Aug 14 04:44:51 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

[

b'@@ -1,1190 +0,0 @@\n-#include <stdlib.h>\n-#include <string.h>\n-#include <stdio.h>\n-#include <assert.h>\n-#include <math.h>\n-#ifdef HAVE_PTHREAD\n-#include <pthread.h>\n-#endif\n-\n-#include "kstring.h"\n-#include "bwamem.h"\n-#include "bntseq.h"\n-#include "ksw.h"\n-#include "kvec.h"\n-#include "ksort.h"\n-#include "utils.h"\n-\n-#ifdef USE_MALLOC_WRAPPERS\n-# include "malloc_wrap.h"\n-#endif\n-\n-/* Theory on probability and scoring *ungapped* alignment\n- *\n- * s\'(a,b) = log[P(b|a)/P(b)] = log[4P(b|a)], assuming uniform base distribution\n- * s\'(a,a) = log(4), s\'(a,b) = log(4e/3), where e is the error rate\n- *\n- * Scale s\'(a,b) to s(a,a) s.t. s(a,a)=x. Then s(a,b) = x*s\'(a,b)/log(4), or conversely: s\'(a,b)=s(a,b)*log(4)/x\n- *\n- * If the matching score is x and mismatch penalty is -y, we can compute error rate e:\n- * e = .75 * exp[-log(4) * y/x]\n- *\n- * log P(seq) = \\sum_i log P(b_i|a_i) = \\sum_i {s\'(a,b) - log(4)}\n- * = \\sum_i { s(a,b)*log(4)/x - log(4) } = log(4) * (S/x - l)\n- *\n- * where S=\\sum_i s(a,b) is the alignment score. Converting to the phred scale:\n- * Q(seq) = -10/log(10) * log P(seq) = 10*log(4)/log(10) * (l - S/x) = 6.02 * (l - S/x)\n- *\n- *\n- * Gap open (zero gap): q\' = log[P(gap-open)], r\' = log[P(gap-ext)] (see Durbin et al. (1998) Section 4.1)\n- * Then q = x*log[P(gap-open)]/log(4), r = x*log[P(gap-ext)]/log(4)\n- *\n- * When there are gaps, l should be the length of alignment matches (i.e. the M operator in CIGAR)\n- */\n-\n-static const bntseq_t *global_bns = 0; // for debugging only\n-\n-mem_opt_t *mem_opt_init()\n-{\n-\tmem_opt_t *o;\n-\to = calloc(1, sizeof(mem_opt_t));\n-\to->flag = 0;\n-\to->a = 1; o->b = 4;\n-\to->o_del = o->o_ins = 6;\n-\to->e_del = o->e_ins = 1;\n-\to->w = 100;\n-\to->T = 30;\n-\to->zdrop = 100;\n-\to->pen_unpaired = 17;\n-\to->pen_clip5 = o->pen_clip3 = 5;\n-\to->min_seed_len = 19;\n-\to->split_width = 10;\n-\to->max_occ = 500;\n-\to->max_chain_gap = 10000;\n-\to->max_ins = 10000;\n-\to->mask_level = 0.50;\n-\to->drop_ratio = 0.50;\n-\to->XA_drop_ratio = 0.80;\n-\to->split_factor = 1.5;\n-\to->chunk_size = 10000000;\n-\to->n_threads = 1;\n-\to->max_hits = 5;\n-\to->max_matesw = 50;\n-\to->mask_level_redun = 0.95;\n-\to->min_chain_weight = 0;\n-\to->max_chain_extend = 1<<30;\n-\to->mapQ_coef_len = 50; o->mapQ_coef_fac = log(o->mapQ_coef_len);\n-\tbwa_fill_scmat(o->a, o->b, o->mat);\n-\treturn o;\n-}\n-\n-/***************************\n- * Collection SA invervals *\n- ***************************/\n-\n-#define intv_lt(a, b) ((a).info < (b).info)\n-KSORT_INIT(mem_intv, bwtintv_t, intv_lt)\n-\n-typedef struct {\n-\tbwtintv_v mem, mem1, *tmpv[2];\n-} smem_aux_t;\n-\n-static smem_aux_t *smem_aux_init()\n-{\n-\tsmem_aux_t *a;\n-\ta = calloc(1, sizeof(smem_aux_t));\n-\ta->tmpv[0] = calloc(1, sizeof(bwtintv_v));\n-\ta->tmpv[1] = calloc(1, sizeof(bwtintv_v));\n-\treturn a;\n-}\n-\n-static void smem_aux_destroy(smem_aux_t *a)\n-{\t\n-\tfree(a->tmpv[0]->a); free(a->tmpv[0]);\n-\tfree(a->tmpv[1]->a); free(a->tmpv[1]);\n-\tfree(a->mem.a); free(a->mem1.a);\n-\tfree(a);\n-}\n-\n-static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq, smem_aux_t *a)\n-{\n-\tint i, k, x = 0, old_n;\n-\tint start_width = (opt->flag & MEM_F_SELF_OVLP)? 2 : 1;\n-\tint split_len = (int)(opt->min_seed_len * opt->split_factor + .499);\n-\ta->mem.n = 0;\n-\t// first pass: find all SMEMs\n-\twhile (x < len) {\n-\t\tif (seq[x] < 4) {\n-\t\t\tx = bwt_smem1(bwt, len, seq, x, start_width, &a->mem1, a->tmpv);\n-\t\t\tfor (i = 0; i < a->mem1.n; ++i) {\n-\t\t\t\tbwtintv_t *p = &a->mem1.a[i];\n-\t\t\t\tint slen = (uint32_t)p->info - (p->info>>32); // seed length\n-\t\t\t\tif (slen >= opt->min_seed_len)\n-\t\t\t\t\tkv_push(bwtintv_t, a->mem, *p);\n-\t\t\t}\n-\t\t} else ++x;\n-\t}\n-\t// second pass: find MEMs inside a long SMEM\n-\told_n = a->mem.n;\n-\tfor (k = 0; k < old_n; ++k) {\n-\t\tbwtintv_t *p = &a->mem.a[k];\n-\t\tint start = p->info>>32, end = (int32_t)p->info;\n-\t\tif (end - start < split_len || p->x[2] > opt->split_width) continue;\n-\t\tbwt_smem1(bwt, len, seq, (start + end)>>1, p->x[2]+1, &a->mem1, a->tmpv);\n-\t\tfor (i = 0; i < a->mem1.n; ++i)\n-\t\t\tif ((a->mem1.a[i].inf'..b'or 5\'-end clipping\n-\t\t\ta.cigar[0] = clip5<<4 | 3;\n-\t\t\t++a.n_cigar;\n-\t\t}\n-\t\tif (clip3) {\n-\t\t\tmemmove(a.cigar + a.n_cigar + 1, a.cigar + a.n_cigar, l_MD); // make room for 3\'-end clipping\n-\t\t\ta.cigar[a.n_cigar++] = clip3<<4 | 3;\n-\t\t}\n-\t}\n-\ta.rid = bns_pos2rid(bns, pos);\n-\tassert(a.rid == ar->rid);\n-\ta.pos = pos - bns->anns[a.rid].offset;\n-\ta.score = ar->score; a.sub = ar->sub > ar->csub? ar->sub : ar->csub;\n-\tfree(query);\n-\treturn a;\n-}\n-\n-typedef struct {\n-\tconst mem_opt_t *opt;\n-\tconst bwt_t *bwt;\n-\tconst bntseq_t *bns;\n-\tconst uint8_t *pac;\n-\tconst mem_pestat_t *pes;\n-\tsmem_aux_t **aux;\n-\tbseq1_t *seqs;\n-\tmem_alnreg_v *regs;\n-\tint64_t n_processed;\n-} worker_t;\n-\n-static void worker1(void *data, int i, int tid)\n-{\n-\tworker_t *w = (worker_t*)data;\n-\tif (!(w->opt->flag&MEM_F_PE)) {\n-\t\tif (bwa_verbose >= 4) printf("=====> Processing read \'%s\' <=====\\n", w->seqs[i].name);\n-\t\tw->regs[i] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i].l_seq, w->seqs[i].seq, w->aux[tid]);\n-\t} else {\n-\t\tif (bwa_verbose >= 4) printf("=====> Processing read \'%s\'/1 <=====\\n", w->seqs[i<<1|0].name);\n-\t\tw->regs[i<<1|0] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|0].l_seq, w->seqs[i<<1|0].seq, w->aux[tid]);\n-\t\tif (bwa_verbose >= 4) printf("=====> Processing read \'%s\'/2 <=====\\n", w->seqs[i<<1|1].name);\n-\t\tw->regs[i<<1|1] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|1].l_seq, w->seqs[i<<1|1].seq, w->aux[tid]);\n-\t}\n-}\n-\n-static void worker2(void *data, int i, int tid)\n-{\n-\textern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]);\n-\textern void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a);\n-\tworker_t *w = (worker_t*)data;\n-\tif (!(w->opt->flag&MEM_F_PE)) {\n-\t\tif (bwa_verbose >= 4) printf("=====> Finalizing read \'%s\' <=====\\n", w->seqs[i].name);\n-\t\tif (w->opt->flag & MEM_F_ALN_REG) {\n-\t\t\tmem_reg2ovlp(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i]);\n-\t\t} else {\n-\t\t\tmem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a, w->n_processed + i);\n-\t\t\tmem_reg2sam_se(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0);\n-\t\t}\n-\t\tfree(w->regs[i].a);\n-\t} else {\n-\t\tif (bwa_verbose >= 4) printf("=====> Finalizing read pair \'%s\' <=====\\n", w->seqs[i<<1|0].name);\n-\t\tmem_sam_pe(w->opt, w->bns, w->pac, w->pes, (w->n_processed>>1) + i, &w->seqs[i<<1], &w->regs[i<<1]);\n-\t\tfree(w->regs[i<<1|0].a); free(w->regs[i<<1|1].a);\n-\t}\n-}\n-\n-void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0)\n-{\n-\textern void kt_for(int n_threads, void (*func)(void*,int,int), void *data, int n);\n-\tworker_t w;\n-\tmem_alnreg_v *regs;\n-\tmem_pestat_t pes[4];\n-\tdouble ctime, rtime;\n-\tint i;\n-\n-\tctime = cputime(); rtime = realtime();\n-\tglobal_bns = bns;\n-\tregs = malloc(n * sizeof(mem_alnreg_v));\n-\tw.opt = opt; w.bwt = bwt; w.bns = bns; w.pac = pac;\n-\tw.seqs = seqs; w.regs = regs; w.n_processed = n_processed;\n-\tw.pes = &pes[0];\n-\tw.aux = malloc(opt->n_threads * sizeof(smem_aux_t));\n-\tfor (i = 0; i < opt->n_threads; ++i)\n-\t\tw.aux[i] = smem_aux_init();\n-\tkt_for(opt->n_threads, worker1, &w, (opt->flag&MEM_F_PE)? n>>1 : n); // find mapping positions\n-\tfor (i = 0; i < opt->n_threads; ++i)\n-\t\tsmem_aux_destroy(w.aux[i]);\n-\tfree(w.aux);\n-\tif (opt->flag&MEM_F_PE) { // infer insert sizes if not provided\n-\t\tif (pes0) memcpy(pes, pes0, 4 * sizeof(mem_pestat_t)); // if pes0 != NULL, set the insert-size distribution as pes0\n-\t\telse mem_pestat(opt, bns->l_pac, n, regs, pes); // otherwise, infer the insert size distribution from data\n-\t}\n-\tkt_for(opt->n_threads, worker2, &w, (opt->flag&MEM_F_PE)? n>>1 : n); // generate alignment\n-\tfree(regs);\n-\tif (bwa_verbose >= 3)\n-\t\tfprintf(stderr, "[M::%s] Processed %d reads in %.3f CPU sec, %.3f real sec\\n", __func__, n, cputime() - ctime, realtime() - rtime);\n-}\n'

diff -r abdbc8fe98dd -r dfe9332138cf bwa-0.7.9a/bwamem.h
--- a/bwa-0.7.9a/bwamem.h Thu Aug 14 04:44:51 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

[

@@ -1,179 +0,0 @@
-#ifndef BWAMEM_H_
-#define BWAMEM_H_
-
-#include "bwt.h"
-#include "bntseq.h"
-#include "bwa.h"
-
-#define MEM_MAPQ_COEF 30.0
-#define MEM_MAPQ_MAX  60
-
-struct __smem_i;
-typedef struct __smem_i smem_i;
-
-#define MEM_F_PE        0x2
-#define MEM_F_NOPAIRING 0x4
-#define MEM_F_ALL       0x8
-#define MEM_F_NO_MULTI  0x10
-#define MEM_F_NO_RESCUE 0x20
-#define MEM_F_SELF_OVLP 0x40
-#define MEM_F_ALN_REG   0x80
-#define MEM_F_SOFTCLIP  0x200
-
-typedef struct {
- int a, b;               // match score and mismatch penalty
- int o_del, e_del;
- int o_ins, e_ins;
- int pen_unpaired;       // phred-scaled penalty for unpaired reads
- int pen_clip5,pen_clip3;// clipping penalty. This score is not deducted from the DP score.
- int w;                  // band width
- int zdrop;              // Z-dropoff
-
- int T;                  // output score threshold; only affecting output
- int flag;               // see MEM_F_* macros
- int min_seed_len;       // minimum seed length
- int min_chain_weight;
- int max_chain_extend;
- float split_factor;     // split into a seed if MEM is longer than min_seed_len*split_factor
- int split_width;        // split into a seed if its occurence is smaller than this value
- int max_occ;            // skip a seed if its occurence is larger than this value
- int max_chain_gap;      // do not chain seed if it is max_chain_gap-bp away from the closest seed
- int n_threads;          // number of threads
- int chunk_size;         // process chunk_size-bp sequences in a batch
- float mask_level;       // regard a hit as redundant if the overlap with another better hit is over mask_level times the min length of the two hits
- float drop_ratio;       // drop a chain if its seed coverage is below drop_ratio times the seed coverage of a better chain overlapping with the small chain
- float XA_drop_ratio;    // when counting hits for the XA tag, ignore alignments with score < XA_drop_ratio * max_score; only effective for the XA tag
- float mask_level_redun;
- float mapQ_coef_len;
- int mapQ_coef_fac;
- int max_ins;            // when estimating insert size distribution, skip pairs with insert longer than this value
- int max_matesw;         // perform maximally max_matesw rounds of mate-SW for each end
- int max_hits;           // if there are max_hits or fewer, output them all
- int8_t mat[25];         // scoring matrix; mat[0] == 0 if unset
-} mem_opt_t;
-
-typedef struct {
- int64_t rb, re; // [rb,re): reference sequence in the alignment
- int qb, qe;     // [qb,qe): query sequence in the alignment
- int rid;        // reference seq ID
- int score;      // best local SW score
- int truesc;     // actual score corresponding to the aligned region; possibly smaller than $score
- int sub;        // 2nd best SW score
- int csub;       // SW score of a tandem hit
- int sub_n;      // approximate number of suboptimal hits
- int w;          // actual band width used in extension
- int seedcov;    // length of regions coverged by seeds
- int secondary;  // index of the parent hit shadowing the current hit; <0 if primary
- int seedlen0;   // length of the starting seed
- int n_comp;     // number of sub-alignments chained together
- float frac_rep;
- uint64_t hash;
-} mem_alnreg_t;
-
-typedef struct { size_t n, m; mem_alnreg_t *a; } mem_alnreg_v;
-
-typedef struct {
- int low, high;   // lower and upper bounds within which a read pair is considered to be properly paired
- int failed;      // non-zero if the orientation is not supported by sufficient data
- double avg, std; // mean and stddev of the insert size distribution
-} mem_pestat_t;
-
-typedef struct { // This struct is only used for the convenience of API.
- int64_t pos;     // forward strand 5'-end mapping position
- int rid;         // reference sequence index in bntseq_t; <0 for unmapped
- int flag;        // extra flag
- uint32_t is_rev:1, mapq:8, NM:23; // is_rev: whether on the reverse strand; mapq: mapping quality; NM: edit distance
- int n_cigar;     // number of CIGAR operations
- uint32_t *cigar; // CIGAR in the BAM encoding: opLen<<4|op; op to integer mapping: MIDSH=>01234
- char *XA;        // alternative mappings
-
- int score, sub;
-} mem_aln_t;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
- smem_i *smem_itr_init(const bwt_t *bwt);
- void smem_itr_destroy(smem_i *itr);
- void smem_set_query(smem_i *itr, int len, const uint8_t *query);
- const bwtintv_v *smem_next(smem_i *itr);
-
- mem_opt_t *mem_opt_init(void);
- void mem_fill_scmat(int a, int b, int8_t mat[25]);
-
- /**
- * Align a batch of sequences and generate the alignments in the SAM format
- *
- * This routine requires $seqs[i].{l_seq,seq,name} and write $seqs[i].sam.
- * Note that $seqs[i].sam may consist of several SAM lines if the
- * corresponding sequence has multiple primary hits.
- *
- * In the paired-end mode (i.e. MEM_F_PE is set in $opt->flag), query
- * sequences must be interleaved: $n must be an even number and the 2i-th
- * sequence and the (2i+1)-th sequence constitute a read pair. In this
- * mode, there should be enough (typically >50) unique pairs for the
- * routine to infer the orientation and insert size.
- *
- * @param opt    alignment parameters
- * @param bwt    FM-index of the reference sequence
- * @param bns    Information of the reference
- * @param pac    2-bit encoded reference
- * @param n      number of query sequences
- * @param seqs   query sequences; $seqs[i].seq/sam to be modified after the call
- * @param pes0   insert-size info; if NULL, infer from data; if not NULL, it should be an array with 4 elements,
- *               corresponding to each FF, FR, RF and RR orientation. See mem_pestat() for more info.
- */
- void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0);
-
- /**
- * Find the aligned regions for one query sequence
- *
- * Note that this routine does not generate CIGAR. CIGAR should be
- * generated later by mem_reg2aln() below.
- *
- * @param opt    alignment parameters
- * @param bwt    FM-index of the reference sequence
- * @param bns    Information of the reference
- * @param pac    2-bit encoded reference
- * @param l_seq  length of query sequence
- * @param seq    query sequence
- *
- * @return       list of aligned regions.
- */
- mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq);
-
- /**
- * Generate CIGAR and forward-strand position from alignment region
- *
- * @param opt    alignment parameters
- * @param bns    Information of the reference
- * @param pac    2-bit encoded reference
- * @param l_seq  length of query sequence
- * @param seq    query sequence
- * @param ar     one alignment region
- *
- * @return       CIGAR, strand, mapping quality and forward-strand position
- */
- mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq, const mem_alnreg_t *ar);
- mem_aln_t mem_reg2aln2(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq, const mem_alnreg_t *ar, const char *name);
-
- /**
- * Infer the insert size distribution from interleaved alignment regions
- *
- * This function can be called after mem_align1(), as long as paired-end
- * reads are properly interleaved.
- *
- * @param opt    alignment parameters
- * @param l_pac  length of concatenated reference sequence
- * @param n      number of query sequences; must be an even number
- * @param regs   region array of size $n; 2i-th and (2i+1)-th elements constitute a pair
- * @param pes    inferred insert size distribution (output)
- */
- void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4]);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif

diff -r abdbc8fe98dd -r dfe9332138cf bwa-0.7.9a/bwamem_extra.c
--- a/bwa-0.7.9a/bwamem_extra.c Thu Aug 14 04:44:51 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

[

@@ -1,147 +0,0 @@
-#include "bwa.h"
-#include "bwamem.h"
-#include "bntseq.h"
-#include "kstring.h"
-
-/***************************
- * SMEM iterator interface *
- ***************************/
-
-struct __smem_i {
- const bwt_t *bwt;
- const uint8_t *query;
- int start, len;
- bwtintv_v *matches; // matches; to be returned by smem_next()
- bwtintv_v *sub;     // sub-matches inside the longest match; temporary
- bwtintv_v *tmpvec[2]; // temporary arrays
-};
-
-smem_i *smem_itr_init(const bwt_t *bwt)
-{
- smem_i *itr;
- itr = calloc(1, sizeof(smem_i));
- itr->bwt = bwt;
- itr->tmpvec[0] = calloc(1, sizeof(bwtintv_v));
- itr->tmpvec[1] = calloc(1, sizeof(bwtintv_v));
- itr->matches   = calloc(1, sizeof(bwtintv_v));
- itr->sub       = calloc(1, sizeof(bwtintv_v));
- return itr;
-}
-
-void smem_itr_destroy(smem_i *itr)
-{
- free(itr->tmpvec[0]->a); free(itr->tmpvec[0]);
- free(itr->tmpvec[1]->a); free(itr->tmpvec[1]);
- free(itr->matches->a);   free(itr->matches);
- free(itr->sub->a);       free(itr->sub);
- free(itr);
-}
-
-void smem_set_query(smem_i *itr, int len, const uint8_t *query)
-{
- itr->query = query;
- itr->start = 0;
- itr->len = len;
-}
-
-const bwtintv_v *smem_next(smem_i *itr)
-{
- int i, max, max_i, ori_start;
- itr->tmpvec[0]->n = itr->tmpvec[1]->n = itr->matches->n = itr->sub->n = 0;
- if (itr->start >= itr->len || itr->start < 0) return 0;
- while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases
- if (itr->start == itr->len) return 0;
- ori_start = itr->start;
- itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, ori_start, 1, itr->matches, itr->tmpvec); // search for SMEM
- if (itr->matches->n == 0) return itr->matches; // well, in theory, we should never come here
- for (i = max = 0, max_i = 0; i < itr->matches->n; ++i) { // look for the longest match
- bwtintv_t *p = &itr->matches->a[i];
- int len = (uint32_t)p->info - (p->info>>32);
- if (max < len) max = len, max_i = i;
- }
- return itr->matches;
-}
-
-/***********************
- *** Extra functions ***
- ***********************/
-
-mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq_)
-{ // the difference from mem_align1_core() is that this routine: 1) calls mem_mark_primary_se(); 2) does not modify the input sequence
- extern mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq, void *buf);
- extern void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id);
- mem_alnreg_v ar;
- char *seq;
- seq = malloc(l_seq);
- memcpy(seq, seq_, l_seq); // makes a copy of seq_
- ar = mem_align1_core(opt, bwt, bns, pac, l_seq, seq, 0);
- mem_mark_primary_se(opt, ar.n, ar.a, lrand48());
- free(seq);
- return ar;
-}
-
-void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a)
-{
- int i;
- kstring_t str = {0,0,0};
- for (i = 0; i < a->n; ++i) {
- const mem_alnreg_t *p = &a->a[i];
- int is_rev, rid, qb = p->qb, qe = p->qe;
- int64_t pos, rb = p->rb, re = p->re;
- pos = bns_depos(bns, rb < bns->l_pac? rb : re - 1, &is_rev);
- rid = bns_pos2rid(bns, pos);
- assert(rid == p->rid);
- pos -= bns->anns[rid].offset;
- kputs(s->name, &str); kputc('\t', &str);
- kputw(s->l_seq, &str); kputc('\t', &str);
- if (is_rev) qb ^= qe, qe ^= qb, qb ^= qe; // swap
- kputw(qb, &str); kputc('\t', &str); kputw(qe, &str); kputc('\t', &str);
- kputs(bns->anns[rid].name, &str); kputc('\t', &str);
- kputw(bns->anns[rid].len, &str); kputc('\t', &str);
- kputw(pos, &str); kputc('\t', &str); kputw(pos + (re - rb), &str); kputc('\t', &str);
- ksprintf(&str, "%.3f", (double)p->truesc / opt->a / (qe - qb > re - rb? qe - qb : re - rb));
- kputc('\n', &str);
- }
- s->sam = str.s;
-}
-
-// Okay, returning strings is bad, but this has happened a lot elsewhere. If I have time, I need serious code cleanup.
-char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_alnreg_v *a, int l_query, const char *query) // ONLY work after mem_mark_primary_se()
-{
- int i, k, *cnt, tot;
- kstring_t *aln = 0;
- char **XA = 0;
-
- cnt = calloc(a->n, sizeof(int));
- for (i = 0, tot = 0; i < a->n; ++i) {
- int j = a->a[i].secondary;
- if (j >= 0 && a->a[i].score >= a->a[j].score * opt->XA_drop_ratio)
- ++cnt[j], ++tot;
- }
- if (tot == 0) goto end_gen_alt;
- aln = calloc(a->n, sizeof(kstring_t));
- for (i = 0; i < a->n; ++i) {
- mem_aln_t t;
- int j = a->a[i].secondary;
- if (j < 0 || a->a[i].score < a->a[j].score * opt->XA_drop_ratio) continue; // we don't process the primary alignments as they will be converted to SAM later
- if (cnt[j] > opt->max_hits) continue;
- t = mem_reg2aln(opt, bns, pac, l_query, query, &a->a[i]);
- kputs(bns->anns[t.rid].name, &aln[j]);
- kputc(',', &aln[j]); kputc("+-"[t.is_rev], &aln[j]); kputl(t.pos + 1, &aln[j]);
- kputc(',', &aln[j]);
- for (k = 0; k < t.n_cigar; ++k) {
- kputw(t.cigar[k]>>4, &aln[j]);
- kputc("MIDSHN"[t.cigar[k]&0xf], &aln[j]);
- }
- kputc(',', &aln[j]); kputw(t.NM, &aln[j]);
- kputc(';', &aln[j]);
- free(t.cigar);
- }
- XA = calloc(a->n, sizeof(char*));
- for (k = 0; k < a->n; ++k)
- XA[k] = aln[k].s;
-
-end_gen_alt:
- free(cnt); free(aln);
- return XA;
-}

diff -r abdbc8fe98dd -r dfe9332138cf bwa-0.7.9a/bwamem_pair.c
--- a/bwa-0.7.9a/bwamem_pair.c Thu Aug 14 04:44:51 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

[

b'@@ -1,362 +0,0 @@\n-#include <stdlib.h>\n-#include <string.h>\n-#include <stdio.h>\n-#include <math.h>\n-#include "kstring.h"\n-#include "bwamem.h"\n-#include "kvec.h"\n-#include "utils.h"\n-#include "ksw.h"\n-\n-#ifdef USE_MALLOC_WRAPPERS\n-# include "malloc_wrap.h"\n-#endif\n-\n-\n-#define MIN_RATIO 0.8\n-#define MIN_DIR_CNT 10\n-#define MIN_DIR_RATIO 0.05\n-#define OUTLIER_BOUND 2.0\n-#define MAPPING_BOUND 3.0\n-#define MAX_STDDEV 4.0\n-\n-static inline int mem_infer_dir(int64_t l_pac, int64_t b1, int64_t b2, int64_t *dist)\n-{\n-\tint64_t p2;\n-\tint r1 = (b1 >= l_pac), r2 = (b2 >= l_pac);\n-\tp2 = r1 == r2? b2 : (l_pac<<1) - 1 - b2; // p2 is the coordinate of read 2 on the read 1 strand\n-\t*dist = p2 > b1? p2 - b1 : b1 - p2;\n-\treturn (r1 == r2? 0 : 1) ^ (p2 > b1? 0 : 3);\n-}\n-\n-static int cal_sub(const mem_opt_t *opt, mem_alnreg_v *r)\n-{\n-\tint j;\n-\tfor (j = 1; j < r->n; ++j) { // choose unique alignment\n-\t\tint b_max = r->a[j].qb > r->a[0].qb? r->a[j].qb : r->a[0].qb;\n-\t\tint e_min = r->a[j].qe < r->a[0].qe? r->a[j].qe : r->a[0].qe;\n-\t\tif (e_min > b_max) { // have overlap\n-\t\t\tint min_l = r->a[j].qe - r->a[j].qb < r->a[0].qe - r->a[0].qb? r->a[j].qe - r->a[j].qb : r->a[0].qe - r->a[0].qb;\n-\t\t\tif (e_min - b_max >= min_l * opt->mask_level) break; // significant overlap\n-\t\t}\n-\t}\n-\treturn j < r->n? r->a[j].score : opt->min_seed_len * opt->a;\n-}\n-\n-void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4])\n-{\n-\tint i, d, max;\n-\tuint64_v isize[4];\n-\tmemset(pes, 0, 4 * sizeof(mem_pestat_t));\n-\tmemset(isize, 0, sizeof(kvec_t(int)) * 4);\n-\tfor (i = 0; i < n>>1; ++i) {\n-\t\tint dir;\n-\t\tint64_t is;\n-\t\tmem_alnreg_v *r[2];\n-\t\tr[0] = (mem_alnreg_v*)&regs[i<<1|0];\n-\t\tr[1] = (mem_alnreg_v*)&regs[i<<1|1];\n-\t\tif (r[0]->n == 0 || r[1]->n == 0) continue;\n-\t\tif (cal_sub(opt, r[0]) > MIN_RATIO * r[0]->a[0].score) continue;\n-\t\tif (cal_sub(opt, r[1]) > MIN_RATIO * r[1]->a[0].score) continue;\n-\t\tif (r[0]->a[0].rid != r[1]->a[0].rid) continue; // not on the same chr\n-\t\tdir = mem_infer_dir(l_pac, r[0]->a[0].rb, r[1]->a[0].rb, &is);\n-\t\tif (is && is <= opt->max_ins) kv_push(uint64_t, isize[dir], is);\n-\t}\n-\tif (bwa_verbose >= 3) fprintf(stderr, "[M::%s] # candidate unique pairs for (FF, FR, RF, RR): (%ld, %ld, %ld, %ld)\\n", __func__, isize[0].n, isize[1].n, isize[2].n, isize[3].n);\n-\tfor (d = 0; d < 4; ++d) { // TODO: this block is nearly identical to the one in bwtsw2_pair.c. It would be better to merge these two.\n-\t\tmem_pestat_t *r = &pes[d];\n-\t\tuint64_v *q = &isize[d];\n-\t\tint p25, p50, p75, x;\n-\t\tif (q->n < MIN_DIR_CNT) {\n-\t\t\tfprintf(stderr, "[M::%s] skip orientation %c%c as there are not enough pairs\\n", __func__, "FR"[d>>1&1], "FR"[d&1]);\n-\t\t\tr->failed = 1;\n-\t\t\tcontinue;\n-\t\t} else fprintf(stderr, "[M::%s] analyzing insert size distribution for orientation %c%c...\\n", __func__, "FR"[d>>1&1], "FR"[d&1]);\n-\t\tks_introsort_64(q->n, q->a);\n-\t\tp25 = q->a[(int)(.25 * q->n + .499)];\n-\t\tp50 = q->a[(int)(.50 * q->n + .499)];\n-\t\tp75 = q->a[(int)(.75 * q->n + .499)];\n-\t\tr->low = (int)(p25 - OUTLIER_BOUND * (p75 - p25) + .499);\n-\t\tif (r->low < 1) r->low = 1;\n-\t\tr->high = (int)(p75 + OUTLIER_BOUND * (p75 - p25) + .499);\n-\t\tfprintf(stderr, "[M::%s] (25, 50, 75) percentile: (%d, %d, %d)\\n", __func__, p25, p50, p75);\n-\t\tfprintf(stderr, "[M::%s] low and high boundaries for computing mean and std.dev: (%d, %d)\\n", __func__, r->low, r->high);\n-\t\tfor (i = x = 0, r->avg = 0; i < q->n; ++i)\n-\t\t\tif (q->a[i] >= r->low && q->a[i] <= r->high)\n-\t\t\t\tr->avg += q->a[i], ++x;\n-\t\tr->avg /= x;\n-\t\tfor (i = 0, r->std = 0; i < q->n; ++i)\n-\t\t\tif (q->a[i] >= r->low && q->a[i] <= r->high)\n-\t\t\t\tr->std += (q->a[i] - r->avg) * (q->a[i] - r->avg);\n-\t\tr->std = sqrt(r->std / x);\n-\t\tfprintf(stderr, "[M::%s] mean and std.dev: (%.2f, %.2f)\\n", __func__, r->avg, r->std);\n-\t\tr->low = (int)(p25 - MAPPING_BOUND * (p75 - p25) + .499);\n-\t\tr->high = (int)(p75 + MAPPING_BOUND * (p75 - p25) + .499);\n-\t\tif (r->low > r->avg - MAX_STDDEV * r->std) r->low = (int)(r->avg - MAX_STDDEV'..b'\t}\n-\t\tif (is_multi[0] || is_multi[1]) goto no_pairing; // TODO: in rare cases, the true hit may be long but with low score\n-\t\t// compute mapQ for the best SE hit\n-\t\tscore_un = a[0].a[0].score + a[1].a[0].score - opt->pen_unpaired;\n-\t\t//q_pe = o && subo < o? (int)(MEM_MAPQ_COEF * (1. - (double)subo / o) * log(a[0].a[z[0]].seedcov + a[1].a[z[1]].seedcov) + .499) : 0;\n-\t\tsubo = subo > score_un? subo : score_un;\n-\t\tq_pe = raw_mapq(o - subo, opt->a);\n-\t\tif (n_sub > 0) q_pe -= (int)(4.343 * log(n_sub+1) + .499);\n-\t\tif (q_pe < 0) q_pe = 0;\n-\t\tif (q_pe > 60) q_pe = 60;\n-\t\tq_pe = (int)(q_pe * (1. - .5 * (a[0].a[0].frac_rep + a[1].a[0].frac_rep)) + .499);\n-\t\t// the following assumes no split hits\n-\t\tif (o > score_un) { // paired alignment is preferred\n-\t\t\tmem_alnreg_t *c[2];\n-\t\t\tc[0] = &a[0].a[z[0]]; c[1] = &a[1].a[z[1]];\n-\t\t\tfor (i = 0; i < 2; ++i) {\n-\t\t\t\tif (c[i]->secondary >= 0)\n-\t\t\t\t\tc[i]->sub = a[i].a[c[i]->secondary].score, c[i]->secondary = -2;\n-\t\t\t\tq_se[i] = mem_approx_mapq_se(opt, c[i]);\n-\t\t\t}\n-\t\t\tq_se[0] = q_se[0] > q_pe? q_se[0] : q_pe < q_se[0] + 40? q_pe : q_se[0] + 40;\n-\t\t\tq_se[1] = q_se[1] > q_pe? q_se[1] : q_pe < q_se[1] + 40? q_pe : q_se[1] + 40;\n-\t\t\textra_flag |= 2;\n-\t\t\t// cap at the tandem repeat score\n-\t\t\tq_se[0] = q_se[0] < raw_mapq(c[0]->score - c[0]->csub, opt->a)? q_se[0] : raw_mapq(c[0]->score - c[0]->csub, opt->a);\n-\t\t\tq_se[1] = q_se[1] < raw_mapq(c[1]->score - c[1]->csub, opt->a)? q_se[1] : raw_mapq(c[1]->score - c[1]->csub, opt->a);\n-\t\t} else { // the unpaired alignment is preferred\n-\t\t\tz[0] = z[1] = 0;\n-\t\t\tq_se[0] = mem_approx_mapq_se(opt, &a[0].a[0]);\n-\t\t\tq_se[1] = mem_approx_mapq_se(opt, &a[1].a[0]);\n-\t\t}\n-\t\t// suboptimal hits\n-\t\tif (!(opt->flag & MEM_F_ALL)) {\n-\t\t\tfor (i = 0; i < 2; ++i) {\n-\t\t\t\tint k = a[i].a[z[i]].secondary;\n-\t\t\t\tif (k >= 0) { // switch secondary and primary\n-\t\t\t\t\tassert(a[i].a[k].secondary < 0);\n-\t\t\t\t\tfor (j = 0; j < a[i].n; ++j)\n-\t\t\t\t\t\tif (a[i].a[j].secondary == k || j == k)\n-\t\t\t\t\t\t\ta[i].a[j].secondary = z[i];\n-\t\t\t\t\ta[i].a[z[i]].secondary = -1;\n-\t\t\t\t}\n-\t\t\t\tXA[i] = mem_gen_alt(opt, bns, pac, &a[i], s[i].l_seq, s[i].seq);\n-\t\t\t}\n-\t\t} else XA[0] = XA[1] = 0;\n-\t\t// write SAM\n-\t\th[0] = mem_reg2aln(opt, bns, pac, s[0].l_seq, s[0].seq, &a[0].a[z[0]]); h[0].mapq = q_se[0]; h[0].flag |= 0x40 | extra_flag; h[0].XA = XA[0]? XA[0][z[0]] : 0;\n-\t\th[1] = mem_reg2aln(opt, bns, pac, s[1].l_seq, s[1].seq, &a[1].a[z[1]]); h[1].mapq = q_se[1]; h[1].flag |= 0x80 | extra_flag; h[1].XA = XA[1]? XA[1][z[1]] : 0;\n-\t\tmem_aln2sam(bns, &str, &s[0], 1, &h[0], 0, &h[1], opt->flag&MEM_F_SOFTCLIP); s[0].sam = strdup(str.s); str.l = 0;\n-\t\tmem_aln2sam(bns, &str, &s[1], 1, &h[1], 0, &h[0], opt->flag&MEM_F_SOFTCLIP); s[1].sam = str.s;\n-\t\tif (strcmp(s[0].name, s[1].name) != 0) err_fatal(__func__, "paired reads have different names: \\"%s\\", \\"%s\\"\\n", s[0].name, s[1].name);\n-\t\tfree(h[0].cigar); // h[0].XA will be freed in the following block\n-\t\tfree(h[1].cigar);\n-\t\t// free XA\n-\t\tfor (i = 0; i < 2; ++i) {\n-\t\t\tif (XA[i]) {\n-\t\t\t\tfor (j = 0; j < a[i].n; ++j) free(XA[i][j]);\n-\t\t\t\tfree(XA[i]);\n-\t\t\t}\n-\t\t}\n-\t} else goto no_pairing;\n-\treturn n;\n-\n-no_pairing:\n-\tfor (i = 0; i < 2; ++i) {\n-\t\tif (a[i].n && a[i].a[0].score >= opt->T)\n-\t\t\th[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, &a[i].a[0]);\n-\t\telse h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, 0);\n-\t}\n-\tif (!(opt->flag & MEM_F_NOPAIRING) && h[0].rid == h[1].rid && h[0].rid >= 0) { // if the top hits from the two ends constitute a proper pair, flag it.\n-\t\tint64_t dist;\n-\t\tint d;\n-\t\td = mem_infer_dir(bns->l_pac, a[0].a[0].rb, a[1].a[0].rb, &dist);\n-\t\tif (!pes[d].failed && dist >= pes[d].low && dist <= pes[d].high) extra_flag |= 2;\n-\t}\n-\tmem_reg2sam_se(opt, bns, pac, &s[0], &a[0], 0x41|extra_flag, &h[1]);\n-\tmem_reg2sam_se(opt, bns, pac, &s[1], &a[1], 0x81|extra_flag, &h[0]);\n-\tif (strcmp(s[0].name, s[1].name) != 0) err_fatal(__func__, "paired reads have different names: \\"%s\\", \\"%s\\"\\n", s[0].name, s[1].name);\n-\tfree(h[0].cigar); free(h[1].cigar);\n-\treturn n;\n-}\n'