Next changeset 1:b9877fa1159a (2017-09-12) |
Commit message:
planemo upload commit 93e677982c3636da455de2f827a87e516c7985ac-dirty |
added:
chimerascan.xml chimerascan/__init__.py chimerascan/__init__.pyc chimerascan/bx/__init__.py chimerascan/bx/__init__.pyc chimerascan/bx/cluster.c chimerascan/bx/cluster.pyx chimerascan/bx/cluster.so chimerascan/bx/intersection.c chimerascan/bx/intersection.pyx chimerascan/bx/intersection.so chimerascan/bx/intervalcluster.c chimerascan/bx/intervalcluster.h chimerascan/chimerascan_index.py chimerascan/lib/__init__.py chimerascan/lib/__init__.pyc chimerascan/lib/base.py chimerascan/lib/base.pyc chimerascan/lib/batch_sort.py chimerascan/lib/batch_sort.pyc chimerascan/lib/chimera.py chimerascan/lib/chimera.pyc chimerascan/lib/config.py chimerascan/lib/config.pyc chimerascan/lib/fastq_to_bam.py chimerascan/lib/feature.py chimerascan/lib/feature.pyc chimerascan/lib/fix_alignment_ordering.py chimerascan/lib/fragment_size_distribution.py chimerascan/lib/fragment_size_distribution.pyc chimerascan/lib/gene_to_genome.py chimerascan/lib/gene_to_genome.pyc chimerascan/lib/gtf.py chimerascan/lib/sam.py chimerascan/lib/sam.pyc chimerascan/lib/seq.py chimerascan/lib/seq.pyc chimerascan/lib/stats.py chimerascan/pipeline/__init__.py chimerascan/pipeline/__init__.pyc chimerascan/pipeline/align_bowtie.py chimerascan/pipeline/align_bowtie.pyc chimerascan/pipeline/chimeras_to_breakpoints.py chimerascan/pipeline/chimeras_to_breakpoints.pyc chimerascan/pipeline/discordant_reads_to_bedpe.py chimerascan/pipeline/discordant_reads_to_bedpe.pyc chimerascan/pipeline/fastq_inspect_reads.py chimerascan/pipeline/fastq_inspect_reads.pyc chimerascan/pipeline/fastq_merge_trim.py chimerascan/pipeline/filter_chimeras.py chimerascan/pipeline/filter_chimeras.pyc chimerascan/pipeline/filter_homologous_genes.py chimerascan/pipeline/filter_homologous_genes.pyc chimerascan/pipeline/find_discordant_reads.py chimerascan/pipeline/find_discordant_reads.pyc chimerascan/pipeline/merge_spanning_alignments.py chimerascan/pipeline/merge_spanning_alignments.pyc chimerascan/pipeline/nominate_chimeras.py 
chimerascan/pipeline/nominate_chimeras.pyc chimerascan/pipeline/nominate_spanning_reads.py chimerascan/pipeline/nominate_spanning_reads.pyc chimerascan/pipeline/profile_insert_size.py chimerascan/pipeline/resolve_discordant_reads.py chimerascan/pipeline/resolve_discordant_reads.pyc chimerascan/pipeline/sam2bam.py chimerascan/pipeline/write_output.py chimerascan/pipeline/write_output.pyc chimerascan/pysam/COPYING chimerascan/pysam/Pileup.py chimerascan/pysam/Pileup.pyc chimerascan/pysam/__init__.py chimerascan/pysam/__init__.pyc chimerascan/pysam/_cffi_backend.so chimerascan/pysam/_yaml.so chimerascan/pysam/csamtools.c chimerascan/pysam/csamtools.pxd chimerascan/pysam/csamtools.pyx chimerascan/pysam/csamtools.so chimerascan/pysam/ctabix.c chimerascan/pysam/ctabix.pxd chimerascan/pysam/ctabix.pyx chimerascan/pysam/ctabix.so chimerascan/pysam/namedtuple.py chimerascan/pysam/pysam_util.c chimerascan/pysam/pysam_util.h chimerascan/pysam/samtools/bam.c chimerascan/pysam/samtools/bam.h chimerascan/pysam/samtools/bam_aux.c chimerascan/pysam/samtools/bam_color.c chimerascan/pysam/samtools/bam_endian.h chimerascan/pysam/samtools/bam_import.c chimerascan/pysam/samtools/bam_index.c chimerascan/pysam/samtools/bam_lpileup.c chimerascan/pysam/samtools/bam_maqcns.c chimerascan/pysam/samtools/bam_maqcns.h chimerascan/pysam/samtools/bam_mate.c chimerascan/pysam/samtools/bam_md.c chimerascan/pysam/samtools/bam_pileup.c chimerascan/pysam/samtools/bam_plcmd.c chimerascan/pysam/samtools/bam_reheader.c chimerascan/pysam/samtools/bam_rmdup.c chimerascan/pysam/samtools/bam_rmdupse.c chimerascan/pysam/samtools/bam_sort.c chimerascan/pysam/samtools/bam_stat.c chimerascan/pysam/samtools/bam_tview.c chimerascan/pysam/samtools/bgzf.c chimerascan/pysam/samtools/bgzf.h chimerascan/pysam/samtools/faidx.c chimerascan/pysam/samtools/faidx.h chimerascan/pysam/samtools/glf.c chimerascan/pysam/samtools/glf.h chimerascan/pysam/samtools/kaln.c chimerascan/pysam/samtools/kaln.h 
chimerascan/pysam/samtools/khash.h chimerascan/pysam/samtools/klist.h chimerascan/pysam/samtools/knetfile.c chimerascan/pysam/samtools/knetfile.h chimerascan/pysam/samtools/kseq.h chimerascan/pysam/samtools/ksort.h chimerascan/pysam/samtools/kstring.c chimerascan/pysam/samtools/kstring.h chimerascan/pysam/samtools/razf.c chimerascan/pysam/samtools/razf.h chimerascan/pysam/samtools/sam.c chimerascan/pysam/samtools/sam.h chimerascan/pysam/samtools/sam_header.c chimerascan/pysam/samtools/sam_header.h chimerascan/pysam/samtools/sam_view.c chimerascan/pysam/setup.cfg chimerascan/pysam/setup.py chimerascan/pysam/tabix/bam_endian.h chimerascan/pysam/tabix/bgzf.c chimerascan/pysam/tabix/bgzf.h chimerascan/pysam/tabix/bgzip.c chimerascan/pysam/tabix/index.c chimerascan/pysam/tabix/khash.h chimerascan/pysam/tabix/knetfile.c chimerascan/pysam/tabix/knetfile.h chimerascan/pysam/tabix/ksort.h chimerascan/pysam/tabix/kstring.c chimerascan/pysam/tabix/kstring.h chimerascan/pysam/tabix/tabix.h chimerascan/pysam/tests/00README.txt chimerascan/pysam/tests/Makefile chimerascan/pysam/tests/ex1.fa chimerascan/pysam/tests/ex1.sam.gz chimerascan/pysam/tests/ex3.sam chimerascan/pysam/tests/ex4.sam chimerascan/pysam/tests/ex5.sam chimerascan/pysam/tests/ex6.sam chimerascan/pysam/tests/ex7.sam chimerascan/pysam/tests/ex8.sam chimerascan/pysam/tests/example.gtf.gz chimerascan/pysam/tests/example.gtf.gz.tbi chimerascan/pysam/tests/example.py chimerascan/pysam/tests/pysam_test.py chimerascan/pysam/tests/segfault_tests.py chimerascan/pysam/tests/tabix_test.py chimerascan/pysam/version.py chimerascan/pysam/version.pyc chimerascan/test/__init__.py chimerascan/test/test_homology.py chimerascan/tools/__init__.py chimerascan/tools/chimerascan_html_table.py chimerascan/tools/gtf_to_genepred.py chimerascan/tools/make_false_positive_file.py chimerascan/tools/sortable.js chimerascan/tools/sortable_us.js chimerascan/tools/table_style.css chimerascan/tools/table_template.html chimerascan_run.py 
test-data/input1.fastq test-data/input2.fastq test-data/outputfile.bed |
b |
diff -r 000000000000 -r d85dea371064 chimerascan.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan.xml Thu Sep 07 17:55:18 2017 -0400 |
b |
<!--
  Galaxy tool wrapper for ChimeraScan.

  Runs chimerascan_run.py from the tool directory against the Bowtie index
  stored in the tool directory's "myindex" folder. Reads are supplied either
  as two separate FASTQ datasets ("paired") or as one paired dataset
  collection ("paired_collection"); the result is written as a single BED
  dataset.
-->
<tool id="chimerascan" name="ChimeraScan">
  <description>A tool for identifying chimeric transcription in sequencing data.</description>
  <!--
    The thread count follows the slots Galaxy allocated to the job
    (GALAXY_SLOTS) and falls back to the previous hard-coded value of 8 when
    the variable is unset. The dollar sign is escaped so Cheetah leaves the
    shell expansion alone.
    NOTE: keep XML comments outside <command>; the element body is Cheetah
    template text.
  -->
  <command detect_errors="exit_code" interpreter="python">
    $__tool_directory__/chimerascan_run.py -p \${GALAXY_SLOTS:-8} $__tool_directory__/myindex
    #if $input_type_conditional.chimerascan_input_type == "paired"
      $input_type_conditional.input_1 $input_type_conditional.input_2
    #else
      $input_type_conditional.input.forward $input_type_conditional.input.reverse
    #end if
    $galaxy_output
  </command>
  <inputs>
    <conditional name="input_type_conditional">
      <param name="chimerascan_input_type" type="select" label="Input Type" help="Select between paired and paired collection">
        <option value="paired" selected="true">Paired</option>
        <option value="paired_collection">Paired Collection</option>
      </param>
      <when value="paired">
        <param format='fastq' name='input_1' type='data' label='FASTQ file, forward reads' />
        <param format='fastq' name='input_2' type='data' label='FASTQ file, reverse reads' />
      </when>
      <when value="paired_collection">
        <param format="fastq" name='input' type="data_collection" collection_type="paired" label="Select a paired collection" help="Specify paired dataset collection containing paired reads"/>
      </when>
    </conditional>
  </inputs>
  <outputs>
    <data name="galaxy_output" format="bed" />
  </outputs>

  <tests>
    <!-- Two separate FASTQ datasets. Test parameter names must match the
         names declared under <inputs>, otherwise the values are not applied. -->
    <test>
      <param name="chimerascan_input_type" value="paired" />
      <param name="input_1" value="input1.fastq"/>
      <param name="input_2" value="input2.fastq"/>
      <output name="galaxy_output" file="outputfile.bed" ftype="bed"/>
    </test>
    <!-- One paired collection; the selector must be set explicitly because
         "paired" is the default branch of the conditional. -->
    <test>
      <param name="chimerascan_input_type" value="paired_collection" />
      <param name="input">
        <collection type="paired">
          <element name="forward" value="input1.fastq" />
          <element name="reverse" value="input2.fastq" />
        </collection>
      </param>
      <output name="galaxy_output" file="outputfile.bed" ftype="bed"/>
    </test>
  </tests>

  <help>
    Bowtie index files must be placed inside 'myindex folder'

    A tool for identifying chimeric transcription in sequencing data.
  </help>

</tool>
b |
diff -r 000000000000 -r d85dea371064 chimerascan/__init__.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/__init__.py Thu Sep 07 17:55:18 2017 -0400 |
b |
@@ -0,0 +1,2 @@ +# chimerascan versioning information +__version__ = "0.4.6" |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/__init__.pyc |
b |
Binary file chimerascan/__init__.pyc has changed |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/bx/__init__.pyc |
b |
Binary file chimerascan/bx/__init__.pyc has changed |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/bx/cluster.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/bx/cluster.c Thu Sep 07 17:55:18 2017 -0400 |
b |
b'@@ -0,0 +1,2316 @@\n+/* Generated by Cython 0.13 on Thu Feb 3 22:15:38 2011 */\n+\n+#define PY_SSIZE_T_CLEAN\n+#include "Python.h"\n+#ifndef Py_PYTHON_H\n+ #error Python headers needed to compile C extensions, please install development version of Python.\n+#else\n+\n+#include <stddef.h> /* For offsetof */\n+#ifndef offsetof\n+#define offsetof(type, member) ( (size_t) & ((type*)0) -> member )\n+#endif\n+\n+#if !defined(WIN32) && !defined(MS_WINDOWS)\n+ #ifndef __stdcall\n+ #define __stdcall\n+ #endif\n+ #ifndef __cdecl\n+ #define __cdecl\n+ #endif\n+ #ifndef __fastcall\n+ #define __fastcall\n+ #endif\n+#endif\n+\n+#ifndef DL_IMPORT\n+ #define DL_IMPORT(t) t\n+#endif\n+#ifndef DL_EXPORT\n+ #define DL_EXPORT(t) t\n+#endif\n+\n+#ifndef PY_LONG_LONG\n+ #define PY_LONG_LONG LONG_LONG\n+#endif\n+\n+#if PY_VERSION_HEX < 0x02040000\n+ #define METH_COEXIST 0\n+ #define PyDict_CheckExact(op) (Py_TYPE(op) == &PyDict_Type)\n+ #define PyDict_Contains(d,o) PySequence_Contains(d,o)\n+#endif\n+\n+#if PY_VERSION_HEX < 0x02050000\n+ typedef int Py_ssize_t;\n+ #define PY_SSIZE_T_MAX INT_MAX\n+ #define PY_SSIZE_T_MIN INT_MIN\n+ #define PY_FORMAT_SIZE_T ""\n+ #define PyInt_FromSsize_t(z) PyInt_FromLong(z)\n+ #define PyInt_AsSsize_t(o) PyInt_AsLong(o)\n+ #define PyNumber_Index(o) PyNumber_Int(o)\n+ #define PyIndex_Check(o) PyNumber_Check(o)\n+ #define PyErr_WarnEx(category, message, stacklevel) PyErr_Warn(category, message)\n+#endif\n+\n+#if PY_VERSION_HEX < 0x02060000\n+ #define Py_REFCNT(ob) (((PyObject*)(ob))->ob_refcnt)\n+ #define Py_TYPE(ob) (((PyObject*)(ob))->ob_type)\n+ #define Py_SIZE(ob) (((PyVarObject*)(ob))->ob_size)\n+ #define PyVarObject_HEAD_INIT(type, size) \\\n+ PyObject_HEAD_INIT(type) size,\n+ #define PyType_Modified(t)\n+\n+ typedef struct {\n+ void *buf;\n+ PyObject *obj;\n+ Py_ssize_t len;\n+ Py_ssize_t itemsize;\n+ int readonly;\n+ int ndim;\n+ char *format;\n+ Py_ssize_t *shape;\n+ Py_ssize_t *strides;\n+ Py_ssize_t *suboffsets;\n+ void *internal;\n+ } 
Py_buffer;\n+\n+ #define PyBUF_SIMPLE 0\n+ #define PyBUF_WRITABLE 0x0001\n+ #define PyBUF_FORMAT 0x0004\n+ #define PyBUF_ND 0x0008\n+ #define PyBUF_STRIDES (0x0010 | PyBUF_ND)\n+ #define PyBUF_C_CONTIGUOUS (0x0020 | PyBUF_STRIDES)\n+ #define PyBUF_F_CONTIGUOUS (0x0040 | PyBUF_STRIDES)\n+ #define PyBUF_ANY_CONTIGUOUS (0x0080 | PyBUF_STRIDES)\n+ #define PyBUF_INDIRECT (0x0100 | PyBUF_STRIDES)\n+\n+#endif\n+\n+#if PY_MAJOR_VERSION < 3\n+ #define __Pyx_BUILTIN_MODULE_NAME "__builtin__"\n+#else\n+ #define __Pyx_BUILTIN_MODULE_NAME "builtins"\n+#endif\n+\n+#if PY_MAJOR_VERSION >= 3\n+ #define Py_TPFLAGS_CHECKTYPES 0\n+ #define Py_TPFLAGS_HAVE_INDEX 0\n+#endif\n+\n+#if (PY_VERSION_HEX < 0x02060000) || (PY_MAJOR_VERSION >= 3)\n+ #define Py_TPFLAGS_HAVE_NEWBUFFER 0\n+#endif\n+\n+#if PY_MAJOR_VERSION >= 3\n+ #define PyBaseString_Type PyUnicode_Type\n+ #define PyStringObject PyUnicodeObject\n+ #define PyString_Type PyUnicode_Type\n+ #define PyString_Check PyUnicode_Check\n+ #define PyString_CheckExact PyUnicode_CheckExact\n+#endif\n+\n+#if PY_VERSION_HEX < 0x02060000\n+ #define PyBytesObject PyStringObject\n+ #define PyBytes_Type PyString_Type\n+ #define PyBytes_Check PyString_Check\n+ #define PyBytes_CheckExact PyString_CheckExact\n+ #define PyBytes_FromString PyString_FromString\n+ #define PyBytes_FromStringAndSize PyString_FromStringAndSize\n+ #define PyBytes_FromFormat PyString_FromFormat\n+ #define PyBytes_DecodeEscape PyString_DecodeEscape\n+ #define PyBytes_AsString PyString_AsString\n+ #define PyBytes_AsStringAndSize PyString_AsStringAndSize\n+ #define PyBytes_Size PyString_Size\n+ #define PyBytes_AS_STRING PyString_AS_STRING\n+ #define PyBytes_GET_SIZE PyString_GET_SIZE\n+ #define PyBytes_Repr PyString_Repr\n+ #define PyBytes_Concat '..b'\n+ if (!py_code) goto bad;\n+ py_frame = PyFrame_New(\n+ PyThreadState_GET(), /*PyThreadState *tstate,*/\n+ py_code, /*PyCodeObject *code,*/\n+ py_globals, /*PyObject *globals,*/\n+ 0 /*PyObject *locals*/\n+ );\n+ if (!py_frame) 
goto bad;\n+ py_frame->f_lineno = __pyx_lineno;\n+ PyTraceBack_Here(py_frame);\n+bad:\n+ Py_XDECREF(py_srcfile);\n+ Py_XDECREF(py_funcname);\n+ Py_XDECREF(py_code);\n+ Py_XDECREF(py_frame);\n+}\n+\n+static int __Pyx_InitStrings(__Pyx_StringTabEntry *t) {\n+ while (t->p) {\n+ #if PY_MAJOR_VERSION < 3\n+ if (t->is_unicode) {\n+ *t->p = PyUnicode_DecodeUTF8(t->s, t->n - 1, NULL);\n+ } else if (t->intern) {\n+ *t->p = PyString_InternFromString(t->s);\n+ } else {\n+ *t->p = PyString_FromStringAndSize(t->s, t->n - 1);\n+ }\n+ #else /* Python 3+ has unicode identifiers */\n+ if (t->is_unicode | t->is_str) {\n+ if (t->intern) {\n+ *t->p = PyUnicode_InternFromString(t->s);\n+ } else if (t->encoding) {\n+ *t->p = PyUnicode_Decode(t->s, t->n - 1, t->encoding, NULL);\n+ } else {\n+ *t->p = PyUnicode_FromStringAndSize(t->s, t->n - 1);\n+ }\n+ } else {\n+ *t->p = PyBytes_FromStringAndSize(t->s, t->n - 1);\n+ }\n+ #endif\n+ if (!*t->p)\n+ return -1;\n+ ++t;\n+ }\n+ return 0;\n+}\n+\n+/* Type Conversion Functions */\n+\n+static CYTHON_INLINE int __Pyx_PyObject_IsTrue(PyObject* x) {\n+ int is_true = x == Py_True;\n+ if (is_true | (x == Py_False) | (x == Py_None)) return is_true;\n+ else return PyObject_IsTrue(x);\n+}\n+\n+static CYTHON_INLINE PyObject* __Pyx_PyNumber_Int(PyObject* x) {\n+ PyNumberMethods *m;\n+ const char *name = NULL;\n+ PyObject *res = NULL;\n+#if PY_VERSION_HEX < 0x03000000\n+ if (PyInt_Check(x) || PyLong_Check(x))\n+#else\n+ if (PyLong_Check(x))\n+#endif\n+ return Py_INCREF(x), x;\n+ m = Py_TYPE(x)->tp_as_number;\n+#if PY_VERSION_HEX < 0x03000000\n+ if (m && m->nb_int) {\n+ name = "int";\n+ res = PyNumber_Int(x);\n+ }\n+ else if (m && m->nb_long) {\n+ name = "long";\n+ res = PyNumber_Long(x);\n+ }\n+#else\n+ if (m && m->nb_int) {\n+ name = "int";\n+ res = PyNumber_Long(x);\n+ }\n+#endif\n+ if (res) {\n+#if PY_VERSION_HEX < 0x03000000\n+ if (!PyInt_Check(res) && !PyLong_Check(res)) {\n+#else\n+ if (!PyLong_Check(res)) {\n+#endif\n+ 
PyErr_Format(PyExc_TypeError,\n+ "__%s__ returned non-%s (type %.200s)",\n+ name, name, Py_TYPE(res)->tp_name);\n+ Py_DECREF(res);\n+ return NULL;\n+ }\n+ }\n+ else if (!PyErr_Occurred()) {\n+ PyErr_SetString(PyExc_TypeError,\n+ "an integer is required");\n+ }\n+ return res;\n+}\n+\n+static CYTHON_INLINE Py_ssize_t __Pyx_PyIndex_AsSsize_t(PyObject* b) {\n+ Py_ssize_t ival;\n+ PyObject* x = PyNumber_Index(b);\n+ if (!x) return -1;\n+ ival = PyInt_AsSsize_t(x);\n+ Py_DECREF(x);\n+ return ival;\n+}\n+\n+static CYTHON_INLINE PyObject * __Pyx_PyInt_FromSize_t(size_t ival) {\n+#if PY_VERSION_HEX < 0x02050000\n+ if (ival <= LONG_MAX)\n+ return PyInt_FromLong((long)ival);\n+ else {\n+ unsigned char *bytes = (unsigned char *) &ival;\n+ int one = 1; int little = (int)*(unsigned char*)&one;\n+ return _PyLong_FromByteArray(bytes, sizeof(size_t), little, 0);\n+ }\n+#else\n+ return PyInt_FromSize_t(ival);\n+#endif\n+}\n+\n+static CYTHON_INLINE size_t __Pyx_PyInt_AsSize_t(PyObject* x) {\n+ unsigned PY_LONG_LONG val = __Pyx_PyInt_AsUnsignedLongLong(x);\n+ if (unlikely(val == (unsigned PY_LONG_LONG)-1 && PyErr_Occurred())) {\n+ return (size_t)-1;\n+ } else if (unlikely(val != (unsigned PY_LONG_LONG)(size_t)val)) {\n+ PyErr_SetString(PyExc_OverflowError,\n+ "value too large to convert to size_t");\n+ return (size_t)-1;\n+ }\n+ return (size_t)val;\n+}\n+\n+\n+#endif /* Py_PYTHON_H */\n' |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/bx/cluster.pyx --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/bx/cluster.pyx Thu Sep 07 17:55:18 2017 -0400 |
[ |
"""
Downloaded from:
https://bitbucket.org/james_taylor/bx-python/wiki/Home

Kanwei Li, 2009
Inspired by previous ClusterTree

Provides a ClusterTree data structure that supports efficient finding of
clusters of intervals that are within a certain distance apart.

This clustering algorithm uses a binary tree structure. Nodes correspond to
non-overlapping intervals, where overlapping means that the distance between
two intervals is less or equal to the max separation.

The tree self-balances using rotations based on the binomial sequence. Merges
among nodes are performed whenever a node is changed/added that will cause other
nodes to form a new cluster.

C source code is in intervalcluster.c
"""

cdef extern from "intervalcluster.h":

    cdef struct struct_interval:
        int start
        int end
        int id
        struct_interval * next

    ctypedef struct_interval interval

    cdef struct struct_clusternode:
        int start
        int end
        struct_interval *interval_head
        struct_interval *interval_tail

    ctypedef struct_clusternode clusternode

    cdef struct struct_clustertree:
        int max_dist
        int min_intervals

        struct_clusternode *root

    ctypedef struct_clustertree clustertree

    cdef struct struct_treeitr:
        struct_treeitr *next
        struct_clusternode *node

    ctypedef struct_treeitr treeitr

    clusternode* clusternode_insert(clustertree *tree, clusternode *node, int start, int end, int id)
    clustertree* create_clustertree(int max_dist, int min_intervals)
    treeitr* clusteritr(clustertree *tree)
    void freeclusteritr(treeitr *itr)
    void free_tree(clustertree *tree)

cdef class ClusterTree:
    """
    Python wrapper around the C cluster tree declared in intervalcluster.h.

    ``mincols`` is passed to ``create_clustertree`` as ``max_dist`` and
    ``minregions`` as ``min_intervals``; both are also kept on the instance.
    """
    cdef clustertree *tree
    cdef int mincols
    cdef int minregions

    def __cinit__(self, mincols, minregions):
        self.tree = create_clustertree(mincols, minregions)
        # Fail here rather than dereferencing a NULL tree later in insert().
        if self.tree == NULL:
            raise MemoryError("could not allocate cluster tree")
        self.mincols = mincols
        self.minregions = minregions

    def __dealloc__(self):
        # __dealloc__ also runs when __cinit__ raised before the tree was
        # created (e.g. a non-integer argument), so the pointer may be NULL.
        if self.tree != NULL:
            free_tree(self.tree)
            self.tree = NULL

    def insert(self, s, e, id):
        ''' Insert an interval with start, end, id as parameters.

        Raises ValueError when start is greater than end.
        '''
        if s > e: raise ValueError("Interval start must be before end")
        self.tree.root = clusternode_insert(self.tree, self.tree.root, s, e, id)

    def getregions(self):
        ''' Returns a list of clusters in ascending order of starting position.
        Each cluster is a tuple of (start, end, [sorted ids of intervals in cluster])

        tree = ClusterTree(0, 0)
        Insert (6, 7, 1), (1, 2, 3), (9, 10, 2), (3, 4, 0), (3, 8, 4)
        tree.getregions() returns [(1, 2, [3]), (3, 8, [0, 1, 4]), (9, 10, [2])]
        '''
        cdef treeitr *itr
        cdef treeitr *head
        cdef interval *ival

        regions = []
        # clusteritr() hands back an iterator list that the caller must
        # release with freeclusteritr(); keep the head so it can be freed.
        head = clusteritr(self.tree)
        itr = head
        try:
            while itr != NULL:
                ids = []
                ival = itr.node.interval_head
                while ival != NULL:
                    ids.append(ival.id)
                    ival = ival.next

                regions.append( (itr.node.start, itr.node.end, sorted(ids)) )
                itr = itr.next
        finally:
            freeclusteritr(head)
        return regions

    def getlines(self):
        ''' Similar to getregions except it just returns a list of ids of intervals
        The above example would return [3, 0, 1, 4, 2]
        '''
        cdef treeitr *itr
        cdef treeitr *head
        cdef interval *ival

        lines = []
        # Remember the head of the iterator list: the loop advances `itr` to
        # NULL, and without the head the list could never be released.
        head = clusteritr(self.tree)
        itr = head
        try:
            while itr != NULL:
                ids = []
                ival = itr.node.interval_head
                while ival != NULL:
                    ids.append(ival.id)
                    ival = ival.next

                lines.extend(sorted(ids))
                itr = itr.next
        finally:
            # Same cleanup as getregions(); previously this list was leaked
            # on every call.
            freeclusteritr(head)
        return lines
b |
diff -r 000000000000 -r d85dea371064 chimerascan/bx/cluster.so |
b |
Binary file chimerascan/bx/cluster.so has changed |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/bx/intersection.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/bx/intersection.c Thu Sep 07 17:55:18 2017 -0400 |
b |
b'@@ -0,0 +1,7517 @@\n+/* Generated by Cython 0.13 on Thu Feb 3 22:15:44 2011 */\n+\n+#define PY_SSIZE_T_CLEAN\n+#include "Python.h"\n+#ifndef Py_PYTHON_H\n+ #error Python headers needed to compile C extensions, please install development version of Python.\n+#else\n+\n+#include <stddef.h> /* For offsetof */\n+#ifndef offsetof\n+#define offsetof(type, member) ( (size_t) & ((type*)0) -> member )\n+#endif\n+\n+#if !defined(WIN32) && !defined(MS_WINDOWS)\n+ #ifndef __stdcall\n+ #define __stdcall\n+ #endif\n+ #ifndef __cdecl\n+ #define __cdecl\n+ #endif\n+ #ifndef __fastcall\n+ #define __fastcall\n+ #endif\n+#endif\n+\n+#ifndef DL_IMPORT\n+ #define DL_IMPORT(t) t\n+#endif\n+#ifndef DL_EXPORT\n+ #define DL_EXPORT(t) t\n+#endif\n+\n+#ifndef PY_LONG_LONG\n+ #define PY_LONG_LONG LONG_LONG\n+#endif\n+\n+#if PY_VERSION_HEX < 0x02040000\n+ #define METH_COEXIST 0\n+ #define PyDict_CheckExact(op) (Py_TYPE(op) == &PyDict_Type)\n+ #define PyDict_Contains(d,o) PySequence_Contains(d,o)\n+#endif\n+\n+#if PY_VERSION_HEX < 0x02050000\n+ typedef int Py_ssize_t;\n+ #define PY_SSIZE_T_MAX INT_MAX\n+ #define PY_SSIZE_T_MIN INT_MIN\n+ #define PY_FORMAT_SIZE_T ""\n+ #define PyInt_FromSsize_t(z) PyInt_FromLong(z)\n+ #define PyInt_AsSsize_t(o) PyInt_AsLong(o)\n+ #define PyNumber_Index(o) PyNumber_Int(o)\n+ #define PyIndex_Check(o) PyNumber_Check(o)\n+ #define PyErr_WarnEx(category, message, stacklevel) PyErr_Warn(category, message)\n+#endif\n+\n+#if PY_VERSION_HEX < 0x02060000\n+ #define Py_REFCNT(ob) (((PyObject*)(ob))->ob_refcnt)\n+ #define Py_TYPE(ob) (((PyObject*)(ob))->ob_type)\n+ #define Py_SIZE(ob) (((PyVarObject*)(ob))->ob_size)\n+ #define PyVarObject_HEAD_INIT(type, size) \\\n+ PyObject_HEAD_INIT(type) size,\n+ #define PyType_Modified(t)\n+\n+ typedef struct {\n+ void *buf;\n+ PyObject *obj;\n+ Py_ssize_t len;\n+ Py_ssize_t itemsize;\n+ int readonly;\n+ int ndim;\n+ char *format;\n+ Py_ssize_t *shape;\n+ Py_ssize_t *strides;\n+ Py_ssize_t *suboffsets;\n+ void *internal;\n+ } 
Py_buffer;\n+\n+ #define PyBUF_SIMPLE 0\n+ #define PyBUF_WRITABLE 0x0001\n+ #define PyBUF_FORMAT 0x0004\n+ #define PyBUF_ND 0x0008\n+ #define PyBUF_STRIDES (0x0010 | PyBUF_ND)\n+ #define PyBUF_C_CONTIGUOUS (0x0020 | PyBUF_STRIDES)\n+ #define PyBUF_F_CONTIGUOUS (0x0040 | PyBUF_STRIDES)\n+ #define PyBUF_ANY_CONTIGUOUS (0x0080 | PyBUF_STRIDES)\n+ #define PyBUF_INDIRECT (0x0100 | PyBUF_STRIDES)\n+\n+#endif\n+\n+#if PY_MAJOR_VERSION < 3\n+ #define __Pyx_BUILTIN_MODULE_NAME "__builtin__"\n+#else\n+ #define __Pyx_BUILTIN_MODULE_NAME "builtins"\n+#endif\n+\n+#if PY_MAJOR_VERSION >= 3\n+ #define Py_TPFLAGS_CHECKTYPES 0\n+ #define Py_TPFLAGS_HAVE_INDEX 0\n+#endif\n+\n+#if (PY_VERSION_HEX < 0x02060000) || (PY_MAJOR_VERSION >= 3)\n+ #define Py_TPFLAGS_HAVE_NEWBUFFER 0\n+#endif\n+\n+#if PY_MAJOR_VERSION >= 3\n+ #define PyBaseString_Type PyUnicode_Type\n+ #define PyStringObject PyUnicodeObject\n+ #define PyString_Type PyUnicode_Type\n+ #define PyString_Check PyUnicode_Check\n+ #define PyString_CheckExact PyUnicode_CheckExact\n+#endif\n+\n+#if PY_VERSION_HEX < 0x02060000\n+ #define PyBytesObject PyStringObject\n+ #define PyBytes_Type PyString_Type\n+ #define PyBytes_Check PyString_Check\n+ #define PyBytes_CheckExact PyString_CheckExact\n+ #define PyBytes_FromString PyString_FromString\n+ #define PyBytes_FromStringAndSize PyString_FromStringAndSize\n+ #define PyBytes_FromFormat PyString_FromFormat\n+ #define PyBytes_DecodeEscape PyString_DecodeEscape\n+ #define PyBytes_AsString PyString_AsString\n+ #define PyBytes_AsStringAndSize PyString_AsStringAndSize\n+ #define PyBytes_Size PyString_Size\n+ #define PyBytes_AS_STRING PyString_AS_STRING\n+ #define PyBytes_GET_SIZE PyString_GET_SIZE\n+ #define PyBytes_Repr PyString_Repr\n+ #define PyBytes_Concat '..b'\n+ if (!py_code) goto bad;\n+ py_frame = PyFrame_New(\n+ PyThreadState_GET(), /*PyThreadState *tstate,*/\n+ py_code, /*PyCodeObject *code,*/\n+ py_globals, /*PyObject *globals,*/\n+ 0 /*PyObject *locals*/\n+ );\n+ if (!py_frame) 
goto bad;\n+ py_frame->f_lineno = __pyx_lineno;\n+ PyTraceBack_Here(py_frame);\n+bad:\n+ Py_XDECREF(py_srcfile);\n+ Py_XDECREF(py_funcname);\n+ Py_XDECREF(py_code);\n+ Py_XDECREF(py_frame);\n+}\n+\n+static int __Pyx_InitStrings(__Pyx_StringTabEntry *t) {\n+ while (t->p) {\n+ #if PY_MAJOR_VERSION < 3\n+ if (t->is_unicode) {\n+ *t->p = PyUnicode_DecodeUTF8(t->s, t->n - 1, NULL);\n+ } else if (t->intern) {\n+ *t->p = PyString_InternFromString(t->s);\n+ } else {\n+ *t->p = PyString_FromStringAndSize(t->s, t->n - 1);\n+ }\n+ #else /* Python 3+ has unicode identifiers */\n+ if (t->is_unicode | t->is_str) {\n+ if (t->intern) {\n+ *t->p = PyUnicode_InternFromString(t->s);\n+ } else if (t->encoding) {\n+ *t->p = PyUnicode_Decode(t->s, t->n - 1, t->encoding, NULL);\n+ } else {\n+ *t->p = PyUnicode_FromStringAndSize(t->s, t->n - 1);\n+ }\n+ } else {\n+ *t->p = PyBytes_FromStringAndSize(t->s, t->n - 1);\n+ }\n+ #endif\n+ if (!*t->p)\n+ return -1;\n+ ++t;\n+ }\n+ return 0;\n+}\n+\n+/* Type Conversion Functions */\n+\n+static CYTHON_INLINE int __Pyx_PyObject_IsTrue(PyObject* x) {\n+ int is_true = x == Py_True;\n+ if (is_true | (x == Py_False) | (x == Py_None)) return is_true;\n+ else return PyObject_IsTrue(x);\n+}\n+\n+static CYTHON_INLINE PyObject* __Pyx_PyNumber_Int(PyObject* x) {\n+ PyNumberMethods *m;\n+ const char *name = NULL;\n+ PyObject *res = NULL;\n+#if PY_VERSION_HEX < 0x03000000\n+ if (PyInt_Check(x) || PyLong_Check(x))\n+#else\n+ if (PyLong_Check(x))\n+#endif\n+ return Py_INCREF(x), x;\n+ m = Py_TYPE(x)->tp_as_number;\n+#if PY_VERSION_HEX < 0x03000000\n+ if (m && m->nb_int) {\n+ name = "int";\n+ res = PyNumber_Int(x);\n+ }\n+ else if (m && m->nb_long) {\n+ name = "long";\n+ res = PyNumber_Long(x);\n+ }\n+#else\n+ if (m && m->nb_int) {\n+ name = "int";\n+ res = PyNumber_Long(x);\n+ }\n+#endif\n+ if (res) {\n+#if PY_VERSION_HEX < 0x03000000\n+ if (!PyInt_Check(res) && !PyLong_Check(res)) {\n+#else\n+ if (!PyLong_Check(res)) {\n+#endif\n+ 
PyErr_Format(PyExc_TypeError,\n+ "__%s__ returned non-%s (type %.200s)",\n+ name, name, Py_TYPE(res)->tp_name);\n+ Py_DECREF(res);\n+ return NULL;\n+ }\n+ }\n+ else if (!PyErr_Occurred()) {\n+ PyErr_SetString(PyExc_TypeError,\n+ "an integer is required");\n+ }\n+ return res;\n+}\n+\n+static CYTHON_INLINE Py_ssize_t __Pyx_PyIndex_AsSsize_t(PyObject* b) {\n+ Py_ssize_t ival;\n+ PyObject* x = PyNumber_Index(b);\n+ if (!x) return -1;\n+ ival = PyInt_AsSsize_t(x);\n+ Py_DECREF(x);\n+ return ival;\n+}\n+\n+static CYTHON_INLINE PyObject * __Pyx_PyInt_FromSize_t(size_t ival) {\n+#if PY_VERSION_HEX < 0x02050000\n+ if (ival <= LONG_MAX)\n+ return PyInt_FromLong((long)ival);\n+ else {\n+ unsigned char *bytes = (unsigned char *) &ival;\n+ int one = 1; int little = (int)*(unsigned char*)&one;\n+ return _PyLong_FromByteArray(bytes, sizeof(size_t), little, 0);\n+ }\n+#else\n+ return PyInt_FromSize_t(ival);\n+#endif\n+}\n+\n+static CYTHON_INLINE size_t __Pyx_PyInt_AsSize_t(PyObject* x) {\n+ unsigned PY_LONG_LONG val = __Pyx_PyInt_AsUnsignedLongLong(x);\n+ if (unlikely(val == (unsigned PY_LONG_LONG)-1 && PyErr_Occurred())) {\n+ return (size_t)-1;\n+ } else if (unlikely(val != (unsigned PY_LONG_LONG)(size_t)val)) {\n+ PyErr_SetString(PyExc_OverflowError,\n+ "value too large to convert to size_t");\n+ return (size_t)-1;\n+ }\n+ return (size_t)val;\n+}\n+\n+\n+#endif /* Py_PYTHON_H */\n' |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/bx/intersection.pyx --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/bx/intersection.pyx Thu Sep 07 17:55:18 2017 -0400 |
[ |
b'@@ -0,0 +1,472 @@\n+"""\n+Downloaded from:\n+https://bitbucket.org/james_taylor/bx-python/wiki/Home\n+\n+Data structure for performing intersect queries on a set of intervals which\n+preserves all information about the intervals (unlike bitset projection methods).\n+\n+:Authors: James Taylor (james@jamestaylor.org),\n+ Ian Schenk (ian.schenck@gmail.com),\n+ Brent Pedersen (bpederse@gmail.com)\n+"""\n+\n+# Historical note:\n+# This module original contained an implementation based on sorted endpoints\n+# and a binary search, using an idea from Scott Schwartz and Piotr Berman.\n+# Later an interval tree implementation was implemented by Ian for Galaxy\'s\n+# join tool (see `bx.intervals.operations.quicksect.py`). This was then\n+# converted to Cython by Brent, who also added support for\n+# upstream/downstream/neighbor queries. This was modified by James to\n+# handle half-open intervals strictly, to maintain sort order, and to\n+# implement the same interface as the original Intersecter.\n+\n+import operator\n+\n+cdef extern from "stdlib.h":\n+ int ceil(float f)\n+ float log(float f)\n+ int RAND_MAX\n+ int rand()\n+ int strlen(char *)\n+ int iabs(int)\n+\n+cdef inline int imax2(int a, int b):\n+ if b > a: return b\n+ return a\n+\n+cdef inline int imax3(int a, int b, int c):\n+ if b > a:\n+ if c > b:\n+ return c\n+ return b\n+ if a > c:\n+ return a\n+ return c\n+\n+cdef inline int imin3(int a, int b, int c):\n+ if b < a:\n+ if c < b:\n+ return c\n+ return b\n+ if a < c:\n+ return a\n+ return c\n+\n+cdef inline int imin2(int a, int b):\n+ if b < a: return b\n+ return a\n+\n+cdef float nlog = -1.0 / log(0.5)\n+\n+cdef class IntervalNode:\n+ """\n+ A single node of an `IntervalTree`.\n+ \n+ NOTE: Unless you really know what you are doing, you probably should us\n+ `IntervalTree` rather than using this directly. 
\n+ """\n+ cdef float priority\n+ cdef public object interval\n+ cdef public int start, end\n+ cdef int minend, maxend, minstart\n+ cdef IntervalNode cleft, cright, croot\n+\n+ property left_node:\n+ def __get__(self):\n+ return self.cleft if self.cleft is not EmptyNode else None\n+ property right_node:\n+ def __get__(self):\n+ return self.cright if self.cright is not EmptyNode else None\n+ property root_node:\n+ def __get__(self):\n+ return self.croot if self.croot is not EmptyNode else None\n+ \n+ def __repr__(self):\n+ return "IntervalNode(%i, %i)" % (self.start, self.end)\n+\n+ def __cinit__(IntervalNode self, int start, int end, object interval):\n+ # Python lacks the binomial distribution, so we convert a\n+ # uniform into a binomial because it naturally scales with\n+ # tree size. Also, python\'s uniform is perfect since the\n+ # upper limit is not inclusive, which gives us undefined here.\n+ self.priority = ceil(nlog * log(-1.0/(1.0 * rand()/RAND_MAX - 1)))\n+ self.start = start\n+ self.end = end\n+ self.interval = interval\n+ self.maxend = end\n+ self.minstart = start\n+ self.minend = end\n+ self.cleft = EmptyNode\n+ self.cright = EmptyNode\n+ self.croot = EmptyNode\n+ \n+ cpdef IntervalNode insert(IntervalNode self, int start, int end, object interval):\n+ """\n+ Insert a new IntervalNode into the tree of which this node is\n+ currently the root. 
The return value is the new root of the tree (which\n+ may or may not be this node!)\n+ """\n+ cdef IntervalNode croot = self\n+ # If starts are the same, decide which to add interval to based on\n+ # end, thus maintaining sortedness relative to start/end\n+ cdef int decision_endpoint = start\n+ if start == self.start:\n+ decision_endpoint = end\n+ \n+ if decision_endpoint > self.start:\n+ #'..b'(1, 2, strand="-"), num_intervals=3)\n+ [Interval(3, 7), Interval(3, 40), Interval(13, 50)]\n+\n+ \n+ """\n+ \n+ cdef IntervalNode root\n+ \n+ def __cinit__( self ):\n+ root = None\n+ \n+ # ---- Position based interfaces -----------------------------------------\n+ \n+ def insert( self, int start, int end, object value=None ):\n+ """\n+ Insert the interval [start,end) associated with value `value`.\n+ """\n+ if self.root is None:\n+ self.root = IntervalNode( start, end, value )\n+ else:\n+ self.root = self.root.insert( start, end, value )\n+ \n+ add = insert\n+\n+\n+ def find( self, start, end ):\n+ """\n+ Return a sorted list of all intervals overlapping [start,end).\n+ """\n+ if self.root is None:\n+ return []\n+ return self.root.find( start, end )\n+ \n+ def before( self, position, num_intervals=1, max_dist=2500 ):\n+ """\n+ Find `num_intervals` intervals that lie before `position` and are no\n+ further than `max_dist` positions away\n+ """\n+ if self.root is None:\n+ return []\n+ return self.root.left( position, num_intervals, max_dist )\n+\n+ def after( self, position, num_intervals=1, max_dist=2500 ):\n+ """\n+ Find `num_intervals` intervals that lie after `position` and are no\n+ further than `max_dist` positions away\n+ """\n+ if self.root is None:\n+ return []\n+ return self.root.right( position, num_intervals, max_dist )\n+\n+ # ---- Interval-like object based interfaces -----------------------------\n+\n+ def insert_interval( self, interval ):\n+ """\n+ Insert an "interval" like object (one with at least start and end\n+ attributes)\n+ """\n+ self.insert( 
interval.start, interval.end, interval )\n+\n+ add_interval = insert_interval\n+\n+ def before_interval( self, interval, num_intervals=1, max_dist=2500 ):\n+ """\n+ Find `num_intervals` intervals that lie completely before `interval`\n+ and are no further than `max_dist` positions away\n+ """\n+ if self.root is None:\n+ return []\n+ return self.root.left( interval.start, num_intervals, max_dist )\n+\n+ def after_interval( self, interval, num_intervals=1, max_dist=2500 ):\n+ """\n+ Find `num_intervals` intervals that lie completely after `interval` and\n+ are no further than `max_dist` positions away\n+ """\n+ if self.root is None:\n+ return []\n+ return self.root.right( interval.end, num_intervals, max_dist )\n+\n+ def upstream_of_interval( self, interval, num_intervals=1, max_dist=2500 ):\n+ """\n+ Find `num_intervals` intervals that lie completely upstream of\n+ `interval` and are no further than `max_dist` positions away\n+ """\n+ if self.root is None:\n+ return []\n+ if interval.strand == -1 or interval.strand == "-":\n+ return self.root.right( interval.end, num_intervals, max_dist )\n+ else:\n+ return self.root.left( interval.start, num_intervals, max_dist )\n+\n+ def downstream_of_interval( self, interval, num_intervals=1, max_dist=2500 ):\n+ """\n+ Find `num_intervals` intervals that lie completely downstream of\n+ `interval` and are no further than `max_dist` positions away\n+ """\n+ if self.root is None:\n+ return []\n+ if interval.strand == -1 or interval.strand == "-":\n+ return self.root.left( interval.start, num_intervals, max_dist )\n+ else:\n+ return self.root.right( interval.end, num_intervals, max_dist )\n+ \n+ def traverse(self, fn):\n+ """\n+ call fn for each element in the tree\n+ """\n+ if self.root is None:\n+ return None\n+ return self.root.traverse(fn)\n+\n+# For backward compatibility\n+Intersecter = IntervalTree\n' |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/bx/intersection.so |
b |
Binary file chimerascan/bx/intersection.so has changed |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/bx/intervalcluster.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/bx/intervalcluster.c Thu Sep 07 17:55:18 2017 -0400 |
b |
b'@@ -0,0 +1,268 @@\n+/*\n+\tDownloaded from:\n+\thttps://bitbucket.org/james_taylor/bx-python/wiki/Home\n+\n+ Kanwei Li, 2009\n+ Inspired by previous ClusterTree\n+ \n+ This clustering algorithm uses a binary tree structure. Nodes correspond to \n+ non-overlapping intervals, where overlapping means that the distance between\n+ two intervals is less or equal to max_dist, which is the max separation.\n+ \n+ The tree self-balances using rotations based on the binomial sequence. Merges\n+ among nodes are performed whenever a node is changed/added that will cause other\n+ nodes to form a new cluster.\n+*/\n+#include <stdlib.h>\n+#include <stdio.h>\n+#include <math.h>\n+#include "intervalcluster.h"\n+\n+#define ALLOC(pt) (malloc(sizeof(pt)))\n+\n+static int min(int a, int b) {\n+ if( a < b )\n+ return a;\n+ else\n+ return b;\n+}\n+\n+static int max(int a, int b) {\n+ if( a > b )\n+ return a;\n+ else\n+ return b;\n+}\n+\n+/* Create new tree with given max_dist (max distance between intervals to be\n+ considered a cluster), and min_intervals, the minimum number of intervals\n+ needed for a cluster to be considered significant */\n+clustertree* create_clustertree(int max_dist, int min_intervals) {\n+ clustertree *tree = ALLOC(clustertree);\n+ tree->max_dist = max_dist;\n+ tree->min_intervals = min_intervals;\n+ tree->root = NULL;\n+ return tree;\n+}\n+\n+static interval* create_interval(int start, int end, int id) {\n+ interval *ival = ALLOC(interval);\n+ \n+ ival->start = start;\n+ ival->end = end;\n+ ival->id = id;\n+ ival->next = NULL;\n+ return ival;\n+}\n+\n+static clusternode* create_node(int start, int end, int id) {\n+ clusternode *new_node = ALLOC(clusternode);\n+ \n+ new_node->start = start;\n+ new_node->end = end;\n+ new_node->interval_head = create_interval(start, end, id);\n+ new_node->interval_tail = new_node->interval_head;\n+ new_node->num_ivals = 1;\n+ new_node->left = NULL;\n+ new_node->right = NULL;\n+ \n+ double uniform = ((double)rand()) / 
(RAND_MAX);\n+ if (uniform == 1.0)\n+ uniform = 0;\n+ new_node->priority = (int)ceil( (-1.0 / log(.5)) * log( -1.0 / (uniform - 1)));\n+ \n+ return new_node;\n+}\n+\n+static void recursively_free_intervals(interval *ival) {\n+ interval *next;\n+ if(ival) {\n+ next = ival->next;\n+ free(ival);\n+ recursively_free_intervals(next);\n+ }\n+}\n+\n+static void recursively_free_nodes(clusternode *node) {\n+ if(node) {\n+ recursively_free_nodes(node->left);\n+ recursively_free_nodes(node->right);\n+ recursively_free_intervals(node->interval_head);\n+ free(node);\n+ }\n+}\n+\n+void free_tree(clustertree *tree) {\n+ recursively_free_nodes(tree->root);\n+ free(tree);\n+}\n+\n+void cluster_rotateright(clusternode **node) {\n+ clusternode* root = (*node)->left;\n+ (*node)->left = (*node)->left->right;\n+ root->right = (*node);\n+ *node = root;\n+}\n+\n+void cluster_rotateleft(clusternode **node) {\n+ clusternode* root = (*node)->right;\n+ (*node)->right = (*node)->right->left;\n+ root->left = (*node);\n+ *node = root;\n+}\n+\n+/* Go down the tree and merge nodes if necessary */\n+void cluster_fixup(clustertree *tree, clusternode **ln, clusternode **rn) {\n+ clusternode* local = *ln;\n+ clusternode* root = *rn;\n+ int maxstart = max(root->start, local->start);\n+ int maxend = max(local->end, root->end);\n+ int minstart = min(root->start, local->start);\n+ int minend = min(root->end, local->end);\n+\n+ if( maxstart - minend <= tree->max_dist ) {\n+ /* Have to merge this node and children */\n+ root->start = minstart;\n+ root->end = maxend;\n+ root->interval_tail->next = local->interval_head;\n+ root->interval_tail = local->interval_tail;\n+ root->num_ivals += local->num_ivals;\n+ if( local->right) cluster_fixup(tree, &(local->right), rn);\n+ if( local->left) cluster_fixup(tree,'..b'ixup(tree, &(local->left), rn);\n+ }\n+ if(local->right) {\n+ cluster_fixup(tree, &(local->right), rn);\n+ }\n+}\n+\n+/* Pyrex "getregions" implements this. 
Only used for C debugging */\n+void clustereach(clustertree *tree, clusternode *node) {\n+ interval* ival;\n+ if (node == NULL) {\n+ exit(1); /* Shouldn\'t happen */\n+ }\n+ if (node->left != NULL) {\n+ clustereach(tree, node->left);\n+ }\n+ printf("Node: %d\\t%d\\n", node->start, node->end);\n+ ival = node->interval_head;\n+ while(ival) {\n+ printf("\\tInterval %d: %d\\t%d\\n", ival->id, ival->start, ival->end);\n+ ival = ival->next;\n+ }\n+ \n+ if (node->right != NULL) {\n+ clustereach(tree, node->right);\n+ }\n+}\n+\n+void clusteritr_recursive(clustertree *tree, clusternode *node, treeitr* *itr) {\n+ treeitr *newitr;\n+\n+ if (node == NULL) {\n+ return;\n+ }\n+ if (node->right != NULL) {\n+ clusteritr_recursive(tree, node->right, itr);\n+ }\n+ if (node->num_ivals >= tree->min_intervals) {\n+ newitr = ALLOC(treeitr);\n+ newitr->next = *itr;\n+ newitr->node = node;\n+ *itr = newitr;\n+ }\n+ if (node->left != NULL) {\n+ clusteritr_recursive(tree, node->left, itr);\n+ }\n+}\n+\n+/* Create an infix iterator */\n+treeitr* clusteritr(clustertree *tree) {\n+ treeitr *itr = NULL;\n+ \n+ clusteritr_recursive(tree, tree->root, &itr);\n+ if (itr != NULL) {\n+ return itr;\n+ }\n+ return NULL;\n+}\n+\n+/* Free an infix iterator */\n+void freeclusteritr(treeitr *itr) {\n+ if (itr != NULL) {\n+ \tif (itr->next != NULL) {\n+ \t\tfreeclusteritr(itr->next);\n+ \t}\n+ \tfree(itr);\n+ }\n+}\n+\n+/* Insert based on the start position of intervals */\n+clusternode* clusternode_insert(clustertree *tree, clusternode *node, int start, int end, int id) {\n+ int oldstart;\n+ int oldend;\n+ interval* ival;\n+ \n+ // printf("Inserting %d %d %d\\n", start, end, id);\n+ if (node == NULL) {\n+ node = create_node(start, end, id);\n+ \n+ } else if ( (start - tree->max_dist) > node->end ) { /* We\'re to the right of this cluster */\n+ node->right = clusternode_insert(tree, node->right, start, end, id);\n+ if (node->priority < node->right->priority) cluster_rotateleft(&node);\n+ \n+ } else if ( 
(end + tree->max_dist) < node->start) { /* We\'re to the left of this cluster */\n+ node->left = clusternode_insert(tree, node->left, start, end, id);\n+ if (node->priority < node->left->priority) cluster_rotateright(&node);\n+ \n+ } else { /* We\'re in the range of this cluster */\n+ /* Update the start and end to match to new values */\n+ oldstart = node->start;\n+ oldend = node->end;\n+ node->start = min(start, node->start);\n+ node->end = max(end, node->end);\n+ ival = create_interval(start, end, id);\n+ ival->next = node->interval_head; /* Add this interval as the head of the interval list */\n+ node->interval_head = ival;\n+ node->num_ivals += 1;\n+ \n+ if ( oldstart > node->start && node->left != NULL ) { /* New interval added to the start, and there\'s a left child */\n+ cluster_fixup(tree, &(node->left), &node);\n+ }\n+ if ( oldend < node->end && node->right != NULL ) { /* New interval added to the end, and there\'s a right child */\n+ cluster_fixup(tree, &(node->right), &node);\n+ }\n+ }\n+ return node;\n+}\n+\n+int main() {\n+ \n+ // Simple test\n+ clustertree* tree = create_clustertree(0, 1);\n+ \n+ tree->root = clusternode_insert(tree, tree->root, 3, 4, 0);\n+ tree->root = clusternode_insert(tree, tree->root, 6, 7, 1);\n+ tree->root = clusternode_insert(tree, tree->root, 9, 10, 2);\n+ tree->root = clusternode_insert(tree, tree->root, 1, 2, 3);\n+ tree->root = clusternode_insert(tree, tree->root, 3, 8, 4);\n+ \n+ clustereach(tree, tree->root);\n+ return 0;\n+ \n+}\n' |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/bx/intervalcluster.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/bx/intervalcluster.h Thu Sep 07 17:55:18 2017 -0400 |
b |
/*
    Downloaded from:
    https://bitbucket.org/james_taylor/bx-python/wiki/Home

    Public types and entry points of the interval cluster tree implemented
    in intervalcluster.c.
*/
#ifndef INTERVALCLUSTER_H
#define INTERVALCLUSTER_H

/* One inserted interval; intervals belonging to the same cluster are
   chained through `next`. */
typedef struct struct_interval {
    int start;
    int end;
    int id;

    struct struct_interval *next;
} interval;

/* A tree node representing one cluster of intervals. */
typedef struct struct_clusternode {
    int start;      /* smallest start over the intervals merged into the node */
    int end;        /* largest end over the intervals merged into the node */
    int priority;   /* random priority compared against children to decide rotations */

    struct struct_interval *interval_head;  /* first interval of the cluster's list */
    struct struct_interval *interval_tail;  /* last interval of the cluster's list */
    int num_ivals;                          /* number of intervals in the list */

    struct struct_clusternode *left;
    struct struct_clusternode *right;
} clusternode;

/* The tree itself plus its clustering parameters. */
typedef struct {
    int max_dist;       /* max distance between intervals to be considered a cluster */
    int min_intervals;  /* minimum number of intervals for a cluster to be reported */

    clusternode *root;
} clustertree;

/* Singly linked list of nodes produced by clusteritr(). */
typedef struct struct_treeitr {
    struct struct_treeitr *next;
    struct struct_clusternode *node;
} treeitr;


/* Insert [start, end) with the given id below `node`; returns the (possibly
   new) root of that subtree, which the caller must store back. */
clusternode* clusternode_insert(clustertree *tree, clusternode *node, int start, int end, int id);
/* Allocate an empty tree with the given clustering parameters. */
clustertree* create_clustertree(int max_dist, int min_intervals);
/* Build an iterator over the nodes holding at least min_intervals intervals;
   returns NULL when there are none. Release it with freeclusteritr(). */
treeitr* clusteritr(clustertree *tree);
/* Free an iterator returned by clusteritr() (the nodes themselves are kept). */
void freeclusteritr(treeitr *itr);
/* Free the tree, all of its nodes and all of their intervals. */
void free_tree(clustertree *tree);

#endif /* INTERVALCLUSTER_H */
b |
diff -r 000000000000 -r d85dea371064 chimerascan/chimerascan_index.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/chimerascan_index.py Thu Sep 07 17:55:18 2017 -0400 |
[ |
#!/usr/bin/env python
'''
Created on Jan 5, 2011

@author: mkiyer

chimerascan: chimeric transcript discovery using RNA-seq

Copyright (C) 2011 Matthew Iyer

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
'''
import logging
import os
import shutil
import subprocess
import sys
from optparse import OptionParser

# local imports
import chimerascan.pysam as pysam
from chimerascan.lib.feature import GeneFeature
from chimerascan.lib.seq import DNA_reverse_complement
from chimerascan.lib.base import up_to_date, check_executable
from chimerascan.lib.config import JOB_ERROR, JOB_SUCCESS, ALIGN_INDEX, \
    BOWTIE_INDEX_FILE, GENE_FEATURE_FILE, GENE_REF_PREFIX

BASES_PER_LINE = 50

def split_seq(seq, chars_per_line):
    """
    Break 'seq' into newline-separated pieces of at most
    'chars_per_line' characters (the last piece may be shorter).
    """
    return '\n'.join(seq[pos:pos + chars_per_line]
                     for pos in range(0, len(seq), chars_per_line))

def genepred_to_fasta(gene_feature_file, reference_seq_file):
    """
    Generator yielding (gene_feature, fasta_record) for every gene in
    'gene_feature_file' whose exons can all be fetched from
    'reference_seq_file'.  Genes with a missing or short exon sequence
    are logged and skipped.

    Both input files are closed when the generator finishes, fails, or
    is discarded before exhaustion.
    """
    ref_fa = pysam.Fastafile(reference_seq_file)
    try:
        total = 0
        used = 0
        with open(gene_feature_file) as feature_fh:
            for g in GeneFeature.parse(feature_fh):
                total += 1
                exon_seqs = []
                error_occurred = False
                for start, end in g.exons:
                    seq = ref_fa.fetch(g.chrom, start, end)
                    if (not seq) or (len(seq) < (end - start)):
                        logging.warning("gene %s exon %s:%d-%d not found in reference" %
                                        (g.tx_name, g.chrom, start, end))
                        error_occurred = True
                        break
                    exon_seqs.append(seq)
                if error_occurred:
                    continue
                used += 1
                # make fasta record
                seq = ''.join(exon_seqs)
                if g.strand == '-':
                    seq = DNA_reverse_complement(seq)
                # break seq onto multiple lines
                seqlines = split_seq(seq, BASES_PER_LINE)
                # NOTE(review): 'start' and 'end' are the values left over
                # from the exon loop above, so 'range=' describes the last
                # exon only (and is stale/undefined for a gene without
                # exons).  Kept unchanged because the record header may be
                # relied upon elsewhere -- confirm before changing.
                fa_record = (">%s range=%s:%d-%d gene=%s strand=%s\n%s" %
                             (GENE_REF_PREFIX + g.tx_name, g.chrom, start, end,
                              g.gene_name, g.strand, seqlines))
                yield g, fa_record
        logging.info("Used %d/%d gene features" % (used,total))
    finally:
        ref_fa.close()

def create_chimerascan_index(output_dir,
                             genome_fasta_file,
                             gene_feature_file,
                             bowtie_build_bin):
    """
    Build the chimerascan index in 'output_dir': copy and index the
    genome FASTA, append the transcript sequences derived from
    'gene_feature_file', and run 'bowtie_build_bin' on the result.
    Steps whose outputs are already up to date are skipped.

    Returns JOB_SUCCESS, or JOB_ERROR when bowtie-build fails.
    """
    # create output dir if it does not exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        logging.info("Created index directory: %s" % (output_dir))
    # copy reference fasta file to output dir and index it
    index_fasta_file = os.path.join(output_dir, ALIGN_INDEX + ".fa")
    msg = "Adding reference genome to index"
    if (up_to_date(index_fasta_file, genome_fasta_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        shutil.copyfile(genome_fasta_file, index_fasta_file)
        # index the genome fasta file
        logging.info("Indexing FASTA file")
        fh = pysam.Fastafile(index_fasta_file)
        fh.close()
    # add gene sequences to index
    dst_gene_feature_file = os.path.join(output_dir, GENE_FEATURE_FILE)
    msg = "Building transcriptome sequences and gene features"
    if (up_to_date(index_fasta_file, gene_feature_file) and
        up_to_date(dst_gene_feature_file, gene_feature_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        # write sequences from gene feature file
        logging.info("Adding transcript sequences and gene features to index")
        # context managers guarantee both outputs are flushed and closed
        # even if reading the gene features fails part-way through
        with open(index_fasta_file, "a") as fasta_fh:
            with open(dst_gene_feature_file, "w") as gene_fh:
                for g, fa_record in genepred_to_fasta(gene_feature_file,
                                                      index_fasta_file):
                    gene_fh.write(str(g) + "\n")
                    fasta_fh.write(fa_record + "\n")
        # remove old fasta index
        if os.path.exists(index_fasta_file + ".fai"):
            os.remove(index_fasta_file + ".fai")
        # index the combined fasta file
        logging.info("Reindexing the FASTA file")
        fh = pysam.Fastafile(index_fasta_file)
        fh.close()
    # build bowtie index on the reference sequence file
    bowtie_index_file = os.path.join(output_dir, BOWTIE_INDEX_FILE)
    msg = "Building bowtie index"
    if up_to_date(bowtie_index_file, index_fasta_file):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        bowtie_index_name = os.path.join(output_dir, ALIGN_INDEX)
        args = [bowtie_build_bin, index_fasta_file, bowtie_index_name]
        if subprocess.call(args) != os.EX_OK:
            logging.error("bowtie-build failed to create alignment index")
            # drop the partial index so the next run does not treat it
            # as up to date
            if os.path.exists(bowtie_index_file):
                os.remove(bowtie_index_file)
            return JOB_ERROR
    logging.info("Chimerascan index created successfully")
    return JOB_SUCCESS


def main():
    """
    Command line entry point: validate the arguments, locate
    bowtie-build and exit with the status of create_chimerascan_index.
    """
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = OptionParser("usage: %prog [options] <reference_genome.fa> "
                          "<genepred_genes.txt> <index_output_dir>")
    parser.add_option("--bowtie-dir", dest="bowtie_dir", default="",
                      help="Path to the 'bowtie' software (by default, "
                      "expects the 'bowtie' and 'bowtie-build' "
                      "binaries to be in current PATH)")
    options, args = parser.parse_args()
    # check command line arguments
    if len(args) < 3:
        parser.error("Incorrect number of command line arguments")
    ref_fasta_file = args[0]
    gene_feature_file = args[1]
    output_dir = args[2]
    # check that input files exist
    if not os.path.isfile(ref_fasta_file):
        parser.error("Reference fasta file '%s' not found" % (ref_fasta_file))
    if not os.path.isfile(gene_feature_file):
        parser.error("Gene feature file '%s' not found" % (gene_feature_file))
    # check that output dir is not a regular file
    if os.path.exists(output_dir) and (not os.path.isdir(output_dir)):
        parser.error("Output directory name '%s' exists and is not a valid "
                     "directory" % (output_dir))
    # check that bowtie-build program exists
    bowtie_build_bin = os.path.join(options.bowtie_dir, "bowtie-build")
    if check_executable(bowtie_build_bin):
        logging.debug("Checking for 'bowtie-build' binary... found")
    else:
        parser.error("bowtie-build binary not found or not executable")
    # run main index creation function
    retcode = create_chimerascan_index(output_dir, ref_fasta_file,
                                       gene_feature_file, bowtie_build_bin)
    sys.exit(retcode)

if __name__ == '__main__':
    main()
b |
diff -r 000000000000 -r d85dea371064 chimerascan/lib/__init__.pyc |
b |
Binary file chimerascan/lib/__init__.pyc has changed |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/lib/base.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/lib/base.py Thu Sep 07 17:55:18 2017 -0400 |
[ |
'''
Created on Oct 26, 2010

@author: mkiyer

chimerascan: chimeric transcript discovery using RNA-seq

Copyright (C) 2011 Matthew Iyer

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
'''
import os
import subprocess
import tempfile
import operator

#
# constants used for library type
#
class LibraryTypes:
    FR_UNSTRANDED = "fr-unstranded"
    FR_FIRSTSTRAND = "fr-firststrand"
    FR_SECONDSTRAND = "fr-secondstrand"

    @staticmethod
    def choices():
        """
        returns the tuple of supported library type strings
        """
        return (LibraryTypes.FR_UNSTRANDED,
                LibraryTypes.FR_FIRSTSTRAND,
                LibraryTypes.FR_SECONDSTRAND)

    @staticmethod
    def same_strand(library_type):
        """
        returns True when the first two characters of 'library_type'
        are equal (e.g. "ff..."), False otherwise (e.g. "fr...")
        """
        return (library_type[0] == library_type[1])

def parse_lines(line_iter, numlines=1):
    """
    generator that returns list of 'numlines' lines at a time

    'line_iter' must be an iterator.  Trailing whitespace is stripped
    from every line.  A final group with fewer than 'numlines' lines
    is discarded.
    """
    while True:
        group = []
        for x in range(numlines):
            try:
                # builtin next() works on Python 2.6+ and Python 3
                line = next(line_iter)
            except StopIteration:
                # input exhausted: drop the incomplete group
                return
            group.append(line.rstrip())
        yield group

def parse_bool(s):
    """
    returns True when 's' starts with 't' or 'T', False otherwise
    (including for the empty string)
    """
    # slicing instead of indexing so that "" yields False rather
    # than raising IndexError
    return s[:1].lower() == "t"

def parse_string_none(s):
    """
    returns None when 's' is the literal string "None", else 's'
    """
    return None if s == "None" else s

def make_temp(base_dir, suffix=''):
    """
    creates an empty temporary file in 'base_dir' and returns its
    path; the caller is responsible for deleting it
    """
    fd,name = tempfile.mkstemp(suffix=suffix, prefix='tmp', dir=base_dir)
    os.close(fd)
    return name

def check_executable(filename):
    """
    returns True when 'filename' can be launched as a program (it is
    run without arguments and its output discarded), False when
    launching it raises OSError
    """
    # 'with' closes the devnull handle on the failure path as well
    with open(os.devnull, 'w') as devnullfh:
        try:
            subprocess.call([filename], stdout=devnullfh, stderr=devnullfh)
        except OSError:
            return False
    return True

def up_to_date(outfile, infile, nzsize=True):
    """
    returns True when 'outfile' exists and is at least as recent as
    'infile'.  Returns False when either file is missing, or when
    'nzsize' is set and 'outfile' is empty.
    """
    if not os.path.exists(infile):
        return False
    if not os.path.exists(outfile):
        return False
    if nzsize and (os.path.getsize(outfile) == 0):
        return False
    return os.path.getmtime(outfile) >= os.path.getmtime(infile)

# in-place XML prettyprint formatter
def indent_xml(elem, level=0):
    """
    rewrites whitespace-only 'text' and 'tail' attributes of 'elem'
    and its descendants so that the serialized tree is indented;
    text and tails containing non-whitespace are left untouched
    """
    i = "\n" + level*" "
    if len(elem):
        if not elem.text or not elem.text.strip():
            elem.text = i + " "
        if not elem.tail or not elem.tail.strip():
            elem.tail = i
        child = None
        for child in elem:
            indent_xml(child, level+1)
        # the last child is followed by the parent's closing tag, so
        # its tail goes back to the parent's indentation
        if not child.tail or not child.tail.strip():
            child.tail = i
    else:
        if level and (not elem.tail or not elem.tail.strip()):
            elem.tail = i
b |
diff -r 000000000000 -r d85dea371064 chimerascan/lib/base.pyc |
b |
Binary file chimerascan/lib/base.pyc has changed |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/lib/batch_sort.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/lib/batch_sort.py Thu Sep 07 17:55:18 2017 -0400 |
[ |
'''
Created on Jul 21, 2011

@author: mkiyer
'''

# based on Recipe 466302: Sorting big files the Python 2.4 way
# by Nicolas Lehuen
#http://code.activestate.com/recipes/576755-sorting-big-files-the-python-26-way/

import os
from tempfile import gettempdir, mkstemp
from itertools import islice, cycle
from collections import namedtuple
import heapq

Keyed = namedtuple("Keyed", ["key", "obj"])

def merge(key=None, *iterables):
    """
    Merge already-sorted 'iterables' into one sorted stream.

    When 'key' is None the elements are compared directly; otherwise
    they are ordered by key(element), ties being broken by comparing
    the elements themselves.
    """
    # based on code posted by Scott David Daniels in c.l.p.
    # http://groups.google.com/group/comp.lang.python/msg/484f01f1ea3c832d

    if key is None:
        # elements are not wrapped in Keyed here, so they must be
        # yielded as-is (reading '.obj' on them raised AttributeError)
        for element in heapq.merge(*iterables):
            yield element
    else:
        keyed_iterables = [(Keyed(key(obj), obj) for obj in iterable)
                           for iterable in iterables]
        for element in heapq.merge(*keyed_iterables):
            yield element.obj

def batch_sort(input, output, key=None, buffer_size=32000, tempdirs=None):
    """
    Sort the lines of file 'input' into file 'output' using temporary
    chunk files, so that at most 'buffer_size' lines are sorted in
    memory at a time.

    'key' is an optional function computing the sort key of a line
    (lines are read in binary mode).  'tempdirs' is an optional list
    of directories among which the chunk files are distributed in
    turn; the system temporary directory is used when it is empty or
    None.  All chunk files are removed before returning, also on error.
    """
    if not tempdirs:
        # build a new list rather than appending to the caller's
        tempdirs = [gettempdir()]

    chunks = []
    chunk_paths = []
    try:
        with open(input,'rb',64*1024) as input_file:
            input_iterator = iter(input_file)
            for tempdir in cycle(tempdirs):
                current_chunk = list(islice(input_iterator,buffer_size))
                if not current_chunk:
                    break
                current_chunk.sort(key=key)
                # mkstemp picks a unique name, so concurrent sorts
                # sharing a temp directory cannot clobber each other
                fd, chunk_path = mkstemp(prefix='batch_sort_', dir=tempdir)
                chunk_paths.append(chunk_path)
                output_chunk = os.fdopen(fd,'w+b',64*1024)
                chunks.append(output_chunk)
                output_chunk.writelines(current_chunk)
                output_chunk.flush()
                output_chunk.seek(0)
        with open(output,'wb',64*1024) as output_file:
            output_file.writelines(merge(key, *chunks))
    finally:
        # best-effort cleanup: a failure on one chunk must not prevent
        # the remaining chunks from being closed and removed
        for chunk in chunks:
            try:
                chunk.close()
            except (IOError, OSError):
                pass
        for chunk_path in chunk_paths:
            try:
                os.remove(chunk_path)
            except (IOError, OSError):
                pass
b |
diff -r 000000000000 -r d85dea371064 chimerascan/lib/batch_sort.pyc |
b |
Binary file chimerascan/lib/batch_sort.pyc has changed |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/lib/chimera.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/lib/chimera.py Thu Sep 07 17:55:18 2017 -0400 |
[ |
b'@@ -0,0 +1,382 @@\n+\'\'\'\n+Created on Jun 3, 2011\n+\n+@author: mkiyer\n+\'\'\'\n+from base import parse_string_none\n+from sam import get_clipped_interval\n+\n+DISCORDANT_TAG_NAME = "XC"\n+class DiscordantTags(object):\n+ CONCORDANT_TX = 0\n+ DISCORDANT_STRAND_TX = 1\n+ CONCORDANT_GENE = 2\n+ DISCORDANT_STRAND_GENE = 3\n+ CONCORDANT_GENOME = 4\n+ DISCORDANT_STRAND_GENOME = 5\n+ DISCORDANT_GENE = 9\n+ DISCORDANT_GENOME = 17\n+\n+ORIENTATION_TAG_NAME = "XD"\n+class OrientationTags(object):\n+ NONE = 0\n+ FIVEPRIME = 1\n+ THREEPRIME = 2\n+\n+def cmp_orientation(a,b):\n+ if (a == OrientationTags.NONE) or (b == OrientationTags.NONE):\n+ return True\n+ return (a != b)\n+\n+# constants\n+MULTIMAP_BINS = (1,2,4,8,16,32,64,128)\n+CHIMERA_SEP = "|"\n+# amount of trimming to use to stop reads from overlapping \n+# exon boundaries and going into intronic space\n+EXON_JUNCTION_TRIM_BP = 10\n+\n+# chimera types\n+class ChimeraTypes(object):\n+ INTERCHROMOSOMAL = "Interchromosomal"\n+ OVERLAP_SAME = "Overlapping_Same"\n+ OVERLAP_CONVERGE = "Overlapping_Converging"\n+ OVERLAP_DIVERGE = "Overlapping_Diverging"\n+ OVERLAP_COMPLEX = "Overlapping_Complex"\n+ READTHROUGH = "Read_Through"\n+ ADJ_CONVERGE = "Adjacent_Converging"\n+ ADJ_DIVERGE = "Adjacent_Diverging"\n+ ADJ_COMPLEX = "Adjacent_Complex"\n+ INTRACHROMOSOMAL = "Intrachromosomal"\n+ INTRA_CONVERGE = "Intrachromosomal_Converging"\n+ INTRA_DIVERGE = "Intrachromsomal_Diverging"\n+ INTRA_COMPLEX = "Intrachromosomal_Complex"\n+ UNKNOWN = "Undetermined"\n+\n+class DiscordantRead(object):\n+ """\n+ stores read alignment information needed to nominate \n+ chimeric transcripts\n+\n+ (this is a subset of what is kept in SAM file)\n+ """\n+ def __init__(self):\n+ self.qname = ""\n+ self.hit_index = -1\n+ self.readnum = -1\n+ self.seq = ""\n+ self.tid = -1\n+ self.pos = -1\n+ self.aend = -1\n+ self.clipstart = -1\n+ self.clipend = -1\n+ self.is_reverse = False\n+ self.numhits = 0\n+ self.mismatches = 0\n+ self.discordant_type = 0\n+ 
self.orientation = 0\n+ self.is_spanning = False\n+\n+ @staticmethod\n+ def from_read(r):\n+ a = DiscordantRead()\n+ a.qname = r.qname\n+ a.hit_index = r.opt(\'HI\')\n+ a.readnum = 1 if r.is_read2 else 0\n+ a.seq = r.seq\n+ a.tid = r.rname\n+ a.pos = r.pos\n+ a.aend = r.aend\n+ a.clipstart, a.clipend = get_clipped_interval(r)\n+ a.is_reverse = r.is_reverse\n+ a.numhits = r.opt(\'NH\')\n+ a.mismatches = r.opt(\'NM\')\n+ a.discordant_type = r.opt(DISCORDANT_TAG_NAME)\n+ a.orientation = r.opt(ORIENTATION_TAG_NAME)\n+ a.is_spanning = False\n+ return a\n+\n+ @staticmethod\n+ def from_list(fields):\n+ a = DiscordantRead()\n+ a.qname = fields[0]\n+ a.hit_index = int(fields[1])\n+ a.readnum = int(fields[2])\n+ a.seq = fields[3]\n+ a.tid = int(fields[4])\n+ a.pos = int(fields[5])\n+ a.aend = int(fields[6])\n+ a.clipstart = int(fields[7])\n+ a.clipend = int(fields[8])\n+ a.is_reverse = True if int(fields[9]) == 1 else False\n+ a.numhits = int(fields[10])\n+ a.mismatches = int(fields[11])\n+ a.discordant_type = int(fields[12])\n+ a.orientation = int(fields[13])\n+ a.is_spanning = True if int(fields[14]) == 1 else False\n+ return a\n+\n+ def to_list(self):\n+ return [self.qname, self.hit_index, self.readnum, self.seq, \n+ self.tid, self.pos, self.aend, self.clipstart, \n+ self.clipend, int(self.is_reverse), self.numhits, \n+ self.mismatches, self.discordant_type, \n+ self.orientation, int(self.is_spanning)]\n+\n+\n+def frags_to_encomp_string(frags):\n+ if len(frags) == 0:\n+ return "None"\n+ # encompassing read pairs\n+ encomp_frags = []\n+ for frag in frags:\n+ r5p = Chimera.FIEL'..b' dreads.append(DiscordantRead.from_list(read_fields.split(c.FIELD_DELIM)))\n+ c.encomp_frags.append(dreads)\n+ # raw spanning read information\n+ spanning_reads_field = parse_string_none(fields[20])\n+ if spanning_reads_field is not None:\n+ for read_fields in spanning_reads_field.split(c.READ_DELIM):\n+ c.spanning_reads.append(DiscordantRead.from_list(read_fields.split(c.FIELD_DELIM))) \n+ 
return c\n+\n+ @staticmethod\n+ def parse(line_iter):\n+ for line in line_iter:\n+ if line.startswith("#"):\n+ continue \n+ fields = line.strip().split(\'\\t\')\n+ yield Chimera.from_list(fields)\n+\n+ def to_list(self):\n+ # reads\n+ if len(self.spanning_reads) == 0:\n+ span_string = None\n+ else:\n+ span_string = Chimera.READ_DELIM.join(Chimera.FIELD_DELIM.join(map(str,r.to_list())) \n+ for r in self.spanning_reads)\n+ return [self.tx_name_5p, self.tx_start_5p, self.tx_end_5p,\n+ self.tx_name_3p, self.tx_start_3p, self.tx_end_3p,\n+ self.name, self.score, \n+ self.tx_strand_5p, self.tx_strand_3p,\n+ self.gene_name_5p, self.gene_name_3p,\n+ "%d-%d" % (self.exons_5p[0], self.exons_5p[1]),\n+ "%d-%d" % (self.exons_3p[0], self.exons_3p[1]),\n+ self.breakpoint_name,\n+ self.breakpoint_seq_5p,\n+ self.breakpoint_seq_3p,\n+ self.homology_left,\n+ self.homology_right,\n+ frags_to_encomp_string(self.encomp_frags),\n+ span_string]\n+\n+ def get_num_unique_positions(self):\n+ """\n+ calculates total number of unique read alignment\n+ positions supporting chimera\n+ """\n+ # find all unique alignment positions and read names\n+ encomp_pos = set()\n+ qnames = set()\n+ for pair in self.encomp_frags:\n+ if pair[0].qname not in qnames:\n+ qnames.add(pair[0].qname)\n+ encomp_pos.add((pair[0].pos, pair[1].pos))\n+ # add spanning reads\n+ spanning_pos = set()\n+ for dr in self.spanning_reads:\n+ if dr.qname not in qnames:\n+ qnames.add(dr.qname)\n+ spanning_pos.add(dr.pos)\n+ return len(encomp_pos) + len(spanning_pos)\n+\n+ def get_num_frags(self, maxnumhits=0):\n+ """\n+ number of unique fragments supporting the \n+ chimera (by read name)\n+ """\n+ qnames = set()\n+ for pair in self.encomp_frags:\n+ if (maxnumhits > 0) and (min(pair[0].numhits, pair[1].numhits) > maxnumhits):\n+ continue\n+ qnames.add(pair[0].qname)\n+ for dr in self.spanning_reads:\n+ if (maxnumhits > 0) and (dr.numhits > maxnumhits):\n+ continue\n+ qnames.add(dr.qname)\n+ return len(qnames)\n+\n+ def 
get_num_spanning_frags(self, maxnumhits=0):\n+ """\n+ number of unique spanning fragments supporting the \n+ chimera (by read name)\n+ """\n+ qnames = set()\n+ for dpair in self.encomp_frags:\n+ if (maxnumhits > 0) and (min(dpair[0].numhits, dpair[1].numhits) > maxnumhits):\n+ continue\n+ if any(dr.is_spanning for dr in dpair):\n+ qnames.add(dpair[0].qname) \n+ for dr in self.spanning_reads:\n+ if (maxnumhits > 0) and (dr.numhits > maxnumhits):\n+ continue\n+ qnames.add(dr.qname)\n+ return len(qnames) \n+\n+ def get_spanning_reads(self):\n+ for dpair in self.encomp_frags:\n+ if dpair[0].is_spanning:\n+ yield dpair[0]\n+ if dpair[1].is_spanning:\n+ yield dpair[1]\n+ for dr in self.spanning_reads:\n+ yield dr\n' |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/lib/chimera.pyc |
b |
Binary file chimerascan/lib/chimera.pyc has changed |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/lib/config.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/lib/config.py Thu Sep 07 17:55:18 2017 -0400 |
b |
'''
Created on Jan 5, 2011

@author: mkiyer

chimerascan: chimeric transcript discovery using RNA-seq

Copyright (C) 2011 Matthew Iyer

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
'''
# Shared constants: job exit codes, index file names, and the names of the
# intermediate/output files produced by the pipeline stages.

# job exit codes
JOB_SUCCESS = 0
JOB_ERROR = 1

# constants for index
ALIGN_INDEX = 'align_index'
ALIGN_INDEX_FASTA_FILE = 'align_index.fa'
BOWTIE_INDEX_FILE = 'align_index.1.ebwt'
GENE_REF_PREFIX = 'gene_'
GENE_FEATURE_FILE = "gene_features.txt"

# chimerascan subdirectories
LOG_DIR = "log"
TMP_DIR = "tmp"

# constraints for run configuration
BASE_PROCESSORS = 2
MIN_SEGMENT_LENGTH = 20
RUNCONFIG_XML_FILE = "runconfig.xml"

# output after read inspection, name conversion, and
# quality score conversion
CONVERTED_FASTQ_PREFIX = "reads"
# one file per mate: ("reads_1.fq", "reads_2.fq").  range() is used rather
# than xrange() so the module imports on both Python 2 and Python 3.
CONVERTED_FASTQ_FILES = tuple(CONVERTED_FASTQ_PREFIX + "_%d.fq" % (x + 1)
                              for x in range(2))

# output from initial bowtie alignment
ALIGNED_READS_BAM_FILE = "aligned_reads.bam"
UNALIGNED_FASTQ_PARAM = "unaligned.fq"
UNALIGNED_FASTQ_FILES = ("unaligned_1.fq", "unaligned_2.fq")
MAXMULTIMAP_FASTQ_PARAM = "maxmulti.fq"
MAXMULTIMAP_FASTQ_FILES = ("maxmulti_1.fq", "maxmulti_2.fq")

# sorted aligned reads bam file
SORTED_ALIGNED_READS_BAM_FILE = "sorted_aligned_reads.bam"

# insert size estimation parameters
ISIZE_MIN_SAMPLES = 100
# NOTE: this is a float (1e6), not an int
ISIZE_MAX_SAMPLES = 1e6
ISIZE_DIST_FILE = "isize_dist.txt"

# output from realignment of trimmed reads
REALIGNED_BAM_FILE = "realigned_reads.bam"

# output for different classes of discordant reads
GENE_PAIRED_BAM_FILE = "gene_paired_reads.bam"
GENOME_PAIRED_BAM_FILE = "genome_paired_reads.bam"
REALIGNED_UNMAPPED_BAM_FILE = "unmapped_reads.bam"
REALIGNED_COMPLEX_BAM_FILE = "complex_reads.bam"

# discordant reads BEDPE file
DISCORDANT_BEDPE_FILE = "discordant_reads.bedpe"
SORTED_DISCORDANT_BEDPE_FILE = "discordant_reads.srt.bedpe"

# chimera candidates with encompassing read support
ENCOMPASSING_CHIMERA_FILE = "encompassing_chimeras.txt"
FILTERED_ENCOMPASSING_CHIMERA_FILE = "encompassing_chimeras.filtered.txt"

# amount of trimming to use to stop reads from overlapping
# exon boundaries and going into intronic space
EXON_JUNCTION_TRIM_BP = 10

# number of homology mismatches in breakpoint sequences
# to tolerate when computing homology distance
BREAKPOINT_HOMOLOGY_MISMATCHES = 2
BREAKPOINT_CHIMERA_FILE = "encompassing_chimeras.breakpoint_sorted.txt"
BREAKPOINT_MAP_FILE = "breakpoints.txt"
BREAKPOINT_FASTA_FILE = "breakpoints.fa"
BREAKPOINT_BOWTIE_INDEX = "breakpoints"
BREAKPOINT_BOWTIE_INDEX_FILE = "breakpoints.1.ebwt"

# reads to remap to breakpoint junction index
ENCOMP_SPANNING_FASTQ_FILE = "encomp_spanning_reads.fq"
SINGLE_MAPPED_BAM_FILE = "singlemap_reads.srt.bam"
SINGLEMAP_SPANNING_FASTQ_FILE = "singlemap_spanning_reads.fq"
UNALIGNED_SPANNING_FASTQ_FILE = "unaligned_spanning_reads.fq"

# results of aligning reads to breakpoint index
ENCOMP_SPANNING_BAM_FILE = "encomp_spanning_reads.bam"
SORTED_ENCOMP_SPANNING_BAM_FILE = "encomp_spanning_reads.srt.bam"
SINGLEMAP_SPANNING_BAM_FILE = "singlemap_spanning_reads.bam"
SORTED_SINGLEMAP_SPANNING_BAM_FILE = "singlemap_spanning_reads.srt.bam"
UNALIGNED_SPANNING_BAM_FILE = "unaligned_spanning_reads.bam"
SORTED_UNALIGNED_SPANNING_BAM_FILE = "unaligned_spanning_reads.srt.bam"

# results of merging spanning reads into chimera nominations
SPANNING_CHIMERA_FILE = "spanning_chimeras.txt"
# results of resolving ambiguous reads
RESOLVED_SPANNING_CHIMERA_FILE = "spanning_chimeras.resolved.txt"
# results of filtering chimeras
FILTERED_CHIMERA_FILE = "spanning_chimeras.resolved.filtered.txt"
HOMOLOG_FILTERED_CHIMERA_FILE = "spanning_chimeras.resolved.filtered.homolog.txt"
BEST_FILTERED_CHIMERA_FILE = "spanning_chimeras.resolved.filtered.homolog.best_isoform.txt"
# output file
CHIMERA_OUTPUT_FILE = "chimeras.bedpe"
b |
diff -r 000000000000 -r d85dea371064 chimerascan/lib/config.pyc |
b |
Binary file chimerascan/lib/config.pyc has changed |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/lib/fastq_to_bam.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/lib/fastq_to_bam.py Thu Sep 07 17:55:18 2017 -0400 |
[ |
'''
Created on Apr 28, 2011

@author: mkiyer
'''
from chimerascan import pysam
from math import log10
from string import maketrans

def get_solexa_qual_conversion_table():
    """
    return a translation table that can be used by str.translate() for
    converting solexa to sanger quality scores

    Solexa scores -5..39 (ASCII 59..103) are converted with
    phred = 10*log10(1 + 10**(solexa/10)); characters below that range
    map to '!' and characters above it map to 'I'.
    """
    offset = 64
    conv_table = ['!'] * 256
    conv_table[offset:] = "I" * (256 - offset)
    for solq in range(-5, 40):
        phredq = 10 * log10(1 + 10 ** (solq / 10.0))
        phredchr = chr(int(round(33 + phredq)))
        conv_table[offset + solq] = phredchr
    conv_string = ''.join(conv_table)
    return maketrans(''.join(map(chr, range(256))), conv_string)

def get_illumina_qual_conversion_table():
    """Illumina 1.3+ format can encode a Phred quality score from 0 to 62
    using ASCII 64 to 126 (although in raw read data Phred scores from 0
    to 40 only are expected).

    The returned str.translate() table shifts the offset from 64 to 33,
    maps characters below ASCII 64 to '!' and caps everything at or
    above Phred 40 to 'I'.
    """
    offset = 64
    conv_table = ['!'] * 256
    for x in range(0, 62):
        conv_table[offset + x] = chr(33 + x)
    # cap at phred 40
    conv_table[offset + 40:] = "I" * (256 - (offset + 40))
    conv_string = ''.join(conv_table)
    return maketrans(''.join(map(chr, range(256))), conv_string)

def get_sanger_qual_conversion_table():
    """
    return a str.translate() table that leaves sanger qualities in the
    range '!'..'H' untouched, maps characters below ASCII 33 to '!' and
    caps characters at or above ASCII 73 to 'I'
    """
    offset = 33
    tbl = [chr(x) for x in range(256)]
    tbl[:offset] = "!" * offset
    tbl[offset + 40:] = "I" * (256 - (offset + 40))
    return maketrans(''.join(map(chr, range(256))), ''.join(tbl))

# translation tables keyed by the quality format name
conv_tables = {"sanger": get_sanger_qual_conversion_table(),
               "illumina": get_illumina_qual_conversion_table(),
               "solexa": get_solexa_qual_conversion_table()}

def parse_fastq(line_iter):
    """
    generator yielding (read id, sequence, quality) tuples from an open
    4-line-per-record FASTQ file

    'line_iter' must be a context manager (e.g. a file object); it is
    closed when the generator finishes.  A truncated trailing record is
    dropped silently.
    """
    with line_iter:
        while True:
            # end the generator explicitly instead of letting
            # StopIteration escape from inside the generator body
            try:
                rid = next(line_iter).rstrip()[1:]
                seq = next(line_iter).rstrip()
                next(line_iter)
                qual = next(line_iter).rstrip()
            except StopIteration:
                return
            yield rid, seq, qual

def fastq_to_bam(fastq_files, qual_format, bam_file):
    """
    write the reads of one or two FASTQ files to an unaligned BAM file,
    interleaving mates and converting qualities from 'qual_format'
    ("sanger", "illumina" or "solexa") with the matching table in
    'conv_tables'

    Conversion stops as soon as any input file runs out of records.
    Raises KeyError for an unknown 'qual_format'.
    """
    qual_trans_table = conv_tables[qual_format]
    fastq_fhs = [open(f) for f in fastq_files]
    fqiters = [parse_fastq(fh) for fh in fastq_fhs]
    header = {'HD': {'VN': '1.0', 'SO': 'unknown'}}
#              'SQ': [{'LN': 1, 'SN': 'dummy'}]}
    bamfh = None
    try:
        bamfh = pysam.Samfile(bam_file, "wb", header=header)
        # 'while fqiters' (rather than 'while True') so that an empty
        # file list produces an empty BAM instead of looping forever
        while fqiters:
            for i, fqiter in enumerate(fqiters):
                qname, seq, qual = next(fqiter)
                a = pysam.AlignedRead()
                a.rname = -1
                a.mrnm = -1
                #a.pos = 0
                #a.mpos = 0
                a.qname = qname
                a.seq = seq
                a.qual = qual.translate(qual_trans_table)
                a.is_read1 = (i == 0)
                a.is_read2 = (i == 1)
                bamfh.write(a)
    except StopIteration:
        # one of the FASTQ files is exhausted
        pass
    finally:
        # release every handle even when an error interrupts conversion
        if bamfh is not None:
            bamfh.close()
        for fh in fastq_fhs:
            fh.close()

def bam_to_fastq(bam_file, fastq_files):
    """
    write the reads of 'bam_file' back to FASTQ, read1 records to
    fastq_files[0] and read2 records to fastq_files[1]

    Reads carrying neither mate flag are written to the first file; this
    mirrors fix_alignment_ordering, which treats every non-read2
    alignment as mate 0.
    """
    fqfhs = [open(f, "w") for f in fastq_files]
    try:
        bamfh = pysam.Samfile(bam_file, "rb")
        try:
            for r in bamfh:
                # choose the file from this read's own flags; never
                # reuse the index left over from a previous read
                if r.is_read2 and not r.is_read1:
                    i = 1
                else:
                    i = 0
                fqfhs[i].write("@%s\n%s\n+\n%s\n" % (r.qname, r.seq, r.qual))
        finally:
            bamfh.close()
    finally:
        for fh in fqfhs:
            fh.close()

if __name__ == '__main__':
    sol2std = get_solexa_qual_conversion_table()
    illumina2std = get_illumina_qual_conversion_table()
    import sys
    fastq_to_bam(["read1.fq", "read2.fq"], "solexa", "hi.bam")
    bam_to_fastq("hi.bam", ["a1.fq", "a2.fq"])
b |
diff -r 000000000000 -r d85dea371064 chimerascan/lib/feature.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/lib/feature.py Thu Sep 07 17:55:18 2017 -0400 |
[ |
'''
Created on Dec 18, 2010

@author: mkiyer

chimerascan: chimeric transcript discovery using RNA-seq

Copyright (C) 2011 Matthew Iyer

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
'''
import logging
import itertools

class GeneFeature(object):
    """
    one transcript parsed from an 11-column tab-delimited gene table:
    tx_name, chrom, strand, tx_start, tx_end, cds_start, cds_end,
    exon_count, exon starts, exon ends, gene_name

    'exons' is a list of (start, end) tuples in file order.
    """
    __slots__ = ('chrom', 'tx_start', 'tx_end', 'tx_name', 'gene_name',
                 'strand', 'cds_start', 'cds_end', 'exon_count', 'exons')

    def __str__(self):
        # serialize back to the 11-column format read by from_string();
        # the exon lists keep their trailing comma
        fields = [self.tx_name,
                  self.chrom,
                  self.strand,
                  str(self.tx_start),
                  str(self.tx_end),
                  str(self.cds_start),
                  str(self.cds_end),
                  str(self.exon_count),
                  ','.join(map(str, [e[0] for e in self.exons])) + ',',
                  ','.join(map(str, [e[1] for e in self.exons])) + ',',
                  self.gene_name]
        return '\t'.join(fields)

    @staticmethod
    def from_string(line):
        """
        parse one line of the gene table

        returns a GeneFeature, or None when 'line' is None, blank, a
        '#' comment or a 'track' header.  Raises IndexError/ValueError
        on malformed records.
        """
        if line is None:
            return None
        line = line.strip()
        if not line:
            return None
        if line.startswith('#'):
            logging.debug("skipping comment line: %s" % (line))
            return None
        if line.startswith('track'):
            logging.debug("skipping track header line: %s" % (line))
            return None
        fields = line.split('\t')
        # all eleven fields are required
        g = GeneFeature()
        g.tx_name = fields[0]
        g.chrom = fields[1]
        g.strand = fields[2]
        g.tx_start = int(fields[3])
        g.tx_end = int(fields[4])
        g.cds_start = int(fields[5])
        g.cds_end = int(fields[6])
        g.exon_count = int(fields[7])
        # both lists end with a trailing comma, hence the [:-1]
        exon_starts = [int(x) for x in fields[8].split(',')[:-1]]
        exon_ends = [int(x) for x in fields[9].split(',')[:-1]]
        g.exons = list(zip(exon_starts, exon_ends))
        g.gene_name = fields[10]
        return g

    @staticmethod
    def parse(line_iter):
        """
        generator yielding a GeneFeature for every record in 'line_iter',
        skipping blank lines, '#' comments and 'track' headers
        """
        for line in line_iter:
            # strip before testing so that indented comment lines are
            # skipped here instead of being yielded as None
            stripped = line.strip() if line else ''
            if not stripped:
                continue
            if stripped.startswith(('#', 'track')):
                continue
            yield GeneFeature.from_string(stripped)

    def get_exon_interval(self, pos):
        """
        returns a tuple containing the exon number and start/end
        coordinates relative to the transcript

        Exons are walked in transcript order (reversed for '-' strand
        transcripts) and the first exon whose cumulative end is >= pos
        is returned.
        """
        exon_iter = reversed(self.exons) if self.strand == '-' else iter(self.exons)
        exon_pos = 0
        exon_num = 0
        for exon_start, exon_end in exon_iter:
            exon_size = exon_end - exon_start
            if exon_pos + exon_size >= pos:
                break
            exon_pos += exon_size
            exon_num += 1
        # NOTE(review): when 'pos' lies beyond the transcript the loop ends
        # without a break, so exon_num == len(self.exons) and the returned
        # interval starts at the transcript end; the warning below only fires
        # when pos also exceeds that phantom interval.  Behavior is left
        # untouched because callers may depend on it - confirm intent.
        if exon_pos + exon_size < pos:
            logging.warning("exon_pos %d + exon_size %d < pos %d - clipping to "
                            "end of gene" % (exon_pos, exon_size, pos))
        return exon_num, exon_pos, exon_pos + exon_size


class BEDFeature(object):
    """
    one record of a BED file

    'exons' holds absolute (start, end) tuples; 'block_starts' and
    'block_sizes' hold the BED12 block columns; 'attr_fields' holds any
    columns after the twelfth.
    """
    __slots__ = ('chrom', 'tx_start', 'tx_end', 'name', 'score', 'strand',
                 'cds_start', 'cds_end', 'exon_count', 'block_starts',
                 'block_sizes', 'exons', 'attr_fields')

    def __str__(self):
        # serialize as 12 BED columns; the itemRgb column is always '0'
        fields = [self.chrom,
                  str(self.tx_start),
                  str(self.tx_end),
                  self.name,
                  str(self.score),
                  self.strand,
                  str(self.cds_start),
                  str(self.cds_end),
                  '0',
                  str(self.exon_count),
                  ','.join(map(str, self.block_sizes)) + ',',
                  ','.join(map(str, self.block_starts)) + ',']
        return '\t'.join(fields)

    @staticmethod
    def from_string(line):
        """
        parse one BED line

        returns a BEDFeature, or None when 'line' is None, blank, a '#'
        comment or a 'track' header.  The first four columns are
        required; missing optional columns get defaults (score 0,
        strand '.', cds equal to the transcript bounds, a single block
        covering the whole feature).  Every attribute is always set, so
        str() works on short records too.
        """
        if line is None:
            return None
        line = line.strip()
        if not line:
            return None
        if line.startswith('#'):
            logging.debug("skipping comment line: %s" % (line))
            return None
        if line.startswith('track'):
            logging.debug("skipping track header line: %s" % (line))
            return None
        fields = line.split('\t')
        nfields = len(fields)
        # first four fields are required
        g = BEDFeature()
        g.chrom = fields[0]
        g.tx_start = int(fields[1])
        g.tx_end = int(fields[2])
        g.name = fields[3]
        # score and strand are tested separately so a 5-column line
        # (score without strand) no longer raises IndexError
        g.score = fields[4] if nfields > 4 else 0
        g.strand = fields[5] if nfields > 5 else '.'
        if nfields >= 8:
            g.cds_start = int(fields[6])
            g.cds_end = int(fields[7])
        else:
            g.cds_start = g.tx_start
            g.cds_end = g.tx_end
        if nfields >= 12:
            g.exon_count = int(fields[9])
            # both lists end with a trailing comma, hence the [:-1]
            g.block_sizes = [int(x) for x in fields[10].split(',')[:-1]]
            g.block_starts = [int(x) for x in fields[11].split(',')[:-1]]
            g.exons = [(g.tx_start + start, g.tx_start + start + size)
                       for start, size in zip(g.block_starts, g.block_sizes)]
        else:
            # no block columns: treat the feature as one exon
            g.exon_count = 1
            g.block_sizes = [g.tx_end - g.tx_start]
            g.block_starts = [0]
            g.exons = [(g.tx_start, g.tx_end)]
        # empty list when there are no extra columns
        g.attr_fields = fields[12:]
        return g

    @staticmethod
    def parse(line_iter):
        """
        generator yielding a BEDFeature for every record in 'line_iter',
        skipping blank lines, '#' comments and 'track' headers
        """
        for line in line_iter:
            # strip before testing so that indented comment lines are
            # skipped here instead of being yielded as None
            stripped = line.strip() if line else ''
            if not stripped:
                continue
            if stripped.startswith(('#', 'track')):
                continue
            yield BEDFeature.from_string(stripped)
b |
diff -r 000000000000 -r d85dea371064 chimerascan/lib/feature.pyc |
b |
Binary file chimerascan/lib/feature.pyc has changed |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/lib/fix_alignment_ordering.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/lib/fix_alignment_ordering.py Thu Sep 07 17:55:18 2017 -0400 |
[ |
'''
Created on Jan 23, 2011

@author: mkiyer

chimerascan: chimeric transcript discovery using RNA-seq

Copyright (C) 2011 Matthew Iyer

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
'''
import re
import collections

# one buffered FASTQ record plus the alignments collected for it so far
ReorderBufferItem = collections.namedtuple('ReorderBufferItem', ("fqrec", "reads"))

def fix_alignment_ordering(samfh, fqiters,
                           pe_sr_mode=False,
                           maxlen=100000):
    """
    generator that regroups the alignments in 'samfh' so that they come
    out in the order of the FASTQ records produced by 'fqiters'

    Each yielded value is a tuple with one ReorderBufferItem per FASTQ
    iterator; an item's 'reads' list holds the alignments for that mate.
    FASTQ records are consumed lazily until the qname of the current
    alignment shows up, and the oldest buffered entries are flushed
    whenever a new qname arrives while 'maxlen' entries are buffered.

    In PE-SR mode the reads were paired in sequencing but aligned
    separately: the trailing /1 or /2 of the qname is stripped and used
    to set the pairing flags on the read.
    """
    mate_suffix_re = re.compile(r'/(\d)$')
    # qnames in FASTQ order, and the buffered entry for each of them
    pending = collections.deque()
    entries = {}

    def new_entry(records):
        # one empty bucket per mate
        return tuple(ReorderBufferItem(rec, []) for rec in records)

    for aln in samfh:
        if not pe_sr_mode:
            # the 'is_read2' flag of the SAM read can be trusted
            mate = 1 if aln.is_read2 else 0
        else:
            # derive the mate number (1 or 2) from the qname suffix
            base_name, digit = mate_suffix_re.split(aln.qname)[0:2]
            mate = int(digit) - 1
            aln.is_paired = True
            aln.qname = base_name
            if mate == 0:
                aln.is_read1 = True
            elif mate == 1:
                aln.is_read2 = True
            else:
                assert False
        name = aln.qname
        if name not in entries:
            # make room: emit and forget the oldest buffered qnames
            while len(pending) >= maxlen:
                oldest = pending.popleft()
                yield entries[oldest]
                del entries[oldest]
            # pull FASTQ records until the one for this alignment appears
            found = False
            while not found:
                records = [next(it) for it in fqiters]
                fq_name = records[0].qname
                pending.append(fq_name)
                entries[fq_name] = new_entry(records)
                found = (fq_name == name)
        entries[name][mate].reads.append(aln)
    # flush whatever is still buffered
    while pending:
        yield entries[pending.popleft()]


def fix_sr_alignment_ordering(samfh, fqiter,
                              maxlen=100000):
    """
    single-read variant of fix_alignment_ordering

    Every alignment qname must end in /1 or /2; the suffix is stripped
    and used to set the pairing flags.  Entries are keyed by
    (qname, mate index) and matched against (fqrec.qname,
    fqrec.readnum - 1) of the records produced by 'fqiter'.  Each
    yielded value is a one-element list holding a ReorderBufferItem.
    """
    mate_suffix_re = re.compile(r'/(\d)$')
    # keys in FASTQ order, and the buffered entry for each of them
    pending = collections.deque()
    entries = {}
    for aln in samfh:
        # derive the mate number (1 or 2) from the qname suffix
        base_name, digit = mate_suffix_re.split(aln.qname)[0:2]
        mate = int(digit) - 1
        aln.is_paired = True
        aln.qname = base_name
        if mate == 0:
            aln.is_read1 = True
        elif mate == 1:
            aln.is_read2 = True
        else:
            assert False
        key = (base_name, mate)
        if key not in entries:
            # make room: emit and forget the oldest buffered keys
            while len(pending) >= maxlen:
                oldest = pending.popleft()
                yield entries[oldest]
                del entries[oldest]
            # pull FASTQ records until the one for this alignment appears
            found = False
            while not found:
                rec = next(fqiter)
                fq_key = (rec.qname, rec.readnum - 1)
                pending.append(fq_key)
                entries[fq_key] = [ReorderBufferItem(rec, [])]
                found = (fq_key == key)
        entries[key][0].reads.append(aln)
    # flush whatever is still buffered
    while pending:
        yield entries[pending.popleft()]
b |
diff -r 000000000000 -r d85dea371064 chimerascan/lib/fragment_size_distribution.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/lib/fragment_size_distribution.py Thu Sep 07 17:55:18 2017 -0400 |
[ |
b'@@ -0,0 +1,286 @@\n+\'\'\'\n+Created on Apr 29, 2011\n+\n+@author: mkiyer\n+\n+chimerascan: chimeric transcript discovery using RNA-seq\n+\n+Copyright (C) 2011 Matthew Iyer\n+\n+This program is free software: you can redistribute it and/or modify\n+it under the terms of the GNU General Public License as published by\n+the Free Software Foundation, either version 3 of the License, or\n+(at your option) any later version.\n+\n+This program is distributed in the hope that it will be useful,\n+but WITHOUT ANY WARRANTY; without even the implied warranty of\n+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n+GNU General Public License for more details.\n+\n+You should have received a copy of the GNU General Public License\n+along with this program. If not, see <http://www.gnu.org/licenses/>.\n+\'\'\'\n+import collections\n+import array\n+import logging\n+import random\n+\n+from chimerascan.bx.intersection import Interval, IntervalTree\n+\n+# local imports\n+from sam import parse_pe_reads, CIGAR_N, CIGAR_S, CIGAR_H, CIGAR_P\n+from feature import GeneFeature\n+\n+# SAM CIGAR flags that indicate skipping, padding, or clipping\n+SKIP_CIGAR_FLAGS = set((CIGAR_N, CIGAR_S, CIGAR_H, CIGAR_P)) \n+\n+def build_exon_trees(genes):\n+ trees = collections.defaultdict(lambda: IntervalTree())\n+ for g in genes: \n+ for e in g.exons:\n+ start, end = e\n+ trees[g.chrom].insert_interval(Interval(start, end, strand=g.strand))\n+ return trees\n+\n+def find_unambiguous_exon_intervals(genes):\n+ """\n+ returns (chrom, start, end, strand) tuples for exon\n+ intervals that are unique and have no overlapping\n+ transcripts or exons. 
\n+ """\n+ trees = build_exon_trees(genes) \n+ for g in genes:\n+ for start,end in g.exons:\n+ hits = set((hit.start, hit.end, hit.strand) \n+ for hit in trees[g.chrom].find(start, end))\n+ hits.add((start, end, g.strand))\n+ if len(hits) == 1:\n+ yield g.chrom, start, end, g.strand\n+\n+def sample_fragment_sizes(bamfh, genes, min_isize, max_isize):\n+ """\n+ sample fragment size distribution at genes with exons\n+ larger than the maximum insert size\n+ """\n+ # find all exons that are larger than the maximum estimated fragment size\n+ exons = set(coord for coord in find_unambiguous_exon_intervals(genes)\n+ if (coord[2] - coord[1]) >= max_isize)\n+ logging.info("Found %d exons larger than %d" % (len(exons), max_isize))\n+ refs = set(bamfh.references)\n+ # stats\n+ num_reads = 0\n+ unmapped = 0\n+ ambiguous = 0\n+ spliced = 0\n+ outside_range = 0\n+ count = 0\n+ # fetch reads from BAM file at large exons\n+ for chrom,start,end,strand in exons:\n+ if chrom not in refs:\n+ logging.warning("Skipping exon from reference %s not in BAM" % (chrom))\n+ continue \n+ qname_dict = collections.defaultdict(lambda: [])\n+ for r in bamfh.fetch(chrom, start, end):\n+ num_reads += 1\n+ # ignore unmapped reads, qc fail reads, or unpaired reads\n+ if r.is_unmapped or r.is_qcfail or (not r.is_proper_pair):\n+ unmapped += 1\n+ continue\n+ # ignore multi-mapping reads\n+ if r.opt(\'NH\') > 1:\n+ ambiguous += 1\n+ continue\n+ # ignore spliced reads\n+ has_skip = any(x[0] in SKIP_CIGAR_FLAGS for x in r.cigar)\n+ if has_skip:\n+ spliced += 1\n+ continue \n+ # group paired-end reads by read name\n+ qname_dict[r.qname].append(abs(r.isize))\n+ # keep paired reads with both mates in region\n+ for isizes in qname_dict.itervalues():\n+ isizes = set(abs(x) for x in isizes)\n+ assert len(isizes) == 1\n+ isize = isizes.pop()\n+ if (min_isize <= isize <= max_isize):\n+ count += 1\n+ yield isize\n+ else:\n+ '..b'h, \'\\t\'.join([str(i + self.min_isize), str(x)]) \n+\n+ @staticmethod\n+ def 
from_file(fileh):\n+ isizes = []\n+ counts = []\n+ for line in fileh:\n+ if line.startswith("#"):\n+ continue\n+ fields = line.strip().split(\'\\t\')\n+ i,x = map(int, fields[0:2])\n+ isizes.append(i)\n+ counts.append(x)\n+ d = InsertSizeDistribution()\n+ d.min_isize = isizes[0]\n+ d.max_isize = isizes[-1]\n+ d.arr = array.array(\'L\', counts) \n+ return d\n+\n+ @staticmethod\n+ def from_random(mean, stdev, min_isize, max_isize, samples=100000):\n+ """\n+ initialize from a random sample using normal distribution with \n+ mean \'mean\' and stdev \'stdev\'\n+ """\n+ d = InsertSizeDistribution()\n+ # implement simple checks\n+ assert min_isize < mean < max_isize\n+ assert stdev < (max_isize - min_isize)\n+ # initialize\n+ d.min_isize = min_isize\n+ d.max_isize = max_isize\n+ d.arr = array.array(\'L\', (0 for x in xrange(min_isize, max_isize+1)))\n+ count = 0\n+ outside_range = 0\n+ while True:\n+ if count > samples:\n+ break\n+ isize = int(round(random.normalvariate(mean, stdev),0))\n+ if (min_isize <= isize <= max_isize):\n+ # store in array\n+ d.arr[isize - min_isize] += 1\n+ count += 1\n+ else:\n+ outside_range += 1\n+ return d\n+\n+ @staticmethod\n+ def from_bam(bamfh, min_isize, max_isize, max_samples=None):\n+ # initialize\n+ d = InsertSizeDistribution()\n+ d.min_isize = min_isize\n+ d.max_isize = max_isize\n+ d.arr = array.array(\'L\', (0 for x in xrange(min_isize, max_isize+1))) \n+ frags = 0 \n+ count = 0\n+ outside_range = 0\n+ unmapped = 0\n+ isoforms = 0\n+ for pe_reads in parse_pe_reads(bamfh):\n+ frags += 1\n+ if (max_samples is not None) and (count > max_samples):\n+ break\n+ # only allow mappings where there is a single\n+ # insert size (multiple isoforms are ambiguous)\n+ isizes = set() \n+ for r in pe_reads[0]:\n+ if r.is_unmapped:\n+ continue\n+ # get insert size\n+ isize = r.isize\n+ if isize < 0: isize = -isize\n+ isizes.add(isize)\n+ # insert size must be within range\n+ if len(isizes) == 0:\n+ unmapped += 1\n+ elif len(isizes) > 1:\n+ isoforms 
+= 1\n+ else:\n+ isize = isizes.pop()\n+ if (min_isize <= isize <= max_isize):\n+ # store in array\n+ d.arr[isize - min_isize] += 1\n+ count += 1\n+ else:\n+ outside_range += 1\n+ logging.debug("Processed fragments: %d" % (frags))\n+ logging.debug("Unique paired frags: %d" % (count))\n+ logging.debug("Unmapped: %d" % (unmapped))\n+ logging.debug("Ambiguous (isoforms): %d" % (isoforms))\n+ logging.debug("Outside range: %d" % (outside_range))\n+ return d\n+ \n+ @staticmethod\n+ def from_genome_bam(bamfh, genes, min_isize, max_isize, max_samples=None):\n+ # initialize\n+ d = InsertSizeDistribution()\n+ d.min_isize = min_isize\n+ d.max_isize = max_isize\n+ d.arr = array.array(\'L\', (0 for x in xrange(min_isize, max_isize+1)))\n+ count = 0\n+ for isize in sample_fragment_sizes(bamfh, genes, min_isize, max_isize):\n+ if (min_isize <= isize <= max_isize):\n+ # store in array\n+ d.arr[isize - min_isize] += 1\n+ count += 1\n+ if (max_samples is not None) and (count > max_samples):\n+ break\n+ return d\n' |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/lib/fragment_size_distribution.pyc |
b |
Binary file chimerascan/lib/fragment_size_distribution.pyc has changed |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/lib/gene_to_genome.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/lib/gene_to_genome.py Thu Sep 07 17:55:18 2017 -0400 |
[ |
'''
Created on Jan 31, 2011

@author: mkiyer

chimerascan: chimeric transcript discovery using RNA-seq

Copyright (C) 2011 Matthew Iyer

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
'''
import logging
import collections

from chimerascan.bx.cluster import ClusterTree
from chimerascan.bx.intersection import Interval, IntervalTree
# local imports
from feature import GeneFeature

def build_tid_gene_map(bamfh, genefile, rname_prefix=None):
    """
    returns a dict mapping BAM reference id (tid) to the GeneFeature
    whose name (rname_prefix + tx_name) equals that reference's name

    Transcripts in 'genefile' that are not references of 'bamfh' are
    ignored.
    """
    rname_tid_map = dict((rname, tid) for tid, rname in enumerate(bamfh.references))
    rname_prefix = '' if rname_prefix is None else rname_prefix
    tid_tx_map = {}
    # build gene and genome data structures for fast lookup
    with open(genefile) as fileh:
        for g in GeneFeature.parse(fileh):
            # only use genes that are references in the sam file
            rname = rname_prefix + g.tx_name
            if rname not in rname_tid_map:
                continue
            tid = rname_tid_map[rname]
            tid_tx_map[tid] = g
    return tid_tx_map

def build_tx_name_gene_map(genefile, rname_prefix=None):
    """
    returns a dict mapping rname_prefix + tx_name to the GeneFeature
    for every transcript in 'genefile'
    """
    rname_prefix = '' if rname_prefix is None else rname_prefix
    tx_map = {}
    # build gene and genome data structures for fast lookup
    with open(genefile) as fileh:
        for g in GeneFeature.parse(fileh):
            tx_map[rname_prefix + g.tx_name] = g
    return tx_map

def build_genome_tx_trees(genefile):
    """
    returns a defaultdict mapping chromosome name to an IntervalTree
    holding one Interval (tx_start, tx_end, strand, value=GeneFeature)
    per transcript in 'genefile'
    """
    genome_tx_trees = collections.defaultdict(lambda: IntervalTree())
    # build gene and genome data structures for fast lookup
    with open(genefile) as fileh:
        for g in GeneFeature.parse(fileh):
            # add gene to interval tree
            interval = Interval(g.tx_start, g.tx_end, strand=g.strand, value=g)
            genome_tx_trees[g.chrom].insert_interval(interval)
    return genome_tx_trees

def build_transcript_cluster_map(line_iter, rname_prefix=None):
    """
    returns a dict mapping rname_prefix + tx_name to an integer cluster
    id; transcripts grouped together by the per-chromosome, per-strand
    exon ClusterTree receive the same id

    Cluster ids are not guaranteed to be contiguous.
    """
    # setup cluster trees
    chrom_strand_cluster_trees = \
        collections.defaultdict(lambda: {"+": ClusterTree(0, 1),
                                         "-": ClusterTree(0, 1)})
    transcripts = []
    index_cluster_map = {}
    for transcript in GeneFeature.parse(line_iter):
        # insert exons into cluster tree
        cluster_tree = chrom_strand_cluster_trees[transcript.chrom][transcript.strand]
        i = len(transcripts)
        for start, end in transcript.exons:
            cluster_tree.insert(start, end, i)
        # each transcript is initially in a cluster by itself
        index_cluster_map[i] = set([i])
        transcripts.append(transcript)
    # extract gene clusters
    for strand_cluster_trees in chrom_strand_cluster_trees.values():
        for cluster_tree in strand_cluster_trees.values():
            for start, end, indexes in cluster_tree.getregions():
                # make new cluster by aggregating all existing
                # clusters with new indexes
                newclust = set(indexes)
                for i in indexes:
                    newclust.update(index_cluster_map[i])
                # map every transcript to the new cluster
                for i in newclust:
                    index_cluster_map[i] = newclust
    # enumerate all clusters
    rname_prefix = '' if rname_prefix is None else rname_prefix
    transcript_cluster_map = {}
    # a merged cluster is referenced once per member, so it is visited
    # several times here; the last id assigned wins for all its members
    for cluster_id, clust in enumerate(index_cluster_map.values()):
        for i in clust:
            transcript = transcripts[i]
            transcript_cluster_map[rname_prefix + transcript.tx_name] = cluster_id
    return transcript_cluster_map

def build_transcript_tid_cluster_map(bamfh, line_iter, rname_prefix=None):
    """
    same as build_transcript_cluster_map() but keyed by BAM reference
    id (tid); transcripts that are not references of 'bamfh' are
    dropped
    """
    # make the standard cluster map
    transcript_cluster_map = build_transcript_cluster_map(line_iter, rname_prefix)
    # map reference name to tid
    transcript_tid_map = {}
    rname_prefix = '' if rname_prefix is None else rname_prefix
    for tid, rname in enumerate(bamfh.references):
        if rname.startswith(rname_prefix):
            transcript_tid_map[rname] = tid
    # remake the cluster map
    tid_cluster_map = {}
    for rname, cluster_id in transcript_cluster_map.items():
        if rname not in transcript_tid_map:
            continue
        tid = transcript_tid_map[rname]
        tid_cluster_map[tid] = cluster_id
    return tid_cluster_map

def build_transcript_genome_map(line_iter, rname_prefix=None):
    """
    returns a dict mapping rname_prefix + tx_name to a
    (chrom, strand, exons) tuple, where strand is 1 for '-' and 0
    otherwise and exons is the list of (start, end) tuples in
    transcript order (reversed for '-' strand transcripts)

    A duplicate transcript name is logged as an error and the later
    record replaces the earlier one.
    """
    # create arrays to map genes in bed file to genome
    rname_prefix = '' if rname_prefix is None else rname_prefix
    transcript_genome_map = {}
    for g in GeneFeature.parse(line_iter):
        rname = rname_prefix + g.tx_name
        strand = 1 if g.strand == '-' else 0
        exon_vectors = [(start, end) for start, end in g.exons]
        if strand:
            exon_vectors.reverse()
        if rname in transcript_genome_map:
            logging.error("Duplicate references %s found in bed file" % (rname))
        transcript_genome_map[rname] = (g.chrom, strand, exon_vectors)
    return transcript_genome_map

def build_transcript_tid_genome_map(bamfh, line_iter, rname_prefix=None):
    """
    same as build_transcript_genome_map() but keyed by BAM reference
    id (tid); transcripts that are not references of 'bamfh' are
    dropped
    """
    # make the standard map
    transcript_genome_map = build_transcript_genome_map(line_iter, rname_prefix)
    # map reference name to tid
    rname_prefix = '' if rname_prefix is None else rname_prefix
    transcript_tid_map = {}
    for tid, rname in enumerate(bamfh.references):
        if rname.startswith(rname_prefix):
            transcript_tid_map[rname] = tid
    # remap using tid as key
    tid_genome_map = {}
    for rname, coords in transcript_genome_map.items():
        if rname not in transcript_tid_map:
            continue
        tid = transcript_tid_map[rname]
        tid_genome_map[tid] = coords
    return tid_genome_map

def transcript_to_genome_pos(rname, pos, transcript_genome_map):
    '''
    translate position 'pos' within transcript 'rname' to genomic
    coordinates using a map built by build_transcript_genome_map()
    (or its tid-keyed variant).  returns a 3-tuple with
    (chrom, strand, pos), or None when 'pos' lies beyond the last exon
    '''
    chrom, strand, intervals = transcript_genome_map[rname]
    # transcript offset of the first base of the current exon
    offset = 0
    for start, end in intervals:
        exon_size = end - start
        if pos < offset + exon_size:
            if strand:
                # minus strand: count backwards from the exon end
                return chrom, strand, start + exon_size - (pos - offset) - 1
            else:
                return chrom, strand, start + (pos - offset)
        #print start, end, offset, pos
        offset += exon_size
    return None
b |
diff -r 000000000000 -r d85dea371064 chimerascan/lib/gene_to_genome.pyc |
b |
Binary file chimerascan/lib/gene_to_genome.pyc has changed |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/lib/gtf.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/lib/gtf.py Thu Sep 07 17:55:18 2017 -0400 |
[ |
'''
Created on Nov 2, 2010

@author: mkiyer

chimerascan: chimeric transcript discovery using RNA-seq

Copyright (C) 2011 Matthew Iyer

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
'''
import subprocess
import os

GTF_EMPTY_FIELD = '.'
GTF_ATTR_SEP = ';'
GTF_ATTR_TAGVALUE_SEP = ' '


def sort_gtf(filename, output_file):
    '''
    Sort the GTF file 'filename' with the external 'sort' program and
    write the result to 'output_file'.

    Sort keys are column 1, then column 4 numerically, then column 3 in
    reverse order.  LC_ALL is forced to "C" for the child process.  The
    exit status of 'sort' is not inspected.
    '''
    args = ["sort", "-k1,1", "-k4,4n", "-k3,3r", filename]
    myenv = os.environ.copy()
    myenv["LC_ALL"] = "C"
    # the context manager guarantees the output handle is closed even
    # when launching 'sort' fails
    with open(output_file, "w") as outfh:
        subprocess.call(args, stdout=outfh, env=myenv)


def window_overlap(a, b):
    '''
    Return True when the (seqid, start, end) tuples 'a' and 'b' lie on
    the same sequence and their ranges intersect.  The comparison is
    inclusive, so ranges that merely touch count as overlapping.
    '''
    if a[0] != b[0]:
        return False
    return (a[1] <= b[2]) and (b[1] <= a[2])


def separate_loci(feature_iter):
    '''
    Group consecutive features into loci.

    'feature_iter' yields objects with 'seqid', 'start' and 'end'
    attributes, in sorted order.  A feature that overlaps the range
    covered by the current group (see window_overlap) joins it and
    extends that range; any other feature starts a new group.

    Yields one list of features per locus.  An empty input yields
    nothing.
    '''
    feature_iter = iter(feature_iter)
    try:
        first = next(feature_iter)
    except StopIteration:
        # empty input: no window was ever created, so there is nothing
        # to yield
        return
    # initialize window
    window = [first]
    window_range = (first.seqid, first.start, first.end)
    # separate into loci
    for feature in feature_iter:
        # check if next transcript is outside current window
        interval = (feature.seqid, feature.start, feature.end)
        if window_overlap(interval, window_range):
            # add transcript to window and grow the covered range
            window.append(feature)
            window_range = (feature.seqid,
                            min(window_range[1], feature.start),
                            max(window_range[2], feature.end))
        else:
            # yield current window and start a new one
            yield window
            window = [feature]
            window_range = interval
    # yield last window
    yield window
class GTFFeature(object):
    '''
    One feature (line) of a GTF file.

    1. seqname - The name of the sequence. Must be a chromosome or scaffold.
    2. source - The program that generated this feature.
    3. feature - The name of this type of feature. Some examples of standard feature types are "CDS", "start_codon", "stop_codon", and "exon".
    4. start - The starting position of the feature in the sequence. The first base is numbered 1.
    5. end - The ending position of the feature (inclusive).
    6. score - A score between 0 and 1000. If the track line useScore attribute is set to 1 for this annotation data set, the score value will determine the level of gray in which this feature is displayed (higher numbers = darker gray). If there is no score value, enter ".".
    7. strand - Valid entries include '+', '-', or '.' (for don't know/don't care).
    8. phase - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'.

    chr1 Cufflinks transcript 136546 137059 1000 . . gene_id "VCAP_SHEZH2.657699"; transcript_id "VCAP_SHEZH2.657699.1"; FPKM "100.7219943204"; frac "1.000000"; conf_lo "80.649925"; conf_hi "120.794064"; cov "2.198209";

    In memory 'start' is stored 0-based, so (start, end) is a half-open
    interval; the conversion back to 1-based happens in __str__.
    '''
    __slots__ = ('seqid', 'source', 'feature_type', 'start', 'end',
                 'score', 'strand', 'phase', 'attrs')

    def __str__(self):
        '''Format the feature as one tab-delimited GTF line.'''
        line = [self.seqid,
                self.source,
                self.feature_type,
                # convert to 1-based intervals
                str(self.start + 1),
                str(self.end),
                str(self.score),
                str(self.strand),
                self.phase]
        attr_str = ' '.join('%s "%s";' % (k, v)
                            for (k, v) in self.attrs.items())
        line.append(attr_str)
        return '\t'.join(line)

    @staticmethod
    def from_string(line, attr_defs=None):
        '''
        Parse one tab-delimited GTF line into a GTFFeature.

        'attr_defs' optionally maps attribute tags to a callable that
        converts the attribute's string value (for example float).

        A missing score ('.') is stored as 0, and any strand other than
        '+' or '-' is stored as '.'.  Attribute values may be quoted or
        bare (e.g. 'level 2;'); a bare value is kept as its stripped
        text.  A line with no attribute column gets an empty 'attrs'.
        '''
        f = GTFFeature()
        # read the GTF line
        fields = line.strip().split('\t')
        f.seqid = fields[0]
        f.source = fields[1]
        f.feature_type = fields[2]
        # convert from 1-based (inclusive) to 0-based (exclusive) intervals
        f.start = int(fields[3]) - 1
        f.end = int(fields[4])
        f.score = 0 if (fields[5] == '.') else float(fields[5])
        strand = fields[6]
        if strand not in ('+', '-'):
            strand = GTF_EMPTY_FIELD
        f.strand = strand
        f.phase = fields[7]
        attrs = {}
        # line.strip() removes a trailing empty attribute column, so the
        # ninth field may legitimately be absent
        attr_field = fields[8] if len(fields) > 8 else GTF_EMPTY_FIELD
        if attr_field != GTF_EMPTY_FIELD:
            for a in attr_field.split(GTF_ATTR_SEP):
                a = a.strip()
                if len(a) == 0:
                    continue
                tag, _, value = a.partition(GTF_ATTR_TAGVALUE_SEP)
                value = value.strip()
                # remove quotes; bare values have none and are kept as is
                if '"' in value:
                    value = value.split('"')[1]
                # apply parsing function
                if ((attr_defs is not None) and (tag in attr_defs) and
                        (attr_defs[tag] is not None)):
                    value = attr_defs[tag](value)
                attrs[tag] = value
        f.attrs = attrs
        return f

    @staticmethod
    def parse(line_iter, attr_defs=None):
        '''
        Yield a GTFFeature for every line of 'line_iter', skipping empty
        or whitespace-only lines and lines starting with '#'.
        '''
        for line in line_iter:
            if (not line) or (not line.strip()) or line.startswith("#"):
                continue
            yield GTFFeature.from_string(line, attr_defs)
b |
diff -r 000000000000 -r d85dea371064 chimerascan/lib/sam.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/lib/sam.py Thu Sep 07 17:55:18 2017 -0400 |
[ |
'''
Created on Jun 2, 2011

@author: mkiyer

chimerascan: chimeric transcript discovery using RNA-seq

Copyright (C) 2011 Matthew Iyer

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
'''
import operator

from chimerascan import pysam
from seq import DNA_reverse_complement

#
# constants used for CIGAR alignments
#
CIGAR_M = 0 #match  Alignment match (can be a sequence match or mismatch)
CIGAR_I = 1 #insertion  Insertion to the reference
CIGAR_D = 2 #deletion  Deletion from the reference
CIGAR_N = 3 #skip  Skipped region from the reference
CIGAR_S = 4 #softclip  Soft clip on the read (clipped sequence present in <seq>)
CIGAR_H = 5 #hardclip  Hard clip on the read (clipped sequence NOT present in <seq>)
CIGAR_P = 6 #padding  Padding (silent deletion from the padded reference sequence)


def parse_reads_by_qname(samfh):
    """
    generator function to parse and return lists of
    reads that share the same qname

    reads with the same qname must be adjacent in 'samfh'
    """
    reads = []
    for read in samfh:
        if len(reads) > 0 and read.qname != reads[-1].qname:
            yield reads
            reads = []
        reads.append(read)
    if len(reads) > 0:
        yield reads


def parse_pe_reads(bamfh):
    """
    generator function that yields, for each fragment, a 2-tuple of
    lists (read1 alignments, read2 alignments)

    reads must be sorted by qname; the 'is_read2' flag selects the list
    """
    pe_reads = ([], [])
    num_reads = 0
    prev_qname = None
    for read in bamfh:
        # get read attributes
        qname = read.qname
        readnum = 1 if read.is_read2 else 0
        # if query name changes we have completely finished
        # the fragment and can reset the read data
        if num_reads > 0 and qname != prev_qname:
            yield pe_reads
            # reset state variables
            pe_reads = ([], [])
            num_reads = 0
        pe_reads[readnum].append(read)
        prev_qname = qname
        num_reads += 1
    if num_reads > 0:
        yield pe_reads


def parse_unpaired_pe_reads(bamfh):
    """
    parses alignments that were aligned in single read mode
    and hence all hits are labeled as 'read1' and lack mate
    information.  instead the read1 read2 information is
    attached to the 'qname' field

    the last character of the qname must be '1' or '2'; it and the
    character before it are stripped from the qname, and the matching
    is_read1 / is_read2 flag is set on the read.  yields the same
    2-tuple of lists as parse_pe_reads.

    raises ValueError when the qname does not end in 1 or 2
    """
    pe_reads = ([], [])
    num_reads = 0
    prev_qname = None
    for read in bamfh:
        # extract read1/2 from qname
        readnum = int(read.qname[-1])
        if readnum == 1:
            read.is_read1 = True
            mate = 0
        elif readnum == 2:
            mate = 1
            read.is_read2 = True
        else:
            # previously 'mate' stayed unbound (first read) or kept the
            # value of the preceding read, silently misfiling the hit
            raise ValueError("Cannot determine read number from qname "
                             "'%s'" % (read.qname))
        # reconstitute correct qname
        qname = read.qname[:-2]
        read.qname = qname
        # if query name changes we have completely finished
        # the fragment and can reset the read data
        if num_reads > 0 and qname != prev_qname:
            yield pe_reads
            # reset state variables
            pe_reads = ([], [])
            num_reads = 0
        pe_reads[mate].append(read)
        prev_qname = qname
        num_reads += 1
    if num_reads > 0:
        yield pe_reads


def select_best_mismatch_strata(reads, mismatch_tolerance=0):
    """
    return the reads whose 'NM' tag is within 'mismatch_tolerance' of
    the lowest 'NM' among the mapped reads

    unmapped reads are ranked one mismatch worse than the worst mapped
    read.  when no read is mapped, all (unmapped) reads are returned.
    """
    if len(reads) == 0:
        return []
    # sort reads by number of mismatches
    mapped_reads = []
    unmapped_reads = []
    for r in reads:
        if r.is_unmapped:
            unmapped_reads.append(r)
        else:
            mapped_reads.append((r.opt('NM'), r))
    if len(mapped_reads) == 0:
        return unmapped_reads
    sorted_reads = sorted(mapped_reads, key=operator.itemgetter(0))
    best_nm = sorted_reads[0][0]
    worst_nm = sorted_reads[-1][0]
    sorted_reads.extend((worst_nm+1, r) for r in unmapped_reads)
    # choose reads within a certain mismatch tolerance
    best_reads = []
    for mismatches, r in sorted_reads:
        if mismatches > (best_nm + mismatch_tolerance):
            break
        best_reads.append(r)
    return best_reads


def copy_read(r):
    """
    return a new pysam.AlignedRead carrying the same qname, seq, flag,
    rname, pos, mapq, cigar, mrnm, mpos, isize, qual and tags as 'r'
    """
    a = pysam.AlignedRead()
    a.qname = r.qname
    a.seq = r.seq
    a.flag = r.flag
    a.rname = r.rname
    a.pos = r.pos
    a.mapq = r.mapq
    a.cigar = r.cigar
    a.mrnm = r.mrnm
    a.mpos = r.mpos
    a.isize = r.isize
    a.qual = r.qual
    a.tags = r.tags
    return a


def soft_pad_read(fq, r):
    """
    'fq' is the fastq record
    'r' in the AlignedRead SAM read

    replaces the sequence and qualities of 'r' (in place) with the full
    length ones from 'fq' and, when 'fq' is longer, adds a soft clip of
    the extra length to the CIGAR: in front for reverse strand reads,
    at the end otherwise
    """
    # make sequence soft clipped
    ext_length = len(fq.seq) - len(r.seq)
    cigar_softclip = [(CIGAR_S, ext_length)]
    cigar = r.cigar
    # reconstitute full length sequence in read
    if r.is_reverse:
        seq = DNA_reverse_complement(fq.seq)
        qual = fq.qual[::-1]
        if (cigar is not None) and (ext_length > 0):
            cigar = cigar_softclip + cigar
    else:
        seq = fq.seq
        qual = fq.qual
        if (cigar is not None) and (ext_length > 0):
            cigar = cigar + cigar_softclip
    # replace read field
    r.seq = seq
    r.qual = qual
    r.cigar = cigar


def pair_reads(r1, r2, tags=None):
    '''
    fill in paired-end fields in SAM record

    both reads are modified in place: pairing flags, mate position and
    reference, 'tags' appended to each read's tags, and the insert size
    (0 when the reads are on different references)
    '''
    if tags is None:
        tags = []
    # convert read1 to paired-end
    r1.is_paired = True
    r1.is_proper_pair = True
    r1.is_read1 = True
    r1.mate_is_reverse = r2.is_reverse
    r1.mate_is_unmapped = r2.is_unmapped
    r1.mpos = r2.pos
    r1.mrnm = r2.rname
    r1.tags = r1.tags + tags
    # convert read2 to paired-end
    r2.is_paired = True
    r2.is_proper_pair = True
    r2.is_read2 = True
    r2.mate_is_reverse = r1.is_reverse
    r2.mate_is_unmapped = r1.is_unmapped
    r2.mpos = r1.pos
    r2.mrnm = r1.rname
    r2.tags = r2.tags + tags
    # compute insert size
    if r1.rname != r2.rname:
        r1.isize = 0
        r2.isize = 0
    elif r1.pos > r2.pos:
        isize = r1.aend - r2.pos
        r1.isize = -isize
        r2.isize = isize
    else:
        isize = r2.aend - r1.pos
        r1.isize = isize
        r2.isize = -isize


def get_clipped_interval(r):
    """
    return (start, end) of the alignment of 'r' widened by the length
    of a leading and/or trailing soft or hard clip in its CIGAR

    reads without a CIGAR, or with a single CIGAR operation, are
    returned as (r.pos, r.aend) unchanged
    """
    cigar = r.cigar
    padstart, padend = r.pos, r.aend
    clip_ops = (CIGAR_S, CIGAR_H)
    if (cigar is not None) and (len(cigar) > 1):
        # the two ends are independent: a read clipped on both sides
        # must be padded on both sides (this used to be an 'elif')
        if cigar[0][0] in clip_ops:
            padstart -= cigar[0][1]
        if cigar[-1][0] in clip_ops:
            padend += cigar[-1][1]
    return padstart, padend
b |
diff -r 000000000000 -r d85dea371064 chimerascan/lib/sam.pyc |
b |
Binary file chimerascan/lib/sam.pyc has changed |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/lib/seq.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/lib/seq.py Thu Sep 07 17:55:18 2017 -0400 |
[ |
'''
Created on Jan 5, 2011

@author: Dan Blankenberg

Code from the Galaxy project (http://galaxy.psu.edu)
Contains methods to transform sequence strings
'''
import string
from math import log10
try:
    from string import maketrans
except ImportError:
    # the function lives on 'str' in interpreters where the string
    # module no longer provides it
    maketrans = str.maketrans

# Quality score formats
SANGER_FORMAT = "sanger"
SOLEXA_FORMAT = "solexa"
ILLUMINA_FORMAT = "illumina"
FASTQ_QUAL_FORMATS = [SANGER_FORMAT, SOLEXA_FORMAT, ILLUMINA_FORMAT]

#Translation table for reverse Complement, with ambiguity codes
DNA_COMPLEMENT = maketrans( "ACGTRYKMBDHVacgtrykmbdhv", "TGCAYRMKVHDBtgcayrmkvhdb" )
RNA_COMPLEMENT = maketrans( "ACGURYKMBDHVacgurykmbdhv", "UGCAYRMKVHDBugcayrmkvhdb" )
#Translation table for DNA <--> RNA
DNA_TO_RNA = maketrans( "Tt", "Uu" )
RNA_TO_DNA = maketrans( "Uu", "Tt" )

def DNA_complement( sequence ):
    '''complement DNA sequence string'''
    return sequence.translate( DNA_COMPLEMENT )

def DNA_reverse_complement( sequence ):
    '''returns the reverse complement of the sequence'''
    return DNA_complement(sequence[::-1])

def to_DNA( sequence ):
    '''convert an RNA sequence string to DNA (U -> T)'''
    # the two conversion tables used to be swapped between to_DNA and
    # to_RNA, so each function performed the opposite conversion
    return sequence.translate( RNA_TO_DNA )

def RNA_complement( sequence ):
    '''complement RNA sequence string'''
    return sequence.translate( RNA_COMPLEMENT )

def RNA_reverse_complement( self, sequence=None ):
    '''
    returns the reverse complement of the RNA sequence

    'self' is a leftover from when this was a method.  The sequence may
    be passed on its own (preferred) or, as before, as the second
    argument.
    '''
    if sequence is None:
        sequence = self
    return RNA_complement( sequence[::-1] )

def to_RNA( sequence ):
    '''convert a DNA sequence string to RNA (T -> U)'''
    return sequence.translate( DNA_TO_RNA )

def _all_chars():
    '''string of the 256 characters chr(0)..chr(255), in order'''
    return ''.join(map(chr, range(256)))

def get_solexa_qual_conversion_table():
    """
    return a translation table that can be used by str.translate() for
    converting solexa to sanger quality scores
    """
    offset = 64
    conv_table = ['!'] * 256
    conv_table[offset:] = "I" * (256-offset)
    for solq in range(-5, 40):
        phredq = 10*log10(1 + 10**(solq/10.0))
        phredchr = chr(int(round(33 + phredq)))
        conv_table[offset + solq] = phredchr
    conv_string = ''.join(conv_table)
    return maketrans(_all_chars(), conv_string)

def get_illumina_qual_conversion_table():
    """Illumina 1.3+ format can encode a Phred quality score from 0 to 62
    using ASCII 64 to 126 (although in raw read data Phred scores from 0
    to 40 only are expected).

    return a translation table to sanger quality characters; scores of
    40 and above all map to 'I'
    """
    offset = 64
    conv_table = ['!'] * 256
    for x in range(0, 62):
        conv_table[offset+x] = chr(33 + x)
    conv_table[offset+40:] = "I" * (256-(offset+40))
    conv_string = ''.join(conv_table)
    return maketrans(_all_chars(), conv_string)

def get_sanger_qual_conversion_table():
    """
    return a translation table that clamps sanger quality characters to
    the range '!' .. 'I'
    """
    offset = 33
    tbl = list(map(chr, range(256)))
    tbl[:offset] = "!" * offset
    tbl[offset+40:] = "I" * (256-(offset+40))
    return maketrans(_all_chars(), ''.join(tbl))

def get_qual_conversion_func(qual_format):
    """
    return a function converting a quality string in 'qual_format' to
    sanger qualities

    raises KeyError for an unknown format
    """
    # only the requested table is built
    table_builders = {SANGER_FORMAT: get_sanger_qual_conversion_table,
                      ILLUMINA_FORMAT: get_illumina_qual_conversion_table,
                      SOLEXA_FORMAT: get_solexa_qual_conversion_table}
    tbl = table_builders[qual_format]()
    return lambda q: q.translate(tbl)

class FASTQRecord:
    __slots__ = ("qname", "seq", "qual", "readnum")
    def __init__(self, qname, seq, qual, readnum):
        self.qname = qname
        self.seq = seq
        self.qual = qual
        self.readnum = readnum

    def to_string(self):
        '''format as four FASTQ lines, without a trailing newline'''
        return ("@%s/%d\n%s\n+\n%s" %
                (self.qname, self.readnum, self.seq, self.qual))

def parse_fastq_record(line_iter,
                       convert_quals=False,
                       qual_format=SANGER_FORMAT):
    """
    generator yielding a FASTQRecord for every four lines of
    'line_iter'

    the header line must end in the read number preceded by one
    separator character (e.g. '@name/1'); both are stripped from the
    qname.  qualities are converted to sanger only when 'convert_quals'
    is set.  an incomplete record at the end of the input is dropped.
    """
    qual_func = get_qual_conversion_func(qual_format)
    line_iter = iter(line_iter)
    try:
        while True:
            # qname
            qname = next(line_iter).rstrip()[1:]
            readnum = int(qname[-1])
            qname = qname[:-2]
            # seq
            seq = next(line_iter).rstrip()
            # qname again (skip)
            next(line_iter)
            # qual
            qual = next(line_iter).rstrip()
            if convert_quals:
                qual = qual_func(qual)
            yield FASTQRecord(qname, seq, qual, readnum)
    except StopIteration:
        pass
def calc_homology(seq1, seq2, num_mismatches):
    '''
    Compare 'seq1' and 'seq2' position by position from the start and
    return the index at which the number of mismatches first exceeds
    'num_mismatches'.

    When that never happens the length of the shorter sequence is
    returned, which is 0 when either sequence is empty.
    '''
    smallest_len = min(len(seq1), len(seq2))
    mm = 0
    for i in range(smallest_len):
        if seq1[i] != seq2[i]:
            mm += 1
            if mm > num_mismatches:
                return i
    # the whole common prefix is within tolerance; returning the length
    # directly (instead of 'last index + 1') is also right for empty input
    return smallest_len

BASES_PER_LINE = 50
def split_seq(seq, chars_per_line=BASES_PER_LINE):
    '''
    Break 'seq' into lines of at most 'chars_per_line' characters,
    joined by newlines (no trailing newline).

    Raises ValueError when 'chars_per_line' is not positive.
    '''
    if chars_per_line <= 0:
        # a non-positive width can never consume the sequence
        raise ValueError("chars_per_line must be positive")
    return '\n'.join(seq[pos:pos + chars_per_line]
                     for pos in range(0, len(seq), chars_per_line))
b |
diff -r 000000000000 -r d85dea371064 chimerascan/lib/seq.pyc |
b |
Binary file chimerascan/lib/seq.pyc has changed |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/lib/stats.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/lib/stats.py Thu Sep 07 17:55:18 2017 -0400 |
[ |
b'@@ -0,0 +1,317 @@\n+\'\'\'\n+Created on Jan 30, 2011\n+\n+@author: mkiyer\n+\'\'\'\n+import math\n+from math import log\n+from collections import defaultdict\n+\n+def comb(N,k):\n+ """\n+ This function was taken from scipy 0.9.0rc1\n+ \n+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \n+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT \n+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS \n+ FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE \n+ COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, \n+ INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES \n+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR \n+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) \n+ HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, \n+ STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING \n+ IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE \n+ POSSIBILITY OF SUCH DAMAGE.\n+ \n+ The number of combinations of N things taken k at a time.\n+ This is often expressed as "N choose k".\n+\n+ Parameters\n+ ----------\n+ N : int, array\n+ Number of things.\n+ k : int, array\n+ Number of elements taken.\n+\n+ Returns\n+ -------\n+ val : int, array\n+ The total number of combinations.\n+\n+ Notes\n+ -----\n+ - Array arguments accepted only for exact=0 case.\n+ - If k > N, N < 0, or k < 0, then a 0 is returned.\n+\n+ Examples\n+ --------\n+ >>> k = np.array([3, 4])\n+ >>> n = np.array([10, 10])\n+ >>> comb(n, k, exact=False)\n+ array([ 120., 210.])\n+ >>> comb(10, 3, exact=True)\n+ 120L\n+ """\n+ if (k > N) or (N < 0) or (k < 0):\n+ return 0L\n+ val = 1L\n+ for j in xrange(min(k, N-k)):\n+ val = (val*(N-j))//(j+1)\n+ return val\n+\n+def normal_pdf(x, m, v):\n+ return 1.0/math.sqrt(2*math.pi*v) * math.exp(-(x-m)**2/(2*v))\n+\n+def binomial_pdf(p, n, k):\n+ if n < 100:\n+ return comb(n, k) * p**k * p**(n-k) # Fall back to your 
current method\n+ return normal_pdf(k, n*p, n*p*(1.0-p))\n+\n+def binomial_cdf(p, n, k):\n+ return sum(binomial_pdf(p,n,x) for x in xrange(k+1))\n+\n+def _interpolate(a, b, fraction):\n+ """\n+ This function was taken from scipy 0.9.0rc1\n+ \n+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \n+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT \n+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS \n+ FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE \n+ COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, \n+ INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES \n+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR \n+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) \n+ HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, \n+ STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING \n+ IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE \n+ POSSIBILITY OF SUCH DAMAGE.\n+\n+ Returns the point at the given fraction between a and b, where\n+ \'fraction\' must be between 0 and 1.\n+ """\n+ return a + (b - a)*fraction;\n+\n+def scoreatpercentile(values, p):\n+ """\n+ This function was taken from scipy 0.9.0rc1\n+ \n+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \n+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT \n+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS \n+ FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE \n+ COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, \n+ INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES \n+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR \n+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) \n'..b'for x in arr\n+ if x > 0)\n+ return kldiv\n+\n+def poisson(m):\n+ \'\'\'\n+ courtesy (http://telliott99.blogspot.com/2010/02/replot-poisson-example-with-python.html)\n+ \'\'\'\n+ def f(k):\n+ e = math.e**(-m)\n+ f = math.factorial(k)\n+ g = m**k\n+ return g*e/f\n+ return f\n+\n+def std(a):\n+ # find the mean\n+ n = len(a)\n+ mean = mean(a)\n+ # find the standard deviation\n+ std = sum((x - mean)**2 for x in a)\n+ std = (std / float(n-1))**0.5\n+ return std\n+\n+def normmeanCI(p, xbar, sd, n):\n+ """\n+ Computes a p x 100 CI for the given arguments\n+ p - confidence coefficient, common values are 0.99, 0.95, 0.90\n+ xbar - sample point estimate of unknown pop. mean.\n+ sd - standard deviation\n+ n - sample size\n+ """\n+ se = sd / (n ** 0.5)\n+ alphadiv2 = (1.0- p)/2.0\n+ z2 = stat.norm. 
ppf(1-alphadiv2)\n+ a = xbar - z2 * se\n+ b = xbar + z2 * se\n+ return (a, b)\n+\n+def median(a):\n+ b = sorted(a)\n+ ind,odd = divmod(len(b),2)\n+ median = (b[ind] + b[ind+odd]) / 2.0\n+\n+def mean(a):\n+ return sum(a)/float(len(a))\n+\n+class EmpiricalCdf3D(object):\n+ \n+ def prob(self, x, y, z):\n+ if self.n == 0:\n+ return 0.0\n+ # find prob(X = x) by summing all y\'s and z\'a\n+ nx = 0\n+ ydict = self.D[x]\n+ for zdict in ydict.itervalues(): \n+ nz_given_y = sum(zdict.itervalues())\n+ nx += nz_given_y\n+ if nx == 0:\n+ return 0.0\n+ px = nx / float(self.n) \n+ # find prob(Y = y | X = x)\n+ ny_given_x = sum(self.D[x][y].itervalues())\n+ if ny_given_x == 0:\n+ return 0.0\n+ py_given_x = ny_given_x / float(nx)\n+ # find prob(Z = z | Y=y, X=x)\n+ nz_given_xy = self.D[x][y][z]\n+ if nz_given_xy == 0:\n+ return 0.0\n+ pz_given_xy = nz_given_xy / float(ny_given_x) \n+ # multiply together\n+ return pz_given_xy * py_given_x * px\n+\n+ def _count(self, x, y, z):\n+ total = 0\n+ xkeys = sorted(self.D.iterkeys())\n+ for xval in xkeys:\n+ if xval > x:\n+ break\n+ ykeys = sorted(self.D[xval].iterkeys())\n+ for yval in ykeys:\n+ if yval > y:\n+ break\n+ zkeys = sorted(self.D[xval][yval].iterkeys())\n+ for zval in zkeys:\n+ if zval > z:\n+ break\n+ total += self.D[xval][yval][zval]\n+ return total\n+\n+ def __init__(self, data_iter):\n+ # use dict as sparse matrix for now\n+ self.D = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0)))\n+ self.n = 0\n+ for x,y,z in data_iter:\n+ self.n += 1\n+ self.D[x][y][z] += 1\n+ # turn into dicts\n+ for xval, ydict in self.D.iteritems():\n+ self.D[xval] = dict(ydict)\n+ for yval, zdict in ydict.iteritems():\n+ self.D[xval][yval] = dict(zdict)\n+ self.CDF = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0))) \n+ # turn into cumulative counts\n+ xkeys = sorted(self.D.iterkeys())\n+ for xval in xkeys: \n+ ykeys = sorted(self.D[xval].iterkeys())\n+ for yval in ykeys:\n+ zkeys = 
sorted(self.D[xval][yval].iterkeys())\n+ for zval in zkeys:\n+ c = self._count(xval, yval, zval)\n+ self.CDF[xval][yval][zval] = c \n+\n+ def __call__(self, x, y, z):\n+ return self.CDF[x][y][z] / float(self.n)\n+\n+if __name__ == \'__main__\':\n+ import random\n+ X = [random.randrange(0, 5) for x in xrange(100)]\n+ Y = [random.randrange(0, 5) for y in xrange(100)]\n+ Z = [random.randrange(0, 5) for z in xrange(100)]\n+ import itertools\n+ x = EmpiricalCdf3D(itertools.izip(X,Y,Z))\n+ print x.n \n+ print x(4, 4, 4)\n+\n+ \n+\n' |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pipeline/__init__.pyc |
b |
Binary file chimerascan/pipeline/__init__.pyc has changed |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pipeline/align_bowtie.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pipeline/align_bowtie.py Thu Sep 07 17:55:18 2017 -0400 |
[ |
b'@@ -0,0 +1,282 @@\n+\'\'\'\n+Created on Jun 1, 2011\n+\n+@author: mkiyer\n+\n+chimerascan: chimeric transcript discovery using RNA-seq\n+\n+Copyright (C) 2011 Matthew Iyer\n+\n+This program is free software: you can redistribute it and/or modify\n+it under the terms of the GNU General Public License as published by\n+the Free Software Foundation, either version 3 of the License, or\n+(at your option) any later version.\n+\n+This program is distributed in the hope that it will be useful,\n+but WITHOUT ANY WARRANTY; without even the implied warranty of\n+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n+GNU General Public License for more details.\n+\n+You should have received a copy of the GNU General Public License\n+along with this program. If not, see <http://www.gnu.org/licenses/>.\n+\'\'\'\n+import sys\n+import os\n+import logging\n+import subprocess\n+\n+from chimerascan.lib.base import LibraryTypes\n+from chimerascan.lib.seq import SANGER_FORMAT, SOLEXA_FORMAT, ILLUMINA_FORMAT\n+from chimerascan.lib import config\n+\n+translate_quals = {SOLEXA_FORMAT: \'solexa-quals\',\n+ ILLUMINA_FORMAT: \'solexa1.3-quals\',\n+ SANGER_FORMAT: \'phred33-quals\'}\n+\n+def translate_library_type(library_type):\n+ """\n+ returns the bowtie library type option \'--fr\' or \'--ff\' corresponding\n+ to the first two characters of the library type string\n+ """\n+ return library_type[0:2]\n+\n+_sam2bam_script = os.path.join(os.path.dirname(__file__), "sam2bam.py")\n+_fastq_trim_script = os.path.join(os.path.dirname(__file__), "fastq_merge_trim.py")\n+\n+def align_pe(fastq_files, \n+ bowtie_index,\n+ output_bam_file, \n+ unaligned_fastq_param=None,\n+ maxmultimap_fastq_param=None,\n+ min_fragment_length=0,\n+ max_fragment_length=1000,\n+ trim5=0,\n+ trim3=0,\n+ library_type=LibraryTypes.FR_UNSTRANDED,\n+ num_processors=1, \n+ quals=SANGER_FORMAT,\n+ multihits=100, \n+ mismatches=2, \n+ bowtie_bin="bowtie", \n+ bowtie_args=None,\n+ log_file=None,\n+ 
keep_unmapped=False):\n+ args = [bowtie_bin, "-q", "-S", \n+ "-p", str(num_processors),\n+ "--%s" % translate_quals[quals],\n+ "-k", str(multihits),\n+ "-m", str(multihits),\n+ "-v", str(mismatches),\n+ "--minins", min_fragment_length,\n+ "--maxins", max_fragment_length,\n+ "--trim5", trim5,\n+ "--trim3", trim3,\n+ "--%s" % translate_library_type(library_type)]\n+ if unaligned_fastq_param is not None:\n+ args.extend(["--un", unaligned_fastq_param])\n+ if maxmultimap_fastq_param is not None:\n+ args.extend(["--max", maxmultimap_fastq_param]) \n+ if bowtie_args is not None: \n+ args.extend(bowtie_args.split())\n+ args += [bowtie_index, \n+ "-1", fastq_files[0],\n+ "-2", fastq_files[1]]\n+ args = map(str, args)\n+ logging.debug("Bowtie alignment args: %s" % (\' \'.join(args)))\n+ # setup logging\n+ if log_file is not None:\n+ logfh = open(log_file, "w")\n+ else:\n+ logfh = None\n+ aln_p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=logfh)\n+ # pipe the bowtie SAM output to a filter that writes BAM format\n+ args = [sys.executable, _sam2bam_script, \n+ "--multihits", str(multihits),\n+ "--quals", quals]\n+ if keep_unmapped:\n+ args.append("--un")\n+ args.extend([output_bam_file, "-"])\n+ args.extend(fastq_files)\n+ logging.debug("SAM to BAM converter args: %s" % (\' \'.join(args)))\n+ retcode = subprocess.call(args, stdin=aln_p.stdout, stderr=logfh) \n+ if logfh is not None:\n+ logfh.close()\n+ if retcode != 0:\n+ logging.error("SAM to BAM conversion script failed")\n+ aln_p.terminate()\n+ # cleanup output file\n+ if os.path.exists(output_bam_file):\n+ '..b', _sam2bam_script, \n+ "--multihits", str(multihits),\n+ "--quals", quals]\n+ if keep_unmapped:\n+ args.append("--un")\n+ args.extend([output_bam_file, "-"])\n+ args.append(fastq_file) \n+ logging.debug("SAM to BAM converter args: %s" % (\' \'.join(args)))\n+ fix_p = subprocess.Popen(args, stdin=aln_p.stdout, stderr=logfh)\n+ # wait for processes to complete\n+ retcode1 = fix_p.wait()\n+ if retcode1 != 
0:\n+ logging.error("SAM to BAM conversion script failed")\n+ # kill alignment process\n+ aln_p.kill()\n+ # cleanup output file\n+ if os.path.exists(output_bam_file):\n+ os.remove(output_bam_file)\n+ # end logging\n+ if logfh is not None:\n+ logfh.close()\n+ return config.JOB_ERROR\n+ retcode2 = aln_p.wait()\n+ # end logging\n+ if logfh is not None:\n+ logfh.close()\n+ if retcode2 != 0:\n+ logging.error("Alignment process failed")\n+ # cleanup output file\n+ if os.path.exists(output_bam_file):\n+ os.remove(output_bam_file)\n+ return config.JOB_ERROR\n+ return config.JOB_SUCCESS\n+\n+\n+def trim_align_pe_sr(fastq_files,\n+ bowtie_index,\n+ output_bam_file,\n+ unaligned_fastq_param=None,\n+ maxmultimap_fastq_param=None,\n+ trim5=0,\n+ library_type=LibraryTypes.FR_UNSTRANDED,\n+ num_processors=1, \n+ quals=SANGER_FORMAT,\n+ multihits=100, \n+ mismatches=2, \n+ bowtie_bin="bowtie", \n+ bowtie_args=None,\n+ log_file=None,\n+ segment_length=25,\n+ keep_unmapped=False):\n+ # setup logging\n+ if log_file is not None:\n+ logfh = open(log_file, "w")\n+ else:\n+ logfh = None\n+ #\n+ # Merge paired-end reads into single fastq file\n+ #\n+ args = [sys.executable, _fastq_trim_script, \n+ "--trim5", str(trim5), \n+ "--segment-length", str(segment_length)]\n+ args.extend(fastq_files)\n+ args.append("-")\n+ logging.debug("FASTQ trimming args: %s" % (\' \'.join(args)))\n+ trim_p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=logfh)\n+ #\n+ # Align the trimmed reads\n+ #\n+ args = [bowtie_bin, "-q", "-S", \n+ "-p", str(num_processors),\n+ "--tryhard",\n+ "--%s" % translate_quals[quals],\n+ "-k", str(multihits),\n+ "-m", str(multihits),\n+ "-v", str(mismatches),\n+ "--%s" % translate_library_type(library_type)]\n+ if unaligned_fastq_param is not None:\n+ args.extend(["--un", unaligned_fastq_param])\n+ if maxmultimap_fastq_param is not None:\n+ args.extend(["--max", maxmultimap_fastq_param]) \n+ if bowtie_args is not None: \n+ args.extend(bowtie_args.split())\n+ args += 
[bowtie_index, "-"]\n+ logging.debug("Alignment args: %s" % (\' \'.join(args)))\n+ aln_p = subprocess.Popen(args, stdin=trim_p.stdout, \n+ stdout=subprocess.PIPE,\n+ stderr=logfh)\n+ #\n+ # Fix alignment ordering and convert to BAM, also extend sequences\n+ # back to full length by adding padding to CIGAR string\n+ #\n+ args = [sys.executable, _sam2bam_script, \n+ "--multihits", str(multihits),\n+ "--quals", quals,\n+ "--pesr", \n+ "--softclip"] \n+ if keep_unmapped:\n+ args.append("--un")\n+ args.extend([output_bam_file, "-"])\n+ args.extend(fastq_files)\n+ logging.debug("SAM to BAM converter args: %s" % (\' \'.join(args)))\n+ fix_p = subprocess.Popen(args, stdin=aln_p.stdout, stderr=logfh)\n+ # wait for processes to complete\n+ fix_p.wait()\n+ aln_p.wait()\n+ trim_p.wait()\n+ # end logging\n+ if logfh is not None:\n+ logfh.close()\n' |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pipeline/align_bowtie.pyc |
b |
Binary file chimerascan/pipeline/align_bowtie.pyc has changed |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pipeline/chimeras_to_breakpoints.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pipeline/chimeras_to_breakpoints.py Thu Sep 07 17:55:18 2017 -0400 |
[ |
'''
Created on Jun 11, 2011

@author: mkiyer
'''
import logging
import os
import collections

from chimerascan import pysam
from chimerascan.lib import config
from chimerascan.lib.chimera import Chimera
from chimerascan.lib.batch_sort import batch_sort
from chimerascan.lib.seq import split_seq


def _write_breakpoint(fastafh, mapfh, breakpoint_name, seq, chimera_names):
    """
    write a single breakpoint as a FASTA record to 'fastafh' and as a
    tab-delimited (name, sequence, comma-separated chimera names) row
    to 'mapfh'
    """
    fastafh.write(">%s\n%s\n" % (breakpoint_name, split_seq(seq)))
    # chimera names are always sorted so that the map file is deterministic
    mapfh.write("%s\t%s\t%s\n" % (breakpoint_name, seq,
                                  ",".join(sorted(chimera_names))))


def chimeras_to_breakpoints(input_file, breakpoint_sorted_chimera_file,
                            breakpoint_map_file, breakpoint_fasta_file,
                            tmp_dir):
    """
    sorts the chimeras in 'input_file' by breakpoint name (result written
    to 'breakpoint_sorted_chimera_file'), then writes one FASTA record per
    distinct breakpoint to 'breakpoint_fasta_file' and one row per distinct
    breakpoint to 'breakpoint_map_file' listing the chimeras that share it

    'tmp_dir' is handed to batch_sort as its only temporary directory
    """
    # sort chimera file by breakpoint name
    def sortfunc(line):
        fields = line.strip().split('\t')
        return fields[Chimera.BREAKPOINT_NAME_FIELD]
    batch_sort(input=input_file,
               output=breakpoint_sorted_chimera_file,
               key=sortfunc,
               buffer_size=32000,
               tempdirs=[tmp_dir])
    # parse and build breakpoint -> chimera map
    handles = []
    try:
        chimerafh = open(breakpoint_sorted_chimera_file)
        handles.append(chimerafh)
        fastafh = open(breakpoint_fasta_file, "w")
        handles.append(fastafh)
        mapfh = open(breakpoint_map_file, "w")
        handles.append(mapfh)
        prev_breakpoint_name = None
        prev_seq = None
        chimera_names = set()
        for c in Chimera.parse(chimerafh):
            if c.breakpoint_name != prev_breakpoint_name:
                # the input is sorted, so a name change means the previous
                # breakpoint is complete and can be flushed
                if chimera_names:
                    _write_breakpoint(fastafh, mapfh, prev_breakpoint_name,
                                      prev_seq, chimera_names)
                    chimera_names = set()
                prev_seq = c.breakpoint_seq_5p + c.breakpoint_seq_3p
                prev_breakpoint_name = c.breakpoint_name
            chimera_names.add(c.name)
        # flush the final breakpoint
        if chimera_names:
            _write_breakpoint(fastafh, mapfh, prev_breakpoint_name,
                              prev_seq, chimera_names)
    finally:
        # close everything that was opened, even when parsing fails
        for fh in handles:
            fh.close()


def main():
    """command line entry point; see the usage string for the arguments"""
    from optparse import OptionParser
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = OptionParser("usage: %prog [options] <chimeras.bedpe> "
                          "<sorted_chimeras.bedpe> "
                          "<breakpoints.txt> <breakpoints.fa> <tmp_dir>")
    options, args = parser.parse_args()
    if len(args) < 5:
        parser.error("incorrect number of arguments")
    input_file = args[0]
    breakpoint_sorted_chimera_file = args[1]
    breakpoint_map_file = args[2]
    breakpoint_fasta_file = args[3]
    # the temporary directory is the fifth argument (it used to be read
    # from args[3], i.e. the FASTA output path)
    tmp_dir = args[4]
    chimeras_to_breakpoints(input_file, breakpoint_sorted_chimera_file,
                            breakpoint_map_file, breakpoint_fasta_file, tmp_dir)


if __name__ == '__main__':
    main()
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pipeline/chimeras_to_breakpoints.pyc |
b |
Binary file chimerascan/pipeline/chimeras_to_breakpoints.pyc has changed |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pipeline/discordant_reads_to_bedpe.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pipeline/discordant_reads_to_bedpe.py Thu Sep 07 17:55:18 2017 -0400 |
[ |
'''
Created on Jul 21, 2011

@author: mkiyer
'''
import logging
import os
import sys

from chimerascan import pysam
from chimerascan.lib import config
from chimerascan.lib.chimera import DiscordantTags, DISCORDANT_TAG_NAME, \
    OrientationTags, ORIENTATION_TAG_NAME, DiscordantRead
from chimerascan.lib.gene_to_genome import build_tid_gene_map
from chimerascan.lib.batch_sort import batch_sort


def parse_pairs(bamfh):
    """
    yield consecutive reads from 'bamfh' two at a time as (r1, r2) tuples

    a trailing read without a partner is silently dropped
    """
    bam_iter = iter(bamfh)
    while True:
        try:
            r1 = next(bam_iter)
            r2 = next(bam_iter)
        except StopIteration:
            # returning (rather than letting StopIteration escape) keeps
            # the generator valid under PEP 479
            return
        yield r1, r2


def parse_gene_discordant_reads(bamfh):
    """
    return tuples of (5',3') reads that both align to transcripts
    """
    for r1, r2 in parse_pairs(bamfh):
        # TODO:
        # for now we are only going to deal with gene-gene
        # chimeras and leave other chimeras for study at a
        # later time
        dr1 = r1.opt(DISCORDANT_TAG_NAME)
        dr2 = r2.opt(DISCORDANT_TAG_NAME)
        if (dr1 != DiscordantTags.DISCORDANT_GENE or
            dr2 != DiscordantTags.DISCORDANT_GENE):
            continue
        # organize key in 5' to 3' order
        or1 = r1.opt(ORIENTATION_TAG_NAME)
        or2 = r2.opt(ORIENTATION_TAG_NAME)
        assert or1 != or2
        if or1 == OrientationTags.FIVEPRIME:
            pair = (r1, r2)
        else:
            pair = (r2, r1)
        yield pair


def discordant_reads_to_bedpe(index_dir, input_bam_file, output_file):
    """
    convert the gene-gene discordant read pairs in 'input_bam_file' into
    one BEDPE-style line each in 'output_file'

    'index_dir' must contain the gene feature file used to look up
    transcript names and strands
    """
    # open BAM alignment file
    bamfh = pysam.Samfile(input_bam_file, "rb")
    try:
        # build a lookup table to get genomic intervals from transcripts
        logging.debug("Reading gene information")
        gene_file = os.path.join(index_dir, config.GENE_FEATURE_FILE)
        tid_gene_map = build_tid_gene_map(bamfh, gene_file,
                                          rname_prefix=config.GENE_REF_PREFIX)
        outfh = open(output_file, "w")
        try:
            logging.debug("Converting BAM to BEDPE format")
            for r5p, r3p in parse_gene_discordant_reads(bamfh):
                # store pertinent read information in lightweight structure
                # called DiscordantRead object.  this departs from SAM
                # format into a custom read format
                dr5p = DiscordantRead.from_read(r5p)
                dr3p = DiscordantRead.from_read(r3p)
                # get gene information
                tx5p = tid_gene_map[r5p.rname]
                tx3p = tid_gene_map[r3p.rname]
                # write bedpe format
                fields = [tx5p.tx_name, r5p.pos, r5p.aend,
                          tx3p.tx_name, r3p.pos, r3p.aend,
                          r5p.qname,  # read name
                          0,  # score
                          tx5p.strand, tx3p.strand,  # strand 1, strand 2
                          ]
                fields.append('|'.join(map(str, dr5p.to_list())))
                fields.append('|'.join(map(str, dr3p.to_list())))
                outfh.write('\t'.join(map(str, fields)) + '\n')
        finally:
            outfh.close()
    finally:
        # the BAM handle used to be left open
        bamfh.close()


def sort_bedpe(input_file, output_file, tmp_dir):
    """
    sort the BEDPE file 'input_file' into 'output_file' by the key
    (name 1, name 2, start 1, start 2), using 'tmp_dir' for batch_sort
    temporary files
    """
    # sort BEDPE file by paired chromosome/position
    # NOTE(review): the start fields are compared as strings, so the order
    # is lexicographic ("100" < "20") -- confirm downstream consumers only
    # rely on grouping before switching to numeric keys
    def sortfunc(line):
        fields = line.strip().split('\t')
        return (fields[0], fields[3], fields[1], fields[4])
    batch_sort(input=input_file,
               output=output_file,
               key=sortfunc,
               buffer_size=32000,
               tempdirs=[tmp_dir])


def main():
    """command line entry point; see the usage string for the arguments"""
    from optparse import OptionParser
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = OptionParser("usage: %prog [options] <index> <pairs.bam> <out.bedpe>")
    options, args = parser.parse_args()
    if len(args) < 3:
        parser.error("incorrect number of arguments")
    index_dir = args[0]
    input_bam_file = args[1]
    output_file = args[2]
    return discordant_reads_to_bedpe(index_dir,
                                     input_bam_file,
                                     output_file)


if __name__ == '__main__':
    sys.exit(main())
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pipeline/discordant_reads_to_bedpe.pyc |
b |
Binary file chimerascan/pipeline/discordant_reads_to_bedpe.pyc has changed |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pipeline/fastq_inspect_reads.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pipeline/fastq_inspect_reads.py Thu Sep 07 17:55:18 2017 -0400 |
[ |
'''
Created on Jul 14, 2011

@author: mkiyer

chimerascan: chimeric transcript discovery using RNA-seq

Copyright (C) 2011 Matthew Iyer

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
'''
import logging
import gzip
import bz2
import zipfile
import os

from chimerascan.lib.seq import get_qual_conversion_func
from chimerascan.lib.base import parse_lines
import chimerascan.lib.config as config


def detect_format(f):
    """
    return the compression format ("gz", "bz2", "zip" or "txt") implied
    by the extension of the file name 'f'
    """
    if f.endswith((".gz", ".z")):
        return "gz"
    elif f.endswith(".bz2"):
        return "bz2"
    elif f.endswith(".zip"):
        return "zip"
    else:
        return "txt"


def open_compressed(f):
    """
    open the (possibly compressed) file 'f' for reading and return a
    handle that can be iterated line by line

    raises ValueError for a zip archive that does not contain exactly
    one member, since there is no way to tell which member to read
    """
    compression_format = detect_format(f)
    if compression_format == "gz":
        fh = gzip.open(f, "r")
    elif compression_format == "bz2":
        fh = bz2.BZ2File(f, "r")
    elif compression_format == "zip":
        # a ZipFile object is an archive, not a line iterator; the
        # member inside the archive has to be opened explicitly
        archive = zipfile.ZipFile(f, "r")
        members = archive.namelist()
        if len(members) != 1:
            archive.close()
            raise ValueError("zip archive '%s' must contain exactly one "
                             "file (found %d)" % (f, len(members)))
        fh = archive.open(members[0], "r")
    else:
        fh = open(f, "r")
    return fh


def detect_read_length(filename):
    """
    return the length of the first sequence in the FASTQ file 'filename'
    (the second line of the file, without its line terminator)
    """
    fh = open_compressed(filename)
    try:
        next(fh)
        seq = next(fh)
    finally:
        fh.close()
    # strip the newline so it is not counted as a base
    return len(seq.rstrip())


def get_min_max_read_lengths(fastq_files, num_samples=10000):
    """
    return (min, max) of the sequence lengths found in the first
    'num_samples' records of each file in 'fastq_files'

    raises ValueError when no sequence line is found at all
    """
    read_lengths = []
    for filename in fastq_files:
        f = open_compressed(filename)
        try:
            samples = 0
            for count, line in enumerate(f):
                # the sequence is the second line of each 4-line record
                if count % 4 != 1:
                    continue
                # strip the newline so it is not counted as a base
                read_lengths.append(len(line.rstrip()))
                samples += 1
                if samples >= num_samples:
                    break
        finally:
            f.close()
    return min(read_lengths), max(read_lengths)


def inspect_reads(fastq_files, output_prefix, quals):
    """
    uncompresses reads, renames reads, and converts quality scores
    to 'sanger' format

    one output file named '<output_prefix>_<N>.fq' is written per input
    file; processing stops as soon as any input runs out of records.
    returns config.JOB_SUCCESS, or config.JOB_ERROR (after removing the
    output files) when an unexpected error occurs
    """
    # setup file iterators
    filehandles = [open_compressed(f) for f in fastq_files]
    fqiters = [parse_lines(f, numlines=4) for f in filehandles]
    output_files = [(output_prefix + "_%d.fq" % (x + 1))
                    for x in range(len(fastq_files))]
    outfhs = [open(f, "w") for f in output_files]
    qual_func = get_qual_conversion_func(quals)
    linenum = 0
    completed = False
    try:
        try:
            # with no input files there is nothing to iterate (the loop
            # would otherwise never terminate)
            while fqiters:
                pelines = [next(it) for it in fqiters]
                for i, lines in enumerate(pelines):
                    # rename read using line number
                    lines[0] = "@%d/%d" % (linenum, i + 1)
                    # ignore redundant header
                    lines[2] = "+"
                    # convert quality score to sanger
                    lines[3] = qual_func(lines[3])
                    outfhs[i].write('\n'.join(lines) + '\n')
                linenum += 1
        except StopIteration:
            # normal termination: one of the inputs is exhausted
            pass
        completed = True
    except Exception:
        logging.exception("Unexpected error during FASTQ file processing")
    finally:
        # always release the handles, then discard partial output (this
        # also runs when the user interrupts the job)
        for fh in filehandles + outfhs:
            fh.close()
        if not completed:
            for f in output_files:
                if os.path.exists(f):
                    os.remove(f)
    if not completed:
        return config.JOB_ERROR
    logging.debug("Inspected %d fragments" % (linenum))
    return config.JOB_SUCCESS


def main():
    """command line entry point; returns the status of inspect_reads"""
    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    from optparse import OptionParser
    parser = OptionParser("usage: %prog [options] <outprefix> <in1.fq> <in2.fq>")
    parser.add_option("--quals", dest="quals", choices=["sanger", "solexa", "illumina"],
                      default="sanger")
    options, args = parser.parse_args()
    if len(args) < 2:
        parser.error("must specify output prefix and at least one fastq file")
    output_prefix = args[0]
    fastq_files = args[1:]
    return inspect_reads(fastq_files, output_prefix, options.quals)


if __name__ == '__main__':
    main()
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pipeline/fastq_inspect_reads.pyc |
b |
Binary file chimerascan/pipeline/fastq_inspect_reads.pyc has changed |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pipeline/fastq_merge_trim.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pipeline/fastq_merge_trim.py Thu Sep 07 17:55:18 2017 -0400 |
[ |
'''
Created on May 23, 2011

@author: mkiyer

chimerascan: chimeric transcript discovery using RNA-seq

Copyright (C) 2011 Matthew Iyer

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
'''
import sys


def parse_fastq(line_iter):
    """
    yield FASTQ records from 'line_iter' as lists of 4 right-stripped lines

    an incomplete trailing record is dropped.  when 'line_iter' has a
    close() method (e.g. a file) it is called once iteration ends
    """
    end = object()
    try:
        while True:
            lines = []
            for _ in range(4):
                line = next(line_iter, end)
                if line is end:
                    # returning (instead of letting StopIteration escape
                    # the generator) keeps this valid under PEP 479
                    return
                lines.append(line.rstrip())
            yield lines
    finally:
        close = getattr(line_iter, "close", None)
        if close is not None:
            close()


def trim_and_merge_fastq(infiles, outfile, trim5, segment_length):
    """
    interleave the records of the FASTQ files 'infiles' into 'outfile'
    ("-" means standard output)

    sequences longer than trim5 + segment_length are cut down to the
    'segment_length' bases that follow the first 'trim5' bases (quality
    strings are cut the same way); shorter reads are written unchanged.
    output stops as soon as any input runs out of records
    """
    total_length = trim5 + segment_length
    infhs = []
    outfh = None
    try:
        for f in infiles:
            infhs.append(open(f))
        fqiters = [parse_fastq(fh) for fh in infhs]
        if outfile == "-":
            outfh = sys.stdout
        else:
            outfh = open(outfile, "w")
        # with no input files there is nothing to merge (the loop would
        # otherwise never terminate)
        while fqiters:
            pe_lines = [next(fqiter, None) for fqiter in fqiters]
            if any(lines is None for lines in pe_lines):
                break
            for lines in pe_lines:
                if len(lines[1]) > total_length:
                    lines[1] = lines[1][trim5:total_length]
                    lines[3] = lines[3][trim5:total_length]
                outfh.write('\n'.join(lines) + '\n')
    finally:
        # release every handle even when reading or writing fails
        for fh in infhs:
            fh.close()
        if outfh is not None and outfh is not sys.stdout:
            outfh.close()


def main():
    """command line entry point: the last argument is the output file,
    all preceding arguments are input FASTQ files"""
    from optparse import OptionParser
    parser = OptionParser("usage: %prog [options] <in1.fq> <in2.fq> <out.fq>")
    parser.add_option("--trim5", type="int", dest="trim5", default=0)
    parser.add_option("--segment-length", type="int", dest="segment_length", default=25)
    options, args = parser.parse_args()
    if len(args) < 2:
        parser.error("must specify at least one input fastq file and an "
                     "output file")
    trim_and_merge_fastq(args[:-1], args[-1], options.trim5, options.segment_length)


if __name__ == '__main__':
    main()
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pipeline/filter_chimeras.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pipeline/filter_chimeras.py Thu Sep 07 17:55:18 2017 -0400 |
[ |
b'@@ -0,0 +1,217 @@\n+\'\'\'\n+Created on Jan 31, 2011\n+\n+@author: mkiyer\n+\n+chimerascan: chimeric transcript discovery using RNA-seq\n+\n+Copyright (C) 2011 Matthew Iyer\n+\n+This program is free software: you can redistribute it and/or modify\n+it under the terms of the GNU General Public License as published by\n+the Free Software Foundation, either version 3 of the License, or\n+(at your option) any later version.\n+\n+This program is distributed in the hope that it will be useful,\n+but WITHOUT ANY WARRANTY; without even the implied warranty of\n+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n+GNU General Public License for more details.\n+\n+You should have received a copy of the GNU General Public License\n+along with this program. If not, see <http://www.gnu.org/licenses/>.\n+\'\'\'\n+import logging\n+import collections\n+import os\n+\n+from chimerascan import pysam\n+from chimerascan.lib.gene_to_genome import build_transcript_genome_map, \\\n+ transcript_to_genome_pos, build_transcript_cluster_map\n+from chimerascan.lib.chimera import Chimera\n+from chimerascan.lib import config\n+\n+def filter_unique_frags(c, threshold):\n+ """\n+ filters chimeras with less than \'threshold\' unique\n+ alignment positions supporting the chimera \n+ """\n+ return c.get_num_unique_positions() >= threshold\n+\n+def get_wildtype_frags_5p(rname, start, end, bamfh):\n+ num_wildtype_frags = len(set(r.qname for r in bamfh.fetch(rname, start, end)\n+ if (not r.mate_is_unmapped) and (r.mpos >= end)))\n+ return num_wildtype_frags\n+\n+def get_wildtype_frags_3p(rname, start, end, bamfh):\n+ num_wildtype_frags = len(set(r.qname for r in bamfh.fetch(rname, start, end)\n+ if (not r.mate_is_unmapped) and (r.mpos < start)))\n+ return num_wildtype_frags\n+\n+def get_wildtype_frags(c, bamfh):\n+ rname5p = config.GENE_REF_PREFIX + c.tx_name_5p\n+ rname3p = config.GENE_REF_PREFIX + c.tx_name_3p\n+ num_wt_frags_5p = get_wildtype_frags_5p(rname5p, c.tx_start_5p, c.tx_end_5p, 
bamfh)\n+ num_wt_frags_3p = get_wildtype_frags_3p(rname3p, c.tx_start_3p, c.tx_end_3p, bamfh)\n+ return num_wt_frags_5p, num_wt_frags_3p\n+\n+def filter_chimeric_isoform_fraction(c, frac, bamfh):\n+ """\n+ filters chimeras with fewer than \'threshold\' total\n+ unique read alignments\n+ """\n+ num_wt_frags_5p, num_wt_frags_3p = get_wildtype_frags(c, bamfh)\n+ num_chimeric_frags = c.get_num_frags()\n+ ratio5p = float(num_chimeric_frags) / (num_chimeric_frags + num_wt_frags_5p)\n+ ratio3p = float(num_chimeric_frags) / (num_chimeric_frags + num_wt_frags_3p)\n+ #print c.gene_name_5p, c.gene_name_3p, "chimeras", num_chimeric_frags, "wt5p", num_wt_frags_5p, "wt3p", num_wt_frags_3p, "r5p", ratio5p, "r3p", ratio3p\n+ return min(ratio5p, ratio3p) >= frac\n+\n+def read_false_pos_file(filename):\n+ false_pos_chimeras = set()\n+ for line in open(filename):\n+ fields = line.strip().split("\\t")\n+ tx_name_5p, end5p, tx_name_3p, start3p = fields\n+ end5p = int(end5p)\n+ start3p = int(start3p)\n+ false_pos_chimeras.add((tx_name_5p, end5p, tx_name_3p, start3p))\n+ return false_pos_chimeras\n+\n+def filter_encompassing_chimeras(input_file, output_file, min_frags):\n+ num_chimeras = 0\n+ num_filtered_chimeras = 0\n+ f = open(output_file, "w") \n+ for c in Chimera.parse(open(input_file)):\n+ num_chimeras += 1\n+ if c.get_num_frags() < min_frags:\n+ continue\n+ num_filtered_chimeras += 1\n+ print >>f, \'\\t\'.join(map(str, c.to_list()))\n+ f.close()\n+ logging.debug("\\tchimeras: %d" % (num_chimeras))\n+ logging.debug("\\tfiltered chimeras: %d" % (num_filtered_chimeras))\n+ return config.JOB_SUCCESS\n+\n+def filter_chimeras(input_file, output_file,\n+ index_dir, bam_file,\n+ unique_frags,\n+ isoform_fraction,\n+ false_pos_file):\n+ logging.debug("Parameters")\n+ logging.debug("\\'..b'coverage_isoforms(input_file, gene_file):\n+ # place overlapping chimeras into clusters\n+ logging.debug("Building isoform cluster lookup table")\n+ transcript_cluster_map = 
build_transcript_cluster_map(open(gene_file))\n+ # build a lookup table to get genome coordinates from transcript \n+ # coordinates\n+ transcript_genome_map = build_transcript_genome_map(open(gene_file))\n+ cluster_chimera_dict = collections.defaultdict(lambda: [])\n+ for c in Chimera.parse(open(input_file)):\n+ # TODO: adjust this to score chimeras differently!\n+ key = (c.name, c.get_num_frags())\n+ # get cluster of overlapping genes\n+ cluster5p = transcript_cluster_map[c.tx_name_5p]\n+ cluster3p = transcript_cluster_map[c.tx_name_3p]\n+ # get genomic positions of breakpoints\n+ coord5p = transcript_to_genome_pos(c.tx_name_5p, c.tx_end_5p-1, transcript_genome_map)\n+ coord3p = transcript_to_genome_pos(c.tx_name_3p, c.tx_start_3p, transcript_genome_map)\n+ # add to dictionary\n+ cluster_chimera_dict[(cluster5p,cluster3p,coord5p,coord3p)].append(key) \n+ # choose highest coverage chimeras within each pair of clusters\n+ logging.debug("Finding highest coverage isoforms")\n+ kept_chimeras = set()\n+ for stats_list in cluster_chimera_dict.itervalues():\n+ stats_dict = collections.defaultdict(lambda: set())\n+ for stats_info in stats_list:\n+ # index chimera names\n+ stats_dict[stats_info[1:]].add(stats_info[0])\n+ # find highest scoring key\n+ sorted_keys = sorted(stats_dict.keys(), reverse=True)\n+ kept_chimeras.update(stats_dict[sorted_keys[0]])\n+ return kept_chimeras\n+\n+def filter_highest_coverage_isoforms(index_dir, input_file, output_file):\n+ # find highest coverage chimeras among isoforms\n+ gene_file = os.path.join(index_dir, config.GENE_FEATURE_FILE)\n+ kept_chimeras = get_highest_coverage_isoforms(input_file, gene_file)\n+ num_filtered_chimeras = 0\n+ f = open(output_file, "w")\n+ for c in Chimera.parse(open(input_file)):\n+ if c.name in kept_chimeras:\n+ num_filtered_chimeras += 1\n+ print >>f, \'\\t\'.join(map(str, c.to_list()))\n+ f.close()\n+ logging.debug("\\tAfter choosing best isoform: %d" % \n+ num_filtered_chimeras)\n+ return 
config.JOB_SUCCESS\n+\n+\n+\n+def main():\n+ from optparse import OptionParser\n+ logging.basicConfig(level=logging.DEBUG,\n+ format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")\n+ parser = OptionParser("usage: %prog [options] <index_dir> "\n+ "<sorted_aligned_reads.bam> <in.txt> <out.txt>")\n+ parser.add_option("--unique-frags", type="float", default=2.0,\n+ dest="unique_frags", metavar="N",\n+ help="Filter chimeras with less than N unique "\n+ "aligned fragments [default=%default]")\n+ parser.add_option("--isoform-fraction", type="float", \n+ default=0.10, metavar="X",\n+ help="Filter chimeras with expression ratio "\n+ " less than X (0.0-1.0) relative to the wild-type "\n+ "5\' transcript level [default=%default]")\n+ parser.add_option("--false-pos", dest="false_pos_file",\n+ default=None, \n+ help="File containing known false positive "\n+ "transcript pairs to subtract from output")\n+ options, args = parser.parse_args()\n+ index_dir = args[0]\n+ bam_file = args[1]\n+ input_file = args[2]\n+ output_file = args[3]\n+ return filter_chimeras(input_file, output_file, index_dir, bam_file,\n+ unique_frags=options.unique_frags,\n+ isoform_fraction=options.isoform_fraction,\n+ false_pos_file=options.false_pos_file)\n+\n+if __name__ == "__main__":\n+ main()\n\\ No newline at end of file\n' |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pipeline/filter_chimeras.pyc |
b |
Binary file chimerascan/pipeline/filter_chimeras.pyc has changed |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pipeline/filter_homologous_genes.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pipeline/filter_homologous_genes.py Thu Sep 07 17:55:18 2017 -0400 |
[ |
'''
Created on Aug 1, 2011

@author: mkiyer
'''
import logging
import os
import collections
import subprocess

from chimerascan import pysam
from chimerascan.lib import config
from chimerascan.lib.chimera import Chimera
from chimerascan.bx.intersection import IntervalTree, Interval


def get_mapped_read_intervals(c, min_isize, max_isize, homolog_segment_length):
    """
    return (start5p, end5p, start3p, end3p): the transcript intervals
    around the breakpoint of chimera 'c' that are searched for homology

    coordinates are clamped at zero; an interval whose start would exceed
    its end is replaced by one of exactly 'homolog_segment_length' bases
    beginning at the start
    """
    start5p = max(0, c.tx_end_5p - min_isize + homolog_segment_length)
    end5p = max(0, c.tx_end_5p + max_isize - homolog_segment_length)
    if start5p > end5p:
        end5p = start5p + homolog_segment_length
    start3p = max(0, c.tx_start_3p - max_isize + homolog_segment_length)
    end3p = max(0, c.tx_start_3p + min_isize - homolog_segment_length)
    if start3p > end3p:
        end3p = start3p + homolog_segment_length
    return start5p, end5p, start3p, end3p


def _write_homolog_segments(input_file, fasta_file, ref_fa,
                            homolog_segment_length, min_isize, max_isize):
    """
    write every 'homolog_segment_length'-sized window of each chimera's 5'
    interval to 'fasta_file' and return a dict mapping each 3' reference
    name to an IntervalTree of the chimeras' 3' intervals
    """
    interval_trees_3p = collections.defaultdict(lambda: IntervalTree())
    chimerafh = open(input_file)
    try:
        f = open(fasta_file, "w")
        try:
            for c in Chimera.parse(chimerafh):
                tx_name_5p = config.GENE_REF_PREFIX + c.tx_name_5p
                tx_name_3p = config.GENE_REF_PREFIX + c.tx_name_3p
                start5p, end5p, start3p, end3p = \
                    get_mapped_read_intervals(c, min_isize, max_isize,
                                              homolog_segment_length)
                # add 3' gene to interval trees
                interval_trees_3p[tx_name_3p].insert_interval(
                    Interval(start3p, end3p, value=c.name))
                # extract sequence of 5' gene
                seq5p = ref_fa.fetch(tx_name_5p, start5p, end5p)
                # "+ 1" includes the last full-length window; without it
                # an interval of exactly one segment produced no sequence
                for i in range(len(seq5p) - homolog_segment_length + 1):
                    f.write(">%s,%s:%d-%d\n%s\n" %
                            (c.name, c.tx_name_5p,
                             start5p + i,
                             start5p + i + homolog_segment_length,
                             seq5p[i:i + homolog_segment_length]))
        finally:
            f.close()
    finally:
        chimerafh.close()
    return interval_trees_3p


def _find_homologous_chimeras(sam_file, interval_trees_3p):
    """
    return the set of chimera names whose 5' segments aligned (according
    to 'sam_file') inside their own 3' interval
    """
    homologous_chimeras = set()
    samfh = pysam.Samfile(sam_file, "r")
    try:
        tid_rname_map = list(samfh.references)
        for r in samfh:
            if r.is_unmapped:
                continue
            # reference name must be in list of 3' chimeras
            rname = tid_rname_map[r.rname]
            if rname not in interval_trees_3p:
                continue
            # get chimera name from 'qname'
            chimera_name = r.qname.split(",")[0]
            for hit in interval_trees_3p[rname].find(r.pos, r.aend):
                if hit.value == chimera_name:
                    homologous_chimeras.add(chimera_name)
    finally:
        samfh.close()
    return homologous_chimeras


def filter_homologous_genes(input_file, output_file, index_dir,
                            homolog_segment_length,
                            min_isize,
                            max_isize,
                            bowtie_bin,
                            num_processors,
                            tmp_dir):
    """
    copy the chimeras in 'input_file' to 'output_file', leaving out those
    whose 5' partner sequence aligns with bowtie to their 3' partner near
    the breakpoint

    returns config.JOB_SUCCESS, or config.JOB_ERROR when bowtie exits with
    a non-zero status.  the temporary FASTA and SAM files created in
    'tmp_dir' are removed in either case
    """
    logging.debug("Parameters")
    logging.debug("\thomolog segment length: %d" % (homolog_segment_length))
    logging.debug("\tmin fragment size: %d" % (min_isize))
    logging.debug("\tmax fragment size: %d" % (max_isize))

    # open the reference sequence fasta file
    ref_fasta_file = os.path.join(index_dir, config.ALIGN_INDEX + ".fa")
    ref_fa = pysam.Fastafile(ref_fasta_file)
    bowtie_index = os.path.join(index_dir, config.ALIGN_INDEX)
    fasta5p = os.path.join(tmp_dir, "homologous_5p.fa")
    sam5p = os.path.join(tmp_dir, "homologous_5p.sam")
    try:
        # generate FASTA file of sequences to use in mapping
        logging.debug("Generating homologous sequences to test")
        interval_trees_3p = _write_homolog_segments(input_file, fasta5p,
                                                    ref_fa,
                                                    homolog_segment_length,
                                                    min_isize, max_isize)

        # map 5' sequences to reference using bowtie
        logging.debug("Mapping homologous sequences")
        args = [bowtie_bin, "-p", num_processors, "-f", "-a", "-m", 100,
                "-y", "-v", 3, "-S",
                bowtie_index, fasta5p, sam5p]
        retcode = subprocess.call([str(a) for a in args])
        if retcode != 0:
            return config.JOB_ERROR

        # analyze results for homologous genes
        logging.debug("Analyzing mapping results")
        homologous_chimeras = _find_homologous_chimeras(sam5p,
                                                        interval_trees_3p)

        # write output
        logging.debug("Writing output")
        chimerafh = open(input_file)
        try:
            f = open(output_file, "w")
            try:
                for c in Chimera.parse(chimerafh):
                    if c.name in homologous_chimeras:
                        logging.debug("Removing homologous chimera %s between %s and %s" %
                                      (c.name, c.gene_name_5p, c.gene_name_3p))
                        continue
                    f.write('\t'.join(map(str, c.to_list())) + '\n')
            finally:
                f.close()
        finally:
            chimerafh.close()
    finally:
        # cleanup, also when bowtie or the parsing fails
        for path in (fasta5p, sam5p):
            if os.path.exists(path):
                os.remove(path)
    return config.JOB_SUCCESS


def main():
    """command line entry point; see the usage string for the arguments"""
    from optparse import OptionParser
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = OptionParser("usage: %prog [options] <index_dir> "
                          "<in.txt> <out.txt>")
    parser.add_option("--homolog-segment-length", dest="homolog_segment_length",
                      type="int", default=25,
                      help="Segment length to consider when searching for "
                      "homologous regions [default=%default]")
    parser.add_option('--min-fragment-length', dest="min_fragment_length",
                      type="int", default=100)
    parser.add_option('--max-fragment-length', dest="max_fragment_length",
                      type="int", default=300)
    parser.add_option("--bowtie-bin", dest="bowtie_bin",
                      default="bowtie",
                      help="Path to bowtie binary [default: %default]")
    parser.add_option("-p", type="int", dest="num_processors", default=1,
                      help="Number of processors to use [default: %default]")
    parser.add_option("--tmp-dir", dest="tmp_dir",
                      default=".",
                      help="Temporary directory [default=%default]")
    options, args = parser.parse_args()
    if len(args) < 3:
        parser.error("incorrect number of arguments")
    index_dir = args[0]
    input_file = args[1]
    output_file = args[2]
    return filter_homologous_genes(input_file, output_file, index_dir,
                                   homolog_segment_length=options.homolog_segment_length,
                                   min_isize=options.min_fragment_length,
                                   max_isize=options.max_fragment_length,
                                   bowtie_bin=options.bowtie_bin,
                                   num_processors=options.num_processors,
                                   tmp_dir=options.tmp_dir)


if __name__ == "__main__":
    main()
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pipeline/filter_homologous_genes.pyc |
b |
Binary file chimerascan/pipeline/filter_homologous_genes.pyc has changed |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pipeline/find_discordant_reads.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pipeline/find_discordant_reads.py Thu Sep 07 17:55:18 2017 -0400 |
[ |
b'@@ -0,0 +1,423 @@\n+\'\'\'\n+Created on Jun 2, 2011\n+\n+@author: mkiyer\n+\'\'\'\n+import logging\n+import collections\n+import os\n+\n+from chimerascan import pysam\n+from chimerascan.bx.cluster import ClusterTree\n+\n+from chimerascan.lib import config\n+from chimerascan.lib.base import LibraryTypes\n+from chimerascan.lib.sam import parse_pe_reads, pair_reads, copy_read, select_best_mismatch_strata\n+from chimerascan.lib.gene_to_genome import build_transcript_tid_genome_map, \\\n+ build_transcript_tid_cluster_map, transcript_to_genome_pos\n+from chimerascan.lib.chimera import DiscordantTags, DISCORDANT_TAG_NAME, \\\n+ OrientationTags, ORIENTATION_TAG_NAME, cmp_orientation\n+\n+# globals\n+imin2 = lambda a,b: a if a <= b else b\n+\n+def annotate_multihits(bamfh, reads, transcript_tid_genome_map):\n+ hits = set()\n+ any_unmapped = False\n+ for r in reads:\n+ if r.is_unmapped:\n+ any_unmapped = True\n+ continue\n+ if r.rname not in transcript_tid_genome_map:\n+ tid = r.rname\n+ pos = r.pos\n+ else:\n+ # use the position that is most 5\' relative to genome\n+ left_tid, left_strand, left_pos = transcript_to_genome_pos(r.rname, r.pos, transcript_tid_genome_map)\n+ right_tid, right_strand, right_pos = transcript_to_genome_pos(r.rname, r.aend-1, transcript_tid_genome_map)\n+ tid = left_tid\n+ pos = imin2(left_pos, right_pos)\n+ hits.add((tid, pos))\n+ #print r.qname, bamfh.getrname(r.rname), r.pos, bamfh.getrname(tid), pos \n+ for i,r in enumerate(reads):\n+ # annotate reads with \'HI\', and \'IH\' tags\n+ r.tags = r.tags + [("HI",i), ("IH",len(reads)), ("NH", len(hits))]\n+ return any_unmapped\n+\n+def map_reads_to_references(pe_reads, transcript_tid_cluster_map):\n+ """\n+ bin reads by transcript cluster and reference (tid)\n+ """\n+ refdict = collections.defaultdict(lambda: ([], []))\n+ genedict = collections.defaultdict(lambda: ([], []))\n+ for readnum, reads in enumerate(pe_reads):\n+ for r in reads:\n+ if r.is_unmapped:\n+ continue \n+ # get cluster id\n+ if 
r.rname in transcript_tid_cluster_map:\n+ # add to cluster dict\n+ cluster_id = transcript_tid_cluster_map[r.rname]\n+ pairs = genedict[cluster_id]\n+ pairs[readnum].append(r)\n+ # add to reference dict\n+ pairs = refdict[r.rname]\n+ pairs[readnum].append(r)\n+ return refdict, genedict\n+\n+def get_genome_orientation(r, library_type):\n+ if library_type == LibraryTypes.FR_FIRSTSTRAND:\n+ if r.is_read2:\n+ return OrientationTags.FIVEPRIME\n+ else:\n+ return OrientationTags.THREEPRIME\n+ elif library_type == LibraryTypes.FR_SECONDSTRAND:\n+ if r.is_read1:\n+ return OrientationTags.FIVEPRIME\n+ else:\n+ return OrientationTags.THREEPRIME\n+ return OrientationTags.NONE\n+\n+def get_gene_orientation(r, library_type):\n+ if library_type == LibraryTypes.FR_UNSTRANDED:\n+ if r.is_reverse:\n+ return OrientationTags.THREEPRIME\n+ else:\n+ return OrientationTags.FIVEPRIME\n+ elif library_type == LibraryTypes.FR_FIRSTSTRAND:\n+ if r.is_read2:\n+ return OrientationTags.FIVEPRIME\n+ else:\n+ return OrientationTags.THREEPRIME\n+ elif library_type == LibraryTypes.FR_SECONDSTRAND:\n+ if r.is_read1:\n+ return OrientationTags.FIVEPRIME\n+ else:\n+ return OrientationTags.THREEPRIME\n+ logging.error("Unknown library type %s, aborting" % (library_type))\n+ assert False\n+\n+def classify_unpaired_reads(reads, transcript_tid_genome_map, library_type):\n+ gene_hits_5p = []\n+ gene_hits_3p = []\n+ genome_hits = []\n+ for r in reads:\n+ # check to see if this alignment is to a gene, or genomic\n+ if (r.rname not in transcript_tid_genome_map):\n+ #'..b'nput_bam_file))\n+ logging.debug("\\tMax insert size: \'%d\'" % (max_isize))\n+ logging.debug("\\tLibrary type: \'%s\'" % (library_type))\n+ logging.debug("\\tGene paired file: %s" % (gene_paired_bam_file))\n+ logging.debug("\\tGenome paired file: %s" % (genome_paired_bam_file))\n+ logging.debug("\\tUnmapped file: %s" % (unmapped_bam_file))\n+ logging.debug("\\tComplex file: %s" % (complex_bam_file))\n+ # setup input and output files\n+ bamfh = 
pysam.Samfile(input_bam_file, "rb")\n+ genefh = pysam.Samfile(gene_paired_bam_file, "wb", template=bamfh)\n+ genomefh = pysam.Samfile(genome_paired_bam_file, "wb", template=bamfh)\n+ unmappedfh = pysam.Samfile(unmapped_bam_file, "wb", template=bamfh)\n+ complexfh = pysam.Samfile(complex_bam_file, "wb", template=bamfh)\n+ gene_file = os.path.join(index_dir, config.GENE_FEATURE_FILE)\n+ # build a lookup table to get all the overlapping transcripts given a\n+ # transcript \'tid\'\n+ transcript_tid_cluster_map = \\\n+ build_transcript_tid_cluster_map(bamfh, open(gene_file), \n+ rname_prefix=config.GENE_REF_PREFIX)\n+ # build a lookup table to get genome coordinates from transcript \n+ # coordinates\n+ transcript_tid_genome_map = \\\n+ build_transcript_tid_genome_map(bamfh, open(gene_file), \n+ rname_prefix=config.GENE_REF_PREFIX)\n+ for pe_reads in parse_pe_reads(bamfh):\n+ # add hit index and number of multimaps information to read tags\n+ # this function also checks for unmapped reads\n+ any_unmapped = False\n+ for reads in pe_reads:\n+ any_unmapped = (any_unmapped or \n+ annotate_multihits(bamfh, reads, transcript_tid_genome_map))\n+ if any_unmapped:\n+ # write to output as discordant reads and continue to \n+ # next fragment\n+ write_pe_reads(unmappedfh, pe_reads)\n+ continue\n+ # examine all read pairing combinations and rule out invalid \n+ # pairings. 
this returns gene pairs and genome pairs\n+ gene_pairs, genome_pairs, unpaired_reads = \\\n+ classify_read_pairs(pe_reads, max_isize,\n+ library_type, transcript_tid_genome_map,\n+ transcript_tid_cluster_map)\n+ if len(gene_pairs) > 0 or len(genome_pairs) > 0:\n+ write_pairs(genefh, gene_pairs)\n+ write_pairs(genomefh, genome_pairs)\n+ else:\n+ write_pe_reads(complexfh, unpaired_reads)\n+ genefh.close()\n+ genomefh.close()\n+ unmappedfh.close()\n+ complexfh.close()\n+ bamfh.close() \n+ logging.info("Finished pairing reads")\n+\n+\n+def main():\n+ from optparse import OptionParser\n+ logging.basicConfig(level=logging.DEBUG,\n+ format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")\n+ parser = OptionParser("usage: %prog [options] <index> <in.bam> "\n+ "<gene_paired.bam> <genome_paired.bam> "\n+ "<unmapped.bam> <complex.bam>")\n+ parser.add_option(\'--max-fragment-length\', dest="max_fragment_length", \n+ type="int", default=1000)\n+ parser.add_option(\'--library\', dest="library_type", \n+ default=LibraryTypes.FR_UNSTRANDED)\n+ options, args = parser.parse_args() \n+ index_dir = args[0]\n+ input_bam_file = args[1]\n+ gene_paired_bam_file = args[2]\n+ genome_paired_bam_file = args[3]\n+ unmapped_bam_file = args[4]\n+ complex_bam_file = args[5]\n+ find_discordant_fragments(input_bam_file, gene_paired_bam_file,\n+ genome_paired_bam_file, unmapped_bam_file, \n+ complex_bam_file, index_dir,\n+ max_isize=options.max_fragment_length,\n+ library_type=options.library_type)\n+\n+if __name__ == \'__main__\':\n+ main()\n\\ No newline at end of file\n' |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pipeline/find_discordant_reads.pyc |
b |
Binary file chimerascan/pipeline/find_discordant_reads.pyc has changed |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pipeline/merge_spanning_alignments.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pipeline/merge_spanning_alignments.py Thu Sep 07 17:55:18 2017 -0400 |
[ |
b'@@ -0,0 +1,266 @@\n+\'\'\'\n+Created on Nov 7, 2010\n+\n+@author: mkiyer\n+\n+chimerascan: chimeric transcript discovery using RNA-seq\n+\n+Copyright (C) 2011 Matthew Iyer\n+\n+This program is free software: you can redistribute it and/or modify\n+it under the terms of the GNU General Public License as published by\n+the Free Software Foundation, either version 3 of the License, or\n+(at your option) any later version.\n+\n+This program is distributed in the hope that it will be useful,\n+but WITHOUT ANY WARRANTY; without even the implied warranty of\n+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n+GNU General Public License for more details.\n+\n+You should have received a copy of the GNU General Public License\n+along with this program. If not, see <http://www.gnu.org/licenses/>.\n+\'\'\'\n+import logging\n+import collections\n+import shutil\n+import os\n+\n+# local imports\n+from chimerascan import pysam\n+from chimerascan.lib.chimera import Chimera, DiscordantRead, \\\n+ DiscordantTags, DISCORDANT_TAG_NAME, \\\n+ OrientationTags, ORIENTATION_TAG_NAME\n+from chimerascan.lib.base import LibraryTypes\n+\n+from chimerascan.pipeline.find_discordant_reads import get_gene_orientation\n+\n+def parse_group_by_attr(myiter, attr):\n+ mylist = []\n+ prev = None\n+ for itm in myiter:\n+ cur = getattr(itm, attr)\n+ if prev != cur:\n+ if len(mylist) > 0:\n+ yield prev, mylist\n+ mylist = []\n+ prev = cur\n+ mylist.append(itm)\n+ if len(mylist) > 0:\n+ yield prev, mylist\n+\n+def parse_sync_by_breakpoint(chimera_file, bam_file):\n+ # group reads by reference name (matches breakpoint name)\n+ bamfh = pysam.Samfile(bam_file, "rb")\n+ tid_rname_map = list(bamfh.references)\n+ # initialize iterator through reads\n+ read_iter = parse_group_by_attr(bamfh, "rname")\n+ read_iter_valid = True\n+ try:\n+ rname, reads = read_iter.next()\n+ read_breakpoint_name = tid_rname_map[rname]\n+ except StopIteration:\n+ bamfh.close()\n+ read_iter_valid = False\n+ reads = []\n+ 
read_breakpoint_name = "ZZZZZZZZZZZZZZ"\n+ # group chimeras by breakpoint name\n+ for chimera_breakpoint_name, chimeras in \\\n+ parse_group_by_attr(Chimera.parse(open(chimera_file)), \n+ "breakpoint_name"):\n+ while (read_iter_valid) and (chimera_breakpoint_name > read_breakpoint_name):\n+ try:\n+ rname, reads = read_iter.next()\n+ read_breakpoint_name = tid_rname_map[rname]\n+ except StopIteration:\n+ read_iter_valid = False\n+ reads = []\n+ if chimera_breakpoint_name < read_breakpoint_name:\n+ yield chimeras, []\n+ else:\n+ yield chimeras, reads \n+ bamfh.close()\n+\n+def get_mismatch_positions(md):\n+ x = 0\n+ pos = []\n+ for y in xrange(len(md)):\n+ if md[y].isalpha():\n+ offset = int(md[x:y])\n+ pos.append(offset)\n+ x = y + 1\n+ return pos\n+\n+def check_breakpoint_alignment(c, r,\n+ anchor_min,\n+ anchor_length,\n+ anchor_mismatches):\n+ """\n+ returns True if read \'r\' meets criteria for a valid\n+ breakpoint spanning read, False otherwise\n+ \n+ c - Chimera object\n+ r - pysam AlignedRead object\n+ """\n+ # get position of breakpoint along seq\n+ breakpoint_pos = len(c.breakpoint_seq_5p)\n+ # check if read spans breakpoint \n+ if not (r.pos < breakpoint_pos < r.aend):\n+ return False \n+ # calculate amount in bp that read overlaps breakpoint\n+ # and ensure overlap is sufficient\n+ left_anchor_bp = breakpoint_pos - r.pos\n+ if left_anchor_bp < max(c.homology_left, anchor_min):\n+ return False\n+ right_anchor_bp = r.aend - breakpoint_pos\n+ if right_anchor_bp < max(c.homology_right, anchor_min):\n+ return False\n+ # ensure that alignment'..b' for dpair in c.encomp_frags:\n+ chimera_qname_dict[c.name][dpair[0].qname] = dpair \n+ # find valid spanning reads\n+ for c, dr in filter_spanning_reads(chimeras, reads, \n+ anchor_min, anchor_length, \n+ anchor_mismatches, library_type):\n+ # ensure encompassing read is present\n+ if dr.qname not in chimera_qname_dict[c.name]:\n+ continue\n+ # get discordant pair\n+ dpair = chimera_qname_dict[c.name][dr.qname]\n+ # 
mark correct read (read1/read2) as a spanning read\n+ if dr.readnum == dpair[0].readnum:\n+ dpair[0].is_spanning = True\n+ elif dr.readnum == dpair[1].readnum:\n+ dpair[1].is_spanning = True\n+ else:\n+ assert False\n+ filtered_hits += 1\n+ # write chimeras back to file\n+ for c in chimeras:\n+ fields = c.to_list()\n+ print >>f, \'\\t\'.join(map(str, fields)) \n+ f.close()\n+ logging.debug("\\tFound %d hits" % (filtered_hits))\n+ #\n+ # Process reads that are single-mapped and spanning\n+ #\n+ logging.debug("Processing single-mapping/spanning reads")\n+ tmp_singlemap_chimera_file = os.path.join(tmp_dir, "tmp_singlemap_chimeras.bedpe")\n+ f = open(tmp_singlemap_chimera_file, "w")\n+ filtered_hits = 0\n+ for chimeras, reads in parse_sync_by_breakpoint(tmp_encomp_chimera_file, singlemap_bam_file):\n+ # find valid spanning reads\n+ for c, dr in filter_spanning_reads(chimeras, reads, \n+ anchor_min, anchor_length, \n+ anchor_mismatches, library_type):\n+ # ensure mate maps to 5\' or 3\' gene\n+ # TODO: implement this using sorted/indexed BAM file?\n+ # add read as a spanning read\n+ c.spanning_reads.append(dr)\n+ filtered_hits += 1 \n+ # write chimeras back to file\n+ for c in chimeras:\n+ fields = c.to_list()\n+ print >>f, \'\\t\'.join(map(str, fields)) \n+ f.close()\n+ logging.debug("\\tFound %d hits" % (filtered_hits))\n+ # output_chimera_file \n+ shutil.copyfile(tmp_singlemap_chimera_file, output_chimera_file)\n+ # remove temporary files\n+ if os.path.exists(tmp_encomp_chimera_file):\n+ os.remove(tmp_encomp_chimera_file)\n+ if os.path.exists(tmp_singlemap_chimera_file):\n+ os.remove(tmp_singlemap_chimera_file)\n+ \n+\n+def main():\n+ from optparse import OptionParser\n+ logging.basicConfig(level=logging.DEBUG,\n+ format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") \n+ parser = OptionParser("usage: %prog [options] <chimeras.breakpoint_sorted.txt> "\n+ "<encomp.bam> <onemap.bam> <chimeras.out.txt>")\n+ parser.add_option("--anchor-min", type="int", 
dest="anchor_min", default=4)\n+ parser.add_option("--anchor-length", type="int", dest="anchor_length", default=8)\n+ parser.add_option("--anchor-mismatches", type="int", dest="anchor_mismatches", default=0)\n+ parser.add_option(\'--library\', dest="library_type", \n+ default=LibraryTypes.FR_UNSTRANDED)\n+ options, args = parser.parse_args()\n+ breakpoint_chimera_file = args[0]\n+ encomp_bam_file = args[1]\n+ singlemap_bam_file = args[2]\n+ output_chimera_file = args[4]\n+ merge_spanning_alignments(breakpoint_chimera_file,\n+ encomp_bam_file,\n+ singlemap_bam_file,\n+ output_chimera_file,\n+ options.anchor_min, \n+ options.anchor_length,\n+ options.anchor_mismatches,\n+ options.library_type)\n+\n+if __name__ == \'__main__\':\n+ main()\n' |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pipeline/merge_spanning_alignments.pyc |
b |
Binary file chimerascan/pipeline/merge_spanning_alignments.pyc has changed |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pipeline/nominate_chimeras.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pipeline/nominate_chimeras.py Thu Sep 07 17:55:18 2017 -0400 |
[ |
b'@@ -0,0 +1,275 @@\n+\'\'\'\n+Created on Jul 21, 2011\n+\n+@author: mkiyer\n+\n+Copyright (C) 2011 Matthew Iyer\n+\n+This program is free software: you can redistribute it and/or modify\n+it under the terms of the GNU General Public License as published by\n+the Free Software Foundation, either version 3 of the License, or\n+(at your option) any later version.\n+\n+This program is distributed in the hope that it will be useful,\n+but WITHOUT ANY WARRANTY; without even the implied warranty of\n+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n+GNU General Public License for more details.\n+\n+You should have received a copy of the GNU General Public License\n+along with this program. If not, see <http://www.gnu.org/licenses/>.\n+\'\'\'\n+import logging\n+import os\n+import sys\n+import collections\n+import itertools\n+import operator\n+\n+from chimerascan import pysam\n+\n+from chimerascan.lib import config\n+from chimerascan.lib.chimera import DiscordantRead, Chimera, frags_to_encomp_string\n+from chimerascan.lib.gene_to_genome import build_tx_name_gene_map, build_genome_tx_trees\n+from chimerascan.lib.fragment_size_distribution import InsertSizeDistribution\n+from chimerascan.lib.seq import calc_homology\n+\n+def parse_discordant_bedpe_by_transcript_pair(fh):\n+ prev_tx5p, prev_tx3p = None,None\n+ frags = []\n+ for line in fh:\n+ fields = line.strip().split(\'\\t\') \n+ tx5p = fields[0]\n+ tx3p = fields[3]\n+ dr5p = DiscordantRead.from_list(fields[10].split("|"))\n+ dr3p = DiscordantRead.from_list(fields[11].split("|"))\n+ if (tx5p, tx3p) != (prev_tx5p, prev_tx3p):\n+ if len(frags) > 0:\n+ yield prev_tx5p, prev_tx3p, frags\n+ frags = []\n+ prev_tx5p, prev_tx3p = tx5p, tx3p\n+ frags.append((dr5p, dr3p))\n+ if len(frags) > 0:\n+ yield tx5p, tx3p, frags \n+\n+def calc_isize_prob(isize, isize_dist):\n+ # find percentile of observing this insert size in the reads\n+ isize_per = isize_dist.percentile_at_isize(isize)\n+ # convert to a probability score 
(0.0-1.0)\n+ isize_prob = 1.0 - (2.0 * abs(50.0 - isize_per))/100.0 \n+ return isize_prob\n+\n+def choose_best_breakpoints(r5p, r3p, tx5p, tx3p, trim_bp, isize_dist):\n+ best_breakpoints = set()\n+ best_isize_prob = None\n+ # iterate through 5\' transcript exons \n+ exon_iter_5p = reversed(tx5p.exons) if tx5p.strand == \'-\' else iter(tx5p.exons)\n+ tx_end_5p = 0\n+ for exon_num_5p,coords5p in enumerate(exon_iter_5p):\n+ genome_start_5p, genome_end_5p = coords5p \n+ exon_size_5p = genome_end_5p - genome_start_5p\n+ tx_end_5p += exon_size_5p\n+ # fast forward on 5\' gene to first exon beyond read \n+ if tx_end_5p < (r5p.aend - trim_bp):\n+ continue \n+ #print "tx end 5p", tx_end_5p, "exon_size_5p", exon_size_5p, "r5p.aend", r5p.aend, "trim_bp", trim_bp\n+ # now have a candidate insert size between between 5\' read and\n+ # end of 5\' exon\n+ isize5p = tx_end_5p - r5p.pos\n+ # iterate through 3\' transcript\n+ exon_iter_3p = reversed(tx3p.exons) if tx3p.strand == \'-\' else iter(tx3p.exons)\n+ tx_start_3p = 0\n+ local_best_breakpoints = set()\n+ local_best_isize_prob = None\n+ for exon_num_3p,coords3p in enumerate(exon_iter_3p):\n+ genome_start_3p, genome_end_3p = coords3p\n+ #print "\\t", coords3p \n+ # stop after going past read on 3\' transcript\n+ if tx_start_3p >= (r3p.pos + trim_bp):\n+ break\n+ # get another candidate insert size between start of 3\'\n+ # exon and 3\' read\n+ isize3p = r3p.aend - tx_start_3p\n+ #print "\\t", isize5p, isize3p, tx_end_5p, tx_start_3p\n+ # compare the insert size against the known insert size\n+ # distribution\n+ isize_prob = calc_isize_prob(isize5p + isize3p, isize_dist)\n+ if ((local_best_isize_prob i'..b'akpoint\n+ breakpoint_seq_5p, breakpoint_seq_3p, homology_left, homology_right = \\\n+ extract_breakpoint_sequence(config.GENE_REF_PREFIX + tx5p.tx_name, tx_end_5p,\n+ config.GENE_REF_PREFIX + tx3p.tx_name, tx_start_3p,\n+ ref_fa, max_read_length,\n+ homology_mismatches) \n+ tx3p_length = sum((end - start) for start,end in 
tx3p.exons)\n+ # get unique breakpoint id based on sequence\n+ breakpoint_seq = breakpoint_seq_5p + breakpoint_seq_3p\n+ if breakpoint_seq in breakpoint_seq_name_map:\n+ breakpoint_name = breakpoint_seq_name_map[breakpoint_seq]\n+ else:\n+ breakpoint_name = "B%07d" % (breakpoint_num)\n+ breakpoint_seq_name_map[breakpoint_seq] = breakpoint_name\n+ breakpoint_num += 1\n+ # write gene, breakpoint, and raw reads to a file and follow the\n+ # BEDPE format\n+ gene_name_5p = \'_\'.join(tx5p.gene_name.split())\n+ gene_name_3p = \'_\'.join(tx3p.gene_name.split())\n+ fields = [tx5p.tx_name, 0, tx_end_5p, # chrom1, start1, end1\n+ tx3p.tx_name, tx_start_3p, tx3p_length, # chrom2, start2, end2\n+ "C%07d" % (chimera_num), # name\n+ 1.0, # pvalue\n+ tx5p.strand, tx3p.strand, # strand1, strand2\n+ gene_name_5p, gene_name_3p, # gene names\n+ # exon interval information\n+ \'%d-%d\' % (0, exon_num_5p),\n+ \'%d-%d\' % (exon_num_3p, len(tx3p.exons)),\n+ # breakpoint information\n+ breakpoint_name, \n+ breakpoint_seq_5p, breakpoint_seq_3p, \n+ homology_left, homology_right, \n+ # fragments\n+ frags_to_encomp_string(frags),\n+ # spanning reads\n+ None]\n+ print >>outfh, \'\\t\'.join(map(str, fields))\n+ chimera_num += 1\n+ outfh.close()\n+ ref_fa.close()\n+ return config.JOB_SUCCESS\n+ \n+\n+def main():\n+ from optparse import OptionParser\n+ logging.basicConfig(level=logging.DEBUG,\n+ format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")\n+ parser = OptionParser("usage: %prog [options] <index> <isize_dist.txt> "\n+ "<discordant_reads.srt.bedpe> <chimeras.txt>")\n+ parser.add_option("--trim", dest="trim", type="int", \n+ default=config.EXON_JUNCTION_TRIM_BP,\n+ help="apply trimming when choosing exon boundaries to "\n+ "to consider possible breakpoints")\n+ parser.add_option("--max-read-length", dest="max_read_length", type="int",\n+ default=100, metavar="N",\n+ help="Reads in the BAM file are guaranteed to have "\n+ "length less than N [default=%default]")\n+ 
parser.add_option("--homology-mismatches", type="int", \n+ dest="homology_mismatches", \n+ default=config.BREAKPOINT_HOMOLOGY_MISMATCHES,\n+ help="Number of mismatches to tolerate when computing "\n+ "homology between gene and its chimeric partner "\n+ "[default=%default]")\n+ options, args = parser.parse_args()\n+ index_dir = args[0]\n+ isize_dist_file = args[1]\n+ input_file = args[2]\n+ output_file = args[3]\n+ return nominate_chimeras(index_dir, isize_dist_file, \n+ input_file, output_file, \n+ options.trim,\n+ options.max_read_length,\n+ options.homology_mismatches)\n+\n+\n+if __name__ == \'__main__\':\n+ sys.exit(main())\n' |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pipeline/nominate_chimeras.pyc |
b |
Binary file chimerascan/pipeline/nominate_chimeras.pyc has changed |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pipeline/nominate_spanning_reads.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pipeline/nominate_spanning_reads.py Thu Sep 07 17:55:18 2017 -0400 |
[ |
b'@@ -0,0 +1,290 @@\n+\'\'\'\n+Created on Jan 30, 2011\n+\n+@author: mkiyer\n+\n+chimerascan: chimeric transcript discovery using RNA-seq\n+\n+Copyright (C) 2011 Matthew Iyer\n+\n+This program is free software: you can redistribute it and/or modify\n+it under the terms of the GNU General Public License as published by\n+the Free Software Foundation, either version 3 of the License, or\n+(at your option) any later version.\n+\n+This program is distributed in the hope that it will be useful,\n+but WITHOUT ANY WARRANTY; without even the implied warranty of\n+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n+GNU General Public License for more details.\n+\n+You should have received a copy of the GNU General Public License\n+along with this program. If not, see <http://www.gnu.org/licenses/>.\n+\'\'\'\n+import logging\n+import os\n+\n+from chimerascan import pysam\n+\n+from chimerascan.lib import config\n+from chimerascan.lib.base import LibraryTypes\n+from chimerascan.lib.sam import parse_pe_reads\n+from chimerascan.lib.chimera import Chimera, OrientationTags, ORIENTATION_TAG_NAME\n+from chimerascan.lib.batch_sort import batch_sort\n+from chimerascan.lib.seq import DNA_reverse_complement\n+from chimerascan.pipeline.find_discordant_reads import get_gene_orientation\n+\n+def to_fastq(qname, readnum, seq, qual, is_reverse=False):\n+ if is_reverse:\n+ seq = DNA_reverse_complement(seq)\n+ qual = qual[::-1]\n+ return "@%s/%d\\n%s\\n+\\n%s" % (qname, readnum+1, seq, qual)\n+\n+def nominate_encomp_spanning_reads(chimera_file, output_fastq_file):\n+ """\n+ find all encompassing reads that should to be remapped to see if they\n+ span the breakpoint junction\n+ """\n+ fqfh = open(output_fastq_file, "w")\n+ remap_qnames = set()\n+ for c in Chimera.parse(open(chimera_file)):\n+ # find breakpoint coords of chimera\n+ end5p = c.tx_end_5p\n+ start3p = c.tx_start_3p\n+ for r5p,r3p in c.encomp_frags: \n+ # if 5\' read overlaps breakpoint then it should be remapped\n+ if 
r5p.clipstart < end5p < r5p.clipend:\n+ key5p = (r5p.qname, r5p.readnum)\n+ if key5p not in remap_qnames:\n+ remap_qnames.add((r5p.qname, r5p.readnum))\n+ print >>fqfh, to_fastq(r5p.qname, r5p.readnum, \n+ r5p.seq, "I" * len(r5p.seq),\n+ is_reverse=r5p.is_reverse)\n+ # if 3\' read overlaps breakpoint then it should be remapped\n+ if r3p.clipstart < start3p < r3p.clipend:\n+ key3p = (r3p.qname, r3p.readnum)\n+ if key3p not in remap_qnames:\n+ remap_qnames.add((r3p.qname, r3p.readnum))\n+ print >>fqfh, to_fastq(r3p.qname, r3p.readnum, \n+ r3p.seq, "I" * len(r3p.seq),\n+ is_reverse=r3p.is_reverse)\n+ fqfh.close()\n+ return config.JOB_SUCCESS\n+\n+def parse_chimeras_by_gene(chimera_file, orientation):\n+ clist = []\n+ prev_tx_name = None\n+ for c in Chimera.parse(open(chimera_file)):\n+ tx_name = c.tx_name_5p if (orientation == OrientationTags.FIVEPRIME) else c.tx_name_3p\n+ if prev_tx_name != tx_name:\n+ if len(clist) > 0:\n+ yield prev_tx_name, clist\n+ clist = []\n+ prev_tx_name = tx_name\n+ clist.append(c)\n+ if len(clist) > 0:\n+ yield prev_tx_name, clist\n+\n+def parse_reads_by_rname(bamfh, orientation):\n+ """\n+ reads must be sorted and include an orientation tag\n+ """\n+ reads = []\n+ prev_rname = None\n+ for r in bamfh:\n+ o = r.opt(ORIENTATION_TAG_NAME)\n+ if o != orientation:\n+ continue\n+ if prev_rname != r.rname:\n+ if len(reads) > 0:\n+ yield reads\n+ reads = []\n+ prev_rname = r.rname\n+ reads.append(r)\n+ if len(reads) > 0:\n+ yield r'..b't >>f, \'\\t\'.join(map(str, [r.qname, readnum, r.opt("R2"), r.opt("Q2")]))\n+ # sort chimeras by 3\' partner\n+ logging.debug("Sorting chimeras by 3\' transcript")\n+ def sort_by_3p_partner(line):\n+ fields = line.strip().split(\'\\t\', Chimera.TX_NAME_3P_FIELD+1)\n+ return fields[Chimera.TX_NAME_3P_FIELD]\n+ tmp_chimera_file_sorted_3p = os.path.join(tmp_dir, "tmp_chimeras.sorted3p.bedpe")\n+ batch_sort(input=chimera_file,\n+ output=tmp_chimera_file_sorted_3p,\n+ key=sort_by_3p_partner,\n+ buffer_size=32000,\n+ 
tempdirs=[tmp_dir])\n+ # search for matches to 3\' chimeras\n+ logging.debug("Matching single-mapped frags to 3\' chimeras")\n+ for clist, reads in parse_sync_chimera_with_bam(tmp_chimera_file_sorted_3p, \n+ single_mapped_bam_file,\n+ OrientationTags.THREEPRIME):\n+ # TODO: test more specifically that read has a chance to cross breakpoint\n+ for r in reads:\n+ # reverse read number\n+ readnum = 1 if r.is_read1 else 0\n+ print >>f, \'\\t\'.join(map(str, [r.qname, readnum, r.opt("R2"), r.opt("Q2")]))\n+ f.close()\n+ #\n+ # now sort the file of sequences by read name/number to \n+ # eliminate duplicates\n+ # \n+ def sort_by_qname(line):\n+ fields = line.strip().split(\'\\t\')\n+ return (fields[0], int(fields[1]))\n+ tmp_sorted_seqs_to_remap = os.path.join(tmp_dir, "tmp_singlemap_seqs.sorted.txt")\n+ batch_sort(input=tmp_seqs_to_remap,\n+ output=tmp_sorted_seqs_to_remap,\n+ key=sort_by_qname,\n+ buffer_size=32000,\n+ tempdirs=[tmp_dir])\n+ #\n+ # read file and write fastq, ignoring duplicates\n+ # \n+ fqfh = open(single_mapped_fastq_file, "w")\n+ prev = None\n+ for line in open(tmp_sorted_seqs_to_remap):\n+ fields = line.strip().split(\'\\t\')\n+ qname, readnum, seq, qual = fields[0], int(fields[1]), fields[2], fields[3]\n+ cur = (fields[0], int(fields[1]))\n+ if prev != cur:\n+ if prev is not None: \n+ print >>fqfh, to_fastq(qname, readnum, seq, qual)\n+ prev = cur\n+ if prev is not None:\n+ print >>fqfh, to_fastq(qname, readnum, seq, qual)\n+ fqfh.close()\n+ # TODO: remove temporary files\n+ #os.remove(tmp_chimera_file_sorted_3p)\n+ #os.remove(tmp_seqs_to_remap)\n+ #os.remove(tmp_sorted_seqs_to_remap)\n+ return config.JOB_SUCCESS\n+\n+\n+def main():\n+ from optparse import OptionParser\n+ logging.basicConfig(level=logging.DEBUG,\n+ format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")\n+ parser = OptionParser("usage: %prog [options] <chimeras.txt> "\n+ "<unmapped_reads.bam> <encomp_remap.fq> "\n+ "<singlemap_remap.fq> "\n+ "<unmapped_remap.fq> ")\n+ 
parser.add_option(\'--library\', dest="library_type", \n+ default=LibraryTypes.FR_UNSTRANDED)\n+ options, args = parser.parse_args()\n+ chimera_file = args[0]\n+ bam_file = args[1]\n+ encomp_remap_fastq_file = args[2]\n+ singlemap_remap_fastq_file = args[3]\n+ unmapped_remap_fastq_file = args[4]\n+ nominate_encomp_spanning_reads(chimera_file, encomp_remap_fastq_file)\n+ extract_single_mapped_reads(chimera_file, \n+ bam_file,\n+ "single_mapped_reads.srt.bam",\n+ unmapped_remap_fastq_file,\n+ options.library_type,\n+ "/tmp") \n+ nominate_single_mapped_spanning_reads(chimera_file, \n+ "single_mapped_reads.srt.bam",\n+ singlemap_remap_fastq_file, \n+ "/tmp")\n+\n+if __name__ == \'__main__\':\n+ main()\n' |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pipeline/nominate_spanning_reads.pyc |
b |
Binary file chimerascan/pipeline/nominate_spanning_reads.pyc has changed |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pipeline/profile_insert_size.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pipeline/profile_insert_size.py Thu Sep 07 17:55:18 2017 -0400 |
[ |
'''
Created on Jan 24, 2011

@author: mkiyer

chimerascan: chimeric transcript discovery using RNA-seq

Copyright (C) 2011 Matthew Iyer

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
'''
import logging
import sys
# local imports
from chimerascan import pysam
from chimerascan.lib.fragment_size_distribution import InsertSizeDistribution

def main():
    """
    Command-line entry point: profile the insert size distribution of a BAM
    file and write it out.

    Reads the BAM file given as the first positional argument, builds an
    InsertSizeDistribution from it (bounded by --min-fragment-length,
    --max-fragment-length and --max-samples), writes the distribution to
    the file named by -o (or to stdout when -o is not given), and logs
    summary statistics.

    Exits through parser.error() (usage message, exit status 2) when the
    BAM argument is missing instead of failing with an IndexError.
    """
    from optparse import OptionParser
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    # NOTE(review): the usage string advertises a second positional
    # <out.bedpe>, but only args[0] is ever read; the output path comes
    # from -o.  Extra positionals are still accepted and ignored.
    parser = OptionParser("usage: %prog [options] <bam> <out.bedpe>")
    parser.add_option('--min-fragment-length', dest="min_fragment_length",
                      type="int", default=0)
    parser.add_option('--max-fragment-length', dest="max_fragment_length",
                      type="int", default=1000)
    parser.add_option('--max-samples', dest="max_samples",
                      type="int", default=None)
    parser.add_option('-o', dest="output_file", default=None)
    options, args = parser.parse_args()
    if len(args) < 1:
        parser.error("missing required <bam> argument")
    input_bam_file = args[0]
    bamfh = pysam.Samfile(input_bam_file, "rb")
    try:
        isizedist = InsertSizeDistribution.from_bam(bamfh,
                                                    options.min_fragment_length,
                                                    options.max_fragment_length,
                                                    options.max_samples)
    finally:
        # release the BAM handle even if profiling fails
        bamfh.close()
    if options.output_file is not None:
        f = open(options.output_file, "w")
        try:
            isizedist.to_file(f)
        finally:
            f.close()
    else:
        # stdout is not ours to close
        isizedist.to_file(sys.stdout)
    logging.info("Insert size samples=%d mean=%f std=%f median=%d mode=%d" %
                 (isizedist.n, isizedist.mean(), isizedist.std(),
                  isizedist.percentile(50.0), isizedist.mode()))


if __name__ == '__main__':
    main()
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pipeline/resolve_discordant_reads.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pipeline/resolve_discordant_reads.py Thu Sep 07 17:55:18 2017 -0400 |
[ |
b"@@ -0,0 +1,287 @@\n+'''\n+Created on Jul 28, 2011\n+\n+@author: mkiyer\n+\n+chimerascan: chimeric transcript discovery using RNA-seq\n+\n+Copyright (C) 2011 Matthew Iyer\n+\n+This program is free software: you can redistribute it and/or modify\n+it under the terms of the GNU General Public License as published by\n+the Free Software Foundation, either version 3 of the License, or\n+(at your option) any later version.\n+\n+This program is distributed in the hope that it will be useful,\n+but WITHOUT ANY WARRANTY; without even the implied warranty of\n+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n+GNU General Public License for more details.\n+\n+You should have received a copy of the GNU General Public License\n+along with this program. If not, see <http://www.gnu.org/licenses/>.\n+'''\n+import logging\n+import collections\n+import os\n+\n+from chimerascan.lib.chimera import Chimera\n+from chimerascan.lib.fragment_size_distribution import InsertSizeDistribution\n+from chimerascan.lib.batch_sort import batch_sort\n+\n+QNAME_COL = 0\n+CHIMERA_NAME_COL = 5\n+SCORE_FIELDS = (6,7,8,9,10)\n+\n+class ChimeraStats(object):\n+ __slots__ = ('qname', 'tid5p', 'pos5p', 'tid3p', 'pos3p', \n+ 'chimera_name', 'num_spanning_frags', 'num_unambiguous_frags',\n+ 'num_uniquely_aligning_frags', 'neg_mismatches',\n+ 'isize_prob')\n+\n+ @property\n+ def score_tuple(self):\n+ return (self.num_spanning_frags,\n+ self.num_unambiguous_frags,\n+ self.num_uniquely_aligning_frags,\n+ self.neg_mismatches,\n+ self.isize_prob)\n+\n+ def to_list(self):\n+ return [self.qname,\n+ self.tid5p, self.pos5p,\n+ self.tid3p, self.pos3p, \n+ self.chimera_name,\n+ self.num_spanning_frags,\n+ self.num_unambiguous_frags,\n+ self.num_uniquely_aligning_frags,\n+ self.neg_mismatches,\n+ self.isize_prob]\n+\n+ @staticmethod\n+ def from_list(fields):\n+ s = ChimeraStats()\n+ s.qname = fields[0]\n+ s.tid5p = int(fields[1])\n+ s.pos5p = int(fields[2])\n+ s.tid3p = int(fields[3])\n+ s.pos3p = 
int(fields[4])\n+ s.chimera_name = fields[5]\n+ s.num_spanning_frags = int(fields[6])\n+ s.num_unambiguous_frags = int(fields[7])\n+ s.num_uniquely_aligning_frags = int(fields[8])\n+ s.neg_mismatches = int(fields[9])\n+ s.isize_prob = float(fields[10])\n+ return s\n+\n+ @staticmethod\n+ def parse(line_iter):\n+ for line in line_iter:\n+ fields = line.strip().split('\\t')\n+ yield ChimeraStats.from_list(fields)\n+\n+def calc_isize_prob(isize, isize_dist):\n+ # find percentile of observing this insert size in the reads\n+ isize_per = isize_dist.percentile_at_isize(isize)\n+ # convert to a probability score (0.0-1.0)\n+ isize_prob = 1.0 - (2.0 * abs(50.0 - isize_per))/100.0 \n+ return isize_prob\n+\n+def group_by_attr(item_iter, attr):\n+ mylist = []\n+ prev = None\n+ for itm in item_iter:\n+ cur = getattr(itm, attr)\n+ if prev != cur:\n+ if len(mylist) > 0:\n+ yield prev, mylist\n+ mylist = []\n+ prev = cur\n+ mylist.append(itm)\n+ if len(mylist) > 0:\n+ yield prev, mylist\n+\n+#def group_by_field(item_iter, colnum):\n+# mylist = []\n+# prev = None\n+# for fields in item_iter:\n+# # parse read stats information\n+# cur = fields[colnum]\n+# if prev != cur:\n+# if len(mylist) > 0:\n+# yield prev, mylist\n+# mylist = []\n+# prev = cur\n+# mylist.append(fields)\n+# if len(mylist) > 0:\n+# yield prev, mylist\n+\n+def parse_sync_chimeras_read_stats(chimera_file, read_stats_file):\n+ # group reads by chimera name\n+ read_stats_iter = group_by_attr(ChimeraStats.parse(open(read_stats_file)), \n+ "..b'+ resolved_read_stats_file = os.path.join(tmp_dir, "read_stats.rname_sorted.resolved.txt")\n+ f = open(resolved_read_stats_file, "w")\n+ for rname,readstats in group_by_attr(ChimeraStats.parse(open(sorted_read_stats_file)), \n+ \'qname\'):\n+ # build a dictionary of stats -> read/chimeras\n+ stats_dict = collections.defaultdict(lambda: [])\n+ for s in readstats:\n+ # add key/value pairs\n+ stats_dict[s.score_tuple].append(s)\n+ # sort based on stats\n+ sorted_stats_keys = 
sorted(stats_dict.keys(), reverse=True)\n+ # use only the best key\n+ for s in stats_dict[sorted_stats_keys[0]]:\n+ # output read -> chimera relationships\n+ print >>f, \'\\t\'.join(map(str, s.to_list()))\n+ f.close()\n+ #\n+ # re-sort by chimera name\n+ #\n+ logging.debug("Resorting reads by chimera name")\n+ def sort_reads_by_chimera_name(line):\n+ return line.strip().split(\'\\t\',CHIMERA_NAME_COL+1)[CHIMERA_NAME_COL]\n+ sorted_resolved_read_stats_file = os.path.join(tmp_dir, "read_stats.chimera_name_sorted.resolved.txt")\n+ batch_sort(input=resolved_read_stats_file,\n+ output=sorted_resolved_read_stats_file,\n+ key=sort_reads_by_chimera_name,\n+ buffer_size=32000,\n+ tempdirs=[tmp_dir])\n+ logging.debug("Resorting chimeras by name")\n+ def sort_chimeras_by_name(line):\n+ return line.strip().split(\'\\t\',Chimera.NAME_FIELD+1)[Chimera.NAME_FIELD]\n+ sorted_chimera_file = os.path.join(tmp_dir, "spanning_chimeras.name_sorted.txt")\n+ batch_sort(input=input_file,\n+ output=sorted_chimera_file,\n+ key=sort_chimeras_by_name,\n+ buffer_size=32000,\n+ tempdirs=[tmp_dir])\n+ #\n+ # parse and rebuild chimeras based on best reads\n+ # \n+ logging.debug("Rewriting chimeras with lists of \'best\' reads")\n+ f = open(output_file, "w")\n+ # need to sync chimeras with stats\n+ for c,stats in parse_sync_chimeras_read_stats(sorted_chimera_file, sorted_resolved_read_stats_file):\n+ # parse and make lookup set of the resolved alignments\n+ good_alignments = set()\n+ for s in stats:\n+ if s.isize_prob < min_isize_prob:\n+ continue\n+ good_alignments.add((s.qname, s.tid5p, s.pos5p, s.tid3p, s.pos3p))\n+ # replace encompassing frags with resolved alignments\n+ new_encomp_frags = []\n+ for dpair in c.encomp_frags:\n+ # get alignment tuple\n+ aln = (dpair[0].qname, dpair[0].tid, dpair[0].pos, dpair[1].tid, dpair[1].pos)\n+ if aln in good_alignments:\n+ new_encomp_frags.append(dpair)\n+ c.encomp_frags = new_encomp_frags\n+ c.score = c.get_num_frags()\n+ print >>f, \'\\t\'.join(map(str, 
c.to_list()))\n+ f.close()\n+ # remove temporary files\n+ #os.remove(read_stats_file)\n+ #os.remove(sorted_read_stats_file)\n+ #os.remove(resolved_read_stats_file)\n+ #os.remove(sorted_resolved_read_stats_file)\n+ #os.remove(sorted_chimera_file)\n+\n+def main():\n+ from optparse import OptionParser\n+ logging.basicConfig(level=logging.DEBUG,\n+ format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")\n+ parser = OptionParser("usage: %prog [options] <in.txt> <out.txt> <isizedist.txt>")\n+ parser.add_option("--min-isize-prob", dest="min_isize_prob", \n+ type="float", default=0.01)\n+ options, args = parser.parse_args()\n+ input_file = args[0]\n+ output_file = args[1]\n+ isize_dist_file = args[2]\n+ # read insert size distribution\n+ isize_dist = InsertSizeDistribution.from_file(open(isize_dist_file))\n+ resolve_discordant_reads(input_file, output_file, isize_dist, \n+ options.min_isize_prob,\n+ tmp_dir=".")\n+\n+if __name__ == \'__main__\':\n+ main()\n\\ No newline at end of file\n' |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pipeline/resolve_discordant_reads.pyc |
b |
Binary file chimerascan/pipeline/resolve_discordant_reads.pyc has changed |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pipeline/sam2bam.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pipeline/sam2bam.py Thu Sep 07 17:55:18 2017 -0400 |
[ |
@@ -0,0 +1,86 @@ +''' +Created on Jun 2, 2011 + +@author: mkiyer +''' +import logging + +# local imports +import chimerascan.pysam as pysam +from chimerascan.lib.fix_alignment_ordering import fix_alignment_ordering, fix_sr_alignment_ordering +from chimerascan.lib.sam import soft_pad_read +from chimerascan.lib.seq import FASTQ_QUAL_FORMATS, SANGER_FORMAT, parse_fastq_record + +def sam_to_bam(input_fastq_files, input_sam_file, output_bam_file, + quals, multihits, pe_sr_mode=False, softclip=True, + keep_unmapped=True): + samfh = pysam.Samfile(input_sam_file, "r") + num_unmapped = 0 + num_multihits = 0 + num_frags = 0 + bamfh = pysam.Samfile(output_bam_file, "wb", template=samfh) + # setup fastq parsing + if softclip and (quals != SANGER_FORMAT): + kwargs = {"convert_quals": True, "qual_format": quals} + else: + kwargs = {"convert_quals": False} + fqiters = [parse_fastq_record(open(fq), **kwargs) for fq in input_fastq_files] + + # handle single-read and paired-end + if len(fqiters) == 1: + reorder_func = fix_sr_alignment_ordering(samfh, fqiters[0]) + else: + reorder_func = fix_alignment_ordering(samfh, fqiters, pe_sr_mode) + # iterate through buffer + for bufitems in reorder_func: + num_frags += 1 + for bufitem in bufitems: + for r in bufitem.reads: + # softclip uses the fastq record to replace the sequence + # and quality scores of the read + if softclip: + soft_pad_read(bufitem.fqrec, r) + # keep statistics of unmapped/multimapped reads and + # suppress output if 'keep_unmapped' is False + if r.is_unmapped: + xm_tag = r.opt('XM') + if xm_tag < multihits: + num_unmapped += 1 + if not keep_unmapped: + continue + else: + num_multihits += 1 + bamfh.write(r) + for fqfh in fqiters: + fqfh.close() + bamfh.close() + samfh.close() + logging.debug("Found %d fragments" % (num_frags)) + logging.debug("\t%d unmapped reads" % (num_unmapped)) + logging.debug("\t%d multimapping (>%dX) reads" % + (num_multihits, multihits)) + +if __name__ == '__main__': + from optparse import 
OptionParser + logging.basicConfig(level=logging.DEBUG, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") + parser = OptionParser("usage: %prog [options] <out.bam> <in.sam> <in1.fq> [<in2.fq>]") + parser.add_option("--multihits", type="int", dest="multihits", default=100) + parser.add_option("--quals", dest="quals", + choices=FASTQ_QUAL_FORMATS, + default=SANGER_FORMAT) + parser.add_option("--pesr", action="store_true", dest="pe_sr_mode", default=False) + parser.add_option("--softclip", action="store_true", dest="softclip", default=False) + parser.add_option("--un", action="store_true", dest="keep_unmapped", default=False) + options, args = parser.parse_args() + output_bam_file = args[0] + input_sam_file = args[1] + input_fastq_files = args[2:] + sam_to_bam(input_fastq_files, + input_sam_file, + output_bam_file, + quals=options.quals, + multihits=options.multihits, + pe_sr_mode=options.pe_sr_mode, + softclip=options.softclip, + keep_unmapped=options.keep_unmapped) |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pipeline/write_output.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pipeline/write_output.py Thu Sep 07 17:55:18 2017 -0400 |
[ |
b'@@ -0,0 +1,185 @@\n+\'\'\'\n+Created on Jul 1, 2011\n+\n+@author: mkiyer\n+\n+chimerascan: chimeric transcript discovery using RNA-seq\n+\n+Copyright (C) 2011 Matthew Iyer\n+\n+This program is free software: you can redistribute it and/or modify\n+it under the terms of the GNU General Public License as published by\n+the Free Software Foundation, either version 3 of the License, or\n+(at your option) any later version.\n+\n+This program is distributed in the hope that it will be useful,\n+but WITHOUT ANY WARRANTY; without even the implied warranty of\n+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n+GNU General Public License for more details.\n+\n+You should have received a copy of the GNU General Public License\n+along with this program. If not, see <http://www.gnu.org/licenses/>.\n+\'\'\'\n+import logging\n+import os\n+import sys\n+import operator\n+import collections\n+\n+from chimerascan import pysam\n+from chimerascan.lib.chimera import Chimera, get_chimera_type\n+from chimerascan.lib import config\n+from chimerascan.lib.gene_to_genome import build_transcript_genome_map, \\\n+ build_transcript_cluster_map, build_genome_tx_trees, \\\n+ build_tx_name_gene_map, transcript_to_genome_pos\n+\n+from chimerascan.pipeline.filter_chimeras import get_wildtype_frags\n+\n+\n+def get_chimera_groups(input_file, gene_file):\n+ # build a lookup table to get gene clusters from transcript name \n+ transcript_cluster_map = build_transcript_cluster_map(open(gene_file))\n+ # build a lookup table to get genome coordinates from transcript \n+ # coordinates\n+ # TODO: can either group by exact breakpoint, or just by\n+ # gene cluster\n+ # transcript_genome_map = build_transcript_genome_map(open(gene_file))\n+ # group chimeras in the same genomic cluster with the same\n+ # breakpoint\n+ cluster_chimera_dict = collections.defaultdict(lambda: [])\n+ for c in Chimera.parse(open(input_file)):\n+ # get cluster of overlapping genes\n+ cluster5p = 
transcript_cluster_map[c.tx_name_5p]\n+ cluster3p = transcript_cluster_map[c.tx_name_3p]\n+ # get genomic positions of breakpoints\n+ #coord5p = transcript_to_genome_pos(c.partner5p.tx_name, c.partner5p.end-1, transcript_genome_map)\n+ #coord3p = transcript_to_genome_pos(c.partner3p.tx_name, c.partner3p.start, transcript_genome_map)\n+ # add to dictionary\n+ cluster_chimera_dict[(cluster5p,cluster3p)].append(c)\n+ # TODO: use this grouping instead?\n+ #cluster_chimera_dict[(cluster5p,cluster3p,coord5p,coord3p)].append(c)\n+ for key,chimeras in cluster_chimera_dict.iteritems():\n+ yield key,chimeras\n+\n+def get_best_coverage_chimera(chimeras):\n+ stats = []\n+ for c in chimeras:\n+ # TODO: come up with a way to prioritize here (spanning included?)\n+ stats.append((c,\n+ c.get_num_unique_positions(),\n+ c.get_num_frags()))\n+ sorted_stats = sorted(stats, key=operator.itemgetter(1,2), reverse=True)\n+ return sorted_stats[0][0]\n+\n+def write_output(input_file, bam_file, output_file, index_dir):\n+ gene_file = os.path.join(index_dir, config.GENE_FEATURE_FILE)\n+ # build a lookup table to get genome coordinates from transcript \n+ # coordinates\n+ transcript_genome_map = build_transcript_genome_map(open(gene_file)) \n+ tx_name_gene_map = build_tx_name_gene_map(gene_file) \n+ genome_tx_trees = build_genome_tx_trees(gene_file)\n+ # open BAM file for checking wild-type isoform\n+ bamfh = pysam.Samfile(bam_file, "rb") \n+ # group chimera isoforms together\n+ lines = []\n+ chimera_clusters = 0\n+ for key,chimeras in get_chimera_groups(input_file, gene_file):\n+ txs5p = set()\n+ txs3p = set()\n+ genes5p = set()\n+ genes3p = set()\n+ names = set()\n+ for c in chimeras:\n+ txs5p.add("%s:%d-%d" % (c.tx_name_5p, c.tx_start_5p, c.tx_end_5p-1))\n+ txs3p.add("%s:%d-%d" % (c.tx_name_3p, c.tx_start_3p, c.tx_end_3p-1))\n'..b' genome_tx_trees)\n+ # get genomic positions of chimera\n+ chrom5p,strand5p,start5p = transcript_to_genome_pos(c.tx_name_5p, c.tx_start_5p, 
transcript_genome_map)\n+ chrom5p,strand5p,end5p = transcript_to_genome_pos(c.tx_name_5p, c.tx_end_5p-1, transcript_genome_map)\n+ if strand5p == 1:\n+ start5p,end5p = end5p,start5p\n+ chrom3p,strand3p,start3p = transcript_to_genome_pos(c.tx_name_3p, c.tx_start_3p, transcript_genome_map)\n+ chrom3p,strand3p,end3p = transcript_to_genome_pos(c.tx_name_3p, c.tx_end_3p-1, transcript_genome_map)\n+ if strand3p == 1:\n+ start3p,end3p = end3p,start3p\n+ # get breakpoint spanning sequences\n+ spanning_seqs = set()\n+ spanning_fasta_lines = []\n+ for dr in c.get_spanning_reads():\n+ if dr.seq in spanning_seqs:\n+ continue\n+ spanning_seqs.add(dr.seq)\n+ spanning_fasta_lines.extend([">%s/%d;pos=%d;strand=%s" % \n+ (dr.qname, dr.readnum+1, dr.pos, \n+ "-" if dr.is_reverse else "+"), \n+ dr.seq])\n+ # get isoform fraction\n+ num_wt_frags_5p, num_wt_frags_3p = get_wildtype_frags(c, bamfh)\n+ num_chimeric_frags = c.get_num_frags()\n+ frac5p = float(num_chimeric_frags) / (num_chimeric_frags + num_wt_frags_5p)\n+ frac3p = float(num_chimeric_frags) / (num_chimeric_frags + num_wt_frags_3p)\n+ # setup fields of BEDPE file\n+ fields = [chrom5p, start5p, end5p,\n+ chrom3p, start3p, end3p,\n+ "CLUSTER%d" % (chimera_clusters),\n+ c.get_num_frags(),\n+ "+" if (strand5p == 0) else "-",\n+ "+" if (strand3p == 0) else "-",\n+ \',\'.join(txs5p),\n+ \',\'.join(txs3p),\n+ \',\'.join(genes5p),\n+ \',\'.join(genes3p),\n+ chimera_type, distance,\n+ c.get_num_frags(),\n+ c.get_num_spanning_frags(),\n+ c.get_num_unique_positions(),\n+ frac5p, frac3p,\n+ \',\'.join(spanning_fasta_lines),\n+ \',\'.join(names)]\n+ lines.append(fields)\n+ chimera_clusters += 1\n+ bamfh.close()\n+ logging.debug("Clustered chimeras: %d" % (chimera_clusters))\n+ # sort\n+ lines = sorted(lines, key=operator.itemgetter(18, 17, 16), reverse=True) \n+ f = open(output_file, "w")\n+ print >>f, \'\\t\'.join([\'#chrom5p\', \'start5p\', \'end5p\', \n+ \'chrom3p\', \'start3p\', \'end3p\',\n+ \'chimera_cluster_id\', \'score\', \n+ 
\'strand5p\', \'strand3p\',\n+ \'transcript_ids_5p\', \'transcript_ids_3p\',\n+ \'genes5p\', \'genes3p\',\n+ \'type\', \'distance\',\n+ \'total_frags\', \n+ \'spanning_frags\',\n+ \'unique_alignment_positions\',\n+ \'isoform_fraction_5p\',\n+ \'isoform_fraction_3p\',\n+ \'breakpoint_spanning_reads\',\n+ \'chimera_ids\'])\n+ for fields in lines:\n+ print >>f, \'\\t\'.join(map(str, fields))\n+ f.close()\n+ return config.JOB_SUCCESS\n+\n+def main():\n+ from optparse import OptionParser\n+ logging.basicConfig(level=logging.DEBUG,\n+ format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")\n+ parser = OptionParser("usage: %prog [options] <index_dir> <in.txt> <bam_file> <out.txt>")\n+ options, args = parser.parse_args()\n+ index_dir = args[0]\n+ input_file = args[1]\n+ bam_file = args[2]\n+ output_file = args[3]\n+ return write_output(input_file, bam_file, output_file, index_dir)\n+\n+if __name__ == "__main__":\n+ sys.exit(main())\n' |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pipeline/write_output.pyc |
b |
Binary file chimerascan/pipeline/write_output.pyc has changed |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/COPYING --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/COPYING Thu Sep 07 17:55:18 2017 -0400 |
b |
@@ -0,0 +1,21 @@ +The MIT License + +Copyright (c) 2008-2009 Genome Research Ltd. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. \ No newline at end of file |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/Pileup.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/Pileup.py Thu Sep 07 17:55:18 2017 -0400 |
[ |
@@ -0,0 +1,59 @@ +'''Tools for working with files in the samtools pileup -c format.''' +import collections + +PileupSubstitution = collections.namedtuple( "PileupSubstitution", + " ".join( (\ + "chromosome", + "position", + "reference_base", + "consensus_base", + "consensus_quality", + "snp_quality", + "rms_mapping_quality", + "coverage", + "read_bases", + "base_qualities" ) ) ) + +PileupIndel = collections.namedtuple( "PileupIndel", + " ".join( (\ + "chromosome", + "position", + "reference_base", + "genotype", + "consensus_quality", + "snp_quality", + "rms_mapping_quality", + "coverage", + "first_allelle", + "second_allele", + "reads_first", + "reads_second", + "reads_diff" ) ) ) + +def iterate( infile ): + '''iterate over ``samtools pileup -c`` formatted file. + + *infile* can be any iterator over a lines. + + The function yields named tuples of the type :class:`pysam.Pileup.PileupSubstitution` + or :class:`pysam.Pileup.PileupIndel`. + + .. note:: + The parser converts to 0-based coordinates + ''' + + conv_subst = (str,lambda x: int(x)-1,str,str,int,int,int,int,str,str) + conv_indel = (str,lambda x: int(x)-1,str,str,int,int,int,int,str,str,int,int,int) + + for line in infile: + d = line[:-1].split() + if d[2] == "*": + try: + yield PileupIndel( *[x(y) for x,y in zip(conv_indel,d) ] ) + except TypeError: + raise SamtoolsError( "parsing error in line: `%s`" % line) + else: + try: + yield PileupSubstitution( *[x(y) for x,y in zip(conv_subst,d) ] ) + except TypeError: + raise SamtoolsError( "parsing error in line: `%s`" % line) |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/Pileup.pyc |
b |
Binary file chimerascan/pysam/Pileup.pyc has changed |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/__init__.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/__init__.py Thu Sep 07 17:55:18 2017 -0400 |
[ |
@@ -0,0 +1,110 @@ +from csamtools import * +from ctabix import * +import csamtools +import ctabix +import Pileup +import sys +import os + +class SamtoolsError( Exception ): + '''exception raised in case of an error incurred in the samtools library.''' + + def __init__(self, value): + self.value = value + def __str__(self): + return repr(self.value) + +class SamtoolsDispatcher(object): + '''samtools dispatcher. + + Emulates the samtools command line as module calls. + + Captures stdout and stderr. + + Raises a :class:`pysam.SamtoolsError` exception in case + samtools exits with an error code other than 0. + + Some command line options are associated with parsers. + For example, the samtools command "pileup -c" creates + a tab-separated table on standard output. In order to + associate parsers with options, an optional list of + parsers can be supplied. The list will be processed + in order checking for the presence of each option. + + If no parser is given or no appropriate parser is found, + the stdout output of samtools commands will be returned. + ''' + dispatch=None + parsers=None + + def __init__(self,dispatch, parsers): + self.dispatch = dispatch + self.parsers = parsers + self.stderr = [] + + def __call__(self,*args, **kwargs): + '''execute the samtools command + ''' + retval, stderr, stdout = csamtools._samtools_dispatch( self.dispatch, args ) + if retval: raise SamtoolsError( "\n".join( stderr ) ) + self.stderr = stderr + # samtools commands do not propagate the return code correctly. + # I have thus added this patch to throw if there is output on stderr. + # Note that there is sometimes output on stderr that is not an error, + # for example: [sam_header_read2] 2 sequences loaded. 
+ # Ignore messages like these + stderr = [x for x in stderr + if not (x.startswith( "[sam_header_read2]" ) or + x.startswith("[bam_index_load]") or + x.startswith("[bam_sort_core]") or \ + x.startswith("[samopen] SAM header is present"))] + if stderr: raise SamtoolsError( "\n".join( stderr ) ) + # call parser for stdout: + if not kwargs.get("raw") and stdout and self.parsers: + for options, parser in self.parsers: + for option in options: + if option not in args: break + else: + return parser(stdout) + + return stdout + + def getMessages( self ): + return self.stderr + + def usage(self): + '''return the samtools usage information for this command''' + retval, stderr, stdout = csamtools._samtools_dispatch( self.dispatch ) + return "".join(stderr) + +# +# samtools command line options to export in python +# +# import is a python reserved word. +SAMTOOLS_DISPATCH = { + "view" : ( "view", None ), + "sort" : ( "sort", None), + "samimport": ( "import", None), + "pileup" : ( "pileup", ( (("-c",), Pileup.iterate ), ), ), + "faidx" : ("faidx", None), + "tview" : ("tview", None), + "index" : ("index", None), + "fixmate" : ("fixmate", None), + "glfview" : ("glfview", None), + "flagstat" : ("flagstat", None), + "calmd" : ("calmd", None), + "merge" : ("merge", None), + "rmdup" : ("rmdup", None) } + +# instantiate samtools commands as python functions +for key, options in SAMTOOLS_DISPATCH.iteritems(): + cmd, parser = options + globals()[key] = SamtoolsDispatcher(cmd, parser) + +# hack to export all the symbols from csamtools +__all__ = csamtools.__all__ + \ + ctabix.__all__ + \ + [ "SamtoolsError", "SamtoolsDispatcher" ] + list(SAMTOOLS_DISPATCH) +\ + ["Pileup",] + +from version import __version__, __samtools_version__ |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/__init__.pyc |
b |
Binary file chimerascan/pysam/__init__.pyc has changed |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/_cffi_backend.so |
b |
Binary file chimerascan/pysam/_cffi_backend.so has changed |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/_yaml.so |
b |
Binary file chimerascan/pysam/_yaml.so has changed |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/csamtools.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/csamtools.c Thu Sep 07 17:55:18 2017 -0400 |
b |
b'@@ -0,0 +1,24858 @@\n+/* Generated by Cython 0.13 on Mon Jan 31 00:58:16 2011 */\n+\n+#define PY_SSIZE_T_CLEAN\n+#include "Python.h"\n+#ifndef Py_PYTHON_H\n+ #error Python headers needed to compile C extensions, please install development version of Python.\n+#else\n+\n+#include <stddef.h> /* For offsetof */\n+#ifndef offsetof\n+#define offsetof(type, member) ( (size_t) & ((type*)0) -> member )\n+#endif\n+\n+#if !defined(WIN32) && !defined(MS_WINDOWS)\n+ #ifndef __stdcall\n+ #define __stdcall\n+ #endif\n+ #ifndef __cdecl\n+ #define __cdecl\n+ #endif\n+ #ifndef __fastcall\n+ #define __fastcall\n+ #endif\n+#endif\n+\n+#ifndef DL_IMPORT\n+ #define DL_IMPORT(t) t\n+#endif\n+#ifndef DL_EXPORT\n+ #define DL_EXPORT(t) t\n+#endif\n+\n+#ifndef PY_LONG_LONG\n+ #define PY_LONG_LONG LONG_LONG\n+#endif\n+\n+#if PY_VERSION_HEX < 0x02040000\n+ #define METH_COEXIST 0\n+ #define PyDict_CheckExact(op) (Py_TYPE(op) == &PyDict_Type)\n+ #define PyDict_Contains(d,o) PySequence_Contains(d,o)\n+#endif\n+\n+#if PY_VERSION_HEX < 0x02050000\n+ typedef int Py_ssize_t;\n+ #define PY_SSIZE_T_MAX INT_MAX\n+ #define PY_SSIZE_T_MIN INT_MIN\n+ #define PY_FORMAT_SIZE_T ""\n+ #define PyInt_FromSsize_t(z) PyInt_FromLong(z)\n+ #define PyInt_AsSsize_t(o) PyInt_AsLong(o)\n+ #define PyNumber_Index(o) PyNumber_Int(o)\n+ #define PyIndex_Check(o) PyNumber_Check(o)\n+ #define PyErr_WarnEx(category, message, stacklevel) PyErr_Warn(category, message)\n+#endif\n+\n+#if PY_VERSION_HEX < 0x02060000\n+ #define Py_REFCNT(ob) (((PyObject*)(ob))->ob_refcnt)\n+ #define Py_TYPE(ob) (((PyObject*)(ob))->ob_type)\n+ #define Py_SIZE(ob) (((PyVarObject*)(ob))->ob_size)\n+ #define PyVarObject_HEAD_INIT(type, size) \\\n+ PyObject_HEAD_INIT(type) size,\n+ #define PyType_Modified(t)\n+\n+ typedef struct {\n+ void *buf;\n+ PyObject *obj;\n+ Py_ssize_t len;\n+ Py_ssize_t itemsize;\n+ int readonly;\n+ int ndim;\n+ char *format;\n+ Py_ssize_t *shape;\n+ Py_ssize_t *strides;\n+ Py_ssize_t *suboffsets;\n+ void *internal;\n+ } 
Py_buffer;\n+\n+ #define PyBUF_SIMPLE 0\n+ #define PyBUF_WRITABLE 0x0001\n+ #define PyBUF_FORMAT 0x0004\n+ #define PyBUF_ND 0x0008\n+ #define PyBUF_STRIDES (0x0010 | PyBUF_ND)\n+ #define PyBUF_C_CONTIGUOUS (0x0020 | PyBUF_STRIDES)\n+ #define PyBUF_F_CONTIGUOUS (0x0040 | PyBUF_STRIDES)\n+ #define PyBUF_ANY_CONTIGUOUS (0x0080 | PyBUF_STRIDES)\n+ #define PyBUF_INDIRECT (0x0100 | PyBUF_STRIDES)\n+\n+#endif\n+\n+#if PY_MAJOR_VERSION < 3\n+ #define __Pyx_BUILTIN_MODULE_NAME "__builtin__"\n+#else\n+ #define __Pyx_BUILTIN_MODULE_NAME "builtins"\n+#endif\n+\n+#if PY_MAJOR_VERSION >= 3\n+ #define Py_TPFLAGS_CHECKTYPES 0\n+ #define Py_TPFLAGS_HAVE_INDEX 0\n+#endif\n+\n+#if (PY_VERSION_HEX < 0x02060000) || (PY_MAJOR_VERSION >= 3)\n+ #define Py_TPFLAGS_HAVE_NEWBUFFER 0\n+#endif\n+\n+#if PY_MAJOR_VERSION >= 3\n+ #define PyBaseString_Type PyUnicode_Type\n+ #define PyStringObject PyUnicodeObject\n+ #define PyString_Type PyUnicode_Type\n+ #define PyString_Check PyUnicode_Check\n+ #define PyString_CheckExact PyUnicode_CheckExact\n+#endif\n+\n+#if PY_VERSION_HEX < 0x02060000\n+ #define PyBytesObject PyStringObject\n+ #define PyBytes_Type PyString_Type\n+ #define PyBytes_Check PyString_Check\n+ #define PyBytes_CheckExact PyString_CheckExact\n+ #define PyBytes_FromString PyString_FromString\n+ #define PyBytes_FromStringAndSize PyString_FromStringAndSize\n+ #define PyBytes_FromFormat PyString_FromFormat\n+ #define PyBytes_DecodeEscape PyString_DecodeEscape\n+ #define PyBytes_AsString PyString_AsString\n+ #define PyBytes_AsStringAndSize PyString_AsStringAndSize\n+ #define PyBytes_Size PyString_Size\n+ #define PyBytes_AS_STRING PyString_AS_STRING\n+ #define PyBytes_GET_SIZE PyString_GET_SIZE\n+ #define PyBytes_Repr PyString_Repr\n+ #define PyBytes_Concat '..b'\n+ if (!py_code) goto bad;\n+ py_frame = PyFrame_New(\n+ PyThreadState_GET(), /*PyThreadState *tstate,*/\n+ py_code, /*PyCodeObject *code,*/\n+ py_globals, /*PyObject *globals,*/\n+ 0 /*PyObject *locals*/\n+ );\n+ if (!py_frame) 
goto bad;\n+ py_frame->f_lineno = __pyx_lineno;\n+ PyTraceBack_Here(py_frame);\n+bad:\n+ Py_XDECREF(py_srcfile);\n+ Py_XDECREF(py_funcname);\n+ Py_XDECREF(py_code);\n+ Py_XDECREF(py_frame);\n+}\n+\n+static int __Pyx_InitStrings(__Pyx_StringTabEntry *t) {\n+ while (t->p) {\n+ #if PY_MAJOR_VERSION < 3\n+ if (t->is_unicode) {\n+ *t->p = PyUnicode_DecodeUTF8(t->s, t->n - 1, NULL);\n+ } else if (t->intern) {\n+ *t->p = PyString_InternFromString(t->s);\n+ } else {\n+ *t->p = PyString_FromStringAndSize(t->s, t->n - 1);\n+ }\n+ #else /* Python 3+ has unicode identifiers */\n+ if (t->is_unicode | t->is_str) {\n+ if (t->intern) {\n+ *t->p = PyUnicode_InternFromString(t->s);\n+ } else if (t->encoding) {\n+ *t->p = PyUnicode_Decode(t->s, t->n - 1, t->encoding, NULL);\n+ } else {\n+ *t->p = PyUnicode_FromStringAndSize(t->s, t->n - 1);\n+ }\n+ } else {\n+ *t->p = PyBytes_FromStringAndSize(t->s, t->n - 1);\n+ }\n+ #endif\n+ if (!*t->p)\n+ return -1;\n+ ++t;\n+ }\n+ return 0;\n+}\n+\n+/* Type Conversion Functions */\n+\n+static CYTHON_INLINE int __Pyx_PyObject_IsTrue(PyObject* x) {\n+ int is_true = x == Py_True;\n+ if (is_true | (x == Py_False) | (x == Py_None)) return is_true;\n+ else return PyObject_IsTrue(x);\n+}\n+\n+static CYTHON_INLINE PyObject* __Pyx_PyNumber_Int(PyObject* x) {\n+ PyNumberMethods *m;\n+ const char *name = NULL;\n+ PyObject *res = NULL;\n+#if PY_VERSION_HEX < 0x03000000\n+ if (PyInt_Check(x) || PyLong_Check(x))\n+#else\n+ if (PyLong_Check(x))\n+#endif\n+ return Py_INCREF(x), x;\n+ m = Py_TYPE(x)->tp_as_number;\n+#if PY_VERSION_HEX < 0x03000000\n+ if (m && m->nb_int) {\n+ name = "int";\n+ res = PyNumber_Int(x);\n+ }\n+ else if (m && m->nb_long) {\n+ name = "long";\n+ res = PyNumber_Long(x);\n+ }\n+#else\n+ if (m && m->nb_int) {\n+ name = "int";\n+ res = PyNumber_Long(x);\n+ }\n+#endif\n+ if (res) {\n+#if PY_VERSION_HEX < 0x03000000\n+ if (!PyInt_Check(res) && !PyLong_Check(res)) {\n+#else\n+ if (!PyLong_Check(res)) {\n+#endif\n+ 
PyErr_Format(PyExc_TypeError,\n+ "__%s__ returned non-%s (type %.200s)",\n+ name, name, Py_TYPE(res)->tp_name);\n+ Py_DECREF(res);\n+ return NULL;\n+ }\n+ }\n+ else if (!PyErr_Occurred()) {\n+ PyErr_SetString(PyExc_TypeError,\n+ "an integer is required");\n+ }\n+ return res;\n+}\n+\n+static CYTHON_INLINE Py_ssize_t __Pyx_PyIndex_AsSsize_t(PyObject* b) {\n+ Py_ssize_t ival;\n+ PyObject* x = PyNumber_Index(b);\n+ if (!x) return -1;\n+ ival = PyInt_AsSsize_t(x);\n+ Py_DECREF(x);\n+ return ival;\n+}\n+\n+static CYTHON_INLINE PyObject * __Pyx_PyInt_FromSize_t(size_t ival) {\n+#if PY_VERSION_HEX < 0x02050000\n+ if (ival <= LONG_MAX)\n+ return PyInt_FromLong((long)ival);\n+ else {\n+ unsigned char *bytes = (unsigned char *) &ival;\n+ int one = 1; int little = (int)*(unsigned char*)&one;\n+ return _PyLong_FromByteArray(bytes, sizeof(size_t), little, 0);\n+ }\n+#else\n+ return PyInt_FromSize_t(ival);\n+#endif\n+}\n+\n+static CYTHON_INLINE size_t __Pyx_PyInt_AsSize_t(PyObject* x) {\n+ unsigned PY_LONG_LONG val = __Pyx_PyInt_AsUnsignedLongLong(x);\n+ if (unlikely(val == (unsigned PY_LONG_LONG)-1 && PyErr_Occurred())) {\n+ return (size_t)-1;\n+ } else if (unlikely(val != (unsigned PY_LONG_LONG)(size_t)val)) {\n+ PyErr_SetString(PyExc_OverflowError,\n+ "value too large to convert to size_t");\n+ return (size_t)-1;\n+ }\n+ return (size_t)val;\n+}\n+\n+\n+#endif /* Py_PYTHON_H */\n' |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/csamtools.pxd --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/csamtools.pxd Thu Sep 07 17:55:18 2017 -0400 |
[ |
@@ -0,0 +1,283 @@ + +cdef extern from "string.h": + ctypedef int size_t + void *memcpy(void *dst,void *src,size_t len) + void *memmove(void *dst,void *src,size_t len) + void *memset(void *b,int c,size_t len) + +cdef extern from "stdlib.h": + void free(void *) + void *malloc(size_t) + void *calloc(size_t,size_t) + void *realloc(void *,size_t) + int c_abs "abs" (int) + void qsort(void *base, size_t nmemb, size_t size, + int (*compar)(void *,void *)) + +cdef extern from "stdio.h": + ctypedef struct FILE: + pass + FILE *fopen(char *,char *) + FILE *freopen(char *path, char *mode, FILE *stream) + int fileno(FILE *stream) + int dup2(int oldfd, int newfd) + int fflush(FILE *stream) + + FILE * stderr + FILE * stdout + int fclose(FILE *) + int sscanf(char *str,char *fmt,...) + int printf(char *fmt,...) + int sprintf(char *str,char *fmt,...) + int fprintf(FILE *ifile,char *fmt,...) + char *fgets(char *str,int size,FILE *ifile) + +cdef extern from "ctype.h": + int toupper(int c) + int tolower(int c) + +cdef extern from "unistd.h": + char *ttyname(int fd) + int isatty(int fd) + +cdef extern from "string.h": + int strcmp(char *s1, char *s2) + int strncmp(char *s1,char *s2,size_t len) + char *strcpy(char *dest,char *src) + char *strncpy(char *dest,char *src, size_t len) + char *strdup(char *) + char *strcat(char *,char *) + size_t strlen(char *s) + int memcmp( void * s1, void *s2, size_t len ) + +cdef extern from "Python.h": + long _Py_HashPointer(void*) + +cdef extern from "razf.h": + pass + +cdef extern from "stdint.h": + ctypedef int int64_t + ctypedef int int32_t + ctypedef int uint32_t + ctypedef int uint8_t + ctypedef int uint64_t + + +cdef extern from "bam.h": + + # IF _IOLIB=2, bamFile = BGZF, see bgzf.h + # samtools uses KNETFILE, check how this works + + ctypedef struct tamFile: + pass + + ctypedef struct bamFile: + pass + + ctypedef struct bam1_core_t: + int32_t tid + int32_t pos + uint32_t bin + uint32_t qual + uint32_t l_qname + uint32_t flag + uint32_t n_cigar + 
int32_t l_qseq + int32_t mtid + int32_t mpos + int32_t isize + + ctypedef struct bam1_t: + bam1_core_t core + int l_aux + int data_len + int m_data + uint8_t *data + + ctypedef struct bam_pileup1_t: + bam1_t *b + int32_t qpos + int indel + int level + uint32_t is_del + uint32_t is_head + uint32_t is_tail + + ctypedef int (*bam_pileup_f)(uint32_t tid, uint32_t pos, int n, bam_pileup1_t *pl, void *data) + + ctypedef int (*bam_fetch_f)(bam1_t *b, void *data) + + ctypedef struct bam_header_t: + int32_t n_targets + char **target_name + uint32_t *target_len + void *hash + void *rg2lib + int l_text + char *text + + ctypedef struct bam_index_t: + pass + + ctypedef struct bam_plbuf_t: + pass + + ctypedef struct bam_iter_t: + pass + + bam1_t * bam_init1() + void bam_destroy1(bam1_t *) + + bamFile razf_dopen(int data_fd, char *mode) + + int64_t bam_seek( bamFile fp, uint64_t voffset, int where) + int64_t bam_tell( bamFile fp ) + + # void bam_init_header_hash(bam_header_t *header) + + ############################################### + # stand-ins for samtools macros + uint32_t * bam1_cigar( bam1_t * b) + char * bam1_qname( bam1_t * b) + uint8_t * bam1_seq( bam1_t * b) + uint8_t * bam1_qual( bam1_t * b) + uint8_t * bam1_aux( bam1_t * b) + + ############################################### + # bam iterator interface + bam_iter_t bam_iter_query( bam_index_t *idx, int tid, int beg, int end) + + int bam_iter_read(bamFile fp, bam_iter_t iter, bam1_t *b) + + void bam_iter_destroy(bam_iter_t iter) + + ############################################### + + bam1_t * bam_dup1( bam1_t *src ) + + bam1_t * bam_copy1(bam1_t *bdst, bam1_t *bsrc) + bam_index_t *bam_index_load(char *f ) + + void bam_index_destroy(bam_index_t *idx) + + int bam_parse_region(bam_header_t *header, char *str, int *ref_id, int *begin, int *end) + + ############################################### + bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data) + + int bam_fetch(bamFile fp, bam_index_t *idx, int tid, int beg, 
int end, void *data, bam_fetch_f func) + + int bam_plbuf_push(bam1_t *b, bam_plbuf_t *buf) + + void bam_plbuf_destroy(bam_plbuf_t *buf) + ######################################## + # pileup iterator interface + ctypedef struct bam_plp_t: + pass + + ctypedef int (*bam_plp_auto_f)(void *data, bam1_t *b) + + bam_plp_t bam_plp_init( bam_plp_auto_f func, void *data) + int bam_plp_push( bam_plp_t iter, bam1_t *b) + bam_pileup1_t *bam_plp_next( bam_plp_t iter, int *_tid, int *_pos, int *_n_plp) + bam_pileup1_t *bam_plp_auto( bam_plp_t iter, int *_tid, int *_pos, int *_n_plp) + void bam_plp_set_mask(bam_plp_t iter, int mask) + void bam_plp_reset(bam_plp_t iter) + void bam_plp_destroy(bam_plp_t iter) + + ################################################## + + int bam_read1(bamFile fp, bam1_t *b) + + int bam_write1( bamFile fp, bam1_t *b) + + bam_header_t *bam_header_init() + + int bam_header_write( bamFile fp, bam_header_t *header) + + bam_header_t *bam_header_read( bamFile fp ) + + void bam_header_destroy(bam_header_t *header) + + bam1_t * bam_dup1( bam1_t *src ) + + bam1_t * bam_copy1(bam1_t *bdst, bam1_t *bsrc) + + uint8_t *bam_aux_get(bam1_t *b, char tag[2]) + + int bam_aux2i(uint8_t *s) + float bam_aux2f(uint8_t *s) + double bam_aux2d(uint8_t *s) + char bam_aux2A( uint8_t *s) + char *bam_aux2Z( uint8_t *s) + + int bam_reg2bin(uint32_t beg, uint32_t end) + + uint32_t bam_calend(bam1_core_t *c, uint32_t *cigar) + +cdef extern from "sam.h": + + ctypedef struct samfile_t_un: + tamFile tamr + bamFile bam + FILE *tamw + + ctypedef struct samfile_t: + int type + samfile_t_un x + bam_header_t *header + + samfile_t *samopen( char *fn, char * mode, void *aux) + + int sampileup( samfile_t *fp, int mask, bam_pileup_f func, void *data) + + void samclose(samfile_t *fp) + + int samread(samfile_t *fp, bam1_t *b) + + int samwrite(samfile_t *fp, bam1_t *b) + +cdef extern from "faidx.h": + + ctypedef struct faidx_t: + pass + + int fai_build(char *fn) + + void fai_destroy(faidx_t *fai) + + 
faidx_t *fai_load(char *fn) + + char *fai_fetch(faidx_t *fai, char *reg, int *len) + + int faidx_fetch_nseq(faidx_t *fai) + + char *faidx_fetch_seq(faidx_t *fai, char *c_name, + int p_beg_i, int p_end_i, int *len) + +cdef extern from "pysam_util.h": + + int pysam_pileup_next(bam1_t *b, + bam_plbuf_t *buf, + bam_pileup1_t ** plp, + int * tid, + int * pos, + int * n_plp ) + + + int pysam_dispatch(int argc, char *argv[] ) + + # stand-in functions for samtools macros + void pysam_bam_destroy1( bam1_t * b) + + # add *nbytes* into the variable length data of *src* at *pos* + bam1_t * pysam_bam_update( bam1_t * b, + size_t nbytes_old, + size_t nbytes_new, + uint8_t * pos ) + + # translate char to unsigned char + unsigned char pysam_translate_sequence( char s ) + + |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/csamtools.pyx --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/csamtools.pyx Thu Sep 07 17:55:18 2017 -0400 |
[ |
b'@@ -0,0 +1,2064 @@\n+# cython: embedsignature=True\n+# cython: profile=True\n+# adds doc-strings for sphinx\n+\n+import tempfile, os, sys, types, itertools, struct, ctypes\n+\n+from python_string cimport PyString_FromStringAndSize, PyString_AS_STRING\n+from python_exc cimport PyErr_SetString\n+\n+# defines imported from samtools\n+DEF SEEK_SET = 0\n+DEF SEEK_CUR = 1\n+DEF SEEK_END = 2\n+\n+## These are bits set in the flag.\n+## have to put these definitions here, in csamtools.pxd they got ignored\n+## @abstract the read is paired in sequencing, no matter whether it is mapped in a pair */\n+DEF BAM_FPAIRED =1\n+## @abstract the read is mapped in a proper pair */\n+DEF BAM_FPROPER_PAIR =2\n+## @abstract the read itself is unmapped; conflictive with BAM_FPROPER_PAIR */\n+DEF BAM_FUNMAP =4\n+## @abstract the mate is unmapped */\n+DEF BAM_FMUNMAP =8\n+## @abstract the read is mapped to the reverse strand */\n+DEF BAM_FREVERSE =16\n+## @abstract the mate is mapped to the reverse strand */\n+DEF BAM_FMREVERSE =32\n+## @abstract this is read1 */\n+DEF BAM_FREAD1 =64\n+## @abstract this is read2 */\n+DEF BAM_FREAD2 =128\n+## @abstract not primary alignment */\n+DEF BAM_FSECONDARY =256\n+## @abstract QC failure */\n+DEF BAM_FQCFAIL =512\n+## @abstract optical or PCR duplicate */\n+DEF BAM_FDUP =1024\n+\n+DEF BAM_CIGAR_SHIFT=4\n+DEF BAM_CIGAR_MASK=((1 << BAM_CIGAR_SHIFT) - 1)\n+\n+DEF BAM_CMATCH = 0\n+DEF BAM_CINS = 1\n+DEF BAM_CDEL = 2\n+DEF BAM_CREF_SKIP = 3\n+DEF BAM_CSOFT_CLIP = 4\n+DEF BAM_CHARD_CLIP = 5\n+DEF BAM_CPAD = 6\n+\n+#####################################################################\n+#####################################################################\n+#####################################################################\n+## private factory methods\n+#####################################################################\n+cdef class AlignedRead\n+cdef makeAlignedRead( bam1_t * src):\n+ \'\'\'enter src into AlignedRead.\'\'\'\n+ cdef AlignedRead 
dest\n+ dest = AlignedRead()\n+ # destroy dummy delegate created in constructor\n+ # to prevent memory leak.\n+ bam_destroy1(dest._delegate)\n+ dest._delegate = bam_dup1(src)\n+ return dest\n+\n+cdef class PileupProxy\n+cdef makePileupProxy( bam_pileup1_t * plp, int tid, int pos, int n ):\n+ cdef PileupProxy dest\n+ dest = PileupProxy()\n+ dest.plp = plp\n+ dest.tid = tid\n+ dest.pos = pos\n+ dest.n = n\n+ return dest\n+\n+cdef class PileupRead\n+cdef makePileupRead( bam_pileup1_t * src ):\n+ \'\'\'fill a PileupRead object from a bam_pileup1_t * object.\'\'\'\n+ cdef PileupRead dest\n+ dest = PileupRead()\n+ dest._alignment = makeAlignedRead( src.b )\n+ dest._qpos = src.qpos\n+ dest._indel = src.indel\n+ dest._level = src.level\n+ dest._is_del = src.is_del\n+ dest._is_head = src.is_head\n+ dest._is_tail = src.is_tail\n+ return dest\n+\n+#####################################################################\n+#####################################################################\n+#####################################################################\n+## Generic callbacks for inserting python callbacks.\n+#####################################################################\n+cdef int fetch_callback( bam1_t *alignment, void *f):\n+ \'\'\'callback for bam_fetch. \n+ \n+ calls function in *f* with a new :class:`AlignedRead` object as parameter.\n+ \'\'\'\n+ a = makeAlignedRead( alignment )\n+ (<object>f)(a)\n+\n+class PileupColumn(object): \n+ \'\'\'A pileup column. 
A pileup column contains \n+ all the reads that map to a certain target base.\n+\n+ tid \n+ chromosome ID as is defined in the header \n+ pos \n+ the target base coordinate (0-based) \n+ n \n+ number of reads mapping to this column \n+ pileups \n+ list of reads (:class:`pysam.PileupRead`) aligned to this column \n+ \'\'\' \n+ def __str__(self): \n+ return "\\t".j'..b'is_tail:\n+ def __get__(self):\n+ return self._is_tail\n+ property level:\n+ def __get__(self):\n+ return self._level\n+\n+class Outs:\n+ \'\'\'http://mail.python.org/pipermail/python-list/2000-June/038406.html\'\'\'\n+ def __init__(self, id = 1):\n+ self.streams = []\n+ self.id = id\n+\n+ def setdevice(self, filename):\n+ \'\'\'open an existing file, like "/dev/null"\'\'\'\n+ fd = os.open(filename, os.O_WRONLY)\n+ self.setfd(fd)\n+\n+ def setfile(self, filename):\n+ \'\'\'open a new file.\'\'\'\n+ fd = os.open(filename, os.O_WRONLY|os.O_CREAT, 0660);\n+ self.setfd(fd)\n+\n+ def setfd(self, fd):\n+ ofd = os.dup(self.id) # Save old stream on new unit.\n+ self.streams.append(ofd)\n+ sys.stdout.flush() # Buffered data goes to old stream.\n+ os.dup2(fd, self.id) # Open unit 1 on new stream.\n+ os.close(fd) # Close other unit (look out, caller.)\n+ \n+ def restore(self):\n+ \'\'\'restore previous output stream\'\'\'\n+ if self.streams:\n+ # the following was not sufficient, hence flush both stderr and stdout\n+ # os.fsync( self.id )\n+ sys.stdout.flush()\n+ sys.stderr.flush()\n+ os.dup2(self.streams[-1], self.id)\n+ os.close(self.streams[-1])\n+ del self.streams[-1]\n+\n+def _samtools_dispatch( method, args = () ):\n+ \'\'\'call ``method`` in samtools providing arguments in args.\n+ \n+ .. note:: \n+ This method redirects stdout and stderr to capture it \n+ from samtools. If for some reason stdout/stderr disappears\n+ the reason might be in this method.\n+\n+ .. note::\n+ The current implementation might only work on linux.\n+ \n+ .. 
note:: \n+ This method captures stdout and stderr using temporary files, \n+ which are then read into memory in their entirety. This method\n+ is slow and might cause large memory overhead. \n+\n+ See http://bytes.com/topic/c/answers/487231-how-capture-stdout-temporarily\n+ on the topic of redirecting stderr/stdout.\n+ \'\'\'\n+\n+ # note that debugging this module can be a problem\n+ # as stdout/stderr will not appear\n+\n+ # redirect stderr and stdout to file\n+\n+ # open files and redirect into it\n+ stderr_h, stderr_f = tempfile.mkstemp()\n+ stdout_h, stdout_f = tempfile.mkstemp()\n+\n+ # patch for `samtools view`\n+ # samtools `view` closes stdout, from which I can not\n+ # recover. Thus redirect output to file with -o option.\n+ if method == "view":\n+ if "-o" in args: raise ValueError("option -o is forbidden in samtools view")\n+ args = ( "-o", stdout_f ) + args\n+\n+ stdout_save = Outs( sys.stdout.fileno() )\n+ stdout_save.setfd( stdout_h )\n+ stderr_save = Outs( sys.stderr.fileno() )\n+ stderr_save.setfd( stderr_h )\n+\n+ # do the function call to samtools\n+ cdef char ** cargs\n+ cdef int i, n, retval\n+\n+ n = len(args)\n+ # allocate two more for first (dummy) argument (contains command)\n+ cargs = <char**>calloc( n+2, sizeof( char *) )\n+ cargs[0] = "samtools"\n+ cargs[1] = method\n+ for i from 0 <= i < n: cargs[i+2] = args[i]\n+ retval = pysam_dispatch(n+2, cargs)\n+ free( cargs )\n+\n+ # restore stdout/stderr. 
This will also flush, so\n+ # needs to be before reading back the file contents\n+ stdout_save.restore()\n+ stderr_save.restore()\n+\n+ # capture stderr/stdout.\n+ out_stderr = open( stderr_f, "r").readlines()\n+ out_stdout = open( stdout_f, "r").readlines()\n+\n+ # clean up files\n+ os.remove( stderr_f )\n+ os.remove( stdout_f )\n+\n+ return retval, out_stderr, out_stdout\n+\n+__all__ = ["Samfile", \n+ "Fastafile",\n+ "IteratorRow", \n+ "IteratorRowAll", \n+ "IteratorColumn", \n+ "AlignedRead", \n+ "PileupColumn", \n+ "PileupProxy", \n+ "PileupRead" ]\n+\n+ \n+\n' |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/csamtools.so |
b |
Binary file chimerascan/pysam/csamtools.so has changed |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/ctabix.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/ctabix.c Thu Sep 07 17:55:18 2017 -0400 |
b |
b'@@ -0,0 +1,12808 @@\n+/* Generated by Cython 0.13 on Mon Jan 31 00:58:34 2011 */\n+\n+#define PY_SSIZE_T_CLEAN\n+#include "Python.h"\n+#ifndef Py_PYTHON_H\n+ #error Python headers needed to compile C extensions, please install development version of Python.\n+#else\n+\n+#include <stddef.h> /* For offsetof */\n+#ifndef offsetof\n+#define offsetof(type, member) ( (size_t) & ((type*)0) -> member )\n+#endif\n+\n+#if !defined(WIN32) && !defined(MS_WINDOWS)\n+ #ifndef __stdcall\n+ #define __stdcall\n+ #endif\n+ #ifndef __cdecl\n+ #define __cdecl\n+ #endif\n+ #ifndef __fastcall\n+ #define __fastcall\n+ #endif\n+#endif\n+\n+#ifndef DL_IMPORT\n+ #define DL_IMPORT(t) t\n+#endif\n+#ifndef DL_EXPORT\n+ #define DL_EXPORT(t) t\n+#endif\n+\n+#ifndef PY_LONG_LONG\n+ #define PY_LONG_LONG LONG_LONG\n+#endif\n+\n+#if PY_VERSION_HEX < 0x02040000\n+ #define METH_COEXIST 0\n+ #define PyDict_CheckExact(op) (Py_TYPE(op) == &PyDict_Type)\n+ #define PyDict_Contains(d,o) PySequence_Contains(d,o)\n+#endif\n+\n+#if PY_VERSION_HEX < 0x02050000\n+ typedef int Py_ssize_t;\n+ #define PY_SSIZE_T_MAX INT_MAX\n+ #define PY_SSIZE_T_MIN INT_MIN\n+ #define PY_FORMAT_SIZE_T ""\n+ #define PyInt_FromSsize_t(z) PyInt_FromLong(z)\n+ #define PyInt_AsSsize_t(o) PyInt_AsLong(o)\n+ #define PyNumber_Index(o) PyNumber_Int(o)\n+ #define PyIndex_Check(o) PyNumber_Check(o)\n+ #define PyErr_WarnEx(category, message, stacklevel) PyErr_Warn(category, message)\n+#endif\n+\n+#if PY_VERSION_HEX < 0x02060000\n+ #define Py_REFCNT(ob) (((PyObject*)(ob))->ob_refcnt)\n+ #define Py_TYPE(ob) (((PyObject*)(ob))->ob_type)\n+ #define Py_SIZE(ob) (((PyVarObject*)(ob))->ob_size)\n+ #define PyVarObject_HEAD_INIT(type, size) \\\n+ PyObject_HEAD_INIT(type) size,\n+ #define PyType_Modified(t)\n+\n+ typedef struct {\n+ void *buf;\n+ PyObject *obj;\n+ Py_ssize_t len;\n+ Py_ssize_t itemsize;\n+ int readonly;\n+ int ndim;\n+ char *format;\n+ Py_ssize_t *shape;\n+ Py_ssize_t *strides;\n+ Py_ssize_t *suboffsets;\n+ void *internal;\n+ } 
Py_buffer;\n+\n+ #define PyBUF_SIMPLE 0\n+ #define PyBUF_WRITABLE 0x0001\n+ #define PyBUF_FORMAT 0x0004\n+ #define PyBUF_ND 0x0008\n+ #define PyBUF_STRIDES (0x0010 | PyBUF_ND)\n+ #define PyBUF_C_CONTIGUOUS (0x0020 | PyBUF_STRIDES)\n+ #define PyBUF_F_CONTIGUOUS (0x0040 | PyBUF_STRIDES)\n+ #define PyBUF_ANY_CONTIGUOUS (0x0080 | PyBUF_STRIDES)\n+ #define PyBUF_INDIRECT (0x0100 | PyBUF_STRIDES)\n+\n+#endif\n+\n+#if PY_MAJOR_VERSION < 3\n+ #define __Pyx_BUILTIN_MODULE_NAME "__builtin__"\n+#else\n+ #define __Pyx_BUILTIN_MODULE_NAME "builtins"\n+#endif\n+\n+#if PY_MAJOR_VERSION >= 3\n+ #define Py_TPFLAGS_CHECKTYPES 0\n+ #define Py_TPFLAGS_HAVE_INDEX 0\n+#endif\n+\n+#if (PY_VERSION_HEX < 0x02060000) || (PY_MAJOR_VERSION >= 3)\n+ #define Py_TPFLAGS_HAVE_NEWBUFFER 0\n+#endif\n+\n+#if PY_MAJOR_VERSION >= 3\n+ #define PyBaseString_Type PyUnicode_Type\n+ #define PyStringObject PyUnicodeObject\n+ #define PyString_Type PyUnicode_Type\n+ #define PyString_Check PyUnicode_Check\n+ #define PyString_CheckExact PyUnicode_CheckExact\n+#endif\n+\n+#if PY_VERSION_HEX < 0x02060000\n+ #define PyBytesObject PyStringObject\n+ #define PyBytes_Type PyString_Type\n+ #define PyBytes_Check PyString_Check\n+ #define PyBytes_CheckExact PyString_CheckExact\n+ #define PyBytes_FromString PyString_FromString\n+ #define PyBytes_FromStringAndSize PyString_FromStringAndSize\n+ #define PyBytes_FromFormat PyString_FromFormat\n+ #define PyBytes_DecodeEscape PyString_DecodeEscape\n+ #define PyBytes_AsString PyString_AsString\n+ #define PyBytes_AsStringAndSize PyString_AsStringAndSize\n+ #define PyBytes_Size PyString_Size\n+ #define PyBytes_AS_STRING PyString_AS_STRING\n+ #define PyBytes_GET_SIZE PyString_GET_SIZE\n+ #define PyBytes_Repr PyString_Repr\n+ #define PyBytes_Concat '..b'\n+ if (!py_code) goto bad;\n+ py_frame = PyFrame_New(\n+ PyThreadState_GET(), /*PyThreadState *tstate,*/\n+ py_code, /*PyCodeObject *code,*/\n+ py_globals, /*PyObject *globals,*/\n+ 0 /*PyObject *locals*/\n+ );\n+ if (!py_frame) 
goto bad;\n+ py_frame->f_lineno = __pyx_lineno;\n+ PyTraceBack_Here(py_frame);\n+bad:\n+ Py_XDECREF(py_srcfile);\n+ Py_XDECREF(py_funcname);\n+ Py_XDECREF(py_code);\n+ Py_XDECREF(py_frame);\n+}\n+\n+static int __Pyx_InitStrings(__Pyx_StringTabEntry *t) {\n+ while (t->p) {\n+ #if PY_MAJOR_VERSION < 3\n+ if (t->is_unicode) {\n+ *t->p = PyUnicode_DecodeUTF8(t->s, t->n - 1, NULL);\n+ } else if (t->intern) {\n+ *t->p = PyString_InternFromString(t->s);\n+ } else {\n+ *t->p = PyString_FromStringAndSize(t->s, t->n - 1);\n+ }\n+ #else /* Python 3+ has unicode identifiers */\n+ if (t->is_unicode | t->is_str) {\n+ if (t->intern) {\n+ *t->p = PyUnicode_InternFromString(t->s);\n+ } else if (t->encoding) {\n+ *t->p = PyUnicode_Decode(t->s, t->n - 1, t->encoding, NULL);\n+ } else {\n+ *t->p = PyUnicode_FromStringAndSize(t->s, t->n - 1);\n+ }\n+ } else {\n+ *t->p = PyBytes_FromStringAndSize(t->s, t->n - 1);\n+ }\n+ #endif\n+ if (!*t->p)\n+ return -1;\n+ ++t;\n+ }\n+ return 0;\n+}\n+\n+/* Type Conversion Functions */\n+\n+static CYTHON_INLINE int __Pyx_PyObject_IsTrue(PyObject* x) {\n+ int is_true = x == Py_True;\n+ if (is_true | (x == Py_False) | (x == Py_None)) return is_true;\n+ else return PyObject_IsTrue(x);\n+}\n+\n+static CYTHON_INLINE PyObject* __Pyx_PyNumber_Int(PyObject* x) {\n+ PyNumberMethods *m;\n+ const char *name = NULL;\n+ PyObject *res = NULL;\n+#if PY_VERSION_HEX < 0x03000000\n+ if (PyInt_Check(x) || PyLong_Check(x))\n+#else\n+ if (PyLong_Check(x))\n+#endif\n+ return Py_INCREF(x), x;\n+ m = Py_TYPE(x)->tp_as_number;\n+#if PY_VERSION_HEX < 0x03000000\n+ if (m && m->nb_int) {\n+ name = "int";\n+ res = PyNumber_Int(x);\n+ }\n+ else if (m && m->nb_long) {\n+ name = "long";\n+ res = PyNumber_Long(x);\n+ }\n+#else\n+ if (m && m->nb_int) {\n+ name = "int";\n+ res = PyNumber_Long(x);\n+ }\n+#endif\n+ if (res) {\n+#if PY_VERSION_HEX < 0x03000000\n+ if (!PyInt_Check(res) && !PyLong_Check(res)) {\n+#else\n+ if (!PyLong_Check(res)) {\n+#endif\n+ 
PyErr_Format(PyExc_TypeError,\n+ "__%s__ returned non-%s (type %.200s)",\n+ name, name, Py_TYPE(res)->tp_name);\n+ Py_DECREF(res);\n+ return NULL;\n+ }\n+ }\n+ else if (!PyErr_Occurred()) {\n+ PyErr_SetString(PyExc_TypeError,\n+ "an integer is required");\n+ }\n+ return res;\n+}\n+\n+static CYTHON_INLINE Py_ssize_t __Pyx_PyIndex_AsSsize_t(PyObject* b) {\n+ Py_ssize_t ival;\n+ PyObject* x = PyNumber_Index(b);\n+ if (!x) return -1;\n+ ival = PyInt_AsSsize_t(x);\n+ Py_DECREF(x);\n+ return ival;\n+}\n+\n+static CYTHON_INLINE PyObject * __Pyx_PyInt_FromSize_t(size_t ival) {\n+#if PY_VERSION_HEX < 0x02050000\n+ if (ival <= LONG_MAX)\n+ return PyInt_FromLong((long)ival);\n+ else {\n+ unsigned char *bytes = (unsigned char *) &ival;\n+ int one = 1; int little = (int)*(unsigned char*)&one;\n+ return _PyLong_FromByteArray(bytes, sizeof(size_t), little, 0);\n+ }\n+#else\n+ return PyInt_FromSize_t(ival);\n+#endif\n+}\n+\n+static CYTHON_INLINE size_t __Pyx_PyInt_AsSize_t(PyObject* x) {\n+ unsigned PY_LONG_LONG val = __Pyx_PyInt_AsUnsignedLongLong(x);\n+ if (unlikely(val == (unsigned PY_LONG_LONG)-1 && PyErr_Occurred())) {\n+ return (size_t)-1;\n+ } else if (unlikely(val != (unsigned PY_LONG_LONG)(size_t)val)) {\n+ PyErr_SetString(PyExc_OverflowError,\n+ "value too large to convert to size_t");\n+ return (size_t)-1;\n+ }\n+ return (size_t)val;\n+}\n+\n+\n+#endif /* Py_PYTHON_H */\n' |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/ctabix.pxd --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/ctabix.pxd Thu Sep 07 17:55:18 2017 -0400 |
b |
@@ -0,0 +1,171 @@ + +cdef extern from "string.h": + ctypedef int size_t + void *memcpy(void *dst,void *src,size_t len) + void *memmove(void *dst,void *src,size_t len) + void *memset(void *b,int c,size_t len) + char *strtok_r(char *str, char *delim, char **saveptr) + char *strncpy(char *dest, char *src, size_t n) + void *memchr(void *s, int c, size_t n) + +cdef extern from "stdlib.h": + void free(void *) + void *malloc(size_t) + void *calloc(size_t,size_t) + void *realloc(void *,size_t) + void qsort(void *base, size_t nmemb, size_t size, + int (*compar)(void *,void *)) + int c_abs "abs" (int) + int atoi( char *nptr) + long atol( char *nptr) + double atof( char *nptr) + +cdef extern from "stdio.h": + ctypedef struct FILE: + pass + FILE *fopen(char *,char *) + FILE *freopen(char *path, char *mode, FILE *stream) + int fileno(FILE *stream) + int dup2(int oldfd, int newfd) + int fflush(FILE *stream) + + FILE * stderr + FILE * stdout + int fclose(FILE *) + int sscanf(char *str,char *fmt,...) + int printf(char *str,char *fmt,...) + int sprintf(char *str,char *fmt,...) + int fprintf(FILE *ifile,char *fmt,...) 
+ char *fgets(char *str,int size,FILE *ifile) + +cdef extern from "ctype.h": + int toupper(int c) + int tolower(int c) + +cdef extern from "sys/types.h": + pass + +cdef extern from "sys/stat.h": + pass + +cdef extern from "fcntl.h": + int open(char *pathname, int flags) + +cdef extern from "unistd.h": + ctypedef int ssize_t + char *ttyname(int fd) + int isatty(int fd) + ssize_t read(int fd, void *buf, size_t count) + +cdef extern from "string.h": + int strcmp(char *s1, char *s2) + int strncmp(char *s1,char *s2,size_t len) + char *strcpy(char *dest,char *src) + char *strncpy(char *dest,char *src, size_t len) + char *strdup(char *) + char *strcat(char *,char *) + size_t strlen(char *s) + int memcmp( void * s1, void *s2, size_t len ) + +cdef extern from "stdint.h": + ctypedef int int64_t + ctypedef int int32_t + ctypedef int uint32_t + ctypedef int uint8_t + ctypedef int uint64_t + +cdef extern from "Python.h": + ctypedef struct FILE + FILE* PyFile_AsFile(object) + char *fgets(char *str, int size, FILE *ifile) + int feof(FILE *stream) + size_t strlen(char *s) + size_t getline(char **lineptr, size_t *n, FILE *stream) + char *strstr(char *, char *) + char *strchr(char *string, int c) + int fileno(FILE *stream) + +cdef extern from "bgzf.h": + + ctypedef struct BGZF: + pass + + int64_t bgzf_seek(BGZF* fp, int64_t pos, int where) + + BGZF * bgzf_open(char * path, char * mode) + + int bgzf_write(BGZF * fp, void* data, int length) + + int bgzf_close(BGZF* fp) + +# tabix support +cdef extern from "tabix.h": + + ctypedef struct ti_index_t: + pass + + ctypedef struct tabix_t: + BGZF *fp + ti_index_t *idx + char *fn + char *fnidx + + ctypedef struct ti_iter_t: + pass + + ctypedef struct ti_conf_t: + int32_t preset + int32_t sc, bc, ec + int32_t meta_char, line_skip + + tabix_t *ti_open(char *fn, char *fnidx) + + int ti_lazy_index_load(tabix_t *t) + + void ti_close(tabix_t *t) + + ti_iter_t ti_query(tabix_t *t, char *name, int beg, int end) + ti_iter_t ti_queryi(tabix_t *t, int 
tid, int beg, int end) + ti_iter_t ti_querys(tabix_t *t, char *reg) + char * ti_read(tabix_t *t, ti_iter_t iter, int *len) + + # Get the list of sequence names. Each "char*" pointer points to a + # internal member of the index, so DO NOT modify the returned + # pointer; otherwise the index will be corrupted. The returned + # pointer should be freed by a single free() call by the routine + # calling this function. The number of sequences is returned at *n + char **ti_seqname(ti_index_t *idx, int *n) + + + # Destroy the iterator + void ti_iter_destroy(ti_iter_t iter) + + # Build the index for file <fn>. File <fn>.tbi will be generated + # and overwrite the file of the same name. Return -1 on failure. */ + int ti_index_build(char *fn, ti_conf_t *conf) + + #/* Load the index from file <fn>.tbi. If <fn> is a URL and the index + # * file is not in the working directory, <fn>.tbi will be + # * downloaded. Return NULL on failure. */ + ti_index_t *ti_index_load( char *fn) + + ti_index_t *ti_index_load_local(char *fnidx) + + #/* Destroy the index */ + void ti_index_destroy(ti_index_t *idx) + + #/* Parse a region like: chr2, chr2:100, chr2:100-200. Return -1 on failure. */ + int ti_parse_region( ti_index_t *idx, char *str, int *tid, int *begin, int *end) + + int ti_get_tid( ti_index_t *idx, char *name) + + # /* Get the iterator pointing to the first record at the current file + # * position. If the file is just openned, the iterator points to the + # * first record in the file. */ + ti_iter_t ti_iter_first() + + # /* Get the iterator pointing to the first record in region tid:beg-end */ + ti_iter_t ti_iter_query( ti_index_t *idx, int tid, int beg, int end) + + # /* Get the data line pointed by the iterator and iterate to the next record. */ + # char *ti_iter_read(BGZF *fp, ti_iter_t iter, int *len) |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/ctabix.pyx --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/ctabix.pyx Thu Sep 07 17:55:18 2017 -0400 |
[ |
b'@@ -0,0 +1,881 @@\n+# cython: embedsignature=True\n+# adds doc-strings for sphinx\n+\n+import tempfile, os, sys, types, itertools, struct, ctypes\n+\n+cdef class Tabixfile:\n+ \'\'\'*(filename, mode=\'r\')*\n+\n+ opens a :term:`tabix file` for reading. A missing\n+ index (*filename* + ".tbi") will raise an exception.\n+ \'\'\'\n+\n+ cdef char * filename\n+\n+ # pointer to tabixfile\n+ cdef tabix_t * tabixfile\n+\n+ def __cinit__(self, *args, **kwargs ):\n+ self.tabixfile = NULL\n+ self._open( *args, **kwargs )\n+\n+ def _isOpen( self ):\n+ \'\'\'return true if samfile has been opened.\'\'\'\n+ return self.tabixfile != NULL\n+\n+ def _open( self, \n+ char * filename, \n+ mode =\'r\',\n+ ):\n+ \'\'\'open a :term:`tabix file` for reading.\n+ \'\'\'\n+\n+ assert mode in ( "r",), "invalid file opening mode `%s`" % mode\n+\n+ # close a previously opened file\n+ if self.tabixfile != NULL: self.close()\n+ self.tabixfile = NULL\n+\n+ self.filename = filename\n+ filename_index = filename + ".tbi"\n+\n+ if mode[0] == \'w\':\n+ # open file for writing\n+ pass\n+\n+ elif mode[0] == "r":\n+ # open file for reading\n+ if not os.path.exists( self.filename ):\n+ raise IOError( "file `%s` not found" % self.filename)\n+\n+ if not os.path.exists( filename_index ):\n+ raise IOError( "index `%s` not found" % filename_index)\n+\n+ # open file and load index\n+ self.tabixfile = ti_open( self.filename, filename_index )\n+\n+ if self.tabixfile == NULL:\n+ raise IOError("could not open file `%s`" % filename )\n+\n+ def _parseRegion( self, \n+ reference = None, \n+ start = None, \n+ end = None, \n+ region = None ):\n+ \'\'\'parse region information.\n+\n+ raise ValueError for for invalid regions.\n+\n+ returns a tuple of region, tid, start and end. 
Region\n+ is a valid samtools :term:`region` or None if the region\n+ extends over the whole file.\n+\n+ Note that regions are 1-based, while start,end are python coordinates.\n+ \'\'\'\n+ ti_lazy_index_load( self.tabixfile )\n+\n+ cdef int rtid\n+ cdef int rstart\n+ cdef int rend\n+ cdef int max_pos\n+ max_pos = 2 << 29\n+\n+ rtid = rstart = rend = 0\n+\n+ # translate to a region\n+ if reference:\n+ if start != None and end != None:\n+ region = "%s:%i-%i" % (reference, start+1, end)\n+ elif start == None and end != None:\n+ region = "%s:%i-%i" % (reference, 1, end)\n+ elif end == None and start != None:\n+ region = "%s:%i-%i" % (reference, start+1, max_pos-1)\n+ else:\n+ region = reference\n+\n+ if region:\n+ ti_parse_region( self.tabixfile.idx, region, &rtid, &rstart, &rend) \n+ if rtid < 0: raise ValueError( "invalid region `%s`" % region )\n+ if rstart > rend: raise ValueError( \'invalid region: start (%i) > end (%i)\' % (rstart, rend) )\n+ if not 0 <= rstart < max_pos: raise ValueError( \'start out of range (%i)\' % rstart )\n+ if not 0 <= rend < max_pos: raise ValueError( \'end out of range (%i)\' % rend )\n+\n+ return region, rtid, rstart, rend\n+\n+ def fetch( self, \n+ reference = None,\n+ start = None, \n+ end = None, \n+ region = None,\n+ parser = None ):\n+ \'\'\'\n+ \n+ fetch one or more rows in a :term:`region` using 0-based indexing. The region is specified by\n+ :term:`reference`, *start* and *end*. Alternatively, a samtools :term:`region` string can be supplied.\n+\n+ Without *reference* or *region* all entries will be fetched. 
\n+ \n+ If only *reference* is s'..b'E = 64 * 1024\n+\n+ fp = bgzf_open( filename_out, "w")\n+ if fp == NULL:\n+ raise IOError( "could not open \'%s\' for writing" )\n+\n+ fd_src = open(filename_in, O_RDONLY)\n+ if fd_src == 0:\n+ raise IOError( "could not open \'%s\' for reading" )\n+\n+ buffer = malloc(WINDOW_SIZE)\n+\n+ while c > 0:\n+ c = read(fd_src, buffer, WINDOW_SIZE)\n+ r = bgzf_write(fp, buffer, c)\n+ if r < 0:\n+ free( buffer )\n+ raise OSError("writing failed")\n+ \n+ free( buffer )\n+ r = bgzf_close(fp)\n+ if r < 0: raise OSError("writing failed")\n+\n+def tabix_index( filename, \n+ force = False,\n+ seq_col = None, \n+ start_col = None, \n+ end_col = None,\n+ preset = None,\n+ meta_char = "#",\n+ zerobased = False,\n+ ):\n+ \'\'\'\n+ index tab-separated *filename* using tabix.\n+\n+ An existing index will not be overwritten unless\n+ *force* is set.\n+\n+ The index will be built from coordinates\n+ in columns *seq_col*, *start_col* and *end_col*.\n+\n+ The contents of *filename* have to be sorted by \n+ contig and position - the method does not check\n+ if the file is sorted.\n+\n+ Column indices are 0-based. Coordinates in the file\n+ are assumed to be 1-based.\n+\n+ If *preset* is provided, the column coordinates\n+ are taken from a preset. Valid values for preset\n+ are "gff", "bed", "sam", "vcf", psltbl", "pileup".\n+ \n+ Lines beginning with *meta_char* and the first\n+ *line_skip* lines will be skipped.\n+ \n+ If *filename* does not end in ".gz", it will be automatically\n+ compressed. The original file will be removed and only the \n+ compressed file will be retained. 
\n+\n+ If *filename* ends in *gz*, the file is assumed to be already\n+ compressed with bgzf.\n+\n+ returns the filename of the compressed data\n+ \'\'\'\n+ \n+ if not os.path.exists(filename): raise IOError("No such file \'%s\'" % filename)\n+\n+ if not filename.endswith(".gz"): \n+ \n+ tabix_compress( filename, filename + ".gz", force = force )\n+ os.unlink( filename )\n+ filename += ".gz"\n+\n+ if not force and os.path.exists(filename + ".tbi" ):\n+ raise IOError( "Filename \'%s.tbi\' already exists, use *force* to overwrite" )\n+\n+ # columns (1-based)\n+ # preset-code, contig, start, end, metachar for commends, lines to ignore at beginning\n+ # 0 is a missing column\n+ preset2conf = {\n+ \'gff\' : ( 0, 1, 4, 5, ord(\'#\'), 0 ),\n+ \'bed\' : ( 0x10000, 1, 2, 3, ord(\'#\'), 0 ),\n+ \'psltbl\' : ( 0x10000, 15, 17, 18, ord(\'#\'), 0 ),\n+ \'sam\' : ( 1, 3, 4, 0, ord(\'#\'), 0 ),\n+ \'vcf\' : ( 2, 1, 2, 0, ord(\'#\'), 0 ),\n+ \'pileup\': (3, 1, 2, 0, ord(\'#\'), 0 ),\n+ }\n+\n+ if preset:\n+ try:\n+ conf_data = preset2conf[preset]\n+ except KeyError:\n+ raise KeyError( "unknown preset \'%s\', valid presets are \'%s\'" % (preset, ",".join(preset2conf.keys() )))\n+ else:\n+ if end_col == None: end_col = -1\n+ preset = 0\n+\n+ # note that tabix internally works with 0-based coordinates and open/closed intervals.\n+ # When using a preset, conversion is automatically taken care of.\n+ # Otherwise, the coordinates are assumed to be 1-based closed intervals and \n+ # -1 is subtracted from the start coordinate. To avoid doing this, set\n+ # the TI_FLAG_UCSC=0x10000 flag:\n+ if zerobased: preset = preset | 0x10000\n+\n+ conf_data = (preset, seq_col+1, start_col+1, end_col+1, ord(meta_char), 0)\n+ \n+ cdef ti_conf_t conf\n+ conf.preset, conf.sc, conf.bc, conf.ec, conf.meta_char, conf.line_skip = conf_data\n+\n+ ti_index_build( filename, &conf)\n+ \n+ return filename\n+ \n+__all__ = ["tabix_index", \n+ "tabix_compress",\n+ "Tabixfile", \n+ "asTuple",\n+ "asGTF",\n+ ]\n' |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/ctabix.so |
b |
Binary file chimerascan/pysam/ctabix.so has changed |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/namedtuple.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/namedtuple.py Thu Sep 07 17:55:18 2017 -0400 |
[ |
@@ -0,0 +1,117 @@ +from operator import itemgetter as _itemgetter +from keyword import iskeyword as _iskeyword +import sys as _sys + +def namedtuple(typename, field_names, verbose=False, rename=False): + """Returns a new subclass of tuple with named fields. + + >>> Point = namedtuple('Point', 'x y') + >>> Point.__doc__ # docstring for the new class + 'Point(x, y)' + >>> p = Point(11, y=22) # instantiate with positional args or keywords + >>> p[0] + p[1] # indexable like a plain tuple + 33 + >>> x, y = p # unpack like a regular tuple + >>> x, y + (11, 22) + >>> p.x + p.y # fields also accessable by name + 33 + >>> d = p._asdict() # convert to a dictionary + >>> d['x'] + 11 + >>> Point(**d) # convert from a dictionary + Point(x=11, y=22) + >>> p._replace(x=100) # _replace() is like str.replace() but targets named fields + Point(x=100, y=22) + + """ + + # Parse and validate the field names. Validation serves two purposes, + # generating informative error messages and preventing template injection attacks. 
+ if isinstance(field_names, basestring): + field_names = field_names.replace(',', ' ').split() # names separated by whitespace and/or commas + field_names = tuple(map(str, field_names)) + if rename: + names = list(field_names) + seen = set() + for i, name in enumerate(names): + if (not min(c.isalnum() or c=='_' for c in name) or _iskeyword(name) + or not name or name[0].isdigit() or name.startswith('_') + or name in seen): + names[i] = '_%d' % i + seen.add(name) + field_names = tuple(names) + for name in (typename,) + field_names: + if not min(c.isalnum() or c=='_' for c in name): + raise ValueError('Type names and field names can only contain alphanumeric characters and underscores: %r' % name) + if _iskeyword(name): + raise ValueError('Type names and field names cannot be a keyword: %r' % name) + if name[0].isdigit(): + raise ValueError('Type names and field names cannot start with a number: %r' % name) + seen_names = set() + for name in field_names: + if name.startswith('_') and not rename: + raise ValueError('Field names cannot start with an underscore: %r' % name) + if name in seen_names: + raise ValueError('Encountered duplicate field name: %r' % name) + seen_names.add(name) + + # Create and fill-in the class template + numfields = len(field_names) + argtxt = repr(field_names).replace("'", "")[1:-1] # tuple repr without parens or quotes + reprtxt = ', '.join('%s=%%r' % name for name in field_names) + template = '''class %(typename)s(tuple): + '%(typename)s(%(argtxt)s)' \n + __slots__ = () \n + _fields = %(field_names)r \n + def __new__(_cls, %(argtxt)s): + return _tuple.__new__(_cls, (%(argtxt)s)) \n + @classmethod + def _make(cls, iterable, new=tuple.__new__, len=len): + 'Make a new %(typename)s object from a sequence or iterable' + result = new(cls, iterable) + if len(result) != %(numfields)d: + raise TypeError('Expected %(numfields)d arguments, got %%d' %% len(result)) + return result \n + def __repr__(self): + return '%(typename)s(%(reprtxt)s)' %% self 
\n + def _asdict(self): + 'Return a new dict which maps field names to their values' + return dict(zip(self._fields, self)) \n + def _replace(_self, **kwds): + 'Return a new %(typename)s object replacing specified fields with new values' + result = _self._make(map(kwds.pop, %(field_names)r, _self)) + if kwds: + raise ValueError('Got unexpected field names: %%r' %% kwds.keys()) + return result \n + def __getnewargs__(self): + return tuple(self) \n\n''' % locals() + for i, name in enumerate(field_names): + template += ' %s = _property(_itemgetter(%d))\n' % (name, i) + if verbose: + print template + + # Execute the template string in a temporary namespace + namespace = dict(_itemgetter=_itemgetter, __name__='namedtuple_%s' % typename, + _property=property, _tuple=tuple) + try: + exec template in namespace + except SyntaxError, e: + raise SyntaxError(e.message + ':\n' + template) + result = namespace[typename] + + # For pickling to work, the __module__ variable needs to be set to the frame + # where the named tuple is created. Bypass this step in enviroments where + # sys._getframe is not defined (Jython for example) or sys._getframe is not + # defined for arguments greater than 0 (IronPython). + try: + result.__module__ = _sys._getframe(1).f_globals.get('__name__', '__main__') + except (AttributeError, ValueError): + pass + + return result + + + + + |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/pysam_util.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/pysam_util.c Thu Sep 07 17:55:18 2017 -0400 |
[ |
b'@@ -0,0 +1,290 @@\n+#include <ctype.h>\n+#include <assert.h>\n+#include "bam.h"\n+#include "khash.h"\n+#include "ksort.h"\n+#include "bam_endian.h"\n+#include "knetfile.h"\n+#include "pysam_util.h"\n+\n+// #######################################################\n+// utility routines to avoid using callbacks in bam_fetch\n+// taken from bam_index.c\n+// The order of the following declarations is important.\n+// #######################################################\n+\n+typedef struct\n+{\n+ uint64_t u, v;\n+} pair64_t;\n+\n+#define pair64_lt(a,b) ((a).u < (b).u)\n+\n+typedef struct {\n+\tuint32_t m, n;\n+\tpair64_t *list;\n+} bam_binlist_t;\n+\n+typedef struct {\n+\tint32_t n, m;\n+\tuint64_t *offset;\n+} bam_lidx_t;\n+\n+KSORT_INIT(my_off, pair64_t, pair64_lt);\n+KHASH_MAP_INIT_INT(my_i, bam_binlist_t);\n+\n+struct __bam_index_t\n+{\n+ int32_t n;\n+ khash_t(my_i) **index;\n+ bam_lidx_t *index2;\n+};\n+\n+typedef struct __linkbuf_t {\n+\tbam1_t b;\n+\tuint32_t beg, end;\n+\tstruct __linkbuf_t *next;\n+} lbnode_t;\n+\n+typedef struct {\n+\tint cnt, n, max;\n+\tlbnode_t **buf;\n+} mempool_t;\n+\n+struct __bam_plbuf_t {\n+\tmempool_t *mp;\n+\tlbnode_t *head, *tail, *dummy;\n+\tbam_pileup_f func;\n+\tvoid *func_data;\n+\tint32_t tid, pos, max_tid, max_pos;\n+\tint max_pu, is_eof;\n+\tbam_pileup1_t *pu;\n+\tint flag_mask;\n+};\n+\n+static mempool_t *mp_init()\n+{\n+\tmempool_t *mp;\n+\tmp = (mempool_t*)calloc(1, sizeof(mempool_t));\n+\treturn mp;\n+}\n+static void mp_destroy(mempool_t *mp)\n+{\n+\tint k;\n+\tfor (k = 0; k < mp->n; ++k) {\n+\t\tfree(mp->buf[k]->b.data);\n+\t\tfree(mp->buf[k]);\n+\t}\n+\tfree(mp->buf);\n+\tfree(mp);\n+}\n+static inline lbnode_t *mp_alloc(mempool_t *mp)\n+{\n+\t++mp->cnt;\n+\tif (mp->n == 0) return (lbnode_t*)calloc(1, sizeof(lbnode_t));\n+\telse return mp->buf[--mp->n];\n+}\n+static inline void mp_free(mempool_t *mp, lbnode_t *p)\n+{\n+\t--mp->cnt; p->next = 0; // clear lbnode_t::next here\n+\tif (mp->n == mp->max) {\n+\t\tmp->max = 
mp->max? mp->max<<1 : 256;\n+\t\tmp->buf = (lbnode_t**)realloc(mp->buf, sizeof(lbnode_t*) * mp->max);\n+\t}\n+\tmp->buf[mp->n++] = p;\n+}\n+\n+static inline int resolve_cigar(bam_pileup1_t *p, uint32_t pos)\n+{\n+\tunsigned k;\n+\tbam1_t *b = p->b;\n+\tbam1_core_t *c = &b->core;\n+\tuint32_t x = c->pos, y = 0;\n+\tint ret = 1, is_restart = 1;\n+\n+\tif (c->flag&BAM_FUNMAP) return 0; // unmapped read\n+\tassert(x <= pos); // otherwise a bug\n+\tp->qpos = -1; p->indel = 0; p->is_del = p->is_head = p->is_tail = 0;\n+\tfor (k = 0; k < c->n_cigar; ++k) {\n+\t\tint op = bam1_cigar(b)[k] & BAM_CIGAR_MASK; // operation\n+\t\tint l = bam1_cigar(b)[k] >> BAM_CIGAR_SHIFT; // length\n+\t\tif (op == BAM_CMATCH) { // NOTE: this assumes the first and the last operation MUST BE a match or a clip\n+\t\t\tif (x + l > pos) { // overlap with pos\n+\t\t\t\tp->indel = p->is_del = 0;\n+\t\t\t\tp->qpos = y + (pos - x);\n+\t\t\t\tif (x == pos && is_restart) p->is_head = 1;\n+\t\t\t\tif (x + l - 1 == pos) { // come to the end of a match\n+\t\t\t\t\tif (k < c->n_cigar - 1) { // there are additional operation(s)\n+\t\t\t\t\t\tuint32_t cigar = bam1_cigar(b)[k+1]; // next CIGAR\n+\t\t\t\t\t\tint op_next = cigar&BAM_CIGAR_MASK; // next CIGAR operation\n+\t\t\t\t\t\tif (op_next == BAM_CDEL) p->indel = -(int32_t)(cigar>>BAM_CIGAR_SHIFT); // del\n+\t\t\t\t\t\telse if (op_next == BAM_CINS) p->indel = cigar>>BAM_CIGAR_SHIFT; // ins\n+\t\t\t\t\t\tif (op_next == BAM_CSOFT_CLIP || op_next == BAM_CREF_SKIP || op_next == BAM_CHARD_CLIP)\n+\t\t\t\t\t\t\tp->is_tail = 1; // tail\n+\t\t\t\t\t} else p->is_tail = 1; // this is the last operation; set tail\n+\t\t\t\t}\n+\t\t\t}\n+\t\t\tx += l; y += l;\n+\t\t} else if (op == BAM_CDEL) { // then set ->is_del\n+\t\t\tif (x + l > pos) {\n+\t\t\t\tp->indel = 0; p->is_del = 1;\n+\t\t\t\tp->qpos = y + (pos - x);\n+\t\t\t}\n+\t\t\tx += l;\n+\t\t} else if (op == BAM_CREF_SKIP) x += l;\n+\t\telse if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l;\n+\t\tis_restart = (op 
== BAM_CREF_SKIP || op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP);\n+\t\tif (x > pos) {\n+\t\t\tif (op == BAM_CREF_SKIP) ret = 0; // then do not put it into pileup at all\n+\t\t\tbreak;\n+\t\t}\n+\t}\n+\tassert(x > pos); // otherwise a bug\n+\treturn ret;\n+\n+}\n+// the following code has been taken from bam_plbuf_'..b'plp);\n+ if (plp == NULL) return 0;\n+ return 1;\n+}\n+\n+// pysam dispatch function to emulate the samtools\n+// command line within python.\n+// taken from the main function in bamtk.c\n+// added code to reset getopt\n+extern int main_samview(int argc, char *argv[]);\n+extern int main_import(int argc, char *argv[]);\n+extern int bam_pileup(int argc, char *argv[]);\n+extern int bam_merge(int argc, char *argv[]);\n+extern int bam_sort(int argc, char *argv[]);\n+extern int bam_index(int argc, char *argv[]);\n+extern int faidx_main(int argc, char *argv[]);\n+extern int bam_mating(int argc, char *argv[]);\n+extern int bam_rmdup(int argc, char *argv[]);\n+extern int glf3_view_main(int argc, char *argv[]);\n+extern int bam_flagstat(int argc, char *argv[]);\n+extern int bam_fillmd(int argc, char *argv[]);\n+\n+int pysam_dispatch(int argc, char *argv[] )\n+{\n+\n+#ifdef _WIN32\n+ setmode(fileno(stdout), O_BINARY);\n+ setmode(fileno(stdin), O_BINARY);\n+#ifdef _USE_KNETFILE\n+ knet_win32_init();\n+#endif\n+#endif\n+\n+ extern int optind;\n+ \n+ // reset getop\n+ optind = 1;\n+\n+ if (argc < 2) return 1;\n+\n+ if (strcmp(argv[1], "view") == 0) return main_samview(argc-1, argv+1);\n+ else if (strcmp(argv[1], "import") == 0) return main_import(argc-1, argv+1);\n+ else if (strcmp(argv[1], "pileup") == 0) return bam_pileup(argc-1, argv+1);\n+ else if (strcmp(argv[1], "merge") == 0) return bam_merge(argc-1, argv+1);\n+ else if (strcmp(argv[1], "sort") == 0) return bam_sort(argc-1, argv+1);\n+ else if (strcmp(argv[1], "index") == 0) return bam_index(argc-1, argv+1);\n+ else if (strcmp(argv[1], "faidx") == 0) return faidx_main(argc-1, argv+1);\n+ else if 
(strcmp(argv[1], "fixmate") == 0) return bam_mating(argc-1, argv+1);\n+ else if (strcmp(argv[1], "rmdup") == 0) return bam_rmdup(argc-1, argv+1);\n+ else if (strcmp(argv[1], "glfview") == 0) return glf3_view_main(argc-1, argv+1);\n+ else if (strcmp(argv[1], "flagstat") == 0) return bam_flagstat(argc-1, argv+1);\n+ else if (strcmp(argv[1], "calmd") == 0) return bam_fillmd(argc-1, argv+1);\n+ else if (strcmp(argv[1], "fillmd") == 0) return bam_fillmd(argc-1, argv+1);\n+\n+#if _CURSES_LIB != 0\n+ else if (strcmp(argv[1], "tview") == 0) return bam_tview_main(argc-1, argv+1);\n+#endif\n+ else \n+ {\n+ fprintf(stderr, "[main] unrecognized command \'%s\'\\n", argv[1]);\n+ return 1;\n+ }\n+ return 0;\n+}\n+\n+// taken from samtools/bam_import.c\n+static inline uint8_t *alloc_data(bam1_t *b, size_t size)\n+{\n+ if (b->m_data < size)\n+ {\n+ b->m_data = size;\n+ kroundup32(b->m_data);\n+ b->data = (uint8_t*)realloc(b->data, b->m_data);\n+ }\n+ return b->data;\n+}\n+\n+// update the variable length data within a bam1_t entry.\n+// Adds *nbytes_new* - *nbytes_old* into the variable length data of *src* at *pos*.\n+// Data within the bam1_t entry is moved so that it is\n+// consistent with the data field lengths.\n+bam1_t * pysam_bam_update( bam1_t * b,\n+\t\t\t const size_t nbytes_old,\n+\t\t\t const size_t nbytes_new, \n+\t\t\t uint8_t * pos )\n+{\n+ int d = nbytes_new-nbytes_old;\n+\n+ // no change\n+ if (d == 0) return b;\n+\n+ int new_size = d + b->data_len;\n+ size_t offset = pos - b->data;\n+\n+ //printf("d=%i, old=%i, new=%i, old_size=%i, new_size=%i\\n",\n+ // d, nbytes_old, nbytes_new, b->data_len, new_size);\n+ \n+ // increase memory if required\n+ if (d > 0)\n+ {\n+ alloc_data( b, new_size );\n+ pos = b->data + offset;\n+ }\n+ \n+ if (b->data_len != 0)\n+ {\n+ if (offset < 0 || offset > b->data_len)\n+\tfprintf(stderr, "[pysam_bam_insert] illegal offset: \'%i\'\\n", (int)offset);\n+ }\n+ \n+ // printf("dest=%p, src=%p, n=%i\\n", pos+nbytes_new, pos + nbytes_old, 
b->data_len - (offset+nbytes_old));\n+ memmove( pos + nbytes_new,\n+\t pos + nbytes_old,\n+\t b->data_len - (offset + nbytes_old));\n+ \n+ b->data_len = new_size;\n+ \n+ return b;\n+}\n+\n+// translate a nucleotide character to binary code\n+unsigned char pysam_translate_sequence( const unsigned char s )\n+{\n+ return bam_nt16_table[s];\n+}\n+\n+\n+\n+\n' |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/pysam_util.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/pysam_util.h Thu Sep 07 17:55:18 2017 -0400 |
[ |
@@ -0,0 +1,42 @@ +#ifndef PYSAM_UTIL_H +#define PYSAM_UTIL_H + +////////////////////////////////////////////////////////////////// +////////////////////////////////////////////////////////////////// +////////////////////////////////////////////////////////////////// +// various helper functions +// +// fill pileup buffer for next position. + +int pysam_pileup_next(const bam1_t *b, + bam_plbuf_t *buf, + bam_pileup1_t ** plp, + int * tid, + int * pos, + int * n_plp); + +int pysam_dispatch(int argc, char *argv[] ); + +/*! + @abstract Update the variable length data within a bam1_t entry + + Old data is deleted and the data within b are re-arranged to + make place for new data. + + @discussion Returns b + + @param b bam1_t data + @param nbytes_old size of old data + @param nbytes_new size of new data + @param pos position of data +*/ +bam1_t * pysam_bam_update( bam1_t * b, + const size_t nbytes_old, + const size_t nbytes_new, + uint8_t * pos ); + +// translate a nucleotide character to binary code +unsigned char pysam_translate_sequence( const unsigned char s ); + + +#endif |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/samtools/bam.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/samtools/bam.c Thu Sep 07 17:55:18 2017 -0400 |
[ |
b'@@ -0,0 +1,308 @@\n+#include <stdio.h>\n+#include <ctype.h>\n+#include <errno.h>\n+#include <assert.h>\n+#include "bam.h"\n+#include "bam_endian.h"\n+#include "kstring.h"\n+#include "sam_header.h"\n+\n+int bam_is_be = 0;\n+char *bam_flag2char_table = "pPuUrR12sfd\\0\\0\\0\\0\\0";\n+\n+/**************************\n+ * CIGAR related routines *\n+ **************************/\n+\n+uint32_t bam_calend(const bam1_core_t *c, const uint32_t *cigar)\n+{\n+\tuint32_t k, end;\n+\tend = c->pos;\n+\tfor (k = 0; k < c->n_cigar; ++k) {\n+\t\tint op = cigar[k] & BAM_CIGAR_MASK;\n+\t\tif (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP)\n+\t\t\tend += cigar[k] >> BAM_CIGAR_SHIFT;\n+\t}\n+\treturn end;\n+}\n+\n+int32_t bam_cigar2qlen(const bam1_core_t *c, const uint32_t *cigar)\n+{\n+\tuint32_t k;\n+\tint32_t l = 0;\n+\tfor (k = 0; k < c->n_cigar; ++k) {\n+\t\tint op = cigar[k] & BAM_CIGAR_MASK;\n+\t\tif (op == BAM_CMATCH || op == BAM_CINS || op == BAM_CSOFT_CLIP)\n+\t\t\tl += cigar[k] >> BAM_CIGAR_SHIFT;\n+\t}\n+\treturn l;\n+}\n+\n+/********************\n+ * BAM I/O routines *\n+ ********************/\n+\n+bam_header_t *bam_header_init()\n+{\n+\tbam_is_be = bam_is_big_endian();\n+\treturn (bam_header_t*)calloc(1, sizeof(bam_header_t));\n+}\n+\n+void bam_header_destroy(bam_header_t *header)\n+{\n+\tint32_t i;\n+\textern void bam_destroy_header_hash(bam_header_t *header);\n+\tif (header == 0) return;\n+\tif (header->target_name) {\n+\t\tfor (i = 0; i < header->n_targets; ++i)\n+\t\t\tfree(header->target_name[i]);\n+\t\tfree(header->target_name);\n+\t\tfree(header->target_len);\n+\t}\n+\tfree(header->text);\n+\tif (header->dict) sam_header_free(header->dict);\n+\tif (header->rg2lib) sam_tbl_destroy(header->rg2lib);\n+\tbam_destroy_header_hash(header);\n+\tfree(header);\n+}\n+\n+bam_header_t *bam_header_read(bamFile fp)\n+{\n+\tbam_header_t *header;\n+\tchar buf[4];\n+\tint magic_len;\n+\tint32_t i = 1, name_len;\n+\t// check EOF\n+\ti = bgzf_check_EOF(fp);\n+\tif (i < 0) 
{\n+\t\t// If the file is a pipe, checking the EOF marker will *always* fail\n+\t\t// with ESPIPE. Suppress the error message in this case.\n+\t\tif (errno != ESPIPE) perror("[bam_header_read] bgzf_check_EOF");\n+\t}\n+\telse if (i == 0) fprintf(stderr, "[bam_header_read] EOF marker is absent.\\n");\n+\t// read "BAM1"\n+\tmagic_len = bam_read(fp, buf, 4);\n+\tif (magic_len != 4 || strncmp(buf, "BAM\\001", 4) != 0) {\n+\t\tfprintf(stderr, "[bam_header_read] invalid BAM binary header (this is not a BAM file).\\n");\n+\t\treturn 0;\n+\t}\n+\theader = bam_header_init();\n+\t// read plain text and the number of reference sequences\n+\tbam_read(fp, &header->l_text, 4);\n+\tif (bam_is_be) bam_swap_endian_4p(&header->l_text);\n+\theader->text = (char*)calloc(header->l_text + 1, 1);\n+\tbam_read(fp, header->text, header->l_text);\n+\tbam_read(fp, &header->n_targets, 4);\n+\tif (bam_is_be) bam_swap_endian_4p(&header->n_targets);\n+\t// read reference sequence names and lengths\n+\theader->target_name = (char**)calloc(header->n_targets, sizeof(char*));\n+\theader->target_len = (uint32_t*)calloc(header->n_targets, 4);\n+\tfor (i = 0; i != header->n_targets; ++i) {\n+\t\tbam_read(fp, &name_len, 4);\n+\t\tif (bam_is_be) bam_swap_endian_4p(&name_len);\n+\t\theader->target_name[i] = (char*)calloc(name_len, 1);\n+\t\tbam_read(fp, header->target_name[i], name_len);\n+\t\tbam_read(fp, &header->target_len[i], 4);\n+\t\tif (bam_is_be) bam_swap_endian_4p(&header->target_len[i]);\n+\t}\n+\treturn header;\n+}\n+\n+int bam_header_write(bamFile fp, const bam_header_t *header)\n+{\n+\tchar buf[4];\n+\tint32_t i, name_len, x;\n+\t// write "BAM1"\n+\tstrncpy(buf, "BAM\\001", 4);\n+\tbam_write(fp, buf, 4);\n+\t// write plain text and the number of reference sequences\n+\tif (bam_is_be) {\n+\t\tx = bam_swap_endian_4(header->l_text);\n+\t\tbam_write(fp, &x, 4);\n+\t\tif (header->l_text) bam_write(fp, header->text, header->l_text);\n+\t\tx = 
bam_swap_endian_4(header->n_targets);\n+\t\tbam_write(fp, &x, 4);\n+\t} else {\n+\t\tbam_write(fp, &header->l_text, 4);\n+\t\tif (header->l_text) bam_write(fp, header->text, header->l_text);\n+\t\tbam_write(fp, &header->n_targets, 4);\n+\t}\n+\t// write sequence names and lengths\n+\tfor (i = 0; i != header->n_targets'..b'32);\n+\tx[0] = c->tid;\n+\tx[1] = c->pos;\n+\tx[2] = (uint32_t)c->bin<<16 | c->qual<<8 | c->l_qname;\n+\tx[3] = (uint32_t)c->flag<<16 | c->n_cigar;\n+\tx[4] = c->l_qseq;\n+\tx[5] = c->mtid;\n+\tx[6] = c->mpos;\n+\tx[7] = c->isize;\n+\tbgzf_flush_try(fp, 4 + block_len);\n+\tif (bam_is_be) {\n+\t\tfor (i = 0; i < 8; ++i) bam_swap_endian_4p(x + i);\n+\t\ty = block_len;\n+\t\tbam_write(fp, bam_swap_endian_4p(&y), 4);\n+\t\tswap_endian_data(c, data_len, data);\n+\t} else bam_write(fp, &block_len, 4);\n+\tbam_write(fp, x, BAM_CORE_SIZE);\n+\tbam_write(fp, data, data_len);\n+\tif (bam_is_be) swap_endian_data(c, data_len, data);\n+\treturn 4 + block_len;\n+}\n+\n+int bam_write1(bamFile fp, const bam1_t *b)\n+{\n+\treturn bam_write1_core(fp, &b->core, b->data_len, b->data);\n+}\n+\n+char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of)\n+{\n+\tuint8_t *s = bam1_seq(b), *t = bam1_qual(b);\n+\tint i;\n+\tconst bam1_core_t *c = &b->core;\n+\tkstring_t str;\n+\tstr.l = str.m = 0; str.s = 0;\n+\n+\tkputsn(bam1_qname(b), c->l_qname-1, &str); kputc(\'\\t\', &str);\n+\tif (of == BAM_OFDEC) { kputw(c->flag, &str); kputc(\'\\t\', &str); }\n+\telse if (of == BAM_OFHEX) ksprintf(&str, "0x%x\\t", c->flag);\n+\telse { // BAM_OFSTR\n+\t\tfor (i = 0; i < 16; ++i)\n+\t\t\tif ((c->flag & 1<<i) && bam_flag2char_table[i])\n+\t\t\t\tkputc(bam_flag2char_table[i], &str);\n+\t\tkputc(\'\\t\', &str);\n+\t}\n+\tif (c->tid < 0) kputsn("*\\t", 2, &str);\n+\telse { kputs(header->target_name[c->tid], &str); kputc(\'\\t\', &str); }\n+\tkputw(c->pos + 1, &str); kputc(\'\\t\', &str); kputw(c->qual, &str); kputc(\'\\t\', &str);\n+\tif (c->n_cigar == 0) kputc(\'*\', 
&str);\n+\telse {\n+\t\tfor (i = 0; i < c->n_cigar; ++i) {\n+\t\t\tkputw(bam1_cigar(b)[i]>>BAM_CIGAR_SHIFT, &str);\n+\t\t\tkputc("MIDNSHP"[bam1_cigar(b)[i]&BAM_CIGAR_MASK], &str);\n+\t\t}\n+\t}\n+\tkputc(\'\\t\', &str);\n+\tif (c->mtid < 0) kputsn("*\\t", 2, &str);\n+\telse if (c->mtid == c->tid) kputsn("=\\t", 2, &str);\n+\telse { kputs(header->target_name[c->mtid], &str); kputc(\'\\t\', &str); }\n+\tkputw(c->mpos + 1, &str); kputc(\'\\t\', &str); kputw(c->isize, &str); kputc(\'\\t\', &str);\n+\tif (c->l_qseq) {\n+\t\tfor (i = 0; i < c->l_qseq; ++i) kputc(bam_nt16_rev_table[bam1_seqi(s, i)], &str);\n+\t\tkputc(\'\\t\', &str);\n+\t\tif (t[0] == 0xff) kputc(\'*\', &str);\n+\t\telse for (i = 0; i < c->l_qseq; ++i) kputc(t[i] + 33, &str);\n+\t} else kputsn("*\\t*", 3, &str);\n+\ts = bam1_aux(b);\n+\twhile (s < b->data + b->data_len) {\n+\t\tuint8_t type, key[2];\n+\t\tkey[0] = s[0]; key[1] = s[1];\n+\t\ts += 2; type = *s; ++s;\n+\t\tkputc(\'\\t\', &str); kputsn((char*)key, 2, &str); kputc(\':\', &str);\n+\t\tif (type == \'A\') { kputsn("A:", 2, &str); kputc(*s, &str); ++s; }\n+\t\telse if (type == \'C\') { kputsn("i:", 2, &str); kputw(*s, &str); ++s; }\n+\t\telse if (type == \'c\') { kputsn("i:", 2, &str); kputw(*(int8_t*)s, &str); ++s; }\n+\t\telse if (type == \'S\') { kputsn("i:", 2, &str); kputw(*(uint16_t*)s, &str); s += 2; }\n+\t\telse if (type == \'s\') { kputsn("i:", 2, &str); kputw(*(int16_t*)s, &str); s += 2; }\n+\t\telse if (type == \'I\') { kputsn("i:", 2, &str); kputuw(*(uint32_t*)s, &str); s += 4; }\n+\t\telse if (type == \'i\') { kputsn("i:", 2, &str); kputw(*(int32_t*)s, &str); s += 4; }\n+\t\telse if (type == \'f\') { ksprintf(&str, "f:%g", *(float*)s); s += 4; }\n+\t\telse if (type == \'d\') { ksprintf(&str, "d:%lg", *(double*)s); s += 8; }\n+\t\telse if (type == \'Z\' || type == \'H\') { kputc(type, &str); kputc(\':\', &str); while (*s) kputc(*s++, &str); ++s; }\n+\t}\n+\treturn str.s;\n+}\n+\n+char *bam_format1(const bam_header_t *header, const 
bam1_t *b)\n+{\n+\treturn bam_format1_core(header, b, BAM_OFDEC);\n+}\n+\n+void bam_view1(const bam_header_t *header, const bam1_t *b)\n+{\n+\tchar *s = bam_format1(header, b);\n+\tputs(s);\n+\tfree(s);\n+}\n+\n+// FIXME: we should also check the LB tag associated with each alignment\n+const char *bam_get_library(bam_header_t *h, const bam1_t *b)\n+{\n+\tconst uint8_t *rg;\n+\tif (h->dict == 0) h->dict = sam_header_parse2(h->text);\n+\tif (h->rg2lib == 0) h->rg2lib = sam_header2tbl(h->dict, "RG", "ID", "LB");\n+\trg = bam_aux_get(b, "RG");\n+\treturn (rg == 0)? 0 : sam_tbl_get(h->rg2lib, (const char*)(rg + 1));\n+}\n' |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/samtools/bam.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/samtools/bam.h Thu Sep 07 17:55:18 2017 -0400 |
[ |
b'@@ -0,0 +1,724 @@\n+/* The MIT License\n+\n+ Copyright (c) 2008 Genome Research Ltd (GRL).\n+\n+ Permission is hereby granted, free of charge, to any person obtaining\n+ a copy of this software and associated documentation files (the\n+ "Software"), to deal in the Software without restriction, including\n+ without limitation the rights to use, copy, modify, merge, publish,\n+ distribute, sublicense, and/or sell copies of the Software, and to\n+ permit persons to whom the Software is furnished to do so, subject to\n+ the following conditions:\n+\n+ The above copyright notice and this permission notice shall be\n+ included in all copies or substantial portions of the Software.\n+\n+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,\n+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND\n+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS\n+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN\n+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN\n+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n+ SOFTWARE.\n+*/\n+\n+/* Contact: Heng Li <lh3@sanger.ac.uk> */\n+\n+#ifndef BAM_BAM_H\n+#define BAM_BAM_H\n+\n+/*!\n+ @header\n+\n+ BAM library provides I/O and various operations on manipulating files\n+ in the BAM (Binary Alignment/Mapping) or SAM (Sequence Alignment/Map)\n+ format. It now supports importing from or exporting to TAM, sorting,\n+ merging, generating pileup, and quickly retrieval of reads overlapped\n+ with a specified region.\n+\n+ @copyright Genome Research Ltd.\n+ */\n+\n+#include <stdint.h>\n+#include <stdlib.h>\n+#include <string.h>\n+#include <stdio.h>\n+\n+#ifndef BAM_LITE\n+#define BAM_VIRTUAL_OFFSET16\n+#include "bgzf.h"\n+/*! 
@abstract BAM file handler */\n+typedef BGZF *bamFile;\n+#define bam_open(fn, mode) bgzf_open(fn, mode)\n+#define bam_dopen(fd, mode) bgzf_fdopen(fd, mode)\n+#define bam_close(fp) bgzf_close(fp)\n+#define bam_read(fp, buf, size) bgzf_read(fp, buf, size)\n+#define bam_write(fp, buf, size) bgzf_write(fp, buf, size)\n+#define bam_tell(fp) bgzf_tell(fp)\n+#define bam_seek(fp, pos, dir) bgzf_seek(fp, pos, dir)\n+#else\n+#define BAM_TRUE_OFFSET\n+#include <zlib.h>\n+typedef gzFile bamFile;\n+#define bam_open(fn, mode) gzopen(fn, mode)\n+#define bam_dopen(fd, mode) gzdopen(fd, mode)\n+#define bam_close(fp) gzclose(fp)\n+#define bam_read(fp, buf, size) gzread(fp, buf, size)\n+/* no bam_write/bam_tell/bam_seek() here */\n+#endif\n+\n+/*! @typedef\n+ @abstract Structure for the alignment header.\n+ @field n_targets number of reference sequences\n+ @field target_name names of the reference sequences\n+ @field target_len lengths of the referene sequences\n+ @field dict header dictionary\n+ @field hash hash table for fast name lookup\n+ @field rg2lib hash table for @RG-ID -> LB lookup\n+ @field l_text length of the plain text in the header\n+ @field text plain text\n+\n+ @discussion Field hash points to null by default. It is a private\n+ member.\n+ */\n+typedef struct {\n+\tint32_t n_targets;\n+\tchar **target_name;\n+\tuint32_t *target_len;\n+\tvoid *dict, *hash, *rg2lib;\n+\tsize_t l_text, n_text;\n+\tchar *text;\n+} bam_header_t;\n+\n+/*! @abstract the read is paired in sequencing, no matter whether it is mapped in a pair */\n+#define BAM_FPAIRED 1\n+/*! @abstract the read is mapped in a proper pair */\n+#define BAM_FPROPER_PAIR 2\n+/*! @abstract the read itself is unmapped; conflictive with BAM_FPROPER_PAIR */\n+#define BAM_FUNMAP 4\n+/*! @abstract the mate is unmapped */\n+#define BAM_FMUNMAP 8\n+/*! @abstract the read is mapped to the reverse strand */\n+#define BAM_FREVERSE 16\n+/*! @abstract the mate is mapped to the reverse strand */\n+#define BAM_FMREVERSE 32\n+/*! 
@abstract this is read1 */\n+#define BAM_FREAD1 64\n+/*! @abstract this is read2 */\n+#define BAM_FREAD2 128\n+/*! @abstract not primary alignment */\n+#define BA'..b'g, int end);\n+\tint bam_iter_read(bamFile fp, bam_iter_t iter, bam1_t *b);\n+\tvoid bam_iter_destroy(bam_iter_t iter);\n+\n+\t/*!\n+\t @abstract Parse a region in the format: "chr2:100,000-200,000".\n+\t @discussion bam_header_t::hash will be initialized if empty.\n+\t @param header pointer to the header structure\n+\t @param str string to be parsed\n+\t @param ref_id the returned chromosome ID\n+\t @param begin the returned start coordinate\n+\t @param end the returned end coordinate\n+\t @return 0 on success; -1 on failure\n+\t */\n+\tint bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *begin, int *end);\n+\n+\n+\t/**************************\n+\t * APIs for optional tags *\n+\t **************************/\n+\n+\t/*!\n+\t @abstract Retrieve data of a tag\n+\t @param b pointer to an alignment struct\n+\t @param tag two-character tag to be retrieved\n+\n+\t @return pointer to the type and data. The first character is the\n+\t type that can be \'iIsScCdfAZH\'.\n+\n+\t @discussion Use bam_aux2?() series to convert the returned data to\n+\t the corresponding type.\n+\t*/\n+\tuint8_t *bam_aux_get(const bam1_t *b, const char tag[2]);\n+\n+\tint32_t bam_aux2i(const uint8_t *s);\n+\tfloat bam_aux2f(const uint8_t *s);\n+\tdouble bam_aux2d(const uint8_t *s);\n+\tchar bam_aux2A(const uint8_t *s);\n+\tchar *bam_aux2Z(const uint8_t *s);\n+\n+\tint bam_aux_del(bam1_t *b, uint8_t *s);\n+\tvoid bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data);\n+\tuint8_t *bam_aux_get_core(bam1_t *b, const char tag[2]); // an alias of bam_aux_get()\n+\n+\n+\t/*****************\n+\t * Miscellaneous *\n+\t *****************/\n+\n+\t/*! 
\n+\t @abstract Calculate the rightmost coordinate of an alignment on the\n+\t reference genome.\n+\n+\t @param c pointer to the bam1_core_t structure\n+\t @param cigar the corresponding CIGAR array (from bam1_t::cigar)\n+\t @return the rightmost coordinate, 0-based\n+\t*/\n+\tuint32_t bam_calend(const bam1_core_t *c, const uint32_t *cigar);\n+\n+\t/*!\n+\t @abstract Calculate the length of the query sequence from CIGAR.\n+\t @param c pointer to the bam1_core_t structure\n+\t @param cigar the corresponding CIGAR array (from bam1_t::cigar)\n+\t @return length of the query sequence\n+\t*/\n+\tint32_t bam_cigar2qlen(const bam1_core_t *c, const uint32_t *cigar);\n+\n+#ifdef __cplusplus\n+}\n+#endif\n+\n+/*!\n+ @abstract Calculate the minimum bin that contains a region [beg,end).\n+ @param beg start of the region, 0-based\n+ @param end end of the region, 0-based\n+ @return bin\n+ */\n+static inline int bam_reg2bin(uint32_t beg, uint32_t end)\n+{\n+\t--end;\n+\tif (beg>>14 == end>>14) return 4681 + (beg>>14);\n+\tif (beg>>17 == end>>17) return 585 + (beg>>17);\n+\tif (beg>>20 == end>>20) return 73 + (beg>>20);\n+\tif (beg>>23 == end>>23) return 9 + (beg>>23);\n+\tif (beg>>26 == end>>26) return 1 + (beg>>26);\n+\treturn 0;\n+}\n+\n+/*!\n+ @abstract Copy an alignment\n+ @param bdst destination alignment struct\n+ @param bsrc source alignment struct\n+ @return pointer to the destination alignment struct\n+ */\n+static inline bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc)\n+{\n+\tuint8_t *data = bdst->data;\n+\tint m_data = bdst->m_data; // backup data and m_data\n+\tif (m_data < bsrc->m_data) { // double the capacity\n+\t\tm_data = bsrc->m_data; kroundup32(m_data);\n+\t\tdata = (uint8_t*)realloc(data, m_data);\n+\t}\n+\tmemcpy(data, bsrc->data, bsrc->data_len); // copy var-len data\n+\t*bdst = *bsrc; // copy the rest\n+\t// restore the backup\n+\tbdst->m_data = m_data;\n+\tbdst->data = data;\n+\treturn bdst;\n+}\n+\n+/*!\n+ @abstract Duplicate an alignment\n+ @param 
src source alignment struct\n+ @return pointer to the destination alignment struct\n+ */\n+static inline bam1_t *bam_dup1(const bam1_t *src)\n+{\n+\tbam1_t *b;\n+\tb = bam_init1();\n+\t*b = *src;\n+\tb->m_data = b->data_len;\n+\tb->data = (uint8_t*)calloc(b->data_len, 1);\n+\tmemcpy(b->data, src->data, b->data_len);\n+\treturn b;\n+}\n+\n+#endif\n' |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/samtools/bam_aux.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/samtools/bam_aux.c Thu Sep 07 17:55:18 2017 -0400 |
[ |
@@ -0,0 +1,182 @@ +#include <ctype.h> +#include "bam.h" +#include "khash.h" +typedef char *str_p; +KHASH_MAP_INIT_STR(s, int) +KHASH_MAP_INIT_STR(r2l, str_p) + +void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data) +{ + int ori_len = b->data_len; + b->data_len += 3 + len; + b->l_aux += 3 + len; + if (b->m_data < b->data_len) { + b->m_data = b->data_len; + kroundup32(b->m_data); + b->data = (uint8_t*)realloc(b->data, b->m_data); + } + b->data[ori_len] = tag[0]; b->data[ori_len + 1] = tag[1]; + b->data[ori_len + 2] = type; + memcpy(b->data + ori_len + 3, data, len); +} + +uint8_t *bam_aux_get_core(bam1_t *b, const char tag[2]) +{ + return bam_aux_get(b, tag); +} + +#define __skip_tag(s) do { \ + int type = toupper(*(s)); \ + ++(s); \ + if (type == 'C' || type == 'A') ++(s); \ + else if (type == 'S') (s) += 2; \ + else if (type == 'I' || type == 'F') (s) += 4; \ + else if (type == 'D') (s) += 8; \ + else if (type == 'Z' || type == 'H') { while (*(s)) ++(s); ++(s); } \ + } while (0) + +uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]) +{ + uint8_t *s; + int y = tag[0]<<8 | tag[1]; + s = bam1_aux(b); + while (s < b->data + b->data_len) { + int x = (int)s[0]<<8 | s[1]; + s += 2; + if (x == y) return s; + __skip_tag(s); + } + return 0; +} +// s MUST BE returned by bam_aux_get() +int bam_aux_del(bam1_t *b, uint8_t *s) +{ + uint8_t *p, *aux; + aux = bam1_aux(b); + p = s - 2; + __skip_tag(s); + memmove(p, s, b->l_aux - (s - aux)); + b->data_len -= s - p; + b->l_aux -= s - p; + return 0; +} + +void bam_init_header_hash(bam_header_t *header) +{ + if (header->hash == 0) { + int ret, i; + khiter_t iter; + khash_t(s) *h; + header->hash = h = kh_init(s); + for (i = 0; i < header->n_targets; ++i) { + iter = kh_put(s, h, header->target_name[i], &ret); + kh_value(h, iter) = i; + } + } +} + +void bam_destroy_header_hash(bam_header_t *header) +{ + if (header->hash) + kh_destroy(s, (khash_t(s)*)header->hash); +} + +int32_t bam_get_tid(const 
bam_header_t *header, const char *seq_name) +{ + khint_t k; + khash_t(s) *h = (khash_t(s)*)header->hash; + k = kh_get(s, h, seq_name); + return k == kh_end(h)? -1 : kh_value(h, k); +} + +int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *begin, int *end) +{ + char *s, *p; + int i, l, k; + khiter_t iter; + khash_t(s) *h; + + bam_init_header_hash(header); + h = (khash_t(s)*)header->hash; + + l = strlen(str); + p = s = (char*)malloc(l+1); + /* squeeze out "," */ + for (i = k = 0; i != l; ++i) + if (str[i] != ',' && !isspace(str[i])) s[k++] = str[i]; + s[k] = 0; + for (i = 0; i != k; ++i) if (s[i] == ':') break; + s[i] = 0; + iter = kh_get(s, h, s); /* get the ref_id */ + if (iter == kh_end(h)) { // name not found + *ref_id = -1; free(s); + return -1; + } + *ref_id = kh_value(h, iter); + if (i == k) { /* dump the whole sequence */ + *begin = 0; *end = 1<<29; free(s); + return 0; + } + for (p = s + i + 1; i != k; ++i) if (s[i] == '-') break; + *begin = atoi(p); + if (i < k) { + p = s + i + 1; + *end = atoi(p); + } else *end = 1<<29; + if (*begin > 0) --*begin; + free(s); + if (*begin > *end) { + fprintf(stderr, "[bam_parse_region] invalid region.\n"); + return -1; + } + return 0; +} + +int32_t bam_aux2i(const uint8_t *s) +{ + int type; + if (s == 0) return 0; + type = *s++; + if (type == 'c') return (int32_t)*(int8_t*)s; + else if (type == 'C') return (int32_t)*(uint8_t*)s; + else if (type == 's') return (int32_t)*(int16_t*)s; + else if (type == 'S') return (int32_t)*(uint16_t*)s; + else if (type == 'i' || type == 'I') return *(int32_t*)s; + else return 0; +} + +float bam_aux2f(const uint8_t *s) +{ + int type; + type = *s++; + if (s == 0) return 0.0; + if (type == 'f') return *(float*)s; + else return 0.0; +} + +double bam_aux2d(const uint8_t *s) +{ + int type; + type = *s++; + if (s == 0) return 0.0; + if (type == 'd') return *(double*)s; + else return 0.0; +} + +char bam_aux2A(const uint8_t *s) +{ + int type; + type = *s++; + if (s == 0) 
return 0; + if (type == 'A') return *(char*)s; + else return 0; +} + +char *bam_aux2Z(const uint8_t *s) +{ + int type; + type = *s++; + if (s == 0) return 0; + if (type == 'Z' || type == 'H') return (char*)s; + else return 0; +} |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/samtools/bam_color.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/samtools/bam_color.c Thu Sep 07 17:55:18 2017 -0400 |
[ |
@@ -0,0 +1,127 @@ +#include <ctype.h> +#include "bam.h" + +/*! + @abstract Get the color encoding the previous and current base + @param b pointer to an alignment + @param i The i-th position, 0-based + @return color + + @discussion Returns 0 no color information is found. + */ +char bam_aux_getCSi(bam1_t *b, int i) +{ + uint8_t *c = bam_aux_get(b, "CS"); + char *cs = NULL; + + // return the base if the tag was not found + if(0 == c) return 0; + + cs = bam_aux2Z(c); + // adjust for strandedness and leading adaptor + if(bam1_strand(b)) i = strlen(cs) - 1 - i; + else i++; + return cs[i]; +} + +/*! + @abstract Get the color quality of the color encoding the previous and current base + @param b pointer to an alignment + @param i The i-th position, 0-based + @return color quality + + @discussion Returns 0 no color information is found. + */ +char bam_aux_getCQi(bam1_t *b, int i) +{ + uint8_t *c = bam_aux_get(b, "CQ"); + char *cq = NULL; + + // return the base if the tag was not found + if(0 == c) return 0; + + cq = bam_aux2Z(c); + // adjust for strandedness + if(bam1_strand(b)) i = strlen(cq) - 1 - i; + return cq[i]; +} + +char bam_aux_nt2int(char a) +{ + switch(toupper(a)) { + case 'A': + return 0; + break; + case 'C': + return 1; + break; + case 'G': + return 2; + break; + case 'T': + return 3; + break; + default: + return 4; + break; + } +} + +char bam_aux_ntnt2cs(char a, char b) +{ + a = bam_aux_nt2int(a); + b = bam_aux_nt2int(b); + if(4 == a || 4 == b) return '4'; + return "0123"[(int)(a ^ b)]; +} + +/*! + @abstract Get the color error profile at the give position + @param b pointer to an alignment + @return the original color if the color was an error, '-' (dash) otherwise + + @discussion Returns 0 no color information is found. 
+ */ +char bam_aux_getCEi(bam1_t *b, int i) +{ + int cs_i; + uint8_t *c = bam_aux_get(b, "CS"); + char *cs = NULL; + char prev_b, cur_b; + char cur_color, cor_color; + + // return the base if the tag was not found + if(0 == c) return 0; + + cs = bam_aux2Z(c); + + // adjust for strandedness and leading adaptor + if(bam1_strand(b)) { //reverse strand + cs_i = strlen(cs) - 1 - i; + // get current color + cur_color = cs[cs_i]; + // get previous base. Note: must rc adaptor + prev_b = (cs_i == 1) ? "TGCAN"[(int)bam_aux_nt2int(cs[0])] : bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i+1)]; + // get current base + cur_b = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i)]; + } + else { + cs_i=i+1; + // get current color + cur_color = cs[cs_i]; + // get previous base + prev_b = (0 == i) ? cs[0] : bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i-1)]; + // get current base + cur_b = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i)]; + } + + // corrected color + cor_color = bam_aux_ntnt2cs(prev_b, cur_b); + + if(cur_color == cor_color) { + return '-'; + } + else { + return cur_color; + } +} |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/samtools/bam_endian.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/samtools/bam_endian.h Thu Sep 07 17:55:18 2017 -0400 |
b |
@@ -0,0 +1,42 @@ +#ifndef BAM_ENDIAN_H +#define BAM_ENDIAN_H + +#include <stdint.h> + +static inline int bam_is_big_endian() +{ + long one= 1; + return !(*((char *)(&one))); +} +static inline uint16_t bam_swap_endian_2(uint16_t v) +{ + return (uint16_t)(((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8)); +} +static inline void *bam_swap_endian_2p(void *x) +{ + *(uint16_t*)x = bam_swap_endian_2(*(uint16_t*)x); + return x; +} +static inline uint32_t bam_swap_endian_4(uint32_t v) +{ + v = ((v & 0x0000FFFFU) << 16) | (v >> 16); + return ((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8); +} +static inline void *bam_swap_endian_4p(void *x) +{ + *(uint32_t*)x = bam_swap_endian_4(*(uint32_t*)x); + return x; +} +static inline uint64_t bam_swap_endian_8(uint64_t v) +{ + v = ((v & 0x00000000FFFFFFFFLLU) << 32) | (v >> 32); + v = ((v & 0x0000FFFF0000FFFFLLU) << 16) | ((v & 0xFFFF0000FFFF0000LLU) >> 16); + return ((v & 0x00FF00FF00FF00FFLLU) << 8) | ((v & 0xFF00FF00FF00FF00LLU) >> 8); +} +static inline void *bam_swap_endian_8p(void *x) +{ + *(uint64_t*)x = bam_swap_endian_8(*(uint64_t*)x); + return x; +} + +#endif |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/samtools/bam_import.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/samtools/bam_import.c Thu Sep 07 17:55:18 2017 -0400 |
[ |
b'@@ -0,0 +1,459 @@\n+#include <zlib.h>\n+#include <stdio.h>\n+#include <ctype.h>\n+#include <string.h>\n+#include <stdlib.h>\n+#include <unistd.h>\n+#include <assert.h>\n+#ifdef _WIN32\n+#include <fcntl.h>\n+#endif\n+#include "kstring.h"\n+#include "bam.h"\n+#include "sam_header.h"\n+#include "kseq.h"\n+#include "khash.h"\n+\n+KSTREAM_INIT(gzFile, gzread, 8192)\n+KHASH_MAP_INIT_STR(ref, uint64_t)\n+\n+void bam_init_header_hash(bam_header_t *header);\n+void bam_destroy_header_hash(bam_header_t *header);\n+int32_t bam_get_tid(const bam_header_t *header, const char *seq_name);\n+\n+unsigned char bam_nt16_table[256] = {\n+\t15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,\n+\t15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,\n+\t15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,\n+\t 1, 2, 4, 8, 15,15,15,15, 15,15,15,15, 15, 0 /*=*/,15,15,\n+\t15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,\n+\t15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15,\n+\t15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,\n+\t15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15,\n+\t15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,\n+\t15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,\n+\t15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,\n+\t15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,\n+\t15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,\n+\t15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,\n+\t15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,\n+\t15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15\n+};\n+\n+unsigned short bam_char2flag_table[256] = {\n+\t0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,\n+\t0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,\n+\t0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,\n+\t0,BAM_FREAD1,BAM_FREAD2,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,\n+\t0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,\n+\tBAM_FPROPER_PAIR,0,BAM_FMREVERSE,0, 0,BAM_FMUNMAP,0,0, 0,0,0,0, 0,0,0,0,\n+\t0,0,0,0, BAM_FDUP,0,BAM_FQCFAIL,0, 0,0,0,0, 0,0,0,0,\n+\tBAM_FPAIRED,0,BAM_FREVERSE,BAM_FSECONDARY, 0,BAM_FUNMAP,0,0, 
0,0,0,0, 0,0,0,0,\n+\t0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,\n+\t0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,\n+\t0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,\n+\t0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,\n+\t0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,\n+\t0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,\n+\t0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,\n+\t0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0\n+};\n+\n+char *bam_nt16_rev_table = "=ACMGRSVTWYHKDBN";\n+\n+struct __tamFile_t {\n+\tgzFile fp;\n+\tkstream_t *ks;\n+\tkstring_t *str;\n+\tuint64_t n_lines;\n+\tint is_first;\n+};\n+\n+char **__bam_get_lines(const char *fn, int *_n) // for bam_plcmd.c only\n+{\n+\tchar **list = 0, *s;\n+\tint n = 0, dret, m = 0;\n+\tgzFile fp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r");\n+\tkstream_t *ks;\n+\tkstring_t *str;\n+\tstr = (kstring_t*)calloc(1, sizeof(kstring_t));\n+\tks = ks_init(fp);\n+\twhile (ks_getuntil(ks, \'\\n\', str, &dret) > 0) {\n+\t\tif (n == m) {\n+\t\t\tm = m? m << 1 : 16;\n+\t\t\tlist = (char**)realloc(list, m * sizeof(char*));\n+\t\t}\n+\t\tif (str->s[str->l-1] == \'\\r\')\n+\t\t\tstr->s[--str->l] = \'\\0\';\n+\t\ts = list[n++] = (char*)calloc(str->l + 1, 1);\n+\t\tstrcpy(s, str->s);\n+\t}\n+\tks_destroy(ks);\n+\tgzclose(fp);\n+\tfree(str->s); free(str);\n+\t*_n = n;\n+\treturn list;\n+}\n+\n+static bam_header_t *hash2header(const kh_ref_t *hash)\n+{\n+\tbam_header_t *header;\n+\tkhiter_t k;\n+\theader = bam_header_init();\n+\theader->n_targets = kh_size(hash);\n+\theader->target_name = (char**)calloc(kh_size(hash), sizeof(char*));\n+\theader->target_len = (uint32_t*)calloc(kh_size(hash), 4);\n+\tfor (k = kh_begin(hash); k != kh_end(hash); ++k) {\n+\t\tif (kh_exist(hash, k)) {\n+\t\t\tint i = (int)kh_value(hash, k);\n+\t\t\theader->target_name[i] = (char*)kh_key(hash, k);\n+\t\t\theader->target_len[i] = kh_value(hash, k)>>32;\n+\t\t}\n+\t}\n+\tbam_init_header_hash(header);\n+\treturn header;\n+}\n+bam_header_t *sam_header_read2(const char *fn)\n+{\n+\tbam_header_t *header;\n+\tint c, dret, ret, 
error = 0;\n+\tgzFile fp;\n+\tkstream_t *ks;\n+\tkstring_t *str;\n+\tkh_ref_t *hash;\n+\tkhiter_t k;\n+\tif (fn == 0) return 0;\n+\tfp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r");\n+\tif (fp == 0) return 0;\n+\thash ='..b' sequence length are inconsistent");\n+\t\t\tp = (uint8_t*)alloc_data(b, doff + c->l_qseq + (c->l_qseq+1)/2) + doff;\n+\t\t\tmemset(p, 0, (c->l_qseq+1)/2);\n+\t\t\tfor (i = 0; i < c->l_qseq; ++i)\n+\t\t\t\tp[i/2] |= bam_nt16_table[(int)str->s[i]] << 4*(1-i%2);\n+\t\t} else c->l_qseq = 0;\n+\t\tif (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -6; // qual\n+\t\tz += str->l + 1;\n+\t\tif (strcmp(str->s, "*") && c->l_qseq != strlen(str->s))\n+\t\t\tparse_error(fp->n_lines, "sequence and quality are inconsistent");\n+\t\tp += (c->l_qseq+1)/2;\n+\t\tif (strcmp(str->s, "*") == 0) for (i = 0; i < c->l_qseq; ++i) p[i] = 0xff;\n+\t\telse for (i = 0; i < c->l_qseq; ++i) p[i] = str->s[i] - 33;\n+\t\tdoff += c->l_qseq + (c->l_qseq+1)/2;\n+\t}\n+\tdoff0 = doff;\n+\tif (dret != \'\\n\' && dret != \'\\r\') { // aux\n+\t\twhile (ks_getuntil(ks, KS_SEP_TAB, str, &dret) >= 0) {\n+\t\t\tuint8_t *s, type, key[2];\n+\t\t\tz += str->l + 1;\n+\t\t\tif (str->l < 6 || str->s[2] != \':\' || str->s[4] != \':\')\n+\t\t\t\tparse_error(fp->n_lines, "missing colon in auxiliary data");\n+\t\t\tkey[0] = str->s[0]; key[1] = str->s[1];\n+\t\t\ttype = str->s[3];\n+\t\t\ts = alloc_data(b, doff + 3) + doff;\n+\t\t\ts[0] = key[0]; s[1] = key[1]; s += 2; doff += 2;\n+\t\t\tif (type == \'A\' || type == \'a\' || type == \'c\' || type == \'C\') { // c and C for backward compatibility\n+\t\t\t\ts = alloc_data(b, doff + 2) + doff;\n+\t\t\t\t*s++ = \'A\'; *s = str->s[5];\n+\t\t\t\tdoff += 2;\n+\t\t\t} else if (type == \'I\' || type == \'i\') {\n+\t\t\t\tlong long x;\n+\t\t\t\ts = alloc_data(b, doff + 5) + doff;\n+\t\t\t\tx = (long long)atoll(str->s + 5);\n+\t\t\t\tif (x < 0) {\n+\t\t\t\t\tif (x >= -127) {\n+\t\t\t\t\t\t*s++ = \'c\'; *(int8_t*)s = 
(int8_t)x;\n+\t\t\t\t\t\ts += 1; doff += 2;\n+\t\t\t\t\t} else if (x >= -32767) {\n+\t\t\t\t\t\t*s++ = \'s\'; *(int16_t*)s = (int16_t)x;\n+\t\t\t\t\t\ts += 2; doff += 3;\n+\t\t\t\t\t} else {\n+\t\t\t\t\t\t*s++ = \'i\'; *(int32_t*)s = (int32_t)x;\n+\t\t\t\t\t\ts += 4; doff += 5;\n+\t\t\t\t\t\tif (x < -2147483648ll)\n+\t\t\t\t\t\t\tfprintf(stderr, "Parse warning at line %lld: integer %lld is out of range.",\n+\t\t\t\t\t\t\t\t\t(long long)fp->n_lines, x);\n+\t\t\t\t\t}\n+\t\t\t\t} else {\n+\t\t\t\t\tif (x <= 255) {\n+\t\t\t\t\t\t*s++ = \'C\'; *s++ = (uint8_t)x;\n+\t\t\t\t\t\tdoff += 2;\n+\t\t\t\t\t} else if (x <= 65535) {\n+\t\t\t\t\t\t*s++ = \'S\'; *(uint16_t*)s = (uint16_t)x;\n+\t\t\t\t\t\ts += 2; doff += 3;\n+\t\t\t\t\t} else {\n+\t\t\t\t\t\t*s++ = \'I\'; *(uint32_t*)s = (uint32_t)x;\n+\t\t\t\t\t\ts += 4; doff += 5;\n+\t\t\t\t\t\tif (x > 4294967295ll)\n+\t\t\t\t\t\t\tfprintf(stderr, "Parse warning at line %lld: integer %lld is out of range.",\n+\t\t\t\t\t\t\t\t\t(long long)fp->n_lines, x);\n+\t\t\t\t\t}\n+\t\t\t\t}\n+\t\t\t} else if (type == \'f\') {\n+\t\t\t\ts = alloc_data(b, doff + 5) + doff;\n+\t\t\t\t*s++ = \'f\';\n+\t\t\t\t*(float*)s = (float)atof(str->s + 5);\n+\t\t\t\ts += 4; doff += 5;\n+\t\t\t} else if (type == \'d\') {\n+\t\t\t\ts = alloc_data(b, doff + 9) + doff;\n+\t\t\t\t*s++ = \'d\';\n+\t\t\t\t*(float*)s = (float)atof(str->s + 9);\n+\t\t\t\ts += 8; doff += 9;\n+\t\t\t} else if (type == \'Z\' || type == \'H\') {\n+\t\t\t\tint size = 1 + (str->l - 5) + 1;\n+\t\t\t\tif (type == \'H\') { // check whether the hex string is valid\n+\t\t\t\t\tint i;\n+\t\t\t\t\tif ((str->l - 5) % 2 == 1) parse_error(fp->n_lines, "length of the hex string not even");\n+\t\t\t\t\tfor (i = 0; i < str->l - 5; ++i) {\n+\t\t\t\t\t\tint c = toupper(str->s[5 + i]);\n+\t\t\t\t\t\tif (!((c >= \'0\' && c <= \'9\') || (c >= \'A\' && c <= \'F\')))\n+\t\t\t\t\t\t\tparse_error(fp->n_lines, "invalid hex character");\n+\t\t\t\t\t}\n+\t\t\t\t}\n+\t\t\t\ts = alloc_data(b, doff + size) + 
doff;\n+\t\t\t\t*s++ = type;\n+\t\t\t\tmemcpy(s, str->s + 5, str->l - 5);\n+\t\t\t\ts[str->l - 5] = 0;\n+\t\t\t\tdoff += size;\n+\t\t\t} else parse_error(fp->n_lines, "unrecognized type");\n+\t\t\tif (dret == \'\\n\' || dret == \'\\r\') break;\n+\t\t}\n+\t}\n+\tb->l_aux = doff - doff0;\n+\tb->data_len = doff;\n+\treturn z;\n+}\n+\n+tamFile sam_open(const char *fn)\n+{\n+\ttamFile fp;\n+\tgzFile gzfp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "rb") : gzopen(fn, "rb");\n+\tif (gzfp == 0) return 0;\n+\tfp = (tamFile)calloc(1, sizeof(struct __tamFile_t));\n+\tfp->str = (kstring_t*)calloc(1, sizeof(kstring_t));\n+\tfp->fp = gzfp;\n+\tfp->ks = ks_init(fp->fp);\n+\treturn fp;\n+}\n+\n+void sam_close(tamFile fp)\n+{\n+\tif (fp) {\n+\t\tks_destroy(fp->ks);\n+\t\tgzclose(fp->fp);\n+\t\tfree(fp->str->s); free(fp->str);\n+\t\tfree(fp);\n+\t}\n+}\n' |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/samtools/bam_index.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/samtools/bam_index.c Thu Sep 07 17:55:18 2017 -0400 |
[ |
b'@@ -0,0 +1,696 @@\n+#include <ctype.h>\n+#include <assert.h>\n+#include "bam.h"\n+#include "khash.h"\n+#include "ksort.h"\n+#include "bam_endian.h"\n+#ifdef _USE_KNETFILE\n+#include "knetfile.h"\n+#endif\n+\n+/*!\n+ @header\n+\n+ Alignment indexing. Before indexing, BAM must be sorted based on the\n+ leftmost coordinate of alignments. In indexing, BAM uses two indices:\n+ a UCSC binning index and a simple linear index. The binning index is\n+ efficient for alignments spanning long distance, while the auxiliary\n+ linear index helps to reduce unnecessary seek calls especially for\n+ short alignments.\n+\n+ The UCSC binning scheme was suggested by Richard Durbin and Lincoln\n+ Stein and is explained by Kent et al. (2002). In this scheme, each bin\n+ represents a contiguous genomic region which can be fully contained in\n+ another bin; each alignment is associated with a bin which represents\n+ the smallest region containing the entire alignment. The binning\n+ scheme is essentially another representation of R-tree. A distinct bin\n+ uniquely corresponds to a distinct internal node in a R-tree. Bin A is\n+ a child of Bin B if region A is contained in B.\n+\n+ In BAM, each bin may span 2^29, 2^26, 2^23, 2^20, 2^17 or 2^14 bp. Bin\n+ 0 spans a 512Mbp region, bins 1-8 span 64Mbp, 9-72 8Mbp, 73-584 1Mbp,\n+ 585-4680 128Kbp and bins 4681-37449 span 16Kbp regions. If we want to\n+ find the alignments overlapped with a region [rbeg,rend), we need to\n+ calculate the list of bins that may be overlapped the region and test\n+ the alignments in the bins to confirm the overlaps. If the specified\n+ region is short, typically only a few alignments in six bins need to\n+ be retrieved. 
The overlapping alignments can be quickly fetched.\n+\n+ */\n+\n+#define BAM_MIN_CHUNK_GAP 32768\n+// 1<<14 is the size of minimum bin.\n+#define BAM_LIDX_SHIFT 14\n+\n+#define BAM_MAX_BIN 37450 // =(8^6-1)/7+1\n+\n+typedef struct {\n+\tuint64_t u, v;\n+} pair64_t;\n+\n+#define pair64_lt(a,b) ((a).u < (b).u)\n+KSORT_INIT(off, pair64_t, pair64_lt)\n+\n+typedef struct {\n+\tuint32_t m, n;\n+\tpair64_t *list;\n+} bam_binlist_t;\n+\n+typedef struct {\n+\tint32_t n, m;\n+\tuint64_t *offset;\n+} bam_lidx_t;\n+\n+KHASH_MAP_INIT_INT(i, bam_binlist_t)\n+\n+struct __bam_index_t {\n+\tint32_t n;\n+\tuint64_t n_no_coor; // unmapped reads without coordinate\n+\tkhash_t(i) **index;\n+\tbam_lidx_t *index2;\n+};\n+\n+// requirement: len <= LEN_MASK\n+static inline void insert_offset(khash_t(i) *h, int bin, uint64_t beg, uint64_t end)\n+{\n+\tkhint_t k;\n+\tbam_binlist_t *l;\n+\tint ret;\n+\tk = kh_put(i, h, bin, &ret);\n+\tl = &kh_value(h, k);\n+\tif (ret) { // not present\n+\t\tl->m = 1; l->n = 0;\n+\t\tl->list = (pair64_t*)calloc(l->m, 16);\n+\t}\n+\tif (l->n == l->m) {\n+\t\tl->m <<= 1;\n+\t\tl->list = (pair64_t*)realloc(l->list, l->m * 16);\n+\t}\n+\tl->list[l->n].u = beg; l->list[l->n++].v = end;\n+}\n+\n+static inline void insert_offset2(bam_lidx_t *index2, bam1_t *b, uint64_t offset)\n+{\n+\tint i, beg, end;\n+\tbeg = b->core.pos >> BAM_LIDX_SHIFT;\n+\tend = (bam_calend(&b->core, bam1_cigar(b)) - 1) >> BAM_LIDX_SHIFT;\n+\tif (index2->m < end + 1) {\n+\t\tint old_m = index2->m;\n+\t\tindex2->m = end + 1;\n+\t\tkroundup32(index2->m);\n+\t\tindex2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8);\n+\t\tmemset(index2->offset + old_m, 0, 8 * (index2->m - old_m));\n+\t}\n+\tif (beg == end) {\n+\t\tif (index2->offset[beg] == 0) index2->offset[beg] = offset;\n+\t} else {\n+\t\tfor (i = beg; i <= end; ++i)\n+\t\t\tif (index2->offset[i] == 0) index2->offset[i] = offset;\n+\t}\n+\tindex2->n = end + 1;\n+}\n+\n+static void merge_chunks(bam_index_t *idx)\n+{\n+#if 
defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)\n+\tkhash_t(i) *index;\n+\tint i, l, m;\n+\tkhint_t k;\n+\tfor (i = 0; i < idx->n; ++i) {\n+\t\tindex = idx->index[i];\n+\t\tfor (k = kh_begin(index); k != kh_end(index); ++k) {\n+\t\t\tbam_binlist_t *p;\n+\t\t\tif (!kh_exist(index, k) || kh_key(index, k) == BAM_MAX_BIN) continue;\n+\t\t\tp = &kh_value(index, k);\n+\t\t\tm = 0;\n+\t\t\tfor (l = 1; l < p->n; ++l) {\n+#ifdef BAM_TRUE_OFFSET\n+\t\t\t\tif (p->list[m].v + BAM_MIN_CHUNK_GA'..b'\tbam_iter_t iter = 0;\n+\n+\tif (beg < 0) beg = 0;\n+\tif (end < beg) return 0;\n+\t// initialize iter\n+\titer = calloc(1, sizeof(struct __bam_iter_t));\n+\titer->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1;\n+\t//\n+\tbins = (uint16_t*)calloc(BAM_MAX_BIN, 2);\n+\tn_bins = reg2bins(beg, end, bins);\n+\tindex = idx->index[tid];\n+\tif (idx->index2[tid].n > 0) {\n+\t\tmin_off = (beg>>BAM_LIDX_SHIFT >= idx->index2[tid].n)? idx->index2[tid].offset[idx->index2[tid].n-1]\n+\t\t\t: idx->index2[tid].offset[beg>>BAM_LIDX_SHIFT];\n+\t\tif (min_off == 0) { // improvement for index files built by tabix prior to 0.1.4\n+\t\t\tint n = beg>>BAM_LIDX_SHIFT;\n+\t\t\tif (n > idx->index2[tid].n) n = idx->index2[tid].n;\n+\t\t\tfor (i = n - 1; i >= 0; --i)\n+\t\t\t\tif (idx->index2[tid].offset[i] != 0) break;\n+\t\t\tif (i >= 0) min_off = idx->index2[tid].offset[i];\n+\t\t}\n+\t} else min_off = 0; // tabix 0.1.2 may produce such index files\n+\tfor (i = n_off = 0; i < n_bins; ++i) {\n+\t\tif ((k = kh_get(i, index, bins[i])) != kh_end(index))\n+\t\t\tn_off += kh_value(index, k).n;\n+\t}\n+\tif (n_off == 0) {\n+\t\tfree(bins); return iter;\n+\t}\n+\toff = (pair64_t*)calloc(n_off, 16);\n+\tfor (i = n_off = 0; i < n_bins; ++i) {\n+\t\tif ((k = kh_get(i, index, bins[i])) != kh_end(index)) {\n+\t\t\tint j;\n+\t\t\tbam_binlist_t *p = &kh_value(index, k);\n+\t\t\tfor (j = 0; j < p->n; ++j)\n+\t\t\t\tif (p->list[j].v > min_off) off[n_off++] = 
p->list[j];\n+\t\t}\n+\t}\n+\tfree(bins);\n+\t{\n+\t\tbam1_t *b = (bam1_t*)calloc(1, sizeof(bam1_t));\n+\t\tint l;\n+\t\tks_introsort(off, n_off, off);\n+\t\t// resolve completely contained adjacent blocks\n+\t\tfor (i = 1, l = 0; i < n_off; ++i)\n+\t\t\tif (off[l].v < off[i].v)\n+\t\t\t\toff[++l] = off[i];\n+\t\tn_off = l + 1;\n+\t\t// resolve overlaps between adjacent blocks; this may happen due to the merge in indexing\n+\t\tfor (i = 1; i < n_off; ++i)\n+\t\t\tif (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;\n+\t\t{ // merge adjacent blocks\n+#if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)\n+\t\t\tfor (i = 1, l = 0; i < n_off; ++i) {\n+#ifdef BAM_TRUE_OFFSET\n+\t\t\t\tif (off[l].v + BAM_MIN_CHUNK_GAP > off[i].u) off[l].v = off[i].v;\n+#else\n+\t\t\t\tif (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;\n+#endif\n+\t\t\t\telse off[++l] = off[i];\n+\t\t\t}\n+\t\t\tn_off = l + 1;\n+#endif\n+\t\t}\n+\t\tbam_destroy1(b);\n+\t}\n+\titer->n_off = n_off; iter->off = off;\n+\treturn iter;\n+}\n+\n+pair64_t *get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, int end, int *cnt_off)\n+{ // for pysam compatibility\n+\tbam_iter_t iter;\n+\tpair64_t *off;\n+\titer = bam_iter_query(idx, tid, beg, end);\n+\toff = iter->off; *cnt_off = iter->n_off;\n+\tfree(iter);\n+\treturn off;\n+}\n+\n+void bam_iter_destroy(bam_iter_t iter)\n+{\n+\tif (iter) { free(iter->off); free(iter); }\n+}\n+\n+int bam_iter_read(bamFile fp, bam_iter_t iter, bam1_t *b)\n+{\n+\tif (iter->finished) return -1;\n+\tif (iter->from_first) {\n+\t\tint ret = bam_read1(fp, b);\n+\t\tif (ret < 0) iter->finished = 1;\n+\t\treturn ret;\n+\t}\n+\tif (iter->off == 0) return -1;\n+\tfor (;;) {\n+\t\tint ret;\n+\t\tif (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk\n+\t\t\tif (iter->i == iter->n_off - 1) break; // no more chunks\n+\t\t\tif (iter->i >= 0) assert(iter->curr_off == iter->off[iter->i].v); // otherwise bug\n+\t\t\tif (iter->i < 0 
|| iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek\n+\t\t\t\tbam_seek(fp, iter->off[iter->i+1].u, SEEK_SET);\n+\t\t\t\titer->curr_off = bam_tell(fp);\n+\t\t\t}\n+\t\t\t++iter->i;\n+\t\t}\n+\t\tif ((ret = bam_read1(fp, b)) > 0) {\n+\t\t\titer->curr_off = bam_tell(fp);\n+\t\t\tif (b->core.tid != iter->tid || b->core.pos >= iter->end) break; // no need to proceed\n+\t\t\telse if (is_overlap(iter->beg, iter->end, b)) return ret;\n+\t\t} else break; // end of file\n+\t}\n+\titer->finished = 1;\n+\treturn -1;\n+}\n+\n+int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func)\n+{\n+\tbam_iter_t iter;\n+\tbam1_t *b;\n+\tb = bam_init1();\n+\titer = bam_iter_query(idx, tid, beg, end);\n+\twhile (bam_iter_read(fp, iter, b) >= 0) func(b, data);\n+\tbam_destroy1(b);\n+\treturn 0;\n+}\n' |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/samtools/bam_lpileup.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/samtools/bam_lpileup.c Thu Sep 07 17:55:18 2017 -0400 |
[ |
@@ -0,0 +1,198 @@ +#include <stdlib.h> +#include <stdio.h> +#include <assert.h> +#include "bam.h" +#include "ksort.h" + +#define TV_GAP 2 + +typedef struct __freenode_t { + uint32_t level:28, cnt:4; + struct __freenode_t *next; +} freenode_t, *freenode_p; + +#define freenode_lt(a,b) ((a)->cnt < (b)->cnt || ((a)->cnt == (b)->cnt && (a)->level < (b)->level)) +KSORT_INIT(node, freenode_p, freenode_lt) + +/* Memory pool, similar to the one in bam_pileup.c */ +typedef struct { + int cnt, n, max; + freenode_t **buf; +} mempool_t; + +static mempool_t *mp_init() +{ + return (mempool_t*)calloc(1, sizeof(mempool_t)); +} +static void mp_destroy(mempool_t *mp) +{ + int k; + for (k = 0; k < mp->n; ++k) free(mp->buf[k]); + free(mp->buf); free(mp); +} +static inline freenode_t *mp_alloc(mempool_t *mp) +{ + ++mp->cnt; + if (mp->n == 0) return (freenode_t*)calloc(1, sizeof(freenode_t)); + else return mp->buf[--mp->n]; +} +static inline void mp_free(mempool_t *mp, freenode_t *p) +{ + --mp->cnt; p->next = 0; p->cnt = TV_GAP; + if (mp->n == mp->max) { + mp->max = mp->max? 
mp->max<<1 : 256; + mp->buf = (freenode_t**)realloc(mp->buf, sizeof(freenode_t*) * mp->max); + } + mp->buf[mp->n++] = p; +} + +/* core part */ +struct __bam_lplbuf_t { + int max, n_cur, n_pre; + int max_level, *cur_level, *pre_level; + mempool_t *mp; + freenode_t **aux, *head, *tail; + int n_nodes, m_aux; + bam_pileup_f func; + void *user_data; + bam_plbuf_t *plbuf; +}; + +void bam_lplbuf_reset(bam_lplbuf_t *buf) +{ + freenode_t *p, *q; + bam_plbuf_reset(buf->plbuf); + for (p = buf->head; p->next;) { + q = p->next; + mp_free(buf->mp, p); + p = q; + } + buf->head = buf->tail; + buf->max_level = 0; + buf->n_cur = buf->n_pre = 0; + buf->n_nodes = 0; +} + +static int tview_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data) +{ + bam_lplbuf_t *tv = (bam_lplbuf_t*)data; + freenode_t *p; + int i, l, max_level; + // allocate memory if necessary + if (tv->max < n) { // enlarge + tv->max = n; + kroundup32(tv->max); + tv->cur_level = (int*)realloc(tv->cur_level, sizeof(int) * tv->max); + tv->pre_level = (int*)realloc(tv->pre_level, sizeof(int) * tv->max); + } + tv->n_cur = n; + // update cnt + for (p = tv->head; p->next; p = p->next) + if (p->cnt > 0) --p->cnt; + // calculate cur_level[] + max_level = 0; + for (i = l = 0; i < n; ++i) { + const bam_pileup1_t *p = pl + i; + if (p->is_head) { + if (tv->head->next && tv->head->cnt == 0) { // then take a free slot + freenode_t *p = tv->head->next; + tv->cur_level[i] = tv->head->level; + mp_free(tv->mp, tv->head); + tv->head = p; + --tv->n_nodes; + } else tv->cur_level[i] = ++tv->max_level; + } else { + tv->cur_level[i] = tv->pre_level[l++]; + if (p->is_tail) { // then return a free slot + tv->tail->level = tv->cur_level[i]; + tv->tail->next = mp_alloc(tv->mp); + tv->tail = tv->tail->next; + ++tv->n_nodes; + } + } + if (tv->cur_level[i] > max_level) max_level = tv->cur_level[i]; + ((bam_pileup1_t*)p)->level = tv->cur_level[i]; + } + assert(l == tv->n_pre); + tv->func(tid, pos, n, pl, tv->user_data); + // 
sort the linked list + if (tv->n_nodes) { + freenode_t *q; + if (tv->n_nodes + 1 > tv->m_aux) { // enlarge + tv->m_aux = tv->n_nodes + 1; + kroundup32(tv->m_aux); + tv->aux = (freenode_t**)realloc(tv->aux, sizeof(void*) * tv->m_aux); + } + for (p = tv->head, i = l = 0; p->next;) { + if (p->level > max_level) { // then discard this entry + q = p->next; + mp_free(tv->mp, p); + p = q; + } else { + tv->aux[i++] = p; + p = p->next; + } + } + tv->aux[i] = tv->tail; // add a proper tail for the loop below + tv->n_nodes = i; + if (tv->n_nodes) { + ks_introsort(node, tv->n_nodes, tv->aux); + for (i = 0; i < tv->n_nodes; ++i) tv->aux[i]->next = tv->aux[i+1]; + tv->head = tv->aux[0]; + } else tv->head = tv->tail; + } + // clean up + tv->max_level = max_level; + memcpy(tv->pre_level, tv->cur_level, tv->n_cur * 4); + // squeeze out terminated levels + for (i = l = 0; i < n; ++i) { + const bam_pileup1_t *p = pl + i; + if (!p->is_tail) + tv->pre_level[l++] = tv->pre_level[i]; + } + tv->n_pre = l; +/* + fprintf(stderr, "%d\t", pos+1); + for (i = 0; i < n; ++i) { + const bam_pileup1_t *p = pl + i; + if (p->is_head) fprintf(stderr, "^"); + if (p->is_tail) fprintf(stderr, "$"); + fprintf(stderr, "%d,", p->level); + } + fprintf(stderr, "\n"); +*/ + return 0; +} + +bam_lplbuf_t *bam_lplbuf_init(bam_pileup_f func, void *data) +{ + bam_lplbuf_t *tv; + tv = (bam_lplbuf_t*)calloc(1, sizeof(bam_lplbuf_t)); + tv->mp = mp_init(); + tv->head = tv->tail = mp_alloc(tv->mp); + tv->func = func; + tv->user_data = data; + tv->plbuf = bam_plbuf_init(tview_func, tv); + return (bam_lplbuf_t*)tv; +} + +void bam_lplbuf_destroy(bam_lplbuf_t *tv) +{ + freenode_t *p, *q; + free(tv->cur_level); free(tv->pre_level); + bam_plbuf_destroy(tv->plbuf); + free(tv->aux); + for (p = tv->head; p->next;) { + q = p->next; + mp_free(tv->mp, p); p = q; + } + mp_free(tv->mp, p); + assert(tv->mp->cnt == 0); + mp_destroy(tv->mp); + free(tv); +} + +int bam_lplbuf_push(const bam1_t *b, bam_lplbuf_t *tv) +{ + return 
bam_plbuf_push(b, tv->plbuf); +} |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/samtools/bam_maqcns.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/samtools/bam_maqcns.c Thu Sep 07 17:55:18 2017 -0400 |
[ |
b'@@ -0,0 +1,614 @@\n+#include <math.h>\n+#include <assert.h>\n+#include "bam.h"\n+#include "bam_maqcns.h"\n+#include "ksort.h"\n+#include "kaln.h"\n+KSORT_INIT_GENERIC(uint32_t)\n+\n+#define INDEL_WINDOW_SIZE 50\n+#define INDEL_EXT_DEP 0.9\n+\n+typedef struct __bmc_aux_t {\n+\tint max;\n+\tuint32_t *info;\n+} bmc_aux_t;\n+\n+typedef struct {\n+\tfloat esum[4], fsum[4];\n+\tuint32_t c[4];\n+\tuint32_t rms_mapQ;\n+} glf_call_aux_t;\n+\n+char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };\n+\n+/*\n+ P(<b1,b2>) = \\theta \\sum_{i=1}^{N-1} 1/i\n+ P(D|<b1,b2>) = \\sum_{k=1}^{N-1} p_k 1/2 [(k/N)^n_2(1-k/N)^n_1 + (k/N)^n1(1-k/N)^n_2]\n+ p_k = 1/k / \\sum_{i=1}^{N-1} 1/i\n+ */\n+static void cal_het(bam_maqcns_t *aa)\n+{\n+\tint k, n1, n2;\n+\tdouble sum_harmo; // harmonic sum\n+\tdouble poly_rate;\n+\n+\tfree(aa->lhet);\n+\taa->lhet = (double*)calloc(256 * 256, sizeof(double));\n+\tsum_harmo = 0.0;\n+\tfor (k = 1; k <= aa->n_hap - 1; ++k)\n+\t\tsum_harmo += 1.0 / k;\n+\tfor (n1 = 0; n1 < 256; ++n1) {\n+\t\tfor (n2 = 0; n2 < 256; ++n2) {\n+\t\t\tlong double sum = 0.0;\n+\t\t\tdouble lC = aa->is_soap? 
0 : lgamma(n1+n2+1) - lgamma(n1+1) - lgamma(n2+1); // \\binom{n1+n2}{n1}\n+\t\t\tfor (k = 1; k <= aa->n_hap - 1; ++k) {\n+\t\t\t\tdouble pk = 1.0 / k / sum_harmo;\n+\t\t\t\tdouble log1 = log((double)k/aa->n_hap);\n+\t\t\t\tdouble log2 = log(1.0 - (double)k/aa->n_hap);\n+\t\t\t\tsum += pk * 0.5 * (expl(log1*n2) * expl(log2*n1) + expl(log1*n1) * expl(log2*n2));\n+\t\t\t}\n+\t\t\taa->lhet[n1<<8|n2] = lC + logl(sum);\n+\t\t}\n+\t}\n+\tpoly_rate = aa->het_rate * sum_harmo;\n+\taa->q_r = -4.343 * log(2.0 * poly_rate / (1.0 - poly_rate));\n+}\n+\n+/** initialize the helper structure */\n+static void cal_coef(bam_maqcns_t *aa)\n+{\n+\tint k, n, q;\n+\tlong double sum_a[257], b[256], q_c[256], tmp[256], fk2[256];\n+\tdouble *lC;\n+\n+\t// aa->lhet will be allocated and initialized \n+\tfree(aa->fk); free(aa->coef);\n+\taa->coef = 0;\n+\taa->fk = (double*)calloc(256, sizeof(double));\n+\taa->fk[0] = fk2[0] = 1.0;\n+\tfor (n = 1; n != 256; ++n) {\n+\t\taa->fk[n] = pow(aa->theta, n) * (1.0 - aa->eta) + aa->eta;\n+\t\tfk2[n] = aa->fk[n>>1]; // this is an approximation, assuming reads equally likely come from both strands\n+\t}\n+\tif (aa->is_soap) return;\n+\taa->coef = (double*)calloc(256*256*64, sizeof(double));\n+\tlC = (double*)calloc(256 * 256, sizeof(double));\n+\tfor (n = 1; n != 256; ++n)\n+\t\tfor (k = 1; k <= n; ++k)\n+\t\t\tlC[n<<8|k] = lgamma(n+1) - lgamma(k+1) - lgamma(n-k+1);\n+\tfor (q = 1; q != 64; ++q) {\n+\t\tdouble e = pow(10.0, -q/10.0);\n+\t\tdouble le = log(e);\n+\t\tdouble le1 = log(1.0-e);\n+\t\tfor (n = 1; n != 256; ++n) {\n+\t\t\tdouble *coef = aa->coef + (q<<16|n<<8);\n+\t\t\tsum_a[n+1] = 0.0;\n+\t\t\tfor (k = n; k >= 0; --k) { // a_k = \\sum_{i=k}^n C^n_k \\epsilon^k (1-\\epsilon)^{n-k}\n+\t\t\t\tsum_a[k] = sum_a[k+1] + expl(lC[n<<8|k] + k*le + (n-k)*le1);\n+\t\t\t\tb[k] = sum_a[k+1] / sum_a[k];\n+\t\t\t\tif (b[k] > 0.99) b[k] = 0.99;\n+\t\t\t}\n+\t\t\tfor (k = 0; k != n; ++k) // log(\\bar\\beta_{nk}(\\bar\\epsilon)^{f_k})\n+\t\t\t\tq_c[k] = -4.343 * 
fk2[k] * logl(b[k] / e);\n+\t\t\tfor (k = 1; k != n; ++k) q_c[k] += q_c[k-1]; // \\prod_{i=0}^k c_i\n+\t\t\tfor (k = 0; k <= n; ++k) { // powl() in 64-bit mode seems broken on my Mac OS X 10.4.9\n+\t\t\t\ttmp[k] = -4.343 * logl(1.0 - expl(fk2[k] * logl(b[k])));\n+\t\t\t\tcoef[k] = (k? q_c[k-1] : 0) + tmp[k]; // this is the final c_{nk}\n+\t\t\t}\n+\t\t}\n+\t}\n+\tfree(lC);\n+}\n+\n+bam_maqcns_t *bam_maqcns_init()\n+{\n+\tbam_maqcns_t *bm;\n+\tbm = (bam_maqcns_t*)calloc(1, sizeof(bam_maqcns_t));\n+\tbm->aux = (bmc_aux_t*)calloc(1, sizeof(bmc_aux_t));\n+\tbm->het_rate = 0.001;\n+\tbm->theta = 0.85;\n+\tbm->n_hap = 2;\n+\tbm->eta = 0.03;\n+\tbm->cap_mapQ = 60;\n+\treturn bm;\n+}\n+\n+void bam_maqcns_prepare(bam_maqcns_t *bm)\n+{\n+\tcal_coef(bm); cal_het(bm);\n+}\n+\n+void bam_maqcns_destroy(bam_maqcns_t *bm)\n+{\n+\tif (bm == 0) return;\n+\tfree(bm->lhet); free(bm->fk); free(bm->coef); free(bm->aux->info);\n+\tfree(bm->aux); free(bm);\n+}\n+\n+glf1_t *bam_maqcns_glfgen(int _n, const bam_pileup1_t *pl, uint8_t ref_base, bam_maqcns_t *bm)\n+{\n+\tglf_call_aux_t *b;\n+\tint i, j, k, w[8], c, n;\n+\tglf1_t *g = (glf1_t*)calloc(1, sizeof(glf1_t));\n+'..b'or (l = 0; l < n_acigar; ++l) fprintf(stderr, "%d%c", acigar[l]>>4, "MIDS"[acigar[l]&0xf]);\n+\t\t\t\t\t\tfprintf(stderr, "\\n");\n+\t\t\t\t\t\tfor (l = 0; l < tend - tbeg + types[i]; ++l) fputc("ACGTN"[ref2[l+tbeg-left]], stderr);\n+\t\t\t\t\t\tfputc(\'\\n\', stderr);\n+\t\t\t\t\t\tfor (l = 0; l < qend - qbeg; ++l) fputc("ACGTN"[rs[l]], stderr);\n+\t\t\t\t\t\tfputc(\'\\n\', stderr);\n+\t\t\t\t\t\t}*/\n+\t\t\t\t\tfree(acigar);\n+\t\t\t\t}\n+\t\t\t}\n+\t\t}\n+\t\t{ // get final result\n+\t\t\tint *sum, max1, max2, max1_i, max2_i;\n+\t\t\t// pick up the best two score\n+\t\t\tsum = (int*)calloc(n_types, sizeof(int));\n+\t\t\tfor (i = 0; i < n_types; ++i)\n+\t\t\t\tfor (j = 0; j < n; ++j)\n+\t\t\t\t\tsum[i] += -pscore[i*n+j];\n+\t\t\tmax1 = max2 = -0x7fffffff; max1_i = max2_i = -1;\n+\t\t\tfor (i = 0; i < n_types; ++i) 
{\n+\t\t\t\tif (sum[i] > max1) {\n+\t\t\t\t\tmax2 = max1; max2_i = max1_i; max1 = sum[i]; max1_i = i;\n+\t\t\t\t} else if (sum[i] > max2) {\n+\t\t\t\t\tmax2 = sum[i]; max2_i = i;\n+\t\t\t\t}\n+\t\t\t}\n+\t\t\tfree(sum);\n+\t\t\t// write ret\n+\t\t\tret = (bam_maqindel_ret_t*)calloc(1, sizeof(bam_maqindel_ret_t));\n+\t\t\tret->indel1 = types[max1_i]; ret->indel2 = types[max2_i];\n+\t\t\tret->s[0] = (char*)calloc(abs(ret->indel1) + 2, 1);\n+\t\t\tret->s[1] = (char*)calloc(abs(ret->indel2) + 2, 1);\n+\t\t\t// write indel sequence\n+\t\t\tif (ret->indel1 > 0) {\n+\t\t\t\tret->s[0][0] = \'+\';\n+\t\t\t\tfor (k = 0; k < ret->indel1; ++k)\n+\t\t\t\t\tret->s[0][k+1] = bam_nt16_rev_table[(int)inscns[max1_i*max_ins + k]];\n+\t\t\t} else if (ret->indel1 < 0) {\n+\t\t\t\tret->s[0][0] = \'-\';\n+\t\t\t\tfor (k = 0; k < -ret->indel1 && ref[pos + k + 1]; ++k)\n+\t\t\t\t\tret->s[0][k+1] = ref[pos + k + 1];\n+\t\t\t} else ret->s[0][0] = \'*\';\n+\t\t\tif (ret->indel2 > 0) {\n+\t\t\t\tret->s[1][0] = \'+\';\n+\t\t\t\tfor (k = 0; k < ret->indel2; ++k)\n+\t\t\t\t\tret->s[1][k+1] = bam_nt16_rev_table[(int)inscns[max2_i*max_ins + k]];\n+\t\t\t} else if (ret->indel2 < 0) {\n+\t\t\t\tret->s[1][0] = \'-\';\n+\t\t\t\tfor (k = 0; k < -ret->indel2 && ref[pos + k + 1]; ++k)\n+\t\t\t\t\tret->s[1][k+1] = ref[pos + k + 1];\n+\t\t\t} else ret->s[1][0] = \'*\';\n+\t\t\t// write count\n+\t\t\tfor (i = 0; i < n; ++i) {\n+\t\t\t\tconst bam_pileup1_t *p = pl + i;\n+\t\t\t\tif (p->indel == ret->indel1) ++ret->cnt1;\n+\t\t\t\telse if (p->indel == ret->indel2) ++ret->cnt2;\n+\t\t\t\telse ++ret->cnt_anti;\n+\t\t\t}\n+\t\t\t{ // write gl[]\n+\t\t\t\tint tmp, seq_err = 0;\n+\t\t\t\tdouble x = 1.0;\n+\t\t\t\ttmp = max1_i - max2_i;\n+\t\t\t\tif (tmp < 0) tmp = -tmp;\n+\t\t\t\tfor (j = 0; j < tmp + 1; ++j) x *= INDEL_EXT_DEP;\n+\t\t\t\tseq_err = mi->q_indel * (1.0 - x) / (1.0 - INDEL_EXT_DEP);\n+\t\t\t\tret->gl[0] = ret->gl[1] = 0;\n+\t\t\t\tfor (j = 0; j < n; ++j) {\n+\t\t\t\t\tint s1 = pscore[max1_i*n + j], s2 
= pscore[max2_i*n + j];\n+\t\t\t\t\t//fprintf(stderr, "id=%d, %d, %d, %d, %d, %d\\n", j, pl[j].b->core.pos+1, types[max1_i], types[max2_i], s1, s2);\n+\t\t\t\t\tif (s1 > s2) ret->gl[0] += s1 - s2 < seq_err? s1 - s2 : seq_err;\n+\t\t\t\t\telse ret->gl[1] += s2 - s1 < seq_err? s2 - s1 : seq_err;\n+\t\t\t\t}\n+\t\t\t}\n+\t\t\t// write cnt_ref and cnt_ambi\n+\t\t\tif (max1_i != 0 && max2_i != 0) {\n+\t\t\t\tfor (j = 0; j < n; ++j) {\n+\t\t\t\t\tint diff1 = score[j] - score[max1_i * n + j];\n+\t\t\t\t\tint diff2 = score[j] - score[max2_i * n + j];\n+\t\t\t\t\tif (diff1 > 0 && diff2 > 0) ++ret->cnt_ref;\n+\t\t\t\t\telse if (diff1 == 0 || diff2 == 0) ++ret->cnt_ambi;\n+\t\t\t\t}\n+\t\t\t}\n+\t\t}\n+\t\tfree(score); free(pscore); free(ref2); free(rs); free(inscns);\n+\t}\n+\t{ // call genotype\n+\t\tint q[3], qr_indel = (int)(-4.343 * log(mi->r_indel) + 0.5);\n+\t\tint min1, min2, min1_i;\n+\t\tq[0] = ret->gl[0] + (ret->s[0][0] != \'*\'? 0 : 0) * qr_indel;\n+\t\tq[1] = ret->gl[1] + (ret->s[1][0] != \'*\'? 0 : 0) * qr_indel;\n+\t\tq[2] = n * 3 + (ret->s[0][0] == \'*\' || ret->s[1][0] == \'*\'? 1 : 1) * qr_indel;\n+\t\tmin1 = min2 = 0x7fffffff; min1_i = -1;\n+\t\tfor (i = 0; i < 3; ++i) {\n+\t\t\tif (q[i] < min1) {\n+\t\t\t\tmin2 = min1; min1 = q[i]; min1_i = i;\n+\t\t\t} else if (q[i] < min2) min2 = q[i];\n+\t\t}\n+\t\tret->gt = min1_i;\n+\t\tret->q_cns = min2 - min1;\n+\t\t// set q_ref\n+\t\tif (ret->gt < 2) ret->q_ref = (ret->s[ret->gt][0] == \'*\')? 0 : q[1-ret->gt] - q[ret->gt] - qr_indel - 3;\n+\t\telse ret->q_ref = (ret->s[0][0] == \'*\')? q[0] - q[2] : q[1] - q[2];\n+\t\tif (ret->q_ref < 0) ret->q_ref = 0;\n+\t}\n+\tfree(types);\n+\treturn ret;\n+}\n' |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/samtools/bam_maqcns.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/samtools/bam_maqcns.h Thu Sep 07 17:55:18 2017 -0400 |
[ |
@@ -0,0 +1,57 @@ +#ifndef BAM_MAQCNS_H +#define BAM_MAQCNS_H + +#include "glf.h" + +struct __bmc_aux_t; + +typedef struct { + float het_rate, theta; + int n_hap, cap_mapQ, is_soap; + + float eta, q_r; + double *fk, *coef; + double *lhet; + struct __bmc_aux_t *aux; +} bam_maqcns_t; + +typedef struct { + int q_indel; // indel sequencing error, phred scaled + float r_indel; // indel prior + float r_snp; // snp prior + // hidden parameters, unchangeable from command line + int mm_penalty, indel_err, ambi_thres; +} bam_maqindel_opt_t; + +typedef struct { + int indel1, indel2; + int cnt1, cnt2, cnt_anti; + int cnt_ref, cnt_ambi; + char *s[2]; + // + int gt, gl[2]; + int q_cns, q_ref; +} bam_maqindel_ret_t; + +#ifdef __cplusplus +extern "C" { +#endif + + bam_maqcns_t *bam_maqcns_init(); + void bam_maqcns_prepare(bam_maqcns_t *bm); + void bam_maqcns_destroy(bam_maqcns_t *bm); + glf1_t *bam_maqcns_glfgen(int n, const bam_pileup1_t *pl, uint8_t ref_base, bam_maqcns_t *bm); + uint32_t bam_maqcns_call(int n, const bam_pileup1_t *pl, bam_maqcns_t *bm); + // return: cns<<28 | cns2<<24 | mapQ<<16 | cnsQ<<8 | cnsQ2 + uint32_t glf2cns(const glf1_t *g, int q_r); + + bam_maqindel_opt_t *bam_maqindel_opt_init(); + bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, const bam_pileup1_t *pl, const char *ref, + int _n_types, int *_types); + void bam_maqindel_ret_destroy(bam_maqindel_ret_t*); + +#ifdef __cplusplus +} +#endif + +#endif |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/samtools/bam_mate.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/samtools/bam_mate.c Thu Sep 07 17:55:18 2017 -0400 |
[ |
@@ -0,0 +1,70 @@ +#include <stdlib.h> +#include <string.h> +#include "bam.h" + +// currently, this function ONLY works if each read has one hit +void bam_mating_core(bamFile in, bamFile out) +{ + bam_header_t *header; + bam1_t *b[2]; + int curr, has_prev; + + header = bam_header_read(in); + bam_header_write(out, header); + + b[0] = bam_init1(); + b[1] = bam_init1(); + curr = 0; has_prev = 0; + while (bam_read1(in, b[curr]) >= 0) { + bam1_t *cur = b[curr], *pre = b[1-curr]; + if (has_prev) { + if (strcmp(bam1_qname(cur), bam1_qname(pre)) == 0) { // identical pair name + cur->core.mtid = pre->core.tid; cur->core.mpos = pre->core.pos; + pre->core.mtid = cur->core.tid; pre->core.mpos = cur->core.pos; + if (pre->core.tid == cur->core.tid && !(cur->core.flag&(BAM_FUNMAP|BAM_FMUNMAP)) + && !(pre->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))) + { + uint32_t cur5, pre5; + cur5 = (cur->core.flag&BAM_FREVERSE)? bam_calend(&cur->core, bam1_cigar(cur)) : cur->core.pos; + pre5 = (pre->core.flag&BAM_FREVERSE)? 
bam_calend(&pre->core, bam1_cigar(pre)) : pre->core.pos; + cur->core.isize = pre5 - cur5; pre->core.isize = cur5 - pre5; + } else cur->core.isize = pre->core.isize = 0; + if (pre->core.flag&BAM_FREVERSE) cur->core.flag |= BAM_FMREVERSE; + else cur->core.flag &= ~BAM_FMREVERSE; + if (cur->core.flag&BAM_FREVERSE) pre->core.flag |= BAM_FMREVERSE; + else pre->core.flag &= ~BAM_FMREVERSE; + if (cur->core.flag & BAM_FUNMAP) { pre->core.flag |= BAM_FMUNMAP; pre->core.flag &= ~BAM_FPROPER_PAIR; } + if (pre->core.flag & BAM_FUNMAP) { cur->core.flag |= BAM_FMUNMAP; cur->core.flag &= ~BAM_FPROPER_PAIR; } + bam_write1(out, pre); + bam_write1(out, cur); + has_prev = 0; + } else { // unpaired or singleton + pre->core.mtid = -1; pre->core.mpos = -1; pre->core.isize = 0; + if (pre->core.flag & BAM_FPAIRED) { + pre->core.flag |= BAM_FMUNMAP; + pre->core.flag &= ~BAM_FMREVERSE & ~BAM_FPROPER_PAIR; + } + bam_write1(out, pre); + } + } else has_prev = 1; + curr = 1 - curr; + } + if (has_prev) bam_write1(out, b[1-curr]); + bam_header_destroy(header); + bam_destroy1(b[0]); + bam_destroy1(b[1]); +} + +int bam_mating(int argc, char *argv[]) +{ + bamFile in, out; + if (argc < 3) { + fprintf(stderr, "samtools fixmate <in.nameSrt.bam> <out.nameSrt.bam>\n"); + return 1; + } + in = (strcmp(argv[1], "-") == 0)? bam_dopen(fileno(stdin), "r") : bam_open(argv[1], "r"); + out = (strcmp(argv[2], "-") == 0)? bam_dopen(fileno(stdout), "w") : bam_open(argv[2], "w"); + bam_mating_core(in, out); + bam_close(in); bam_close(out); + return 0; +} |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/samtools/bam_md.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/samtools/bam_md.c Thu Sep 07 17:55:18 2017 -0400 |
[ |
@@ -0,0 +1,175 @@ +#include <unistd.h> +#include <assert.h> +#include <string.h> +#include <ctype.h> +#include "faidx.h" +#include "sam.h" +#include "kstring.h" + +void bam_fillmd1_core(bam1_t *b, char *ref, int is_equal, int max_nm) +{ + uint8_t *seq = bam1_seq(b); + uint32_t *cigar = bam1_cigar(b); + bam1_core_t *c = &b->core; + int i, x, y, u = 0; + kstring_t *str; + uint8_t *old_md, *old_nm; + int32_t old_nm_i = -1, nm = 0; + + str = (kstring_t*)calloc(1, sizeof(kstring_t)); + for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) { + int j, l = cigar[i]>>4, op = cigar[i]&0xf; + if (op == BAM_CMATCH) { + for (j = 0; j < l; ++j) { + int z = y + j; + int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]]; + if (ref[x+j] == 0) break; // out of boundary + if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match + if (is_equal) seq[z/2] &= (z&1)? 0xf0 : 0x0f; + ++u; + } else { + ksprintf(str, "%d", u); + kputc(ref[x+j], str); + u = 0; ++nm; + } + } + if (j < l) break; + x += l; y += l; + } else if (op == BAM_CDEL) { + ksprintf(str, "%d", u); + kputc('^', str); + for (j = 0; j < l; ++j) { + if (ref[x+j] == 0) break; + kputc(ref[x+j], str); + } + u = 0; + if (j < l) break; + x += l; nm += l; + } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) { + y += l; + if (op == BAM_CINS) nm += l; + } else if (op == BAM_CREF_SKIP) { + x += l; + } + } + ksprintf(str, "%d", u); + // apply max_nm + if (max_nm > 0 && nm >= max_nm) { + for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) { + int j, l = cigar[i]>>4, op = cigar[i]&0xf; + if (op == BAM_CMATCH) { + for (j = 0; j < l; ++j) { + int z = y + j; + int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]]; + if (ref[x+j] == 0) break; // out of boundary + if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match + seq[z/2] |= (z&1)? 
0x0f : 0xf0; + bam1_qual(b)[z] = 0; + } + } + if (j < l) break; + x += l; y += l; + } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l; + else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l; + } + } + // update NM + old_nm = bam_aux_get(b, "NM"); + if (c->flag & BAM_FUNMAP) return; + if (old_nm) old_nm_i = bam_aux2i(old_nm); + if (!old_nm) bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm); + else if (nm != old_nm_i) { + fprintf(stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam1_qname(b), old_nm_i, nm); + bam_aux_del(b, old_nm); + bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm); + } + // update MD + old_md = bam_aux_get(b, "MD"); + if (!old_md) bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s); + else { + int is_diff = 0; + if (strlen((char*)old_md+1) == str->l) { + for (i = 0; i < str->l; ++i) + if (toupper(old_md[i+1]) != toupper(str->s[i])) + break; + if (i < str->l) is_diff = 1; + } else is_diff = 1; + if (is_diff) { + fprintf(stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam1_qname(b), old_md+1, str->s); + bam_aux_del(b, old_md); + bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s); + } + } + free(str->s); free(str); +} + +void bam_fillmd1(bam1_t *b, char *ref, int is_equal) +{ + bam_fillmd1_core(b, ref, is_equal, 0); +} + +int bam_fillmd(int argc, char *argv[]) +{ + int c, is_equal = 0, tid = -2, ret, len, is_bam_out, is_sam_in, is_uncompressed, max_nm = 0; + samfile_t *fp, *fpout = 0; + faidx_t *fai; + char *ref = 0, mode_w[8], mode_r[8]; + bam1_t *b; + + is_bam_out = is_sam_in = is_uncompressed = 0; + mode_w[0] = mode_r[0] = 0; + strcpy(mode_r, "r"); strcpy(mode_w, "w"); + while ((c = getopt(argc, argv, "eubSn:")) >= 0) { + switch (c) { + case 'e': is_equal = 1; break; + case 'b': is_bam_out = 1; break; + case 'u': is_uncompressed = is_bam_out = 1; break; + case 'S': is_sam_in = 1; break; + case 'n': max_nm = atoi(optarg); break; + default: fprintf(stderr, "[bam_fillmd] unrecognized option 
'-%c'\n", c); return 1; + } + } + if (!is_sam_in) strcat(mode_r, "b"); + if (is_bam_out) strcat(mode_w, "b"); + else strcat(mode_w, "h"); + if (is_uncompressed) strcat(mode_w, "u"); + if (optind + 1 >= argc) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: samtools fillmd [-eubS] <aln.bam> <ref.fasta>\n\n"); + fprintf(stderr, "Options: -e change identical bases to '='\n"); + fprintf(stderr, " -u uncompressed BAM output (for piping)\n"); + fprintf(stderr, " -b compressed BAM output\n"); + fprintf(stderr, " -S the input is SAM with header\n\n"); + return 1; + } + fp = samopen(argv[optind], mode_r, 0); + if (fp == 0) return 1; + if (is_sam_in && (fp->header == 0 || fp->header->n_targets == 0)) { + fprintf(stderr, "[bam_fillmd] input SAM does not have header. Abort!\n"); + return 1; + } + fpout = samopen("-", mode_w, fp->header); + fai = fai_load(argv[optind+1]); + + b = bam_init1(); + while ((ret = samread(fp, b)) >= 0) { + if (b->core.tid >= 0) { + if (tid != b->core.tid) { + free(ref); + ref = fai_fetch(fai, fp->header->target_name[b->core.tid], &len); + tid = b->core.tid; + if (ref == 0) + fprintf(stderr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n", + fp->header->target_name[tid]); + } + if (ref) bam_fillmd1_core(b, ref, is_equal, max_nm); + } + samwrite(fpout, b); + } + bam_destroy1(b); + + free(ref); + fai_destroy(fai); + samclose(fp); samclose(fpout); + return 0; +} |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/samtools/bam_pileup.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/samtools/bam_pileup.c Thu Sep 07 17:55:18 2017 -0400 |
[ |
b'@@ -0,0 +1,396 @@\n+#include <stdio.h>\n+#include <stdlib.h>\n+#include <ctype.h>\n+#include <assert.h>\n+#include "sam.h"\n+\n+typedef struct __linkbuf_t {\n+\tbam1_t b;\n+\tuint32_t beg, end;\n+\tstruct __linkbuf_t *next;\n+} lbnode_t;\n+\n+/* --- BEGIN: Memory pool */\n+\n+typedef struct {\n+\tint cnt, n, max;\n+\tlbnode_t **buf;\n+} mempool_t;\n+\n+static mempool_t *mp_init()\n+{\n+\tmempool_t *mp;\n+\tmp = (mempool_t*)calloc(1, sizeof(mempool_t));\n+\treturn mp;\n+}\n+static void mp_destroy(mempool_t *mp)\n+{\n+\tint k;\n+\tfor (k = 0; k < mp->n; ++k) {\n+\t\tfree(mp->buf[k]->b.data);\n+\t\tfree(mp->buf[k]);\n+\t}\n+\tfree(mp->buf);\n+\tfree(mp);\n+}\n+static inline lbnode_t *mp_alloc(mempool_t *mp)\n+{\n+\t++mp->cnt;\n+\tif (mp->n == 0) return (lbnode_t*)calloc(1, sizeof(lbnode_t));\n+\telse return mp->buf[--mp->n];\n+}\n+static inline void mp_free(mempool_t *mp, lbnode_t *p)\n+{\n+\t--mp->cnt; p->next = 0; // clear lbnode_t::next here\n+\tif (mp->n == mp->max) {\n+\t\tmp->max = mp->max? 
mp->max<<1 : 256;\n+\t\tmp->buf = (lbnode_t**)realloc(mp->buf, sizeof(lbnode_t*) * mp->max);\n+\t}\n+\tmp->buf[mp->n++] = p;\n+}\n+\n+/* --- END: Memory pool */\n+\n+/* --- BEGIN: Auxiliary functions */\n+\n+static inline int resolve_cigar(bam_pileup1_t *p, uint32_t pos)\n+{\n+\tunsigned k;\n+\tbam1_t *b = p->b;\n+\tbam1_core_t *c = &b->core;\n+\tuint32_t x = c->pos, y = 0;\n+\tint ret = 1, is_restart = 1;\n+\n+\tif (c->flag&BAM_FUNMAP) return 0; // unmapped read\n+\tassert(x <= pos); // otherwise a bug\n+\tp->qpos = -1; p->indel = 0; p->is_del = p->is_head = p->is_tail = 0;\n+\tfor (k = 0; k < c->n_cigar; ++k) {\n+\t\tint op = bam1_cigar(b)[k] & BAM_CIGAR_MASK; // operation\n+\t\tint l = bam1_cigar(b)[k] >> BAM_CIGAR_SHIFT; // length\n+\t\tif (op == BAM_CMATCH) { // NOTE: this assumes the first and the last operation MUST BE a match or a clip\n+\t\t\tif (x + l > pos) { // overlap with pos\n+\t\t\t\tp->indel = p->is_del = 0;\n+\t\t\t\tp->qpos = y + (pos - x);\n+\t\t\t\tif (x == pos && is_restart) p->is_head = 1;\n+\t\t\t\tif (x + l - 1 == pos) { // come to the end of a match\n+\t\t\t\t\tint has_next_match = 0;\n+\t\t\t\t\tunsigned i;\n+\t\t\t\t\tfor (i = k + 1; i < c->n_cigar; ++i) {\n+\t\t\t\t\t\tuint32_t cigar = bam1_cigar(b)[i];\n+\t\t\t\t\t\tint opi = cigar&BAM_CIGAR_MASK;\n+\t\t\t\t\t\tif (opi == BAM_CMATCH) {\n+\t\t\t\t\t\t\thas_next_match = 1;\n+\t\t\t\t\t\t\tbreak;\n+\t\t\t\t\t\t} else if (opi == BAM_CSOFT_CLIP || opi == BAM_CREF_SKIP || opi == BAM_CHARD_CLIP) break;\n+\t\t\t\t\t}\n+\t\t\t\t\tif (!has_next_match) p->is_tail = 1;\n+\t\t\t\t\tif (k < c->n_cigar - 1 && has_next_match) { // there are additional operation(s)\n+\t\t\t\t\t\tuint32_t cigar = bam1_cigar(b)[k+1]; // next CIGAR\n+\t\t\t\t\t\tint op_next = cigar&BAM_CIGAR_MASK; // next CIGAR operation\n+\t\t\t\t\t\tif (op_next == BAM_CDEL) p->indel = -(int32_t)(cigar>>BAM_CIGAR_SHIFT); // del\n+\t\t\t\t\t\telse if (op_next == BAM_CINS) p->indel = cigar>>BAM_CIGAR_SHIFT; // ins\n+\t\t\t\t\t\telse if 
(op_next == BAM_CPAD && k + 2 < c->n_cigar) { // no working for adjacent padding\n+\t\t\t\t\t\t\tcigar = bam1_cigar(b)[k+2]; op_next = cigar&BAM_CIGAR_MASK;\n+\t\t\t\t\t\t\tif (op_next == BAM_CDEL) p->indel = -(int32_t)(cigar>>BAM_CIGAR_SHIFT); // del\n+\t\t\t\t\t\t\telse if (op_next == BAM_CINS) p->indel = cigar>>BAM_CIGAR_SHIFT; // ins\n+\t\t\t\t\t\t}\n+\t\t\t\t\t}\n+\t\t\t\t}\n+\t\t\t}\n+\t\t\tx += l; y += l;\n+\t\t} else if (op == BAM_CDEL) { // then set ->is_del\n+\t\t\tif (x + l > pos) {\n+\t\t\t\tp->indel = 0; p->is_del = 1;\n+\t\t\t\tp->qpos = y + (pos - x);\n+\t\t\t}\n+\t\t\tx += l;\n+\t\t} else if (op == BAM_CREF_SKIP) x += l;\n+\t\telse if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l;\n+\t\tif (is_restart) is_restart ^= (op == BAM_CMATCH);\n+\t\telse is_restart ^= (op == BAM_CREF_SKIP || op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP);\n+\t\tif (x > pos) {\n+\t\t\tif (op == BAM_CREF_SKIP) ret = 0; // then do not put it into pileup at all\n+\t\t\tbreak;\n+\t\t}\n+\t}\n+\tassert(x > pos); // otherwise a bug\n+\treturn ret;\n+}\n+\n+/* --- END: Auxiliary functions */\n+\n+/*******************\n+ * pileup iterator *\n+ *******************/\n+\n+struct __bam_plp_t {\n+\tmempool_t *mp;\n+\tlbnode_t *head, *tail, *dummy;\n+\tint32_t tid, pos, max_tid, max_pos;\n+\tint is_eof, flag_mask, max_plp, error;\n+\tbam_pileup1_t *plp;\n+\t// for the "auto"'..b're.tid > iter->tid) {\n+\t\t\titer->tail->next = mp_alloc(iter->mp);\n+\t\t\titer->tail = iter->tail->next;\n+\t\t}\n+\t} else iter->is_eof = 1;\n+\treturn 0;\n+}\n+\n+const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)\n+{\n+\tconst bam_pileup1_t *plp;\n+\tif (iter->func == 0 || iter->error) { *_n_plp = -1; return 0; }\n+\tif ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp;\n+\telse {\n+\t\t*_n_plp = 0;\n+\t\tif (iter->is_eof) return 0;\n+\t\twhile (iter->func(iter->data, iter->b) >= 0) {\n+\t\t\tif (bam_plp_push(iter, iter->b) < 0) {\n+\t\t\t\t*_n_plp = 
-1;\n+\t\t\t\treturn 0;\n+\t\t\t}\n+\t\t\tif ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp;\n+\t\t}\n+\t\tbam_plp_push(iter, 0);\n+\t\tif ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp;\n+\t\treturn 0;\n+\t}\n+}\n+\n+void bam_plp_reset(bam_plp_t iter)\n+{\n+\tlbnode_t *p, *q;\n+\titer->max_tid = iter->max_pos = -1;\n+\titer->tid = iter->pos = 0;\n+\titer->is_eof = 0;\n+\tfor (p = iter->head; p->next;) {\n+\t\tq = p->next;\n+\t\tmp_free(iter->mp, p);\n+\t\tp = q;\n+\t}\n+\titer->head = iter->tail;\n+}\n+\n+void bam_plp_set_mask(bam_plp_t iter, int mask)\n+{\n+\titer->flag_mask = mask < 0? BAM_DEF_MASK : (BAM_FUNMAP | mask);\n+}\n+\n+/*****************\n+ * callback APIs *\n+ *****************/\n+\n+int bam_pileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data)\n+{\n+\tbam_plbuf_t *buf;\n+\tint ret;\n+\tbam1_t *b;\n+\tb = bam_init1();\n+\tbuf = bam_plbuf_init(func, func_data);\n+\tbam_plbuf_set_mask(buf, mask);\n+\twhile ((ret = bam_read1(fp, b)) >= 0)\n+\t\tbam_plbuf_push(b, buf);\n+\tbam_plbuf_push(0, buf);\n+\tbam_plbuf_destroy(buf);\n+\tbam_destroy1(b);\n+\treturn 0;\n+}\n+\n+void bam_plbuf_set_mask(bam_plbuf_t *buf, int mask)\n+{\n+\tbam_plp_set_mask(buf->iter, mask);\n+}\n+\n+void bam_plbuf_reset(bam_plbuf_t *buf)\n+{\n+\tbam_plp_reset(buf->iter);\n+}\n+\n+bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data)\n+{\n+\tbam_plbuf_t *buf;\n+\tbuf = calloc(1, sizeof(bam_plbuf_t));\n+\tbuf->iter = bam_plp_init(0, 0);\n+\tbuf->func = func;\n+\tbuf->data = data;\n+\treturn buf;\n+}\n+\n+void bam_plbuf_destroy(bam_plbuf_t *buf)\n+{\n+\tbam_plp_destroy(buf->iter);\n+\tfree(buf);\n+}\n+\n+int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf)\n+{\n+\tint ret, n_plp, tid, pos;\n+\tconst bam_pileup1_t *plp;\n+\tret = bam_plp_push(buf->iter, b);\n+\tif (ret < 0) return ret;\n+\twhile ((plp = bam_plp_next(buf->iter, &tid, &pos, &n_plp)) != 0)\n+\t\tbuf->func(tid, pos, n_plp, plp, buf->data);\n+\treturn 
0;\n+}\n+\n+/***********\n+ * mpileup *\n+ ***********/\n+\n+struct __bam_mplp_t {\n+\tint n;\n+\tuint64_t min, *pos;\n+\tbam_plp_t *iter;\n+\tint *n_plp;\n+\tconst bam_pileup1_t **plp;\n+};\n+\n+bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data)\n+{\n+\tint i;\n+\tbam_mplp_t iter;\n+\titer = calloc(1, sizeof(struct __bam_mplp_t));\n+\titer->pos = calloc(n, 8);\n+\titer->n_plp = calloc(n, sizeof(int));\n+\titer->plp = calloc(n, sizeof(void*));\n+\titer->iter = calloc(n, sizeof(void*));\n+\titer->n = n;\n+\titer->min = (uint64_t)-1;\n+\tfor (i = 0; i < n; ++i) {\n+\t\titer->iter[i] = bam_plp_init(func, data[i]);\n+\t\titer->pos[i] = iter->min;\n+\t}\n+\treturn iter;\n+}\n+\n+void bam_mplp_destroy(bam_mplp_t iter)\n+{\n+\tint i;\n+\tfor (i = 0; i < iter->n; ++i) bam_plp_destroy(iter->iter[i]);\n+\tfree(iter->iter); free(iter->pos); free(iter->n_plp); free(iter->plp);\n+\tfree(iter);\n+}\n+\n+int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp)\n+{\n+\tint i, ret = 0;\n+\tuint64_t new_min = (uint64_t)-1;\n+\tfor (i = 0; i < iter->n; ++i) {\n+\t\tif (iter->pos[i] == iter->min) {\n+\t\t\tint tid, pos;\n+\t\t\titer->plp[i] = bam_plp_auto(iter->iter[i], &tid, &pos, &iter->n_plp[i]);\n+\t\t\titer->pos[i] = (uint64_t)tid<<32 | pos;\n+\t\t}\n+\t\tif (iter->plp[i] && iter->pos[i] < new_min) new_min = iter->pos[i];\n+\t}\n+\titer->min = new_min;\n+\tif (new_min == (uint64_t)-1) return 0;\n+\t*_tid = new_min>>32; *_pos = (uint32_t)new_min;\n+\tfor (i = 0; i < iter->n; ++i) {\n+\t\tif (iter->pos[i] == iter->min) {\n+\t\t\tn_plp[i] = iter->n_plp[i], plp[i] = iter->plp[i];\n+\t\t\t++ret;\n+\t\t} else n_plp[i] = 0, plp[i] = 0;\n+\t}\n+\treturn ret;\n+}\n' |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/samtools/bam_plcmd.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/samtools/bam_plcmd.c Thu Sep 07 17:55:18 2017 -0400 |
[ |
b'@@ -0,0 +1,570 @@\n+#include <math.h>\n+#include <stdio.h>\n+#include <unistd.h>\n+#include <ctype.h>\n+#include "sam.h"\n+#include "faidx.h"\n+#include "bam_maqcns.h"\n+#include "khash.h"\n+#include "glf.h"\n+#include "kstring.h"\n+\n+typedef int *indel_list_t;\n+KHASH_MAP_INIT_INT64(64, indel_list_t)\n+\n+#define BAM_PLF_SIMPLE 0x01\n+#define BAM_PLF_CNS 0x02\n+#define BAM_PLF_INDEL_ONLY 0x04\n+#define BAM_PLF_GLF 0x08\n+#define BAM_PLF_VAR_ONLY 0x10\n+#define BAM_PLF_2ND 0x20\n+#define BAM_PLF_RANBASE 0x40\n+#define BAM_PLF_1STBASE 0x80\n+#define BAM_PLF_ALLBASE 0x100\n+#define BAM_PLF_READPOS 0x200\n+\n+typedef struct {\n+\tbam_header_t *h;\n+\tbam_maqcns_t *c;\n+\tbam_maqindel_opt_t *ido;\n+\tfaidx_t *fai;\n+\tkhash_t(64) *hash;\n+\tuint32_t format;\n+\tint tid, len, last_pos;\n+\tint mask;\n+ int max_depth; // for indel calling, ignore reads with the depth too high. 0 for unlimited\n+\tchar *ref;\n+\tglfFile fp_glf; // for glf output only\n+} pu_data_t;\n+\n+char **__bam_get_lines(const char *fn, int *_n);\n+void bam_init_header_hash(bam_header_t *header);\n+int32_t bam_get_tid(const bam_header_t *header, const char *seq_name);\n+\n+static khash_t(64) *load_pos(const char *fn, bam_header_t *h)\n+{\n+\tchar **list;\n+\tint i, j, n, *fields, max_fields;\n+\tkhash_t(64) *hash;\n+\tbam_init_header_hash(h);\n+\tlist = __bam_get_lines(fn, &n);\n+\thash = kh_init(64);\n+\tmax_fields = 0; fields = 0;\n+\tfor (i = 0; i < n; ++i) {\n+\t\tchar *str = list[i];\n+\t\tint chr, n_fields, ret;\n+\t\tkhint_t k;\n+\t\tuint64_t x;\n+\t\tn_fields = ksplit_core(str, 0, &max_fields, &fields);\n+\t\tif (n_fields < 2) continue;\n+\t\tchr = bam_get_tid(h, str + fields[0]);\n+\t\tif (chr < 0) {\n+\t\t\tfprintf(stderr, "[load_pos] unknown reference sequence name: %s\\n", str + fields[0]);\n+\t\t\tcontinue;\n+\t\t}\n+\t\tx = (uint64_t)chr << 32 | (atoi(str + fields[1]) - 1);\n+\t\tk = kh_put(64, hash, x, &ret);\n+\t\tif (ret == 0) {\n+\t\t\tfprintf(stderr, "[load_pos] position %s:%s 
has been loaded.\\n", str+fields[0], str+fields[1]);\n+\t\t\tcontinue;\n+\t\t}\n+\t\tkh_val(hash, k) = 0;\n+\t\tif (n_fields > 2) {\n+\t\t\t// count\n+\t\t\tfor (j = 2; j < n_fields; ++j) {\n+\t\t\t\tchar *s = str + fields[j];\n+\t\t\t\tif ((*s != \'+\' && *s != \'-\') || !isdigit(s[1])) break;\n+ \t\t\t}\n+\t\t\tif (j > 2) { // update kh_val()\n+\t\t\t\tint *q, y, z;\n+\t\t\t\tq = kh_val(hash, k) = (int*)calloc(j - 1, sizeof(int));\n+\t\t\t\tq[0] = j - 2; z = j; y = 1;\n+\t\t\t\tfor (j = 2; j < z; ++j)\n+\t\t\t\t\tq[y++] = atoi(str + fields[j]);\n+\t\t\t}\n+\t\t}\n+\t\tfree(str);\n+\t}\n+\tfree(list); free(fields);\n+\treturn hash;\n+}\n+\n+// an analogy to pileup_func() below\n+static int glt3_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pu, void *data)\n+{\n+\tpu_data_t *d = (pu_data_t*)data;\n+\tbam_maqindel_ret_t *r = 0;\n+\tint rb, *proposed_indels = 0;\n+\tglf1_t *g;\n+\tglf3_t *g3;\n+\n+\tif (d->fai == 0) {\n+\t\tfprintf(stderr, "[glt3_func] reference sequence is required for generating GLT. Abort!\\n");\n+\t\texit(1);\n+\t}\n+\tif (d->hash) { // only output a list of sites\n+\t\tkhint_t k = kh_get(64, d->hash, (uint64_t)tid<<32|pos);\n+\t\tif (k == kh_end(d->hash)) return 0;\n+\t\tproposed_indels = kh_val(d->hash, k);\n+\t}\n+\tg3 = glf3_init1();\n+\tif (d->fai && (int)tid != d->tid) {\n+\t\tif (d->ref) { // then write the end mark\n+\t\t\tg3->rtype = GLF3_RTYPE_END;\n+\t\t\tglf3_write1(d->fp_glf, g3);\n+\t\t}\n+\t\tglf3_ref_write(d->fp_glf, d->h->target_name[tid], d->h->target_len[tid]); // write reference\n+\t\tfree(d->ref);\n+\t\td->ref = fai_fetch(d->fai, d->h->target_name[tid], &d->len);\n+\t\td->tid = tid;\n+\t\td->last_pos = 0;\n+\t}\n+\trb = (d->ref && (int)pos < d->len)? 
d->ref[pos] : \'N\';\n+\tg = bam_maqcns_glfgen(n, pu, bam_nt16_table[rb], d->c);\n+\tmemcpy(g3, g, sizeof(glf1_t));\n+\tg3->rtype = GLF3_RTYPE_SUB;\n+\tg3->offset = pos - d->last_pos;\n+\td->last_pos = pos;\n+\tglf3_write1(d->fp_glf, g3);\n+ if (pos < d->len) {\n+ int m = (!d->max_depth || d->max_depth>n) ? n : d->max_depth;\n+\t\tif (proposed_indels)\n+\t\t\tr = bam_maqindel(m, pos, d->ido, pu, d->ref, proposed_indels[0], proposed_indels+1);\n+\t\telse r = bam_maqindel(m, pos, d->ido, pu, d->ref, 0, 0);\n+\t}\n+\tif (r) { '..b'"[bam_pileup] fail to read the header: non-exisiting file or wrong format.\\n");\n+\t\t\treturn 1;\n+\t\t}\n+\t\td->h = fp->header;\n+\t\tif (fn_pos) d->hash = load_pos(fn_pos, d->h);\n+\t\tsampileup(fp, d->mask, pileup_func, d);\n+\t\tsamclose(fp); // d->h will be destroyed here\n+\t}\n+\n+\t// free\n+\tif (d->format & BAM_PLF_GLF) bgzf_close(d->fp_glf);\n+\tif (fn_pos) { // free the hash table\n+\t\tkhint_t k;\n+\t\tfor (k = kh_begin(d->hash); k < kh_end(d->hash); ++k)\n+\t\t\tif (kh_exist(d->hash, k)) free(kh_val(d->hash, k));\n+\t\tkh_destroy(64, d->hash);\n+\t}\n+\tfree(fn_pos); free(fn_list); free(fn_fa);\n+\tif (d->fai) fai_destroy(d->fai);\n+\tbam_maqcns_destroy(d->c);\n+\tfree(d->ido); free(d->ref); free(d);\n+\treturn 0;\n+}\n+\n+/***********\n+ * mpileup *\n+ ***********/\n+\n+typedef struct {\n+\tchar *reg;\n+\tfaidx_t *fai;\n+} mplp_conf_t;\n+\n+typedef struct {\n+\tbamFile fp;\n+\tbam_iter_t iter;\n+} mplp_aux_t;\n+\n+static int mplp_func(void *data, bam1_t *b)\n+{\n+\tmplp_aux_t *ma = (mplp_aux_t*)data;\n+\tif (ma->iter) return bam_iter_read(ma->fp, ma->iter, b);\n+\treturn bam_read1(ma->fp, b);\n+}\n+\n+static int mpileup(mplp_conf_t *conf, int n, char **fn)\n+{\n+\tmplp_aux_t **data;\n+\tint i, tid, pos, *n_plp, beg0 = 0, end0 = 1u<<29, ref_len, ref_tid;\n+\tconst bam_pileup1_t **plp;\n+\tbam_mplp_t iter;\n+\tbam_header_t *h = 0;\n+\tchar *ref;\n+\t// allocate\n+\tdata = calloc(n, sizeof(void*));\n+\tplp = calloc(n, 
sizeof(void*));\n+\tn_plp = calloc(n, sizeof(int*));\n+\t// read the header and initialize data\n+\tfor (i = 0; i < n; ++i) {\n+\t\tbam_header_t *h_tmp;\n+\t\tdata[i] = calloc(1, sizeof(mplp_aux_t));\n+\t\tdata[i]->fp = bam_open(fn[i], "r");\n+\t\th_tmp = bam_header_read(data[i]->fp);\n+\t\tif (conf->reg) {\n+\t\t\tint beg, end;\n+\t\t\tbam_index_t *idx;\n+\t\t\tidx = bam_index_load(fn[i]);\n+\t\t\tif (idx == 0) {\n+\t\t\t\tfprintf(stderr, "[%s] fail to load index for %d-th input.\\n", __func__, i+1);\n+\t\t\t\texit(1);\n+\t\t\t}\n+\t\t\tif (bam_parse_region(h_tmp, conf->reg, &tid, &beg, &end) < 0) {\n+\t\t\t\tfprintf(stderr, "[%s] malformatted region or wrong seqname for %d-th input.\\n", __func__, i+1);\n+\t\t\t\texit(1);\n+\t\t\t}\n+\t\t\tif (i == 0) beg0 = beg, end0 = end;\n+\t\t\tdata[i]->iter = bam_iter_query(idx, tid, beg, end);\n+\t\t\tbam_index_destroy(idx);\n+\t\t}\n+\t\tif (i == 0) h = h_tmp;\n+\t\telse {\n+\t\t\t// FIXME: to check consistency\n+\t\t\tbam_header_destroy(h_tmp);\n+\t\t}\n+\t}\n+\t// mpileup\n+\tref_tid = -1; ref = 0;\n+\titer = bam_mplp_init(n, mplp_func, (void**)data);\n+\twhile (bam_mplp_auto(iter, &tid, &pos, n_plp, plp) > 0) {\n+\t\tif (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested\n+\t\tif (tid != ref_tid) {\n+\t\t\tfree(ref);\n+\t\t\tif (conf->fai) ref = fai_fetch(conf->fai, h->target_name[tid], &ref_len);\n+\t\t\tref_tid = tid;\n+\t\t}\n+\t\tprintf("%s\\t%d\\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? 
ref[pos] : \'N\');\n+\t\tfor (i = 0; i < n; ++i) {\n+\t\t\tint j;\n+\t\t\tprintf("\\t%d\\t", n_plp[i]);\n+\t\t\tif (n_plp[i] == 0) printf("*\\t*");\n+\t\t\telse {\n+\t\t\t\tfor (j = 0; j < n_plp[i]; ++j)\n+\t\t\t\t\tpileup_seq(plp[i] + j, pos, ref_len, ref);\n+\t\t\t\tputchar(\'\\t\');\n+\t\t\t\tfor (j = 0; j < n_plp[i]; ++j) {\n+\t\t\t\t\tconst bam_pileup1_t *p = plp[i] + j;\n+\t\t\t\t\tint c = bam1_qual(p->b)[p->qpos] + 33;\n+\t\t\t\t\tif (c > 126) c = 126;\n+\t\t\t\t\tputchar(c);\n+\t\t\t\t}\n+\t\t\t}\n+\t\t}\n+\t\tputchar(\'\\n\');\n+\t}\n+\tbam_mplp_destroy(iter);\n+\tbam_header_destroy(h);\n+\tfor (i = 0; i < n; ++i) {\n+\t\tbam_close(data[i]->fp);\n+\t\tif (data[i]->iter) bam_iter_destroy(data[i]->iter);\n+\t\tfree(data[i]);\n+\t}\n+\tfree(data); free(plp); free(ref); free(n_plp);\n+\treturn 0;\n+}\n+\n+int bam_mpileup(int argc, char *argv[])\n+{\n+\tint c;\n+\tmplp_conf_t mplp;\n+\tmemset(&mplp, 0, sizeof(mplp_conf_t));\n+\twhile ((c = getopt(argc, argv, "f:r:")) >= 0) {\n+\t\tswitch (c) {\n+\t\tcase \'f\':\n+\t\t\tmplp.fai = fai_load(optarg);\n+\t\t\tif (mplp.fai == 0) return 1;\n+\t\t\tbreak;\n+\t\tcase \'r\': mplp.reg = strdup(optarg);\n+\t\t}\n+\t}\n+\tif (argc == 1) {\n+\t\tfprintf(stderr, "Usage: samtools mpileup [-r reg] [-f in.fa] in1.bam [in2.bam [...]]\\n");\n+\t\treturn 1;\n+\t}\n+\tmpileup(&mplp, argc - optind, argv + optind);\n+\tfree(mplp.reg);\n+\tif (mplp.fai) fai_destroy(mplp.fai);\n+\treturn 0;\n+}\n' |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/samtools/bam_reheader.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/samtools/bam_reheader.c Thu Sep 07 17:55:18 2017 -0400 |
[ |
@@ -0,0 +1,60 @@ +#include <stdio.h> +#include <stdlib.h> +#include "bgzf.h" +#include "bam.h" + +#define BUF_SIZE 0x10000 + +int bam_reheader(BGZF *in, const bam_header_t *h, int fd) +{ + BGZF *fp; + bam_header_t *old; + int len; + uint8_t *buf; + if (in->open_mode != 'r') return -1; + buf = malloc(BUF_SIZE); + old = bam_header_read(in); + fp = bgzf_fdopen(fd, "w"); + bam_header_write(fp, h); + if (in->block_offset < in->block_length) { + bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset); + bgzf_flush(fp); + } +#ifdef _USE_KNETFILE + while ((len = knet_read(in->x.fpr, buf, BUF_SIZE)) > 0) +#else + while (!feof(in->file) && (len = fread(buf, 1, BUF_SIZE, in->file)) > 0) +#endif + fwrite(buf, 1, len, fp->x.fpw); + free(buf); + fp->block_offset = in->block_offset = 0; + bgzf_close(fp); + return 0; +} + +int main_reheader(int argc, char *argv[]) +{ + bam_header_t *h; + BGZF *in; + if (argc != 3) { + fprintf(stderr, "Usage: samtools reheader <in.header.sam> <in.bam>\n"); + return 1; + } + { // read the header + tamFile fph = sam_open(argv[1]); + if (fph == 0) { + fprintf(stderr, "[%s] fail to read the header from %s.\n", __func__, argv[1]); + return 1; + } + h = sam_header_read(fph); + sam_close(fph); + } + in = strcmp(argv[2], "-")? bam_open(argv[2], "r") : bam_dopen(fileno(stdin), "r"); + if (in == 0) { + fprintf(stderr, "[%s] fail to open file %s.\n", __func__, argv[2]); + return 1; + } + bam_reheader(in, h, fileno(stdout)); + bgzf_close(in); + return 0; +} |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/samtools/bam_rmdup.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/samtools/bam_rmdup.c Thu Sep 07 17:55:18 2017 -0400 |
[ |
@@ -0,0 +1,206 @@ +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#include <zlib.h> +#include <unistd.h> +#include "sam.h" + +typedef bam1_t *bam1_p; + +#include "khash.h" +KHASH_SET_INIT_STR(name) +KHASH_MAP_INIT_INT64(pos, bam1_p) + +#define BUFFER_SIZE 0x40000 + +typedef struct { + uint64_t n_checked, n_removed; + khash_t(pos) *best_hash; +} lib_aux_t; +KHASH_MAP_INIT_STR(lib, lib_aux_t) + +typedef struct { + int n, max; + bam1_t **a; +} tmp_stack_t; + +static inline void stack_insert(tmp_stack_t *stack, bam1_t *b) +{ + if (stack->n == stack->max) { + stack->max = stack->max? stack->max<<1 : 0x10000; + stack->a = (bam1_t**)realloc(stack->a, sizeof(bam1_t*) * stack->max); + } + stack->a[stack->n++] = b; +} + +static inline void dump_best(tmp_stack_t *stack, samfile_t *out) +{ + int i; + for (i = 0; i != stack->n; ++i) { + samwrite(out, stack->a[i]); + bam_destroy1(stack->a[i]); + } + stack->n = 0; +} + +static void clear_del_set(khash_t(name) *del_set) +{ + khint_t k; + for (k = kh_begin(del_set); k < kh_end(del_set); ++k) + if (kh_exist(del_set, k)) + free((char*)kh_key(del_set, k)); + kh_clear(name, del_set); +} + +static lib_aux_t *get_aux(khash_t(lib) *aux, const char *lib) +{ + khint_t k = kh_get(lib, aux, lib); + if (k == kh_end(aux)) { + int ret; + char *p = strdup(lib); + lib_aux_t *q; + k = kh_put(lib, aux, p, &ret); + q = &kh_val(aux, k); + q->n_checked = q->n_removed = 0; + q->best_hash = kh_init(pos); + return q; + } else return &kh_val(aux, k); +} + +static void clear_best(khash_t(lib) *aux, int max) +{ + khint_t k; + for (k = kh_begin(aux); k != kh_end(aux); ++k) { + if (kh_exist(aux, k)) { + lib_aux_t *q = &kh_val(aux, k); + if (kh_size(q->best_hash) >= max) + kh_clear(pos, q->best_hash); + } + } +} + +static inline int sum_qual(const bam1_t *b) +{ + int i, q; + uint8_t *qual = bam1_qual(b); + for (i = q = 0; i < b->core.l_qseq; ++i) q += qual[i]; + return q; +} + +void bam_rmdup_core(samfile_t *in, samfile_t *out) +{ + bam1_t *b; + 
int last_tid = -1, last_pos = -1; + tmp_stack_t stack; + khint_t k; + khash_t(lib) *aux; + khash_t(name) *del_set; + + aux = kh_init(lib); + del_set = kh_init(name); + b = bam_init1(); + memset(&stack, 0, sizeof(tmp_stack_t)); + + kh_resize(name, del_set, 4 * BUFFER_SIZE); + while (samread(in, b) >= 0) { + bam1_core_t *c = &b->core; + if (c->tid != last_tid || last_pos != c->pos) { + dump_best(&stack, out); // write the result + clear_best(aux, BUFFER_SIZE); + if (c->tid != last_tid) { + clear_best(aux, 0); + if (kh_size(del_set)) { // check + fprintf(stderr, "[bam_rmdup_core] %llu unmatched pairs\n", (long long)kh_size(del_set)); + clear_del_set(del_set); + } + if ((int)c->tid == -1) { // append unmapped reads + samwrite(out, b); + while (samread(in, b) >= 0) samwrite(out, b); + break; + } + last_tid = c->tid; + fprintf(stderr, "[bam_rmdup_core] processing reference %s...\n", in->header->target_name[c->tid]); + } + } + if (!(c->flag&BAM_FPAIRED) || (c->flag&(BAM_FUNMAP|BAM_FMUNMAP)) || (c->mtid >= 0 && c->tid != c->mtid)) { + samwrite(out, b); + } else if (c->isize > 0) { // paired, head + uint64_t key = (uint64_t)c->pos<<32 | c->isize; + const char *lib; + lib_aux_t *q; + int ret; + lib = bam_get_library(in->header, b); + q = lib? get_aux(aux, lib) : get_aux(aux, "\t"); + ++q->n_checked; + k = kh_put(pos, q->best_hash, key, &ret); + if (ret == 0) { // found in best_hash + bam1_t *p = kh_val(q->best_hash, k); + ++q->n_removed; + if (sum_qual(p) < sum_qual(b)) { // the current alignment is better; this can be accelerated in principle + kh_put(name, del_set, strdup(bam1_qname(p)), &ret); // p will be removed + bam_copy1(p, b); // replaced as b + } else kh_put(name, del_set, strdup(bam1_qname(b)), &ret); // b will be removed + if (ret == 0) + fprintf(stderr, "[bam_rmdup_core] inconsistent BAM file for pair '%s'. 
Continue anyway.\n", bam1_qname(b)); + } else { // not found in best_hash + kh_val(q->best_hash, k) = bam_dup1(b); + stack_insert(&stack, kh_val(q->best_hash, k)); + } + } else { // paired, tail + k = kh_get(name, del_set, bam1_qname(b)); + if (k != kh_end(del_set)) { + free((char*)kh_key(del_set, k)); + kh_del(name, del_set, k); + } else samwrite(out, b); + } + last_pos = c->pos; + } + + for (k = kh_begin(aux); k != kh_end(aux); ++k) { + if (kh_exist(aux, k)) { + lib_aux_t *q = &kh_val(aux, k); + dump_best(&stack, out); + fprintf(stderr, "[bam_rmdup_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed, + (long long)q->n_checked, (double)q->n_removed/q->n_checked, kh_key(aux, k)); + kh_destroy(pos, q->best_hash); + free((char*)kh_key(aux, k)); + } + } + kh_destroy(lib, aux); + + clear_del_set(del_set); + kh_destroy(name, del_set); + free(stack.a); + bam_destroy1(b); +} + +void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se); + +int bam_rmdup(int argc, char *argv[]) +{ + int c, is_se = 0, force_se = 0; + samfile_t *in, *out; + while ((c = getopt(argc, argv, "sS")) >= 0) { + switch (c) { + case 's': is_se = 1; break; + case 'S': force_se = is_se = 1; break; + } + } + if (optind + 2 > argc) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: samtools rmdup [-sS] <input.srt.bam> <output.bam>\n\n"); + fprintf(stderr, "Option: -s rmdup for SE reads\n"); + fprintf(stderr, " -S treat PE reads as SE in rmdup (force -s)\n\n"); + return 1; + } + in = samopen(argv[optind], "rb", 0); + out = samopen(argv[optind+1], "wb", in->header); + if (in == 0 || out == 0) { + fprintf(stderr, "[bam_rmdup] fail to read/write input files\n"); + return 1; + } + if (is_se) bam_rmdupse_core(in, out, force_se); + else bam_rmdup_core(in, out); + samclose(in); samclose(out); + return 0; +} |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/samtools/bam_rmdupse.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/samtools/bam_rmdupse.c Thu Sep 07 17:55:18 2017 -0400 |
[ |
@@ -0,0 +1,159 @@ +#include <math.h> +#include "sam.h" +#include "khash.h" +#include "klist.h" + +#define QUEUE_CLEAR_SIZE 0x100000 +#define MAX_POS 0x7fffffff + +typedef struct { + int endpos; + uint32_t score:31, discarded:1; + bam1_t *b; +} elem_t, *elem_p; +#define __free_elem(p) bam_destroy1((p)->data.b) +KLIST_INIT(q, elem_t, __free_elem) +typedef klist_t(q) queue_t; + +KHASH_MAP_INIT_INT(best, elem_p) +typedef khash_t(best) besthash_t; + +typedef struct { + uint64_t n_checked, n_removed; + besthash_t *left, *rght; +} lib_aux_t; +KHASH_MAP_INIT_STR(lib, lib_aux_t) + +static lib_aux_t *get_aux(khash_t(lib) *aux, const char *lib) +{ + khint_t k = kh_get(lib, aux, lib); + if (k == kh_end(aux)) { + int ret; + char *p = strdup(lib); + lib_aux_t *q; + k = kh_put(lib, aux, p, &ret); + q = &kh_val(aux, k); + q->left = kh_init(best); + q->rght = kh_init(best); + q->n_checked = q->n_removed = 0; + return q; + } else return &kh_val(aux, k); +} + +static inline int sum_qual(const bam1_t *b) +{ + int i, q; + uint8_t *qual = bam1_qual(b); + for (i = q = 0; i < b->core.l_qseq; ++i) q += qual[i]; + return q; +} + +static inline elem_t *push_queue(queue_t *queue, const bam1_t *b, int endpos, int score) +{ + elem_t *p = kl_pushp(q, queue); + p->discarded = 0; + p->endpos = endpos; p->score = score; + if (p->b == 0) p->b = bam_init1(); + bam_copy1(p->b, b); + return p; +} + +static void clear_besthash(besthash_t *h, int32_t pos) +{ + khint_t k; + for (k = kh_begin(h); k != kh_end(h); ++k) + if (kh_exist(h, k) && kh_val(h, k)->endpos <= pos) + kh_del(best, h, k); +} + +static void dump_alignment(samfile_t *out, queue_t *queue, int32_t pos, khash_t(lib) *h) +{ + if (queue->size > QUEUE_CLEAR_SIZE || pos == MAX_POS) { + khint_t k; + while (1) { + elem_t *q; + if (queue->head == queue->tail) break; + q = &kl_val(queue->head); + if (q->discarded) { + q->b->data_len = 0; + kl_shift(q, queue, 0); + continue; + } + if ((q->b->core.flag&BAM_FREVERSE) && q->endpos > pos) break; + 
samwrite(out, q->b); + q->b->data_len = 0; + kl_shift(q, queue, 0); + } + for (k = kh_begin(h); k != kh_end(h); ++k) { + if (kh_exist(h, k)) { + clear_besthash(kh_val(h, k).left, pos); + clear_besthash(kh_val(h, k).rght, pos); + } + } + } +} + +void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se) +{ + bam1_t *b; + queue_t *queue; + khint_t k; + int last_tid = -2; + khash_t(lib) *aux; + + aux = kh_init(lib); + b = bam_init1(); + queue = kl_init(q); + while (samread(in, b) >= 0) { + bam1_core_t *c = &b->core; + int endpos = bam_calend(c, bam1_cigar(b)); + int score = sum_qual(b); + + if (last_tid != c->tid) { + if (last_tid >= 0) dump_alignment(out, queue, MAX_POS, aux); + last_tid = c->tid; + } else dump_alignment(out, queue, c->pos, aux); + if ((c->flag&BAM_FUNMAP) || ((c->flag&BAM_FPAIRED) && !force_se)) { + push_queue(queue, b, endpos, score); + } else { + const char *lib; + lib_aux_t *q; + besthash_t *h; + uint32_t key; + int ret; + lib = bam_get_library(in->header, b); + q = lib? get_aux(aux, lib) : get_aux(aux, "\t"); + ++q->n_checked; + h = (c->flag&BAM_FREVERSE)? q->rght : q->left; + key = (c->flag&BAM_FREVERSE)? 
endpos : c->pos; + k = kh_put(best, h, key, &ret); + if (ret == 0) { // in the hash table + elem_t *p = kh_val(h, k); + ++q->n_removed; + if (p->score < score) { + if (c->flag&BAM_FREVERSE) { // mark "discarded" and push the queue + p->discarded = 1; + kh_val(h, k) = push_queue(queue, b, endpos, score); + } else { // replace + p->score = score; p->endpos = endpos; + bam_copy1(p->b, b); + } + } // otherwise, discard the alignment + } else kh_val(h, k) = push_queue(queue, b, endpos, score); + } + } + dump_alignment(out, queue, MAX_POS, aux); + + for (k = kh_begin(aux); k != kh_end(aux); ++k) { + if (kh_exist(aux, k)) { + lib_aux_t *q = &kh_val(aux, k); + fprintf(stderr, "[bam_rmdupse_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed, + (long long)q->n_checked, (double)q->n_removed/q->n_checked, kh_key(aux, k)); + kh_destroy(best, q->left); kh_destroy(best, q->rght); + free((char*)kh_key(aux, k)); + } + } + kh_destroy(lib, aux); + bam_destroy1(b); + kl_destroy(q, queue); +} |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/samtools/bam_sort.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/samtools/bam_sort.c Thu Sep 07 17:55:18 2017 -0400 |
[ |
b'@@ -0,0 +1,357 @@\n+#include <stdlib.h>\n+#include <ctype.h>\n+#include <assert.h>\n+#include <stdio.h>\n+#include <string.h>\n+#include <unistd.h>\n+#include "bam.h"\n+#include "ksort.h"\n+\n+static int g_is_by_qname = 0;\n+\n+static inline int strnum_cmp(const char *a, const char *b)\n+{\n+\tchar *pa, *pb;\n+\tpa = (char*)a; pb = (char*)b;\n+\twhile (*pa && *pb) {\n+\t\tif (isdigit(*pa) && isdigit(*pb)) {\n+\t\t\tlong ai, bi;\n+\t\t\tai = strtol(pa, &pa, 10);\n+\t\t\tbi = strtol(pb, &pb, 10);\n+\t\t\tif (ai != bi) return ai<bi? -1 : ai>bi? 1 : 0;\n+\t\t} else {\n+\t\t\tif (*pa != *pb) break;\n+\t\t\t++pa; ++pb;\n+\t\t}\n+\t}\n+\tif (*pa == *pb)\n+\t\treturn (pa-a) < (pb-b)? -1 : (pa-a) > (pb-b)? 1 : 0;\n+\treturn *pa<*pb? -1 : *pa>*pb? 1 : 0;\n+}\n+\n+#define HEAP_EMPTY 0xffffffffffffffffull\n+\n+typedef struct {\n+\tint i;\n+\tuint64_t pos, idx;\n+\tbam1_t *b;\n+} heap1_t;\n+\n+#define __pos_cmp(a, b) ((a).pos > (b).pos || ((a).pos == (b).pos && ((a).i > (b).i || ((a).i == (b).i && (a).idx > (b).idx))))\n+\n+static inline int heap_lt(const heap1_t a, const heap1_t b)\n+{\n+\tif (g_is_by_qname) {\n+\t\tint t;\n+\t\tif (a.b == 0 || b.b == 0) return a.b == 0? 
1 : 0;\n+\t\tt = strnum_cmp(bam1_qname(a.b), bam1_qname(b.b));\n+\t\treturn (t > 0 || (t == 0 && __pos_cmp(a, b)));\n+\t} else return __pos_cmp(a, b);\n+}\n+\n+KSORT_INIT(heap, heap1_t, heap_lt)\n+\n+static void swap_header_text(bam_header_t *h1, bam_header_t *h2)\n+{\n+\tint tempi;\n+\tchar *temps;\n+\ttempi = h1->l_text, h1->l_text = h2->l_text, h2->l_text = tempi;\n+\ttemps = h1->text, h1->text = h2->text, h2->text = temps;\n+}\n+\n+/*!\n+ @abstract Merge multiple sorted BAM.\n+ @param is_by_qname whether to sort by query name\n+ @param out output BAM file name\n+ @param headers name of SAM file from which to copy \'@\' header lines,\n+ or NULL to copy them from the first file to be merged\n+ @param n number of files to be merged\n+ @param fn names of files to be merged\n+\n+ @discussion Padding information may NOT correctly maintained. This\n+ function is NOT thread safe.\n+ */\n+void bam_merge_core(int by_qname, const char *out, const char *headers, int n, char * const *fn, int add_RG)\n+{\n+\tbamFile fpout, *fp;\n+\theap1_t *heap;\n+\tbam_header_t *hout = 0;\n+\tbam_header_t *hheaders = NULL;\n+\tint i, j, *RG_len = 0;\n+\tuint64_t idx = 0;\n+\tchar **RG = 0;\n+\n+\tif (headers) {\n+\t\ttamFile fpheaders = sam_open(headers);\n+\t\tif (fpheaders == 0) {\n+\t\t\tfprintf(stderr, "[bam_merge_core] Cannot open file `%s\'. 
Continue anyway.\\n", headers);\n+\t\t} else {\n+\t\t\thheaders = sam_header_read(fpheaders);\n+\t\t\tsam_close(fpheaders);\n+\t\t}\n+\t}\n+\n+\tg_is_by_qname = by_qname;\n+\tfp = (bamFile*)calloc(n, sizeof(bamFile));\n+\theap = (heap1_t*)calloc(n, sizeof(heap1_t));\n+\t// prepare RG tag\n+\tif (add_RG) {\n+\t\tRG = (char**)calloc(n, sizeof(void*));\n+\t\tRG_len = (int*)calloc(n, sizeof(int));\n+\t\tfor (i = 0; i != n; ++i) {\n+\t\t\tint l = strlen(fn[i]);\n+\t\t\tconst char *s = fn[i];\n+\t\t\tif (l > 4 && strcmp(s + l - 4, ".bam") == 0) l -= 4;\n+\t\t\tfor (j = l - 1; j >= 0; --j) if (s[j] == \'/\') break;\n+\t\t\t++j; l -= j;\n+\t\t\tRG[i] = calloc(l + 1, 1);\n+\t\t\tRG_len[i] = l;\n+\t\t\tstrncpy(RG[i], s + j, l);\n+\t\t}\n+\t}\n+\t// read the first\n+\tfor (i = 0; i != n; ++i) {\n+\t\theap1_t *h;\n+\t\tbam_header_t *hin;\n+\t\tfp[i] = bam_open(fn[i], "r");\n+\t\tif (fp[i] == 0) {\n+\t\t\tint j;\n+\t\t\tfprintf(stderr, "[bam_merge_core] fail to open file %s\\n", fn[i]);\n+\t\t\tfor (j = 0; j < i; ++j) bam_close(fp[j]);\n+\t\t\tfree(fp); free(heap);\n+\t\t\t// FIXME: possible memory leak\n+\t\t\treturn;\n+\t\t}\n+\t\thin = bam_header_read(fp[i]);\n+\t\tif (i == 0) { // the first SAM\n+\t\t\thout = hin;\n+\t\t\tif (hheaders) {\n+\t\t\t\t// If the text headers to be swapped in include any @SQ headers,\n+\t\t\t\t// check that they are consistent with the existing binary list\n+\t\t\t\t// of reference information.\n+\t\t\t\tif (hheaders->n_targets > 0) {\n+\t\t\t\t\tif (hout->n_targets != hheaders->n_targets)\n+\t\t\t\t\t\tfprintf(stderr, "[bam_merge_core] number of @SQ headers in `%s\' differs from number of target sequences", headers);\n+\t\t\t\t\tfor (j = 0; j < hout->n_targets; ++j)\n+\t\t\t\t\t\tif (strcmp(hout->target_name[j], hheaders'..b' 0 && (((uint64_t)a->core.tid<<32|a->core.pos) < ((uint64_t)b->core.tid<<32|b->core.pos))));\n+\t} else return (((uint64_t)a->core.tid<<32|a->core.pos) < ((uint64_t)b->core.tid<<32|b->core.pos));\n+}\n+KSORT_INIT(sort, 
bam1_p, bam1_lt)\n+\n+static void sort_blocks(int n, int k, bam1_p *buf, const char *prefix, const bam_header_t *h, int is_stdout)\n+{\n+\tchar *name;\n+\tint i;\n+\tbamFile fp;\n+\tks_mergesort(sort, k, buf, 0);\n+\tname = (char*)calloc(strlen(prefix) + 20, 1);\n+\tif (n >= 0) sprintf(name, "%s.%.4d.bam", prefix, n);\n+\telse sprintf(name, "%s.bam", prefix);\n+\tfp = is_stdout? bam_dopen(fileno(stdout), "w") : bam_open(name, "w");\n+\tif (fp == 0) {\n+\t\tfprintf(stderr, "[sort_blocks] fail to create file %s.\\n", name);\n+\t\tfree(name);\n+\t\t// FIXME: possible memory leak\n+\t\treturn;\n+\t}\n+\tfree(name);\n+\tbam_header_write(fp, h);\n+\tfor (i = 0; i < k; ++i)\n+\t\tbam_write1_core(fp, &buf[i]->core, buf[i]->data_len, buf[i]->data);\n+\tbam_close(fp);\n+}\n+\n+/*!\n+ @abstract Sort an unsorted BAM file based on the chromosome order\n+ and the leftmost position of an alignment\n+\n+ @param is_by_qname whether to sort by query name\n+ @param fn name of the file to be sorted\n+ @param prefix prefix of the output and the temporary files; upon\n+\t sucessess, prefix.bam will be written.\n+ @param max_mem approxiate maximum memory (very inaccurate)\n+\n+ @discussion It may create multiple temporary subalignment files\n+ and then merge them by calling bam_merge_core(). This function is\n+ NOT thread safe.\n+ */\n+void bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, size_t max_mem, int is_stdout)\n+{\n+\tint n, ret, k, i;\n+\tsize_t mem;\n+\tbam_header_t *header;\n+\tbamFile fp;\n+\tbam1_t *b, **buf;\n+\n+\tg_is_by_qname = is_by_qname;\n+\tn = k = 0; mem = 0;\n+\tfp = strcmp(fn, "-")? 
bam_open(fn, "r") : bam_dopen(fileno(stdin), "r");\n+\tif (fp == 0) {\n+\t\tfprintf(stderr, "[bam_sort_core] fail to open file %s\\n", fn);\n+\t\treturn;\n+\t}\n+\theader = bam_header_read(fp);\n+\tbuf = (bam1_t**)calloc(max_mem / BAM_CORE_SIZE, sizeof(bam1_t*));\n+\t// write sub files\n+\tfor (;;) {\n+\t\tif (buf[k] == 0) buf[k] = (bam1_t*)calloc(1, sizeof(bam1_t));\n+\t\tb = buf[k];\n+\t\tif ((ret = bam_read1(fp, b)) < 0) break;\n+\t\tmem += ret;\n+\t\t++k;\n+\t\tif (mem >= max_mem) {\n+\t\t\tsort_blocks(n++, k, buf, prefix, header, 0);\n+\t\t\tmem = 0; k = 0;\n+\t\t}\n+\t}\n+\tif (ret != -1)\n+\t\tfprintf(stderr, "[bam_sort_core] truncated file. Continue anyway.\\n");\n+\tif (n == 0) sort_blocks(-1, k, buf, prefix, header, is_stdout);\n+\telse { // then merge\n+\t\tchar **fns, *fnout;\n+\t\tfprintf(stderr, "[bam_sort_core] merging from %d files...\\n", n+1);\n+\t\tsort_blocks(n++, k, buf, prefix, header, 0);\n+\t\tfnout = (char*)calloc(strlen(prefix) + 20, 1);\n+\t\tif (is_stdout) sprintf(fnout, "-");\n+\t\telse sprintf(fnout, "%s.bam", prefix);\n+\t\tfns = (char**)calloc(n, sizeof(char*));\n+\t\tfor (i = 0; i < n; ++i) {\n+\t\t\tfns[i] = (char*)calloc(strlen(prefix) + 20, 1);\n+\t\t\tsprintf(fns[i], "%s.%.4d.bam", prefix, i);\n+\t\t}\n+\t\tbam_merge_core(is_by_qname, fnout, 0, n, fns, 0);\n+\t\tfree(fnout);\n+\t\tfor (i = 0; i < n; ++i) {\n+\t\t\tunlink(fns[i]);\n+\t\t\tfree(fns[i]);\n+\t\t}\n+\t\tfree(fns);\n+\t}\n+\tfor (k = 0; k < max_mem / BAM_CORE_SIZE; ++k) {\n+\t\tif (buf[k]) {\n+\t\t\tfree(buf[k]->data);\n+\t\t\tfree(buf[k]);\n+\t\t}\n+\t}\n+\tfree(buf);\n+\tbam_header_destroy(header);\n+\tbam_close(fp);\n+}\n+\n+void bam_sort_core(int is_by_qname, const char *fn, const char *prefix, size_t max_mem)\n+{\n+\tbam_sort_core_ext(is_by_qname, fn, prefix, max_mem, 0);\n+}\n+\n+int bam_sort(int argc, char *argv[])\n+{\n+\tsize_t max_mem = 500000000;\n+\tint c, is_by_qname = 0, is_stdout = 0;\n+\twhile ((c = getopt(argc, argv, "nom:")) >= 0) {\n+\t\tswitch (c) 
{\n+\t\tcase \'o\': is_stdout = 1; break;\n+\t\tcase \'n\': is_by_qname = 1; break;\n+\t\tcase \'m\': max_mem = atol(optarg); break;\n+\t\t}\n+\t}\n+\tif (optind + 2 > argc) {\n+\t\tfprintf(stderr, "Usage: samtools sort [-on] [-m <maxMem>] <in.bam> <out.prefix>\\n");\n+\t\treturn 1;\n+\t}\n+\tbam_sort_core_ext(is_by_qname, argv[optind], argv[optind+1], max_mem, is_stdout);\n+\treturn 0;\n+}\n' |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/samtools/bam_stat.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/samtools/bam_stat.c Thu Sep 07 17:55:18 2017 -0400 |
[ |
@@ -0,0 +1,78 @@ +#include <unistd.h> +#include <assert.h> +#include "bam.h" + +typedef struct { + long long n_reads, n_mapped, n_pair_all, n_pair_map, n_pair_good; + long long n_sgltn, n_read1, n_read2; + long long n_qcfail, n_dup; + long long n_diffchr, n_diffhigh; +} bam_flagstat_t; + +#define flagstat_loop(s, c) do { \ + ++(s)->n_reads; \ + if ((c)->flag & BAM_FPAIRED) { \ + ++(s)->n_pair_all; \ + if ((c)->flag & BAM_FPROPER_PAIR) ++(s)->n_pair_good; \ + if ((c)->flag & BAM_FREAD1) ++(s)->n_read1; \ + if ((c)->flag & BAM_FREAD2) ++(s)->n_read2; \ + if (((c)->flag & BAM_FMUNMAP) && !((c)->flag & BAM_FUNMAP)) ++(s)->n_sgltn; \ + if (!((c)->flag & BAM_FUNMAP) && !((c)->flag & BAM_FMUNMAP)) { \ + ++(s)->n_pair_map; \ + if ((c)->mtid != (c)->tid) { \ + ++(s)->n_diffchr; \ + if ((c)->qual >= 5) ++(s)->n_diffhigh; \ + } \ + } \ + } \ + if (!((c)->flag & BAM_FUNMAP)) ++(s)->n_mapped; \ + if ((c)->flag & BAM_FQCFAIL) ++(s)->n_qcfail; \ + if ((c)->flag & BAM_FDUP) ++(s)->n_dup; \ + } while (0) + +bam_flagstat_t *bam_flagstat_core(bamFile fp) +{ + bam_flagstat_t *s; + bam1_t *b; + bam1_core_t *c; + int ret; + s = (bam_flagstat_t*)calloc(1, sizeof(bam_flagstat_t)); + b = bam_init1(); + c = &b->core; + while ((ret = bam_read1(fp, b)) >= 0) + flagstat_loop(s, c); + bam_destroy1(b); + if (ret != -1) + fprintf(stderr, "[bam_flagstat_core] Truncated file? Continue anyway.\n"); + return s; +} +int bam_flagstat(int argc, char *argv[]) +{ + bamFile fp; + bam_header_t *header; + bam_flagstat_t *s; + if (argc == optind) { + fprintf(stderr, "Usage: samtools flagstat <in.bam>\n"); + return 1; + } + fp = strcmp(argv[optind], "-")? 
bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r"); + assert(fp); + header = bam_header_read(fp); + s = bam_flagstat_core(fp); + printf("%lld in total\n", s->n_reads); + printf("%lld QC failure\n", s->n_qcfail); + printf("%lld duplicates\n", s->n_dup); + printf("%lld mapped (%.2f%%)\n", s->n_mapped, (float)s->n_mapped / s->n_reads * 100.0); + printf("%lld paired in sequencing\n", s->n_pair_all); + printf("%lld read1\n", s->n_read1); + printf("%lld read2\n", s->n_read2); + printf("%lld properly paired (%.2f%%)\n", s->n_pair_good, (float)s->n_pair_good / s->n_pair_all * 100.0); + printf("%lld with itself and mate mapped\n", s->n_pair_map); + printf("%lld singletons (%.2f%%)\n", s->n_sgltn, (float)s->n_sgltn / s->n_pair_all * 100.0); + printf("%lld with mate mapped to a different chr\n", s->n_diffchr); + printf("%lld with mate mapped to a different chr (mapQ>=5)\n", s->n_diffhigh); + free(s); + bam_header_destroy(header); + bam_close(fp); + return 0; +} |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/samtools/bam_tview.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/samtools/bam_tview.c Thu Sep 07 17:55:18 2017 -0400 |
[ |
b'@@ -0,0 +1,424 @@\n+#undef _HAVE_CURSES\n+\n+#if _CURSES_LIB == 0\n+#elif _CURSES_LIB == 1\n+#include <curses.h>\n+#ifndef NCURSES_VERSION\n+#warning "_CURSES_LIB=1 but NCURSES_VERSION not defined; tview is NOT compiled"\n+#else\n+#define _HAVE_CURSES\n+#endif\n+#elif _CURSES_LIB == 2\n+#include <xcurses.h>\n+#define _HAVE_CURSES\n+#else\n+#warning "_CURSES_LIB is not 0, 1 or 2; tview is NOT compiled"\n+#endif\n+\n+#ifdef _HAVE_CURSES\n+#include <ctype.h>\n+#include <assert.h>\n+#include <string.h>\n+#include "bam.h"\n+#include "faidx.h"\n+#include "bam_maqcns.h"\n+\n+char bam_aux_getCEi(bam1_t *b, int i);\n+char bam_aux_getCSi(bam1_t *b, int i);\n+char bam_aux_getCQi(bam1_t *b, int i);\n+\n+#define TV_MIN_ALNROW 2\n+#define TV_MAX_GOTO 40\n+#define TV_LOW_MAPQ 10\n+\n+#define TV_COLOR_MAPQ 0\n+#define TV_COLOR_BASEQ 1\n+#define TV_COLOR_NUCL 2\n+#define TV_COLOR_COL 3\n+#define TV_COLOR_COLQ 4\n+\n+#define TV_BASE_NUCL 0\n+#define TV_BASE_COLOR_SPACE 1\n+\n+typedef struct {\n+\tint mrow, mcol;\n+\tWINDOW *wgoto, *whelp;\n+\n+\tbam_index_t *idx;\n+\tbam_lplbuf_t *lplbuf;\n+\tbam_header_t *header;\n+\tbamFile fp;\n+\tint curr_tid, left_pos;\n+\tfaidx_t *fai;\n+\tbam_maqcns_t *bmc;\n+\n+\tint ccol, last_pos, row_shift, base_for, color_for, is_dot, l_ref, ins, no_skip, show_name;\n+\tchar *ref;\n+} tview_t;\n+\n+int tv_pl_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data)\n+{\n+\ttview_t *tv = (tview_t*)data;\n+\tint i, j, c, rb, attr, max_ins = 0;\n+\tuint32_t call = 0;\n+\tif (pos < tv->left_pos || tv->ccol > tv->mcol) return 0; // out of screen\n+\t// print referece\n+\trb = (tv->ref && pos - tv->left_pos < tv->l_ref)? tv->ref[pos - tv->left_pos] : \'N\';\n+\tfor (i = tv->last_pos + 1; i < pos; ++i) {\n+\t\tif (i%10 == 0 && tv->mcol - tv->ccol >= 10) mvprintw(0, tv->ccol, "%-d", i+1);\n+\t\tc = tv->ref? 
tv->ref[i - tv->left_pos] : \'N\';\n+\t\tmvaddch(1, tv->ccol++, c);\n+\t}\n+\tif (pos%10 == 0 && tv->mcol - tv->ccol >= 10) mvprintw(0, tv->ccol, "%-d", pos+1);\n+\t// print consensus\n+\tcall = bam_maqcns_call(n, pl, tv->bmc);\n+\tattr = A_UNDERLINE;\n+\tc = ",ACMGRSVTWYHKDBN"[call>>28&0xf];\n+\ti = (call>>8&0xff)/10+1;\n+\tif (i > 4) i = 4;\n+\tattr |= COLOR_PAIR(i);\n+\tif (c == toupper(rb)) c = \'.\';\n+\tattron(attr);\n+\tmvaddch(2, tv->ccol, c);\n+\tattroff(attr);\n+\tif(tv->ins) {\n+\t\t// calculate maximum insert\n+\t\tfor (i = 0; i < n; ++i) {\n+\t\t\tconst bam_pileup1_t *p = pl + i;\n+\t\t\tif (p->indel > 0 && max_ins < p->indel) max_ins = p->indel;\n+\t\t}\n+\t}\n+\t// core loop\n+\tfor (j = 0; j <= max_ins; ++j) {\n+\t\tfor (i = 0; i < n; ++i) {\n+\t\t\tconst bam_pileup1_t *p = pl + i;\n+\t\t\tint row = TV_MIN_ALNROW + p->level - tv->row_shift;\n+\t\t\tif (j == 0) {\n+\t\t\t\tif (!p->is_del) {\n+\t\t\t\t\tif (tv->base_for == TV_BASE_COLOR_SPACE && \n+\t\t\t\t\t\t\t(c = bam_aux_getCSi(p->b, p->qpos))) {\n+\t\t\t\t\t\tc = bam_aux_getCSi(p->b, p->qpos);\n+\t\t\t\t\t\t// assume that if we found one color, we will be able to get the color error\n+\t\t\t\t\t\tif (tv->is_dot && \'-\' == bam_aux_getCEi(p->b, p->qpos)) c = bam1_strand(p->b)? \',\' : \'.\';\n+\t\t\t\t\t} else {\n+\t\t\t\t\t\tif (tv->show_name) {\n+\t\t\t\t\t\t\tchar *name = bam1_qname(p->b);\n+\t\t\t\t\t\t\tc = (p->qpos + 1 >= p->b->core.l_qname)? \' \' : name[p->qpos];\n+\t\t\t\t\t\t} else {\n+\t\t\t\t\t\t\tc = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)];\n+\t\t\t\t\t\t\tif (tv->is_dot && toupper(c) == toupper(rb)) c = bam1_strand(p->b)? 
\',\' : \'.\';\n+\t\t\t\t\t\t}\n+\t\t\t\t\t}\n+\t\t\t\t} else c = \'*\';\n+\t\t\t} else { // padding\n+\t\t\t\tif (j > p->indel) c = \'*\';\n+\t\t\t\telse { // insertion\n+\t\t\t\t\tif (tv->base_for == TV_BASE_NUCL) {\n+\t\t\t\t\t\tif (tv->show_name) {\n+\t\t\t\t\t\t\tchar *name = bam1_qname(p->b);\n+\t\t\t\t\t\t\tc = (p->qpos + j + 1 >= p->b->core.l_qname)? \' \' : name[p->qpos + j];\n+\t\t\t\t\t\t} else {\n+\t\t\t\t\t\t\tc = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + j)];\n+\t\t\t\t\t\t\tif (j == 0 && tv->is_dot && toupper(c) == toupper(rb)) c = bam1_strand(p->b)? \',\' : \'.\';\n+\t\t\t\t\t\t}\n+\t\t\t\t\t} else {\n+\t\t\t\t\t\tc = bam_aux_getCSi(p->b, p->qpos + j);\n+\t\t\t\t\t\tif (tv->is_dot && \'-\' == bam_aux_getCEi(p->b, p->qpos + j)) c = bam1_strand(p->b)? \',\' : \'.\';\n+\t\t\t\t\t}\n+\t\t\t\t}\n+\t\t\t}\n+\t\t\tif (row > TV_MIN_ALNROW && row < tv->mrow) {\n+\t'..b' l = 0;\n+\t\telse if (c == \'\\033\') return;\n+\t\tstr[l] = \'\\0\';\n+\t\tfor (i = 0; i < TV_MAX_GOTO; ++i) mvwaddch(tv->wgoto, 1, 8 + i, \' \');\n+\t\tmvwprintw(tv->wgoto, 1, 8, "%s", str);\n+\t}\n+}\n+\n+static void tv_win_help(tview_t *tv) {\n+\tint r = 1;\n+\tWINDOW *win = tv->whelp;\n+\twborder(win, \'|\', \'|\', \'-\', \'-\', \'+\', \'+\', \'+\', \'+\');\n+\tmvwprintw(win, r++, 2, " -=- Help -=- ");\n+\tr++;\n+\tmvwprintw(win, r++, 2, "? 
This window");\n+\tmvwprintw(win, r++, 2, "Arrows Small scroll movement");\n+\tmvwprintw(win, r++, 2, "h,j,k,l Small scroll movement");\n+\tmvwprintw(win, r++, 2, "H,J,K,L Large scroll movement");\n+\tmvwprintw(win, r++, 2, "ctrl-H Scroll 1k left");\n+\tmvwprintw(win, r++, 2, "ctrl-L Scroll 1k right");\n+\tmvwprintw(win, r++, 2, "space Scroll one screen");\n+\tmvwprintw(win, r++, 2, "backspace Scroll back one screen");\n+\tmvwprintw(win, r++, 2, "g Go to specific location");\n+\tmvwprintw(win, r++, 2, "m Color for mapping qual");\n+\tmvwprintw(win, r++, 2, "n Color for nucleotide");\n+\tmvwprintw(win, r++, 2, "b Color for base quality");\n+\tmvwprintw(win, r++, 2, "c Color for cs color");\n+\tmvwprintw(win, r++, 2, "z Color for cs qual");\n+\tmvwprintw(win, r++, 2, ". Toggle on/off dot view");\n+\tmvwprintw(win, r++, 2, "s Toggle on/off ref skip");\n+\tmvwprintw(win, r++, 2, "r Toggle on/off rd name");\n+\tmvwprintw(win, r++, 2, "N Turn on nt view");\n+\tmvwprintw(win, r++, 2, "C Turn on cs view");\n+\tmvwprintw(win, r++, 2, "i Toggle on/off ins");\n+\tmvwprintw(win, r++, 2, "q Exit");\n+\tr++;\n+\tmvwprintw(win, r++, 2, "Underline: Secondary or orphan");\n+\tmvwprintw(win, r++, 2, "Blue: 0-9 Green: 10-19");\n+\tmvwprintw(win, r++, 2, "Yellow: 20-29 White: >=30");\n+\twrefresh(win);\n+\twgetch(win);\n+}\n+\n+void tv_loop(tview_t *tv)\n+{\n+\tint tid, pos;\n+\ttid = tv->curr_tid; pos = tv->left_pos;\n+\twhile (1) {\n+\t\tint c = getch();\n+\t\tswitch (c) {\n+\t\t\tcase \'?\': tv_win_help(tv); break;\n+\t\t\tcase \'\\033\':\n+\t\t\tcase \'q\': goto end_loop;\n+\t\t\tcase \'/\': \n+\t\t\tcase \'g\': tv_win_goto(tv, &tid, &pos); break;\n+\t\t\tcase \'m\': tv->color_for = TV_COLOR_MAPQ; break;\n+\t\t\tcase \'b\': tv->color_for = TV_COLOR_BASEQ; break;\n+\t\t\tcase \'n\': tv->color_for = TV_COLOR_NUCL; break;\n+\t\t\tcase \'c\': tv->color_for = TV_COLOR_COL; break;\n+\t\t\tcase \'z\': tv->color_for = TV_COLOR_COLQ; break;\n+\t\t\tcase \'s\': tv->no_skip = !tv->no_skip; 
break;\n+\t\t\tcase \'r\': tv->show_name = !tv->show_name; break;\n+\t\t\tcase KEY_LEFT:\n+\t\t\tcase \'h\': --pos; break;\n+\t\t\tcase KEY_RIGHT:\n+\t\t\tcase \'l\': ++pos; break;\n+\t\t\tcase KEY_SLEFT:\n+\t\t\tcase \'H\': pos -= 20; break;\n+\t\t\tcase KEY_SRIGHT:\n+\t\t\tcase \'L\': pos += 20; break;\n+\t\t\tcase \'.\': tv->is_dot = !tv->is_dot; break;\n+\t\t\tcase \'N\': tv->base_for = TV_BASE_NUCL; break;\n+\t\t\tcase \'C\': tv->base_for = TV_BASE_COLOR_SPACE; break;\n+\t\t\tcase \'i\': tv->ins = !tv->ins; break;\n+\t\t\tcase \'\\010\': pos -= 1000; break;\n+\t\t\tcase \'\\014\': pos += 1000; break;\n+\t\t\tcase \' \': pos += tv->mcol; break;\n+\t\t\tcase KEY_UP:\n+\t\t\tcase \'j\': --tv->row_shift; break;\n+\t\t\tcase KEY_DOWN:\n+\t\t\tcase \'k\': ++tv->row_shift; break;\n+\t\t\tcase KEY_BACKSPACE:\n+\t\t\tcase \'\\177\': pos -= tv->mcol; break;\n+\t\t\tcase KEY_RESIZE: getmaxyx(stdscr, tv->mrow, tv->mcol); break;\n+\t\t\tdefault: continue;\n+\t\t}\n+\t\tif (pos < 0) pos = 0;\n+\t\tif (tv->row_shift < 0) tv->row_shift = 0;\n+\t\ttv_draw_aln(tv, tid, pos);\n+\t}\n+end_loop:\n+\treturn;\n+}\n+\n+int bam_tview_main(int argc, char *argv[])\n+{\n+\ttview_t *tv;\n+\tif (argc == 1) {\n+\t\tfprintf(stderr, "Usage: bamtk tview <aln.bam> [ref.fasta]\\n");\n+\t\treturn 1;\n+\t}\n+\ttv = tv_init(argv[1], (argc == 2)? 0 : argv[2]);\n+\ttv_draw_aln(tv, 0, 0);\n+\ttv_loop(tv);\n+\ttv_destroy(tv);\n+\treturn 0;\n+}\n+#else // #ifdef _HAVE_CURSES\n+#include <stdio.h>\n+#warning "No curses library is available; tview is disabled."\n+int bam_tview_main(int argc, char *argv[])\n+{\n+\tfprintf(stderr, "[bam_tview_main] The ncurses library is unavailable; tview is not compiled.\\n");\n+\treturn 1;\n+}\n+#endif // #ifdef _HAVE_CURSES\n' |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/samtools/bgzf.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/samtools/bgzf.c Thu Sep 07 17:55:18 2017 -0400 |
[ |
b'@@ -0,0 +1,671 @@\n+/* The MIT License\n+\n+ Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology\n+\n+ Permission is hereby granted, free of charge, to any person obtaining a copy\n+ of this software and associated documentation files (the "Software"), to deal\n+ in the Software without restriction, including without limitation the rights\n+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n+ copies of the Software, and to permit persons to whom the Software is\n+ furnished to do so, subject to the following conditions:\n+\n+ The above copyright notice and this permission notice shall be included in\n+ all copies or substantial portions of the Software.\n+\n+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\n+ THE SOFTWARE.\n+*/\n+\n+/*\n+ 2009-06-29 by lh3: cache recent uncompressed blocks.\n+ 2009-06-25 by lh3: optionally use my knetfile library to access file on a FTP.\n+ 2009-06-12 by lh3: support a mode string like "wu" where \'u\' for uncompressed output */\n+\n+#include <stdio.h>\n+#include <stdlib.h>\n+#include <string.h>\n+#include <unistd.h>\n+#include <fcntl.h>\n+#include <sys/types.h>\n+#include <sys/stat.h>\n+#include "bgzf.h"\n+\n+#include "khash.h"\n+typedef struct {\n+\tint size;\n+\tuint8_t *block;\n+\tint64_t end_offset;\n+} cache_t;\n+KHASH_MAP_INIT_INT64(cache, cache_t)\n+\n+#if defined(_WIN32) || defined(_MSC_VER)\n+#define ftello(fp) ftell(fp)\n+#define fseeko(fp, offset, whence) fseek(fp, offset, whence)\n+#else\n+extern off_t ftello(FILE *stream);\n+extern int fseeko(FILE *stream, off_t 
offset, int whence);\n+#endif\n+\n+typedef int8_t bgzf_byte_t;\n+\n+static const int DEFAULT_BLOCK_SIZE = 64 * 1024;\n+static const int MAX_BLOCK_SIZE = 64 * 1024;\n+\n+static const int BLOCK_HEADER_LENGTH = 18;\n+static const int BLOCK_FOOTER_LENGTH = 8;\n+\n+static const int GZIP_ID1 = 31;\n+static const int GZIP_ID2 = 139;\n+static const int CM_DEFLATE = 8;\n+static const int FLG_FEXTRA = 4;\n+static const int OS_UNKNOWN = 255;\n+static const int BGZF_ID1 = 66; // \'B\'\n+static const int BGZF_ID2 = 67; // \'C\'\n+static const int BGZF_LEN = 2;\n+static const int BGZF_XLEN = 6; // BGZF_LEN+4\n+\n+static const int GZIP_WINDOW_BITS = -15; // no zlib header\n+static const int Z_DEFAULT_MEM_LEVEL = 8;\n+\n+\n+inline\n+void\n+packInt16(uint8_t* buffer, uint16_t value)\n+{\n+ buffer[0] = value;\n+ buffer[1] = value >> 8;\n+}\n+\n+inline\n+int\n+unpackInt16(const uint8_t* buffer)\n+{\n+ return (buffer[0] | (buffer[1] << 8));\n+}\n+\n+inline\n+void\n+packInt32(uint8_t* buffer, uint32_t value)\n+{\n+ buffer[0] = value;\n+ buffer[1] = value >> 8;\n+ buffer[2] = value >> 16;\n+ buffer[3] = value >> 24;\n+}\n+\n+static inline\n+int\n+bgzf_min(int x, int y)\n+{\n+ return (x < y) ? 
x : y;\n+}\n+\n+static\n+void\n+report_error(BGZF* fp, const char* message) {\n+ fp->error = message;\n+}\n+\n+static BGZF *bgzf_read_init()\n+{\n+\tBGZF *fp;\n+\tfp = calloc(1, sizeof(BGZF));\n+ fp->uncompressed_block_size = MAX_BLOCK_SIZE;\n+ fp->uncompressed_block = malloc(MAX_BLOCK_SIZE);\n+ fp->compressed_block_size = MAX_BLOCK_SIZE;\n+ fp->compressed_block = malloc(MAX_BLOCK_SIZE);\n+\tfp->cache_size = 0;\n+\tfp->cache = kh_init(cache);\n+\treturn fp;\n+}\n+\n+static\n+BGZF*\n+open_read(int fd)\n+{\n+#ifdef _USE_KNETFILE\n+ knetFile *file = knet_dopen(fd, "r");\n+#else\n+ FILE* file = fdopen(fd, "r");\n+#endif\n+ BGZF* fp;\n+\tif (file == 0) return 0;\n+\tfp = bgzf_read_init();\n+ fp->file_descriptor = fd;\n+ fp->open_mode = \'r\';\n+#ifdef _USE_KNETFILE\n+ fp->x.fpr = file;\n+#else\n+ fp->file = file;\n+#endif\n+ return fp;\n+}\n+\n+static\n+BGZF*\n+open_write(int'..b'_length = deflate_block(fp, fp->block_offset);\n+ if (block_length < 0) return -1;\n+#ifdef _USE_KNETFILE\n+ count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw);\n+#else\n+ count = fwrite(fp->compressed_block, 1, block_length, fp->file);\n+#endif\n+ if (count != block_length) {\n+ report_error(fp, "write failed");\n+ return -1;\n+ }\n+ fp->block_address += block_length;\n+ }\n+ return 0;\n+}\n+\n+int bgzf_flush_try(BGZF *fp, int size)\n+{\n+\tif (fp->block_offset + size > fp->uncompressed_block_size)\n+\t\treturn bgzf_flush(fp);\n+\treturn -1;\n+}\n+\n+int bgzf_write(BGZF* fp, const void* data, int length)\n+{\n+ if (fp->open_mode != \'w\') {\n+ report_error(fp, "file not open for writing");\n+ return -1;\n+ }\n+\n+ if (fp->uncompressed_block == NULL)\n+ fp->uncompressed_block = malloc(fp->uncompressed_block_size);\n+\n+ const bgzf_byte_t* input = data;\n+ int block_length = fp->uncompressed_block_size;\n+ int bytes_written = 0;\n+ while (bytes_written < length) {\n+ int copy_length = bgzf_min(block_length - fp->block_offset, length - bytes_written);\n+ bgzf_byte_t* buffer = 
fp->uncompressed_block;\n+ memcpy(buffer + fp->block_offset, input, copy_length);\n+ fp->block_offset += copy_length;\n+ input += copy_length;\n+ bytes_written += copy_length;\n+ if (fp->block_offset == block_length) {\n+ if (bgzf_flush(fp) != 0) {\n+ break;\n+ }\n+ }\n+ }\n+ return bytes_written;\n+}\n+\n+int bgzf_close(BGZF* fp)\n+{\n+ if (fp->open_mode == \'w\') {\n+ if (bgzf_flush(fp) != 0) return -1;\n+\t\t{ // add an empty block\n+\t\t\tint count, block_length = deflate_block(fp, 0);\n+#ifdef _USE_KNETFILE\n+\t\t\tcount = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw);\n+#else\n+\t\t\tcount = fwrite(fp->compressed_block, 1, block_length, fp->file);\n+#endif\n+\t\t}\n+#ifdef _USE_KNETFILE\n+ if (fflush(fp->x.fpw) != 0) {\n+#else\n+ if (fflush(fp->file) != 0) {\n+#endif\n+ report_error(fp, "flush failed");\n+ return -1;\n+ }\n+ }\n+ if (fp->owned_file) {\n+#ifdef _USE_KNETFILE\n+\t\tint ret;\n+\t\tif (fp->open_mode == \'w\') ret = fclose(fp->x.fpw);\n+\t\telse ret = knet_close(fp->x.fpr);\n+ if (ret != 0) return -1;\n+#else\n+ if (fclose(fp->file) != 0) return -1;\n+#endif\n+ }\n+ free(fp->uncompressed_block);\n+ free(fp->compressed_block);\n+\tfree_cache(fp);\n+ free(fp);\n+ return 0;\n+}\n+\n+void bgzf_set_cache_size(BGZF *fp, int cache_size)\n+{\n+\tif (fp) fp->cache_size = cache_size;\n+}\n+\n+int bgzf_check_EOF(BGZF *fp)\n+{\n+\tstatic uint8_t magic[28] = "\\037\\213\\010\\4\\0\\0\\0\\0\\0\\377\\6\\0\\102\\103\\2\\0\\033\\0\\3\\0\\0\\0\\0\\0\\0\\0\\0\\0";\n+\tuint8_t buf[28];\n+\toff_t offset;\n+#ifdef _USE_KNETFILE\n+\toffset = knet_tell(fp->x.fpr);\n+\tif (knet_seek(fp->x.fpr, -28, SEEK_END) != 0) return -1;\n+\tknet_read(fp->x.fpr, buf, 28);\n+\tknet_seek(fp->x.fpr, offset, SEEK_SET);\n+#else\n+\toffset = ftello(fp->file);\n+\tif (fseeko(fp->file, -28, SEEK_END) != 0) return -1;\n+\tfread(buf, 1, 28, fp->file);\n+\tfseeko(fp->file, offset, SEEK_SET);\n+#endif\n+\treturn (memcmp(magic, buf, 28) == 0)? 
1 : 0;\n+}\n+\n+int64_t bgzf_seek(BGZF* fp, int64_t pos, int where)\n+{\n+\tint block_offset;\n+\tint64_t block_address;\n+\n+ if (fp->open_mode != \'r\') {\n+ report_error(fp, "file not open for read");\n+ return -1;\n+ }\n+ if (where != SEEK_SET) {\n+ report_error(fp, "unimplemented seek option");\n+ return -1;\n+ }\n+ block_offset = pos & 0xFFFF;\n+ block_address = (pos >> 16) & 0xFFFFFFFFFFFFLL;\n+#ifdef _USE_KNETFILE\n+ if (knet_seek(fp->x.fpr, block_address, SEEK_SET) != 0) {\n+#else\n+ if (fseeko(fp->file, block_address, SEEK_SET) != 0) {\n+#endif\n+ report_error(fp, "seek failed");\n+ return -1;\n+ }\n+ fp->block_length = 0; // indicates current block is not loaded\n+ fp->block_address = block_address;\n+ fp->block_offset = block_offset;\n+ return 0;\n+}\n' |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/samtools/bgzf.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/samtools/bgzf.h Thu Sep 07 17:55:18 2017 -0400 |
[ |
@@ -0,0 +1,157 @@ +/* The MIT License + + Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#ifndef __BGZF_H +#define __BGZF_H + +#include <stdint.h> +#include <stdio.h> +#include <stdbool.h> +#include <zlib.h> +#ifdef _USE_KNETFILE +#include "knetfile.h" +#endif + +//typedef int8_t bool; + +typedef struct { + int file_descriptor; + char open_mode; // 'r' or 'w' + bool owned_file, is_uncompressed; +#ifdef _USE_KNETFILE + union { + knetFile *fpr; + FILE *fpw; + } x; +#else + FILE* file; +#endif + int uncompressed_block_size; + int compressed_block_size; + void* uncompressed_block; + void* compressed_block; + int64_t block_address; + int block_length; + int block_offset; + int cache_size; + const char* error; + void *cache; // a pointer to a hash table +} BGZF; + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Open an existing file descriptor for reading or writing. 
+ * Mode must be either "r" or "w". + * A subsequent bgzf_close will not close the file descriptor. + * Returns null on error. + */ +BGZF* bgzf_fdopen(int fd, const char* __restrict mode); + +/* + * Open the specified file for reading or writing. + * Mode must be either "r" or "w". + * Returns null on error. + */ +BGZF* bgzf_open(const char* path, const char* __restrict mode); + +/* + * Close the BGZ file and free all associated resources. + * Does not close the underlying file descriptor if created with bgzf_fdopen. + * Returns zero on success, -1 on error. + */ +int bgzf_close(BGZF* fp); + +/* + * Read up to length bytes from the file storing into data. + * Returns the number of bytes actually read. + * Returns zero on end of file. + * Returns -1 on error. + */ +int bgzf_read(BGZF* fp, void* data, int length); + +/* + * Write length bytes from data to the file. + * Returns the number of bytes written. + * Returns -1 on error. + */ +int bgzf_write(BGZF* fp, const void* data, int length); + +/* + * Return a virtual file pointer to the current location in the file. + * No interpetation of the value should be made, other than a subsequent + * call to bgzf_seek can be used to position the file at the same point. + * Return value is non-negative on success. + * Returns -1 on error. + */ +#define bgzf_tell(fp) ((fp->block_address << 16) | (fp->block_offset & 0xFFFF)) + +/* + * Set the file to read from the location specified by pos, which must + * be a value previously returned by bgzf_tell for this file (but not + * necessarily one returned by this file handle). + * The where argument must be SEEK_SET. + * Seeking on a file opened for write is not supported. + * Returns zero on success, -1 on error. + */ +int64_t bgzf_seek(BGZF* fp, int64_t pos, int where); + +/* + * Set the cache size. Zero to disable. By default, caching is + * disabled. The recommended cache size for frequent random access is + * about 8M bytes. 
+ */ +void bgzf_set_cache_size(BGZF *fp, int cache_size); + +int bgzf_check_EOF(BGZF *fp); +int bgzf_read_block(BGZF* fp); +int bgzf_flush(BGZF* fp); +int bgzf_flush_try(BGZF *fp, int size); + +#ifdef __cplusplus +} +#endif + +static inline int bgzf_getc(BGZF *fp) +{ + int c; + if (fp->block_offset >= fp->block_length) { + if (bgzf_read_block(fp) != 0) return -2; /* error */ + if (fp->block_length == 0) return -1; /* end-of-file */ + } + c = ((unsigned char*)fp->uncompressed_block)[fp->block_offset++]; + if (fp->block_offset == fp->block_length) { +#ifdef _USE_KNETFILE + fp->block_address = knet_tell(fp->x.fpr); +#else + fp->block_address = ftello(fp->file); +#endif + fp->block_offset = 0; + fp->block_length = 0; + } + return c; +} + +#endif |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/samtools/faidx.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/samtools/faidx.c Thu Sep 07 17:55:18 2017 -0400 |
[ |
b'@@ -0,0 +1,422 @@\n+#include <ctype.h>\n+#include <string.h>\n+#include <stdlib.h>\n+#include <stdio.h>\n+#include "faidx.h"\n+#include "khash.h"\n+\n+typedef struct {\n+\tuint64_t len:32, line_len:16, line_blen:16;\n+\tuint64_t offset;\n+} faidx1_t;\n+KHASH_MAP_INIT_STR(s, faidx1_t)\n+\n+#ifndef _NO_RAZF\n+#include "razf.h"\n+#else\n+#ifdef _WIN32\n+#define ftello(fp) ftell(fp)\n+#define fseeko(fp, offset, whence) fseek(fp, offset, whence)\n+#else\n+extern off_t ftello(FILE *stream);\n+extern int fseeko(FILE *stream, off_t offset, int whence);\n+#endif\n+#define RAZF FILE\n+#define razf_read(fp, buf, size) fread(buf, 1, size, fp)\n+#define razf_open(fn, mode) fopen(fn, mode)\n+#define razf_close(fp) fclose(fp)\n+#define razf_seek(fp, offset, whence) fseeko(fp, offset, whence)\n+#define razf_tell(fp) ftello(fp)\n+#endif\n+#ifdef _USE_KNETFILE\n+#include "knetfile.h"\n+#endif\n+\n+struct __faidx_t {\n+\tRAZF *rz;\n+\tint n, m;\n+\tchar **name;\n+\tkhash_t(s) *hash;\n+};\n+\n+#ifndef kroundup32\n+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))\n+#endif\n+\n+static inline void fai_insert_index(faidx_t *idx, const char *name, int len, int line_len, int line_blen, uint64_t offset)\n+{\n+\tkhint_t k;\n+\tint ret;\n+\tfaidx1_t t;\n+\tif (idx->n == idx->m) {\n+\t\tidx->m = idx->m? 
idx->m<<1 : 16;\n+\t\tidx->name = (char**)realloc(idx->name, sizeof(void*) * idx->m);\n+\t}\n+\tidx->name[idx->n] = strdup(name);\n+\tk = kh_put(s, idx->hash, idx->name[idx->n], &ret);\n+\tt.len = len; t.line_len = line_len; t.line_blen = line_blen; t.offset = offset;\n+\tkh_value(idx->hash, k) = t;\n+\t++idx->n;\n+}\n+\n+faidx_t *fai_build_core(RAZF *rz)\n+{\n+\tchar c, *name;\n+\tint l_name, m_name, ret;\n+\tint len, line_len, line_blen, state;\n+\tint l1, l2;\n+\tfaidx_t *idx;\n+\tuint64_t offset;\n+\n+\tidx = (faidx_t*)calloc(1, sizeof(faidx_t));\n+\tidx->hash = kh_init(s);\n+\tname = 0; l_name = m_name = 0;\n+\tlen = line_len = line_blen = -1; state = 0; l1 = l2 = -1; offset = 0;\n+\twhile (razf_read(rz, &c, 1)) {\n+\t\tif (c == \'\\n\') { // an empty line\n+\t\t\tif (state == 1) {\n+\t\t\t\toffset = razf_tell(rz);\n+\t\t\t\tcontinue;\n+\t\t\t} else if ((state == 0 && len < 0) || state == 2) continue;\n+\t\t}\n+\t\tif (c == \'>\') { // fasta header\n+\t\t\tif (len >= 0)\n+\t\t\t\tfai_insert_index(idx, name, len, line_len, line_blen, offset);\n+\t\t\tl_name = 0;\n+\t\t\twhile ((ret = razf_read(rz, &c, 1)) != 0 && !isspace(c)) {\n+\t\t\t\tif (m_name < l_name + 2) {\n+\t\t\t\t\tm_name = l_name + 2;\n+\t\t\t\t\tkroundup32(m_name);\n+\t\t\t\t\tname = (char*)realloc(name, m_name);\n+\t\t\t\t}\n+\t\t\t\tname[l_name++] = c;\n+\t\t\t}\n+\t\t\tname[l_name] = \'\\0\';\n+\t\t\tif (ret == 0) {\n+\t\t\t\tfprintf(stderr, "[fai_build_core] the last entry has no sequence\\n");\n+\t\t\t\tfree(name); fai_destroy(idx);\n+\t\t\t\treturn 0;\n+\t\t\t}\n+\t\t\tif (c != \'\\n\') while (razf_read(rz, &c, 1) && c != \'\\n\');\n+\t\t\tstate = 1; len = 0;\n+\t\t\toffset = razf_tell(rz);\n+\t\t} else {\n+\t\t\tif (state == 3) {\n+\t\t\t\tfprintf(stderr, "[fai_build_core] inlined empty line is not allowed in sequence \'%s\'.\\n", name);\n+\t\t\t\tfree(name); fai_destroy(idx);\n+\t\t\t\treturn 0;\n+\t\t\t}\n+\t\t\tif (state == 2) state = 3;\n+\t\t\tl1 = l2 = 0;\n+\t\t\tdo 
{\n+\t\t\t\t++l1;\n+\t\t\t\tif (isgraph(c)) ++l2;\n+\t\t\t} while ((ret = razf_read(rz, &c, 1)) && c != \'\\n\');\n+\t\t\tif (state == 3 && l2) {\n+\t\t\t\tfprintf(stderr, "[fai_build_core] different line length in sequence \'%s\'.\\n", name);\n+\t\t\t\tfree(name); fai_destroy(idx);\n+\t\t\t\treturn 0;\n+\t\t\t}\n+\t\t\t++l1; len += l2;\n+\t\t\tif (l2 >= 0x10000) {\n+\t\t\t\tfprintf(stderr, "[fai_build_core] line length exceeds 65535 in sequence \'%s\'.\\n", name);\n+\t\t\t\tfree(name); fai_destroy(idx);\n+\t\t\t\treturn 0;\n+\t\t\t}\n+\t\t\tif (state == 1) line_len = l1, line_blen = l2, state = 0;\n+\t\t\telse if (state == 0) {\n+\t\t\t\tif (l1 != line_len || l2 != line_blen) state = 2;\n+\t\t\t}\n+\t\t}\n+\t}\n+\tfai_insert_index(idx, name, len, line_len, line_blen, offset);\n+\tfree(name);\n+\treturn idx;\n+}\n+\n+void fai_save(const faidx_t *fai, FILE *fp)\n+{\n+\tkhint_t k;\n+\tint i;\n+\tfor (i = 0; i < fai->n; ++i) {\n+\t\tfaidx1_t x;\n+\t\tk = kh_get(s, fai->hash, fai->name[i]);\n+\t\tx = kh_value(fai->hash, k);\n+#ifdef _WIN32\n+\t\tfprintf(fp, "%s\\t%'..b' knet_close(fp_remote);\n+\n+ return fopen(fn, "r");\n+}\n+#endif\n+\n+faidx_t *fai_load(const char *fn)\n+{\n+\tchar *str;\n+\tFILE *fp;\n+\tfaidx_t *fai;\n+\tstr = (char*)calloc(strlen(fn) + 5, 1);\n+\tsprintf(str, "%s.fai", fn);\n+\n+#ifdef _USE_KNETFILE\n+ if (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn)\n+ {\n+ fp = download_and_open(str);\n+ if ( !fp )\n+ {\n+ fprintf(stderr, "[fai_load] failed to open remote FASTA index %s\\n", str);\n+ free(str);\n+ return 0;\n+ }\n+ }\n+ else\n+#endif\n+ fp = fopen(str, "rb");\n+\tif (fp == 0) {\n+\t\tfprintf(stderr, "[fai_load] build FASTA index.\\n");\n+\t\tfai_build(fn);\n+\t\tfp = fopen(str, "rb");\n+\t\tif (fp == 0) {\n+\t\t\tfprintf(stderr, "[fai_load] fail to open FASTA index.\\n");\n+\t\t\tfree(str);\n+\t\t\treturn 0;\n+\t\t}\n+\t}\n+\n+\tfai = fai_read(fp);\n+\tfclose(fp);\n+\n+\tfai->rz = razf_open(fn, "rb");\n+\tfree(str);\n+\tif 
(fai->rz == 0) {\n+\t\tfprintf(stderr, "[fai_load] fail to open FASTA file.\\n");\n+\t\treturn 0;\n+\t}\n+\treturn fai;\n+}\n+\n+char *fai_fetch(const faidx_t *fai, const char *str, int *len)\n+{\n+\tchar *s, *p, c;\n+\tint i, l, k;\n+\tkhiter_t iter;\n+\tfaidx1_t val;\n+\tkhash_t(s) *h;\n+\tint beg, end;\n+\n+\tbeg = end = -1;\n+\th = fai->hash;\n+\tl = strlen(str);\n+\tp = s = (char*)malloc(l+1);\n+\t/* squeeze out "," */\n+\tfor (i = k = 0; i != l; ++i)\n+\t\tif (str[i] != \',\' && !isspace(str[i])) s[k++] = str[i];\n+\ts[k] = 0;\n+\tfor (i = 0; i != k; ++i) if (s[i] == \':\') break;\n+\ts[i] = 0;\n+\titer = kh_get(s, h, s); /* get the ref_id */\n+\tif (iter == kh_end(h)) {\n+\t\t*len = 0;\n+\t\tfree(s); return 0;\n+\t}\n+\tval = kh_value(h, iter);\n+\tif (i == k) { /* dump the whole sequence */\n+\t\tbeg = 0; end = val.len;\n+\t} else {\n+\t\tfor (p = s + i + 1; i != k; ++i) if (s[i] == \'-\') break;\n+\t\tbeg = atoi(p);\n+\t\tif (i < k) {\n+\t\t\tp = s + i + 1;\n+\t\t\tend = atoi(p);\n+\t\t} else end = val.len;\n+\t}\n+\tif (beg > 0) --beg;\n+\tif (beg >= val.len) beg = val.len;\n+\tif (end >= val.len) end = val.len;\n+\tif (beg > end) beg = end;\n+\tfree(s);\n+\n+\t// now retrieve the sequence\n+\tl = 0;\n+\ts = (char*)malloc(end - beg + 2);\n+\trazf_seek(fai->rz, val.offset + beg / val.line_blen * val.line_len + beg % val.line_blen, SEEK_SET);\n+\twhile (razf_read(fai->rz, &c, 1) == 1 && l < end - beg && !fai->rz->z_err)\n+\t\tif (isgraph(c)) s[l++] = c;\n+\ts[l] = \'\\0\';\n+\t*len = l;\n+\treturn s;\n+}\n+\n+int faidx_main(int argc, char *argv[])\n+{\n+\tif (argc == 1) {\n+\t\tfprintf(stderr, "Usage: faidx <in.fasta> [<reg> [...]]\\n");\n+\t\treturn 1;\n+\t} else {\n+\t\tif (argc == 2) fai_build(argv[1]);\n+\t\telse {\n+\t\t\tint i, j, k, l;\n+\t\t\tchar *s;\n+\t\t\tfaidx_t *fai;\n+\t\t\tfai = fai_load(argv[1]);\n+\t\t\tif (fai == 0) return 1;\n+\t\t\tfor (i = 2; i != argc; ++i) {\n+\t\t\t\tprintf(">%s\\n", argv[i]);\n+\t\t\t\ts = fai_fetch(fai, argv[i], 
&l);\n+\t\t\t\tfor (j = 0; j < l; j += 60) {\n+\t\t\t\t\tfor (k = 0; k < 60 && k < l - j; ++k)\n+\t\t\t\t\t\tputchar(s[j + k]);\n+\t\t\t\t\tputchar(\'\\n\');\n+\t\t\t\t}\n+\t\t\t\tfree(s);\n+\t\t\t}\n+\t\t\tfai_destroy(fai);\n+\t\t}\n+\t}\n+\treturn 0;\n+}\n+\n+int faidx_fetch_nseq(const faidx_t *fai) \n+{\n+\treturn fai->n;\n+}\n+\n+char *faidx_fetch_seq(const faidx_t *fai, char *c_name, int p_beg_i, int p_end_i, int *len)\n+{\n+\tint l;\n+\tchar c;\n+ khiter_t iter;\n+ faidx1_t val;\n+\tchar *seq=NULL;\n+\n+ // Adjust position\n+ iter = kh_get(s, fai->hash, c_name);\n+ if(iter == kh_end(fai->hash)) return 0;\n+ val = kh_value(fai->hash, iter);\n+\tif(p_end_i < p_beg_i) p_beg_i = p_end_i;\n+ if(p_beg_i < 0) p_beg_i = 0;\n+ else if(val.len <= p_beg_i) p_beg_i = val.len - 1;\n+ if(p_end_i < 0) p_end_i = 0;\n+ else if(val.len <= p_end_i) p_end_i = val.len - 1;\n+\n+ // Now retrieve the sequence \n+\tl = 0;\n+\tseq = (char*)malloc(p_end_i - p_beg_i + 2);\n+\trazf_seek(fai->rz, val.offset + p_beg_i / val.line_blen * val.line_len + p_beg_i % val.line_blen, SEEK_SET);\n+\twhile (razf_read(fai->rz, &c, 1) == 1 && l < p_end_i - p_beg_i + 1)\n+\t\tif (isgraph(c)) seq[l++] = c;\n+\tseq[l] = \'\\0\';\n+\t*len = l;\n+\treturn seq;\n+}\n+\n+#ifdef FAIDX_MAIN\n+int main(int argc, char *argv[]) { return faidx_main(argc, argv); }\n+#endif\n' |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/samtools/faidx.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/samtools/faidx.h Thu Sep 07 17:55:18 2017 -0400 |
b |
@@ -0,0 +1,103 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li <lh3@sanger.ac.uk> */ + +#ifndef FAIDX_H +#define FAIDX_H + +/*! + @header + + Index FASTA files and extract subsequence. + + @copyright The Wellcome Trust Sanger Institute. + */ + +struct __faidx_t; +typedef struct __faidx_t faidx_t; + +#ifdef __cplusplus +extern "C" { +#endif + + /*! + @abstract Build index for a FASTA or razip compressed FASTA file. + @param fn FASTA file name + @return 0 on success; or -1 on failure + @discussion File "fn.fai" will be generated. + */ + int fai_build(const char *fn); + + /*! + @abstract Distroy a faidx_t struct. + @param fai Pointer to the struct to be destroyed + */ + void fai_destroy(faidx_t *fai); + + /*! + @abstract Load index from "fn.fai". + @param fn File name of the FASTA file + */ + faidx_t *fai_load(const char *fn); + + /*! 
+ @abstract Fetch the sequence in a region. + @param fai Pointer to the faidx_t struct + @param reg Region in the format "chr2:20,000-30,000" + @param len Length of the region + @return Pointer to the sequence; null on failure + + @discussion The returned sequence is allocated by malloc family + and should be destroyed by end users by calling free() on it. + */ + char *fai_fetch(const faidx_t *fai, const char *reg, int *len); + + /*! + @abstract Fetch the number of sequences. + @param fai Pointer to the faidx_t struct + @return The number of sequences + */ + int faidx_fetch_nseq(const faidx_t *fai); + + /*! + @abstract Fetch the sequence in a region. + @param fai Pointer to the faidx_t struct + @param c_name Region name + @param p_beg_i Beginning position number (zero-based) + @param p_end_i End position number (zero-based) + @param len Length of the region + @return Pointer to the sequence; null on failure + + @discussion The returned sequence is allocated by malloc family + and should be destroyed by end users by calling free() on it. + */ + char *faidx_fetch_seq(const faidx_t *fai, char *c_name, int p_beg_i, int p_end_i, int *len); + +#ifdef __cplusplus +} +#endif + +#endif |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/samtools/glf.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/samtools/glf.c Thu Sep 07 17:55:18 2017 -0400 |
[ |
@@ -0,0 +1,236 @@ +#include <string.h> +#include <stdlib.h> +#include "glf.h" + +#ifdef _NO_BGZF +// then alias bgzf_*() functions +#endif + +static int glf3_is_BE = 0; + +static inline uint32_t bam_swap_endian_4(uint32_t v) +{ + v = ((v & 0x0000FFFFU) << 16) | (v >> 16); + return ((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8); +} + +static inline uint16_t bam_swap_endian_2(uint16_t v) +{ + return (uint16_t)(((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8)); +} + +static inline int bam_is_big_endian() +{ + long one= 1; + return !(*((char *)(&one))); +} + +glf3_header_t *glf3_header_init() +{ + glf3_is_BE = bam_is_big_endian(); + return (glf3_header_t*)calloc(1, sizeof(glf3_header_t)); +} + +glf3_header_t *glf3_header_read(glfFile fp) +{ + glf3_header_t *h; + char magic[4]; + h = glf3_header_init(); + bgzf_read(fp, magic, 4); + if (strncmp(magic, "GLF\3", 4)) { + fprintf(stderr, "[glf3_header_read] invalid magic.\n"); + glf3_header_destroy(h); + return 0; + } + bgzf_read(fp, &h->l_text, 4); + if (glf3_is_BE) h->l_text = bam_swap_endian_4(h->l_text); + if (h->l_text) { + h->text = (uint8_t*)calloc(h->l_text + 1, 1); + bgzf_read(fp, h->text, h->l_text); + } + return h; +} + +void glf3_header_write(glfFile fp, const glf3_header_t *h) +{ + int32_t x; + bgzf_write(fp, "GLF\3", 4); + x = glf3_is_BE? 
bam_swap_endian_4(h->l_text) : h->l_text; + bgzf_write(fp, &x, 4); + if (h->l_text) bgzf_write(fp, h->text, h->l_text); +} + +void glf3_header_destroy(glf3_header_t *h) +{ + free(h->text); + free(h); +} + +char *glf3_ref_read(glfFile fp, int *len) +{ + int32_t n, x; + char *str; + *len = 0; + if (bgzf_read(fp, &n, 4) != 4) return 0; + if (glf3_is_BE) n = bam_swap_endian_4(n); + if (n < 0) { + fprintf(stderr, "[glf3_ref_read] invalid reference name length: %d.\n", n); + return 0; + } + str = (char*)calloc(n + 1, 1); // not necesarily n+1 in fact + x = bgzf_read(fp, str, n); + x += bgzf_read(fp, len, 4); + if (x != n + 4) { + free(str); *len = -1; return 0; // truncated + } + if (glf3_is_BE) *len = bam_swap_endian_4(*len); + return str; +} + +void glf3_ref_write(glfFile fp, const char *str, int len) +{ + int32_t m, n = strlen(str) + 1; + m = glf3_is_BE? bam_swap_endian_4(n) : n; + bgzf_write(fp, &m, 4); + bgzf_write(fp, str, n); + if (glf3_is_BE) len = bam_swap_endian_4(len); + bgzf_write(fp, &len, 4); +} + +void glf3_view1(const char *ref_name, const glf3_t *g3, int pos) +{ + int j; + if (g3->rtype == GLF3_RTYPE_END) return; + printf("%s\t%d\t%c\t%d\t%d\t%d", ref_name, pos + 1, + g3->rtype == GLF3_RTYPE_INDEL? '*' : "XACMGRSVTWYHKDBN"[g3->ref_base], + g3->depth, g3->rms_mapQ, g3->min_lk); + if (g3->rtype == GLF3_RTYPE_SUB) + for (j = 0; j != 10; ++j) printf("\t%d", g3->lk[j]); + else { + printf("\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t", g3->lk[0], g3->lk[1], g3->lk[2], g3->indel_len[0], g3->indel_len[1], + g3->indel_len[0]? g3->indel_seq[0] : "*", g3->indel_len[1]? 
g3->indel_seq[1] : "*"); + } + printf("\n"); +} + +int glf3_write1(glfFile fp, const glf3_t *g3) +{ + int r; + uint8_t c; + uint32_t y[2]; + c = g3->rtype<<4 | g3->ref_base; + r = bgzf_write(fp, &c, 1); + if (g3->rtype == GLF3_RTYPE_END) return r; + y[0] = g3->offset; + y[1] = g3->min_lk<<24 | g3->depth; + if (glf3_is_BE) { + y[0] = bam_swap_endian_4(y[0]); + y[1] = bam_swap_endian_4(y[1]); + } + r += bgzf_write(fp, y, 8); + r += bgzf_write(fp, &g3->rms_mapQ, 1); + if (g3->rtype == GLF3_RTYPE_SUB) r += bgzf_write(fp, g3->lk, 10); + else { + int16_t x[2]; + r += bgzf_write(fp, g3->lk, 3); + x[0] = glf3_is_BE? bam_swap_endian_2(g3->indel_len[0]) : g3->indel_len[0]; + x[1] = glf3_is_BE? bam_swap_endian_2(g3->indel_len[1]) : g3->indel_len[1]; + r += bgzf_write(fp, x, 4); + if (g3->indel_len[0]) r += bgzf_write(fp, g3->indel_seq[0], abs(g3->indel_len[0])); + if (g3->indel_len[1]) r += bgzf_write(fp, g3->indel_seq[1], abs(g3->indel_len[1])); + } + return r; +} + +#ifndef kv_roundup32 +#define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +int glf3_read1(glfFile fp, glf3_t *g3) +{ + int r; + uint8_t c; + uint32_t y[2]; + r = bgzf_read(fp, &c, 1); + if (r == 0) return 0; + g3->ref_base = c & 0xf; + g3->rtype = c>>4; + if (g3->rtype == GLF3_RTYPE_END) return r; + r += bgzf_read(fp, y, 8); + if (glf3_is_BE) { + y[0] = bam_swap_endian_4(y[0]); + y[1] = bam_swap_endian_4(y[1]); + } + g3->offset = y[0]; + g3->min_lk = y[1]>>24; + g3->depth = y[1]<<8>>8; + r += bgzf_read(fp, &g3->rms_mapQ, 1); + if (g3->rtype == GLF3_RTYPE_SUB) r += bgzf_read(fp, g3->lk, 10); + else { + int16_t x[2], max; + r += bgzf_read(fp, g3->lk, 3); + r += bgzf_read(fp, x, 4); + if (glf3_is_BE) { + x[0] = bam_swap_endian_2(x[0]); + x[1] = bam_swap_endian_2(x[1]); + } + g3->indel_len[0] = x[0]; + g3->indel_len[1] = x[1]; + x[0] = abs(x[0]); x[1] = abs(x[1]); + max = (x[0] > x[1]? 
x[0] : x[1]) + 1; + if (g3->max_len < max) { + g3->max_len = max; + kv_roundup32(g3->max_len); + g3->indel_seq[0] = (char*)realloc(g3->indel_seq[0], g3->max_len); + g3->indel_seq[1] = (char*)realloc(g3->indel_seq[1], g3->max_len); + } + r += bgzf_read(fp, g3->indel_seq[0], x[0]); + r += bgzf_read(fp, g3->indel_seq[1], x[1]); + g3->indel_seq[0][x[0]] = g3->indel_seq[1][x[1]] = 0; + } + return r; +} + +void glf3_view(glfFile fp) +{ + glf3_header_t *h; + char *name; + glf3_t *g3; + int len; + h = glf3_header_read(fp); + g3 = glf3_init1(); + while ((name = glf3_ref_read(fp, &len)) != 0) { + int pos = 0; + while (glf3_read1(fp, g3) && g3->rtype != GLF3_RTYPE_END) { + pos += g3->offset; + glf3_view1(name, g3, pos); + } + free(name); + } + glf3_header_destroy(h); + glf3_destroy1(g3); +} + +int glf3_view_main(int argc, char *argv[]) +{ + glfFile fp; + if (argc == 1) { + fprintf(stderr, "Usage: glfview <in.glf>\n"); + return 1; + } + fp = (strcmp(argv[1], "-") == 0)? bgzf_fdopen(fileno(stdin), "r") : bgzf_open(argv[1], "r"); + if (fp == 0) { + fprintf(stderr, "Fail to open file '%s'\n", argv[1]); + return 1; + } + glf3_view(fp); + bgzf_close(fp); + return 0; +} + +#ifdef GLFVIEW_MAIN +int main(int argc, char *argv[]) +{ + return glf3_view_main(argc, argv); +} +#endif |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/samtools/glf.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/samtools/glf.h Thu Sep 07 17:55:18 2017 -0400 |
[ |
@@ -0,0 +1,56 @@ +#ifndef GLF_H_ +#define GLF_H_ + +typedef struct { + unsigned char ref_base:4, dummy:4; /** "XACMGRSVTWYHKDBN"[ref_base] gives the reference base */ + unsigned char max_mapQ; /** maximum mapping quality */ + unsigned char lk[10]; /** log likelihood ratio, capped at 255 */ + unsigned min_lk:8, depth:24; /** minimum lk capped at 255, and the number of mapped reads */ +} glf1_t; + +#include <stdint.h> +#include "bgzf.h" +typedef BGZF *glfFile; + +#define GLF3_RTYPE_END 0 +#define GLF3_RTYPE_SUB 1 +#define GLF3_RTYPE_INDEL 2 + +typedef struct { + uint8_t ref_base:4, rtype:4; /** "XACMGRSVTWYHKDBN"[ref_base] gives the reference base */ + uint8_t rms_mapQ; /** RMS mapping quality */ + uint8_t lk[10]; /** log likelihood ratio, capped at 255 */ + uint32_t min_lk:8, depth:24; /** minimum lk capped at 255, and the number of mapped reads */ + int32_t offset; /** the first base in a chromosome has offset zero. */ + // for indel (lkHom1, lkHom2 and lkHet are the first three elements in lk[10]) + int16_t indel_len[2]; + int32_t max_len; // maximum indel len; will be modified by glf3_read1() + char *indel_seq[2]; +} glf3_t; + +typedef struct { + int32_t l_text; + uint8_t *text; +} glf3_header_t; + +#ifdef __cplusplus +extern "C" { +#endif + +#define glf3_init1() ((glf3_t*)calloc(1, sizeof(glf3_t))) +#define glf3_destroy1(g3) do { free((g3)->indel_seq[0]); free((g3)->indel_seq[1]); free(g3); } while (0) + + glf3_header_t *glf3_header_init(); + glf3_header_t *glf3_header_read(glfFile fp); + void glf3_header_write(glfFile fp, const glf3_header_t *h); + void glf3_header_destroy(glf3_header_t *h); + char *glf3_ref_read(glfFile fp, int *len); + void glf3_ref_write(glfFile fp, const char *name, int len); + int glf3_write1(glfFile fp, const glf3_t *g3); + int glf3_read1(glfFile fp, glf3_t *g3); + +#ifdef __cplusplus +} +#endif + +#endif |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/samtools/kaln.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/samtools/kaln.c Thu Sep 07 17:55:18 2017 -0400 |
[ |
b'@@ -0,0 +1,370 @@\n+/* The MIT License\n+\n+ Copyright (c) 2003-2006, 2008, 2009, by Heng Li <lh3lh3@gmail.com>\n+\n+ Permission is hereby granted, free of charge, to any person obtaining\n+ a copy of this software and associated documentation files (the\n+ "Software"), to deal in the Software without restriction, including\n+ without limitation the rights to use, copy, modify, merge, publish,\n+ distribute, sublicense, and/or sell copies of the Software, and to\n+ permit persons to whom the Software is furnished to do so, subject to\n+ the following conditions:\n+\n+ The above copyright notice and this permission notice shall be\n+ included in all copies or substantial portions of the Software.\n+\n+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,\n+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND\n+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS\n+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN\n+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN\n+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n+ SOFTWARE.\n+*/\n+\n+#include <stdlib.h>\n+#include <stdio.h>\n+#include <string.h>\n+#include <stdint.h>\n+#include "kaln.h"\n+\n+#define FROM_M 0\n+#define FROM_I 1\n+#define FROM_D 2\n+\n+typedef struct {\n+\tint i, j;\n+\tunsigned char ctype;\n+} path_t;\n+\n+int aln_sm_blosum62[] = {\n+/*\t A R N D C Q E G H I L K M F P S T W Y V * X */\n+\t 4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0,-4, 0,\n+\t-1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3,-4,-1,\n+\t-2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3,-4,-1,\n+\t-2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1, 0,-1,-4,-3,-3,-4,-1,\n+\t 0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-4,-2,\n+\t-1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2, 1, 0,-3,-1, 0,-1,-2,-1,-2,-4,-1,\n+\t-1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3, 
1,-2,-3,-1, 0,-1,-3,-2,-2,-4,-1,\n+\t 0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2, 0,-2,-2,-3,-3,-4,-1,\n+\t-2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2, 2,-3,-4,-1,\n+\t-1,-3,-3,-3,-1,-3,-3,-4,-3, 4, 2,-3, 1, 0,-3,-2,-1,-3,-1, 3,-4,-1,\n+\t-1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,-2, 2, 0,-3,-2,-1,-2,-1, 1,-4,-1,\n+\t-1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,-1,-3,-1, 0,-1,-3,-2,-2,-4,-1,\n+\t-1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, 0,-2,-1,-1,-1,-1, 1,-4,-1,\n+\t-2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,-4,-2,-2, 1, 3,-1,-4,-1,\n+\t-1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,-1,-1,-4,-3,-2,-4,-2,\n+\t 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, 1,-3,-2,-2,-4, 0,\n+\t 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,-2,-2, 0,-4, 0,\n+\t-3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, 2,-3,-4,-2,\n+\t-2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,-1,-4,-1,\n+\t 0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4,-4,-1,\n+\t-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4, 1,-4,\n+\t 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-4,-1\n+};\n+\n+int aln_sm_blast[] = {\n+\t1, -3, -3, -3, -2,\n+\t-3, 1, -3, -3, -2,\n+\t-3, -3, 1, -3, -2,\n+\t-3, -3, -3, 1, -2,\n+\t-2, -2, -2, -2, -2\n+};\n+\n+ka_param_t ka_param_blast = { 5, 2, 2, aln_sm_blast, 5, 50 };\n+ka_param_t ka_param_aa2aa = { 10, 2, 2, aln_sm_blosum62, 22, 50 };\n+\n+static uint32_t *ka_path2cigar32(const path_t *path, int path_len, int *n_cigar)\n+{\n+\tint i, n;\n+\tuint32_t *cigar;\n+\tunsigned char last_type;\n+\n+\tif (path_len == 0 || path == 0) {\n+\t\t*n_cigar = 0;\n+\t\treturn 0;\n+\t}\n+\n+\tlast_type = path->ctype;\n+\tfor (i = n = 1; i < path_len; ++i) {\n+\t\tif (last_type != path[i].ctype) ++n;\n+\t\tlast_type = path[i].ctype;\n+\t}\n+\t*n_cigar = n;\n+\tcigar = (uint32_t*)calloc(*n_cigar, 4);\n+\n+\tcigar[0] = 1u << 4 | path[path_len-1].ctype;\n+\tlast_type = path[path_len-1].ctype;\n+\tfor (i = path_len - 2, n = 0; i >= 0; --i) {\n+\t\tif 
(path[i].ctype == last_type) cigar[n] += 1u << 4;\n+\t\telse {\n+\t\t\tcigar[++n] = 1u << 4 | path[i].ctype;\n+\t\t\tlast'..b'= last; last = s;\n+\n+\t/* core dynamic programming, part 1 */\n+\ttmp_end = (b2 < len2)? b2 : len2 - 1;\n+\tfor (j = 1; j <= tmp_end; ++j) {\n+\t\tq = dpcell[j]; s = curr; SET_INF(*s);\n+\t\tset_end_I(s->I, q, last);\n+\t\tend = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1;\n+\t\tmat = score_matrix + seq2[j] * N_MATRIX_ROW;\n+\t\t++s; ++q;\n+\t\tfor (i = 1; i != end; ++i, ++s, ++q) {\n+\t\t\tset_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */\n+\t\t\tset_I(s->I, q, last + i);\n+\t\t\tset_D(s->D, q, s - 1);\n+\t\t}\n+\t\tset_M(s->M, q, last + i - 1, mat[seq1[i]]);\n+\t\tset_D(s->D, q, s - 1);\n+\t\tif (j + b1 - 1 > len1) { /* bug fixed, 040227 */\n+\t\t\tset_end_I(s->I, q, last + i);\n+\t\t} else s->I = MINOR_INF;\n+\t\ts = curr; curr = last; last = s;\n+\t}\n+\t/* last row for part 1, use set_end_D() instead of set_D() */\n+\tif (j == len2 && b2 != len2 - 1) {\n+\t\tq = dpcell[j]; s = curr; SET_INF(*s);\n+\t\tset_end_I(s->I, q, last);\n+\t\tend = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1;\n+\t\tmat = score_matrix + seq2[j] * N_MATRIX_ROW;\n+\t\t++s; ++q;\n+\t\tfor (i = 1; i != end; ++i, ++s, ++q) {\n+\t\t\tset_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! 
*/\n+\t\t\tset_I(s->I, q, last + i);\n+\t\t\tset_end_D(s->D, q, s - 1);\n+\t\t}\n+\t\tset_M(s->M, q, last + i - 1, mat[seq1[i]]);\n+\t\tset_end_D(s->D, q, s - 1);\n+\t\tif (j + b1 - 1 > len1) { /* bug fixed, 040227 */\n+\t\t\tset_end_I(s->I, q, last + i);\n+\t\t} else s->I = MINOR_INF;\n+\t\ts = curr; curr = last; last = s;\n+\t\t++j;\n+\t}\n+\n+\t/* core dynamic programming, part 2 */\n+\tfor (; j <= len2 - b2 + 1; ++j) {\n+\t\tSET_INF(curr[j - b2]);\n+\t\tmat = score_matrix + seq2[j] * N_MATRIX_ROW;\n+\t\tend = j + b1 - 1;\n+\t\tfor (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i != end; ++i, ++s, ++q) {\n+\t\t\tset_M(s->M, q, last + i - 1, mat[seq1[i]]);\n+\t\t\tset_I(s->I, q, last + i);\n+\t\t\tset_D(s->D, q, s - 1);\n+\t\t}\n+\t\tset_M(s->M, q, last + i - 1, mat[seq1[i]]);\n+\t\tset_D(s->D, q, s - 1);\n+\t\ts->I = MINOR_INF;\n+\t\ts = curr; curr = last; last = s;\n+\t}\n+\n+\t/* core dynamic programming, part 3 */\n+\tfor (; j < len2; ++j) {\n+\t\tSET_INF(curr[j - b2]);\n+\t\tmat = score_matrix + seq2[j] * N_MATRIX_ROW;\n+\t\tfor (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) {\n+\t\t\tset_M(s->M, q, last + i - 1, mat[seq1[i]]);\n+\t\t\tset_I(s->I, q, last + i);\n+\t\t\tset_D(s->D, q, s - 1);\n+\t\t}\n+\t\tset_M(s->M, q, last + len1 - 1, mat[seq1[i]]);\n+\t\tset_end_I(s->I, q, last + i);\n+\t\tset_D(s->D, q, s - 1);\n+\t\ts = curr; curr = last; last = s;\n+\t}\n+\t/* last row */\n+\tif (j == len2) {\n+\t\tSET_INF(curr[j - b2]);\n+\t\tmat = score_matrix + seq2[j] * N_MATRIX_ROW;\n+\t\tfor (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) {\n+\t\t\tset_M(s->M, q, last + i - 1, mat[seq1[i]]);\n+\t\t\tset_I(s->I, q, last + i);\n+\t\t\tset_end_D(s->D, q, s - 1);\n+\t\t}\n+\t\tset_M(s->M, q, last + len1 - 1, mat[seq1[i]]);\n+\t\tset_end_I(s->I, q, last + i);\n+\t\tset_end_D(s->D, q, s - 1);\n+\t\ts = curr; curr = last; last = s;\n+\t}\n+\n+\t*_score = last[len1].M;\n+\tif (n_cigar) { /* backtrace 
*/\n+\t\tpath_t *p, *path = (path_t*)malloc(sizeof(path_t) * (len1 + len2 + 2));\n+\t\ti = len1; j = len2;\n+\t\tq = dpcell[j] + i;\n+\t\ts = last + len1;\n+\t\tmax = s->M; type = q->Mt; ctype = FROM_M;\n+\t\tif (s->I > max) { max = s->I; type = q->It; ctype = FROM_I; }\n+\t\tif (s->D > max) { max = s->D; type = q->Dt; ctype = FROM_D; }\n+\n+\t\tp = path;\n+\t\tp->ctype = ctype; p->i = i; p->j = j; /* bug fixed 040408 */\n+\t\t++p;\n+\t\tdo {\n+\t\t\tswitch (ctype) {\n+\t\t\tcase FROM_M: --i; --j; break;\n+\t\t\tcase FROM_I: --j; break;\n+\t\t\tcase FROM_D: --i; break;\n+\t\t\t}\n+\t\t\tq = dpcell[j] + i;\n+\t\t\tctype = type;\n+\t\t\tswitch (type) {\n+\t\t\tcase FROM_M: type = q->Mt; break;\n+\t\t\tcase FROM_I: type = q->It; break;\n+\t\t\tcase FROM_D: type = q->Dt; break;\n+\t\t\t}\n+\t\t\tp->ctype = ctype; p->i = i; p->j = j;\n+\t\t\t++p;\n+\t\t} while (i || j);\n+\t\tcigar = ka_path2cigar32(path, p - path - 1, n_cigar);\n+\t\tfree(path);\n+\t}\n+\n+\t/* free memory */\n+\tfor (j = b2 + 1; j <= len2; ++j)\n+\t\tdpcell[j] += j - b2;\n+\tfor (j = 0; j <= len2; ++j)\n+\t\tfree(dpcell[j]);\n+\tfree(dpcell);\n+\tfree(curr); free(last);\n+\n+\treturn cigar;\n+}\n' |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/samtools/kaln.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/samtools/kaln.h Thu Sep 07 17:55:18 2017 -0400 |
b |
/* The MIT License

   Copyright (c) 2003-2006, 2008, 2009 by Heng Li <lh3@live.co.uk>

   Permission is hereby granted, free of charge, to any person obtaining
   a copy of this software and associated documentation files (the
   "Software"), to deal in the Software without restriction, including
   without limitation the rights to use, copy, modify, merge, publish,
   distribute, sublicense, and/or sell copies of the Software, and to
   permit persons to whom the Software is furnished to do so, subject to
   the following conditions:

   The above copyright notice and this permission notice shall be
   included in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
   SOFTWARE.
*/

#ifndef LH3_KALN_H_
#define LH3_KALN_H_

#include <stdint.h>

/* Score used for unreachable DP cells. Parenthesized so that the macro
 * expands as a single operand in any surrounding expression. */
#define MINOR_INF (-1073741823)

/*
 * Alignment parameters. The field order is what the positional
 * initializers in kaln.c rely on, e.g.
 *   ka_param_blast = { 5, 2, 2, aln_sm_blast, 5, 50 }.
 */
typedef struct {
	int gap_open;
	int gap_ext;
	int gap_end;

	int *matrix;    /* flattened square score matrix (5x5 for aln_sm_blast, 22x22 for aln_sm_blosum62) */
	int row;        /* number of rows/columns of matrix */
	int band_width;
} ka_param_t;

#ifdef __cplusplus
extern "C" {
#endif

	/*
	 * Global alignment of seq1 against seq2.
	 *
	 * seq1/seq2 hold integer residue codes that are used directly as
	 * indices into ap->matrix. The alignment score is stored in *_score.
	 * When n_cigar is non-NULL a backtrace is performed: the returned
	 * array has *n_cigar elements, each encoded as (length << 4 | op)
	 * with op 0 = match/mismatch, 1 = insertion, 2 = deletion.
	 * NOTE(review): the array is heap-allocated and not released by the
	 * visible part of kaln.c, so the caller presumably owns it and must
	 * free() it - confirm.
	 */
	uint32_t *ka_global_core(uint8_t *seq1, int len1, uint8_t *seq2, int len2, const ka_param_t *ap, int *_score, int *n_cigar);

#ifdef __cplusplus
}
#endif

/* Parameter presets defined in kaln.c. */
extern ka_param_t ka_param_blast; /* = { 5, 2, 2, aln_sm_blast, 5, 50 }; */
extern ka_param_t ka_param_aa2aa; /* = { 10, 2, 2, aln_sm_blosum62, 22, 50 }; */

#endif
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/samtools/khash.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/samtools/khash.h Thu Sep 07 17:55:18 2017 -0400 |
[ |
b'@@ -0,0 +1,486 @@\n+/* The MIT License\n+\n+ Copyright (c) 2008 Genome Research Ltd (GRL).\n+\n+ Permission is hereby granted, free of charge, to any person obtaining\n+ a copy of this software and associated documentation files (the\n+ "Software"), to deal in the Software without restriction, including\n+ without limitation the rights to use, copy, modify, merge, publish,\n+ distribute, sublicense, and/or sell copies of the Software, and to\n+ permit persons to whom the Software is furnished to do so, subject to\n+ the following conditions:\n+\n+ The above copyright notice and this permission notice shall be\n+ included in all copies or substantial portions of the Software.\n+\n+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,\n+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND\n+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS\n+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN\n+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN\n+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n+ SOFTWARE.\n+*/\n+\n+/* Contact: Heng Li <lh3@sanger.ac.uk> */\n+\n+/*\n+ An example:\n+\n+#include "khash.h"\n+KHASH_MAP_INIT_INT(32, char)\n+int main() {\n+\tint ret, is_missing;\n+\tkhiter_t k;\n+\tkhash_t(32) *h = kh_init(32);\n+\tk = kh_put(32, h, 5, &ret);\n+\tif (!ret) kh_del(32, h, k);\n+\tkh_value(h, k) = 10;\n+\tk = kh_get(32, h, 10);\n+\tis_missing = (k == kh_end(h));\n+\tk = kh_get(32, h, 5);\n+\tkh_del(32, h, k);\n+\tfor (k = kh_begin(h); k != kh_end(h); ++k)\n+\t\tif (kh_exist(h, k)) kh_value(h, k) = 1;\n+\tkh_destroy(32, h);\n+\treturn 0;\n+}\n+*/\n+\n+/*\n+ 2008-09-19 (0.2.3):\n+\n+\t* Corrected the example\n+\t* Improved interfaces\n+\n+ 2008-09-11 (0.2.2):\n+\n+\t* Improved speed a little in kh_put()\n+\n+ 2008-09-10 (0.2.1):\n+\n+\t* Added kh_clear()\n+\t* Fixed a compiling error\n+\n+ 2008-09-02 
(0.2.0):\n+\n+\t* Changed to token concatenation which increases flexibility.\n+\n+ 2008-08-31 (0.1.2):\n+\n+\t* Fixed a bug in kh_get(), which has not been tested previously.\n+\n+ 2008-08-31 (0.1.1):\n+\n+\t* Added destructor\n+*/\n+\n+\n+#ifndef __AC_KHASH_H\n+#define __AC_KHASH_H\n+\n+/*!\n+ @header\n+\n+ Generic hash table library.\n+\n+ @copyright Heng Li\n+ */\n+\n+#define AC_VERSION_KHASH_H "0.2.2"\n+\n+#include <stdint.h>\n+#include <stdlib.h>\n+#include <string.h>\n+\n+typedef uint32_t khint_t;\n+typedef khint_t khiter_t;\n+\n+#define __ac_HASH_PRIME_SIZE 32\n+static const uint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] =\n+{\n+ 0ul, 3ul, 11ul, 23ul, 53ul,\n+ 97ul, 193ul, 389ul, 769ul, 1543ul,\n+ 3079ul, 6151ul, 12289ul, 24593ul, 49157ul,\n+ 98317ul, 196613ul, 393241ul, 786433ul, 1572869ul,\n+ 3145739ul, 6291469ul, 12582917ul, 25165843ul, 50331653ul,\n+ 100663319ul, 201326611ul, 402653189ul, 805306457ul, 1610612741ul,\n+ 3221225473ul, 4294967291ul\n+};\n+\n+#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2)\n+#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1)\n+#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3)\n+#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1)))\n+#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1)))\n+#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1)))\n+#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1))\n+\n+static const double __ac_HASH_UPPER = 0.77;\n+\n+#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \\\n+\ttypedef struct {\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\tkhint_t n_buckets, size, n_occupied, upper_bound;\t\t\t\t\\\n+\t\tuint32_t *flags;\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\tkhkey_t *keys;\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\tkhval_t *vals;\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t} kh_##name##_t;\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\tstatic inline kh_##name##_t *kh_init_##name() 
{\t\t\t\t\t\t\\\n+\t\treturn (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t));\t\t\\\n+\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\tstatic inlin'..b'e, h, k) kh_get_##name(h, k)\n+\n+/*! @function\n+ @abstract Remove a key from the hash table.\n+ @param name Name of the hash table [symbol]\n+ @param h Pointer to the hash table [khash_t(name)*]\n+ @param k Iterator to the element to be deleted [khint_t]\n+ */\n+#define kh_del(name, h, k) kh_del_##name(h, k)\n+\n+\n+/*! @function\n+ @abstract Test whether a bucket contains data.\n+ @param h Pointer to the hash table [khash_t(name)*]\n+ @param x Iterator to the bucket [khint_t]\n+ @return 1 if containing data; 0 otherwise [int]\n+ */\n+#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x)))\n+\n+/*! @function\n+ @abstract Get key given an iterator\n+ @param h Pointer to the hash table [khash_t(name)*]\n+ @param x Iterator to the bucket [khint_t]\n+ @return Key [type of keys]\n+ */\n+#define kh_key(h, x) ((h)->keys[x])\n+\n+/*! @function\n+ @abstract Get value given an iterator\n+ @param h Pointer to the hash table [khash_t(name)*]\n+ @param x Iterator to the bucket [khint_t]\n+ @return Value [type of values]\n+ @discussion For hash sets, calling this results in segfault.\n+ */\n+#define kh_val(h, x) ((h)->vals[x])\n+\n+/*! @function\n+ @abstract Alias of kh_val()\n+ */\n+#define kh_value(h, x) ((h)->vals[x])\n+\n+/*! @function\n+ @abstract Get the start iterator\n+ @param h Pointer to the hash table [khash_t(name)*]\n+ @return The start iterator [khint_t]\n+ */\n+#define kh_begin(h) (khint_t)(0)\n+\n+/*! @function\n+ @abstract Get the end iterator\n+ @param h Pointer to the hash table [khash_t(name)*]\n+ @return The end iterator [khint_t]\n+ */\n+#define kh_end(h) ((h)->n_buckets)\n+\n+/*! @function\n+ @abstract Get the number of elements in the hash table\n+ @param h Pointer to the hash table [khash_t(name)*]\n+ @return Number of elements in the hash table [khint_t]\n+ */\n+#define kh_size(h) ((h)->size)\n+\n+/*! 
@function\n+ @abstract Get the number of buckets in the hash table\n+ @param h Pointer to the hash table [khash_t(name)*]\n+ @return Number of buckets in the hash table [khint_t]\n+ */\n+#define kh_n_buckets(h) ((h)->n_buckets)\n+\n+/* More conenient interfaces */\n+\n+/*! @function\n+ @abstract Instantiate a hash set containing integer keys\n+ @param name Name of the hash table [symbol]\n+ */\n+#define KHASH_SET_INIT_INT(name)\t\t\t\t\t\t\t\t\t\t\\\n+\tKHASH_INIT(name, uint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)\n+\n+/*! @function\n+ @abstract Instantiate a hash map containing integer keys\n+ @param name Name of the hash table [symbol]\n+ @param khval_t Type of values [type]\n+ */\n+#define KHASH_MAP_INIT_INT(name, khval_t)\t\t\t\t\t\t\t\t\\\n+\tKHASH_INIT(name, uint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)\n+\n+/*! @function\n+ @abstract Instantiate a hash map containing 64-bit integer keys\n+ @param name Name of the hash table [symbol]\n+ */\n+#define KHASH_SET_INIT_INT64(name)\t\t\t\t\t\t\t\t\t\t\\\n+\tKHASH_INIT(name, uint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)\n+\n+/*! @function\n+ @abstract Instantiate a hash map containing 64-bit integer keys\n+ @param name Name of the hash table [symbol]\n+ @param khval_t Type of values [type]\n+ */\n+#define KHASH_MAP_INIT_INT64(name, khval_t)\t\t\t\t\t\t\t\t\\\n+\tKHASH_INIT(name, uint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)\n+\n+typedef const char *kh_cstr_t;\n+/*! @function\n+ @abstract Instantiate a hash map containing const char* keys\n+ @param name Name of the hash table [symbol]\n+ */\n+#define KHASH_SET_INIT_STR(name)\t\t\t\t\t\t\t\t\t\t\\\n+\tKHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)\n+\n+/*! 
@function\n+ @abstract Instantiate a hash map containing const char* keys\n+ @param name Name of the hash table [symbol]\n+ @param khval_t Type of values [type]\n+ */\n+#define KHASH_MAP_INIT_STR(name, khval_t)\t\t\t\t\t\t\t\t\\\n+\tKHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)\n+\n+#endif /* __AC_KHASH_H */\n' |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/samtools/klist.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/samtools/klist.h Thu Sep 07 17:55:18 2017 -0400 |
[ |
#ifndef _LH3_KLIST_H
#define _LH3_KLIST_H

#include <stdlib.h>

/*
 * KMEMPOOL_INIT(name, kmptype_t, kmpfree_f)
 *
 * Instantiates a free-list memory pool of kmptype_t objects:
 *   kmp_<name>_t        pool type; cnt = objects currently handed out,
 *                       n = objects parked in buf, max = capacity of buf
 *   kmp_init_<name>     allocate an empty, zero-filled pool
 *   kmp_alloc_<name>    hand out a parked object if there is one, else a
 *                       new calloc()ed one (parked objects are returned
 *                       as they were left, not re-zeroed)
 *   kmp_free_<name>     park an object in the pool for later reuse
 *   kmp_destroy_<name>  call kmpfree_f() on, then free(), every parked
 *                       object, and free the pool itself
 *
 * Allocation results are cast explicitly so that the generated code also
 * compiles when the header is instantiated from C++. Allocation failures
 * are not checked.
 */
#define KMEMPOOL_INIT(name, kmptype_t, kmpfree_f) \
	typedef struct { \
		size_t cnt, n, max; \
		kmptype_t **buf; \
	} kmp_##name##_t; \
	static inline kmp_##name##_t *kmp_init_##name(void) { \
		return (kmp_##name##_t*)calloc(1, sizeof(kmp_##name##_t)); \
	} \
	static inline void kmp_destroy_##name(kmp_##name##_t *mp) { \
		size_t k; \
		for (k = 0; k < mp->n; ++k) { \
			kmpfree_f(mp->buf[k]); free(mp->buf[k]); \
		} \
		free(mp->buf); free(mp); \
	} \
	static inline kmptype_t *kmp_alloc_##name(kmp_##name##_t *mp) { \
		++mp->cnt; \
		if (mp->n == 0) return (kmptype_t*)calloc(1, sizeof(kmptype_t)); \
		return mp->buf[--mp->n]; \
	} \
	static inline void kmp_free_##name(kmp_##name##_t *mp, kmptype_t *p) { \
		--mp->cnt; \
		if (mp->n == mp->max) { \
			mp->max = mp->max? mp->max<<1 : 16; \
			mp->buf = (kmptype_t**)realloc(mp->buf, sizeof(kmptype_t*) * mp->max); \
		} \
		mp->buf[mp->n++] = p; \
	}

#define kmempool_t(name) kmp_##name##_t
#define kmp_init(name) kmp_init_##name()
#define kmp_destroy(name, mp) kmp_destroy_##name(mp)
#define kmp_alloc(name, mp) kmp_alloc_##name(mp)
#define kmp_free(name, mp, p) kmp_free_##name(mp, p)

/*
 * KLIST_INIT(name, kltype_t, kmpfree_t)
 *
 * Instantiates a singly linked FIFO of kltype_t values whose nodes come
 * from a KMEMPOOL of the same name. The list always ends in an empty
 * sentinel node (tail), so head == tail means the list is empty.
 *   kl_init_<name>     create an empty list (just the sentinel)
 *   kl_pushp_<name>    append: returns a pointer to the data slot of the
 *                      old sentinel for the caller to fill in, and links a
 *                      fresh sentinel behind it
 *   kl_shift_<name>    remove the head; copies its data to *d when d is
 *                      non-NULL; returns 0, or -1 if the list is empty
 *   kl_destroy_<name>  return every node to the pool and destroy the pool
 * kmpfree_t is invoked by the pool on each node (kl1_<name>*), not on the
 * payload alone.
 */
#define KLIST_INIT(name, kltype_t, kmpfree_t) \
	struct __kl1_##name { \
		kltype_t data; \
		struct __kl1_##name *next; \
	}; \
	typedef struct __kl1_##name kl1_##name; \
	KMEMPOOL_INIT(name, kl1_##name, kmpfree_t) \
	typedef struct { \
		kl1_##name *head, *tail; \
		kmp_##name##_t *mp; \
		size_t size; \
	} kl_##name##_t; \
	static inline kl_##name##_t *kl_init_##name(void) { \
		kl_##name##_t *kl = (kl_##name##_t*)calloc(1, sizeof(kl_##name##_t)); \
		kl->mp = kmp_init(name); \
		kl->head = kl->tail = kmp_alloc(name, kl->mp); \
		kl->head->next = 0; \
		return kl; \
	} \
	static inline void kl_destroy_##name(kl_##name##_t *kl) { \
		kl1_##name *p; \
		for (p = kl->head; p != kl->tail; p = p->next) \
			kmp_free(name, kl->mp, p); \
		kmp_free(name, kl->mp, p); \
		kmp_destroy(name, kl->mp); \
		free(kl); \
	} \
	static inline kltype_t *kl_pushp_##name(kl_##name##_t *kl) { \
		kl1_##name *q, *p = kmp_alloc(name, kl->mp); \
		q = kl->tail; p->next = 0; kl->tail->next = p; kl->tail = p; \
		++kl->size; \
		return &q->data; \
	} \
	static inline int kl_shift_##name(kl_##name##_t *kl, kltype_t *d) { \
		kl1_##name *p; \
		if (kl->head->next == 0) return -1; \
		--kl->size; \
		p = kl->head; kl->head = kl->head->next; \
		if (d) *d = p->data; \
		kmp_free(name, kl->mp, p); \
		return 0; \
	}

/* Iteration: for (it = kl_begin(kl); it != kl_end(kl); it = kl_next(it)) use kl_val(it); */
#define kliter_t(name) kl1_##name
#define klist_t(name) kl_##name##_t
#define kl_val(iter) ((iter)->data)
#define kl_next(iter) ((iter)->next)
#define kl_begin(kl) ((kl)->head)
#define kl_end(kl) ((kl)->tail)

#define kl_init(name) kl_init_##name()
#define kl_destroy(name, kl) kl_destroy_##name(kl)
#define kl_pushp(name, kl) kl_pushp_##name(kl)
#define kl_shift(name, kl, d) kl_shift_##name(kl, d)

#endif
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/samtools/knetfile.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/samtools/knetfile.c Thu Sep 07 17:55:18 2017 -0400 |
[ |
b'@@ -0,0 +1,630 @@\n+/* The MIT License\n+\n+ Copyright (c) 2008 Genome Research Ltd (GRL).\n+\n+ Permission is hereby granted, free of charge, to any person obtaining\n+ a copy of this software and associated documentation files (the\n+ "Software"), to deal in the Software without restriction, including\n+ without limitation the rights to use, copy, modify, merge, publish,\n+ distribute, sublicense, and/or sell copies of the Software, and to\n+ permit persons to whom the Software is furnished to do so, subject to\n+ the following conditions:\n+\n+ The above copyright notice and this permission notice shall be\n+ included in all copies or substantial portions of the Software.\n+\n+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,\n+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND\n+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS\n+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN\n+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN\n+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n+ SOFTWARE.\n+*/\n+\n+/* Contact: Heng Li <lh3@sanger.ac.uk> */\n+\n+/* Probably I will not do socket programming in the next few years and\n+ therefore I decide to heavily annotate this file, for Linux and\n+ Windows as well. -lh3 */\n+\n+#include <time.h>\n+#include <stdio.h>\n+#include <ctype.h>\n+#include <stdlib.h>\n+#include <string.h>\n+#include <errno.h>\n+#include <unistd.h>\n+#include <sys/types.h>\n+\n+#ifndef _WIN32\n+#include <netdb.h>\n+#include <arpa/inet.h>\n+#include <sys/socket.h>\n+#endif\n+\n+#include "knetfile.h"\n+\n+/* In winsock.h, the type of a socket is SOCKET, which is: "typedef\n+ * u_int SOCKET". An invalid SOCKET is: "(SOCKET)(~0)", or signed\n+ * integer -1. In knetfile.c, I use "int" for socket type\n+ * throughout. 
This should be improved to avoid confusion.\n+ *\n+ * In Linux/Mac, recv() and read() do almost the same thing. You can see\n+ * in the header file that netread() is simply an alias of read(). In\n+ * Windows, however, they are different and using recv() is mandatory.\n+ */\n+\n+/* This function tests if the file handler is ready for reading (or\n+ * writing if is_read==0). */\n+static int socket_wait(int fd, int is_read)\n+{\n+\tfd_set fds, *fdr = 0, *fdw = 0;\n+\tstruct timeval tv;\n+\tint ret;\n+\ttv.tv_sec = 5; tv.tv_usec = 0; // 5 seconds time out\n+\tFD_ZERO(&fds);\n+\tFD_SET(fd, &fds);\n+\tif (is_read) fdr = &fds;\n+\telse fdw = &fds;\n+\tret = select(fd+1, fdr, fdw, 0, &tv);\n+#ifndef _WIN32\n+\tif (ret == -1) perror("select");\n+#else\n+\tif (ret == 0)\n+\t\tfprintf(stderr, "select time-out\\n");\n+\telse if (ret == SOCKET_ERROR)\n+\t\tfprintf(stderr, "select: %d\\n", WSAGetLastError());\n+#endif\n+\treturn ret;\n+}\n+\n+#ifndef _WIN32\n+/* This function does not work with Windows due to the lack of\n+ * getaddrinfo() in winsock. It is addapted from an example in "Beej\'s\n+ * Guide to Network Programming" (http://beej.us/guide/bgnet/). */\n+static int socket_connect(const char *host, const char *port)\n+{\n+#define __err_connect(func) do { perror(func); freeaddrinfo(res); return -1; } while (0)\n+\n+\tint on = 1, fd;\n+\tstruct linger lng = { 0, 0 };\n+\tstruct addrinfo hints, *res;\n+\tmemset(&hints, 0, sizeof(struct addrinfo));\n+\thints.ai_family = AF_UNSPEC;\n+\thints.ai_socktype = SOCK_STREAM;\n+\t/* In Unix/Mac, getaddrinfo() is the most convenient way to get\n+\t * server information. */\n+\tif (getaddrinfo(host, port, &hints, &res) != 0) __err_connect("getaddrinfo");\n+\tif ((fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) __err_connect("socket");\n+\t/* The following two setsockopt() are used by ftplib\n+\t * (http://nbpfaus.net/~pfau/ftplib/). I am not sure if they\n+\t * necessary. 
*/\n+\tif (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) __err_connect("setsockopt");\n+\tif (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) __err_connect("setsockopt");\n+\tif (connect('..b'fp->fd == -1) {\n+\t\tknet_close(fp);\n+\t\treturn 0;\n+\t}\n+\treturn fp;\n+}\n+\n+knetFile *knet_dopen(int fd, const char *mode)\n+{\n+\tknetFile *fp = (knetFile*)calloc(1, sizeof(knetFile));\n+\tfp->type = KNF_TYPE_LOCAL;\n+\tfp->fd = fd;\n+\treturn fp;\n+}\n+\n+off_t knet_read(knetFile *fp, void *buf, off_t len)\n+{\n+\toff_t l = 0;\n+\tif (fp->fd == -1) return 0;\n+\tif (fp->type == KNF_TYPE_FTP) {\n+\t\tif (fp->is_ready == 0) {\n+\t\t\tif (!fp->no_reconnect) kftp_reconnect(fp);\n+\t\t\tkftp_connect_file(fp);\n+\t\t}\n+\t} else if (fp->type == KNF_TYPE_HTTP) {\n+\t\tif (fp->is_ready == 0)\n+\t\t\tkhttp_connect_file(fp);\n+\t}\n+\tif (fp->type == KNF_TYPE_LOCAL) { // on Windows, the following block is necessary; not on UNIX\n+\t\toff_t rest = len, curr;\n+\t\twhile (rest) {\n+\t\t\tcurr = read(fp->fd, buf + l, rest);\n+\t\t\tif (curr == 0) break;\n+\t\t\tl += curr; rest -= curr;\n+\t\t}\n+\t} else l = my_netread(fp->fd, buf, len);\n+\tfp->offset += l;\n+\treturn l;\n+}\n+\n+off_t knet_seek(knetFile *fp, int64_t off, int whence)\n+{\n+\tif (whence == SEEK_SET && off == fp->offset) return 0;\n+\tif (fp->type == KNF_TYPE_LOCAL) {\n+\t\t/* Be aware that lseek() returns the offset after seeking,\n+\t\t * while fseek() returns zero on success. 
*/\n+\t\toff_t offset = lseek(fp->fd, off, whence);\n+\t\tif (offset == -1) {\n+ // Be silent, it is OK for knet_seek to fail when the file is streamed\n+ // fprintf(stderr,"[knet_seek] %s\\n", strerror(errno));\n+\t\t\treturn -1;\n+\t\t}\n+\t\tfp->offset = offset;\n+\t\treturn 0;\n+\t}\n+ else if (fp->type == KNF_TYPE_FTP) \n+ {\n+ if (whence==SEEK_CUR)\n+ fp->offset += off;\n+ else if (whence==SEEK_SET)\n+ fp->offset = off;\n+ else if ( whence==SEEK_END)\n+ fp->offset = fp->file_size+off;\n+\t\tfp->is_ready = 0;\n+\t\treturn 0;\n+\t} \n+ else if (fp->type == KNF_TYPE_HTTP) \n+ {\n+\t\tif (whence == SEEK_END) { // FIXME: can we allow SEEK_END in future?\n+\t\t\tfprintf(stderr, "[knet_seek] SEEK_END is not supported for HTTP. Offset is unchanged.\\n");\n+\t\t\terrno = ESPIPE;\n+\t\t\treturn -1;\n+\t\t}\n+ if (whence==SEEK_CUR)\n+ fp->offset += off;\n+ else if (whence==SEEK_SET)\n+ fp->offset = off;\n+\t\tfp->is_ready = 0;\n+\t\treturn 0;\n+\t}\n+\terrno = EINVAL;\n+ fprintf(stderr,"[knet_seek] %s\\n", strerror(errno));\n+\treturn -1;\n+}\n+\n+int knet_close(knetFile *fp)\n+{\n+\tif (fp == 0) return 0;\n+\tif (fp->ctrl_fd != -1) netclose(fp->ctrl_fd); // FTP specific\n+\tif (fp->fd != -1) {\n+\t\t/* On Linux/Mac, netclose() is an alias of close(), but on\n+\t\t * Windows, it is an alias of closesocket(). 
*/\n+\t\tif (fp->type == KNF_TYPE_LOCAL) close(fp->fd);\n+\t\telse netclose(fp->fd);\n+\t}\n+\tfree(fp->host); free(fp->port);\n+\tfree(fp->response); free(fp->retr); // FTP specific\n+\tfree(fp->path); free(fp->http_host); // HTTP specific\n+\tfree(fp);\n+\treturn 0;\n+}\n+\n+#ifdef KNETFILE_MAIN\n+int main(void)\n+{\n+\tchar *buf;\n+\tknetFile *fp;\n+\tint type = 4, l;\n+#ifdef _WIN32\n+\tknet_win32_init();\n+#endif\n+\tbuf = calloc(0x100000, 1);\n+\tif (type == 0) {\n+\t\tfp = knet_open("knetfile.c", "r");\n+\t\tknet_seek(fp, 1000, SEEK_SET);\n+\t} else if (type == 1) { // NCBI FTP, large file\n+\t\tfp = knet_open("ftp://ftp.ncbi.nih.gov/1000genomes/ftp/data/NA12878/alignment/NA12878.chrom6.SLX.SRP000032.2009_06.bam", "r");\n+\t\tknet_seek(fp, 2500000000ll, SEEK_SET);\n+\t\tl = knet_read(fp, buf, 255);\n+\t} else if (type == 2) {\n+\t\tfp = knet_open("ftp://ftp.sanger.ac.uk/pub4/treefam/tmp/index.shtml", "r");\n+\t\tknet_seek(fp, 1000, SEEK_SET);\n+\t} else if (type == 3) {\n+\t\tfp = knet_open("http://www.sanger.ac.uk/Users/lh3/index.shtml", "r");\n+\t\tknet_seek(fp, 1000, SEEK_SET);\n+\t} else if (type == 4) {\n+\t\tfp = knet_open("http://www.sanger.ac.uk/Users/lh3/ex1.bam", "r");\n+\t\tknet_read(fp, buf, 10000);\n+\t\tknet_seek(fp, 20000, SEEK_SET);\n+\t\tknet_seek(fp, 10000, SEEK_SET);\n+\t\tl = knet_read(fp, buf+10000, 10000000) + 10000;\n+\t}\n+\tif (type != 4 && type != 1) {\n+\t\tknet_read(fp, buf, 255);\n+\t\tbuf[255] = 0;\n+\t\tprintf("%s\\n", buf);\n+\t} else write(fileno(stdout), buf, l);\n+\tknet_close(fp);\n+\tfree(buf);\n+\treturn 0;\n+}\n+#endif\n' |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/samtools/knetfile.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/samtools/knetfile.h Thu Sep 07 17:55:18 2017 -0400 |
[ |
@@ -0,0 +1,75 @@ +#ifndef KNETFILE_H +#define KNETFILE_H + +#include <stdint.h> +#include <fcntl.h> + +#ifndef _WIN32 +#define netread(fd, ptr, len) read(fd, ptr, len) +#define netwrite(fd, ptr, len) write(fd, ptr, len) +#define netclose(fd) close(fd) +#else +#include <winsock2.h> +#define netread(fd, ptr, len) recv(fd, ptr, len, 0) +#define netwrite(fd, ptr, len) send(fd, ptr, len, 0) +#define netclose(fd) closesocket(fd) +#endif + +// FIXME: currently I/O is unbuffered + +#define KNF_TYPE_LOCAL 1 +#define KNF_TYPE_FTP 2 +#define KNF_TYPE_HTTP 3 + +typedef struct knetFile_s { + int type, fd; + int64_t offset; + char *host, *port; + + // the following are for FTP only + int ctrl_fd, pasv_ip[4], pasv_port, max_response, no_reconnect, is_ready; + char *response, *retr, *size_cmd; + int64_t seek_offset; // for lazy seek + int64_t file_size; + + // the following are for HTTP only + char *path, *http_host; +} knetFile; + +#define knet_tell(fp) ((fp)->offset) +#define knet_fileno(fp) ((fp)->fd) + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _WIN32 + int knet_win32_init(); + void knet_win32_destroy(); +#endif + + knetFile *knet_open(const char *fn, const char *mode); + + /* + This only works with local files. + */ + knetFile *knet_dopen(int fd, const char *mode); + + /* + If ->is_ready==0, this routine updates ->fd; otherwise, it simply + reads from ->fd. + */ + off_t knet_read(knetFile *fp, void *buf, off_t len); + + /* + This routine only sets ->offset and ->is_ready=0. It does not + communicate with the FTP server. + */ + off_t knet_seek(knetFile *fp, int64_t off, int whence); + int knet_close(knetFile *fp); + +#ifdef __cplusplus +} +#endif + +#endif |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/samtools/kseq.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/samtools/kseq.h Thu Sep 07 17:55:18 2017 -0400 |
[ |
b'@@ -0,0 +1,227 @@\n+/* The MIT License\n+\n+ Copyright (c) 2008 Genome Research Ltd (GRL).\n+\n+ Permission is hereby granted, free of charge, to any person obtaining\n+ a copy of this software and associated documentation files (the\n+ "Software"), to deal in the Software without restriction, including\n+ without limitation the rights to use, copy, modify, merge, publish,\n+ distribute, sublicense, and/or sell copies of the Software, and to\n+ permit persons to whom the Software is furnished to do so, subject to\n+ the following conditions:\n+\n+ The above copyright notice and this permission notice shall be\n+ included in all copies or substantial portions of the Software.\n+\n+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,\n+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND\n+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS\n+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN\n+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN\n+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n+ SOFTWARE.\n+*/\n+\n+/* Contact: Heng Li <lh3@sanger.ac.uk> */\n+\n+/*\n+ 2009-07-16 (lh3): in kstream_t, change "char*" to "unsigned char*"\n+ */\n+\n+/* Last Modified: 12APR2009 */\n+\n+#ifndef AC_KSEQ_H\n+#define AC_KSEQ_H\n+\n+#include <ctype.h>\n+#include <string.h>\n+#include <stdlib.h>\n+\n+#define KS_SEP_SPACE 0 // isspace(): \\t, \\n, \\v, \\f, \\r\n+#define KS_SEP_TAB 1 // isspace() && !\' \'\n+#define KS_SEP_MAX 1\n+\n+#define __KS_TYPE(type_t)\t\t\t\t\t\t\\\n+\ttypedef struct __kstream_t {\t\t\t\t\\\n+\t\tunsigned char *buf;\t\t\t\t\t\t\\\n+\t\tint begin, end, is_eof;\t\t\t\t\t\\\n+\t\ttype_t f;\t\t\t\t\t\t\t\t\\\n+\t} kstream_t;\n+\n+#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)\n+#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)\n+\n+#define __KS_BASIC(type_t, 
__bufsize)\t\t\t\t\t\t\t\t\\\n+\tstatic inline kstream_t *ks_init(type_t f)\t\t\t\t\t\t\\\n+\t{\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\tkstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t));\t\\\n+\t\tks->f = f;\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\tks->buf = malloc(__bufsize);\t\t\t\t\t\t\t\t\\\n+\t\treturn ks;\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\tstatic inline void ks_destroy(kstream_t *ks)\t\t\t\t\t\\\n+\t{\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\tif (ks) {\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\tfree(ks->buf);\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\tfree(ks);\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t}\n+\n+#define __KS_GETC(__read, __bufsize)\t\t\t\t\t\t\\\n+\tstatic inline int ks_getc(kstream_t *ks)\t\t\t\t\\\n+\t{\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\tif (ks->is_eof && ks->begin >= ks->end) return -1;\t\\\n+\t\tif (ks->begin >= ks->end) {\t\t\t\t\t\t\t\\\n+\t\t\tks->begin = 0;\t\t\t\t\t\t\t\t\t\\\n+\t\t\tks->end = __read(ks->f, ks->buf, __bufsize);\t\\\n+\t\t\tif (ks->end < __bufsize) ks->is_eof = 1;\t\t\\\n+\t\t\tif (ks->end == 0) return -1;\t\t\t\t\t\\\n+\t\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\treturn (int)ks->buf[ks->begin++];\t\t\t\t\t\\\n+\t}\n+\n+#ifndef KSTRING_T\n+#define KSTRING_T kstring_t\n+typedef struct __kstring_t {\n+\tsize_t l, m;\n+\tchar *s;\n+} kstring_t;\n+#endif\n+\n+#ifndef kroundup32\n+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))\n+#endif\n+\n+#define __KS_GETUNTIL(__read, __bufsize)\t\t\t\t\t\t\t\t\\\n+\tstatic int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \\\n+\t{\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\tif (dret) *dret = 0;\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\tstr->l = 0;\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\tif (ks->begin >= ks->end && ks->is_eof) return -1;\t\t\t\t\\\n+\t\tfor (;;) {\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\tint i;\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\tif (ks->begin >= ks->end) 
{\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\tif (!ks->is_eof) {\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\t\tks->begin = 0;\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\t\tks->end = __read(ks->f, ks->buf, __bufsize);\t\t\\\n+\t\t\t\t\tif (ks->end < __bufsize) ks->is_eof = 1;\t\t\t\\\n+\t\t\t\t\tif (ks->end == 0) break;\t\t\t\t\t\t\t\\\n+\t\t\t\t} else break;\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\tif (delimiter > KS_SEP_MAX) {\t\t\t\t\t\t\t\t\\\n+\t\t\t\tfor (i = ks->begin; i < ks->end; ++i)\t\t\t\t\t\\\n+\t\t\t\t\tif (ks->buf[i] == delimiter) break;\t\t\t\t\t\\\n+\t\t\t} else if (delimiter == KS_SEP_SPACE) {\t\t\t\t\t\t\\\n+\t\t\t\tfor (i = ks->begin; i < ks->end; ++i)\t\t\t\t\t\\\n+\t\t\t\t\tif (isspace(ks->buf[i])) break'..b"\t\\\n+\t\t\t\tstr->s = (char*)realloc(str->s, str->m);\t\t\t\t\\\n+\t\t\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\tmemcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \\\n+\t\t\tstr->l = str->l + (i - ks->begin);\t\t\t\t\t\t\t\\\n+\t\t\tks->begin = i + 1;\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\tif (i < ks->end) {\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\tif (dret) *dret = ks->buf[i];\t\t\t\t\t\t\t\\\n+\t\t\t\tbreak;\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\tif (str->l == 0) {\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\tstr->m = 1;\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\tstr->s = (char*)calloc(1, 1);\t\t\t\t\t\t\t\t\\\n+\t\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\tstr->s[str->l] = '\\0';\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\treturn str->l;\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t}\n+\n+#define KSTREAM_INIT(type_t, __read, __bufsize) \\\n+\t__KS_TYPE(type_t)\t\t\t\t\t\t\t\\\n+\t__KS_BASIC(type_t, __bufsize)\t\t\t\t\\\n+\t__KS_GETC(__read, __bufsize)\t\t\t\t\\\n+\t__KS_GETUNTIL(__read, __bufsize)\n+\n+#define __KSEQ_BASIC(type_t)\t\t\t\t\t\t\t\t\t\t\t\\\n+\tstatic inline kseq_t *kseq_init(type_t fd)\t\t\t\t\t\t\t\\\n+\t{\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\tkseq_t *s = (kseq_t*)calloc(1, 
sizeof(kseq_t));\t\t\t\t\t\\\n+\t\ts->f = ks_init(fd);\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\treturn s;\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\tstatic inline void kseq_rewind(kseq_t *ks)\t\t\t\t\t\t\t\\\n+\t{\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\tks->last_char = 0;\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\tks->f->is_eof = ks->f->begin = ks->f->end = 0;\t\t\t\t\t\\\n+\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\tstatic inline void kseq_destroy(kseq_t *ks)\t\t\t\t\t\t\t\\\n+\t{\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\tif (!ks) return;\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\tfree(ks->name.s); free(ks->comment.s); free(ks->seq.s);\tfree(ks->qual.s); \\\n+\t\tks_destroy(ks->f);\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\tfree(ks);\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t}\n+\n+/* Return value:\n+ >=0 length of the sequence (normal)\n+ -1 end-of-file\n+ -2 truncated quality string\n+ */\n+#define __KSEQ_READ\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\tstatic int kseq_read(kseq_t *seq)\t\t\t\t\t\t\t\t\t\\\n+\t{\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\tint c;\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\tkstream_t *ks = seq->f;\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\tif (seq->last_char == 0) { /* then jump to the next header line */ \\\n+\t\t\twhile ((c = ks_getc(ks)) != -1 && c != '>' && c != '@');\t\\\n+\t\t\tif (c == -1) return -1; /* end of file */\t\t\t\t\t\\\n+\t\t\tseq->last_char = c;\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t} /* the first header char has been read */\t\t\t\t\t\t\\\n+\t\tseq->comment.l = seq->seq.l = seq->qual.l = 0;\t\t\t\t\t\\\n+\t\tif (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1;\t\t\t\\\n+\t\tif (c != '\\n') ks_getuntil(ks, '\\n', &seq->comment, 0);\t\t\t\\\n+\t\twhile ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \\\n+\t\t\tif (isgraph(c)) { /* printable non-space character */\t\t\\\n+\t\t\t\tif (seq->seq.l + 1 >= seq->seq.m) { /* double the memory */ \\\n+\t\t\t\t\tseq->seq.m = seq->seq.l + 2;\t\t\t\t\t\t\\\n+\t\t\t\t\tkroundup32(seq->seq.m); /* rounded to next 
closest 2^k */ \\\n+\t\t\t\t\tseq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \\\n+\t\t\t\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\tseq->seq.s[seq->seq.l++] = (char)c;\t\t\t\t\t\t\\\n+\t\t\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\tif (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */\t\\\n+\t\tseq->seq.s[seq->seq.l] = 0;\t/* null terminated string */\t\t\\\n+\t\tif (c != '+') return seq->seq.l; /* FASTA */\t\t\t\t\t\\\n+\t\tif (seq->qual.m < seq->seq.m) {\t/* allocate enough memory */\t\\\n+\t\t\tseq->qual.m = seq->seq.m;\t\t\t\t\t\t\t\t\t\\\n+\t\t\tseq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m);\t\t\\\n+\t\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\twhile ((c = ks_getc(ks)) != -1 && c != '\\n'); /* skip the rest of '+' line */ \\\n+\t\tif (c == -1) return -2; /* we should not stop here */\t\t\t\\\n+\t\twhile ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l)\t\t\\\n+\t\t\tif (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (unsigned char)c;\t\\\n+\t\tseq->qual.s[seq->qual.l] = 0; /* null terminated string */\t\t\\\n+\t\tseq->last_char = 0;\t/* we have not come to the next header line */ \\\n+\t\tif (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq string */ \\\n+\t\treturn seq->seq.l;\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t}\n+\n+#define __KSEQ_TYPE(type_t)\t\t\t\t\t\t\\\n+\ttypedef struct {\t\t\t\t\t\t\t\\\n+\t\tkstring_t name, comment, seq, qual;\t\t\\\n+\t\tint last_char;\t\t\t\t\t\t\t\\\n+\t\tkstream_t *f;\t\t\t\t\t\t\t\\\n+\t} kseq_t;\n+\n+#define KSEQ_INIT(type_t, __read)\t\t\t\t\\\n+\tKSTREAM_INIT(type_t, __read, 4096)\t\t\t\\\n+\t__KSEQ_TYPE(type_t)\t\t\t\t\t\t\t\\\n+\t__KSEQ_BASIC(type_t)\t\t\t\t\t\t\\\n+\t__KSEQ_READ\n+\n+#endif\n" |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/samtools/ksort.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/samtools/ksort.h Thu Sep 07 17:55:18 2017 -0400 |
[ |
b'@@ -0,0 +1,271 @@\n+/* The MIT License\n+\n+ Copyright (c) 2008 Genome Research Ltd (GRL).\n+\n+ Permission is hereby granted, free of charge, to any person obtaining\n+ a copy of this software and associated documentation files (the\n+ "Software"), to deal in the Software without restriction, including\n+ without limitation the rights to use, copy, modify, merge, publish,\n+ distribute, sublicense, and/or sell copies of the Software, and to\n+ permit persons to whom the Software is furnished to do so, subject to\n+ the following conditions:\n+\n+ The above copyright notice and this permission notice shall be\n+ included in all copies or substantial portions of the Software.\n+\n+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,\n+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND\n+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS\n+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN\n+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN\n+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n+ SOFTWARE.\n+*/\n+\n+/* Contact: Heng Li <lh3@sanger.ac.uk> */\n+\n+/*\n+ 2008-11-16 (0.1.4):\n+\n+ * Fixed a bug in introsort() that happens in rare cases.\n+\n+ 2008-11-05 (0.1.3):\n+\n+ * Fixed a bug in introsort() for complex comparisons.\n+\n+\t* Fixed a bug in mergesort(). The previous version is not stable.\n+\n+ 2008-09-15 (0.1.2):\n+\n+\t* Accelerated introsort. 
On my Mac (not on another Linux machine),\n+\t my implementation is as fast as std::sort on random input.\n+\n+\t* Added combsort and in introsort, switch to combsort if the\n+\t recursion is too deep.\n+\n+ 2008-09-13 (0.1.1):\n+\n+\t* Added k-small algorithm\n+\n+ 2008-09-05 (0.1.0):\n+\n+\t* Initial version\n+\n+*/\n+\n+#ifndef AC_KSORT_H\n+#define AC_KSORT_H\n+\n+#include <stdlib.h>\n+#include <string.h>\n+\n+typedef struct {\n+\tvoid *left, *right;\n+\tint depth;\n+} ks_isort_stack_t;\n+\n+#define KSORT_SWAP(type_t, a, b) { register type_t t=(a); (a)=(b); (b)=t; }\n+\n+#define KSORT_INIT(name, type_t, __sort_lt)\t\t\t\t\t\t\t\t\\\n+\tvoid ks_mergesort_##name(size_t n, type_t array[], type_t temp[])\t\\\n+\t{\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\ttype_t *a2[2], *a, *b;\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\tint curr, shift;\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\ta2[0] = array;\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\ta2[1] = temp? temp : (type_t*)malloc(sizeof(type_t) * n);\t\t\\\n+\t\tfor (curr = 0, shift = 0; (1ul<<shift) < n; ++shift) {\t\t\t\\\n+\t\t\ta = a2[curr]; b = a2[1-curr];\t\t\t\t\t\t\t\t\\\n+\t\t\tif (shift == 0) {\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\ttype_t *p = b, *i, *eb = a + n;\t\t\t\t\t\t\t\\\n+\t\t\t\tfor (i = a; i < eb; i += 2) {\t\t\t\t\t\t\t\\\n+\t\t\t\t\tif (i == eb - 1) *p++ = *i;\t\t\t\t\t\t\t\\\n+\t\t\t\t\telse {\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\t\t\tif (__sort_lt(*(i+1), *i)) {\t\t\t\t\t\\\n+\t\t\t\t\t\t\t*p++ = *(i+1); *p++ = *i;\t\t\t\t\t\\\n+\t\t\t\t\t\t} else {\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\t\t\t\t*p++ = *i; *p++ = *(i+1);\t\t\t\t\t\\\n+\t\t\t\t\t\t}\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\t\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t} else {\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\tsize_t i, step = 1ul<<shift;\t\t\t\t\t\t\t\\\n+\t\t\t\tfor (i = 0; i < n; i += step<<1) {\t\t\t\t\t\t\\\n+\t\t\t\t\ttype_t *p, *j, *k, *ea, *eb;\t\t\t\t\t\t\\\n+\t\t\t\t\tif (n < i + step) 
{\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\t\t\tea = a + n; eb = a;\t\t\t\t\t\t\t\t\\\n+\t\t\t\t\t} else {\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\t\t\tea = a + i + step;\t\t\t\t\t\t\t\t\\\n+\t\t\t\t\t\teb = a + (n < i + (step<<1)? n : i + (step<<1)); \\\n+\t\t\t\t\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\t\tj = a + i; k = a + i + step; p = b + i;\t\t\t\t\\\n+\t\t\t\t\twhile (j < ea && k < eb) {\t\t\t\t\t\t\t\\\n+\t\t\t\t\t\tif (__sort_lt(*k, *j)) *p++ = *k++;\t\t\t\t\\\n+\t\t\t\t\t\telse *p++ = *j++;\t\t\t\t\t\t\t\t\\\n+\t\t\t\t\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\t\twhile (j < ea) *p++ = *j++;\t\t\t\t\t\t\t\\\n+\t\t\t\t\twhile (k < eb) *p++ = *k++;\t\t\t\t\t\t\t\\\n+\t\t\t\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\tcurr = 1 - curr;\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\tif (curr == 1) {\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\ttype_t *p = a2[0], *i = a2[1], *eb = array + n;\t\t\t\t\\\n+\t\t\tfor (; p < eb; ++i) *p++ = *i;\t\t\t\t\t\t\t\t\\\n+\t\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\tif (temp == 0) free(a2[1]);\t\t\t\t\t\t\t\t\t\t\\\n+\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\tvoid ks_heapadjust_##name(size_t i, size_t n, type_t l[])\t\t\t\\\n+\t{\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\tsize_t k ='..b'\t\t\t\t\t\t\t\t\\\n+\tvoid ks_introsort_##name(size_t n, type_t a[])\t\t\t\t\t\t\\\n+\t{\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\tint d;\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\tks_isort_stack_t *top, *stack;\t\t\t\t\t\t\t\t\t\\\n+\t\ttype_t rp, swap_tmp;\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\ttype_t *s, *t, *i, *j, *k;\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\tif (n < 1) return;\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\telse if (n == 2) {\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\tif (__sort_lt(a[1], a[0])) { swap_tmp = a[0]; a[0] = a[1]; a[1] = swap_tmp; } \\\n+\t\t\treturn;\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\tfor (d = 2; 1ul<<d < n; 
++d);\t\t\t\t\t\t\t\t\t\\\n+\t\tstack = (ks_isort_stack_t*)malloc(sizeof(ks_isort_stack_t) * ((sizeof(size_t)*d)+2)); \\\n+\t\ttop = stack; s = a; t = a + (n-1); d <<= 1;\t\t\t\t\t\t\\\n+\t\twhile (1) {\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\tif (s < t) {\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\tif (--d == 0) {\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\t\tks_combsort_##name(t - s + 1, s);\t\t\t\t\t\\\n+\t\t\t\t\tt = s;\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\t\tcontinue;\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\ti = s; j = t; k = i + ((j-i)>>1) + 1;\t\t\t\t\t\\\n+\t\t\t\tif (__sort_lt(*k, *i)) {\t\t\t\t\t\t\t\t\\\n+\t\t\t\t\tif (__sort_lt(*k, *j)) k = j;\t\t\t\t\t\t\\\n+\t\t\t\t} else k = __sort_lt(*j, *i)? i : j;\t\t\t\t\t\\\n+\t\t\t\trp = *k;\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\tif (k != t) { swap_tmp = *k; *k = *t; *t = swap_tmp; }\t\\\n+\t\t\t\tfor (;;) {\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\t\tdo ++i; while (__sort_lt(*i, rp));\t\t\t\t\t\\\n+\t\t\t\t\tdo --j; while (i <= j && __sort_lt(rp, *j));\t\t\\\n+\t\t\t\t\tif (j <= i) break;\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\t\tswap_tmp = *i; *i = *j; *j = swap_tmp;\t\t\t\t\\\n+\t\t\t\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\tswap_tmp = *i; *i = *t; *t = swap_tmp;\t\t\t\t\t\\\n+\t\t\t\tif (i-s > t-i) {\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\t\tif (i-s > 16) { top->left = s; top->right = i-1; top->depth = d; ++top; } \\\n+\t\t\t\t\ts = t-i > 16? i+1 : t;\t\t\t\t\t\t\t\t\\\n+\t\t\t\t} else {\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\t\tif (t-i > 16) { top->left = i+1; top->right = t; top->depth = d; ++top; } \\\n+\t\t\t\t\tt = i-s > 16? 
i-1 : s;\t\t\t\t\t\t\t\t\\\n+\t\t\t\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t} else {\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\tif (top == stack) {\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\t\tfree(stack);\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\t\t__ks_insertsort_##name(a, a+n);\t\t\t\t\t\t\\\n+\t\t\t\t\treturn;\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\t} else { --top; s = (type_t*)top->left; t = (type_t*)top->right; d = top->depth; } \\\n+\t\t\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t/* This function is adapted from: http://ndevilla.free.fr/median/ */ \\\n+\t/* 0 <= kk < n */\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\ttype_t ks_ksmall_##name(size_t n, type_t arr[], size_t kk)\t\t\t\\\n+\t{\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\ttype_t *low, *high, *k, *ll, *hh, *mid;\t\t\t\t\t\t\t\\\n+\t\tlow = arr; high = arr + n - 1; k = arr + kk;\t\t\t\t\t\\\n+\t\tfor (;;) {\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\tif (high <= low) return *k;\t\t\t\t\t\t\t\t\t\\\n+\t\t\tif (high == low + 1) {\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\tif (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \\\n+\t\t\t\treturn *k;\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\tmid = low + (high - low) / 2;\t\t\t\t\t\t\t\t\\\n+\t\t\tif (__sort_lt(*high, *mid)) KSORT_SWAP(type_t, *mid, *high); \\\n+\t\t\tif (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \\\n+\t\t\tif (__sort_lt(*low, *mid)) KSORT_SWAP(type_t, *mid, *low);\t\\\n+\t\t\tKSORT_SWAP(type_t, *mid, *(low+1));\t\t\t\t\t\t\t\\\n+\t\t\tll = low + 1; hh = high;\t\t\t\t\t\t\t\t\t\\\n+\t\t\tfor (;;) {\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\tdo ++ll; while (__sort_lt(*ll, *low));\t\t\t\t\t\\\n+\t\t\t\tdo --hh; while (__sort_lt(*low, *hh));\t\t\t\t\t\\\n+\t\t\t\tif (hh < ll) break;\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\tKSORT_SWAP(type_t, *ll, *hh);\t\t\t\t\t\t\t\\\n+\t\t\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\tKSORT_SWAP(type_t, *low, *hh);\t\t\t\t\t\t\t\t\\\n+\t\t\tif (hh <= 
k) low = ll;\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\tif (hh >= k) high = hh - 1;\t\t\t\t\t\t\t\t\t\\\n+\t\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t}\n+\n+#define ks_mergesort(name, n, a, t) ks_mergesort_##name(n, a, t)\n+#define ks_introsort(name, n, a) ks_introsort_##name(n, a)\n+#define ks_combsort(name, n, a) ks_combsort_##name(n, a)\n+#define ks_heapsort(name, n, a) ks_heapsort_##name(n, a)\n+#define ks_heapmake(name, n, a) ks_heapmake_##name(n, a)\n+#define ks_heapadjust(name, i, n, a) ks_heapadjust_##name(i, n, a)\n+#define ks_ksmall(name, n, a, k) ks_ksmall_##name(n, a, k)\n+\n+#define ks_lt_generic(a, b) ((a) < (b))\n+#define ks_lt_str(a, b) (strcmp((a), (b)) < 0)\n+\n+typedef const char *ksstr_t;\n+\n+#define KSORT_INIT_GENERIC(type_t) KSORT_INIT(type_t, type_t, ks_lt_generic)\n+#define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str)\n+\n+#endif\n' |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/samtools/kstring.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/samtools/kstring.c Thu Sep 07 17:55:18 2017 -0400 |
[ |
@@ -0,0 +1,165 @@ +#include <stdarg.h> +#include <stdio.h> +#include <ctype.h> +#include <string.h> +#include <stdint.h> +#include "kstring.h" + +int ksprintf(kstring_t *s, const char *fmt, ...) +{ + va_list ap; + int l; + va_start(ap, fmt); + l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); // This line does not work with glibc 2.0. See `man snprintf'. + va_end(ap); + if (l + 1 > s->m - s->l) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + va_start(ap, fmt); + l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); + } + va_end(ap); + s->l += l; + return l; +} + +// s MUST BE a null terminated string; l = strlen(s) +int ksplit_core(char *s, int delimiter, int *_max, int **_offsets) +{ + int i, n, max, last_char, last_start, *offsets, l; + n = 0; max = *_max; offsets = *_offsets; + l = strlen(s); + +#define __ksplit_aux do { \ + if (_offsets) { \ + s[i] = 0; \ + if (n == max) { \ + max = max? max<<1 : 2; \ + offsets = (int*)realloc(offsets, sizeof(int) * max); \ + } \ + offsets[n++] = last_start; \ + } else ++n; \ + } while (0) + + for (i = 0, last_char = last_start = 0; i <= l; ++i) { + if (delimiter == 0) { + if (isspace(s[i]) || s[i] == 0) { + if (isgraph(last_char)) __ksplit_aux; // the end of a field + } else { + if (isspace(last_char) || last_char == 0) last_start = i; + } + } else { + if (s[i] == delimiter || s[i] == 0) { + if (last_char != 0 && last_char != delimiter) __ksplit_aux; // the end of a field + } else { + if (last_char == delimiter || last_char == 0) last_start = i; + } + } + last_char = s[i]; + } + *_max = max; *_offsets = offsets; + return n; +} + +/********************** + * Boyer-Moore search * + **********************/ + +// reference: http://www-igm.univ-mlv.fr/~lecroq/string/node14.html +int *ksBM_prep(const uint8_t *pat, int m) +{ + int i, *suff, *prep, *bmGs, *bmBc; + prep = calloc(m + 256, 1); + bmGs = prep; bmBc = prep + m; + { // preBmBc() + for (i = 0; i < 256; ++i) bmBc[i] = m; + for (i = 0; i < m - 1; 
++i) bmBc[pat[i]] = m - i - 1; + } + suff = calloc(m, sizeof(int)); + { // suffixes() + int f = 0, g; + suff[m - 1] = m; + g = m - 1; + for (i = m - 2; i >= 0; --i) { + if (i > g && suff[i + m - 1 - f] < i - g) + suff[i] = suff[i + m - 1 - f]; + else { + if (i < g) g = i; + f = i; + while (g >= 0 && pat[g] == pat[g + m - 1 - f]) --g; + suff[i] = f - g; + } + } + } + { // preBmGs() + int j = 0; + for (i = 0; i < m; ++i) bmGs[i] = m; + for (i = m - 1; i >= 0; --i) + if (suff[i] == i + 1) + for (; j < m - 1 - i; ++j) + if (bmGs[j] == m) + bmGs[j] = m - 1 - i; + for (i = 0; i <= m - 2; ++i) + bmGs[m - 1 - suff[i]] = m - 1 - i; + } + free(suff); + return prep; +} + +int *ksBM_search(const uint8_t *str, int n, const uint8_t *pat, int m, int *_prep, int *n_matches) +{ + int i, j, *prep, *bmGs, *bmBc; + int *matches = 0, mm = 0, nm = 0; + prep = _prep? _prep : ksBM_prep(pat, m); + bmGs = prep; bmBc = prep + m; + j = 0; + while (j <= n - m) { + for (i = m - 1; i >= 0 && pat[i] == str[i+j]; --i); + if (i < 0) { + if (nm == mm) { + mm = mm? mm<<1 : 1; + matches = realloc(matches, mm * sizeof(int)); + } + matches[nm++] = j; + j += bmGs[0]; + } else { + int max = bmBc[str[i+j]] - m + 1 + i; + if (max < bmGs[i]) max = bmGs[i]; + j += max; + } + } + *n_matches = nm; + if (_prep == 0) free(prep); + return matches; +} + +#ifdef KSTRING_MAIN +#include <stdio.h> +int main() +{ + kstring_t *s; + int *fields, n, i; + s = (kstring_t*)calloc(1, sizeof(kstring_t)); + // test ksprintf() + ksprintf(s, " abcdefg: %d ", 100); + printf("'%s'\n", s->s); + // test ksplit() + fields = ksplit(s, 0, &n); + for (i = 0; i < n; ++i) + printf("field[%d] = '%s'\n", i, s->s + fields[i]); + free(s); + + { + static char *str = "abcdefgcdg"; + static char *pat = "cd"; + int n, *matches; + matches = ksBM_search(str, strlen(str), pat, strlen(pat), 0, &n); + printf("%d: \n", n); + for (i = 0; i < n; ++i) + printf("- %d\n", matches[i]); + free(matches); + } + return 0; +} +#endif |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/samtools/kstring.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/samtools/kstring.h Thu Sep 07 17:55:18 2017 -0400 |
[ |
@@ -0,0 +1,102 @@ +#ifndef KSTRING_H +#define KSTRING_H + +#include <stdlib.h> +#include <string.h> +#include <stdint.h> + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +#ifndef KSTRING_T +#define KSTRING_T kstring_t +typedef struct __kstring_t { + size_t l, m; + char *s; +} kstring_t; +#endif + +int ksprintf(kstring_t *s, const char *fmt, ...); +int ksplit_core(char *s, int delimiter, int *_max, int **_offsets); + +// calculate the auxiliary array, allocated by calloc() +int *ksBM_prep(const uint8_t *pat, int m); + +/* Search pat in str and returned the list of matches. The size of the + * list is returned as n_matches. _prep is the array returned by + * ksBM_prep(). If it is a NULL pointer, ksBM_prep() will be called. */ +int *ksBM_search(const uint8_t *str, int n, const uint8_t *pat, int m, int *_prep, int *n_matches); + +static inline int kputsn(const char *p, int l, kstring_t *s) +{ + if (s->l + l + 1 >= s->m) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } + strncpy(s->s + s->l, p, l); + s->l += l; + s->s[s->l] = 0; + return l; +} + +static inline int kputs(const char *p, kstring_t *s) +{ + return kputsn(p, strlen(p), s); +} + +static inline int kputc(int c, kstring_t *s) +{ + if (s->l + 1 >= s->m) { + s->m = s->l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } + s->s[s->l++] = c; + s->s[s->l] = 0; + return c; +} + +static inline int kputw(int c, kstring_t *s) +{ + char buf[16]; + int l, x; + if (c == 0) return kputc('0', s); + for (l = 0, x = c < 0? 
-c : c; x > 0; x /= 10) buf[l++] = x%10 + '0'; + if (c < 0) buf[l++] = '-'; + if (s->l + l + 1 >= s->m) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } + for (x = l - 1; x >= 0; --x) s->s[s->l++] = buf[x]; + s->s[s->l] = 0; + return 0; +} + +static inline int kputuw(unsigned c, kstring_t *s) +{ + char buf[16]; + int l, i; + unsigned x; + if (c == 0) return kputc('0', s); + for (l = 0, x = c; x > 0; x /= 10) buf[l++] = x%10 + '0'; + if (s->l + l + 1 >= s->m) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } + for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i]; + s->s[s->l] = 0; + return 0; +} + +static inline int *ksplit(kstring_t *s, int delimiter, int *n) +{ + int max = 0, *offsets = 0; + *n = ksplit_core(s->s, delimiter, &max, &offsets); + return offsets; +} + +#endif |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/samtools/razf.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/samtools/razf.c Thu Sep 07 17:55:18 2017 -0400 |
[ |
b'@@ -0,0 +1,853 @@\n+/*\n+ * RAZF : Random Access compressed(Z) File\n+ * Version: 1.0\n+ * Release Date: 2008-10-27\n+ *\n+ * Copyright 2008, Jue Ruan <ruanjue@gmail.com>, Heng Li <lh3@sanger.ac.uk>\n+ *\n+ * All rights reserved.\n+ *\n+ * Redistribution and use in source and binary forms, with or without\n+ * modification, are permitted provided that the following conditions\n+ * are met:\n+ * 1. Redistributions of source code must retain the above copyright\n+ * notice, this list of conditions and the following disclaimer.\n+ * 2. Redistributions in binary form must reproduce the above copyright\n+ * notice, this list of conditions and the following disclaimer in the\n+ * documentation and/or other materials provided with the distribution.\n+ *\n+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS\'\' AND\n+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE\n+ * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE\n+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\n+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS\n+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)\n+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT\n+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY\n+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF\n+ * SUCH DAMAGE.\n+ */\n+\n+#ifndef _NO_RAZF\n+\n+#include <fcntl.h>\n+#include <stdio.h>\n+#include <stdlib.h>\n+#include <string.h>\n+#include <unistd.h>\n+#include "razf.h"\n+\n+\n+#if ZLIB_VERNUM < 0x1221\n+struct _gz_header_s {\n+ int text;\n+ uLong time;\n+ int xflags;\n+ int os;\n+ Bytef *extra;\n+ uInt extra_len;\n+ uInt extra_max;\n+ Bytef *name;\n+ uInt name_max;\n+ Bytef *comment;\n+ uInt comm_max;\n+ int hcrc;\n+ int done;\n+};\n+#warning "zlib < 1.2.2.1; RAZF writing is disabled."\n+#endif\n+\n+#define DEF_MEM_LEVEL 8\n+\n+static inline uint32_t byte_swap_4(uint32_t v){\n+\tv = ((v & 0x0000FFFFU) << 16) | (v >> 16);\n+\treturn ((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8);\n+}\n+\n+static inline uint64_t byte_swap_8(uint64_t v){\n+\tv = ((v & 0x00000000FFFFFFFFLLU) << 32) | (v >> 32);\n+\tv = ((v & 0x0000FFFF0000FFFFLLU) << 16) | ((v & 0xFFFF0000FFFF0000LLU) >> 16);\n+\treturn ((v & 0x00FF00FF00FF00FFLLU) << 8) | ((v & 0xFF00FF00FF00FF00LLU) >> 8);\n+}\n+\n+static inline int is_big_endian(){\n+\tint x = 0x01;\n+\tchar *c = (char*)&x;\n+\treturn (c[0] != 0x01);\n+}\n+\n+#ifndef _RZ_READONLY\n+static void add_zindex(RAZF *rz, int64_t in, int64_t out){\n+\tif(rz->index->size == rz->index->cap){\n+\t\trz->index->cap = rz->index->cap * 1.5 + 2;\n+\t\trz->index->cell_offsets = realloc(rz->index->cell_offsets, sizeof(int) * rz->index->cap);\n+\t\trz->index->bin_offsets = realloc(rz->index->bin_offsets, sizeof(int64_t) * 
(rz->index->cap/RZ_BIN_SIZE + 1));\n+\t}\n+\tif(rz->index->size % RZ_BIN_SIZE == 0) rz->index->bin_offsets[rz->index->size / RZ_BIN_SIZE] = out;\n+\trz->index->cell_offsets[rz->index->size] = out - rz->index->bin_offsets[rz->index->size / RZ_BIN_SIZE];\n+\trz->index->size ++;\n+}\n+\n+static void save_zindex(RAZF *rz, int fd){\n+\tint32_t i, v32;\n+\tint is_be;\n+\tis_be = is_big_endian();\n+\tif(is_be) write(fd, &rz->index->size, sizeof(int));\n+\telse {\n+\t\tv32 = byte_swap_4((uint32_t)rz->index->size);\n+\t\twrite(fd, &v32, sizeof(uint32_t));\n+\t}\n+\tv32 = rz->index->size / RZ_BIN_SIZE + 1;\n+\tif(!is_be){\n+\t\tfor(i=0;i<v32;i++) rz->index->bin_offsets[i] = byte_swap_8((uint64_t)rz->index->bin_offsets[i]);\n+\t\tfor(i=0;i<rz->index->size;i++) rz->index->cell_offsets[i] = byte_swap_4((uint32_t)rz->index->cell_offsets[i]);\n+\t}\n+\twrite(fd, rz->index->bin_offsets, sizeof(int64_t) * v32);\n+\twrite(fd, rz->index->cell_offsets, sizeof(int32_t) * rz->index->size);\n+}\n+#endif\n+\n+#ifdef _USE_KNETFILE\n+static void load_zindex(RAZF *rz, knetFile *fp'..b'E_TYPE_PLAIN){\n+\t\trz->buf_off = rz->buf_len = 0;\n+\t\tpos = block_start + block_offset;\n+#ifdef _USE_KNETFILE\n+\t\tknet_seek(rz->x.fpr, pos, SEEK_SET);\n+ pos = knet_tell(rz->x.fpr);\n+#else\n+\t\tpos = lseek(rz->filedes, pos, SEEK_SET);\n+#endif\n+\t\trz->out = rz->in = pos;\n+\t\treturn pos;\n+\t}\n+\tif(block_start == rz->block_pos && block_offset >= rz->block_off) {\n+\t\tblock_offset -= rz->block_off;\n+\t\tgoto SKIP; // Needn\'t reset inflate\n+\t}\n+\tif(block_start == 0) block_start = rz->header_size; // Automaticly revist wrong block_start\n+\t_razf_reset_read(rz, block_start, 0);\n+\tSKIP:\n+\tif(block_offset) razf_skip(rz, block_offset);\n+\treturn rz->block_off;\n+}\n+\n+int64_t razf_seek(RAZF* rz, int64_t pos, int where){\n+\tint64_t idx;\n+\tint64_t seek_pos, new_out;\n+\trz->z_eof = 0;\n+\tif (where == SEEK_CUR) pos += rz->out;\n+\telse if (where == SEEK_END) pos += 
rz->src_end;\n+\tif(rz->file_type == FILE_TYPE_PLAIN){\n+#ifdef _USE_KNETFILE\n+\t\tknet_seek(rz->x.fpr, pos, SEEK_SET);\n+ seek_pos = knet_tell(rz->x.fpr);\n+#else\n+\t\tseek_pos = lseek(rz->filedes, pos, SEEK_SET);\n+#endif\n+\t\trz->buf_off = rz->buf_len = 0;\n+\t\trz->out = rz->in = seek_pos;\n+\t\treturn seek_pos;\n+\t} else if(rz->file_type == FILE_TYPE_GZ){\n+\t\tif(pos >= rz->out) goto SKIP;\n+\t\treturn rz->out;\n+\t}\n+\tif(pos == rz->out) return pos;\n+\tif(pos > rz->src_end) return rz->out;\n+\tif(!rz->seekable || !rz->load_index){\n+\t\tif(pos >= rz->out) goto SKIP;\n+\t}\n+\tidx = pos / RZ_BLOCK_SIZE - 1;\n+\tseek_pos = (idx < 0)? rz->header_size:(rz->index->cell_offsets[idx] + rz->index->bin_offsets[idx / RZ_BIN_SIZE]);\n+\tnew_out = (idx + 1) * RZ_BLOCK_SIZE;\n+\tif(pos > rz->out && new_out <= rz->out) goto SKIP;\n+\t_razf_reset_read(rz, seek_pos, new_out);\n+\tSKIP:\n+\trazf_skip(rz, (int)(pos - rz->out));\n+\treturn rz->out;\n+}\n+\n+uint64_t razf_tell2(RAZF *rz)\n+{\n+\t/*\n+\tif (rz->load_index) {\n+\t\tint64_t idx, seek_pos;\n+\t\tidx = rz->out / RZ_BLOCK_SIZE - 1;\n+\t\tseek_pos = (idx < 0)? 
rz->header_size:(rz->index->cell_offsets[idx] + rz->index->bin_offsets[idx / RZ_BIN_SIZE]);\n+\t\tif (seek_pos != rz->block_pos || rz->out%RZ_BLOCK_SIZE != rz->block_off)\n+\t\t\tfprintf(stderr, "[razf_tell2] inconsistent block offset: (%lld, %lld) != (%lld, %lld)\\n",\n+\t\t\t\t\t(long long)seek_pos, (long long)rz->out%RZ_BLOCK_SIZE, (long long)rz->block_pos, (long long) rz->block_off);\n+\t}\n+\t*/\n+\treturn (uint64_t)rz->block_pos<<16 | (rz->block_off&0xffff);\n+}\n+\n+int64_t razf_seek2(RAZF *rz, uint64_t voffset, int where)\n+{\n+\tif (where != SEEK_SET) return -1;\n+\treturn razf_jump(rz, voffset>>16, voffset&0xffff);\n+}\n+\n+void razf_close(RAZF *rz){\n+\tif(rz->mode == \'w\'){\n+#ifndef _RZ_READONLY\n+\t\trazf_end_flush(rz);\n+\t\tdeflateEnd(rz->stream);\n+#ifdef _USE_KNETFILE\n+\t\tsave_zindex(rz, rz->x.fpw);\n+\t\tif(is_big_endian()){\n+\t\t\twrite(rz->x.fpw, &rz->in, sizeof(int64_t));\n+\t\t\twrite(rz->x.fpw, &rz->out, sizeof(int64_t));\n+\t\t} else {\n+\t\t\tuint64_t v64 = byte_swap_8((uint64_t)rz->in);\n+\t\t\twrite(rz->x.fpw, &v64, sizeof(int64_t));\n+\t\t\tv64 = byte_swap_8((uint64_t)rz->out);\n+\t\t\twrite(rz->x.fpw, &v64, sizeof(int64_t));\n+\t\t}\n+#else\n+\t\tsave_zindex(rz, rz->filedes);\n+\t\tif(is_big_endian()){\n+\t\t\twrite(rz->filedes, &rz->in, sizeof(int64_t));\n+\t\t\twrite(rz->filedes, &rz->out, sizeof(int64_t));\n+\t\t} else {\n+\t\t\tuint64_t v64 = byte_swap_8((uint64_t)rz->in);\n+\t\t\twrite(rz->filedes, &v64, sizeof(int64_t));\n+\t\t\tv64 = byte_swap_8((uint64_t)rz->out);\n+\t\t\twrite(rz->filedes, &v64, sizeof(int64_t));\n+\t\t}\n+#endif\n+#endif\n+\t} else if(rz->mode == \'r\'){\n+\t\tif(rz->stream) inflateEnd(rz->stream);\n+\t}\n+\tif(rz->inbuf) free(rz->inbuf);\n+\tif(rz->outbuf) 
free(rz->outbuf);\n+\tif(rz->header){\n+\t\tfree(rz->header->extra);\n+\t\tfree(rz->header->name);\n+\t\tfree(rz->header->comment);\n+\t\tfree(rz->header);\n+\t}\n+\tif(rz->index){\n+\t\tfree(rz->index->bin_offsets);\n+\t\tfree(rz->index->cell_offsets);\n+\t\tfree(rz->index);\n+\t}\n+\tfree(rz->stream);\n+#ifdef _USE_KNETFILE\n+ if (rz->mode == \'r\')\n+ knet_close(rz->x.fpr);\n+ if (rz->mode == \'w\')\n+ close(rz->x.fpw);\n+#else\n+\tclose(rz->filedes);\n+#endif\n+\tfree(rz);\n+}\n+\n+#endif\n' |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/samtools/razf.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/samtools/razf.h Thu Sep 07 17:55:18 2017 -0400 |
b |
@@ -0,0 +1,134 @@ + /*- + * RAZF : Random Access compressed(Z) File + * Version: 1.0 + * Release Date: 2008-10-27 + * + * Copyright 2008, Jue Ruan <ruanjue@gmail.com>, Heng Li <lh3@sanger.ac.uk> + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + + +#ifndef __RAZF_RJ_H +#define __RAZF_RJ_H + +#include <stdint.h> +#include <stdio.h> +#include "zlib.h" + +#ifdef _USE_KNETFILE +#include "knetfile.h" +#endif + +#if ZLIB_VERNUM < 0x1221 +#define _RZ_READONLY /* zlib < 1.2.2.1: RAZF writing is disabled */ +struct _gz_header_s; +typedef struct _gz_header_s _gz_header; +#define gz_header _gz_header +#endif + +#define WINDOW_BITS 15 + +#ifndef RZ_BLOCK_SIZE +#define RZ_BLOCK_SIZE (1<<WINDOW_BITS) +#endif + +#ifndef RZ_BUFFER_SIZE +#define RZ_BUFFER_SIZE 4096 +#endif + +#ifndef RZ_COMPRESS_LEVEL +#define RZ_COMPRESS_LEVEL 6 +#endif + +#define RZ_BIN_SIZE ((1LLU << 32) / RZ_BLOCK_SIZE) + +typedef struct { + uint32_t *cell_offsets; // indexed by block i; offset relative to bin_offsets[i / RZ_BIN_SIZE] + int64_t *bin_offsets; // indexed by i / RZ_BIN_SIZE + int size; + int cap; +} ZBlockIndex; +/* When storing the index, bytes are written in big-endian order everywhere */ + +#define FILE_TYPE_RZ 1 +#define FILE_TYPE_PLAIN 2 +#define FILE_TYPE_GZ 3 + +typedef struct RandomAccessZFile { + char mode; /* 'w' : write mode; 'r' : read mode */ + int file_type; + /* plain file or rz file; razf_read supports plain files as input too, in which case razf_read works as a buffered fread */ +#ifdef _USE_KNETFILE + union { + knetFile *fpr; + int fpw; + } x; +#else + int filedes; /* the file descriptor */ +#endif + z_stream *stream; + ZBlockIndex *index; + int64_t in, out, end, src_end; + /* in: n bytes total in; out: n bytes total out; */ + /* end: the end of all data blocks, which is also the start of the index; src_end: the true end position in the uncompressed file */ + int buf_flush; // buffer should be flushed; suspend inflate until the buffer is empty + int64_t block_pos, block_off, next_block_pos; + /* block_pos: the start position of the current block in the compressed file */ + /* block_off: how many bytes have been read from the current block */ + void *inbuf, *outbuf; + int header_size; + gz_header *header; + /* header is used to transfer inflate_state->mode from HEAD to TYPE after calling inflateReset */ + int buf_off, buf_len; + int z_err, z_eof; + int seekable; + /* Indicates whether the source 
is seekable */ + int load_index; + /* NOTE(review): the original comment read "set has_index to 0 in mode 'w', then index will be discarded"; no has_index field exists, so it presumably refers to load_index -- confirm */ +} RAZF; + +#ifdef __cplusplus +extern "C" { +#endif + + RAZF* razf_dopen(int data_fd, const char *mode); + RAZF *razf_open(const char *fn, const char *mode); + int razf_write(RAZF* rz, const void *data, int size); + int razf_read(RAZF* rz, void *data, int size); + int64_t razf_seek(RAZF* rz, int64_t pos, int where); + void razf_close(RAZF* rz); + +#define razf_tell(rz) ((rz)->out) + + RAZF* razf_open2(const char *filename, const char *mode); + RAZF* razf_dopen2(int fd, const char *mode); + uint64_t razf_tell2(RAZF *rz); /* returns the virtual offset block_pos<<16 | (block_off & 0xffff) */ + int64_t razf_seek2(RAZF *rz, uint64_t voffset, int where); /* only SEEK_SET is supported; any other `where` returns -1 */ + +#ifdef __cplusplus +} +#endif + +#endif |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/samtools/sam.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/samtools/sam.c Thu Sep 07 17:55:18 2017 -0400 |
[ |
@@ -0,0 +1,175 @@ +#include <string.h> +#include <unistd.h> +#include "faidx.h" +#include "sam.h" + +#define TYPE_BAM 1 +#define TYPE_READ 2 + +bam_header_t *bam_header_dup(const bam_header_t *h0) +{ + bam_header_t *h; + int i; + h = bam_header_init(); + *h = *h0; + h->hash = h->dict = h->rg2lib = 0; + h->text = (char*)calloc(h->l_text + 1, 1); + memcpy(h->text, h0->text, h->l_text); + h->target_len = (uint32_t*)calloc(h->n_targets, 4); + h->target_name = (char**)calloc(h->n_targets, sizeof(void*)); + for (i = 0; i < h->n_targets; ++i) { + h->target_len[i] = h0->target_len[i]; + h->target_name[i] = strdup(h0->target_name[i]); + } + return h; +} +static void append_header_text(bam_header_t *header, char* text, int len) +{ + int x = header->l_text + 1; + int y = header->l_text + len + 1; // 1 byte null + if (text == 0) return; + kroundup32(x); + kroundup32(y); + if (x < y) header->text = (char*)realloc(header->text, y); + strncpy(header->text + header->l_text, text, len); // we cannot use strcpy() here. + header->l_text += len; + header->text[header->l_text] = 0; +} + +samfile_t *samopen(const char *fn, const char *mode, const void *aux) +{ + samfile_t *fp; + fp = (samfile_t*)calloc(1, sizeof(samfile_t)); + if (mode[0] == 'r') { // read + fp->type |= TYPE_READ; + if (mode[1] == 'b') { // binary + fp->type |= TYPE_BAM; + fp->x.bam = strcmp(fn, "-")? 
bam_open(fn, "r") : bam_dopen(fileno(stdin), "r"); + if (fp->x.bam == 0) goto open_err_ret; + fp->header = bam_header_read(fp->x.bam); + } else { // text + fp->x.tamr = sam_open(fn); + if (fp->x.tamr == 0) goto open_err_ret; + fp->header = sam_header_read(fp->x.tamr); + if (fp->header->n_targets == 0) { // no @SQ fields + if (aux) { // check if aux is present + bam_header_t *textheader = fp->header; + fp->header = sam_header_read2((const char*)aux); + if (fp->header == 0) goto open_err_ret; + append_header_text(fp->header, textheader->text, textheader->l_text); + bam_header_destroy(textheader); + } + if (fp->header->n_targets == 0) + fprintf(stderr, "[samopen] no @SQ lines in the header.\n"); + } else fprintf(stderr, "[samopen] SAM header is present: %d sequences.\n", fp->header->n_targets); + } + } else if (mode[0] == 'w') { // write + fp->header = bam_header_dup((const bam_header_t*)aux); + if (mode[1] == 'b') { // binary + char bmode[3]; + bmode[0] = 'w'; bmode[1] = strstr(mode, "u")? 'u' : 0; bmode[2] = 0; + fp->type |= TYPE_BAM; + fp->x.bam = strcmp(fn, "-")? bam_open(fn, bmode) : bam_dopen(fileno(stdout), bmode); + if (fp->x.bam == 0) goto open_err_ret; + bam_header_write(fp->x.bam, fp->header); + } else { // text + // open file + fp->x.tamw = strcmp(fn, "-")? 
fopen(fn, "w") : stdout; + if (fp->x.tamr == 0) goto open_err_ret; + if (strstr(mode, "X")) fp->type |= BAM_OFSTR<<2; + else if (strstr(mode, "x")) fp->type |= BAM_OFHEX<<2; + else fp->type |= BAM_OFDEC<<2; + // write header + if (strstr(mode, "h")) { + int i; + bam_header_t *alt; + // parse the header text + alt = bam_header_init(); + alt->l_text = fp->header->l_text; alt->text = fp->header->text; + sam_header_parse(alt); + alt->l_text = 0; alt->text = 0; + // check if there are @SQ lines in the header + fwrite(fp->header->text, 1, fp->header->l_text, fp->x.tamw); + if (alt->n_targets) { // then write the header text without dumping ->target_{name,len} + if (alt->n_targets != fp->header->n_targets) + fprintf(stderr, "[samopen] inconsistent number of target sequences.\n"); + } else { // then dump ->target_{name,len} + for (i = 0; i < fp->header->n_targets; ++i) + fprintf(fp->x.tamw, "@SQ\tSN:%s\tLN:%d\n", fp->header->target_name[i], fp->header->target_len[i]); + } + bam_header_destroy(alt); + } + } + } + return fp; + +open_err_ret: + free(fp); + return 0; +} + +void samclose(samfile_t *fp) +{ + if (fp == 0) return; + if (fp->header) bam_header_destroy(fp->header); + if (fp->type & TYPE_BAM) bam_close(fp->x.bam); + else if (fp->type & TYPE_READ) sam_close(fp->x.tamr); + else fclose(fp->x.tamw); + free(fp); +} + +int samread(samfile_t *fp, bam1_t *b) +{ + if (fp == 0 || !(fp->type & TYPE_READ)) return -1; // not open for reading + if (fp->type & TYPE_BAM) return bam_read1(fp->x.bam, b); + else return sam_read1(fp->x.tamr, fp->header, b); +} + +int samwrite(samfile_t *fp, const bam1_t *b) +{ + if (fp == 0 || (fp->type & TYPE_READ)) return -1; // not open for writing + if (fp->type & TYPE_BAM) return bam_write1(fp->x.bam, b); + else { + char *s = bam_format1_core(fp->header, b, fp->type>>2&3); + int l = strlen(s); + fputs(s, fp->x.tamw); fputc('\n', fp->x.tamw); + free(s); + return l + 1; + } +} + +int sampileup(samfile_t *fp, int mask, bam_pileup_f func, void 
*func_data) +{ + bam_plbuf_t *buf; + int ret; + bam1_t *b; + b = bam_init1(); + buf = bam_plbuf_init(func, func_data); + bam_plbuf_set_mask(buf, mask); + while ((ret = samread(fp, b)) >= 0) + bam_plbuf_push(b, buf); + bam_plbuf_push(0, buf); + bam_plbuf_destroy(buf); + bam_destroy1(b); + return 0; +} + +char *samfaipath(const char *fn_ref) +{ + char *fn_list = 0; + if (fn_ref == 0) return 0; + fn_list = calloc(strlen(fn_ref) + 5, 1); + strcat(strcpy(fn_list, fn_ref), ".fai"); + if (access(fn_list, R_OK) == -1) { // fn_list is unreadable + if (access(fn_ref, R_OK) == -1) { + fprintf(stderr, "[samfaipath] fail to read file %s.\n", fn_ref); + } else { + fprintf(stderr, "[samfaipath] build FASTA index...\n"); + if (fai_build(fn_ref) == -1) { + fprintf(stderr, "[samfaipath] fail to build FASTA index.\n"); + free(fn_list); fn_list = 0; + } + } + } + return fn_list; +} |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/samtools/sam.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/samtools/sam.h Thu Sep 07 17:55:18 2017 -0400 |
[ |
@@ -0,0 +1,98 @@ +#ifndef BAM_SAM_H +#define BAM_SAM_H + +#include "bam.h" + +/*! + @header + + This file provides higher level of I/O routines and unifies the APIs + for SAM and BAM formats. These APIs are more convenient and + recommended. + + @copyright Genome Research Ltd. + */ + +/*! @typedef + @abstract SAM/BAM file handler + @field type type of the handler; bit 1 for BAM, 2 for reading and bit 3-4 for flag format + @field bam BAM file handler; valid if (type&1) == 1 + @field tamr SAM file handler for reading; valid if (type&3) == 2 + @field tamw SAM file handler for writing; valid if (type&3) == 0 + @field header header struct + */ +typedef struct { + int type; + union { + tamFile tamr; + bamFile bam; + FILE *tamw; + } x; + bam_header_t *header; +} samfile_t; + +#ifdef __cplusplus +extern "C" { +#endif + + /*! + @abstract Open a SAM/BAM file + + @param fn SAM/BAM file name; "-" is recognized as stdin (for + reading) or stdout (for writing). + + @param mode open mode /[rw](b?)(u?)(h?)([xX]?)/: 'r' for reading, + 'w' for writing, 'b' for BAM I/O, 'u' for uncompressed BAM output, + 'h' for outputting header in SAM, 'x' for HEX flag and 'X' for + string flag. If 'b' present, it must immediately follow 'r' or + 'w'. Valid modes are "r", "w", "wh", "wx", "whx", "wX", "whX", + "rb", "wb" and "wbu" exclusively. + + @param aux auxiliary data; if mode[0]=='w', aux points to + bam_header_t; if strcmp(mode, "rb")!=0 and @SQ header lines in SAM + are absent, aux points to the file name of the list of the reference; + aux is not used otherwise. If @SQ header lines are present in SAM, + aux is not used, either. + + @return SAM/BAM file handler; 0 if the file cannot be opened + */ + samfile_t *samopen(const char *fn, const char *mode, const void *aux); + + /*! + @abstract Close a SAM/BAM handler + @param fp file handler to be closed + */ + void samclose(samfile_t *fp); + + /*! 
+ @abstract Read one alignment + @param fp file handler + @param b alignment + @return bytes read; -1 if fp is null or not open for reading + */ + int samread(samfile_t *fp, bam1_t *b); + + /*! + @abstract Write one alignment + @param fp file handler + @param b alignment + @return bytes written; -1 if fp is null or not open for writing + */ + int samwrite(samfile_t *fp, const bam1_t *b); + + /*! + @abstract Get the pileup for a whole alignment file + @param fp file handler + @param mask mask transferred to bam_plbuf_set_mask() + @param func user defined function called in the pileup process + @param data user provided data for func() + @return always 0 + */ + int sampileup(samfile_t *fp, int mask, bam_pileup_f func, void *data); + + /*! + @abstract Get the path of the FASTA index (fn_ref with ".fai" appended), building the index with fai_build() when the index is unreadable but fn_ref is readable + @param fn_ref reference FASTA file name + @return newly allocated path to be freed by the caller; 0 if fn_ref is null or the index cannot be built + */ + char *samfaipath(const char *fn_ref); + +#ifdef __cplusplus +} +#endif + +#endif |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/samtools/sam_header.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/samtools/sam_header.c Thu Sep 07 17:55:18 2017 -0400 |
[ |
b'@@ -0,0 +1,735 @@\n+#include "sam_header.h"\n+#include <stdio.h>\n+#include <string.h>\n+#include <ctype.h>\n+#include <stdlib.h>\n+#include <stdarg.h>\n+\n+#include "khash.h"\n+KHASH_MAP_INIT_STR(str, const char *)\n+\n+struct _HeaderList\n+{\n+ struct _HeaderList *last; // Hack: Used and maintained only by list_append_to_end. Maintained in the root node only.\n+ struct _HeaderList *next;\n+ void *data;\n+};\n+typedef struct _HeaderList list_t;\n+typedef list_t HeaderDict;\n+\n+typedef struct\n+{\n+ char key[2];\n+ char *value;\n+}\n+HeaderTag;\n+\n+typedef struct\n+{\n+ char type[2];\n+ list_t *tags;\n+}\n+HeaderLine;\n+\n+const char *o_hd_tags[] = {"SO","GO",NULL};\n+const char *r_hd_tags[] = {"VN",NULL};\n+\n+const char *o_sq_tags[] = {"AS","M5","UR","SP",NULL};\n+const char *r_sq_tags[] = {"SN","LN",NULL};\n+const char *u_sq_tags[] = {"SN",NULL};\n+\n+const char *o_rg_tags[] = {"LB","DS","PU","PI","CN","DT","PL",NULL};\n+const char *r_rg_tags[] = {"ID",NULL};\n+const char *u_rg_tags[] = {"ID",NULL};\n+\n+const char *o_pg_tags[] = {"VN","CL",NULL};\n+const char *r_pg_tags[] = {"ID",NULL};\n+\n+const char *types[] = {"HD","SQ","RG","PG","CO",NULL};\n+const char **optional_tags[] = {o_hd_tags,o_sq_tags,o_rg_tags,o_pg_tags,NULL,NULL};\n+const char **required_tags[] = {r_hd_tags,r_sq_tags,r_rg_tags,r_pg_tags,NULL,NULL};\n+const char **unique_tags[] = {NULL, u_sq_tags,u_rg_tags,NULL,NULL,NULL};\n+\n+\n+static void debug(const char *format, ...)\n+{\n+ va_list ap;\n+ va_start(ap, format);\n+ vfprintf(stderr, format, ap);\n+ va_end(ap);\n+}\n+\n+#if 0\n+// Replaced by list_append_to_end\n+static list_t *list_prepend(list_t *root, void *data)\n+{\n+ list_t *l = malloc(sizeof(list_t));\n+ l->next = root;\n+ l->data = data;\n+ return l;\n+}\n+#endif\n+\n+// Relies on the root->last being correct. 
Do not use with the other list_*\n+// routines unless they are fixed to modify root->last as well.\n+static list_t *list_append_to_end(list_t *root, void *data)\n+{\n+ list_t *l = malloc(sizeof(list_t));\n+ l->last = l;\n+ l->next = NULL;\n+ l->data = data;\n+\n+ if ( !root )\n+ return l;\n+\n+ root->last->next = l;\n+ root->last = l;\n+ return root;\n+}\n+\n+static list_t *list_append(list_t *root, void *data)\n+{\n+ list_t *l = root;\n+ while (l && l->next)\n+ l = l->next;\n+ if ( l ) \n+ {\n+ l->next = malloc(sizeof(list_t));\n+ l = l->next;\n+ }\n+ else\n+ {\n+ l = malloc(sizeof(list_t));\n+ root = l;\n+ }\n+ l->data = data;\n+ l->next = NULL;\n+ return root;\n+}\n+\n+static void list_free(list_t *root)\n+{\n+ list_t *l = root;\n+ while (root)\n+ {\n+ l = root;\n+ root = root->next;\n+ free(l);\n+ }\n+}\n+\n+\n+\n+// Look for a tag "XY" in a predefined const char *[] array.\n+static int tag_exists(const char *tag, const char **tags)\n+{\n+ int itag=0;\n+ if ( !tags ) return -1;\n+ while ( tags[itag] )\n+ {\n+ if ( tags[itag][0]==tag[0] && tags[itag][1]==tag[1] ) return itag; \n+ itag++;\n+ }\n+ return -1;\n+}\n+\n+\n+\n+// Mimics the behaviour of getline, except it returns pointer to the next chunk of the text\n+// or NULL if everything has been read. The lineptr should be freed by the caller. 
The\n+// newline character is stripped.\n+static const char *nextline(char **lineptr, size_t *n, const char *text)\n+{\n+ int len;\n+ const char *to = text;\n+\n+ if ( !*to ) return NULL;\n+\n+ while ( *to && *to!=\'\\n\' && *to!=\'\\r\' ) to++;\n+ len = to - text + 1;\n+\n+ if ( *to )\n+ {\n+ // Advance the pointer for the next call\n+ if ( *to==\'\\n\' ) to++;\n+ else if ( *to==\'\\r\' && *(to+1)==\'\\n\' ) to+=2;\n+ }\n+ if ( !len )\n+ return to;\n+\n+ if ( !*lineptr ) \n+ {\n+ *lineptr = malloc(len);\n+ *n = len;\n+ }\n+ else if ( *n<len ) \n+ {\n+ *lineptr = realloc(*lineptr, len);\n+ *n = len;\n+ }\n+ if ( !*lineptr ) {\n+\t\tdebug("[nextline] Insufficient memory!\\n");\n+\t\treturn 0;\n+\t}\n+\n+ memcp'..b'e);\n+ else\n+ {\n+\t\t\tif (hline) sam_header_line_free(hline);\n+\t\t\tsam_header_free(hlines);\n+ if ( buf ) free(buf);\n+ return NULL;\n+ }\n+ }\n+ if ( buf ) free(buf);\n+\n+ return hlines;\n+}\n+\n+void *sam_header2tbl(const void *_dict, char type[2], char key_tag[2], char value_tag[2])\n+{\n+\tconst HeaderDict *dict = (const HeaderDict*)_dict;\n+ const list_t *l = dict;\n+ khash_t(str) *tbl = kh_init(str);\n+ khiter_t k;\n+ int ret;\n+\n+\tif (_dict == 0) return tbl; // return an empty (not null) hash table\n+ while (l)\n+ {\n+ HeaderLine *hline = l->data;\n+ if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] ) \n+ {\n+ l = l->next;\n+ continue;\n+ }\n+ \n+ HeaderTag *key, *value;\n+ key = header_line_has_tag(hline,key_tag);\n+ value = header_line_has_tag(hline,value_tag); \n+ if ( !key || !value )\n+ {\n+ l = l->next;\n+ continue;\n+ }\n+ \n+ k = kh_get(str, tbl, key->value);\n+ if ( k != kh_end(tbl) )\n+ debug("[sam_header_lookup_table] They key %s not unique.\\n", key->value);\n+ k = kh_put(str, tbl, key->value, &ret);\n+ kh_value(tbl, k) = value->value;\n+\n+ l = l->next;\n+ }\n+ return tbl;\n+}\n+\n+char **sam_header2list(const void *_dict, char type[2], char key_tag[2], int *_n)\n+{\n+\tconst HeaderDict *dict = (const 
HeaderDict*)_dict;\n+ const list_t *l = dict;\n+ int max, n;\n+\tchar **ret;\n+\n+\tret = 0; *_n = max = n = 0;\n+ while (l)\n+ {\n+ HeaderLine *hline = l->data;\n+ if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] ) \n+ {\n+ l = l->next;\n+ continue;\n+ }\n+ \n+ HeaderTag *key;\n+ key = header_line_has_tag(hline,key_tag);\n+ if ( !key )\n+ {\n+ l = l->next;\n+ continue;\n+ }\n+\n+\t\tif (n == max) {\n+\t\t\tmax = max? max<<1 : 4;\n+\t\t\tret = realloc(ret, max * sizeof(void*));\n+\t\t}\n+\t\tret[n++] = key->value;\n+\n+ l = l->next;\n+ }\n+\t*_n = n;\n+ return ret;\n+}\n+\n+const char *sam_tbl_get(void *h, const char *key)\n+{\n+\tkhash_t(str) *tbl = (khash_t(str)*)h;\n+\tkhint_t k;\n+\tk = kh_get(str, tbl, key);\n+\treturn k == kh_end(tbl)? 0 : kh_val(tbl, k);\n+}\n+\n+int sam_tbl_size(void *h)\n+{\n+\tkhash_t(str) *tbl = (khash_t(str)*)h;\n+\treturn h? kh_size(tbl) : 0;\n+}\n+\n+void sam_tbl_destroy(void *h)\n+{\n+\tkhash_t(str) *tbl = (khash_t(str)*)h;\n+\tkh_destroy(str, tbl);\n+}\n+\n+void *sam_header_merge(int n, const void **_dicts)\n+{\n+\tconst HeaderDict **dicts = (const HeaderDict**)_dicts;\n+ HeaderDict *out_dict;\n+ int idict, status;\n+\n+ if ( n<2 ) return NULL;\n+\n+ out_dict = sam_header_clone(dicts[0]);\n+\n+ for (idict=1; idict<n; idict++)\n+ {\n+ const list_t *tmpl_hlines = dicts[idict];\n+\n+ while ( tmpl_hlines )\n+ {\n+ list_t *out_hlines = out_dict;\n+ int inserted = 0;\n+ while ( out_hlines )\n+ {\n+ status = sam_header_compare_lines(tmpl_hlines->data, out_hlines->data);\n+ if ( status==0 )\n+ {\n+ out_hlines = out_hlines->next;\n+ continue;\n+ }\n+ \n+ if ( status==2 ) \n+ {\n+ print_header_line(stderr,tmpl_hlines->data);\n+ print_header_line(stderr,out_hlines->data);\n+ debug("Conflicting lines, cannot merge the headers.\\n");\n+\t\t\t\t\treturn 0;\n+ }\n+ if ( status==3 )\n+ sam_header_line_merge_with(out_hlines->data, tmpl_hlines->data);\n+\n+ inserted = 1;\n+ break;\n+ }\n+ if ( !inserted )\n+ out_dict = list_append(out_dict, 
sam_header_line_clone(tmpl_hlines->data));\n+\n+ tmpl_hlines = tmpl_hlines->next;\n+ }\n+ }\n+\n+ return out_dict;\n+}\n+\n+\n' |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/samtools/sam_header.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/samtools/sam_header.h Thu Sep 07 17:55:18 2017 -0400 |
[ |
@@ -0,0 +1,24 @@ +#ifndef __SAM_HEADER_H__ +#define __SAM_HEADER_H__ + +#ifdef __cplusplus +extern "C" { +#endif + + /* Header dictionaries are passed around as opaque void pointers. */ + void *sam_header_parse2(const char *headerText); + /* Merge n dictionaries into a new one that starts as a clone of dicts[0]; returns NULL if n<2 or if two lines conflict. */ + void *sam_header_merge(int n, const void **dicts); + void sam_header_free(void *header); + char *sam_header_write(const void *headerDict); // returns a newly allocated string + + /* Collect the key_tag values of every header line of the given type. *_n receives the count; the result is 0 when nothing matches. The array is allocated with realloc(), but its entries point at the dictionary's own strings (they are not copied). */ + char **sam_header2list(const void *_dict, char type[2], char key_tag[2], int *_n); + + /* Build a string hash table mapping the key_tag value to the value_tag value for every line of the given type that has both tags; a null dict yields an empty (not null) table. Keys and values point into the dictionary. */ + void *sam_header2tbl(const void *dict, char type[2], char key_tag[2], char value_tag[2]); + /* Look up key in a table from sam_header2tbl(); returns 0 if the key is absent. */ + const char *sam_tbl_get(void *h, const char *key); + /* Number of entries in the table; 0 if h is null. */ + int sam_tbl_size(void *h); + /* Destroy a table from sam_header2tbl(). */ + void sam_tbl_destroy(void *h); + +#ifdef __cplusplus +} +#endif + +#endif |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/samtools/sam_view.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/samtools/sam_view.c Thu Sep 07 17:55:18 2017 -0400 |
[ |
b'@@ -0,0 +1,255 @@\n+#include <stdlib.h>\n+#include <string.h>\n+#include <stdio.h>\n+#include <unistd.h>\n+#include <math.h>\n+#include "sam_header.h"\n+#include "sam.h"\n+#include "faidx.h"\n+#include "khash.h"\n+KHASH_SET_INIT_STR(rg)\n+\n+typedef khash_t(rg) *rghash_t;\n+\n+rghash_t g_rghash = 0;\n+static int g_min_mapQ = 0, g_flag_on = 0, g_flag_off = 0;\n+static char *g_library, *g_rg;\n+static int g_sol2sanger_tbl[128];\n+\n+static void sol2sanger(bam1_t *b)\n+{\n+\tint l;\n+\tuint8_t *qual = bam1_qual(b);\n+\tif (g_sol2sanger_tbl[30] == 0) {\n+\t\tfor (l = 0; l != 128; ++l) {\n+\t\t\tg_sol2sanger_tbl[l] = (int)(10.0 * log(1.0 + pow(10.0, (l - 64 + 33) / 10.0)) / log(10.0) + .499);\n+\t\t\tif (g_sol2sanger_tbl[l] >= 93) g_sol2sanger_tbl[l] = 93;\n+\t\t}\n+\t}\n+\tfor (l = 0; l < b->core.l_qseq; ++l) {\n+\t\tint q = qual[l];\n+\t\tif (q > 127) q = 127;\n+\t\tqual[l] = g_sol2sanger_tbl[q];\n+\t}\n+}\n+\n+static inline int __g_skip_aln(const bam_header_t *h, const bam1_t *b)\n+{\n+\tif (b->core.qual < g_min_mapQ || ((b->core.flag & g_flag_on) != g_flag_on) || (b->core.flag & g_flag_off))\n+\t\treturn 1;\n+\tif (g_rg || g_rghash) {\n+\t\tuint8_t *s = bam_aux_get(b, "RG");\n+\t\tif (s) {\n+\t\t\tif (g_rg) return (strcmp(g_rg, (char*)(s + 1)) == 0)? 0 : 1;\n+\t\t\tif (g_rghash) {\n+\t\t\t\tkhint_t k = kh_get(rg, g_rghash, (char*)(s + 1));\n+\t\t\t\treturn (k != kh_end(g_rghash))? 0 : 1;\n+\t\t\t}\n+\t\t}\n+\t}\n+\tif (g_library) {\n+\t\tconst char *p = bam_get_library((bam_header_t*)h, b);\n+\t\treturn (p && strcmp(p, g_library) == 0)? 
0 : 1;\n+\t}\n+\treturn 0;\n+}\n+\n+// callback function for bam_fetch()\n+static int view_func(const bam1_t *b, void *data)\n+{\n+\tif (!__g_skip_aln(((samfile_t*)data)->header, b))\n+\t\tsamwrite((samfile_t*)data, b);\n+\treturn 0;\n+}\n+\n+static int usage(int is_long_help);\n+\n+int main_samview(int argc, char *argv[])\n+{\n+\tint c, is_header = 0, is_header_only = 0, is_bamin = 1, ret = 0, is_uncompressed = 0, is_bamout = 0, slx2sngr = 0;\n+\tint of_type = BAM_OFDEC, is_long_help = 0;\n+\tsamfile_t *in = 0, *out = 0;\n+\tchar in_mode[5], out_mode[5], *fn_out = 0, *fn_list = 0, *fn_ref = 0, *fn_rg = 0;\n+\n+\t/* parse command-line options */\n+\tstrcpy(in_mode, "r"); strcpy(out_mode, "w");\n+\twhile ((c = getopt(argc, argv, "Sbt:hHo:q:f:F:ul:r:xX?T:CR:")) >= 0) {\n+\t\tswitch (c) {\n+\t\tcase \'C\': slx2sngr = 1; break;\n+\t\tcase \'S\': is_bamin = 0; break;\n+\t\tcase \'b\': is_bamout = 1; break;\n+\t\tcase \'t\': fn_list = strdup(optarg); is_bamin = 0; break;\n+\t\tcase \'h\': is_header = 1; break;\n+\t\tcase \'H\': is_header_only = 1; break;\n+\t\tcase \'o\': fn_out = strdup(optarg); break;\n+\t\tcase \'f\': g_flag_on = strtol(optarg, 0, 0); break;\n+\t\tcase \'F\': g_flag_off = strtol(optarg, 0, 0); break;\n+\t\tcase \'q\': g_min_mapQ = atoi(optarg); break;\n+\t\tcase \'u\': is_uncompressed = 1; break;\n+\t\tcase \'l\': g_library = strdup(optarg); break;\n+\t\tcase \'r\': g_rg = strdup(optarg); break;\n+\t\tcase \'R\': fn_rg = strdup(optarg); break;\n+\t\tcase \'x\': of_type = BAM_OFHEX; break;\n+\t\tcase \'X\': of_type = BAM_OFSTR; break;\n+\t\tcase \'?\': is_long_help = 1; break;\n+\t\tcase \'T\': fn_ref = strdup(optarg); is_bamin = 0; break;\n+\t\tdefault: return usage(is_long_help);\n+\t\t}\n+\t}\n+\tif (is_uncompressed) is_bamout = 1;\n+\tif (is_header_only) is_header = 1;\n+\tif (is_bamout) strcat(out_mode, "b");\n+\telse {\n+\t\tif (of_type == BAM_OFHEX) strcat(out_mode, "x");\n+\t\telse if (of_type == BAM_OFSTR) strcat(out_mode, "X");\n+\t}\n+\tif 
(is_bamin) strcat(in_mode, "b");\n+\tif (is_header) strcat(out_mode, "h");\n+\tif (is_uncompressed) strcat(out_mode, "u");\n+\tif (argc == optind) return usage(is_long_help); // potential memory leak...\n+\n+\t// read the list of read groups\n+\tif (fn_rg) {\n+\t\tFILE *fp_rg;\n+\t\tchar buf[1024];\n+\t\tint ret;\n+\t\tg_rghash = kh_init(rg);\n+\t\tfp_rg = fopen(fn_rg, "r");\n+\t\twhile (!feof(fp_rg) && fscanf(fp_rg, "%s", buf) > 0) // this is not a good style, but bear me...\n+\t\t\tkh_put(rg, g_rghash, strdup(buf), &ret); // we\'d better check duplicates...\n+\t\tfclose(fp_rg);\n+\t}\n+\n+\t// generate the fn_list if necessary\n+\tif (fn_list == 0 && fn_ref) fn_list = samfaipath(fn_ref);\n+\t// open file handlers\n+\tif ((in = samopen(arg'..b'rr, "[main_samview] fail to get the reference name. Continue anyway.\\n");\n+\t\t\t\tcontinue;\n+\t\t\t}\n+\t\t\tbam_fetch(in->x.bam, idx, tid, beg, end, out, view_func); // fetch alignments\n+\t\t}\n+\t\tbam_index_destroy(idx); // destroy the BAM index\n+\t}\n+\n+view_end:\n+\t// close files, free and return\n+\tfree(fn_list); free(fn_ref); free(fn_out); free(g_library); free(g_rg); free(fn_rg);\n+\tif (g_rghash) {\n+\t\tkhint_t k;\n+\t\tfor (k = 0; k < kh_end(g_rghash); ++k)\n+\t\t\tif (kh_exist(g_rghash, k)) free((char*)kh_key(g_rghash, k));\n+\t\tkh_destroy(rg, g_rghash);\n+\t}\n+\tsamclose(in);\n+\tsamclose(out);\n+\treturn ret;\n+}\n+\n+static int usage(int is_long_help)\n+{\n+\tfprintf(stderr, "\\n");\n+\tfprintf(stderr, "Usage: samtools view [options] <in.bam>|<in.sam> [region1 [...]]\\n\\n");\n+\tfprintf(stderr, "Options: -b output BAM\\n");\n+\tfprintf(stderr, " -h print header for the SAM output\\n");\n+\tfprintf(stderr, " -H print header only (no alignments)\\n");\n+\tfprintf(stderr, " -S input is SAM\\n");\n+\tfprintf(stderr, " -u uncompressed BAM output (force -b)\\n");\n+\tfprintf(stderr, " -x output FLAG in HEX (samtools-C specific)\\n");\n+\tfprintf(stderr, " -X output FLAG in string (samtools-C 
specific)\\n");\n+\tfprintf(stderr, " -t FILE list of reference names and lengths (force -S) [null]\\n");\n+\tfprintf(stderr, " -T FILE reference sequence file (force -S) [null]\\n");\n+\tfprintf(stderr, " -o FILE output file name [stdout]\\n");\n+\tfprintf(stderr, " -R FILE list of read groups to be outputted [null]\\n");\n+\tfprintf(stderr, " -f INT required flag, 0 for unset [0]\\n");\n+\tfprintf(stderr, " -F INT filtering flag, 0 for unset [0]\\n");\n+\tfprintf(stderr, " -q INT minimum mapping quality [0]\\n");\n+\tfprintf(stderr, " -l STR only output reads in library STR [null]\\n");\n+\tfprintf(stderr, " -r STR only output reads in read group STR [null]\\n");\n+\tfprintf(stderr, " -? longer help\\n");\n+\tfprintf(stderr, "\\n");\n+\tif (is_long_help)\n+\t\tfprintf(stderr, "Notes:\\n\\\n+\\n\\\n+ 1. By default, this command assumes the file on the command line is in\\n\\\n+ the BAM format and it prints the alignments in SAM. If `-t\' is\\n\\\n+ applied, the input file is assumed to be in the SAM format. The\\n\\\n+ file supplied with `-t\' is SPACE/TAB delimited with the first two\\n\\\n+ fields of each line consisting of the reference name and the\\n\\\n+ corresponding sequence length. The `.fai\' file generated by `faidx\'\\n\\\n+ can be used here. This file may be empty if reads are unaligned.\\n\\\n+\\n\\\n+ 2. SAM->BAM conversion: `samtools view -bT ref.fa in.sam.gz\'.\\n\\\n+\\n\\\n+ 3. BAM->SAM conversion: `samtools view in.bam\'.\\n\\\n+\\n\\\n+ 4. A region should be presented in one of the following formats:\\n\\\n+ `chr1\', `chr2:1,000\' and `chr3:1000-2,000\'. When a region is\\n\\\n+ specified, the input alignment file must be an indexed BAM file.\\n\\\n+\\n\\\n+ 5. Option `-u\' is preferred over `-b\' when the output is piped to\\n\\\n+ another samtools command.\\n\\\n+\\n\\\n+ 6. 
In a string FLAG, each character represents one bit with\\n\\\n+ p=0x1 (paired), P=0x2 (properly paired), u=0x4 (unmapped),\\n\\\n+ U=0x8 (mate unmapped), r=0x10 (reverse), R=0x20 (mate reverse)\\n\\\n+ 1=0x40 (first), 2=0x80 (second), s=0x100 (not primary), \\n\\\n+ f=0x200 (failure) and d=0x400 (duplicate). Note that `-x\' and\\n\\\n+ `-X\' are samtools-C specific. Picard and older samtools do not\\n\\\n+ support HEX or string flags.\\n\\\n+\\n");\n+\treturn 1;\n+}\n+\n+int main_import(int argc, char *argv[])\n+{\n+\tint argc2, ret;\n+\tchar **argv2;\n+\tif (argc != 4) {\n+\t\tfprintf(stderr, "Usage: bamtk import <in.ref_list> <in.sam> <out.bam>\\n");\n+\t\treturn 1;\n+\t}\n+\targc2 = 6;\n+\targv2 = calloc(6, sizeof(char*));\n+\targv2[0] = "import", argv2[1] = "-o", argv2[2] = argv[3], argv2[3] = "-bt", argv2[4] = argv[1], argv2[5] = argv[2];\n+\tret = main_samview(argc2, argv2);\n+\tfree(argv2);\n+\treturn ret;\n+}\n' |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/setup.cfg --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/setup.cfg Thu Sep 07 17:55:18 2017 -0400 |
[ |
@@ -0,0 +1,6 @@ +[bdist_rpm] +doc_files = README doc/*.html ChangeLog +vendor = TDB +packager = TDB <email@email.com> +distribution-name = Red Hat Linux +requires = python |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/setup.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/setup.py Thu Sep 07 17:55:18 2017 -0400 |
[ |
@@ -0,0 +1,116 @@ +#!/usr/bin/python +''' + +pysam +***** + +''' + +import os, sys, glob, shutil, hashlib + +name = "pysam" + +# collect pysam version +sys.path.insert( 0, "pysam") +import version + +version = version.__version__ + +samtools_exclude = ( "bamtk.c", "razip.c", "bgzip.c", "errmod.c", "bam_reheader.c", "bam2bcf.c" ) +samtools_dest = os.path.abspath( "samtools" ) +tabix_exclude = ( "main.c", ) +tabix_dest = os.path.abspath( "tabix" ) + +# copy samtools source +if len(sys.argv) >= 2 and sys.argv[1] == "import": + if len(sys.argv) < 3: raise ValueError("missing PATH to samtools source directory") + if len(sys.argv) < 4: raise ValueError("missing PATH to tabix source directory") + + for destdir, srcdir, exclude in zip( + (samtools_dest, tabix_dest), + sys.argv[2:4], + (samtools_exclude, tabix_exclude)): + + srcdir = os.path.abspath( srcdir ) + if not os.path.exists( srcdir ): raise IOError( "samtools src dir `%s` does not exist." % srcdir ) + + cfiles = glob.glob( os.path.join( srcdir, "*.c" ) ) + hfiles = glob.glob( os.path.join( srcdir, "*.h" ) ) + ncopied = 0 + for new_file in cfiles + hfiles: + f = os.path.basename(new_file) + if f in exclude: continue + old_file = os.path.join( destdir, f ) + if os.path.exists( old_file ): + md5_old = hashlib.md5("".join(open(old_file,"r").readlines())).digest() + md5_new = hashlib.md5("".join(open(new_file,"r").readlines())).digest() + if md5_old == md5_new: continue + raise ValueError( "incompatible files for %s and %s" % (old_file, new_file )) + + shutil.copy( new_file, destdir ) + ncopied += 1 + print "installed latest source code from %s: %i files copied" % (srcdir, ncopied) + sys.exit(0) + +from distutils.core import setup, Extension +from Cython.Distutils import build_ext + +classifiers = """ +Development Status :: 2 - Alpha +Operating System :: MacOS :: MacOS X +Operating System :: Microsoft :: Windows :: Windows NT/2000 +Operating System :: OS Independent +Operating System :: POSIX +Operating System :: POSIX 
:: Linux +Operating System :: Unix +Programming Language :: Python +Topic :: Scientific/Engineering +Topic :: Scientific/Engineering :: Bioinformatics +""" + +samtools = Extension( + "csamtools", # name of extension + [ "pysam/csamtools.pyx" ] +\ + [ "pysam/%s" % x for x in ( + "pysam_util.c", )] +\ + glob.glob( os.path.join( "samtools", "*.c" ) ), + library_dirs=[], + include_dirs=[ "samtools", "pysam" ], + libraries=[ "z", ], + language="c", + define_macros = [('FILE_OFFSET_BITS','64'), + ('_USE_KNETFILE','')], + ) + +tabix = Extension( + "ctabix", # name of extension + [ "pysam/ctabix.pyx" ] +\ + [ "pysam/%s" % x for x in ()] +\ + glob.glob( os.path.join( "tabix", "*.c" ) ), + library_dirs=[], + include_dirs=[ "tabix", "pysam" ], + libraries=[ "z", ], + language="c", + ) + +metadata = { + 'name': name, + 'version': version, + 'description': "pysam", + 'long_description': __doc__, + 'author': "Andreas Heger", + 'author_email': "andreas.heger@gmail.com", + 'license': "MIT", + 'platforms': "ALL", + 'url': "http://code.google.com/p/pysam/", + 'py_modules': [ + "pysam/__init__", + "pysam/Pileup", + "pysam/namedtuple", + "pysam/version" ], + 'ext_modules': [samtools, tabix], + 'cmdclass' : {'build_ext': build_ext}, + } + +if __name__=='__main__': + dist = setup(**metadata) |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/tabix/bam_endian.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/tabix/bam_endian.h Thu Sep 07 17:55:18 2017 -0400 |
b |
@@ -0,0 +1,42 @@ +#ifndef BAM_ENDIAN_H +#define BAM_ENDIAN_H + +#include <stdint.h> + +static inline int bam_is_big_endian() +{ + long one= 1; + return !(*((char *)(&one))); +} +static inline uint16_t bam_swap_endian_2(uint16_t v) +{ + return (uint16_t)(((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8)); +} +static inline void *bam_swap_endian_2p(void *x) +{ + *(uint16_t*)x = bam_swap_endian_2(*(uint16_t*)x); + return x; +} +static inline uint32_t bam_swap_endian_4(uint32_t v) +{ + v = ((v & 0x0000FFFFU) << 16) | (v >> 16); + return ((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8); +} +static inline void *bam_swap_endian_4p(void *x) +{ + *(uint32_t*)x = bam_swap_endian_4(*(uint32_t*)x); + return x; +} +static inline uint64_t bam_swap_endian_8(uint64_t v) +{ + v = ((v & 0x00000000FFFFFFFFLLU) << 32) | (v >> 32); + v = ((v & 0x0000FFFF0000FFFFLLU) << 16) | ((v & 0xFFFF0000FFFF0000LLU) >> 16); + return ((v & 0x00FF00FF00FF00FFLLU) << 8) | ((v & 0xFF00FF00FF00FF00LLU) >> 8); +} +static inline void *bam_swap_endian_8p(void *x) +{ + *(uint64_t*)x = bam_swap_endian_8(*(uint64_t*)x); + return x; +} + +#endif |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/tabix/bgzf.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/tabix/bgzf.c Thu Sep 07 17:55:18 2017 -0400 |
[ |
b'@@ -0,0 +1,676 @@\n+/* The MIT License\n+\n+ Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology\n+\n+ Permission is hereby granted, free of charge, to any person obtaining a copy\n+ of this software and associated documentation files (the "Software"), to deal\n+ in the Software without restriction, including without limitation the rights\n+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n+ copies of the Software, and to permit persons to whom the Software is\n+ furnished to do so, subject to the following conditions:\n+\n+ The above copyright notice and this permission notice shall be included in\n+ all copies or substantial portions of the Software.\n+\n+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\n+ THE SOFTWARE.\n+*/\n+\n+/*\n+ 2009-06-29 by lh3: cache recent uncompressed blocks.\n+ 2009-06-25 by lh3: optionally use my knetfile library to access file on a FTP.\n+ 2009-06-12 by lh3: support a mode string like "wu" where \'u\' for uncompressed output */\n+\n+#include <stdio.h>\n+#include <stdlib.h>\n+#include <string.h>\n+#include <unistd.h>\n+#include <fcntl.h>\n+#include <sys/types.h>\n+#include <sys/stat.h>\n+#include "bgzf.h"\n+\n+#include "khash.h"\n+typedef struct {\n+\tint size;\n+\tuint8_t *block;\n+\tint64_t end_offset;\n+} cache_t;\n+KHASH_MAP_INIT_INT64(cache, cache_t)\n+\n+#if defined(_WIN32) || defined(_MSC_VER)\n+#define ftello(fp) ftell(fp)\n+#define fseeko(fp, offset, whence) fseek(fp, offset, whence)\n+#else\n+extern off_t ftello(FILE *stream);\n+extern int fseeko(FILE *stream, off_t 
offset, int whence);\n+#endif\n+\n+typedef int8_t bgzf_byte_t;\n+\n+static const int DEFAULT_BLOCK_SIZE = 64 * 1024;\n+static const int MAX_BLOCK_SIZE = 64 * 1024;\n+\n+static const int BLOCK_HEADER_LENGTH = 18;\n+static const int BLOCK_FOOTER_LENGTH = 8;\n+\n+static const int GZIP_ID1 = 31;\n+static const int GZIP_ID2 = 139;\n+static const int CM_DEFLATE = 8;\n+static const int FLG_FEXTRA = 4;\n+static const int OS_UNKNOWN = 255;\n+static const int BGZF_ID1 = 66; // \'B\'\n+static const int BGZF_ID2 = 67; // \'C\'\n+static const int BGZF_LEN = 2;\n+static const int BGZF_XLEN = 6; // BGZF_LEN+4\n+\n+static const int GZIP_WINDOW_BITS = -15; // no zlib header\n+static const int Z_DEFAULT_MEM_LEVEL = 8;\n+\n+\n+inline\n+void\n+packInt16(uint8_t* buffer, uint16_t value)\n+{\n+ buffer[0] = value;\n+ buffer[1] = value >> 8;\n+}\n+\n+inline\n+int\n+unpackInt16(const uint8_t* buffer)\n+{\n+ return (buffer[0] | (buffer[1] << 8));\n+}\n+\n+inline\n+void\n+packInt32(uint8_t* buffer, uint32_t value)\n+{\n+ buffer[0] = value;\n+ buffer[1] = value >> 8;\n+ buffer[2] = value >> 16;\n+ buffer[3] = value >> 24;\n+}\n+\n+static inline\n+int\n+bgzf_min(int x, int y)\n+{\n+ return (x < y) ? 
x : y;\n+}\n+\n+static\n+void\n+report_error(BGZF* fp, const char* message) {\n+ fp->error = message;\n+}\n+\n+static BGZF *bgzf_read_init()\n+{\n+\tBGZF *fp;\n+\tfp = calloc(1, sizeof(BGZF));\n+ fp->uncompressed_block_size = MAX_BLOCK_SIZE;\n+ fp->uncompressed_block = malloc(MAX_BLOCK_SIZE);\n+ fp->compressed_block_size = MAX_BLOCK_SIZE;\n+ fp->compressed_block = malloc(MAX_BLOCK_SIZE);\n+\tfp->cache_size = 0;\n+\tfp->cache = kh_init(cache);\n+\treturn fp;\n+}\n+\n+static\n+BGZF*\n+open_read(int fd)\n+{\n+#ifdef _USE_KNETFILE\n+ knetFile *file = knet_dopen(fd, "r");\n+#else\n+ FILE* file = fdopen(fd, "r");\n+#endif\n+ BGZF* fp;\n+\tif (file == 0) return 0;\n+\tfp = bgzf_read_init();\n+ fp->file_descriptor = fd;\n+ fp->open_mode = \'r\';\n+#ifdef _USE_KNETFILE\n+ fp->x.fpr = file;\n+#else\n+ fp->file = file;\n+#endif\n+ return fp;\n+}\n+\n+static\n+BGZF*\n+open_write(int'..b'nt\n+flush_block(BGZF* fp)\n+{\n+ while (fp->block_offset > 0) {\n+ int block_length = deflate_block(fp, fp->block_offset);\n+ if (block_length < 0) {\n+ return -1;\n+ }\n+#ifdef _USE_KNETFILE\n+ int count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw);\n+#else\n+ int count = fwrite(fp->compressed_block, 1, block_length, fp->file);\n+#endif\n+ if (count != block_length) {\n+ report_error(fp, "write failed");\n+ return -1;\n+ }\n+ fp->block_address += block_length;\n+ }\n+ return 0;\n+}\n+\n+int\n+bgzf_write(BGZF* fp, const void* data, int length)\n+{\n+ if (fp->open_mode != \'w\') {\n+ report_error(fp, "file not open for writing");\n+ return -1;\n+ }\n+\n+ if (fp->uncompressed_block == NULL) {\n+ fp->uncompressed_block = malloc(fp->uncompressed_block_size);\n+ }\n+\n+ const bgzf_byte_t* input = data;\n+ int block_length = fp->uncompressed_block_size;\n+ int bytes_written = 0;\n+ while (bytes_written < length) {\n+ int copy_length = bgzf_min(block_length - fp->block_offset, length - bytes_written);\n+ bgzf_byte_t* buffer = fp->uncompressed_block;\n+ memcpy(buffer + fp->block_offset, 
input, copy_length);\n+ fp->block_offset += copy_length;\n+ input += copy_length;\n+ bytes_written += copy_length;\n+ if (fp->block_offset == block_length) {\n+ if (flush_block(fp) != 0) {\n+ break;\n+ }\n+ }\n+ }\n+ return bytes_written;\n+}\n+\n+int\n+bgzf_close(BGZF* fp)\n+{\n+ if (fp->open_mode == \'w\') {\n+ if (flush_block(fp) != 0) {\n+ return -1;\n+ }\n+\t\t{ // add an empty block\n+\t\t\tint count, block_length = deflate_block(fp, 0);\n+#ifdef _USE_KNETFILE\n+\t\t\tcount = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw);\n+#else\n+\t\t\tcount = fwrite(fp->compressed_block, 1, block_length, fp->file);\n+#endif\n+\t\t}\n+#ifdef _USE_KNETFILE\n+ if (fflush(fp->x.fpw) != 0) {\n+#else\n+ if (fflush(fp->file) != 0) {\n+#endif\n+ report_error(fp, "flush failed");\n+ return -1;\n+ }\n+ }\n+ if (fp->owned_file) {\n+#ifdef _USE_KNETFILE\n+\t\tint ret;\n+\t\tif (fp->open_mode == \'w\') ret = fclose(fp->x.fpw);\n+\t\telse ret = knet_close(fp->x.fpr);\n+ if (ret != 0) return -1;\n+#else\n+ if (fclose(fp->file) != 0) {\n+ return -1;\n+ }\n+#endif\n+ }\n+ free(fp->uncompressed_block);\n+ free(fp->compressed_block);\n+\tfree_cache(fp);\n+ free(fp);\n+ return 0;\n+}\n+\n+void bgzf_set_cache_size(BGZF *fp, int cache_size)\n+{\n+\tif (fp) fp->cache_size = cache_size;\n+}\n+\n+int bgzf_check_EOF(BGZF *fp)\n+{\n+\tstatic uint8_t magic[28] = "\\037\\213\\010\\4\\0\\0\\0\\0\\0\\377\\6\\0\\102\\103\\2\\0\\033\\0\\3\\0\\0\\0\\0\\0\\0\\0\\0\\0";\n+\tuint8_t buf[28];\n+\toff_t offset;\n+#ifdef _USE_KNETFILE\n+\toffset = knet_tell(fp->x.fpr);\n+\tif (knet_seek(fp->x.fpr, -28, SEEK_END) != 0) return -1;\n+\tknet_read(fp->x.fpr, buf, 28);\n+\tknet_seek(fp->x.fpr, offset, SEEK_SET);\n+#else\n+\toffset = ftello(fp->file);\n+\tif (fseeko(fp->file, -28, SEEK_END) != 0) return -1;\n+\tfread(buf, 1, 28, fp->file);\n+\tfseeko(fp->file, offset, SEEK_SET);\n+#endif\n+\treturn (memcmp(magic, buf, 28) == 0)? 
1 : 0;\n+}\n+\n+int64_t\n+bgzf_seek(BGZF* fp, int64_t pos, int where)\n+{\n+ if (fp->open_mode != \'r\') {\n+ report_error(fp, "file not open for read");\n+ return -1;\n+ }\n+ if (where != SEEK_SET) {\n+ report_error(fp, "unimplemented seek option");\n+ return -1;\n+ }\n+ int block_offset = pos & 0xFFFF;\n+ int64_t block_address = (pos >> 16) & 0xFFFFFFFFFFFFLL;\n+#ifdef _USE_KNETFILE\n+ if (knet_seek(fp->x.fpr, block_address, SEEK_SET) != 0) {\n+#else\n+ if (fseeko(fp->file, block_address, SEEK_SET) != 0) {\n+#endif\n+ report_error(fp, "seek failed");\n+ return -1;\n+ }\n+ fp->block_length = 0; // indicates current block is not loaded\n+ fp->block_address = block_address;\n+ fp->block_offset = block_offset;\n+ return 0;\n+}\n' |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/tabix/bgzf.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/tabix/bgzf.h Thu Sep 07 17:55:18 2017 -0400 |
[ |
@@ -0,0 +1,156 @@ +/* The MIT License + + Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#ifndef __BGZF_H +#define __BGZF_H + +#include <stdint.h> +#include <stdio.h> +#include <stdbool.h> +#include <zlib.h> +#ifdef _USE_KNETFILE +#include "knetfile.h" +#endif + +//typedef int8_t bool; + +typedef struct { + int file_descriptor; + char open_mode; // 'r' or 'w' + bool owned_file, is_uncompressed; +#ifdef _USE_KNETFILE + union { + knetFile *fpr; + FILE *fpw; + } x; +#else + FILE* file; +#endif + int uncompressed_block_size; + int compressed_block_size; + void* uncompressed_block; + void* compressed_block; + int64_t block_address; + int block_length; + int block_offset; + int cache_size; + const char* error; + void *cache; // a pointer to a hash table +} BGZF; + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Open an existing file descriptor for reading or writing. 
+ * Mode must be either "r" or "w". + * A subsequent bgzf_close will not close the file descriptor. + * Returns null on error. + */ +BGZF* bgzf_fdopen(int fd, const char* __restrict mode); + +/* + * Open the specified file for reading or writing. + * Mode must be either "r" or "w". + * Returns null on error. + */ +BGZF* bgzf_open(const char* path, const char* __restrict mode); + +/* + * Close the BGZ file and free all associated resources. + * Does not close the underlying file descriptor if created with bgzf_fdopen. + * Returns zero on success, -1 on error. + */ +int bgzf_close(BGZF* fp); + +/* + * Read up to length bytes from the file storing into data. + * Returns the number of bytes actually read. + * Returns zero on end of file. + * Returns -1 on error. + */ +int bgzf_read(BGZF* fp, void* data, int length); + +/* + * Write length bytes from data to the file. + * Returns the number of bytes written. + * Returns -1 on error. + */ +int bgzf_write(BGZF* fp, const void* data, int length); + +/* + * Return a virtual file pointer to the current location in the file. + * No interpetation of the value should be made, other than a subsequent + * call to bgzf_seek can be used to position the file at the same point. + * Return value is non-negative on success. + * Returns -1 on error. + */ +#define bgzf_tell(fp) ((fp->block_address << 16) | (fp->block_offset & 0xFFFF)) + +/* + * Set the file to read from the location specified by pos, which must + * be a value previously returned by bgzf_tell for this file (but not + * necessarily one returned by this file handle). + * The where argument must be SEEK_SET. + * Seeking on a file opened for write is not supported. + * Returns zero on success, -1 on error. + */ +int64_t bgzf_seek(BGZF* fp, int64_t pos, int where); + +/* + * Set the cache size. Zero to disable. By default, caching is + * disabled. The recommended cache size for frequent random access is + * about 8M bytes. 
+ */ +void bgzf_set_cache_size(BGZF *fp, int cache_size); + +int bgzf_check_EOF(BGZF *fp); + +int bgzf_read_block(BGZF* fp); + +#ifdef __cplusplus +} +#endif + +static inline int bgzf_getc(BGZF *fp) +{ + int c; + if (fp->block_offset >= fp->block_length) { + if (bgzf_read_block(fp) != 0) return -2; /* error */ + if (fp->block_length == 0) return -1; /* end-of-file */ + } + c = ((unsigned char*)fp->uncompressed_block)[fp->block_offset++]; + if (fp->block_offset == fp->block_length) { +#ifdef _USE_KNETFILE + fp->block_address = knet_tell(fp->x.fpr); +#else + fp->block_address = ftello(fp->file); +#endif + fp->block_offset = 0; + fp->block_length = 0; + } + return c; +} + +#endif |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/tabix/bgzip.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/tabix/bgzip.c Thu Sep 07 17:55:18 2017 -0400 |
[ |
@@ -0,0 +1,201 @@ +/* The MIT License + + Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
+*/ + +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#include <fcntl.h> +#include <unistd.h> +#include <errno.h> +#include <sys/select.h> +#include <sys/stat.h> +#include "bgzf.h" + +static const int WINDOW_SIZE = 64 * 1024; + +static int bgzip_main_usage() +{ + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: bgzip [options] [file] ...\n\n"); + fprintf(stderr, "Options: -c write on standard output, keep original files unchanged\n"); + fprintf(stderr, " -d decompress\n"); + fprintf(stderr, " -f overwrite files without asking\n"); + fprintf(stderr, " -b INT decompress at virtual file pointer INT\n"); + fprintf(stderr, " -s INT decompress INT bytes in the uncompressed file\n"); + fprintf(stderr, " -h give this help\n"); + fprintf(stderr, "\n"); + return 1; +} + +static int write_open(const char *fn, int is_forced) +{ + int fd = -1; + char c; + if (!is_forced) { + if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL, 0666)) < 0 && errno == EEXIST) { + fprintf(stderr, "[bgzip] %s already exists; do you wish to overwrite (y or n)? 
", fn); + scanf("%c", &c); + if (c != 'Y' && c != 'y') { + fprintf(stderr, "[bgzip] not overwritten\n"); + exit(1); + } + } + } + if (fd < 0) { + if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC, 0666)) < 0) { + fprintf(stderr, "[bgzip] %s: Fail to write\n", fn); + exit(1); + } + } + return fd; +} + +static void fail(BGZF* fp) +{ + fprintf(stderr, "Error: %s\n", fp->error); + exit(1); +} + +int main(int argc, char **argv) +{ + int c, compress, pstdout, is_forced; + BGZF *fp; + void *buffer; + long start, end, size; + + compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0; + while((c = getopt(argc, argv, "cdhfb:s:")) >= 0){ + switch(c){ + case 'h': return bgzip_main_usage(); + case 'd': compress = 0; break; + case 'c': pstdout = 1; break; + case 'b': start = atol(optarg); break; + case 's': size = atol(optarg); break; + case 'f': is_forced = 1; break; + } + } + if (size >= 0) end = start + size; + if (end >= 0 && end < start) { + fprintf(stderr, "[bgzip] Illegal region: [%ld, %ld]\n", start, end); + return 1; + } + if (compress == 1) { + struct stat sbuf; + int f_src = fileno(stdin); + int f_dst = fileno(stdout); + + if ( argc>optind ) + { + if ( stat(argv[optind],&sbuf)<0 ) + { + fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]); + return 1; + } + + if ((f_src = open(argv[optind], O_RDONLY)) < 0) { + fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]); + return 1; + } + + if (pstdout) + f_dst = fileno(stdout); + else + { + char *name = malloc(strlen(argv[optind]) + 5); + strcpy(name, argv[optind]); + strcat(name, ".gz"); + f_dst = write_open(name, is_forced); + if (f_dst < 0) return 1; + free(name); + } + } + else if (!pstdout && isatty(fileno((FILE *)stdout)) ) + return bgzip_main_usage(); + + fp = bgzf_fdopen(f_dst, "w"); + buffer = malloc(WINDOW_SIZE); + while ((c = read(f_src, buffer, WINDOW_SIZE)) > 0) + if (bgzf_write(fp, buffer, c) < 0) fail(fp); + // f_dst will be closed here + if (bgzf_close(fp) < 0) 
fail(fp); + if (argc > optind) unlink(argv[optind]); + free(buffer); + close(f_src); + return 0; + } else { + struct stat sbuf; + int f_dst; + + if ( argc>optind ) + { + if ( stat(argv[optind],&sbuf)<0 ) + { + fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]); + return 1; + } + char *name; + int len = strlen(argv[optind]); + if ( strcmp(argv[optind]+len-3,".gz") ) + { + fprintf(stderr, "[bgzip] %s: unknown suffix -- ignored\n", argv[optind]); + return 1; + } + fp = bgzf_open(argv[optind], "r"); + if (fp == NULL) { + fprintf(stderr, "[bgzip] Could not open file: %s\n", argv[optind]); + return 1; + } + + name = strdup(argv[optind]); + name[strlen(name) - 3] = '\0'; + f_dst = write_open(name, is_forced); + free(name); + } + else if (!pstdout && isatty(fileno((FILE *)stdin)) ) + return bgzip_main_usage(); + else + { + f_dst = fileno(stdout); + fp = bgzf_fdopen(fileno(stdin), "r"); + if (fp == NULL) { + fprintf(stderr, "[bgzip] Could not read from stdin: %s\n", strerror(errno)); + return 1; + } + } + buffer = malloc(WINDOW_SIZE); + if (bgzf_seek(fp, start, SEEK_SET) < 0) fail(fp); + while (1) { + if (end < 0) c = bgzf_read(fp, buffer, WINDOW_SIZE); + else c = bgzf_read(fp, buffer, (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start)); + if (c == 0) break; + if (c < 0) fail(fp); + start += c; + write(f_dst, buffer, c); + if (end >= 0 && start >= end) break; + } + free(buffer); + if (bgzf_close(fp) < 0) fail(fp); + if (!pstdout) unlink(argv[optind]); + return 0; + } +} |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/tabix/index.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/tabix/index.c Thu Sep 07 17:55:18 2017 -0400 |
[ |
b'@@ -0,0 +1,954 @@\n+#include <ctype.h>\n+#include <assert.h>\n+#include <sys/stat.h>\n+#include "khash.h"\n+#include "ksort.h"\n+#include "kstring.h"\n+#include "bam_endian.h"\n+#ifdef _USE_KNETFILE\n+#include "knetfile.h"\n+#endif\n+#include "tabix.h"\n+\n+#define TAD_MIN_CHUNK_GAP 32768\n+// 1<<14 is the size of minimum bin.\n+#define TAD_LIDX_SHIFT 14\n+\n+typedef struct {\n+\tuint64_t u, v;\n+} pair64_t;\n+\n+#define pair64_lt(a,b) ((a).u < (b).u)\n+KSORT_INIT(off, pair64_t, pair64_lt)\n+\n+typedef struct {\n+\tuint32_t m, n;\n+\tpair64_t *list;\n+} ti_binlist_t;\n+\n+typedef struct {\n+\tint32_t n, m;\n+\tuint64_t *offset;\n+} ti_lidx_t;\n+\n+KHASH_MAP_INIT_INT(i, ti_binlist_t)\n+KHASH_MAP_INIT_STR(s, int)\n+\n+struct __ti_index_t {\n+\tti_conf_t conf;\n+\tint32_t n, max;\n+\tkhash_t(s) *tname;\n+\tkhash_t(i) **index;\n+\tti_lidx_t *index2;\n+};\n+\n+struct __ti_iter_t {\n+\tint from_first; // read from the first record; no random access\n+\tint tid, beg, end, n_off, i, finished;\n+\tuint64_t curr_off;\n+\tkstring_t str;\n+\tconst ti_index_t *idx;\n+\tpair64_t *off;\n+};\n+\n+typedef struct {\n+\tint tid, beg, end, bin;\n+} ti_intv_t;\n+\n+ti_conf_t ti_conf_gff = { 0, 1, 4, 5, \'#\', 0 };\n+ti_conf_t ti_conf_bed = { TI_FLAG_UCSC, 1, 2, 3, \'#\', 0 };\n+ti_conf_t ti_conf_psltbl = { TI_FLAG_UCSC, 15, 17, 18, \'#\', 0 };\n+ti_conf_t ti_conf_sam = { TI_PRESET_SAM, 3, 4, 0, \'@\', 0 };\n+ti_conf_t ti_conf_vcf = { TI_PRESET_VCF, 1, 2, 0, \'#\', 0 };\n+\n+/***************\n+ * read a line *\n+ ***************/\n+\n+/*\n+int ti_readline(BGZF *fp, kstring_t *str)\n+{\n+\tint c, l = 0;\n+\tstr->l = 0;\n+\twhile ((c = bgzf_getc(fp)) >= 0 && c != \'\\n\') {\n+\t\t++l;\n+\t\tif (c != \'\\r\') kputc(c, str);\n+\t}\n+\tif (c < 0 && l == 0) return -1; // end of file\n+\treturn str->l;\n+}\n+*/\n+\n+/* Below is a faster implementation largely equivalent to the one\n+ * commented out above. 
*/\n+int ti_readline(BGZF *fp, kstring_t *str)\n+{\n+\tint l, state = 0;\n+\tunsigned char *buf = (unsigned char*)fp->uncompressed_block;\n+\tstr->l = 0;\n+\tdo {\n+\t\tif (fp->block_offset >= fp->block_length) {\n+\t\t\tif (bgzf_read_block(fp) != 0) { state = -2; break; }\n+\t\t\tif (fp->block_length == 0) { state = -1; break; }\n+\t\t}\n+\t\tfor (l = fp->block_offset; l < fp->block_length && buf[l] != \'\\n\'; ++l);\n+\t\tif (l < fp->block_length) state = 1;\n+\t\tl -= fp->block_offset;\n+\t\tif (str->l + l + 1 >= str->m) {\n+\t\t\tstr->m = str->l + l + 2;\n+\t\t\tkroundup32(str->m);\n+\t\t\tstr->s = (char*)realloc(str->s, str->m);\n+\t\t}\n+\t\tmemcpy(str->s + str->l, buf + fp->block_offset, l);\n+\t\tstr->l += l;\n+\t\tfp->block_offset += l + 1;\n+\t\tif (fp->block_offset >= fp->block_length) {\n+#ifdef _USE_KNETFILE\n+\t\t\tfp->block_address = knet_tell(fp->x.fpr);\n+#else\n+\t\t\tfp->block_address = ftello(fp->file);\n+#endif\n+\t\t\tfp->block_offset = 0;\n+\t\t\tfp->block_length = 0;\n+\t\t} \n+\t} while (state == 0);\n+\tif (str->l == 0 && state < 0) return state;\n+\tstr->s[str->l] = 0;\n+\treturn str->l;\n+}\n+\n+/*************************************\n+ * get the interval from a data line *\n+ *************************************/\n+\n+static inline int ti_reg2bin(uint32_t beg, uint32_t end)\n+{\n+\t--end;\n+\tif (beg>>14 == end>>14) return 4681 + (beg>>14);\n+\tif (beg>>17 == end>>17) return 585 + (beg>>17);\n+\tif (beg>>20 == end>>20) return 73 + (beg>>20);\n+\tif (beg>>23 == end>>23) return 9 + (beg>>23);\n+\tif (beg>>26 == end>>26) return 1 + (beg>>26);\n+\treturn 0;\n+}\n+\n+static int get_tid(ti_index_t *idx, const char *ss)\n+{\n+\tkhint_t k;\n+\tint tid;\n+\tk = kh_get(s, idx->tname, ss);\n+\tif (k == kh_end(idx->tname)) { // a new target sequence\n+\t\tint ret, size;\n+\t\t// update idx->n, ->max, ->index and ->index2\n+\t\tif (idx->n == idx->max) {\n+\t\t\tidx->max = idx->max? 
idx->max<<1 : 8;\n+\t\t\tidx->index = realloc(idx->index, idx->max * sizeof(void*));\n+\t\t\tidx->index2 = realloc(idx->index2, idx->max * sizeof(ti_lidx_t));\n+\t\t}\n+\t\tmemset(&idx->index2[idx->n], 0, sizeof(ti_lidx_t));\n+\t\tidx->index[idx->n++] = kh_init(i);\n+\t\t// update ->tname\n+\t\ttid = size = kh_size(idx->tname);\n+\t\tk = kh_put(s, idx->tname, strdup(ss), &ret);\n+\t\tkh_value(idx->tname, k) = size;\n+\t\tassert(idx->n == kh_'..b'n_off, off);\n+\t\t// resolve completely contained adjacent blocks\n+\t\tfor (i = 1, l = 0; i < n_off; ++i)\n+\t\t\tif (off[l].v < off[i].v)\n+\t\t\t\toff[++l] = off[i];\n+\t\tn_off = l + 1;\n+\t\t// resolve overlaps between adjacent blocks; this may happen due to the merge in indexing\n+\t\tfor (i = 1; i < n_off; ++i)\n+\t\t\tif (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;\n+\t\t{ // merge adjacent blocks\n+\t\t\tfor (i = 1, l = 0; i < n_off; ++i) {\n+\t\t\t\tif (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;\n+\t\t\t\telse off[++l] = off[i];\n+\t\t\t}\n+\t\t\tn_off = l + 1;\n+\t\t}\n+\t}\n+\titer->n_off = n_off; iter->off = off;\n+\treturn iter;\n+}\n+\n+const char *ti_iter_read(BGZF *fp, ti_iter_t iter, int *len)\n+{\n+\tif (iter->finished) return 0;\n+\tif (iter->from_first) {\n+\t\tint ret;\n+\t\tif ((ret = ti_readline(fp, &iter->str)) < 0) {\n+\t\t\titer->finished = 1;\n+\t\t\treturn 0;\n+\t\t} else {\n+\t\t\tif (len) *len = iter->str.l;\n+\t\t\treturn iter->str.s;\n+\t\t}\n+\t}\n+\tif (iter->n_off == 0) return 0;\n+\twhile (1) {\n+\t\tint ret;\n+\t\tif (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk\n+\t\t\tif (iter->i == iter->n_off - 1) break; // no more chunks\n+\t\t\tif (iter->i >= 0) assert(iter->curr_off == iter->off[iter->i].v); // otherwise bug\n+\t\t\tif (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek\n+\t\t\t\tbgzf_seek(fp, iter->off[iter->i+1].u, SEEK_SET);\n+\t\t\t\titer->curr_off = 
bgzf_tell(fp);\n+\t\t\t}\n+\t\t\t++iter->i;\n+\t\t}\n+\t\tif ((ret = ti_readline(fp, &iter->str)) >= 0) {\n+\t\t\tti_intv_t intv;\n+\t\t\titer->curr_off = bgzf_tell(fp);\n+\t\t\tif (iter->str.s[0] == iter->idx->conf.meta_char) continue;\n+\t\t\tget_intv((ti_index_t*)iter->idx, &iter->str, &intv);\n+\t\t\tif (intv.tid != iter->tid || intv.beg >= iter->end) break; // no need to proceed\n+\t\t\telse if (intv.end > iter->beg && iter->end > intv.beg) {\n+\t\t\t\tif (len) *len = iter->str.l;\n+\t\t\t\treturn iter->str.s;\n+\t\t\t}\n+\t\t} else break; // end of file\n+\t}\n+\titer->finished = 1;\n+\treturn 0;\n+}\n+\n+void ti_iter_destroy(ti_iter_t iter)\n+{\n+\tif (iter) {\n+\t\tfree(iter->str.s); free(iter->off);\n+\t\tfree(iter);\n+\t}\n+}\n+\n+int ti_fetch(BGZF *fp, const ti_index_t *idx, int tid, int beg, int end, void *data, ti_fetch_f func)\n+{\n+\tti_iter_t iter;\n+\tconst char *s;\n+\tint len;\n+\titer = ti_iter_query(idx, tid, beg, end);\n+\twhile ((s = ti_iter_read(fp, iter, &len)) != 0)\n+\t\tfunc(len, s, data);\n+\tti_iter_destroy(iter);\n+\treturn 0;\n+}\n+\n+/*******************\n+ * High-level APIs *\n+ *******************/\n+\n+tabix_t *ti_open(const char *fn, const char *fnidx)\n+{\n+\ttabix_t *t;\n+\tBGZF *fp;\n+\tif ((fp = bgzf_open(fn, "r")) == 0) return 0;\n+\tt = calloc(1, sizeof(tabix_t));\n+\tt->fn = strdup(fn);\n+\tif (fnidx) t->fnidx = strdup(fnidx);\n+\tt->fp = fp;\n+\treturn t;\n+}\n+\n+void ti_close(tabix_t *t)\n+{\n+\tif (t) {\n+\t\tbgzf_close(t->fp);\n+\t\tif (t->idx) ti_index_destroy(t->idx);\n+\t\tfree(t->fn); free(t->fnidx);\n+\t\tfree(t);\n+\t}\n+}\n+\n+int ti_lazy_index_load(tabix_t *t)\n+{\n+\tif (t->idx == 0) { // load index\n+\t\tif (t->fnidx) t->idx = ti_index_load_local(t->fnidx);\n+\t\telse t->idx = ti_index_load(t->fn);\n+\t\tif (t->idx == 0) return -1; // fail to load index\n+\t}\n+\treturn 0;\n+}\n+\n+ti_iter_t ti_queryi(tabix_t *t, int tid, int beg, int end)\n+{\n+\tif (tid < 0) return ti_iter_first();\n+\tif 
(ti_lazy_index_load(t) != 0) return 0;\n+\treturn ti_iter_query(t->idx, tid, beg, end);\t\n+}\n+\n+ti_iter_t ti_querys(tabix_t *t, const char *reg)\n+{\n+\tint tid, beg, end;\n+\tif (reg == 0) return ti_iter_first();\n+\tif (ti_lazy_index_load(t) != 0) return 0;\n+\tif (ti_parse_region(t->idx, reg, &tid, &beg, &end) < 0) return 0;\n+\treturn ti_iter_query(t->idx, tid, beg, end);\n+}\n+\n+ti_iter_t ti_query(tabix_t *t, const char *name, int beg, int end)\n+{\n+\tint tid;\n+\tif (name == 0) return ti_iter_first();\n+\t// then need to load the index\n+\tif (ti_lazy_index_load(t) != 0) return 0;\n+\tif ((tid = ti_get_tid(t->idx, name)) < 0) return 0;\n+\treturn ti_iter_query(t->idx, tid, beg, end);\n+}\n+\n+const char *ti_read(tabix_t *t, ti_iter_t iter, int *len)\n+{\n+\treturn ti_iter_read(t->fp, iter, len);\n+}\n' |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/tabix/khash.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/tabix/khash.h Thu Sep 07 17:55:18 2017 -0400 |
[ |
b'@@ -0,0 +1,486 @@\n+/* The MIT License\n+\n+ Copyright (c) 2008 Genome Research Ltd (GRL).\n+\n+ Permission is hereby granted, free of charge, to any person obtaining\n+ a copy of this software and associated documentation files (the\n+ "Software"), to deal in the Software without restriction, including\n+ without limitation the rights to use, copy, modify, merge, publish,\n+ distribute, sublicense, and/or sell copies of the Software, and to\n+ permit persons to whom the Software is furnished to do so, subject to\n+ the following conditions:\n+\n+ The above copyright notice and this permission notice shall be\n+ included in all copies or substantial portions of the Software.\n+\n+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,\n+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND\n+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS\n+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN\n+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN\n+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n+ SOFTWARE.\n+*/\n+\n+/* Contact: Heng Li <lh3@sanger.ac.uk> */\n+\n+/*\n+ An example:\n+\n+#include "khash.h"\n+KHASH_MAP_INIT_INT(32, char)\n+int main() {\n+\tint ret, is_missing;\n+\tkhiter_t k;\n+\tkhash_t(32) *h = kh_init(32);\n+\tk = kh_put(32, h, 5, &ret);\n+\tif (!ret) kh_del(32, h, k);\n+\tkh_value(h, k) = 10;\n+\tk = kh_get(32, h, 10);\n+\tis_missing = (k == kh_end(h));\n+\tk = kh_get(32, h, 5);\n+\tkh_del(32, h, k);\n+\tfor (k = kh_begin(h); k != kh_end(h); ++k)\n+\t\tif (kh_exist(h, k)) kh_value(h, k) = 1;\n+\tkh_destroy(32, h);\n+\treturn 0;\n+}\n+*/\n+\n+/*\n+ 2008-09-19 (0.2.3):\n+\n+\t* Corrected the example\n+\t* Improved interfaces\n+\n+ 2008-09-11 (0.2.2):\n+\n+\t* Improved speed a little in kh_put()\n+\n+ 2008-09-10 (0.2.1):\n+\n+\t* Added kh_clear()\n+\t* Fixed a compiling error\n+\n+ 2008-09-02 
(0.2.0):\n+\n+\t* Changed to token concatenation which increases flexibility.\n+\n+ 2008-08-31 (0.1.2):\n+\n+\t* Fixed a bug in kh_get(), which has not been tested previously.\n+\n+ 2008-08-31 (0.1.1):\n+\n+\t* Added destructor\n+*/\n+\n+\n+#ifndef __AC_KHASH_H\n+#define __AC_KHASH_H\n+\n+/*!\n+ @header\n+\n+ Generic hash table library.\n+\n+ @copyright Heng Li\n+ */\n+\n+#define AC_VERSION_KHASH_H "0.2.2"\n+\n+#include <stdint.h>\n+#include <stdlib.h>\n+#include <string.h>\n+\n+typedef uint32_t khint_t;\n+typedef khint_t khiter_t;\n+\n+#define __ac_HASH_PRIME_SIZE 32\n+static const uint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] =\n+{\n+ 0ul, 3ul, 11ul, 23ul, 53ul,\n+ 97ul, 193ul, 389ul, 769ul, 1543ul,\n+ 3079ul, 6151ul, 12289ul, 24593ul, 49157ul,\n+ 98317ul, 196613ul, 393241ul, 786433ul, 1572869ul,\n+ 3145739ul, 6291469ul, 12582917ul, 25165843ul, 50331653ul,\n+ 100663319ul, 201326611ul, 402653189ul, 805306457ul, 1610612741ul,\n+ 3221225473ul, 4294967291ul\n+};\n+\n+#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2)\n+#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1)\n+#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3)\n+#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1)))\n+#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1)))\n+#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1)))\n+#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1))\n+\n+static const double __ac_HASH_UPPER = 0.77;\n+\n+#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \\\n+\ttypedef struct {\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\tkhint_t n_buckets, size, n_occupied, upper_bound;\t\t\t\t\\\n+\t\tuint32_t *flags;\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\tkhkey_t *keys;\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\tkhval_t *vals;\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t} kh_##name##_t;\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\tstatic inline kh_##name##_t *kh_init_##name() 
{\t\t\t\t\t\t\\\n+\t\treturn (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t));\t\t\\\n+\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\tstatic inlin'..b'e, h, k) kh_get_##name(h, k)\n+\n+/*! @function\n+ @abstract Remove a key from the hash table.\n+ @param name Name of the hash table [symbol]\n+ @param h Pointer to the hash table [khash_t(name)*]\n+ @param k Iterator to the element to be deleted [khint_t]\n+ */\n+#define kh_del(name, h, k) kh_del_##name(h, k)\n+\n+\n+/*! @function\n+ @abstract Test whether a bucket contains data.\n+ @param h Pointer to the hash table [khash_t(name)*]\n+ @param x Iterator to the bucket [khint_t]\n+ @return 1 if containing data; 0 otherwise [int]\n+ */\n+#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x)))\n+\n+/*! @function\n+ @abstract Get key given an iterator\n+ @param h Pointer to the hash table [khash_t(name)*]\n+ @param x Iterator to the bucket [khint_t]\n+ @return Key [type of keys]\n+ */\n+#define kh_key(h, x) ((h)->keys[x])\n+\n+/*! @function\n+ @abstract Get value given an iterator\n+ @param h Pointer to the hash table [khash_t(name)*]\n+ @param x Iterator to the bucket [khint_t]\n+ @return Value [type of values]\n+ @discussion For hash sets, calling this results in segfault.\n+ */\n+#define kh_val(h, x) ((h)->vals[x])\n+\n+/*! @function\n+ @abstract Alias of kh_val()\n+ */\n+#define kh_value(h, x) ((h)->vals[x])\n+\n+/*! @function\n+ @abstract Get the start iterator\n+ @param h Pointer to the hash table [khash_t(name)*]\n+ @return The start iterator [khint_t]\n+ */\n+#define kh_begin(h) (khint_t)(0)\n+\n+/*! @function\n+ @abstract Get the end iterator\n+ @param h Pointer to the hash table [khash_t(name)*]\n+ @return The end iterator [khint_t]\n+ */\n+#define kh_end(h) ((h)->n_buckets)\n+\n+/*! @function\n+ @abstract Get the number of elements in the hash table\n+ @param h Pointer to the hash table [khash_t(name)*]\n+ @return Number of elements in the hash table [khint_t]\n+ */\n+#define kh_size(h) ((h)->size)\n+\n+/*! 
@function\n+ @abstract Get the number of buckets in the hash table\n+ @param h Pointer to the hash table [khash_t(name)*]\n+ @return Number of buckets in the hash table [khint_t]\n+ */\n+#define kh_n_buckets(h) ((h)->n_buckets)\n+\n+/* More conenient interfaces */\n+\n+/*! @function\n+ @abstract Instantiate a hash set containing integer keys\n+ @param name Name of the hash table [symbol]\n+ */\n+#define KHASH_SET_INIT_INT(name)\t\t\t\t\t\t\t\t\t\t\\\n+\tKHASH_INIT(name, uint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)\n+\n+/*! @function\n+ @abstract Instantiate a hash map containing integer keys\n+ @param name Name of the hash table [symbol]\n+ @param khval_t Type of values [type]\n+ */\n+#define KHASH_MAP_INIT_INT(name, khval_t)\t\t\t\t\t\t\t\t\\\n+\tKHASH_INIT(name, uint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)\n+\n+/*! @function\n+ @abstract Instantiate a hash map containing 64-bit integer keys\n+ @param name Name of the hash table [symbol]\n+ */\n+#define KHASH_SET_INIT_INT64(name)\t\t\t\t\t\t\t\t\t\t\\\n+\tKHASH_INIT(name, uint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)\n+\n+/*! @function\n+ @abstract Instantiate a hash map containing 64-bit integer keys\n+ @param name Name of the hash table [symbol]\n+ @param khval_t Type of values [type]\n+ */\n+#define KHASH_MAP_INIT_INT64(name, khval_t)\t\t\t\t\t\t\t\t\\\n+\tKHASH_INIT(name, uint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)\n+\n+typedef const char *kh_cstr_t;\n+/*! @function\n+ @abstract Instantiate a hash map containing const char* keys\n+ @param name Name of the hash table [symbol]\n+ */\n+#define KHASH_SET_INIT_STR(name)\t\t\t\t\t\t\t\t\t\t\\\n+\tKHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)\n+\n+/*! 
@function\n+ @abstract Instantiate a hash map containing const char* keys\n+ @param name Name of the hash table [symbol]\n+ @param khval_t Type of values [type]\n+ */\n+#define KHASH_MAP_INIT_STR(name, khval_t)\t\t\t\t\t\t\t\t\\\n+\tKHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)\n+\n+#endif /* __AC_KHASH_H */\n' |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/tabix/knetfile.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/tabix/knetfile.c Thu Sep 07 17:55:18 2017 -0400 |
[ |
b'@@ -0,0 +1,632 @@\n+/* The MIT License\n+\n+ Copyright (c) 2008 Genome Research Ltd (GRL).\n+\n+ Permission is hereby granted, free of charge, to any person obtaining\n+ a copy of this software and associated documentation files (the\n+ "Software"), to deal in the Software without restriction, including\n+ without limitation the rights to use, copy, modify, merge, publish,\n+ distribute, sublicense, and/or sell copies of the Software, and to\n+ permit persons to whom the Software is furnished to do so, subject to\n+ the following conditions:\n+\n+ The above copyright notice and this permission notice shall be\n+ included in all copies or substantial portions of the Software.\n+\n+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,\n+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND\n+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS\n+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN\n+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN\n+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n+ SOFTWARE.\n+*/\n+\n+/* Contact: Heng Li <lh3@sanger.ac.uk> */\n+\n+/* Probably I will not do socket programming in the next few years and\n+ therefore I decide to heavily annotate this file, for Linux and\n+ Windows as well. -lh3 */\n+\n+#include <time.h>\n+#include <stdio.h>\n+#include <ctype.h>\n+#include <stdlib.h>\n+#include <string.h>\n+#include <errno.h>\n+#include <unistd.h>\n+#include <sys/types.h>\n+\n+#ifdef _WIN32\n+#include <winsock.h>\n+#else\n+#include <netdb.h>\n+#include <arpa/inet.h>\n+#include <sys/socket.h>\n+#endif\n+\n+#include "knetfile.h"\n+\n+/* In winsock.h, the type of a socket is SOCKET, which is: "typedef\n+ * u_int SOCKET". An invalid SOCKET is: "(SOCKET)(~0)", or signed\n+ * integer -1. In knetfile.c, I use "int" for socket type\n+ * throughout. 
This should be improved to avoid confusion.\n+ *\n+ * In Linux/Mac, recv() and read() do almost the same thing. You can see\n+ * in the header file that netread() is simply an alias of read(). In\n+ * Windows, however, they are different and using recv() is mandatory.\n+ */\n+\n+/* This function tests if the file handler is ready for reading (or\n+ * writing if is_read==0). */\n+static int socket_wait(int fd, int is_read)\n+{\n+\tfd_set fds, *fdr = 0, *fdw = 0;\n+\tstruct timeval tv;\n+\tint ret;\n+\ttv.tv_sec = 5; tv.tv_usec = 0; // 5 seconds time out\n+\tFD_ZERO(&fds);\n+\tFD_SET(fd, &fds);\n+\tif (is_read) fdr = &fds;\n+\telse fdw = &fds;\n+\tret = select(fd+1, fdr, fdw, 0, &tv);\n+#ifndef _WIN32\n+\tif (ret == -1) perror("select");\n+#else\n+\tif (ret == 0)\n+\t\tfprintf(stderr, "select time-out\\n");\n+\telse if (ret == SOCKET_ERROR)\n+\t\tfprintf(stderr, "select: %d\\n", WSAGetLastError());\n+#endif\n+\treturn ret;\n+}\n+\n+#ifndef _WIN32\n+/* This function does not work with Windows due to the lack of\n+ * getaddrinfo() in winsock. It is addapted from an example in "Beej\'s\n+ * Guide to Network Programming" (http://beej.us/guide/bgnet/). */\n+static int socket_connect(const char *host, const char *port)\n+{\n+#define __err_connect(func) do { perror(func); freeaddrinfo(res); return -1; } while (0)\n+\n+\tint on = 1, fd;\n+\tstruct linger lng = { 0, 0 };\n+\tstruct addrinfo hints, *res;\n+\tmemset(&hints, 0, sizeof(struct addrinfo));\n+\thints.ai_family = AF_UNSPEC;\n+\thints.ai_socktype = SOCK_STREAM;\n+\t/* In Unix/Mac, getaddrinfo() is the most convenient way to get\n+\t * server information. */\n+\tif (getaddrinfo(host, port, &hints, &res) != 0) __err_connect("getaddrinfo");\n+\tif ((fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) __err_connect("socket");\n+\t/* The following two setsockopt() are used by ftplib\n+\t * (http://nbpfaus.net/~pfau/ftplib/). I am not sure if they\n+\t * necessary. 
*/\n+\tif (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) __err_connect("setsockopt");\n+\tif (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) __err_connect("'..b'(fp);\n+\t\treturn 0;\n+\t}\n+\treturn fp;\n+}\n+\n+knetFile *knet_dopen(int fd, const char *mode)\n+{\n+\tknetFile *fp = (knetFile*)calloc(1, sizeof(knetFile));\n+\tfp->type = KNF_TYPE_LOCAL;\n+\tfp->fd = fd;\n+\treturn fp;\n+}\n+\n+off_t knet_read(knetFile *fp, void *buf, off_t len)\n+{\n+\toff_t l = 0;\n+\tif (fp->fd == -1) return 0;\n+\tif (fp->type == KNF_TYPE_FTP) {\n+\t\tif (fp->is_ready == 0) {\n+\t\t\tif (!fp->no_reconnect) kftp_reconnect(fp);\n+\t\t\tkftp_connect_file(fp);\n+\t\t}\n+\t} else if (fp->type == KNF_TYPE_HTTP) {\n+\t\tif (fp->is_ready == 0)\n+\t\t\tkhttp_connect_file(fp);\n+\t}\n+\tif (fp->type == KNF_TYPE_LOCAL) { // on Windows, the following block is necessary; not on UNIX\n+\t\toff_t rest = len, curr;\n+\t\twhile (rest) {\n+\t\t\tcurr = read(fp->fd, buf + l, rest);\n+\t\t\tif (curr == 0) break;\n+\t\t\tl += curr; rest -= curr;\n+\t\t}\n+\t} else l = my_netread(fp->fd, buf, len);\n+\tfp->offset += l;\n+\treturn l;\n+}\n+\n+off_t knet_seek(knetFile *fp, int64_t off, int whence)\n+{\n+\tif (whence == SEEK_SET && off == fp->offset) return 0;\n+\tif (fp->type == KNF_TYPE_LOCAL) {\n+\t\t/* Be aware that lseek() returns the offset after seeking,\n+\t\t * while fseek() returns zero on success. 
*/\n+\t\toff_t offset = lseek(fp->fd, off, whence);\n+\t\tif (offset == -1) {\n+ // Be silent, it is OK for knet_seek to fail when the file is streamed\n+ // fprintf(stderr,"[knet_seek] %s\\n", strerror(errno));\n+\t\t\treturn -1;\n+\t\t}\n+\t\tfp->offset = offset;\n+\t\treturn 0;\n+\t}\n+ else if (fp->type == KNF_TYPE_FTP) \n+ {\n+ if (whence==SEEK_CUR)\n+ fp->offset += off;\n+ else if (whence==SEEK_SET)\n+ fp->offset = off;\n+ else if ( whence==SEEK_END)\n+ fp->offset = fp->file_size+off;\n+\t\tfp->is_ready = 0;\n+\t\treturn 0;\n+\t} \n+ else if (fp->type == KNF_TYPE_HTTP) \n+ {\n+\t\tif (whence == SEEK_END) { // FIXME: can we allow SEEK_END in future?\n+\t\t\tfprintf(stderr, "[knet_seek] SEEK_END is not supported for HTTP. Offset is unchanged.\\n");\n+\t\t\terrno = ESPIPE;\n+\t\t\treturn -1;\n+\t\t}\n+ if (whence==SEEK_CUR)\n+ fp->offset += off;\n+ else if (whence==SEEK_SET)\n+ fp->offset = off;\n+\t\tfp->is_ready = 0;\n+\t\treturn fp->offset;\n+\t}\n+\terrno = EINVAL;\n+ fprintf(stderr,"[knet_seek] %s\\n", strerror(errno));\n+\treturn -1;\n+}\n+\n+int knet_close(knetFile *fp)\n+{\n+\tif (fp == 0) return 0;\n+\tif (fp->ctrl_fd != -1) netclose(fp->ctrl_fd); // FTP specific\n+\tif (fp->fd != -1) {\n+\t\t/* On Linux/Mac, netclose() is an alias of close(), but on\n+\t\t * Windows, it is an alias of closesocket(). 
*/\n+\t\tif (fp->type == KNF_TYPE_LOCAL) close(fp->fd);\n+\t\telse netclose(fp->fd);\n+\t}\n+\tfree(fp->host); free(fp->port);\n+\tfree(fp->response); free(fp->retr); free(fp->size_cmd); // FTP specific\n+\tfree(fp->path); free(fp->http_host); // HTTP specific\n+\tfree(fp);\n+\treturn 0;\n+}\n+\n+#ifdef KNETFILE_MAIN\n+int main(void)\n+{\n+\tchar *buf;\n+\tknetFile *fp;\n+\tint type = 4, l;\n+#ifdef _WIN32\n+\tknet_win32_init();\n+#endif\n+\tbuf = calloc(0x100000, 1);\n+\tif (type == 0) {\n+\t\tfp = knet_open("knetfile.c", "r");\n+\t\tknet_seek(fp, 1000, SEEK_SET);\n+\t} else if (type == 1) { // NCBI FTP, large file\n+\t\tfp = knet_open("ftp://ftp.ncbi.nih.gov/1000genomes/ftp/data/NA12878/alignment/NA12878.chrom6.SLX.SRP000032.2009_06.bam", "r");\n+\t\tknet_seek(fp, 2500000000ll, SEEK_SET);\n+\t\tl = knet_read(fp, buf, 255);\n+\t} else if (type == 2) {\n+\t\tfp = knet_open("ftp://ftp.sanger.ac.uk/pub4/treefam/tmp/index.shtml", "r");\n+\t\tknet_seek(fp, 1000, SEEK_SET);\n+\t} else if (type == 3) {\n+\t\tfp = knet_open("http://www.sanger.ac.uk/Users/lh3/index.shtml", "r");\n+\t\tknet_seek(fp, 1000, SEEK_SET);\n+\t} else if (type == 4) {\n+\t\tfp = knet_open("http://www.sanger.ac.uk/Users/lh3/ex1.bam", "r");\n+\t\tknet_read(fp, buf, 10000);\n+\t\tknet_seek(fp, 20000, SEEK_SET);\n+\t\tknet_seek(fp, 10000, SEEK_SET);\n+\t\tl = knet_read(fp, buf+10000, 10000000) + 10000;\n+\t}\n+\tif (type != 4 && type != 1) {\n+\t\tknet_read(fp, buf, 255);\n+\t\tbuf[255] = 0;\n+\t\tprintf("%s\\n", buf);\n+\t} else write(fileno(stdout), buf, l);\n+\tknet_close(fp);\n+\tfree(buf);\n+\treturn 0;\n+}\n+#endif\n' |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/tabix/knetfile.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/tabix/knetfile.h Thu Sep 07 17:55:18 2017 -0400 |
[ |
@@ -0,0 +1,75 @@ +#ifndef KNETFILE_H +#define KNETFILE_H + +#include <stdint.h> +#include <fcntl.h> + +#ifndef _WIN32 +#define netread(fd, ptr, len) read(fd, ptr, len) +#define netwrite(fd, ptr, len) write(fd, ptr, len) +#define netclose(fd) close(fd) +#else +#include <winsock2.h> +#define netread(fd, ptr, len) recv(fd, ptr, len, 0) +#define netwrite(fd, ptr, len) send(fd, ptr, len, 0) +#define netclose(fd) closesocket(fd) +#endif + +// FIXME: currently I/O is unbuffered + +#define KNF_TYPE_LOCAL 1 +#define KNF_TYPE_FTP 2 +#define KNF_TYPE_HTTP 3 + +typedef struct knetFile_s { + int type, fd; + int64_t offset; + char *host, *port; + + // the following are for FTP only + int ctrl_fd, pasv_ip[4], pasv_port, max_response, no_reconnect, is_ready; + char *response, *retr, *size_cmd; + int64_t seek_offset; // for lazy seek + int64_t file_size; + + // the following are for HTTP only + char *path, *http_host; +} knetFile; + +#define knet_tell(fp) ((fp)->offset) +#define knet_fileno(fp) ((fp)->fd) + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _WIN32 + int knet_win32_init(); + void knet_win32_destroy(); +#endif + + knetFile *knet_open(const char *fn, const char *mode); + + /* + This only works with local files. + */ + knetFile *knet_dopen(int fd, const char *mode); + + /* + If ->is_ready==0, this routine updates ->fd; otherwise, it simply + reads from ->fd. + */ + off_t knet_read(knetFile *fp, void *buf, off_t len); + + /* + This routine only sets ->offset and ->is_ready=0. It does not + communicate with the FTP server. + */ + off_t knet_seek(knetFile *fp, int64_t off, int whence); + int knet_close(knetFile *fp); + +#ifdef __cplusplus +} +#endif + +#endif |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/tabix/ksort.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/tabix/ksort.h Thu Sep 07 17:55:18 2017 -0400 |
[ |
b'@@ -0,0 +1,271 @@\n+/* The MIT License\n+\n+ Copyright (c) 2008 Genome Research Ltd (GRL).\n+\n+ Permission is hereby granted, free of charge, to any person obtaining\n+ a copy of this software and associated documentation files (the\n+ "Software"), to deal in the Software without restriction, including\n+ without limitation the rights to use, copy, modify, merge, publish,\n+ distribute, sublicense, and/or sell copies of the Software, and to\n+ permit persons to whom the Software is furnished to do so, subject to\n+ the following conditions:\n+\n+ The above copyright notice and this permission notice shall be\n+ included in all copies or substantial portions of the Software.\n+\n+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,\n+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND\n+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS\n+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN\n+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN\n+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n+ SOFTWARE.\n+*/\n+\n+/* Contact: Heng Li <lh3@sanger.ac.uk> */\n+\n+/*\n+ 2008-11-16 (0.1.4):\n+\n+ * Fixed a bug in introsort() that happens in rare cases.\n+\n+ 2008-11-05 (0.1.3):\n+\n+ * Fixed a bug in introsort() for complex comparisons.\n+\n+\t* Fixed a bug in mergesort(). The previous version is not stable.\n+\n+ 2008-09-15 (0.1.2):\n+\n+\t* Accelerated introsort. 
On my Mac (not on another Linux machine),\n+\t my implementation is as fast as std::sort on random input.\n+\n+\t* Added combsort and in introsort, switch to combsort if the\n+\t recursion is too deep.\n+\n+ 2008-09-13 (0.1.1):\n+\n+\t* Added k-small algorithm\n+\n+ 2008-09-05 (0.1.0):\n+\n+\t* Initial version\n+\n+*/\n+\n+#ifndef AC_KSORT_H\n+#define AC_KSORT_H\n+\n+#include <stdlib.h>\n+#include <string.h>\n+\n+typedef struct {\n+\tvoid *left, *right;\n+\tint depth;\n+} ks_isort_stack_t;\n+\n+#define KSORT_SWAP(type_t, a, b) { register type_t t=(a); (a)=(b); (b)=t; }\n+\n+#define KSORT_INIT(name, type_t, __sort_lt)\t\t\t\t\t\t\t\t\\\n+\tvoid ks_mergesort_##name(size_t n, type_t array[], type_t temp[])\t\\\n+\t{\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\ttype_t *a2[2], *a, *b;\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\tint curr, shift;\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\ta2[0] = array;\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\ta2[1] = temp? temp : (type_t*)malloc(sizeof(type_t) * n);\t\t\\\n+\t\tfor (curr = 0, shift = 0; (1ul<<shift) < n; ++shift) {\t\t\t\\\n+\t\t\ta = a2[curr]; b = a2[1-curr];\t\t\t\t\t\t\t\t\\\n+\t\t\tif (shift == 0) {\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\ttype_t *p = b, *i, *eb = a + n;\t\t\t\t\t\t\t\\\n+\t\t\t\tfor (i = a; i < eb; i += 2) {\t\t\t\t\t\t\t\\\n+\t\t\t\t\tif (i == eb - 1) *p++ = *i;\t\t\t\t\t\t\t\\\n+\t\t\t\t\telse {\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\t\t\tif (__sort_lt(*(i+1), *i)) {\t\t\t\t\t\\\n+\t\t\t\t\t\t\t*p++ = *(i+1); *p++ = *i;\t\t\t\t\t\\\n+\t\t\t\t\t\t} else {\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\t\t\t\t*p++ = *i; *p++ = *(i+1);\t\t\t\t\t\\\n+\t\t\t\t\t\t}\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\t\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t} else {\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\tsize_t i, step = 1ul<<shift;\t\t\t\t\t\t\t\\\n+\t\t\t\tfor (i = 0; i < n; i += step<<1) {\t\t\t\t\t\t\\\n+\t\t\t\t\ttype_t *p, *j, *k, *ea, *eb;\t\t\t\t\t\t\\\n+\t\t\t\t\tif (n < i + step) 
{\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\t\t\tea = a + n; eb = a;\t\t\t\t\t\t\t\t\\\n+\t\t\t\t\t} else {\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\t\t\tea = a + i + step;\t\t\t\t\t\t\t\t\\\n+\t\t\t\t\t\teb = a + (n < i + (step<<1)? n : i + (step<<1)); \\\n+\t\t\t\t\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\t\tj = a + i; k = a + i + step; p = b + i;\t\t\t\t\\\n+\t\t\t\t\twhile (j < ea && k < eb) {\t\t\t\t\t\t\t\\\n+\t\t\t\t\t\tif (__sort_lt(*k, *j)) *p++ = *k++;\t\t\t\t\\\n+\t\t\t\t\t\telse *p++ = *j++;\t\t\t\t\t\t\t\t\\\n+\t\t\t\t\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\t\twhile (j < ea) *p++ = *j++;\t\t\t\t\t\t\t\\\n+\t\t\t\t\twhile (k < eb) *p++ = *k++;\t\t\t\t\t\t\t\\\n+\t\t\t\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\tcurr = 1 - curr;\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\tif (curr == 1) {\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\ttype_t *p = a2[0], *i = a2[1], *eb = array + n;\t\t\t\t\\\n+\t\t\tfor (; p < eb; ++i) *p++ = *i;\t\t\t\t\t\t\t\t\\\n+\t\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\tif (temp == 0) free(a2[1]);\t\t\t\t\t\t\t\t\t\t\\\n+\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\tvoid ks_heapadjust_##name(size_t i, size_t n, type_t l[])\t\t\t\\\n+\t{\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\tsize_t k ='..b'\t\t\t\t\t\t\t\t\\\n+\tvoid ks_introsort_##name(size_t n, type_t a[])\t\t\t\t\t\t\\\n+\t{\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\tint d;\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\tks_isort_stack_t *top, *stack;\t\t\t\t\t\t\t\t\t\\\n+\t\ttype_t rp, swap_tmp;\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\ttype_t *s, *t, *i, *j, *k;\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\tif (n < 1) return;\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\telse if (n == 2) {\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\tif (__sort_lt(a[1], a[0])) { swap_tmp = a[0]; a[0] = a[1]; a[1] = swap_tmp; } \\\n+\t\t\treturn;\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\tfor (d = 2; 1ul<<d < n; 
++d);\t\t\t\t\t\t\t\t\t\\\n+\t\tstack = (ks_isort_stack_t*)malloc(sizeof(ks_isort_stack_t) * ((sizeof(size_t)*d)+2)); \\\n+\t\ttop = stack; s = a; t = a + (n-1); d <<= 1;\t\t\t\t\t\t\\\n+\t\twhile (1) {\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\tif (s < t) {\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\tif (--d == 0) {\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\t\tks_combsort_##name(t - s + 1, s);\t\t\t\t\t\\\n+\t\t\t\t\tt = s;\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\t\tcontinue;\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\ti = s; j = t; k = i + ((j-i)>>1) + 1;\t\t\t\t\t\\\n+\t\t\t\tif (__sort_lt(*k, *i)) {\t\t\t\t\t\t\t\t\\\n+\t\t\t\t\tif (__sort_lt(*k, *j)) k = j;\t\t\t\t\t\t\\\n+\t\t\t\t} else k = __sort_lt(*j, *i)? i : j;\t\t\t\t\t\\\n+\t\t\t\trp = *k;\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\tif (k != t) { swap_tmp = *k; *k = *t; *t = swap_tmp; }\t\\\n+\t\t\t\tfor (;;) {\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\t\tdo ++i; while (__sort_lt(*i, rp));\t\t\t\t\t\\\n+\t\t\t\t\tdo --j; while (i <= j && __sort_lt(rp, *j));\t\t\\\n+\t\t\t\t\tif (j <= i) break;\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\t\tswap_tmp = *i; *i = *j; *j = swap_tmp;\t\t\t\t\\\n+\t\t\t\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\tswap_tmp = *i; *i = *t; *t = swap_tmp;\t\t\t\t\t\\\n+\t\t\t\tif (i-s > t-i) {\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\t\tif (i-s > 16) { top->left = s; top->right = i-1; top->depth = d; ++top; } \\\n+\t\t\t\t\ts = t-i > 16? i+1 : t;\t\t\t\t\t\t\t\t\\\n+\t\t\t\t} else {\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\t\tif (t-i > 16) { top->left = i+1; top->right = t; top->depth = d; ++top; } \\\n+\t\t\t\t\tt = i-s > 16? 
i-1 : s;\t\t\t\t\t\t\t\t\\\n+\t\t\t\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t} else {\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\tif (top == stack) {\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\t\tfree(stack);\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\t\t__ks_insertsort_##name(a, a+n);\t\t\t\t\t\t\\\n+\t\t\t\t\treturn;\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\t} else { --top; s = (type_t*)top->left; t = (type_t*)top->right; d = top->depth; } \\\n+\t\t\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t/* This function is adapted from: http://ndevilla.free.fr/median/ */ \\\n+\t/* 0 <= kk < n */\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\ttype_t ks_ksmall_##name(size_t n, type_t arr[], size_t kk)\t\t\t\\\n+\t{\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\ttype_t *low, *high, *k, *ll, *hh, *mid;\t\t\t\t\t\t\t\\\n+\t\tlow = arr; high = arr + n - 1; k = arr + kk;\t\t\t\t\t\\\n+\t\tfor (;;) {\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\tif (high <= low) return *k;\t\t\t\t\t\t\t\t\t\\\n+\t\t\tif (high == low + 1) {\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\tif (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \\\n+\t\t\t\treturn *k;\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\tmid = low + (high - low) / 2;\t\t\t\t\t\t\t\t\\\n+\t\t\tif (__sort_lt(*high, *mid)) KSORT_SWAP(type_t, *mid, *high); \\\n+\t\t\tif (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \\\n+\t\t\tif (__sort_lt(*low, *mid)) KSORT_SWAP(type_t, *mid, *low);\t\\\n+\t\t\tKSORT_SWAP(type_t, *mid, *(low+1));\t\t\t\t\t\t\t\\\n+\t\t\tll = low + 1; hh = high;\t\t\t\t\t\t\t\t\t\\\n+\t\t\tfor (;;) {\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\tdo ++ll; while (__sort_lt(*ll, *low));\t\t\t\t\t\\\n+\t\t\t\tdo --hh; while (__sort_lt(*low, *hh));\t\t\t\t\t\\\n+\t\t\t\tif (hh < ll) break;\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\t\tKSORT_SWAP(type_t, *ll, *hh);\t\t\t\t\t\t\t\\\n+\t\t\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\tKSORT_SWAP(type_t, *low, *hh);\t\t\t\t\t\t\t\t\\\n+\t\t\tif (hh <= 
k) low = ll;\t\t\t\t\t\t\t\t\t\t\\\n+\t\t\tif (hh >= k) high = hh - 1;\t\t\t\t\t\t\t\t\t\\\n+\t\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n+\t}\n+\n+#define ks_mergesort(name, n, a, t) ks_mergesort_##name(n, a, t)\n+#define ks_introsort(name, n, a) ks_introsort_##name(n, a)\n+#define ks_combsort(name, n, a) ks_combsort_##name(n, a)\n+#define ks_heapsort(name, n, a) ks_heapsort_##name(n, a)\n+#define ks_heapmake(name, n, a) ks_heapmake_##name(n, a)\n+#define ks_heapadjust(name, i, n, a) ks_heapadjust_##name(i, n, a)\n+#define ks_ksmall(name, n, a, k) ks_ksmall_##name(n, a, k)\n+\n+#define ks_lt_generic(a, b) ((a) < (b))\n+#define ks_lt_str(a, b) (strcmp((a), (b)) < 0)\n+\n+typedef const char *ksstr_t;\n+\n+#define KSORT_INIT_GENERIC(type_t) KSORT_INIT(type_t, type_t, ks_lt_generic)\n+#define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str)\n+\n+#endif\n' |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/tabix/kstring.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/tabix/kstring.c Thu Sep 07 17:55:18 2017 -0400 |
[ |
@@ -0,0 +1,165 @@ +#include <stdarg.h> +#include <stdio.h> +#include <ctype.h> +#include <string.h> +#include <stdint.h> +#include "kstring.h" + +int ksprintf(kstring_t *s, const char *fmt, ...) +{ + va_list ap; + int l; + va_start(ap, fmt); + l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); // This line does not work with glibc 2.0. See `man snprintf'. + va_end(ap); + if (l + 1 > s->m - s->l) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + va_start(ap, fmt); + l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); + } + va_end(ap); + s->l += l; + return l; +} + +// s MUST BE a null terminated string; l = strlen(s) +int ksplit_core(char *s, int delimiter, int *_max, int **_offsets) +{ + int i, n, max, last_char, last_start, *offsets, l; + n = 0; max = *_max; offsets = *_offsets; + l = strlen(s); + +#define __ksplit_aux do { \ + if (_offsets) { \ + s[i] = 0; \ + if (n == max) { \ + max = max? max<<1 : 2; \ + offsets = (int*)realloc(offsets, sizeof(int) * max); \ + } \ + offsets[n++] = last_start; \ + } else ++n; \ + } while (0) + + for (i = 0, last_char = last_start = 0; i <= l; ++i) { + if (delimiter == 0) { + if (isspace(s[i]) || s[i] == 0) { + if (isgraph(last_char)) __ksplit_aux; // the end of a field + } else { + if (isspace(last_char) || last_char == 0) last_start = i; + } + } else { + if (s[i] == delimiter || s[i] == 0) { + if (last_char != 0 && last_char != delimiter) __ksplit_aux; // the end of a field + } else { + if (last_char == delimiter || last_char == 0) last_start = i; + } + } + last_char = s[i]; + } + *_max = max; *_offsets = offsets; + return n; +} + +/********************** + * Boyer-Moore search * + **********************/ + +// reference: http://www-igm.univ-mlv.fr/~lecroq/string/node14.html +int *ksBM_prep(const uint8_t *pat, int m) +{ + int i, *suff, *prep, *bmGs, *bmBc; + prep = calloc(m + 256, 1); + bmGs = prep; bmBc = prep + m; + { // preBmBc() + for (i = 0; i < 256; ++i) bmBc[i] = m; + for (i = 0; i < m - 1; 
++i) bmBc[pat[i]] = m - i - 1; + } + suff = calloc(m, sizeof(int)); + { // suffixes() + int f = 0, g; + suff[m - 1] = m; + g = m - 1; + for (i = m - 2; i >= 0; --i) { + if (i > g && suff[i + m - 1 - f] < i - g) + suff[i] = suff[i + m - 1 - f]; + else { + if (i < g) g = i; + f = i; + while (g >= 0 && pat[g] == pat[g + m - 1 - f]) --g; + suff[i] = f - g; + } + } + } + { // preBmGs() + int j = 0; + for (i = 0; i < m; ++i) bmGs[i] = m; + for (i = m - 1; i >= 0; --i) + if (suff[i] == i + 1) + for (; j < m - 1 - i; ++j) + if (bmGs[j] == m) + bmGs[j] = m - 1 - i; + for (i = 0; i <= m - 2; ++i) + bmGs[m - 1 - suff[i]] = m - 1 - i; + } + free(suff); + return prep; +} + +int *ksBM_search(const uint8_t *str, int n, const uint8_t *pat, int m, int *_prep, int *n_matches) +{ + int i, j, *prep, *bmGs, *bmBc; + int *matches = 0, mm = 0, nm = 0; + prep = _prep? _prep : ksBM_prep(pat, m); + bmGs = prep; bmBc = prep + m; + j = 0; + while (j <= n - m) { + for (i = m - 1; i >= 0 && pat[i] == str[i+j]; --i); + if (i < 0) { + if (nm == mm) { + mm = mm? mm<<1 : 1; + matches = realloc(matches, mm * sizeof(int)); + } + matches[nm++] = j; + j += bmGs[0]; + } else { + int max = bmBc[str[i+j]] - m + 1 + i; + if (max < bmGs[i]) max = bmGs[i]; + j += max; + } + } + *n_matches = nm; + if (_prep == 0) free(prep); + return matches; +} + +#ifdef KSTRING_MAIN +#include <stdio.h> +int main() +{ + kstring_t *s; + int *fields, n, i; + s = (kstring_t*)calloc(1, sizeof(kstring_t)); + // test ksprintf() + ksprintf(s, " abcdefg: %d ", 100); + printf("'%s'\n", s->s); + // test ksplit() + fields = ksplit(s, 0, &n); + for (i = 0; i < n; ++i) + printf("field[%d] = '%s'\n", i, s->s + fields[i]); + free(s); + + { + static char *str = "abcdefgcdg"; + static char *pat = "cd"; + int n, *matches; + matches = ksBM_search(str, strlen(str), pat, strlen(pat), 0, &n); + printf("%d: \n", n); + for (i = 0; i < n; ++i) + printf("- %d\n", matches[i]); + free(matches); + } + return 0; +} +#endif |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/tabix/kstring.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/tabix/kstring.h Thu Sep 07 17:55:18 2017 -0400 |
[ |
@@ -0,0 +1,68 @@ +#ifndef KSTRING_H +#define KSTRING_H + +#include <stdlib.h> +#include <string.h> +#include <stdint.h> + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +#ifndef KSTRING_T +#define KSTRING_T kstring_t +typedef struct __kstring_t { + size_t l, m; + char *s; +} kstring_t; +#endif + +int ksprintf(kstring_t *s, const char *fmt, ...); +int ksplit_core(char *s, int delimiter, int *_max, int **_offsets); + +// calculate the auxiliary array, allocated by calloc() +int *ksBM_prep(const uint8_t *pat, int m); + +/* Search pat in str and returned the list of matches. The size of the + * list is returned as n_matches. _prep is the array returned by + * ksBM_prep(). If it is a NULL pointer, ksBM_prep() will be called. */ +int *ksBM_search(const uint8_t *str, int n, const uint8_t *pat, int m, int *_prep, int *n_matches); + +static inline int kputsn(const char *p, int l, kstring_t *s) +{ + if (s->l + l + 1 >= s->m) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } + strncpy(s->s + s->l, p, l); + s->l += l; + s->s[s->l] = 0; + return l; +} + +static inline int kputs(const char *p, kstring_t *s) +{ + return kputsn(p, strlen(p), s); +} + +static inline int kputc(int c, kstring_t *s) +{ + if (s->l + 1 >= s->m) { + s->m = s->l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } + s->s[s->l++] = c; + s->s[s->l] = 0; + return c; +} + +static inline int *ksplit(kstring_t *s, int delimiter, int *n) +{ + int max = 0, *offsets = 0; + *n = ksplit_core(s->s, delimiter, &max, &offsets); + return offsets; +} + +#endif |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/tabix/tabix.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/tabix/tabix.h Thu Sep 07 17:55:18 2017 -0400 |
b |
@@ -0,0 +1,137 @@ +/* The MIT License + + Copyright (c) 2009 Genome Research Ltd (GRL), 2010 Broad Institute + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li <lh3@live.co.uk> */ + +#ifndef __TABIDX_H +#define __TABIDX_H + +#include <stdint.h> +#include "kstring.h" +#include "bgzf.h" + +#define TI_PRESET_GENERIC 0 +#define TI_PRESET_SAM 1 +#define TI_PRESET_VCF 2 + +#define TI_FLAG_UCSC 0x10000 + +typedef int (*ti_fetch_f)(int l, const char *s, void *data); + +struct __ti_index_t; +typedef struct __ti_index_t ti_index_t; + +struct __ti_iter_t; +typedef struct __ti_iter_t *ti_iter_t; + +typedef struct { + BGZF *fp; + ti_index_t *idx; + char *fn, *fnidx; +} tabix_t; + +typedef struct { + int32_t preset; + int32_t sc, bc, ec; // seq col., beg col. and end col. 
+ int32_t meta_char, line_skip; +} ti_conf_t; + +extern ti_conf_t ti_conf_gff, ti_conf_bed, ti_conf_psltbl, ti_conf_vcf, ti_conf_sam; // preset + +#ifdef __cplusplus +extern "C" { +#endif + + /******************* + * High-level APIs * + *******************/ + + tabix_t *ti_open(const char *fn, const char *fnidx); + int ti_lazy_index_load(tabix_t *t); + void ti_close(tabix_t *t); + ti_iter_t ti_query(tabix_t *t, const char *name, int beg, int end); + ti_iter_t ti_queryi(tabix_t *t, int tid, int beg, int end); + ti_iter_t ti_querys(tabix_t *t, const char *reg); + const char *ti_read(tabix_t *t, ti_iter_t iter, int *len); + + /* Destroy the iterator */ + void ti_iter_destroy(ti_iter_t iter); + + /* Get the list of sequence names. Each "char*" pointer points to a + * internal member of the index, so DO NOT modify the returned + * pointer; otherwise the index will be corrupted. The returned + * pointer should be freed by a single free() call by the routine + * calling this function. The number of sequences is returned at *n. */ + const char **ti_seqname(const ti_index_t *idx, int *n); + + /****************** + * Low-level APIs * + ******************/ + + /* Build the index for file <fn>. File <fn>.tbi will be generated + * and overwrite the file of the same name. Return -1 on failure. */ + int ti_index_build(const char *fn, const ti_conf_t *conf); + + /* Load the index from file <fn>.tbi. If <fn> is a URL and the index + * file is not in the working directory, <fn>.tbi will be + * downloaded. Return NULL on failure. */ + ti_index_t *ti_index_load(const char *fn); + + ti_index_t *ti_index_load_local(const char *fnidx); + + /* Destroy the index */ + void ti_index_destroy(ti_index_t *idx); + + /* Parse a region like: chr2, chr2:100, chr2:100-200. Return -1 on failure. 
*/ + int ti_parse_region(const ti_index_t *idx, const char *str, int *tid, int *begin, int *end); + + int ti_get_tid(const ti_index_t *idx, const char *name); + + /* Get the iterator pointing to the first record at the current file + * position. If the file is just openned, the iterator points to the + * first record in the file. */ + ti_iter_t ti_iter_first(void); + + /* Get the iterator pointing to the first record in region tid:beg-end */ + ti_iter_t ti_iter_query(const ti_index_t *idx, int tid, int beg, int end); + + /* Get the data line pointed by the iterator and iterate to the next record. */ + const char *ti_iter_read(BGZF *fp, ti_iter_t iter, int *len); + + /******************* + * Deprecated APIs * + *******************/ + + /* The callback version for random access */ + int ti_fetch(BGZF *fp, const ti_index_t *idx, int tid, int beg, int end, void *data, ti_fetch_f func); + + /* Read one line. */ + int ti_readline(BGZF *fp, kstring_t *str); + +#ifdef __cplusplus +} +#endif + +#endif |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/tests/00README.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/tests/00README.txt Thu Sep 07 17:55:18 2017 -0400 |
b |
@@ -0,0 +1,32 @@ +File ex1.fa contains two sequences cut from the human genome +build36. They were exatracted with command: + + samtools faidx human_b36.fa 2:2043966-2045540 20:67967-69550 + +Sequence names were changed manually for simplicity. File ex1.sam.gz +contains MAQ alignments exatracted with: + + (samtools view NA18507_maq.bam 2:2044001-2045500; + samtools view NA18507_maq.bam 20:68001-69500) + +and processed with `samtools fixmate' to make it self-consistent as a +standalone alignment. + +To try samtools, you may run the following commands: + + samtools faidx ex1.fa # index the reference FASTA + samtools import ex1.fa.fai ex1.sam.gz ex1.bam # SAM->BAM + samtools index ex1.bam # index BAM + samtools tview ex1.bam ex1.fa # view alignment + samtools pileup -cf ex1.fa ex1.bam # pileup and consensus + samtools pileup -cf ex1.fa -t ex1.fa.fai ex1.sam.gz + +In order for the script pysam_test.py to work, you will need pysam +in your PYTHONPATH. + +In order for the script example.py to work, you will need pysam +in your PYTHONPATH and run + + make all + +beforehand. |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/tests/Makefile --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/tests/Makefile Thu Sep 07 17:55:18 2017 -0400 |
b |
@@ -0,0 +1,33 @@ +all: ex1.glf ex1.pileup.gz ex1.bam.bai ex1.glfview.gz \ + ex2.sam.gz ex2.sam ex1.sam \ + ex2.bam \ + ex3.bam ex3.bam.bai \ + ex4.bam ex4.bam.bai \ + ex5.bam ex5.bam.bai \ + ex6.bam \ + ex8.bam + +ex2.sam.gz: ex1.bam ex1.bam.bai + samtools view -h ex1.bam | gzip > ex2.sam.gz + +%.bam: %.sam ex1.fa.fai + samtools import ex1.fa.fai $< $@ + +%.sam: %.sam.gz + gunzip < $< > $@ + +ex1.fa.fai:ex1.fa + samtools faidx ex1.fa +ex1.bam:ex1.sam.gz ex1.fa.fai + samtools import ex1.fa.fai ex1.sam.gz ex1.bam +%.bam.bai:%.bam + samtools index $< +ex1.pileup.gz:ex1.bam ex1.fa + samtools pileup -cf ex1.fa ex1.bam | gzip > ex1.pileup.gz +ex1.glf:ex1.bam ex1.fa + samtools pileup -gf ex1.fa ex1.bam > ex1.glf +ex1.glfview.gz:ex1.glf + samtools glfview ex1.glf | gzip > ex1.glfview.gz + +clean: + rm -fr *.bam *.bai *.glf* *.fai *.pileup* *~ calDepth *.dSYM pysam_*.sam ex2.sam ex2.sam.gz ex1.sam |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/tests/ex1.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/tests/ex1.fa Thu Sep 07 17:55:18 2017 -0400 |
b |
@@ -0,0 +1,56 @@ +>chr1 +CACTAGTGGCTCATTGTAAATGTGTGGTTTAACTCGTCCATGGCCCAGCATTAGGGAGCT +GTGGACCCTGCAGCCTGGCTGTGGGGGCCGCAGTGGCTGAGGGGTGCAGAGCCGAGTCAC +GGGGTTGCCAGCACAGGGGCTTAACCTCTGGTGACTGCCAGAGCTGCTGGCAAGCTAGAG +TCCCATTTGGAGCCCCTCTAAGCCGTTCTATTTGTAATGAAAACTATATTTATGCTATTC +AGTTCTAAATATAGAAATTGAAACAGCTGTGTTTAGTGCCTTTGTTCAACCCCCTTGCAA +CAACCTTGAGAACCCCAGGGAATTTGTCAATGTCAGGGAAGGAGCATTTTGTCAGTTACC +AAATGTGTTTATTACCAGAGGGATGGAGGGAAGAGGGACGCTGAAGAACTTTGATGCCCT +CTTCTTCCAAAGATGAAACGCGTAACTGCGCTCTCATTCACTCCAGCTCCCTGTCACCCA +ATGGACCTGTGATATCTGGATTCTGGGAAATTCTTCATCCTGGACCCTGAGAGATTCTGC +AGCCCAGCTCCAGATTGCTTGTGGTCTGACAGGCTGCAACTGTGAGCCATCACAATGAAC +AACAGGAAGAAAAGGTCTTTCAAAAGGTGATGTGTGTTCTCATCAACCTCATACACACAC +ATGGTTTAGGGGTATAATACCTCTACATGGCTGATTATGAAAACAATGTTCCCCAGATAC +CATCCCTGTCTTACTTCCAGCTCCCCAGAGGGAAAGCTTTCAACGCTTCTAGCCATTTCT +TTTGGCATTTGCCTTCAGACCCTACACGAATGCGTCTCTACCACAGGGGGCTGCGCGGTT +TCCCATCATGAAGCACTGAACTTCCACGTCTCATCTAGGGGAACAGGGAGGTGCACTAAT +GCGCTCCACGCCCAAGCCCTTCTCACAGTTTCTGCCCCCAGCATGGTTGTACTGGGCAAT +ACATGAGATTATTAGGAAATGCTTTACTGTCATAACTATGAAGAGACTATTGCCAGATGA +ACCACACATTAATACTATGTTTCTTATCTGCACATTACTACCCTGCAATTAATATAATTG +TGTCCATGTACACACGCTGTCCTATGTACTTATCATGACTCTATCCCAAATTCCCAATTA +CGTCCTATCTTCTTCTTAGGGAAGAACAGCTTAGGTATCAATTTGGTGTTCTGTGTAAAG +TCTCAGGGAGCCGTCCGTGTCCTCCCATCTGGCCTCGTCCACACTGGTTCTCTTGAAAGC +TTGGGCTGTAATGATGCCCCTTGGCCATCACCCAGTCCCTGCCCCATCTCTTGTAATCTC +TCTCCTTTTTGCTGCATCCCTGTCTTCCTCTGTCTTGATTTACTTGTTGTTGGTTTTCTG +TTTCTTTGTTTGATTTGGTGGAAGACATAATCCCACGCTTCCTATGGAAAGGTTGTTGGG +AGATTTTTAATGATTCCTCAATGTTAAAATGTCTATTTTTGTCTTGACACCCAACTAATA +TTTGTCTGAGCAAAACAGTCTAGATGAGAGAGAACTTCCCTGGAGGTCTGATGGCGTTTC +TCCCTCGTCTTCTTA +>chr2 +TTCAAATGAACTTCTGTAATTGAAAAATTCATTTAAGAAATTACAAAATATAGTTGAAAG +CTCTAACAATAGACTAAACCAAGCAGAAGAAAGAGGTTCAGAACTTGAAGACAAGTCTCT +TATGAATTAACCCAGTCAGACAAAAATAAAGAAAAAAATTTTAAAAATGAACAGAGCTTT +CAAGAAGTATGAGATTATGTAAAGTAACTGAACCTATGAGTCACAGGTATTCCTGAGGAA +AAAGAAAAAGTGAGAAGTTTGGAAAAACTATTTGAGGAAGTAATTGGGGAAAACCTCTTT 
+AGTCTTGCTAGAGATTTAGACATCTAAATGAAAGAGGCTCAAAGAATGCCAGGAAGATAC +ATTGCAAGACAGACTTCATCAAGATATGTAGTCATCAGACTATCTAAAGTCAACATGAAG +GAAAAAAATTCTAAAATCAGCAAGAGAAAAGCATACAGTCATCTATAAAGGAAATCCCAT +CAGAATAACAATGGGCTTCTCAGCAGAAACCTTACAAGCCAGAAGAGATTGGATCTAATT +TTTGGACTTCTTAAAGAAAAAAAAACCTGTCAAACACGAATGTTATGCCCTGCTAAACTA +AGCATCATAAATGAAGGGGAAATAAAGTCAAGTCTTTCCTGACAAGCAAATGCTAAGATA +ATTCATCATCACTAAACCAGTCCTATAAGAAATGCTCAAAAGAATTGTAAAAGTCAAAAT +TAAAGTTCAATACTCACCATCATAAATACACACAAAAGTACAAAACTCACAGGTTTTATA +AAACAATTGAGACTACAGAGCAACTAGGTAAAAAATTAACATTACAACAGGAACAAAACC +TCATATATCAATATTAACTTTGAATAAAAAGGGATTAAATTCCCCCACTTAAGAGATATA +GATTGGCAGAACAGATTTAAAAACATGAACTAACTATATGCTGTTTACAAGAAACTCATT +AATAAAGACATGAGTTCAGGTAAAGGGGTGGAAAAAGATGTTCTACGCAAACAGAAACCA +AATGAGAGAAGGAGTAGCTATACTTATATCAGATAAAGCACACTTTAAATCAACAACAGT +AAAATAAAACAAAGGAGGTCATCATACAATGATAAAAAGATCAATTCAGCAAGAAGATAT +AACCATCCTACTAAATACATATGCACCTAACACAAGACTACCCAGATTCATAAAACAAAT +ACTACTAGACCTAAGAGGGATGAGAAATTACCTAATTGGTACAATGTACAATATTCTGAT +GATGGTTACACTAAAAGCCCATACTTTACTGCTACTCAATATATCCATGTAACAAATCTG +CGCTTGTACTTCTAAATCTATAAAAAAATTAAAATTTAACAAAAGTAAATAAAACACATA +GCTAAAACTAAAAAAGCAAAAACAAAAACTATGCTAAGTATTGGTAAAGATGTGGGGAAA +AAAGTAAACTCTCAAATATTGCTAGTGGGAGTATAAATTGTTTTCCACTTTGGAAAACAA +TTTGGTAATTTCGTTTTTTTTTTTTTCTTTTCTCTTTTTTTTTTTTTTTTTTTTGCATGC +CAGAAAAAAATATTTACAGTAACT |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/tests/ex1.sam.gz |
b |
Binary file chimerascan/pysam/tests/ex1.sam.gz has changed |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/tests/ex3.sam --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/tests/ex3.sam Thu Sep 07 17:55:18 2017 -0400 |
b |
@@ -0,0 +1,13 @@ +@HD VN:1.0 +@SQ SN:chr1 LN:1575 +@SQ SN:chr2 LN:1584 +@RG ID:L1 PU:SC_1_10 LB:SC_1 SM:NA12891 CN:name:with:colon +@RG ID:L2 PU:SC_2_12 LB:SC_2 SM:NA12891 CN:name:with:colon +@PG ID:P1 VN:1.0 +@PG ID:P2 VN:1.1 +@CO this is a comment +@CO this is another comment +read_28833_29006_6945 99 chr1 33 20 10M1D25M = 200 167 AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG <<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<< NM:i:1 RG:Z:L1 PG:Z:P1 XT:A:U +read_28701_28881_323b 147 chr2 88 30 35M = 500 412 ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA <<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<< MF:i:18 RG:Z:L2 PG:Z:P2 XT:A:R +read_28701_28881_323c 147 chr2 88 30 35M = 500 412 ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA <<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<< +test_clipped1 99 chr2 997 20 4S6M1D20M5S = 200 167 AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG <<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<< NM:i:1 RG:Z:L1 PG:Z:P1 XT:A:U |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/tests/ex4.sam --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/tests/ex4.sam Thu Sep 07 17:55:18 2017 -0400 |
b |
@@ -0,0 +1,9 @@ +@HD VN:1.0 +@SQ SN:chr1 LN:100 +@SQ SN:chr2 LN:100 +@RG ID:L1 PU:SC_1_10 LB:SC_1 SM:NA12891 +@RG ID:L2 PU:SC_2_12 LB:SC_2 SM:NA12891 +@CO this is a comment +@CO this is another comment +read_28833_29006_6945 99 chr1 21 20 10M1D25M = 200 167 AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG <<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<< NM:i:1 RG:Z:L1 +read_28701_28881_323b 147 chr2 21 30 35M = 500 412 ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA <<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<< MF:i:18 RG:Z:L2 |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/tests/ex5.sam --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/tests/ex5.sam Thu Sep 07 17:55:18 2017 -0400 |
b |
@@ -0,0 +1,5 @@ +@HD VN:1.0 +@SQ SN:chr1 LN:100 +@SQ SN:chr2 LN:100 +read_28833_29006_6945 0 * * * * * 200 167 AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG <<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<< +read_28701_28881_323b 0 * * * * * 500 412 ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA <<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<< |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/tests/ex6.sam --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/tests/ex6.sam Thu Sep 07 17:55:18 2017 -0400 |
b |
@@ -0,0 +1,5 @@ +@HD VN:1.0 +@SQ SN:chr1 LN:1575 +@SQ SN:chr2 LN:1584 +read_28833_29006_6945 99 chr1 33 20 10M1D25M = 200 167 AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG <<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<< NM:i:1 RG:Z:L1 +read_28701_28881_323b 147 chr2 88 30 35M = 500 412 ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA <<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<< MF:i:18 RG:Z:L2 |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/tests/ex7.sam --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/tests/ex7.sam Thu Sep 07 17:55:18 2017 -0400 |
b |
@@ -0,0 +1,2 @@ +read_28833_29006_6945 99 chr1 33 20 10M1D25M = 200 167 AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG <<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<< NM:i:1 RG:Z:L1 PG:Z:P1 XT:A:U +read_28701_28881_323b 147 chr2 88 30 35M = 500 412 ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA <<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<< MF:i:18 RG:Z:L2 PG:Z:P2 XT:A:R |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/tests/ex8.sam --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/tests/ex8.sam Thu Sep 07 17:55:18 2017 -0400 |
b |
@@ -0,0 +1,3 @@ +@HD VN:1.0 +@SQ SN:2 LN:48297693 +GJP00TM04CAQ5W 0 2 38297693 60 45H51M1D13M1D12M1D9M2D5M1D7M4D2M1I6M1D28M1D5M1D2M1D18M55H * 0 0 CATGAAGAACCGCTGGGTATGGAGCACACCTCACCTGATGGACAGTTGATTATGCTCACCTTAACGCTAATTGAGAGCAGCACAAGAGGACTGGAAACTAGAATTTACTCCTCATCTCCGAAGATGTGAATATTCTAAATTCAGCTTGCCTCTTGCTTC IID7757111/=;?///:D>777;EEGAAAEEIHHIIIIIIIIIIIIIIBBBIIIIH==<<<DDGEEE;<<<A><<<DEDDA>>>D?1112544556::03---//25.22=;DD?;;;>BDDDEEEGGGA<888<BAA888<GGGGGEB?9::DD551 NM:i:15 MD:Z:51^T13^A12^A9^AA5^A7^AAAA8^T28^T5^A2^T18 RG:Z:GJP00TM04 |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/tests/example.gtf.gz |
b |
Binary file chimerascan/pysam/tests/example.gtf.gz has changed |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/tests/example.gtf.gz.tbi |
b |
Binary file chimerascan/pysam/tests/example.gtf.gz.tbi has changed |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/tests/example.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/tests/example.py Thu Sep 07 17:55:18 2017 -0400 |
[ |
@@ -0,0 +1,121 @@ +import sys +import pysam + +samfile = pysam.Samfile( "ex1.bam", "rb" ) + +print "###################" +# check different ways to iterate +print len(list(samfile.fetch())) +print len(list(samfile.fetch( "chr1", 10, 200 ))) +print len(list(samfile.fetch( region="chr1:10-200" ))) +print len(list(samfile.fetch( "chr1" ))) +print len(list(samfile.fetch( region="chr1"))) +print len(list(samfile.fetch( "chr2" ))) +print len(list(samfile.fetch( region="chr2"))) +print len(list(samfile.fetch())) +print len(list(samfile.fetch( "chr1" ))) +print len(list(samfile.fetch( region="chr1"))) +print len(list(samfile.fetch())) + +print len(list(samfile.pileup( "chr1", 10, 200 ))) +print len(list(samfile.pileup( region="chr1:10-200" ))) +print len(list(samfile.pileup( "chr1" ))) +print len(list(samfile.pileup( region="chr1"))) +print len(list(samfile.pileup( "chr2" ))) +print len(list(samfile.pileup( region="chr2"))) +print len(list(samfile.pileup())) +print len(list(samfile.pileup())) + +print "########### fetch with callback ################" +def my_fetch_callback( alignment ): print str(alignment) +samfile.fetch( region="chr1:10-200", callback=my_fetch_callback ) + +print "########## pileup with callback ################" +def my_pileup_callback( column ): print str(column) +samfile.pileup( region="chr1:10-200", callback=my_pileup_callback ) + +print "##########iterator row #################" +iter = pysam.IteratorRow( samfile, 0, 10, 200) +for x in iter: print str(x) + +print "##########iterator col #################" +iter = pysam.IteratorColumn( samfile, 0, 10, 200 ) +for x in iter: print str(x) + +print "#########row all##################" +iter = pysam.IteratorRowAll( samfile ) +for x in iter: print str(x) + + +print "###################" + +class Counter: + mCounts = 0 + def __call__(self, alignment): + self.mCounts += 1 + +c = Counter() +samfile.fetch( "chr1:10-200", c ) +print "counts=", c.mCounts + +sys.exit(0) +print samfile.getTarget( 0 ) +print 
samfile.getTarget( 1 ) + +for p in pysam.pileup( "-c", "ex1.bam" ): + print str(p) + +print pysam.pileup.getMessages() + +for p in pysam.pileup( "-c", "ex1.bam", raw=True ): + print str(p), + + + +print "###########################" + +samfile = pysam.Samfile( "ex2.sam.gz", "r" ) + +print "num targets=", samfile.getNumTargets() + +iter = pysam.IteratorRowAll( samfile ) +for x in iter: print str(x) + +samfile.close() + +print "###########################" +samfile = pysam.Samfile( "ex2.sam.gz", "r" ) +def my_fetch_callback( alignment ): + print str(alignment) + +try: + samfile.fetch( "chr1:10-20", my_fetch_callback ) +except AssertionError: + print "caught fetch exception" + +samfile.close() + +print "###########################" +samfile = pysam.Samfile( "ex2.sam.gz", "r" ) +def my_pileup_callback( pileups ): + print str(pileups) +try: + samfile.pileup( "chr1:10-20", my_pileup_callback ) +except NotImplementedError: + print "caught pileup exception" + +# playing arount with headers +samfile = pysam.Samfile( "ex3.sam", "r" ) +print samfile.targets +print samfile.lengths +print samfile.text +print samdile.header +header = samfile.header +samfile.close() + +header["HD"]["SO"] = "unsorted" +outfile = pysam.Samfile( "out.sam", "wh", + header = header ) + +outfile.close() + |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/tests/pysam_test.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/tests/pysam_test.py Thu Sep 07 17:55:18 2017 -0400 |
[ |
b'@@ -0,0 +1,1008 @@\n+#!/usr/bin/env python\n+\'\'\'unit testing code for pysam.\n+\n+Execute in the :file:`tests` directory as it requires the Makefile\n+and data files located there.\n+\'\'\'\n+\n+import pysam\n+import unittest\n+import os, re\n+import itertools\n+import subprocess\n+import shutil\n+\n+\n+def checkBinaryEqual( filename1, filename2 ):\n+ \'\'\'return true if the two files are binary equal.\'\'\'\n+ if os.path.getsize( filename1 ) != os.path.getsize( filename2 ):\n+ return False\n+\n+ infile1 = open(filename1, "rb")\n+ infile2 = open(filename2, "rb")\n+\n+ def chariter( infile ):\n+ while 1:\n+ c = infile.read(1)\n+ if c == "": break\n+ yield c\n+\n+ found = False\n+ for c1,c2 in itertools.izip( chariter( infile1), chariter( infile2) ):\n+ if c1 != c2: break\n+ else:\n+ found = True\n+\n+ infile1.close()\n+ infile2.close()\n+ return found\n+\n+def runSamtools( cmd ):\n+ \'\'\'run a samtools command\'\'\'\n+\n+ try:\n+ retcode = subprocess.call(cmd, shell=True)\n+ if retcode < 0:\n+ print >>sys.stderr, "Child was terminated by signal", -retcode\n+ except OSError, e:\n+ print >>sys.stderr, "Execution failed:", e\n+\n+def getSamtoolsVersion():\n+ \'\'\'return samtools version\'\'\'\n+\n+ pipe = subprocess.Popen("samtools", shell=True, stderr=subprocess.PIPE).stderr\n+ lines = "".join(pipe.readlines())\n+ return re.search( "Version:\\s+(\\S+)", lines).groups()[0]\n+\n+class BinaryTest(unittest.TestCase):\n+ \'\'\'test samtools command line commands and compare\n+ against pysam commands.\n+\n+ Tests fail, if the output is not binary identical.\n+ \'\'\'\n+\n+ first_time = True\n+\n+ # a list of commands to test\n+ mCommands = \\\n+ { "faidx" : \\\n+ ( \n+ ("ex1.fa.fai", "samtools faidx ex1.fa"), \n+ ("pysam_ex1.fa.fai", (pysam.faidx, "ex1.fa") ),\n+ ),\n+ "import" :\n+ (\n+ ("ex1.bam", "samtools import ex1.fa.fai ex1.sam.gz ex1.bam" ),\n+ ("pysam_ex1.bam", (pysam.samimport, "ex1.fa.fai ex1.sam.gz pysam_ex1.bam") ),\n+ ),\n+ "index":\n+ (\n+ 
("ex1.bam.bai", "samtools index ex1.bam" ),\n+ ("pysam_ex1.bam.bai", (pysam.index, "pysam_ex1.bam" ) ),\n+ ),\n+ "pileup1" :\n+ (\n+ ("ex1.pileup", "samtools pileup -cf ex1.fa ex1.bam > ex1.pileup" ),\n+ ("pysam_ex1.pileup", (pysam.pileup, "-c -f ex1.fa ex1.bam" ) )\n+ ),\n+ "pileup2" :\n+ (\n+ ("ex1.glf", "samtools pileup -gf ex1.fa ex1.bam > ex1.glf" ),\n+ ("pysam_ex1.glf", (pysam.pileup, "-g -f ex1.fa ex1.bam" ) )\n+ ),\n+ "glfview" :\n+ (\n+ ("ex1.glfview", "samtools glfview ex1.glf > ex1.glfview"),\n+ ("pysam_ex1.glfview", (pysam.glfview, "ex1.glf" ) ),\n+ ),\n+ "view" :\n+ (\n+ ("ex1.view", "samtools view ex1.bam > ex1.view"),\n+ ("pysam_ex1.view", (pysam.view, "ex1.bam" ) ),\n+ ),\n+ "view2" :\n+ (\n+ ("ex1.view", "samtools view -bT ex1.fa -o ex1.view2 ex1.sam"),\n+ # note that -o ex1.view2 throws exception.\n+ ("pysam_ex1.view", (pysam.view, "-bT ex1.fa -oex1.view2 ex1.sam" ) ),\n+ ),\n+ }\n+\n+ # some tests depend on others. The order specifies in which order\n+ # the samtools commands are executed.\n+ mOrder = (\'faidx\', \'import\', \'index\', \'pileup1\', \'pileup2\', \'glfview\', \'view\', \'view2\' )\n+\n+ def setUp( self ):\n+ \'\'\'setup tests. \n+\n+ For setup, all commands will be run before the first test is\n+ executed. 
Individual tests will then just compare the output\n+ files.\n+ \'\'\'\n+ if BinaryTest.first_time:\n+ # copy the source \n+ shutil.copy( "ex1.fa", "pysam_ex1.fa" )\n+\n+ '..b', self.reads):\n+ self.checkFieldEqual( other, denovo )\n+ self.assertEqual( other.compare( denovo ), 0 )\n+\n+ def testSAMPerRead( self ):\n+ \'\'\'check if individual reads are binary equal.\'\'\'\n+ infile = pysam.Samfile( self.samfile, "r")\n+\n+ others = list(infile)\n+ for denovo, other in zip( others, self.reads):\n+ self.checkFieldEqual( other, denovo )\n+ self.assertEqual( other.compare( denovo), 0 )\n+ \n+ def testBAMWholeFile( self ):\n+ \n+ tmpfilename = "tmp_%i.bam" % id(self)\n+\n+ outfile = pysam.Samfile( tmpfilename, "wb", header = self.header )\n+\n+ for x in self.reads: outfile.write( x )\n+ outfile.close()\n+ \n+ self.assertTrue( checkBinaryEqual( tmpfilename, self.bamfile ),\n+ "mismatch when construction BAM file, see %s %s" % (tmpfilename, self.bamfile))\n+ \n+ os.unlink( tmpfilename )\n+\n+\n+class TestDoubleFetch(unittest.TestCase):\n+ \'\'\'check if two iterators on the same bamfile are independent.\'\'\'\n+ \n+ def testDoubleFetch( self ):\n+\n+ samfile1 = pysam.Samfile(\'ex1.bam\', \'rb\')\n+\n+ for a,b in zip(samfile1.fetch(), samfile1.fetch()):\n+ self.assertEqual( a.compare( b ), 0 )\n+\n+ def testDoubleFetchWithRegion( self ):\n+\n+ samfile1 = pysam.Samfile(\'ex1.bam\', \'rb\')\n+ chr, start, stop = \'chr1\', 200, 3000000\n+ self.assertTrue(len(list(samfile1.fetch ( chr, start, stop))) > 0) #just making sure the test has something to catch\n+\n+ for a,b in zip(samfile1.fetch( chr, start, stop), samfile1.fetch( chr, start, stop)):\n+ self.assertEqual( a.compare( b ), 0 ) \n+\n+ def testDoubleFetchUntilEOF( self ):\n+\n+ samfile1 = pysam.Samfile(\'ex1.bam\', \'rb\')\n+\n+ for a,b in zip(samfile1.fetch( until_eof = True), \n+ samfile1.fetch( until_eof = True )):\n+ self.assertEqual( a.compare( b), 0 )\n+\n+class TestRemoteFileFTP(unittest.TestCase):\n+ \'\'\'test 
remote access.\n+\n+ \'\'\'\n+\n+ # Need to find an ftp server without password on standard\n+ # port.\n+\n+ url = "ftp://ftp.sanger.ac.uk/pub/rd/humanSequences/CV.bam"\n+ region = "1:1-1000"\n+\n+ def testFTPView( self ):\n+ result = pysam.view( self.url, self.region )\n+ self.assertEqual( len(result), 36 )\n+ \n+ def testFTPFetch( self ):\n+ samfile = pysam.Samfile(self.url, "rb") \n+ result = list(samfile.fetch( region = self.region ))\n+ self.assertEqual( len(result), 36 )\n+\n+class TestRemoteFileHTTP( unittest.TestCase):\n+\n+ url = "http://genserv.anat.ox.ac.uk/downloads/pysam/test/ex1.bam"\n+ region = "chr1:1-1000"\n+ local = "ex1.bam"\n+\n+ def testView( self ):\n+ self.assertRaises( pysam.SamtoolsError, pysam.view, self.url, self.region )\n+ \n+ def testFetch( self ):\n+ samfile = pysam.Samfile(self.url, "rb") \n+ result = list(samfile.fetch( region = self.region ))\n+ samfile_local = pysam.Samfile(self.local, "rb") \n+ ref = list(samfile_local.fetch( region = self.region ))\n+\n+ self.assertEqual( len(ref), len(result) )\n+ for x, y in zip(result, ref):\n+ self.assertEqual( x.compare( y ), 0 )\n+\n+ def testFetchAll( self ):\n+ samfile = pysam.Samfile(self.url, "rb") \n+ result = list(samfile.fetch())\n+ samfile_local = pysam.Samfile(self.local, "rb") \n+ ref = list(samfile_local.fetch() )\n+\n+ self.assertEqual( len(ref), len(result) )\n+ for x, y in zip(result, ref):\n+ self.assertEqual( x.compare( y ), 0 )\n+\n+\n+# TODOS\n+# 1. finish testing all properties within pileup objects\n+# 2. check exceptions and bad input problems (missing files, optional fields that aren\'t present, etc...)\n+\n+if __name__ == "__main__":\n+ # build data files\n+ print "building data files"\n+ subprocess.call( "make", shell=True)\n+ print "starting tests"\n+ unittest.main()\n' |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/tests/segfault_tests.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/tests/segfault_tests.py Thu Sep 07 17:55:18 2017 -0400 |
b |
#!/usr/bin/env python
'''unit testing code for pysam.'''

import pysam
import unittest
import os
import itertools
import subprocess
import shutil


class TestExceptions(unittest.TestCase):
    '''Verify that Samfile.fetch raises ValueError for out-of-range intervals.

    The tests open "ex1.bam" from the current working directory.
    '''

    def setUp(self):
        # A fresh handle per test; tearDown releases it again.
        self.samfile = pysam.Samfile("ex1.bam", "rb")

    def tearDown(self):
        self.samfile.close()

    def _assertFetchRejected(self, argument_sets):
        '''Assert that fetch(*args) raises ValueError for each args tuple, in order.'''
        for fetch_args in argument_sets:
            self.assertRaises(ValueError, self.samfile.fetch, *fetch_args)

    def testOutOfRangeNegativeNewFormat(self):
        # contig, start, end passed as separate arguments
        self._assertFetchRejected([
            ("chr1", 5, -10),
            ("chr1", 5, 0),
            ("chr1", -5, -10),
        ])

    def testOutOfRangeNegativeOldFormat(self):
        # interval passed as a single region string
        self._assertFetchRejected([
            ("chr1:-5-10",),
            ("chr1:-5-0",),
            ("chr1:-5--10",),
        ])

    def testOutOfRangeLargeNewFormat(self):
        self._assertFetchRejected([
            ("chr1", 99999999999999999, 999999999999999999),
        ])

    def testOutOfRangeLargeOldFormat(self):
        self._assertFetchRejected([
            ("chr1:99999999999999999-999999999999999999",),
        ])


if __name__ == "__main__":
    unittest.main()
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/tests/tabix_test.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/tests/tabix_test.py Thu Sep 07 17:55:18 2017 -0400 |
[ |
#!/usr/bin/env python
'''unit testing code for pysam.

Execute in the :file:`tests` directory as it requires the Makefile
and data files located there.
'''

import sys, os, shutil, gzip
import pysam
import unittest
import itertools
import subprocess


def checkBinaryEqual(filename1, filename2, chunk_size=65536):
    '''return true if the two files are binary equal.

    Files of different size are reported as unequal without being read.
    Otherwise both files are read in blocks of *chunk_size* bytes and
    compared block by block; the handles are closed even if reading fails.
    '''
    if os.path.getsize(filename1) != os.path.getsize(filename2):
        return False

    with open(filename1, "rb") as infile1:
        with open(filename2, "rb") as infile2:
            while True:
                chunk1 = infile1.read(chunk_size)
                chunk2 = infile2.read(chunk_size)
                if chunk1 != chunk2:
                    return False
                if not chunk1:
                    # both files hit end-of-file together
                    return True


class TestIndexing(unittest.TestCase):
    '''Index a copy of an already compressed file and compare the index
    with the reference index.'''

    filename = "example.gtf.gz"
    filename_idx = "example.gtf.gz.tbi"

    def setUp(self):
        # work on a private copy so that the reference data stay untouched
        self.tmpfilename = "tmp_%i.gtf.gz" % id(self)
        shutil.copyfile(self.filename, self.tmpfilename)

    def testIndexPreset(self):
        '''test indexing via preset.'''

        pysam.tabix_index(self.tmpfilename, preset="gff")
        # the comparison result has to be asserted, otherwise the test
        # can never fail
        self.assertTrue(
            checkBinaryEqual(self.tmpfilename + ".tbi", self.filename_idx),
            "index %s.tbi differs from %s" % (self.tmpfilename,
                                              self.filename_idx))

    def tearDown(self):
        os.unlink(self.tmpfilename)
        os.unlink(self.tmpfilename + ".tbi")


class TestCompression(unittest.TestCase):
    '''Index an uncompressed copy of the data and compare the compressed
    file and its index with the reference files.'''

    filename = "example.gtf.gz"
    filename_idx = "example.gtf.gz.tbi"

    def setUp(self):
        # write an uncompressed copy of the reference file
        self.tmpfilename = "tmp_%i.gtf" % id(self)
        infile = gzip.open(self.filename, "rb")
        try:
            with open(self.tmpfilename, "wb") as outfile:
                shutil.copyfileobj(infile, outfile)
        finally:
            infile.close()

    def testIndexPreset(self):
        '''test indexing via preset.'''

        pysam.tabix_index(self.tmpfilename, preset="gff")
        # NOTE(review): the .gz comparison requires byte-identical
        # compression output; relax this assertion if the compressor used
        # for the reference file differs from the one used here.
        self.assertTrue(
            checkBinaryEqual(self.tmpfilename + ".gz", self.filename),
            "compressed file %s.gz differs from %s" % (self.tmpfilename,
                                                       self.filename))
        self.assertTrue(
            checkBinaryEqual(self.tmpfilename + ".gz.tbi", self.filename_idx),
            "index %s.gz.tbi differs from %s" % (self.tmpfilename,
                                                 self.filename_idx))

    def tearDown(self):
        os.unlink(self.tmpfilename + ".gz")
        os.unlink(self.tmpfilename + ".gz.tbi")


class TestIteration(unittest.TestCase):
    '''Compare Tabixfile.fetch results with subsets computed directly from
    the lines of the data file.'''

    filename = "example.gtf.gz"

    def setUp(self):

        self.tabix = pysam.Tabixfile(self.filename)
        infile = gzip.open(self.filename)
        try:
            lines = infile.readlines()
        finally:
            infile.close()
        # creates index of contig, start, end, adds content without newline.
        self.compare = [
            (fields[0], int(fields[3]), int(fields[4]), content)
            for fields, content in [(y.split("\t"), y[:-1]) for y in lines]]

    def getSubset(self, contig=None, start=None, end=None):
        '''return the reference lines selected by contig/start/end.

        Without a contig all lines are returned. With a contig, a missing
        start or end leaves that side of the interval open.
        '''

        if contig is None:
            # all lines
            subset = [x[3] for x in self.compare]
        else:
            if start is not None and end is None:
                # until end of contig
                subset = [x[3] for x in self.compare
                          if x[0] == contig and x[2] > start]
            elif start is None and end is not None:
                # from start of contig
                subset = [x[3] for x in self.compare
                          if x[0] == contig and x[1] <= end]
            elif start is None and end is None:
                subset = [x[3] for x in self.compare if x[0] == contig]
            else:
                # all within interval
                subset = [x[3] for x in self.compare if x[0] == contig and
                          min(x[2], end) - max(x[1], start) > 0]

        return subset

    def checkPairwise(self, result, ref):
        '''assert that result and ref hold the same lines.

        Both lists are sorted in place before they are compared.
        '''

        result.sort()
        ref.sort()

        a = set(result)
        b = set(ref)

        self.assertEqual(len(result), len(ref),
                         "unexpected number of results: %i, expected %i, differences are %s: %s"
                         % (len(result), len(ref),
                            a.difference(b),
                            b.difference(a)))

        for x, d in enumerate(zip(result, ref)):

            self.assertEqual(d[0], d[1],
                             "unexpected results in pair %i: '%s', expected '%s'" %
                             (x,
                              d[0],
                              d[1]))

    def testAll(self):
        result = list(self.tabix.fetch())
        ref = self.getSubset()
        self.checkPairwise(result, ref)

    def testPerContig(self):
        # each contig is requested twice, in alternation
        for contig in ("chr1", "chr2", "chr1", "chr2"):
            result = list(self.tabix.fetch(contig))
            ref = self.getSubset(contig)
            self.checkPairwise(result, ref)

    def testPerContigToEnd(self):

        end = None
        for contig in ("chr1", "chr2", "chr1", "chr2"):
            for start in range(0, 200000, 1000):
                result = list(self.tabix.fetch(contig, start, end))
                ref = self.getSubset(contig, start, end)
                self.checkPairwise(result, ref)

    def testPerContigFromStart(self):

        start = None
        for contig in ("chr1", "chr2", "chr1", "chr2"):
            for end in range(0, 200000, 1000):
                result = list(self.tabix.fetch(contig, start, end))
                ref = self.getSubset(contig, start, end)
                self.checkPairwise(result, ref)

    def testPerContigWithoutRange(self):
        # This method used to be a second definition of testPerContig and
        # replaced the first one in the class; it has its own name now so
        # that both tests are run.
        start, end = None, None
        for contig in ("chr1", "chr2", "chr1", "chr2"):
            result = list(self.tabix.fetch(contig, start, end))
            ref = self.getSubset(contig, start, end)
            self.checkPairwise(result, ref)

    def testPerInterval(self):

        start, end = None, None
        for contig in ("chr1", "chr2", "chr1", "chr2"):
            for start in range(0, 200000, 2000):
                for end in range(start, start + 2000, 500):
                    result = list(self.tabix.fetch(contig, start, end))
                    ref = self.getSubset(contig, start, end)
                    self.checkPairwise(result, ref)

    def testInvalidIntervals(self):

        self.assertRaises(ValueError, self.tabix.fetch, "chr1", 0, -10)
        self.assertRaises(ValueError, self.tabix.fetch, "chr1", -10, 200)
        self.assertRaises(ValueError, self.tabix.fetch, "chr1", 200, 0)
        self.assertRaises(ValueError, self.tabix.fetch, "chr1", -10, -20)
        self.assertRaises(ValueError, self.tabix.fetch, "chrUn")

    def testGetContigs(self):
        self.assertEqual(sorted(self.tabix.contigs), ["chr1", "chr2"])
        # check that contigs is read-only
        self.assertRaises(AttributeError, setattr, self.tabix, "contigs",
                          ["chr1", "chr2"])


class TestParser(unittest.TestCase):
    '''Compare parsed Tabixfile.fetch records with the tab-split lines of
    the data file.'''

    filename = "example.gtf.gz"

    def setUp(self):

        self.tabix = pysam.Tabixfile(self.filename)
        infile = gzip.open(self.filename, "r")
        try:
            self.compare = [x[:-1].split("\t") for x in infile]
        finally:
            infile.close()

    def testGTF(self):

        for x, r in enumerate(self.tabix.fetch(parser=pysam.asGTF())):
            self.assertEqual("\t".join(self.compare[x]), str(r))

    def testTuple(self):

        for x, r in enumerate(self.tabix.fetch(parser=pysam.asTuple())):
            self.assertEqual(self.compare[x], list(r))

            self.assertEqual(len(self.compare[x]), len(r))
            for c in range(0, len(r)):
                self.assertEqual(self.compare[x][c], r[c])


if __name__ == "__main__":
    unittest.main()
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/version.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/pysam/version.py Thu Sep 07 17:55:18 2017 -0400 |
b |
@@ -0,0 +1,4 @@ +# pysam versioning information +__version__ = "0.3.1" +__samtools_version__ = "0.1.8" +__tabix_version__ = "0.2.1" |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/pysam/version.pyc |
b |
Binary file chimerascan/pysam/version.pyc has changed |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/test/test_homology.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/test/test_homology.py Thu Sep 07 17:55:18 2017 -0400 |
[ |
'''
Created on Jul 21, 2011

@author: mkiyer
'''
import unittest

from chimerascan.lib.seq import calc_homology


class TestLibraries(unittest.TestCase):
    '''Unit tests for helpers in chimerascan.lib.seq.'''

    def testHomology(self):
        '''Check calc_homology on a 16-base sequence against three variants.

        The third argument of calc_homology is varied from 0 to 3 for the
        last variant (presumably the number of tolerated mismatches --
        TODO confirm against chimerascan.lib.seq).
        '''
        a = "AAAAGGGGTTTTCCCC"
        # identical sequences
        b = "AAAAGGGGTTTTCCCC"
        self.assertEqual(calc_homology(a, b, 0), 16)
        # difference in the last base only
        b = "AAAAGGGGTTTTCCCG"
        self.assertEqual(calc_homology(a, b, 0), 15)
        # three consecutive differences starting at the fourth base
        b = "AAATTTGGTTTTCCCC"
        self.assertEqual(calc_homology(a, b, 0), 3)
        self.assertEqual(calc_homology(a, b, 1), 4)
        self.assertEqual(calc_homology(a, b, 2), 5)
        self.assertEqual(calc_homology(a, b, 3), 16)


if __name__ == "__main__":
    #import sys;sys.argv = ['', 'Test.testName']
    unittest.main()
b |
diff -r 000000000000 -r d85dea371064 chimerascan/tools/chimerascan_html_table.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/tools/chimerascan_html_table.py Thu Sep 07 17:55:18 2017 -0400 |
[ |
#!/usr/bin/env python
'''
Created on Feb 12, 2011

@author: mkiyer

chimerascan: chimeric transcript discovery using RNA-seq

Copyright (C) 2011 Matthew Iyer

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
'''
import logging
import sys
from jinja2 import Environment, PackageLoader

# local imports
from chimerascan.lib.chimera import Chimera, ChimeraTypes

# setup html template environment
env = Environment(loader=PackageLoader("chimerascan", "tools"))

# URLs for special links
GENECARDS_URL = "http://www.genecards.org/cgi-bin/carddisp.pl?gene="
UCSC_POS_URL = "http://genome.ucsc.edu/cgi-bin/hgTracks?"


def get_header_row():
    '''Return the column titles of the HTML table, in display order.

    The order matches the cells produced by generate_row_data.
    '''
    return ["5' genomic region",
            "5' strand",
            "3' genomic region",
            "3' strand",
            "Cluster ID",
            "5' transcripts", "3' transcripts",
            "5' genes", "3' genes",
            "Type", "5' -> 3' distance",
            "Total frags",
            "Spanning frags",
            "Unique alignment positions",
            "Isoform fraction 5'",
            "Isoform fraction 3'",
            "Breakpoint spanning reads",
            "Chimera IDs"]


def generate_row_data(line_iter, show_read_throughs,
                      header_fields):
    '''Yield one list of (kind, value) cells per line of a chimera table.

    line_iter: iterable of tab-delimited data lines (header already consumed).
    show_read_throughs: when false, lines whose type column equals
        ChimeraTypes.READTHROUGH are skipped.
    header_fields: column names of the input; must contain "type",
        "transcript_ids_5p", "transcript_ids_3p", "genes5p", "genes3p",
        "breakpoint_spanning_reads" and "chimera_ids" (ValueError otherwise).

    Each cell is a tuple whose first item is one of "ucsc_pos", "string",
    "genecards" or "list" (presumably a rendering hint for the template --
    TODO confirm against table_template.html) and whose second item is a
    string or a list of strings.
    '''
    type_col_num = header_fields.index("type")
    txs5p_col_num = header_fields.index("transcript_ids_5p")
    txs3p_col_num = header_fields.index("transcript_ids_3p")
    genes5p_col_num = header_fields.index("genes5p")
    genes3p_col_num = header_fields.index("genes3p")
    spanning_reads_col_num = header_fields.index("breakpoint_spanning_reads")
    chimera_ids_col_num = header_fields.index("chimera_ids")
    for line in line_iter:
        fields = line.strip().split('\t')
        if ((not show_read_throughs) and
            (fields[type_col_num] == ChimeraTypes.READTHROUGH)):
            continue
        newfields = []
        # 5' position (chr12:65432) and strand
        newfields.append(("ucsc_pos", ["%s:%s-%s" % (fields[0], fields[1], fields[2])]))
        newfields.append(("string", fields[8]))
        # 3' position (chr12:76543) and strand
        newfields.append(("ucsc_pos", ["%s:%s-%s" % (fields[3], fields[4], fields[5])]))
        newfields.append(("string", fields[9]))
        # cluster id
        newfields.append(("string", fields[6]))
        # transcripts
        newfields.append(("ucsc_pos", fields[txs5p_col_num].split(",")))
        newfields.append(("ucsc_pos", fields[txs3p_col_num].split(",")))
        # genes
        newfields.append(("genecards", fields[genes5p_col_num].split(",")))
        newfields.append(("genecards", fields[genes3p_col_num].split(",")))
        # chimera type: use the column located through the header, the same
        # column the read-through filter above looks at
        newfields.append(("string", fields[type_col_num]))
        # distance
        newfields.append(("string", fields[15]))
        # total frags
        newfields.append(("string", fields[16]))
        # spanning frags
        newfields.append(("string", fields[17]))
        # unique alignment positions
        newfields.append(("string", fields[18]))
        # isoform fraction 5p
        newfields.append(("string", fields[19]))
        # isoform fraction 3p
        newfields.append(("string", fields[20]))
        # breakpoint spanning reads (column located through the header)
        newfields.append(("list", fields[spanning_reads_col_num].split(",")))
        # chimera ids (column located through the header)
        newfields.append(("list", fields[chimera_ids_col_num].split(",")))
        yield newfields


def make_html_table(input_file,
                    ucsc_db,
                    show_read_throughs=False):
    '''Render the chimera table in input_file as an HTML string.

    input_file: path of a tab-delimited file whose first line is the header;
        the first character of the header line is dropped before splitting.
    ucsc_db: value of the "db" parameter of the UCSC position links.
    show_read_throughs: passed on to generate_row_data.

    Returns the text rendered from the "table_template.html" template.
    '''
    ucsc_pos_url = UCSC_POS_URL + "db=%s&position=" % (ucsc_db)
    # The rows are generated lazily, so the file has to stay open until the
    # template has been rendered.
    with open(input_file) as line_iter:
        header_line = next(line_iter)[1:]
        header_fields = header_line.strip().split('\t')
        row_iter = generate_row_data(line_iter,
                                     show_read_throughs=show_read_throughs,
                                     header_fields=header_fields)
        t = env.get_template("table_template.html")
        htmlstring = t.render(colnames=get_header_row(),
                              ucsc_pos_url=ucsc_pos_url,
                              genecards_url=GENECARDS_URL,
                              rows=row_iter)
    return htmlstring


def main():
    '''Command line entry point: write the HTML table to a file or stdout.'''
    from optparse import OptionParser
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = OptionParser("usage: %prog [options] <chimeras.txt>")
    parser.add_option("-o", dest="output_file", default=None,
                      help="output file [default=stdout]")
    parser.add_option("--ucsc-db", dest="ucsc_db", default="hg19",
                      help="UCSC Genome Version (specific to organism and "
                      "revision e.g. 'hg19')")
    parser.add_option("--read-throughs", dest="show_read_throughs",
                      action="store_true", default=False,
                      help="include read-through chimeras in output "
                      "[default=%default]")
    options, args = parser.parse_args()
    # report a usage error instead of failing with an IndexError
    if len(args) < 1:
        parser.error("Incorrect number of command line arguments")
    input_file = args[0]
    if options.output_file is None:
        fileh = sys.stdout
    else:
        fileh = open(options.output_file, "w")
    try:
        res = make_html_table(input_file,
                              ucsc_db=options.ucsc_db,
                              show_read_throughs=options.show_read_throughs)
        print >>fileh, res
    finally:
        # never close stdout, only a file opened here
        if options.output_file is not None:
            fileh.close()


if __name__ == '__main__':
    main()
b |
diff -r 000000000000 -r d85dea371064 chimerascan/tools/gtf_to_genepred.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/tools/gtf_to_genepred.py Thu Sep 07 17:55:18 2017 -0400 |
[ |
#!/usr/bin/env python
'''
Created on Feb 6, 2012

@author: mkiyer

chimerascan: chimeric transcript discovery using RNA-seq

Copyright (C) 2012 Matthew Iyer

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
'''
import logging
import collections
import operator
import os
import sys
from optparse import OptionParser

from chimerascan.lib import gtf


def gtf_to_genepred(gtf_file, genepred_file):
    '''Write one tab-delimited line per transcript found in a GTF file.

    gtf_file: path of the GTF input; only "exon" features are used.
    genepred_file: path of the output file (overwritten).

    Exons are grouped by seqid and transcript_id. Chromosomes are written
    in sorted order and transcripts within a chromosome by their smallest
    exon start. The columns are: transcript id, chromosome, strand,
    start of the first exon, end of the last exon (by start order),
    the first exon start twice more, exon count, comma-terminated exon
    starts, comma-terminated exon ends, and the gene_name attribute.
    '''
    # group by chromosome, then by transcript id
    logging.info("Reading GTF file")
    chrom_exon_features = collections.defaultdict(
        lambda: collections.defaultdict(list))
    with open(gtf_file) as gtf_fh:
        for feature in gtf.GTFFeature.parse(gtf_fh):
            if feature.feature_type == "exon":
                transcript_id = feature.attrs["transcript_id"]
                chrom_exon_features[feature.seqid][transcript_id].append(feature)
    # convert to genepred
    logging.info("Writing GenePred file")
    with open(genepred_file, "w") as outfh:
        for chrom in sorted(chrom_exon_features):
            logging.debug("Chromosome %s" % (chrom))
            exon_features = sorted(
                chrom_exon_features[chrom].values(),
                key=lambda exon_list: min(x.start for x in exon_list))
            for exons in exon_features:
                # sort exons
                exons.sort(key=operator.attrgetter('start'))
                chrom = exons[0].seqid
                tx_start = exons[0].start
                tx_end = exons[-1].end
                strand = exons[0].strand
                transcript_id = exons[0].attrs['transcript_id']
                gene_name = exons[0].attrs['gene_name']
                # write genepred fields
                # NOTE(review): columns 6 and 7 both repeat tx_start;
                # presumably these are the coding start/end slots left
                # without a coding region -- confirm.
                fields = [transcript_id, chrom, strand, str(tx_start),
                          str(tx_end), str(tx_start), str(tx_start),
                          str(len(exons)),
                          ",".join(map(str, [x.start for x in exons])) + ",",
                          ",".join(map(str, [x.end for x in exons])) + ",",
                          gene_name]
                outfh.write("\t".join(fields) + "\n")


def main():
    '''Command line entry point; returns 0 after a successful conversion.'''
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = OptionParser("usage: %prog <input.gtf> <genepred_output.txt>")
    options, args = parser.parse_args()
    # check command line arguments
    if len(args) < 2:
        parser.error("Incorrect number of command line arguments")
    gtf_file = args[0]
    genepred_file = args[1]
    # check that input files exist
    if not os.path.isfile(gtf_file):
        parser.error("GTF file '%s' not found" % (gtf_file))
    gtf_to_genepred(gtf_file, genepred_file)
    return 0


if __name__ == '__main__':
    sys.exit(main())
b |
diff -r 000000000000 -r d85dea371064 chimerascan/tools/make_false_positive_file.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/tools/make_false_positive_file.py Thu Sep 07 17:55:18 2017 -0400 |
[ |
#!/usr/bin/env python
'''
Created on Jul 6, 2011

@author: mkiyer
'''
import logging
import sys
import collections

from chimerascan.lib.chimera import Chimera


def main():
    '''Count how often each chimera occurs across the input files and write
    the recurrent ones.

    A chimera is identified by the tuple (5' transcript name, 5' end,
    3' transcript name, 3' start). Every key counted at least N times
    (option -n) is written as one tab-delimited line to the output file
    (option -o) or to stdout.
    '''
    from optparse import OptionParser
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = OptionParser("usage: %prog [options] <chimeras.txt> [<chimeras2.txt> <chimeras3.txt> ...]")
    parser.add_option("-o", dest="output_file", default=None,
                      help="output file [default=stdout]")
    parser.add_option("-n", dest="num_files", type="int", default=1,
                      help="chimera must be recurrent in N samples "
                      "to be considered a false positive "
                      "[default=%default]")
    options, args = parser.parse_args()
    input_files = args
    false_pos_chimeras = collections.defaultdict(int)
    for input_file in input_files:
        logging.info("Processing file %s" % (input_file))
        num_chimeras = 0
        # close each input as soon as it has been parsed
        with open(input_file) as input_fileh:
            for c in Chimera.parse(input_fileh):
                key = (c.partner5p.tx_name, c.partner5p.end,
                       c.partner3p.tx_name, c.partner3p.start)
                false_pos_chimeras[key] += 1
                num_chimeras += 1
        logging.info("\tchimeras in file: %d" % (num_chimeras))
        logging.info("\tcurrent false positive candidates: %d" % (len(false_pos_chimeras)))
    if options.output_file is None:
        fileh = sys.stdout
    else:
        fileh = open(options.output_file, "w")
    try:
        for key, recurrence in false_pos_chimeras.items():
            if recurrence >= options.num_files:
                fileh.write('\t'.join(map(str, key)) + '\n')
    finally:
        # never close stdout, only a file opened here
        if options.output_file is not None:
            fileh.close()


if __name__ == '__main__':
    main()
b |
diff -r 000000000000 -r d85dea371064 chimerascan/tools/sortable.js --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/tools/sortable.js Thu Sep 07 17:55:18 2017 -0400 |
[ |
b'@@ -0,0 +1,323 @@\n+/*\n+Table sorting script by Joost de Valk, check it out at http://www.joostdevalk.nl/code/sortable-table/.\n+Based on a script from http://www.kryogenix.org/code/browser/sorttable/.\n+Distributed under the MIT license: http://www.kryogenix.org/code/browser/licence.html .\n+\n+Copyright (c) 1997-2007 Stuart Langridge, Joost de Valk.\n+\n+Version 1.5.7\n+*/\n+\n+/* You can change these values */\n+var image_path = "http://www.joostdevalk.nl/code/sortable-table/";\n+var image_up = "arrow-up.gif";\n+var image_down = "arrow-down.gif";\n+var image_none = "arrow-none.gif";\n+var europeandate = true;\n+var alternate_row_colors = true;\n+\n+/* Don\'t change anything below this unless you know what you\'re doing */\n+addEvent(window, "load", sortables_init);\n+\n+var SORT_COLUMN_INDEX;\n+var thead = false;\n+\n+function sortables_init() {\n+\t// Find all tables with class sortable and make them sortable\n+\tif (!document.getElementsByTagName) return;\n+\ttbls = document.getElementsByTagName("table");\n+\tfor (ti=0;ti<tbls.length;ti++) {\n+\t\tthisTbl = tbls[ti];\n+\t\tif (((\' \'+thisTbl.className+\' \').indexOf("sortable") != -1) && (thisTbl.id)) {\n+\t\t\tts_makeSortable(thisTbl);\n+\t\t}\n+\t}\n+}\n+\n+function ts_makeSortable(t) {\n+\tif (t.rows && t.rows.length > 0) {\n+\t\tif (t.tHead && t.tHead.rows.length > 0) {\n+\t\t\tvar firstRow = t.tHead.rows[t.tHead.rows.length-1];\n+\t\t\tthead = true;\n+\t\t} else {\n+\t\t\tvar firstRow = t.rows[0];\n+\t\t}\n+\t}\n+\tif (!firstRow) return;\n+\t\n+\t// We have a first row: assume it\'s the header, and make its contents clickable links\n+\tfor (var i=0;i<firstRow.cells.length;i++) {\n+\t\tvar cell = firstRow.cells[i];\n+\t\tvar txt = ts_getInnerText(cell);\n+\t\tif (cell.className != "unsortable" && cell.className.indexOf("unsortable") == -1) {\n+\t\t\tcell.innerHTML = \'<a href="#" class="sortheader" onclick="ts_resortTable(this, \'+i+\');return false;">\'+txt+\'<span class="sortarrow"> <img src="\'+ 
image_path + image_none + \'" alt="↓"/></span></a>\';\n+\t\t}\n+\t}\n+\tif (alternate_row_colors) {\n+\t\talternate(t);\n+\t}\n+}\n+\n+function ts_getInnerText(el) {\n+\tif (typeof el == "string") return el;\n+\tif (typeof el == "undefined") { return el };\n+\tif (el.innerText) return el.innerText;\t//Not needed but it is faster\n+\tvar str = "";\n+\t\n+\tvar cs = el.childNodes;\n+\tvar l = cs.length;\n+\tfor (var i = 0; i < l; i++) {\n+\t\tswitch (cs[i].nodeType) {\n+\t\t\tcase 1: //ELEMENT_NODE\n+\t\t\t\tstr += ts_getInnerText(cs[i]);\n+\t\t\t\tbreak;\n+\t\t\tcase 3:\t//TEXT_NODE\n+\t\t\t\tstr += cs[i].nodeValue;\n+\t\t\t\tbreak;\n+\t\t}\n+\t}\n+\treturn str;\n+}\n+\n+function ts_resortTable(lnk, clid) {\n+\tvar span;\n+\tfor (var ci=0;ci<lnk.childNodes.length;ci++) {\n+\t\tif (lnk.childNodes[ci].tagName && lnk.childNodes[ci].tagName.toLowerCase() == \'span\') span = lnk.childNodes[ci];\n+\t}\n+\tvar spantext = ts_getInnerText(span);\n+\tvar td = lnk.parentNode;\n+\tvar column = clid || td.cellIndex;\n+\tvar t = getParent(td,\'TABLE\');\n+\t// Work out a type for the column\n+\tif (t.rows.length <= 1) return;\n+\tvar itm = "";\n+\tvar i = 0;\n+\twhile (itm == "" && i < t.tBodies[0].rows.length) {\n+\t\tvar itm = ts_getInnerText(t.tBodies[0].rows[i].cells[column]);\n+\t\titm = trim(itm);\n+\t\tif (itm.substr(0,4) == "<!--" || itm.length == 0) {\n+\t\t\titm = "";\n+\t\t}\n+\t\ti++;\n+\t}\n+\tif (itm == "") return; \n+\tsortfn = ts_sort_caseinsensitive;\n+\tif (itm.match(/^\\d\\d[\\/\\.-][a-zA-z][a-zA-Z][a-zA-Z][\\/\\.-]\\d\\d\\d\\d$/)) sortfn = ts_sort_date;\n+\tif (itm.match(/^\\d\\d[\\/\\.-]\\d\\d[\\/\\.-]\\d\\d\\d{2}?$/)) sortfn = ts_sort_date;\n+\tif (itm.match(/^-?[\xa3$\x80\xdb\xa2\xb4]\\d/)) sortfn = ts_sort_numeric;\n+\tif (itm.match(/^-?(\\d+[,\\.]?)+(E[-+][\\d]+)?%?$/)) sortfn = ts_sort_numeric;\n+\tSORT_COLUMN_INDEX = column;\n+\tvar firstRow = new Array();\n+\tvar newRows = new Array();\n+\tfor (k=0;k<t.tBodies.length;k++) {\n+\t\tfor 
(i=0;i<t.tBodies[k].rows[0].length;i++) { \n+\t\t\tfirstRow[i] = t.tBodies[k].rows[0][i]; \n+\t\t}\n+\t}\n+\tfor (k=0;k<t.tBodies.length;k++) {\n+\t\tif (!thead) {\n+\t\t\t// Skip the first row\n+\t\t\tfor (j=1;j<t.tBodies[k].rows.length;j++) { \n+\t\t\t\tnewRows[j-1] = t.tBodies[k].rows[j];\n+\t\t\t}\n+\t\t} else {\n+\t\t\t// Do NOT skip the f'..b'{\n+\t\tmtstr = date.substr(3,3);\n+\t\tmtstr = mtstr.toLowerCase();\n+\t\tswitch(mtstr) {\n+\t\t\tcase "jan": var mt = "01"; break;\n+\t\t\tcase "feb": var mt = "02"; break;\n+\t\t\tcase "mar": var mt = "03"; break;\n+\t\t\tcase "apr": var mt = "04"; break;\n+\t\t\tcase "may": var mt = "05"; break;\n+\t\t\tcase "jun": var mt = "06"; break;\n+\t\t\tcase "jul": var mt = "07"; break;\n+\t\t\tcase "aug": var mt = "08"; break;\n+\t\t\tcase "sep": var mt = "09"; break;\n+\t\t\tcase "oct": var mt = "10"; break;\n+\t\t\tcase "nov": var mt = "11"; break;\n+\t\t\tcase "dec": var mt = "12"; break;\n+\t\t\t// default: var mt = "00";\n+\t\t}\n+\t\tdt = date.substr(7,4)+mt+date.substr(0,2);\n+\t\treturn dt;\n+\t} else if (date.length == 10) {\n+\t\tif (europeandate == false) {\n+\t\t\tdt = date.substr(6,4)+date.substr(0,2)+date.substr(3,2);\n+\t\t\treturn dt;\n+\t\t} else {\n+\t\t\tdt = date.substr(6,4)+date.substr(3,2)+date.substr(0,2);\n+\t\t\treturn dt;\n+\t\t}\n+\t} else if (date.length == 8) {\n+\t\tyr = date.substr(6,2);\n+\t\tif (parseInt(yr) < 50) { \n+\t\t\tyr = \'20\'+yr; \n+\t\t} else { \n+\t\t\tyr = \'19\'+yr; \n+\t\t}\n+\t\tif (europeandate == true) {\n+\t\t\tdt = yr+date.substr(3,2)+date.substr(0,2);\n+\t\t\treturn dt;\n+\t\t} else {\n+\t\t\tdt = yr+date.substr(0,2)+date.substr(3,2);\n+\t\t\treturn dt;\n+\t\t}\n+\t}\n+\treturn dt;\n+}\n+\n+function ts_sort_date(a,b) {\n+\tdt1 = sort_date(ts_getInnerText(a.cells[SORT_COLUMN_INDEX]));\n+\tdt2 = sort_date(ts_getInnerText(b.cells[SORT_COLUMN_INDEX]));\n+\t\n+\tif (dt1==dt2) {\n+\t\treturn 0;\n+\t}\n+\tif (dt1<dt2) { \n+\t\treturn -1;\n+\t}\n+\treturn 1;\n+}\n+function 
ts_sort_numeric(a,b) {\n+\tvar aa = ts_getInnerText(a.cells[SORT_COLUMN_INDEX]);\n+\taa = clean_num(aa);\n+\tvar bb = ts_getInnerText(b.cells[SORT_COLUMN_INDEX]);\n+\tbb = clean_num(bb);\n+\treturn compare_numeric(aa,bb);\n+}\n+function compare_numeric(a,b) {\n+\tvar a = parseFloat(a);\n+\ta = (isNaN(a) ? 0 : a);\n+\tvar b = parseFloat(b);\n+\tb = (isNaN(b) ? 0 : b);\n+\treturn a - b;\n+}\n+function ts_sort_caseinsensitive(a,b) {\n+\taa = ts_getInnerText(a.cells[SORT_COLUMN_INDEX]).toLowerCase();\n+\tbb = ts_getInnerText(b.cells[SORT_COLUMN_INDEX]).toLowerCase();\n+\tif (aa==bb) {\n+\t\treturn 0;\n+\t}\n+\tif (aa<bb) {\n+\t\treturn -1;\n+\t}\n+\treturn 1;\n+}\n+function ts_sort_default(a,b) {\n+\taa = ts_getInnerText(a.cells[SORT_COLUMN_INDEX]);\n+\tbb = ts_getInnerText(b.cells[SORT_COLUMN_INDEX]);\n+\tif (aa==bb) {\n+\t\treturn 0;\n+\t}\n+\tif (aa<bb) {\n+\t\treturn -1;\n+\t}\n+\treturn 1;\n+}\n+function addEvent(elm, evType, fn, useCapture)\n+// addEvent and removeEvent\n+// cross-browser event handling for IE5+,\tNS6 and Mozilla\n+// By Scott Andrew\n+{\n+\tif (elm.addEventListener){\n+\t\telm.addEventListener(evType, fn, useCapture);\n+\t\treturn true;\n+\t} else if (elm.attachEvent){\n+\t\tvar r = elm.attachEvent("on"+evType, fn);\n+\t\treturn r;\n+\t} else {\n+\t\talert("Handler could not be removed");\n+\t}\n+}\n+function clean_num(str) {\n+\tstr = str.replace(new RegExp(/[^-?0-9.]/g),"");\n+\treturn str;\n+}\n+function trim(s) {\n+\treturn s.replace(/^\\s+|\\s+$/g, "");\n+}\n+function alternate(table) {\n+\t// Take object table and get all it\'s tbodies.\n+\tvar tableBodies = table.getElementsByTagName("tbody");\n+\t// Loop through these tbodies\n+\tfor (var i = 0; i < tableBodies.length; i++) {\n+\t\t// Take the tbody, and get all it\'s rows\n+\t\tvar tableRows = tableBodies[i].getElementsByTagName("tr");\n+\t\t// Loop through these rows\n+\t\t// Start at 1 because we want to leave the heading row untouched\n+\t\tfor (var j = 0; j < tableRows.length; j++) 
{\n+\t\t\t// Check if j is even, and apply classes for both possible results\n+\t\t\tif ( (j % 2) == 0 ) {\n+\t\t\t\tif ( !(tableRows[j].className.indexOf(\'odd\') == -1) ) {\n+\t\t\t\t\ttableRows[j].className = tableRows[j].className.replace(\'odd\', \'even\');\n+\t\t\t\t} else {\n+\t\t\t\t\tif ( tableRows[j].className.indexOf(\'even\') == -1 ) {\n+\t\t\t\t\t\ttableRows[j].className += " even";\n+\t\t\t\t\t}\n+\t\t\t\t}\n+\t\t\t} else {\n+\t\t\t\tif ( !(tableRows[j].className.indexOf(\'even\') == -1) ) {\n+\t\t\t\t\ttableRows[j].className = tableRows[j].className.replace(\'even\', \'odd\');\n+\t\t\t\t} else {\n+\t\t\t\t\tif ( tableRows[j].className.indexOf(\'odd\') == -1 ) {\n+\t\t\t\t\t\ttableRows[j].className += " odd";\n+\t\t\t\t\t}\n+\t\t\t\t}\n+\t\t\t} \n+\t\t}\n+\t}\n+}\n' |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/tools/sortable_us.js --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/tools/sortable_us.js Thu Sep 07 17:55:18 2017 -0400 |
[ |
b'@@ -0,0 +1,323 @@\n+/*\n+Table sorting script by Joost de Valk, check it out at http://www.joostdevalk.nl/code/sortable-table/.\n+Based on a script from http://www.kryogenix.org/code/browser/sorttable/.\n+Distributed under the MIT license: http://www.kryogenix.org/code/browser/licence.html .\n+\n+Copyright (c) 1997-2007 Stuart Langridge, Joost de Valk.\n+\n+Version 1.5.7\n+*/\n+\n+/* You can change these values */\n+var image_path = "http://www.joostdevalk.nl/code/sortable-table/";\n+var image_up = "arrow-up.gif";\n+var image_down = "arrow-down.gif";\n+var image_none = "arrow-none.gif";\n+var europeandate = false;\n+var alternate_row_colors = true;\n+\n+/* Don\'t change anything below this unless you know what you\'re doing */\n+addEvent(window, "load", sortables_init);\n+\n+var SORT_COLUMN_INDEX;\n+var thead = false;\n+\n+function sortables_init() {\n+\t// Find all tables with class sortable and make them sortable\n+\tif (!document.getElementsByTagName) return;\n+\ttbls = document.getElementsByTagName("table");\n+\tfor (ti=0;ti<tbls.length;ti++) {\n+\t\tthisTbl = tbls[ti];\n+\t\tif (((\' \'+thisTbl.className+\' \').indexOf("sortable") != -1) && (thisTbl.id)) {\n+\t\t\tts_makeSortable(thisTbl);\n+\t\t}\n+\t}\n+}\n+\n+function ts_makeSortable(t) {\n+\tif (t.rows && t.rows.length > 0) {\n+\t\tif (t.tHead && t.tHead.rows.length > 0) {\n+\t\t\tvar firstRow = t.tHead.rows[t.tHead.rows.length-1];\n+\t\t\tthead = true;\n+\t\t} else {\n+\t\t\tvar firstRow = t.rows[0];\n+\t\t}\n+\t}\n+\tif (!firstRow) return;\n+\t\n+\t// We have a first row: assume it\'s the header, and make its contents clickable links\n+\tfor (var i=0;i<firstRow.cells.length;i++) {\n+\t\tvar cell = firstRow.cells[i];\n+\t\tvar txt = ts_getInnerText(cell);\n+\t\tif (cell.className != "unsortable" && cell.className.indexOf("unsortable") == -1) {\n+\t\t\tcell.innerHTML = \'<a href="#" class="sortheader" onclick="ts_resortTable(this, \'+i+\');return false;">\'+txt+\'<span class="sortarrow"> <img src="\'+ 
image_path + image_none + \'" alt="↓"/></span></a>\';\n+\t\t}\n+\t}\n+\tif (alternate_row_colors) {\n+\t\talternate(t);\n+\t}\n+}\n+\n+function ts_getInnerText(el) {\n+\tif (typeof el == "string") return el;\n+\tif (typeof el == "undefined") { return el };\n+\tif (el.innerText) return el.innerText;\t//Not needed but it is faster\n+\tvar str = "";\n+\t\n+\tvar cs = el.childNodes;\n+\tvar l = cs.length;\n+\tfor (var i = 0; i < l; i++) {\n+\t\tswitch (cs[i].nodeType) {\n+\t\t\tcase 1: //ELEMENT_NODE\n+\t\t\t\tstr += ts_getInnerText(cs[i]);\n+\t\t\t\tbreak;\n+\t\t\tcase 3:\t//TEXT_NODE\n+\t\t\t\tstr += cs[i].nodeValue;\n+\t\t\t\tbreak;\n+\t\t}\n+\t}\n+\treturn str;\n+}\n+\n+function ts_resortTable(lnk, clid) {\n+\tvar span;\n+\tfor (var ci=0;ci<lnk.childNodes.length;ci++) {\n+\t\tif (lnk.childNodes[ci].tagName && lnk.childNodes[ci].tagName.toLowerCase() == \'span\') span = lnk.childNodes[ci];\n+\t}\n+\tvar spantext = ts_getInnerText(span);\n+\tvar td = lnk.parentNode;\n+\tvar column = clid || td.cellIndex;\n+\tvar t = getParent(td,\'TABLE\');\n+\t// Work out a type for the column\n+\tif (t.rows.length <= 1) return;\n+\tvar itm = "";\n+\tvar i = 0;\n+\twhile (itm == "" && i < t.tBodies[0].rows.length) {\n+\t\tvar itm = ts_getInnerText(t.tBodies[0].rows[i].cells[column]);\n+\t\titm = trim(itm);\n+\t\tif (itm.substr(0,4) == "<!--" || itm.length == 0) {\n+\t\t\titm = "";\n+\t\t}\n+\t\ti++;\n+\t}\n+\tif (itm == "") return; \n+\tsortfn = ts_sort_caseinsensitive;\n+\tif (itm.match(/^\\d\\d[\\/\\.-][a-zA-z][a-zA-Z][a-zA-Z][\\/\\.-]\\d\\d\\d\\d$/)) sortfn = ts_sort_date;\n+\tif (itm.match(/^\\d\\d[\\/\\.-]\\d\\d[\\/\\.-]\\d\\d\\d{2}?$/)) sortfn = ts_sort_date;\n+\tif (itm.match(/^-?[\xa3$\x80\xdb\xa2\xb4]\\d/)) sortfn = ts_sort_numeric;\n+\tif (itm.match(/^-?(\\d+[,\\.]?)+(E[-+][\\d]+)?%?$/)) sortfn = ts_sort_numeric;\n+\tSORT_COLUMN_INDEX = column;\n+\tvar firstRow = new Array();\n+\tvar newRows = new Array();\n+\tfor (k=0;k<t.tBodies.length;k++) {\n+\t\tfor 
(i=0;i<t.tBodies[k].rows[0].length;i++) { \n+\t\t\tfirstRow[i] = t.tBodies[k].rows[0][i]; \n+\t\t}\n+\t}\n+\tfor (k=0;k<t.tBodies.length;k++) {\n+\t\tif (!thead) {\n+\t\t\t// Skip the first row\n+\t\t\tfor (j=1;j<t.tBodies[k].rows.length;j++) { \n+\t\t\t\tnewRows[j-1] = t.tBodies[k].rows[j];\n+\t\t\t}\n+\t\t} else {\n+\t\t\t// Do NOT skip the '..b'{\n+\t\tmtstr = date.substr(3,3);\n+\t\tmtstr = mtstr.toLowerCase();\n+\t\tswitch(mtstr) {\n+\t\t\tcase "jan": var mt = "01"; break;\n+\t\t\tcase "feb": var mt = "02"; break;\n+\t\t\tcase "mar": var mt = "03"; break;\n+\t\t\tcase "apr": var mt = "04"; break;\n+\t\t\tcase "may": var mt = "05"; break;\n+\t\t\tcase "jun": var mt = "06"; break;\n+\t\t\tcase "jul": var mt = "07"; break;\n+\t\t\tcase "aug": var mt = "08"; break;\n+\t\t\tcase "sep": var mt = "09"; break;\n+\t\t\tcase "oct": var mt = "10"; break;\n+\t\t\tcase "nov": var mt = "11"; break;\n+\t\t\tcase "dec": var mt = "12"; break;\n+\t\t\t// default: var mt = "00";\n+\t\t}\n+\t\tdt = date.substr(7,4)+mt+date.substr(0,2);\n+\t\treturn dt;\n+\t} else if (date.length == 10) {\n+\t\tif (europeandate == false) {\n+\t\t\tdt = date.substr(6,4)+date.substr(0,2)+date.substr(3,2);\n+\t\t\treturn dt;\n+\t\t} else {\n+\t\t\tdt = date.substr(6,4)+date.substr(3,2)+date.substr(0,2);\n+\t\t\treturn dt;\n+\t\t}\n+\t} else if (date.length == 8) {\n+\t\tyr = date.substr(6,2);\n+\t\tif (parseInt(yr) < 50) { \n+\t\t\tyr = \'20\'+yr; \n+\t\t} else { \n+\t\t\tyr = \'19\'+yr; \n+\t\t}\n+\t\tif (europeandate == true) {\n+\t\t\tdt = yr+date.substr(3,2)+date.substr(0,2);\n+\t\t\treturn dt;\n+\t\t} else {\n+\t\t\tdt = yr+date.substr(0,2)+date.substr(3,2);\n+\t\t\treturn dt;\n+\t\t}\n+\t}\n+\treturn dt;\n+}\n+\n+function ts_sort_date(a,b) {\n+\tdt1 = sort_date(ts_getInnerText(a.cells[SORT_COLUMN_INDEX]));\n+\tdt2 = sort_date(ts_getInnerText(b.cells[SORT_COLUMN_INDEX]));\n+\t\n+\tif (dt1==dt2) {\n+\t\treturn 0;\n+\t}\n+\tif (dt1<dt2) { \n+\t\treturn -1;\n+\t}\n+\treturn 1;\n+}\n+function 
ts_sort_numeric(a,b) {\n+\tvar aa = ts_getInnerText(a.cells[SORT_COLUMN_INDEX]);\n+\taa = clean_num(aa);\n+\tvar bb = ts_getInnerText(b.cells[SORT_COLUMN_INDEX]);\n+\tbb = clean_num(bb);\n+\treturn compare_numeric(aa,bb);\n+}\n+function compare_numeric(a,b) {\n+\tvar a = parseFloat(a);\n+\ta = (isNaN(a) ? 0 : a);\n+\tvar b = parseFloat(b);\n+\tb = (isNaN(b) ? 0 : b);\n+\treturn a - b;\n+}\n+function ts_sort_caseinsensitive(a,b) {\n+\taa = ts_getInnerText(a.cells[SORT_COLUMN_INDEX]).toLowerCase();\n+\tbb = ts_getInnerText(b.cells[SORT_COLUMN_INDEX]).toLowerCase();\n+\tif (aa==bb) {\n+\t\treturn 0;\n+\t}\n+\tif (aa<bb) {\n+\t\treturn -1;\n+\t}\n+\treturn 1;\n+}\n+function ts_sort_default(a,b) {\n+\taa = ts_getInnerText(a.cells[SORT_COLUMN_INDEX]);\n+\tbb = ts_getInnerText(b.cells[SORT_COLUMN_INDEX]);\n+\tif (aa==bb) {\n+\t\treturn 0;\n+\t}\n+\tif (aa<bb) {\n+\t\treturn -1;\n+\t}\n+\treturn 1;\n+}\n+function addEvent(elm, evType, fn, useCapture)\n+// addEvent and removeEvent\n+// cross-browser event handling for IE5+,\tNS6 and Mozilla\n+// By Scott Andrew\n+{\n+\tif (elm.addEventListener){\n+\t\telm.addEventListener(evType, fn, useCapture);\n+\t\treturn true;\n+\t} else if (elm.attachEvent){\n+\t\tvar r = elm.attachEvent("on"+evType, fn);\n+\t\treturn r;\n+\t} else {\n+\t\talert("Handler could not be removed");\n+\t}\n+}\n+function clean_num(str) {\n+\tstr = str.replace(new RegExp(/[^-?0-9.]/g),"");\n+\treturn str;\n+}\n+function trim(s) {\n+\treturn s.replace(/^\\s+|\\s+$/g, "");\n+}\n+function alternate(table) {\n+\t// Take object table and get all it\'s tbodies.\n+\tvar tableBodies = table.getElementsByTagName("tbody");\n+\t// Loop through these tbodies\n+\tfor (var i = 0; i < tableBodies.length; i++) {\n+\t\t// Take the tbody, and get all it\'s rows\n+\t\tvar tableRows = tableBodies[i].getElementsByTagName("tr");\n+\t\t// Loop through these rows\n+\t\t// Start at 1 because we want to leave the heading row untouched\n+\t\tfor (var j = 0; j < tableRows.length; j++) 
{\n+\t\t\t// Check if j is even, and apply classes for both possible results\n+\t\t\tif ( (j % 2) == 0 ) {\n+\t\t\t\tif ( !(tableRows[j].className.indexOf(\'odd\') == -1) ) {\n+\t\t\t\t\ttableRows[j].className = tableRows[j].className.replace(\'odd\', \'even\');\n+\t\t\t\t} else {\n+\t\t\t\t\tif ( tableRows[j].className.indexOf(\'even\') == -1 ) {\n+\t\t\t\t\t\ttableRows[j].className += " even";\n+\t\t\t\t\t}\n+\t\t\t\t}\n+\t\t\t} else {\n+\t\t\t\tif ( !(tableRows[j].className.indexOf(\'even\') == -1) ) {\n+\t\t\t\t\ttableRows[j].className = tableRows[j].className.replace(\'even\', \'odd\');\n+\t\t\t\t} else {\n+\t\t\t\t\tif ( tableRows[j].className.indexOf(\'odd\') == -1 ) {\n+\t\t\t\t\t\ttableRows[j].className += " odd";\n+\t\t\t\t\t}\n+\t\t\t\t}\n+\t\t\t} \n+\t\t}\n+\t}\n+}\n' |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/tools/table_style.css --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/tools/table_style.css Thu Sep 07 17:55:18 2017 -0400 |
b |
@@ -0,0 +1,35 @@ +/* Copyright 2006 Joost de Valk */ +a img { + border: 0; +} +table.sortable { + border-spacing: 0; + border: 1px solid #000; + border-collapse: collapse; +} +table.sortable th, table.sortable td { + text-align: left; + padding: 2px 4px 2px 4px; + width: 100px; + border-style: solid; + border-color: #444; +} +table.sortable th { + border-width: 0px 1px 1px 1px; + background-color: #ccc; +} +table.sortable td { + border-width: 0px 1px 0px 1px; + font: 12px "Lucida Grande", Helvetica, "Arial Unicode MS", "Arial Unicode", Arial, sans-serif; +} +table.sortable tr.odd td { + background-color: #BFEFFF; +} +table.sortable tr.even td { + background-color: #ffffff; +} +table.sortable tr.sortbottom td { + border-top: 1px solid #444; + background-color: #ccc; + font-weight: bold; +} \ No newline at end of file |
b |
diff -r 000000000000 -r d85dea371064 chimerascan/tools/table_template.html --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan/tools/table_template.html Thu Sep 07 17:55:18 2017 -0400 |
[ |
b'@@ -0,0 +1,404 @@\n+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n+<html xmlns="http://www.w3.org/1999/xhtml">\n+\n+<head>\n+<head>\n+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />\n+ <title>chimerascan results</title>\n+ <!-- <link rel="stylesheet" type="text/css" href="table_style.css"/> -->\n+ <!-- <script type="text/javascript" src="sortable.js"></script> -->\n+ <script type="text/javascript">\n+/*\n+Table sorting script by Joost de Valk, check it out at http://www.joostdevalk.nl/code/sortable-table/.\n+Based on a script from http://www.kryogenix.org/code/browser/sorttable/.\n+Distributed under the MIT license: http://www.kryogenix.org/code/browser/licence.html .\n+\n+Copyright (c) 1997-2007 Stuart Langridge, Joost de Valk.\n+\n+Version 1.5.7\n+*/\n+\n+/* You can change these values */\n+var image_path = "http://www.joostdevalk.nl/code/sortable-table/";\n+var image_up = "arrow-up.gif";\n+var image_down = "arrow-down.gif";\n+var image_none = "arrow-none.gif";\n+var europeandate = false;\n+var alternate_row_colors = true;\n+\n+/* Don\'t change anything below this unless you know what you\'re doing */\n+addEvent(window, "load", sortables_init);\n+\n+var SORT_COLUMN_INDEX;\n+var thead = false;\n+\n+function sortables_init() {\n+\t// Find all tables with class sortable and make them sortable\n+\tif (!document.getElementsByTagName) return;\n+\ttbls = document.getElementsByTagName("table");\n+\tfor (ti=0;ti<tbls.length;ti++) {\n+\t\tthisTbl = tbls[ti];\n+\t\tif (((\' \'+thisTbl.className+\' \').indexOf("sortable") != -1) && (thisTbl.id)) {\n+\t\t\tts_makeSortable(thisTbl);\n+\t\t}\n+\t}\n+}\n+\n+function ts_makeSortable(t) {\n+\tif (t.rows && t.rows.length > 0) {\n+\t\tif (t.tHead && t.tHead.rows.length > 0) {\n+\t\t\tvar firstRow = t.tHead.rows[t.tHead.rows.length-1];\n+\t\t\tthead = true;\n+\t\t} else {\n+\t\t\tvar firstRow = t.rows[0];\n+\t\t}\n+\t}\n+\tif 
(!firstRow) return;\n+\t\n+\t// We have a first row: assume it\'s the header, and make its contents clickable links\n+\tfor (var i=0;i<firstRow.cells.length;i++) {\n+\t\tvar cell = firstRow.cells[i];\n+\t\tvar txt = ts_getInnerText(cell);\n+\t\tif (cell.className != "unsortable" && cell.className.indexOf("unsortable") == -1) {\n+\t\t\tcell.innerHTML = \'<a href="#" class="sortheader" onclick="ts_resortTable(this, \'+i+\');return false;">\'+txt+\'<span class="sortarrow"> <img src="\'+ image_path + image_none + \'" alt="↓"/></span></a>\';\n+\t\t}\n+\t}\n+\tif (alternate_row_colors) {\n+\t\talternate(t);\n+\t}\n+}\n+\n+function ts_getInnerText(el) {\n+\tif (typeof el == "string") return el;\n+\tif (typeof el == "undefined") { return el };\n+\tif (el.innerText) return el.innerText;\t//Not needed but it is faster\n+\tvar str = "";\n+\t\n+\tvar cs = el.childNodes;\n+\tvar l = cs.length;\n+\tfor (var i = 0; i < l; i++) {\n+\t\tswitch (cs[i].nodeType) {\n+\t\t\tcase 1: //ELEMENT_NODE\n+\t\t\t\tstr += ts_getInnerText(cs[i]);\n+\t\t\t\tbreak;\n+\t\t\tcase 3:\t//TEXT_NODE\n+\t\t\t\tstr += cs[i].nodeValue;\n+\t\t\t\tbreak;\n+\t\t}\n+\t}\n+\treturn str;\n+}\n+\n+function ts_resortTable(lnk, clid) {\n+\tvar span;\n+\tfor (var ci=0;ci<lnk.childNodes.length;ci++) {\n+\t\tif (lnk.childNodes[ci].tagName && lnk.childNodes[ci].tagName.toLowerCase() == \'span\') span = lnk.childNodes[ci];\n+\t}\n+\tvar spantext = ts_getInnerText(span);\n+\tvar td = lnk.parentNode;\n+\tvar column = clid || td.cellIndex;\n+\tvar t = getParent(td,\'TABLE\');\n+\t// Work out a type for the column\n+\tif (t.rows.length <= 1) return;\n+\tvar itm = "";\n+\tvar i = 1;\n+\twhile (itm == "" && i < t.tBodies[0].rows.length) {\n+\t\tvar itm = ts_getInnerText(t.tBodies[0].rows[i].cells[column]);\n+\t\titm = trim(itm);\n+\t\tif (itm.substr(0,4) == "<!--" || itm.length == 0) {\n+\t\t\titm = "";\n+\t\t}\n+\t\ti++;\n+\t}\n+\tif (itm == "") return; \n+\t// alert(itm)\n+\tsortfn = ts_sort_caseinsensitive;\n+\tif 
(itm.match(/^\\d\\d[\\/\\.-][a-zA-z][a-zA-Z][a-zA-Z][\\/\\.-]\\d\\d\\d\\d$/)) sortfn = ts_sort_date;\n+\tif (itm.match(/^\\d\\d[\\/\\.-]\\d\\d[\\/\\.-]\\d\\d\\d{2}?$/)) sortfn = ts_sort_date;\n+\tif (itm.match(/^-?[$\\u017d]\\d/)) sortfn = ts_sort_numeric;\n+\tif (itm.match(/'..b'aN(a) ? 0 : a);\n+\tvar b = parseFloat(b);\n+\tb = (isNaN(b) ? 0 : b);\n+\treturn a - b;\n+}\n+function ts_sort_caseinsensitive(a,b) {\n+\taa = ts_getInnerText(a.cells[SORT_COLUMN_INDEX]).toLowerCase();\n+\tbb = ts_getInnerText(b.cells[SORT_COLUMN_INDEX]).toLowerCase();\n+\tif (aa==bb) {\n+\t\treturn 0;\n+\t}\n+\tif (aa<bb) {\n+\t\treturn -1;\n+\t}\n+\treturn 1;\n+}\n+function ts_sort_default(a,b) {\n+\taa = ts_getInnerText(a.cells[SORT_COLUMN_INDEX]);\n+\tbb = ts_getInnerText(b.cells[SORT_COLUMN_INDEX]);\n+\tif (aa==bb) {\n+\t\treturn 0;\n+\t}\n+\tif (aa<bb) {\n+\t\treturn -1;\n+\t}\n+\treturn 1;\n+}\n+function addEvent(elm, evType, fn, useCapture)\n+// addEvent and removeEvent\n+// cross-browser event handling for IE5+,\tNS6 and Mozilla\n+// By Scott Andrew\n+{\n+\tif (elm.addEventListener){\n+\t\telm.addEventListener(evType, fn, useCapture);\n+\t\treturn true;\n+\t} else if (elm.attachEvent){\n+\t\tvar r = elm.attachEvent("on"+evType, fn);\n+\t\treturn r;\n+\t} else {\n+\t\talert("Handler could not be removed");\n+\t}\n+}\n+function clean_num(str) {\n+\tstr = str.replace(new RegExp(/[^-?0-9.]/g),"");\n+\treturn str;\n+}\n+function trim(s) {\n+\treturn s.replace(/^\\s+|\\s+$/g, "");\n+}\n+function alternate(table) {\n+\t// Take object table and get all it\'s tbodies.\n+\tvar tableBodies = table.getElementsByTagName("tbody");\n+\t// Loop through these tbodies\n+\tfor (var i = 0; i < tableBodies.length; i++) {\n+\t\t// Take the tbody, and get all it\'s rows\n+\t\tvar tableRows = tableBodies[i].getElementsByTagName("tr");\n+\t\t// Loop through these rows\n+\t\t// Start at 1 because we want to leave the heading row untouched\n+\t\tfor (var j = 0; j < tableRows.length; j++) {\n+\t\t\t// Check 
if j is even, and apply classes for both possible results\n+\t\t\tif ( (j % 2) == 0 ) {\n+\t\t\t\tif ( !(tableRows[j].className.indexOf(\'odd\') == -1) ) {\n+\t\t\t\t\ttableRows[j].className = tableRows[j].className.replace(\'odd\', \'even\');\n+\t\t\t\t} else {\n+\t\t\t\t\tif ( tableRows[j].className.indexOf(\'even\') == -1 ) {\n+\t\t\t\t\t\ttableRows[j].className += " even";\n+\t\t\t\t\t}\n+\t\t\t\t}\n+\t\t\t} else {\n+\t\t\t\tif ( !(tableRows[j].className.indexOf(\'even\') == -1) ) {\n+\t\t\t\t\ttableRows[j].className = tableRows[j].className.replace(\'even\', \'odd\');\n+\t\t\t\t} else {\n+\t\t\t\t\tif ( tableRows[j].className.indexOf(\'odd\') == -1 ) {\n+\t\t\t\t\t\ttableRows[j].className += " odd";\n+\t\t\t\t\t}\n+\t\t\t\t}\n+\t\t\t} \n+\t\t}\n+\t}\n+}\n+ </script>\n+ <style type="text/css">\n+a img {\n+\tborder: 0;\n+}\n+table.sortable {\n+\tborder-spacing: 0;\n+\tborder: 1px solid #000;\n+\tborder-collapse: collapse;\n+}\n+table.sortable th, table.sortable td {\n+\ttext-align: left;\n+\tpadding: 2px 4px 2px 4px;\n+\twidth: 100px;\n+\tborder-style: solid;\n+\tborder-color: #444;\n+}\n+table.sortable th {\n+\tborder-width: 0px 1px 1px 1px;\n+\tbackground-color: #ccc;\n+\tfont: 14px "Lucida Grande", Helvetica, "Arial Unicode MS", "Arial Unicode", Arial, sans-serif;\t\n+\tfont-weight: bold;\n+}\n+table.sortable td {\n+\tborder-width: 0px 1px 0px 1px;\n+\tfont: 12px "Lucida Grande", Helvetica, "Arial Unicode MS", "Arial Unicode", Arial, sans-serif;\n+}\n+table.sortable tr.odd td {\n+\tbackground-color: #BFEFFF;\n+}\n+table.sortable tr.even td {\n+\tbackground-color: #ffffff;\n+}\n+table.sortable tr.sortbottom td {\n+\tborder-top: 1px solid #444;\n+\tbackground-color: #ccc;\n+\tfont-weight: bold;\n+} \n+ </style>\n+</head>\n+\n+<body>\n+\n+<table class="sortable" id="anyid" cellpadding="0" cellspacing="0">\n+<tr>{% for colname in colnames %}<th>{{ colname }}</th>{% endfor %}</tr>\n+{% for rowdata in rows %}\n+<tr>\n+ {% for datatype,col in rowdata %}\n+ <td>\n+ 
{% if datatype == "ucsc_pos" %}\n+ {% for itm in col %}\n+ <a href="{{ ucsc_pos_url }}{{ itm }}" target="_blank">{{ itm }}</a><br/>\n+ {% endfor %}\n+ {% elif datatype == "genecards" %}\n+ {% for itm in col %}\n+ <a href="{{ genecards_url }}{{ itm }}" target="_blank">{{ itm }}</a><br/>\n+ {% endfor %}\n+ {% elif datatype == "list" %}\n+ {% for itm in col %}{{ itm }}<br/>{% endfor %}\n+ {% else %}\n+ {{ col }}\n+ {% endif %}\n+ </td>\n+ {% endfor %}\n+</tr>\n+{% endfor %}\n+</table>\n+\n+</body>\n+</html>\n' |
b |
diff -r 000000000000 -r d85dea371064 chimerascan_run.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chimerascan_run.py Thu Sep 07 17:55:18 2017 -0400 |
[ |
b'@@ -0,0 +1,1025 @@\n+#!/usr/bin/env python\n+\'\'\'\n+Created on Jan 5, 2011\n+\n+@author: mkiyer\n+\n+chimerascan: chimeric transcript discovery using RNA-seq\n+\n+Copyright (C) 2011 Matthew Iyer\n+\n+This program is free software: you can redistribute it and/or modify\n+it under the terms of the GNU General Public License as published by\n+the Free Software Foundation, either version 3 of the License, or\n+(at your option) any later version.\n+\n+This program is distributed in the hope that it will be useful,\n+but WITHOUT ANY WARRANTY; without even the implied warranty of\n+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n+GNU General Public License for more details.\n+\n+You should have received a copy of the GNU General Public License\n+along with this program. If not, see <http://www.gnu.org/licenses/>.\n+\'\'\'\n+from chimerascan import __version__\n+\n+__author__ = "Matthew Iyer"\n+__copyright__ = "Copyright 2011, chimerascan project"\n+__credits__ = ["Matthew Iyer", "Christopher Maher"]\n+__license__ = "GPL"\n+__maintainer__ = "Matthew Iyer"\n+__email__ = "mkiyer@med.umich.edu"\n+__status__ = "beta"\n+\n+import logging\n+import os\n+import subprocess\n+import sys\n+import shutil\n+from optparse import OptionParser, OptionGroup\n+import xml.etree.ElementTree as etree\n+\n+# check for python version 2.6.0 or greater\n+if sys.version_info < (2,6,0):\n+ sys.stderr.write("You need python 2.6 or later to run chimerascan\\n")\n+ sys.exit(1)\n+\n+# local imports\n+from chimerascan import pysam\n+import chimerascan.lib.config as config\n+from chimerascan.lib.config import JOB_SUCCESS, JOB_ERROR, MIN_SEGMENT_LENGTH\n+from chimerascan.lib.base import LibraryTypes, check_executable, \\\n+ parse_bool, indent_xml, up_to_date\n+from chimerascan.lib.seq import FASTQ_QUAL_FORMATS, SANGER_FORMAT\n+from chimerascan.lib.fragment_size_distribution import InsertSizeDistribution\n+\n+from chimerascan.pipeline.fastq_inspect_reads import inspect_reads, 
detect_read_length, get_min_max_read_lengths\n+from chimerascan.pipeline.align_bowtie import align_pe, align_sr, trim_align_pe_sr\n+from chimerascan.pipeline.find_discordant_reads import find_discordant_fragments\n+from chimerascan.pipeline.discordant_reads_to_bedpe import discordant_reads_to_bedpe, sort_bedpe\n+from chimerascan.pipeline.nominate_chimeras import nominate_chimeras\n+from chimerascan.pipeline.chimeras_to_breakpoints import chimeras_to_breakpoints\n+from chimerascan.pipeline.nominate_spanning_reads import nominate_encomp_spanning_reads, extract_single_mapped_reads, nominate_single_mapped_spanning_reads\n+from chimerascan.pipeline.merge_spanning_alignments import merge_spanning_alignments\n+from chimerascan.pipeline.resolve_discordant_reads import resolve_discordant_reads\n+from chimerascan.pipeline.filter_chimeras import filter_chimeras, filter_highest_coverage_isoforms, filter_encompassing_chimeras\n+from chimerascan.pipeline.filter_homologous_genes import filter_homologous_genes\n+from chimerascan.pipeline.write_output import write_output\n+\n+# defaults for bowtie\n+DEFAULT_NUM_PROCESSORS = config.BASE_PROCESSORS\n+DEFAULT_BOWTIE_PATH = ""\n+DEFAULT_BOWTIE_ARGS = "--best --strata"\n+DEFAULT_DISCORD_BOWTIE_ARGS = "--best"\n+DEFAULT_MULTIHITS = 100\n+DEFAULT_MISMATCHES = 2\n+DEFAULT_DISCORD_MISMATCHES = 3\n+DEFAULT_SEGMENT_LENGTH = 25\n+DEFAULT_TRIM5 = 0\n+DEFAULT_TRIM3 = 0\n+DEFAULT_MIN_FRAG_LENGTH = 0\n+DEFAULT_MAX_FRAG_LENGTH = 1000\n+DEFAULT_NUM_SAMPLES_TO_DETERMINE_READ_LENGTHS = 10000\n+DEFAULT_FASTQ_QUAL_FORMAT = SANGER_FORMAT\n+DEFAULT_LIBRARY_TYPE = LibraryTypes.FR_UNSTRANDED\n+\n+DEFAULT_ISIZE_MEAN = 200\n+DEFAULT_ISIZE_STDEV = 40\n+DEFAULT_HOMOLOGY_MISMATCHES = config.BREAKPOINT_HOMOLOGY_MISMATCHES\n+DEFAULT_ANCHOR_MIN = 4\n+DEFAULT_ANCHOR_LENGTH = 8\n+DEFAULT_ANCHOR_MISMATCHES = 0\n+DEFAULT_FILTER_ISIZE_PROB = 0.01\n+DEFAULT_FILTER_UNIQUE_FRAGS = 2.0\n+DEFAULT_FILTER_ISOFORM_FRACTION = 0.01\n+NUM_POSITIONAL_ARGS = 4\n+DEFAULT_KEEP_TMP = 
True\n+\n+class RunConfig(object):\n+\n+ attrs = (("num_processors", int, DEFAULT_NUM_PROCESSORS),\n+ '..b'E)\n+ msg = "Filtering chimeras"\n+ if up_to_date(filtered_chimera_file, resolved_spanning_chimera_file):\n+ logging.info("[SKIPPED] %s" % (msg))\n+ else:\n+ logging.info(msg)\n+ # get insert size at prob\n+ filter_chimeras(input_file=resolved_spanning_chimera_file,\n+ output_file=filtered_chimera_file,\n+ index_dir=runconfig.index_dir,\n+ bam_file=sorted_aligned_bam_file,\n+ unique_frags=runconfig.filter_unique_frags,\n+ isoform_fraction=runconfig.filter_isoform_fraction,\n+ false_pos_file=runconfig.filter_false_pos_file)\n+ #\n+ # Filter homologous genes\n+ #\n+ homolog_filtered_chimera_file = os.path.join(tmp_dir, config.HOMOLOG_FILTERED_CHIMERA_FILE)\n+ msg = "Filtering homologous chimeras"\n+ if up_to_date(homolog_filtered_chimera_file, filtered_chimera_file):\n+ logging.info("[SKIPPED] %s" % (msg))\n+ else:\n+ logging.info(msg)\n+ min_isize = isize_dist.isize_at_percentile(1.0)\n+ max_isize = isize_dist.isize_at_percentile(99.0)\n+ filter_homologous_genes(input_file=filtered_chimera_file,\n+ output_file=homolog_filtered_chimera_file,\n+ index_dir=runconfig.index_dir,\n+ homolog_segment_length=runconfig.segment_length-1,\n+ min_isize=min_isize,\n+ max_isize=max_isize,\n+ bowtie_bin=bowtie_bin,\n+ num_processors=runconfig.num_processors,\n+ tmp_dir=tmp_dir)\n+ #\n+ # Choose best isoform for chimeras that share the same breakpoint\n+ #\n+ best_isoform_chimera_file = os.path.join(tmp_dir, config.BEST_FILTERED_CHIMERA_FILE)\n+ msg = "Choosing best isoform for each chimera"\n+ if up_to_date(best_isoform_chimera_file, homolog_filtered_chimera_file):\n+ logging.info("[SKIPPED] %s" % (msg))\n+ else:\n+ logging.info(msg)\n+ retcode = filter_highest_coverage_isoforms(index_dir=runconfig.index_dir,\n+ input_file=homolog_filtered_chimera_file,\n+ output_file=best_isoform_chimera_file)\n+ #\n+ # Write user-friendly output file\n+ #\n+ chimera_output_file = 
os.path.join(runconfig.output_dir, config.CHIMERA_OUTPUT_FILE)\n+ #msg = "Writing chimeras to file %s" % (chimera_output_file)\n+ if up_to_date(chimera_output_file, best_isoform_chimera_file):\n+ logging.info("[SKIPPED] %s" % (msg))\n+ else:\n+ logging.info(msg)\n+ write_output(best_isoform_chimera_file,\n+ bam_file=sorted_aligned_bam_file,\n+ output_file=chimera_output_file,\n+ index_dir=runconfig.index_dir)\n+ \n+ #\n+ # Move output to Galaxy data file\n+ #\n+ cmd = "mv %s/chimerascan_tmp/chimeras.bedpe %s/%s" % (os.path.dirname(runconfig.output_file_path), os.path.dirname(runconfig.output_file_path), runconfig.output_file_name)\n+ p = subprocess.check_output(cmd.split())\n+\n+ #\n+ # Cleanup\n+ #\n+ if not runconfig.keep_tmp:\n+ logging.info("Cleaning up temporary files")\n+ shutil.rmtree(tmp_dir)\n+ cmd_rm = "rm -r %s/chimerascan_tmp" % os.path.dirname(runconfig.output_file_path)\n+ p = subprocess.check_output(cmd_rm.split())\n+\n+ #\n+ # Done\n+ #\n+ logging.info("Finished run.")\n+ return JOB_SUCCESS\n+\n+\n+def main():\n+ logging.basicConfig(level=logging.INFO,\n+ format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")\n+ # parse run parameters in config file and command line\n+ runconfig = RunConfig()\n+ runconfig.from_args(sys.argv[1:])\n+ # run chimerascan\n+ sys.exit(run_chimerascan(runconfig))\n+\n+if __name__ == \'__main__\':\n+ main()\n+\n' |
b |
diff -r 000000000000 -r d85dea371064 test-data/input1.fastq --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/input1.fastq Thu Sep 07 17:55:18 2017 -0400 |
b |
b"@@ -0,0 +1,200484 @@\n+@ERR030881.107 HWI-BRUNOP16X_0001:2:1:13663:1096#0/1\n+ATCTTTTGTGGCTACAGTAAGTTCAATCTGAAGTCAAAACCAACCAATTT\n++\n+5.544,444344555CC?CAEF@EEFFFFFFFFFFFFFFFFFEFFFEFFF\n+@ERR030881.311 HWI-BRUNOP16X_0001:2:1:18330:1130#0/1\n+TCCATACATAGGCCTCGGGGTGGGGGAGTCAGAAGCCCCCAGACCCTGTG\n++\n+GFFFGFFBFCHHHHHHHHHHIHEEE@@@=GHGHHHHHHHHHHHHHHHHHH\n+@ERR030881.1487 HWI-BRUNOP16X_0001:2:1:4144:1420#0/1\n+GTATAACGCTAGACACAGCGGAGCTCGGGATTGGCTAAACTCCCATAGTA\n++\n+55*'+&&5'55('''888:8FFFFFFFFFF4/1;/4./++FFFFF=5:E#\n+@ERR030881.9549 HWI-BRUNOP16X_0001:2:1:1453:3458#0/1\n+AACGGATCCATTGTTTCGAGAACGTGATCGCCCTCATCTACCTAGCCTCA\n++\n+D<@DDA@A:AHHHHHHHHHHHHHHIHHHHHHHHHHHHHHHHHBHHHHHHH\n+@ERR030881.13497 HWI-BRUNOP16X_0001:2:1:16344:4145#0/1\n+GCTAATCCGACTTCTCGCCATCATCCTCCTGGTGGGTGTCACCATCGTGC\n++\n+F@FFFGGFGFHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHEHHHHHHHHH\n+@ERR030881.14070 HWI-BRUNOP16X_0001:2:1:4377:4232#0/1\n+TGGACAGTTGCTCCTGGCTCCAGAACCTGTCTTGCAAGGGACAGTGGGGT\n++\n+A:AA@HHHHHHHHHHHHHHHHHHHIHHHHHHHHHHGF=GFHHHH@@?AA*\n+@ERR030881.16375 HWI-BRUNOP16X_0001:2:1:2265:4573#0/1\n+ATTAGGAAACATGGAATTTTTTTAAAGGTTTTTCTTGTATCTTTTTTTTT\n++\n+@<><CHHHHHHHHHHHHHHHHHGGHHHHHHHHHHGGGHHHHHHHHGGGGG\n+@ERR030881.18437 HWI-BRUNOP16X_0001:2:1:13904:4828#0/1\n+CAATAGCCAGATGGTTGGTGGGGCAGCCAGGCAGGGAGGACCCAGGGCTG\n++\n+555544555544555;AAAAFFBBEEEE;=FCB9F===<<FFFFEFFEEE\n+@ERR030881.18768 HWI-BRUNOP16X_0001:2:1:15563:4868#0/1\n+GTGCCAAATTGTCACATTCGAGCTTGAGGCTGTGGTACTGAGCTTGCAGT\n++\n+D>BFD@@?>>54454?FFGFGGGGGGGGGGGGGEGGGGGGGGGEGGGGGG\n+@ERR030881.20718 HWI-BRUNOP16X_0001:2:1:12184:5115#0/1\n+CCCGGCCTAACTTTCATTTAATTTCAATGAATTTTCTTTTTTTTTTTTTT\n++\n+56455==@=>HHHHHHHHHGHHHHHHHHGH=HHHHHHEEEECEEEEEEEE\n+@ERR030881.22833 HWI-BRUNOP16X_0001:2:1:13089:5358#0/1\n+GGAGAAGGGGCGAGGGAAGAAGACCTTTGCTATCCCAGATACCAGGACTG\n++\n+55544145444/444GFDFG9A@@@DD>.F@><<=FDD@AGG>GGEGGEG\n+@ERR030881.23643 
HWI-BRUNOP16X_0001:2:1:7921:5452#0/1\n+CGGCCCCCTGCTAATCCGACTTCTCGCCATCATCCTCCTGGTGGGTGTCA\n++\n+FBDFFHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHDHHHH\n+@ERR030881.28299 HWI-BRUNOP16X_0001:2:1:6428:5960#0/1\n+ATGAGAAGGAGCCATCAGGACCTTATGAAAGCGACGAAGACAAGAGTGAT\n++\n+55554DDFFFBBFFFHHGHHHHHHHHHHHHHHHHHHDHH8HHHHHHHHFH\n+@ERR030881.28475 HWI-BRUNOP16X_0001:2:1:14780:5977#0/1\n+CGAAAACCAACTCTTTACCTAACTTTGCATGGTGCTTAGTCAAGGACTCC\n++\n+555,4&4551FFFFFBF3BDFFFFFFEFFFFBEFFFFFFDFFFFFFFFF=\n+@ERR030881.29253 HWI-BRUNOP16X_0001:2:1:1570:6070#0/1\n+GGAATGTTTAGCACAAGACACAGCGGAGCTCGGGATTGGCTAAACTCCCA\n++\n+HGHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH\n+@ERR030881.30545 HWI-BRUNOP16X_0001:2:1:4103:6216#0/1\n+CAACTCTTTACCTAACTTTGCATGGTGCTTAGTCAAGGACTCCTGCGACC\n++\n+54-55A@A@@HHHHHFFGGE555558<=;=55555AAAA?HHHHH>8@@>\n+@ERR030881.32582 HWI-BRUNOP16X_0001:2:1:12474:6471#0/1\n+CTTGCCTCACATGTCAGGGCAGGTATCCACCTAACCAGGCTGCAGGGGAG\n++\n+555555544444544HHHHGHHHHHHHHHHHHHHHHHHHHHH5@HFFF*F\n+@ERR030881.33730 HWI-BRUNOP16X_0001:2:1:14154:6628#0/1\n+CCAGCCTTGATACAGCATTTTCCACTTCTCTCTGTAGAGATCAGACGATT\n++\n+55555555(5@>@=:@=8.@04554CCCCC.441445444-555445555\n+@ERR030881.35226 HWI-BRUNOP16X_0001:2:1:3903:6867#0/1\n+CAGCATCCTGCTTAGGGCCCTGGAAACTGGGGAAATAGGTAGCCAGGTGG\n++\n+55555A@AAAGGEGGGGGGGGGGGGGGGGGGGCGGGFEGFGGGGFGGCGG\n+@ERR030881.38182 HWI-BRUNOP16X_0001:2:1:17495:7451#0/1\n+CACCATCGTGCCCGTTCTTGTCTTCCTTGGAGAGGTGGGCCTGGGAACCC\n++\n+5544455,0545445FFFEEFFFFFFFFFEEBC;D6<5-?FFFFFFFFFF\n+@ERR030881.41234 HWI-BRUNOP16X_0001:2:1:14816:8065#0/1\n+CTCTCCTCTAACCCTCCAGGCCTTAGCTTGCCTCACATGTCAGGGCAGGT\n++\n+55,34)4-53HHEHHGGGGG7DC?@GG;BGGEGGGGGGGGGGGGGGGGGA\n+@ERR030881.55301 HWI-BRUNOP16X_0001:2:1:7892:11256#0/1\n+CAAAAATGTAGCTGCCCTGACCTGGTCTCCCCTGACCCTTCCACGGGGCT\n++\n+56624545442525554455FFECECGEDGFF8DF###############\n+@ERR030881.57346 HWI-BRUNOP16X_0001:2:1:20039:11573#0/1\n+GACAGATGATGTCCAAGCCCCTACATGCCCCAGACCCCAGGGCACGGCTG\n++\n+##################################################\n+@ERR030881.57608 
HWI-BRUNOP16X_0001:2:1:16788:11614#0/1\n+ATCTCGTAGTACATCACATAGTGACGCTGCATCTCTGACTTCTCACTGGC\n++\n+5653445555HHHHHHHHHH9;@=@HHHHHHDHHHHHHHHHHHHHHHHDH\n+@ERR030881.58998 HWI-BRUNOP16X_0001:2:1:14252:11816#0/1\n+CACCATTTGACCCTGAGCCAG"..b':6601:197274#0/1\n+CGGCCCCCTGCTAATCCGACTTCTCGCCATCATCCTCCTGGTGGGTGTCA\n++\n+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFHHHH\n+@ERR030881.74446016 HWI-BRUNOP16X_0001:2:68:6384:197508#0/1\n+TGTGTCTTGTGCTAAACATTCCTTTCTCTCCGTGCCTCTGTCTCCCCTCT\n++\n+HHHHHHHHHHHHHHHHHHHHHHHHHHHHFHHHHHHHHHHHHHHHHHHHHH\n+@ERR030881.74446277 HWI-BRUNOP16X_0001:2:68:20062:197534#0/1\n+CAGCCCTCTCACCCTGGTACTGCATGCACGCAATGCTAGCTGCCCCTTTC\n++\n+HHHHHHHHHHHHHHHHHHHHHHHHHEHHHGHIIHHHHHAHHHHHHHHHGH\n+@ERR030881.74446743 HWI-BRUNOP16X_0001:2:68:3752:197585#0/1\n+CTGGGACCCAGGCAGCTGCCACCTTGTCACCATGAGAGAATTTGGGGAGT\n++\n+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHG\n+@ERR030881.74446915 HWI-BRUNOP16X_0001:2:68:8353:197599#0/1\n+GGACTGTCCACCAGGTCCCGACGGGCAGGAATGCAGATGGGTACCTTTCC\n++\n+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHGHHHHHHHHHHHFHHHHHHHE\n+@ERR030881.74447547 HWI-BRUNOP16X_0001:2:68:9591:197654#0/1\n+GCCAGTGGTGGGCATGCGGCTGCGGAGCACGTCCTGAGCTGTGGGGACGT\n++\n+HHHHHHHHHHHHHHHHHHHHHHHHHBDDBB@9@AAHHGHHHHHGHHDHHH\n+@ERR030881.74449534 HWI-BRUNOP16X_0001:2:68:1488:197840#0/1\n+CTACTCCTTCCGCAGCAGGGAGGTGTGCAGAGCCGTGCTCAGCTTCCTCT\n++\n+HHHHHHHHHHHHHHHHHHHHHHH8HAGFGGFHHHFGGHHHHHGHHHIHGH\n+@ERR030881.74453424 HWI-BRUNOP16X_0001:2:68:5325:198191#0/1\n+GTCCTGCCCTACCTCTCCCAAGAGCACCAGCAGCAGGTCTTGGGAGCCAT\n++\n+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH\n+@ERR030881.74454854 HWI-BRUNOP16X_0001:2:68:18716:198301#0/1\n+GCCGGGGCTGCTGCGCTTCGCGAGGTCTTGCTCCCTTGGGACCTGGTCTC\n++\n+55555?>?>>5555444555444442=5<=55444C=6C>2555551544\n+@ERR030881.74455894 HWI-BRUNOP16X_0001:2:68:18831:198398#0/1\n+CTGGGACCTGCGGGAGGGCCGCCAGCTGCAGCAGCATGACTTCAGCTCCC\n++\n+HHHHHHHHHHHHHHHHHGEHHHHHHHCHHHFHHHHHEFGDFHHHEHBFHH\n+@ERR030881.74457151 
HWI-BRUNOP16X_0001:2:68:9093:198528#0/1\n+AAACAAAACATTTTCCTTTGGGTTTTTTTTTTTCTTTCTTTTTTCTCCGC\n++\n+HHHHHHGGGHHHHHHHHHHHHHHHHHHHGGGGGBGGGBHHGGGGGGGHHH\n+@ERR030881.74458067 HWI-BRUNOP16X_0001:2:68:15716:198600#0/1\n+GTTCCAACCACCGCCGGGGAGGGAGAGGGCCCCTGTCCCTGCAGGGCCGC\n++\n+ADAD?DEFBEHHHHHCCDGDHCEEHCGBGAHHHHHCDCGD5555424554\n+@ERR030881.74460390 HWI-BRUNOP16X_0001:2:68:15056:198815#0/1\n+CCTGGAACTGCCTGACCATAGTCTGATTCTGCAGGTCCCAGACCACAATG\n++\n+?ACDC?DDGG=DDD>55554GGFFADDDA==<==>D=DAD5445544445\n+@ERR030881.74460430 HWI-BRUNOP16X_0001:2:68:19789:198814#0/1\n+CACAAATCCCGTTCAGCCTTTTGACGATCTCAGCCTGTTTGTGCATCTCG\n++\n+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH\n+@ERR030881.74460883 HWI-BRUNOP16X_0001:2:68:19795:198864#0/1\n+CTGCCTGGCACGCACCCGGTGGCTGCACCATCCACACGCAAGACTGCAAC\n++\n+HHHHHHHHHHHHHHHHHDHHHHHGHHFHHHHHHHHHHHFHHHFHGHFHHH\n+@ERR030881.74463349 HWI-BRUNOP16X_0001:2:68:7211:199081#0/1\n+CGGGGAGGTTGGGAGGGGGGACAGAGGGGAGACAGAGGCACGGAGAGAAA\n++\n+HHHHHHHGEHHHHHHHGGGGEHGHHHHHHHHHHHHHHHHHHHHHHHHHHH\n+@ERR030881.74463429 HWI-BRUNOP16X_0001:2:68:16435:199090#0/1\n+CGGGCTCCTCGCACCTACCCCAGCAACTCAAATTCACCACCTCGGACTCC\n++\n+HHHHHHHHHHHHHHHHHHHHHHHEHHHHHHHHHHHHHHHHHHHHHHHHHH\n+@ERR030881.74466171 HWI-BRUNOP16X_0001:2:68:1844:199339#0/1\n+ATTTTTTTAAAGGTTTTTCTTGTATCTTTTTTTTTTTTTTTTTTTTTTTT\n++\n+HHHHHHHGGHHHHGHHHGHC83=;><=@=<CCCCCCCCCCCCCCCCCCCC\n+@ERR030881.74466232 HWI-BRUNOP16X_0001:2:68:10444:199339#0/1\n+CCTGGGTCGCCCACCCTCACCCTGCTCCTCCCAGCTCAGCTAAGCTCGTC\n++\n+HHHHHHHHHHHHHHHGGHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH\n+@ERR030881.74466444 HWI-BRUNOP16X_0001:2:68:18815:199349#0/1\n+GTTTAGCACAAGACACAGCGGAGCTCGGGATTGGCTAAACTCCCATAGTA\n++\n+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHIH\n+@ERR030881.74468879 HWI-BRUNOP16X_0001:2:68:9428:199583#0/1\n+CACCAACCAGCCGCGGGCCGCGCAGCTGGTGGACAAGGACAGCACCTTCC\n++\n+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHGHHHHHHHHHHHHHHHHHGHH\n+@ERR030881.74470889 
HWI-BRUNOP16X_0001:2:68:4971:199775#0/1\n+CAGAGCTTAGCGGGGGGCTGAGCTGGTGTCTTTGAACCTCTAGTCCCAGG\n++\n+HHHHHHHHHHHHHHHCGGGHEHHFHHEHHHHHHHHHHHEHHHHHFHHHHH\n+@ERR030881.74471439 HWI-BRUNOP16X_0001:2:68:16981:199816#0/1\n+TGTGTGCCCCATTTCTCCATATAGTCTTCCTCAGGCAGGTCCTAGGTCCC\n++\n+??DDDEDECC<=@><CCC@?<<<=@EGGGGG?GGGGCGCE>@@6=55554\n+@ERR030881.74471978 HWI-BRUNOP16X_0001:2:68:9605:199866#0/1\n+CCCAGGTCCTGCCCTACCTCTCCCAAGAGCACCAGCAGCAGGTCTTGGGA\n++\n+HHHHHHHHHGHHHHHHHHGHHHHHHHHHHHHFHHHHHHHHHHGHHHHAHE\n' |
b |
diff -r 000000000000 -r d85dea371064 test-data/input2.fastq --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/input2.fastq Thu Sep 07 17:55:18 2017 -0400 |
b |
b"@@ -0,0 +1,200484 @@\n+@ERR030881.107 HWI-BRUNOP16X_0001:2:1:13663:1096#0/2\n+CGGATTTCAGCTACTGCAAGCTCAGTACCACAGCCTCAAGCTCGAATGTG\n++\n+HH;HHHHHGHHHHHHHHHHGHDHEHHHHHEHHHHBHHFHHHHHHHHHD0F\n+@ERR030881.311 HWI-BRUNOP16X_0001:2:1:18330:1130#0/2\n+GAGTGCGAGGGAAGTCAGGGGAGGATCGCGAGGGAAGCCAGGGGAGGATC\n++\n+HHHHHBF8G>&4555GGGGGHHGGEHHHHHHHHH=HHHHHHHHHHHGB9H\n+@ERR030881.1487 HWI-BRUNOP16X_0001:2:1:4144:1420#0/2\n+AACCGGGGGACGGGCCGGGGCTGCTGCGCTTCGCGAGGTCTTGCTCCCTT\n++\n+@FEEH>==9=05544FGFGFHHHBHHHFHF>AAAAHHHHHHHEHHHHHHH\n+@ERR030881.9549 HWI-BRUNOP16X_0001:2:1:1453:3458#0/2\n+TCAGCATGCTTCTTAGGGCCCTGGAAACTGGGGAAATAGGTAGCCAGGTG\n++\n+5515555/5515444FFHHHHHHHHHHHHHHHHHHHHHHHEHHHHGHH@H\n+@ERR030881.13497 HWI-BRUNOP16X_0001:2:1:16344:4145#0/2\n+GGCCAAGCAGGTCACCGCTCCCGAGCTGAACTCTATCATCCGACAGCAGC\n++\n+HHHHFGHHHGFAFFFHHFHHHHH/HHHHGHHEHHEHGFHHDGF=AA=@@8\n+@ERR030881.14070 HWI-BRUNOP16X_0001:2:1:4377:4232#0/2\n+TGGAGTCCTTCATGCCCAGGTCTGGAACCCAGGTTCTGACCCCAGGGCCC\n++\n+FDFFFEGGGGHHHHGHHHHH>AAA8GGGGGHHHGHHHHHHHHHHHGFHHH\n+@ERR030881.16375 HWI-BRUNOP16X_0001:2:1:2265:4573#0/2\n+GGCCAGCCGGGCTCCAGAGGGGTCAGGGCGCGACGAGAACCAACTCTTTA\n++\n+FDFFBDFDDBAAADDGHGHHHHBHHHHHGHGHHHHHHHHHHHHHHHHHFH\n+@ERR030881.18437 HWI-BRUNOP16X_0001:2:1:13904:4828#0/2\n+GGGCTCTCCCTCTGTATCGCCTGGGGAGGCTGCTGAGGTGACTTTTTGGA\n++\n+A?DDABFBFFHGHEHHHHHHHHHIHHDHCC55555BFFCD;:9=;=@=><\n+@ERR030881.18768 HWI-BRUNOP16X_0001:2:1:15563:4868#0/2\n+CACAGTAGGCGTTCTATAAATGTGTCACAAGAATGGCTTCCCTCAGGAAG\n++\n+55444;@=@>HHHDHHHHHFFGHHHHHHHHHIHHHFH=HHBB?<D#####\n+@ERR030881.20718 HWI-BRUNOP16X_0001:2:1:12184:5115#0/2\n+GCCTGGGCAACATAGCGAAACCACATCTCTACAAAAAAATCCTCCAAAAT\n++\n+HGIEHHHHGHF=@FF8A>>@HFHH=HHHHHHHIHHHGGGGH@@HHGGGEG\n+@ERR030881.22833 HWI-BRUNOP16X_0001:2:1:13089:5358#0/2\n+AGCCACTGCCTTTCTGCTCAGATGCTGGCACCTCCGCCCCCGGGGCTGCC\n++\n+EHHHFF?GFDGFFB???DDAD<FC<55555FFGGG<?>>61/5444-555\n+@ERR030881.23643 
HWI-BRUNOP16X_0001:2:1:7921:5452#0/2\n+CGAGCTGAACTCTATCATCCGACAGCAGCTCCAAGCCCACCAGCTGTCCC\n++\n+HHHHHHHHHHHHHHHGHHHHGGHHHHHHHHHHHHHHHHHHHHHHHHDHHH\n+@ERR030881.28299 HWI-BRUNOP16X_0001:2:1:6428:5960#0/2\n+GGAGTCACAGGATTTGGAGGCAGGAGTGCTGGCGGGAAGGGCATTCAGGA\n++\n+HHHHHHHFEH?=DDDHIFHHEHEDE?DAADH@FHHC'@CE##########\n+@ERR030881.28475 HWI-BRUNOP16X_0001:2:1:14780:5977#0/2\n+CTCGGAAGGCAAGGCACATCTTGTGGTAGAAAATTTCGTGCAAATTAGGA\n++\n+HHHHHGGH=IADDADHHGHH444-4A?A?AGHGHHFGFG@/5544HDHEE\n+@ERR030881.29253 HWI-BRUNOP16X_0001:2:1:1570:6070#0/2\n+CTTCGCGAGGTCTTGCTCCCTTGGGACCTGGTCTCCCATCTGACCCTCCA\n++\n+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH\n+@ERR030881.30545 HWI-BRUNOP16X_0001:2:1:4103:6216#0/2\n+GTTTAAAGGTGATACTTATTCTCGGAAGGCAAGGCACATCTTGTGGGAGA\n++\n+EF;GG4445544544FF@FFEHFHFFHGHH####################\n+@ERR030881.32582 HWI-BRUNOP16X_0001:2:1:12474:6471#0/2\n+GGGACAGGGAGGTTGGGAGGGGGGACAGAGGGGAGACAGAGGCACGGAGA\n++\n+FF8FFBFFFFFDF@FCD>CFF@@F:HEHEHHHHBHHHHHF==<>5?DDA;\n+@ERR030881.33730 HWI-BRUNOP16X_0001:2:1:14154:6628#0/2\n+GTGAGGGTGGGCGACCCAGGATTCCCCCTCCCCTTCCCAAATAAAGATGA\n++\n+BEFDB44(4411445DA?ADHHHHIFDDC>:::5@DDDC?HHHDEBFFB>\n+@ERR030881.35226 HWI-BRUNOP16X_0001:2:1:3903:6867#0/2\n+CAGAGCGTAAGAAATGGATCCATTGTTCCGAGAACGTGATCGCCCTCATC\n++\n+HH@HHHFDHHHHHHHFHHHGHGHHHHHHHGGHHHHHFHHAHHHHHGHHGH\n+@ERR030881.38182 HWI-BRUNOP16X_0001:2:1:17495:7451#0/2\n+CCTCTCCCGAGCTGAACTCTATCATCCGACAGCAGCTCCAAGCCCACCAG\n++\n+GG/GGHHHHHHHHHHHHHHHHHHHDHHHHHFDHHHHHH@HHEHHHHHHHH\n+@ERR030881.41234 HWI-BRUNOP16X_0001:2:1:14816:8065#0/2\n+GGCAGGTTGGGAGGGGGGACAGAGGGGAGACAGAGGCACGGAGAGAAAGG\n++\n+FFGHH55,5514441>><<BHHEHFF?9F4FFFBFHHHHHHHHGHHFF4H\n+@ERR030881.55301 HWI-BRUNOP16X_0001:2:1:7892:11256#0/2\n+CTTCGCAAATTTGTCCCAGGGATGGATCGCCTGTGCTGCCTTCGCCCGCC\n++\n+D@5AA4453451444GGGFDHH@GEA;DDD=:=+:D@DFDEDHHB#####\n+@ERR030881.57346 HWI-BRUNOP16X_0001:2:1:20039:11573#0/2\n+CCTGTCCAGAGTCTGAGGGGGGAGGCCAGGCCCTGCCTTGGGGTCTGAGG\n++\n+##################################################\n+@ERR030881.57608 
HWI-BRUNOP16X_0001:2:1:16788:11614#0/2\n+GGGGGGCGCCGCAGCTGCGCGGCCGCTCCCTCCTAGCCGGCCCTTGAGGG\n++\n+HHHHHHHEGHIHHHHHHDHF@@<A?FFE@FGGGAG4====HHHHHHHEHB\n+@ERR030881.58998 HWI-BRUNOP16X_0001:2:1:14252:11816#0/2\n+CTGAATCCCTTGCCCAGAGGA"..b':6601:197274#0/2\n+CCGCTCCCGAGCTGAACTCTATCATCCGACAGCAGCTCCAAGCCCACCAG\n++\n+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHGH\n+@ERR030881.74446016 HWI-BRUNOP16X_0001:2:68:6384:197508#0/2\n+GGCCCTGCCCTTGACCCCACTACCCGTGGGGCTGCAGCCGCCTTCGCTGC\n++\n+HHHHHHHHHHHHHHHHHHIHHGHHH>A??@FHHHFHHFHDHH=HHB>4FF\n+@ERR030881.74446277 HWI-BRUNOP16X_0001:2:68:20062:197534#0/2\n+CTTTATTTGGGAAGGGGAGGGGGAATCCTGGGTCGCCCACCCTCACCCTG\n++\n+HHHHHHHGGHHHHHHHHFHGGGGGHHHHHHHGHHHHFHHEH9BHEDD###\n+@ERR030881.74446743 HWI-BRUNOP16X_0001:2:68:3752:197585#0/2\n+CGGCCGGCTGCATCCCACACCAGCCTGAGCCCCAGACGGTCAGTCAGTGC\n++\n+HHHHHHHHHHHHIHHHHHHHHHHHHHHHHHHHHHHHHHHEHHHHHHHHHH\n+@ERR030881.74446915 HWI-BRUNOP16X_0001:2:68:8353:197599#0/2\n+CGAGGGGTCCAGAGTGGAGAGAGCCCCGAGCAGGAGTGCATCTCCCTCGC\n++\n+HHHHHHHFHHHHHHHFHHHHHHHHHHHHHHHGHHHHHHHHHHIHFHHHGH\n+@ERR030881.74447547 HWI-BRUNOP16X_0001:2:68:9591:197654#0/2\n+GGCTGCAGATTCCATTCAGCAGGCCCGAGAGCAAGCACCACGCTAGCCTG\n++\n+HHHHHHHHHHHHHHHHHHHHHHHGHHHHHHHHHHHHHHHHHHHHHHHHHG\n+@ERR030881.74449534 HWI-BRUNOP16X_0001:2:68:1488:197840#0/2\n+CAAGACTGCAACTTCAGATGCTCCGCACGCTGGAGATGCTGGACAGGGGC\n++\n+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFFHHHHHHHEHEHE\n+@ERR030881.74453424 HWI-BRUNOP16X_0001:2:68:5325:198191#0/2\n+CTTCCTTGGAGAGGTGGGCCTGGGAACCCAGCGCGGACAGCGAGAGGAGG\n++\n+HHGHHHHHHHHHHHHHHHHHHGHHGHHHHHHHHHHHHHHHHHHHEHHHHG\n+@ERR030881.74454854 HWI-BRUNOP16X_0001:2:68:18716:198301#0/2\n+GGAATGTTTAGCACAAGACACAGCGGAGCTCGGGATTGGCTAAACTCCCA\n++\n+HF@GHD?>DA=<>;=444444245444445>>@>;BECBF@?A<>@AAA8\n+@ERR030881.74455894 HWI-BRUNOP16X_0001:2:68:18831:198398#0/2\n+GGACTGAGGACGACTCCTTGGACTGGAAAATGCTGGCCCCGTACGGCGTC\n++\n+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHDHHHHHHIH\n+@ERR030881.74457151 
HWI-BRUNOP16X_0001:2:68:9093:198528#0/2\n+GGAACCTTCTCCGGATTGGGTTCATGAGCATTTTTGTGGGTGTGTATGTG\n++\n+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHGGHHHHHHHHHHFH\n+@ERR030881.74458067 HWI-BRUNOP16X_0001:2:68:15716:198600#0/2\n+GGCCGTCTTTGACCTGCTCCTGGCTGTTGGCATTGCTGCCTACCCTGGCA\n++\n+55555<@@@@===<655244A??DAC:C?#####################\n+@ERR030881.74460390 HWI-BRUNOP16X_0001:2:68:15056:198815#0/2\n+GGTGAGGCCAGCACCTTGTCCATTTGGGACCTGGCGGCGCCCACCCCCCG\n++\n+5-5449=;==BFFBFDBFDDC>?>>D?DDDHHHHHBFFC@44244<<<<<\n+@ERR030881.74460430 HWI-BRUNOP16X_0001:2:68:19789:198814#0/2\n+ATGATGTTTCCACAAAGCAGGCATTCGGGCTCCTCGCACCTACCCCAGCA\n++\n+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHGHHHHHHHHHHHHHHHHHHH\n+@ERR030881.74460883 HWI-BRUNOP16X_0001:2:68:19795:198864#0/2\n+TGCTGCGGGTGTCTCCGGCTGGGCATGCGGGGGCCCGGGGACTGCCTGGC\n++\n+HHHHHHHHHHHHGHHHHFDEBDDBB5552*DDBBFHHHHH@FDF######\n+@ERR030881.74463349 HWI-BRUNOP16X_0001:2:68:7211:199081#0/2\n+CTGGTCTCCCATCTGACCCTCCAGGCCTTAGCTTGCCTCACATGTCAGGG\n++\n+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFH\n+@ERR030881.74463429 HWI-BRUNOP16X_0001:2:68:16435:199090#0/2\n+GGACCTGGGCACAAATCCCGTTCAGCCTTTTGACGATCTCAGCCTGTTTG\n++\n+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH\n+@ERR030881.74466171 HWI-BRUNOP16X_0001:2:68:1844:199339#0/2\n+GGTGGGGGTCGTGGAGTGGGGGAGGGAGGCCAGCCGGGCTCCAGAGGGGT\n++\n+HHHHHHGGGHGBGEFHHHFHHG9GGC;HHEHHHCHFG@FFAA;=9DD;C7\n+@ERR030881.74466232 HWI-BRUNOP16X_0001:2:68:10444:199339#0/2\n+CCGTTTTGAACATGTGTAACCGACAGTCTGCCTGGGCCACAGCCCTCTCA\n++\n+HHHHHHHHHHHHHHHHHHHHHHHHIHHHHHGHDHGHHHBHCFFFFHHHHH\n+@ERR030881.74466444 HWI-BRUNOP16X_0001:2:68:18815:199349#0/2\n+GGAAGGGCCGGGGCTGCTGCGCTTCGCGAGGTCTTGCTCCCTTGGGACCT\n++\n+HHHHIHIHHHHHGHHHHHHHFHDHFGHHHHHEHEHHHHHHHHHHHHGHHH\n+@ERR030881.74468879 HWI-BRUNOP16X_0001:2:68:9428:199583#0/2\n+GGAGGCTGAAGTGCTGGACAGCCACGTAGGCCATGCCGAGGTAGGCAGCA\n++\n+HFHHHHHHHHHHIHGHHHHHHHHHHHHHHHHHEHHHHGGHH?FHHHHHGH\n+@ERR030881.74470889 
HWI-BRUNOP16X_0001:2:68:4971:199775#0/2\n+GACATATTTGAGAGACACTGGGGAGACAGAATCGACCTGACCTTGCTGAC\n++\n+HHHHHHHHHHHHHHHHHHHHHHH@HHHEHHHFHHHHHGHAHFBEHHGFBG\n+@ERR030881.74471439 HWI-BRUNOP16X_0001:2:68:16981:199816#0/2\n+GTGACACTGCATTGCTGCTGCCAGCACCCCTTGTTAGGGTTTGTAATTGC\n++\n+F8HHHFGGG8DC>A>ADD1?##############################\n+@ERR030881.74471978 HWI-BRUNOP16X_0001:2:68:9605:199866#0/2\n+CTTGTCTTCCTTGGAGAGGTGGGCCTGGGAACCCAGCGCGGACAGCGAGA\n++\n+HHHHHHIHHHHHDHHGHHHGHHHHHHHHHHHHFGHHHHFHHDHHHCHHHH\n' |