Previous changeset: 2:7a68005de299 (2016-11-27) | Next changeset: 4:aecfe10118ed (2017-11-17)
Commit message:
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ngsutils commit 4a3c9f195ba5d899b1a1ce5e80281cdf230f456a
modified:
bam_filter.xml filter.py macros.xml
removed:
ngsutils/__init__.py ngsutils/__init__.pyc ngsutils/bam/__init__.py ngsutils/bam/__init__.pyc ngsutils/bed/__init__.py ngsutils/bed/__init__.pyc ngsutils/support/__init__.py ngsutils/support/__init__.pyc ngsutils/support/bgzip.py ngsutils/support/dbsnp.py ngsutils/support/dbsnp.pyc ngsutils/support/llh.py ngsutils/support/ngs_utils.py ngsutils/support/ngs_utils.pyc ngsutils/support/regions.py ngsutils/support/stats.py
diff -r 7a68005de299 -r 9b9ae5963d3c bam_filter.xml
--- a/bam_filter.xml	Sun Nov 27 15:01:21 2016 -0500
+++ b/bam_filter.xml	Mon Oct 23 13:27:02 2017 -0400
@@ -1,4 +1,4 @@
-<tool id="ngsutils_bam_filter" name="BAM filter" version="@WRAPPER_VERSION@.1">
+<tool id="ngsutils_bam_filter" name="BAM filter" version="@WRAPPER_VERSION@">
     <description>Removes reads from a BAM file based on criteria</description>
     <macros>
         <import>macros.xml</import>
@@ -7,67 +7,61 @@
     <expand macro="stdio" />
     <expand macro="version" />
     <command><![CDATA[
-
-        ## If the tool is executed with no filtering option,
-        ## the default parameters simply copy over the input file
-        if grep -q "\w" ${parameters};
-        then
-            $__tool_directory__/filter.py
-            $infile
-            $outfile
-            `cat ${parameters}`;
-        else
-            cp $infile $outfile;
-        fi
-
-]]>
-    </command>
+## If the tool is executed with no filtering option,
+## the default parameters simply copy over the input file
+if grep -q "\w" '${parameters}'; then
+    '$__tool_directory__/filter.py'
+    '$infile'
+    '$outfile'
+    `cat ${parameters}`;
+else
+    cp '$infile' '$outfile';
+fi
+    ]]></command>
     <configfiles>
-        <configfile name="parameters">
-<![CDATA[
-    #if $minlen:
-        -minlen $minlen
-    #end if
-    #if $maxlen
-        -maxlen $maxlen
-    #end if
-    $mapped
-    $unmapped
-    $properpair
-    $noproperpair
-    #if $mask:
-        -mask ${mask}
-    #end if
-    #if int($uniq) > -1:
-        -uniq
-        #if int($uniq) > 0:
-            $uniq
-        #end if
-    #end if
-    $uniq_start
-    #if $mismatch:
-        -mismatch $mismatch
-    #end if
-    $nosecondary
-    $noqcfail
-    $nopcrdup
-    #if $excludebed:
-        -excludebed ${excludebed} $ignore_strand
-    #end if
-    #if $includebed:
-        -includebed ${includebed} $ignore_strand
-    #end if
-    #if $includeref:
-        -includeref ${includeref}
-    #end if
-    #if $excluderef:
-        -excluderef ${excluderef}
-    #end if
-    #if $maximum_mismatch_ratio
-        -maximum_mismatch_ratio $maximum_mismatch_ratio
-    #end if
-    ]]>
-        </configfile>
+        <configfile name="parameters"><![CDATA[
+#if $minlen:
+    -minlen $minlen
+#end if
+#if $maxlen
+    -maxlen $maxlen
+#end if
+$mapped
+$unmapped
+$properpair
+$noproperpair
+#if $mask:
+    -mask ${mask}
+#end if
+#if int($uniq) > -1:
+    -uniq
+    #if int($uniq) > 0:
+        $uniq
+    #end if
+#end if
+$uniq_start
+#if $mismatch:
+    -mismatch $mismatch
+#end if
+$nosecondary
+$noqcfail
+$nopcrdup
+#if $excludebed:
+    -excludebed ${excludebed} $ignore_strand
+#end if
+#if $includebed:
+    -includebed ${includebed} $ignore_strand
+#end if
+#if $includeref:
+    -includeref ${includeref}
+#end if
+#if $excluderef:
+    -excluderef ${excluderef}
+#end if
+#if $maximum_mismatch_ratio
+    -maximum_mismatch_ratio $maximum_mismatch_ratio
+#end if
+    ]]></configfile>
     </configfiles>
     <inputs>
         <param name="infile" type="data" format="bam" label="Select BAM dataset" />
@@ -129,7 +123,6 @@
         <param argument="-maximum_mismatch_ratio" type="float" value="" optional="True" min="0.0" max="1.0"
             label="Filter by maximum mismatch ratio" help="fraction of length"/>
-
     </inputs>
     <outputs>
         <data format="bam" name="outfile" />
@@ -234,6 +227,5 @@
 | -excluderef refname           | Exclude reads mapped to a particular reference  |
 |                                | (e.g. chrM, or _dup chromosomes)                |
 +--------------------------------+-------------------------------------------------+
-]]>
-    </help>
+    ]]></help>
 </tool>
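Note: the reworked <command> block above runs filter.py only when the Cheetah-generated parameters file contains at least one word character; otherwise it copies the input BAM through unchanged. A minimal Python sketch of that control flow, for orientation only (run_wrapper and its file-name arguments are hypothetical, not part of the changeset):

import re
import shutil
import subprocess
import sys


def run_wrapper(params_path, infile, outfile, filter_py='filter.py'):
    # Read the parameters file produced by the <configfile> template.
    with open(params_path) as fh:
        params = fh.read()
    if re.search(r'\w', params):
        # Mirrors `grep -q "\w"`: at least one criterion was selected, so
        # pass the whitespace-split flags through to filter.py (the shell's
        # `cat ${parameters}` performs the same word-splitting).
        args = params.split()
        subprocess.check_call([sys.executable, filter_py, infile, outfile] + args)
    else:
        # No criteria selected: the output is a plain copy of the input.
        shutil.copy(infile, outfile)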
diff -r 7a68005de299 -r 9b9ae5963d3c filter.py
--- a/filter.py	Sun Nov 27 15:01:21 2016 -0500
+++ b/filter.py	Mon Oct 23 13:27:02 2017 -0400
@@ -107,6 +107,7 @@
 
 The tag type (:i, :f, :Z) is optional.
 """
+from __future__ import print_function
 
 import os
 import sys
@@ -118,8 +119,8 @@
 
 
 def usage():
-    print __doc__
-    print """
+    print(__doc__)
+    print("""
 Usage: bamutils filter in.bam out.bam {-failed out.txt} criteria...
 
 Options:
@@ -131,7 +132,7 @@
 
 This will remove all unmapped reads, as well as any reads
 that have an AS:i value less than 1000.
-"""
+""")
     sys.exit(1)
 
 
@@ -760,10 +761,10 @@
             # guess at type...
             try:
                 self.value = int(value)
-            except:
+            except ValueError:
                 try:
                     self.value = float(value)
-                except:
+                except ValueError:
                     self.value = value
 
     def get_value(self, read):
@@ -960,7 +961,7 @@
             outfile = arg
         elif arg[0] == '-':
             if not arg[1:] in _criteria:
-                print "Unknown criterion: %s" % arg
+                print("Unknown criterion: %s" % arg)
                 fail = True
             if crit_args:
                 criteria.append(_criteria[crit_args[0][1:]](*crit_args[1:]))
@@ -968,7 +969,7 @@
         elif crit_args:
             crit_args.append(arg)
         else:
-            print "Unknown argument: %s" % arg
+            print("Unknown argument: %s" % arg)
             fail = True
 
     if not fail and crit_args:
@@ -979,11 +980,11 @@
         usage()
 
     if not infile:
-        print "Missing: input bamfile"
+        print("Missing: input bamfile")
     if not outfile:
-        print "Missing: output bamfile"
+        print("Missing: output bamfile")
     if not criteria:
-        print "Missing: filtering criteria"
+        print("Missing: filtering criteria")
         usage()
     else:
         bam_filter(infile, outfile, criteria, failed, verbose)
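The filter.py hunks are a straightforward Python 3 compatibility pass: print statements become print() calls, and the two bare except: clauses in the tag-value guessing code are narrowed to except ValueError: so that unrelated errors still propagate. A self-contained sketch of that narrowed type-guessing idiom (guess_value is a hypothetical name; in filter.py the equivalent logic sits inside a criterion class):

def guess_value(value):
    # Try int first, then float, falling back to the raw string.
    # Catching ValueError specifically (rather than a bare `except`)
    # avoids swallowing things like KeyboardInterrupt or a NameError
    # from an unrelated bug.
    try:
        return int(value)
    except ValueError:
        try:
            return float(value)
        except ValueError:
            return value


assert guess_value('1000') == 1000
assert guess_value('0.95') == 0.95
assert guess_value('chrM') == 'chrM'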
diff -r 7a68005de299 -r 9b9ae5963d3c macros.xml
--- a/macros.xml	Sun Nov 27 15:01:21 2016 -0500
+++ b/macros.xml	Mon Oct 23 13:27:02 2017 -0400
@@ -1,8 +1,8 @@
 <macros>
-    <token name="@WRAPPER_VERSION@">0.5.8</token>
+    <token name="@WRAPPER_VERSION@">0.5.9</token>
     <xml name="requirements">
        <requirements>
-            <requirement type="package" version="0.9.1.4">pysam</requirement>
+            <requirement type="package" version="@WRAPPER_VERSION@">ngsutils</requirement>
        </requirements>
    </xml>
    <xml name="version">
diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/__init__.pyc |
Binary file ngsutils/__init__.pyc has changed |
diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/bam/__init__.py
--- a/ngsutils/bam/__init__.py	Sun Nov 27 15:01:21 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,880 +0,0 @@
(entire file deleted: bundled BAM helpers, including bam_open, bam_iter, bam_pileup_iter, bam_batch_reads, the CIGAR utilities, and read_to_unmapped)
diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/bam/__init__.pyc |
Binary file ngsutils/bam/__init__.pyc has changed |
diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/bed/__init__.py
--- a/ngsutils/bed/__init__.py	Sun Nov 27 15:01:21 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,282 +0,0 @@
(entire file deleted: bundled BED helpers, including BedStreamer, BedFile with ~100 kb binning and optional Tabix index support, and BedRegion)
diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/bed/__init__.pyc |
Binary file ngsutils/bed/__init__.pyc has changed |
diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/support/__init__.py
--- a/ngsutils/support/__init__.py	Sun Nov 27 15:01:21 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,254 +0,0 @@
(entire file deleted: bundled support helpers, including FASTARead, FASTA, gzip_reader, Symbolize, revcomp, Counts, memoize, and quoted_split)
diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/support/__init__.pyc |
Binary file ngsutils/support/__init__.pyc has changed |
diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/support/bgzip.py
--- a/ngsutils/support/bgzip.py	Sun Nov 27 15:01:21 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,138 +0,0 @@
(entire file deleted: the BGZip class for extracting and dumping the blocks of a bgzip/BAM archive)
diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/support/dbsnp.py
--- a/ngsutils/support/dbsnp.py	Sun Nov 27 15:01:21 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,142 +0,0 @@
(entire file deleted: the SNPRecord and DBSNP classes for querying a dbSNP Tabix dump from UCSC)
diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/support/dbsnp.pyc |
Binary file ngsutils/support/dbsnp.pyc has changed |
diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/support/llh.py
--- a/ngsutils/support/llh.py	Sun Nov 27 15:01:21 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,56 +0,0 @@
(entire file deleted: the pseudo_count and calc_llh helpers for nucleotide-frequency log-likelihoods)
diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/support/ngs_utils.py
--- a/ngsutils/support/ngs_utils.py	Sun Nov 27 15:01:21 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,225 +0,0 @@
(entire file deleted: general utilities, including format_number, natural_sort, dictify, gzip_aware_open/gzip_opener, filenames_to_uniq, parse_args, and a memoize decorator)
diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/support/ngs_utils.pyc |
Binary file ngsutils/support/ngs_utils.pyc has changed |
diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/support/regions.py
--- a/ngsutils/support/regions.py	Sun Nov 27 15:01:21 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,166 +0,0 @@
(entire file deleted: the RangeMatch and RegionTagger classes for tagging reads by genomic region)
diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/support/stats.py
--- a/ngsutils/support/stats.py	Sun Nov 27 15:01:21 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,166 +0,0 @@
(entire file deleted: statistics helpers, including median, mean_stdev, counts_median, counts_mean_stdev, poisson_prob, and poisson_func)