Commit message:
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ngsutils commit 09194687c74a424732f8b0c017cbb942aad89068

added:
bam_filter.xml
filter.py
macros.xml
ngsutils/__init__.py
ngsutils/__init__.pyc
ngsutils/bam/__init__.py
ngsutils/bam/__init__.pyc
ngsutils/bed/__init__.py
ngsutils/bed/__init__.pyc
ngsutils/support/__init__.py
ngsutils/support/__init__.pyc
ngsutils/support/bgzip.py
ngsutils/support/dbsnp.py
ngsutils/support/dbsnp.pyc
ngsutils/support/llh.py
ngsutils/support/ngs_utils.py
ngsutils/support/ngs_utils.pyc
ngsutils/support/regions.py
ngsutils/support/stats.py
test-data/ngsutils_bam_filter_input1.bam
test-data/ngsutils_bam_filter_result1.bam
test-data/ngsutils_bam_filter_result2.bam
test-data/ngsutils_bam_filter_result3.bam
test-data/ngsutils_bam_filter_result4.bam
tool_dependencies.xml
diff -r 000000000000 -r 4e4e4093d65d bam_filter.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bam_filter.xml	Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,231 @@
+<tool id="ngsutils_bam_filter" name="BAM filter" version="@WRAPPER_VERSION@.0">
+    <description>Removes reads from a BAM file based on criteria</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements" />
+    <expand macro="stdio" />
+    <expand macro="version" />
+    <command><![CDATA[
+        ## If the tool is executed with no filtering option,
+        ## the default parameters simply copy over the input file
+        if grep -q "\w" ${parameters};
+        then
+            $__tool_directory__/filter.py
+            $infile
+            $outfile
+            `cat ${parameters}`;
+        else
+            cp $infile $outfile;
+        fi
+]]>
+    </command>
+    <configfiles>
+        <configfile name="parameters">
+<![CDATA[
+        #if $minlen:
+            -minlen $minlen
+        #end if
+        #if $maxlen
+            -maxlen $maxlen
+        #end if
+        $mapped
+        $unmapped
+        $properpair
+        $noproperpair
+        #if $mask:
+            -mask "${mask}"
+        #end if
+        #if int($uniq) > -1:
+            -uniq
+            #if int($uniq) > 0:
+                $uniq
+            #end if
+        #end if
+        $uniq_start
+        #if $mismatch:
+            -mismatch $mismatch
+        #end if
+        $nosecondary
+        $noqcfail
+        $nopcrdup
+        #if $excludebed:
+            -excludebed "${excludebed}" $ignore_strand
+        #end if
+        #if $includebed:
+            -includebed "${includebed}" $ignore_strand
+        #end if
+        #if $includeref:
+            -includeref "${includeref}"
+        #end if
+        #if $excluderef:
+            -excluderef "${excluderef}"
+        #end if
+        #if $maximum_mismatch_ratio
+            -maximum_mismatch_ratio $maximum_mismatch_ratio
+        #end if
+]]>
+        </configfile>
+    </configfiles>
+    <inputs>
+        <param name="infile" type="data" format="bam" label="Select BAM dataset" />
+        <param argument="-minlen" type="integer" value="" optional="True" min="0"
+            label="Remove reads that are smaller than"
+            help="in bp"/>
+        <param argument="-maxlen" type="integer" value="" optional="True" min="0"
+            label="Remove reads that are larger than"
+            help="in bp"/>
+        <param argument="-mapped" truevalue="-mapped" type="boolean" falsevalue=""
+            label="Keep only mapped reads"
+            help="" />
+        <param argument="-unmapped" truevalue="-unmapped" type="boolean" falsevalue=""
+            label="Keep only unmapped reads"
+            help="" />
+        <param argument="-properpair" truevalue="-properpair" type="boolean" falsevalue=""
+            label="Keep only properly paired reads"
+            help="both mapped, correct orientation, flag set in BAM" />
+        <param argument="-noproperpair" truevalue="-noproperpair" type="boolean" falsevalue=""
+            label="Discard properly paired reads"
+            help="" />
+        <param argument="-mask" type="text" value="" optional="True" label="Remove reads that match the mask"
+            help="e.g. 0x400, 0x2" />
+
+        <param argument="-uniq" type="integer" value="-1" optional="True" min="-1"
+            label="Remove reads that have the same sequence"
+            help="up to the Nth nucleotide starting from the 5prime end. -1 means do not filter, 0 means check along the entire read,
+                  10 would make filter reads that are unique over the first 10 nucleotides."/>
+
+        <param argument="-uniq_start" truevalue="-uniq_start" type="boolean" falsevalue=""
+            label="Remove reads that start at the same position"
+            help="Use only for low-coverage samples" />
+
+        <param argument="-mismatch" type="integer" value="" optional="True" min="0"
+            label="Remove reads with that many mismatches"
+            help="Indels always counts as 1 regardless of length. Requires NM tag to
[... the dump is truncated here; the diff resumes inside the <help> section's option table ...]
+--------------------------------+--------------------------------------------------+
+| -maxlen val                    | Remove reads that are larger than {val}          |
+--------------------------------+--------------------------------------------------+
+| -mapped                        | Keep only mapped reads                           |
+--------------------------------+--------------------------------------------------+
+| -unmapped                      | Keep only unmapped reads                         |
+--------------------------------+--------------------------------------------------+
+| -properpair                    | Keep only properly paired reads (both mapped,    |
+|                                | correct orientation, flag set in BAM)            |
+--------------------------------+--------------------------------------------------+
+| -noproperpair                  | Keep only not-properly paired reads              |
+--------------------------------+--------------------------------------------------+
+| -mask bitmask                  | Remove reads that match the mask (e.g. 0x400)    |
+--------------------------------+--------------------------------------------------+
+| -uniq {length}                 | Remove reads that are have the same sequence     |
+|                                | Note: BAM file should be sorted                  |
+|                                | (up to an optional length)                       |
+--------------------------------+--------------------------------------------------+
+| -uniq_start                    | Remove reads that start at the same position     |
+|                                | Note: BAM file should be sorted                  |
+|                                | (Use only for low-coverage samples)              |
+--------------------------------+--------------------------------------------------+
+| -mismatch num                  | Number of mismatches or indels                   |
+|                                | indel always counts as 1 regardless of length    |
+|                                | (requires NM tag in reads)                       |
+--------------------------------+--------------------------------------------------+
+| -nosecondary                   | Remove reads that have the 0x100 flag set        |
+--------------------------------+--------------------------------------------------+
+| -noqcfail                      | Remove reads that have the 0x200 flag set        |
+--------------------------------+--------------------------------------------------+
+| -nopcrdup                      | Remove reads that have the 0x400 flag set        |
+--------------------------------+--------------------------------------------------+
+| -excludebed file.bed {nostrand}| Remove reads that are in any of the regions      |
+|                                | from the given BED file. If 'nostrand' is given, |
+|                                | strand information from the BED file is ignored. |
+--------------------------------+--------------------------------------------------+
+| -includebed file.bed {nostrand}| Remove reads that are NOT any of the regions     |
+|                                | from the given BED file. If 'nostrand' is given, |
+|                                | strand information from the BED file is ignored. |
+|                                | Note: If this is a large dataset, use            |
+|                                | "bamutils extract" instead.                      |
+--------------------------------+--------------------------------------------------+
+| -includeref refname            | Exclude reads NOT mapped to a reference          |
+--------------------------------+--------------------------------------------------+
+| -excluderef refname            | Exclude reads mapped to a particular reference   |
+|                                | (e.g. chrM, or _dup chromosomes)                 |
+--------------------------------+--------------------------------------------------+
+]]>
+    </help>
+</tool>
diff -r 000000000000 -r 4e4e4093d65d filter.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/filter.py	Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,986 @@
+#!/usr/bin/env python
+## category General
+## desc Removes reads from a BAM file based on criteria
+"""
+Removes reads from a BAM file based on criteria
+
+Given a BAM file, this script will only allow reads that meet filtering
+criteria to be written to output. The output is another BAM file with the
+reads not matching the criteria removed.
+
+Note: this does not adjust tag values reflecting any filtering. (for example:
+      if a read mapped to two locations (IH:i:2), and one was removed by
+      filtering, the IH:i tag would still read IH:i:2).
+
+Currently, the available filters are:
+    -minlen val        Remove reads that are smaller than {val}
+    -maxlen val        Remove reads that are larger than {val}
+    -mapped            Keep only mapped reads
+    -unmapped          Keep only unmapped reads
+    -properpair        Keep only properly paired reads (both mapped,
+                       correct orientation, flag set in BAM)
+    -noproperpair      Keep only not-properly paired reads
+
+    -mask bitmask      Remove reads that match the mask (base 10/hex)
+    -uniq {length}     Remove reads that are have the same sequence
+                       Note: BAM file should be sorted
+                       (up to an optional length)
+    -uniq_start        Remove reads that start at the same position
+                       Note: BAM file should be sorted
+                       (Use only for low-coverage samples)
+
+    -mismatch num      # mismatches or indels
+                       indel always counts as 1 regardless of length
+                       (requires NM tag in reads)
+
+    -mismatch_dbsnp num dbsnp.txt.bgz
+                       # mismatches or indels - not in dbSNP.
+                       Variations are called using the MD tag.
+                       Variations that are found in the dbSNP list are
+                       not counted as mismatches. The dbSNP list is a
+                       Tabix-indexed dump of dbSNP (from UCSC Genome
+                       Browser). Indels in dbSNP are also counted.
+                       Adds a 'ZS:i' tag with the number of found SNPs
+                       in the read.
+                       (requires NM and MD tags)
+
+                       Example command for indexing:
+                       ngsutils tabixindex snp.txt.gz -s 2 -b 3 -e 4 -0
+
+    -mismatch_ref num ref.fa    # mismatches or indel - looks up mismatches
+                       directly in a reference FASTA file
+                       (use if NM tag not present)
+
+    -mismatch_ref_dbsnp num ref.fa dbsnp.txt.bgz
+                       # mismatches or indels - looks up mismatches
+                       directly from a reference FASTA file. (See
+                       -mismatch_dbsnp for dbSNP matching)
+                       (use if NM or MD tag not present)
+
+    -nosecondary       Remove reads that have the 0x100 flag set
+    -noqcfail          Remove reads that have the 0x200 flag set
+    -nopcrdup          Remove reads that have the 0x400 flag set
+
+    -exclude ref:start-end    Remove reads in this region (1-based start)
+    -excludebed file.bed {nostrand}
+                       Remove reads that are in any of the regions
+                       from the given BED file. If 'nostrand' is given,
+                       strand information from the BED file is ignored.
+
+    -include ref:start-end    Remove reads NOT in the region (can only be one)
+    -includebed file.bed {nostrand}
+                       Remove reads that are NOT any of the regions
+                       from the given BED file. If 'nostrand' is given
[... the dump is truncated here; the diff resumes inside the bam_filter() function ...]
+    if failedfile:
+        failed_out = open(failedfile, 'w')
+    else:
+        failed_out = None
+
+    passed = 0
+    failed = 0
+
+    def _callback(read):
+        return "%s | %s kept,%s failed" % ('%s:%s' % (bamfile.getrname(read.tid), read.pos) if read.tid > -1 else 'unk', passed, failed)
+
+    for read in bam_iter(bamfile, quiet=True):
+        p = True
+
+        for criterion in criteria:
+            if not criterion.filter(bamfile, read):
+                p = False
+                failed += 1
+                if failed_out:
+                    failed_out.write('%s\t%s\n' % (read.qname, criterion))
+                #outfile.write(read_to_unmapped(read))
+                break
+        if p:
+            passed += 1
+            outfile.write(read)
+
+    bamfile.close()
+    outfile.close()
+    if failed_out:
+        failed_out.close()
+    sys.stdout.write("%s kept\n%s failed\n" % (passed, failed))
+
+    for criterion in criteria:
+        criterion.close()
+
+
+def read_to_unmapped(read):
+    '''
+    Example unmapped read
+    2358_2045_1839 | 4 | * | 0 | 0 | * | * | 0 | 0 | GGACGAGAAGGAGTATTTCTCCGAGAACACATTCACGGAGAGTCTAACTC | 0QT\VNO^IIRJKXTIHIKTY\STZZ[XOJKPWLHJQQQ^XPQYIIRQII | PG:Z:bfast | AS:i:- | NH:i:0 | IH:i:1 | HI:i:1 | CS:Z:T10213222020221330022203222011113021130222212230122 | CQ:Z:019=2+><%@.'>52%)'15?90<7;:5+(-49('-7-<>5-@5%<.2%= | CM:i:0 | XA:i:4
+
+    Example mapped read:
+    2216_1487_198 | 16 | chr11:19915291-19925392 | 5531 | 12 | 24M2D26M | * | 0 | 0 | TCTGTCTGGGTGCAGTAGCTATACGTAATCCCAGCACTTTGGGAGGCCAA | 1C8FZ`M""""""WZ""%"#\I;"`R@D]``ZZ``\QKX\Y]````IK^` | PG:Z:bfast | AS:i:1150 | NM:i:2 | NH:i:10 | IH:i:1 | HI:i:1 | MD:Z:24^CT26 | CS:Z:T00103022001002113210023031213331122121223321221122 | CQ:Z:A8=%;AB<;97<3/9:>>3>@?5&18@-%;866:94=14:=?,%?:7&)1 | CM:i:9 | XA:i:4 | XE:Z:-------23322---2-11----2----------------------------
+
+    '''
+    # TODO: rewrite read properties to unmapped state
+    #       if colorspace: convert CS to NT directly
+    #       remove excess tags
+
+    read.is_unmapped = True
+    read.tid = None
+    read.pos = 0
+    read.mapq = 0
+    return read
+
+
+if __name__ == '__main__':
+    infile = None
+    outfile = None
+    failed = None
+    criteria = []
+
+    crit_args = []
+    last = None
+    verbose = False
+    fail = False
+
+    for arg in sys.argv[1:]:
+        if last == '-failed':
+            failed = arg
+            last = None
+        elif arg == '-h':
+            usage()
+        elif arg == '-failed':
+            last = arg
+        elif arg == '-v':
+            verbose = True
+        elif not infile and os.path.exists(arg):
+            infile = arg
+        elif not outfile:
+            outfile = arg
+        elif arg[0] == '-':
+            if not arg[1:] in _criteria:
+                print "Unknown criterion: %s" % arg
+                fail = True
+            if crit_args:
+                criteria.append(_criteria[crit_args[0][1:]](*crit_args[1:]))
+            crit_args = [arg, ]
+        elif crit_args:
+            crit_args.append(arg)
+        else:
+            print "Unknown argument: %s" % arg
+            fail = True
+
+    if not fail and crit_args:
+        criteria.append(_criteria[crit_args[0][1:]](*crit_args[1:]))
+
+    if fail or not infile or not outfile or not criteria:
+        if not infile and not outfile and not criteria:
+            usage()
+
+        if not infile:
+            print "Missing: input bamfile"
+        if not outfile:
+            print "Missing: output bamfile"
+        if not criteria:
+            print "Missing: filtering criteria"
+        usage()
+    else:
+        bam_filter(infile, outfile, criteria, failed, verbose)
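The elided middle of filter.py defines the individual criterion classes that the argument loop above instantiates through the `_criteria` registry. Their shared interface can be read off `bam_filter()`: each criterion exposes `filter(bam, read)` returning True to keep the read, plus `close()`, and its repr is written to the failed-reads file. A hypothetical criterion in that style (the class name and registration are illustrative, not from the changeset):

    class MinLen(object):
        'Illustrative criterion: keep reads whose sequence is at least minlen bp.'
        def __init__(self, minlen):
            self.minlen = int(minlen)

        def filter(self, bam, read):
            # True keeps the read; False rejects it (see bam_filter above)
            return read.seq is not None and len(read.seq) >= self.minlen

        def close(self):
            pass

        def __repr__(self):
            return 'seq length >= %s' % self.minlen

    # Registration would then be: _criteria['minlen'] = MinLen, making a
    # command line such as `filter.py in.bam out.bam -minlen 50` work.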
diff -r 000000000000 -r 4e4e4093d65d macros.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,16 @@
+<macros>
+    <token name="@WRAPPER_VERSION@">0.5.7</token>
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package" version="0.7.7">pysam</requirement>
+        </requirements>
+    </xml>
+    <xml name="version">
+        <version_command>@WRAPPER_VERSION@</version_command>
+    </xml>
+    <xml name="stdio">
+        <stdio>
+            <exit_code range="1:" level="fatal" />
+        </stdio>
+    </xml>
+</macros>
diff -r 000000000000 -r 4e4e4093d65d ngsutils/__init__.pyc
Binary file ngsutils/__init__.pyc has changed
diff -r 000000000000 -r 4e4e4093d65d ngsutils/bam/__init__.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsutils/bam/__init__.py	Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,879 @@
+import sys
+import os
+import re
+import pysam
+try:
+    from eta import ETA
+except:
+    pass
+import ngsutils.support
+
+
+def bam_open(fname, mode='r', *args, **kwargs):
+    if fname.lower()[-4:] == '.bam':
+        return pysam.Samfile(fname, '%sb' % mode, *args, **kwargs)
+    return pysam.Samfile(fname, '%s' % mode, *args, **kwargs)
+
+
+def bam_pileup_iter(bam, mask=1796, quiet=False, callback=None):
+    if not quiet and bam.filename:
+        eta = ETA(os.stat(bam.filename).st_size)
+    else:
+        eta = None
+
+    for pileup in bam.pileup(mask=mask):
+        pos = bam.tell()
+        bgz_offset = pos >> 16
+
+        if not quiet:
+            if callback:
+                eta.print_status(bgz_offset, extra=callback(pileup))
+            else:
+                eta.print_status(bgz_offset, extra='%s:%s' % (bam.getrname(pileup.tid), pileup.pos))
+
+        yield pileup
+
+    if eta:
+        eta.done()
+
+
+def bam_iter(bam, quiet=False, show_ref_pos=False, ref=None, start=None, end=None, callback=None):
+    '''
+    >>> [x.qname for x in bam_iter(bam_open(os.path.join(os.path.dirname(__file__), 't', 'test.bam')), quiet=True)]
+    ['A', 'B', 'E', 'C', 'D', 'F', 'Z']
+    '''
+
+    if os.path.exists('%s.bai' % bam.filename):
+        # This is an indexed file, so it is ref sorted...
+        # Meaning that we should show chrom:pos, instead of read names
+        show_ref_pos = True
+
+    eta = None
+
+    if not ref:
+        if not quiet and bam.filename:
+            eta = ETA(os.stat(bam.filename).st_size)
+
+        for read in bam:
+            pos = bam.tell()
+            bgz_offset = pos >> 16
+
+            if not quiet and eta:
+                if callback:
+                    eta.print_status(bgz_offset, extra=callback(read))
+                elif (show_ref_pos):
+                    if read.tid > -1:
+                        eta.print_status(bgz_offset, extra='%s:%s %s' % (bam.getrname(read.tid), read.pos, read.qname))
+                    else:
+                        eta.print_status(bgz_offset, extra='unmapped %s' % (read.qname))
+                else:
+                    eta.print_status(bgz_offset, extra='%s' % read.qname)
+
+            yield read
+
+    else:
+        working_chrom = None
+        if ref in bam.references:
+            working_chrom = ref
+        elif ref[0:3] == 'chr':
+            # compensate for Ensembl vs UCSC ref naming
+            if ref[3:] in bam.references:
+                working_chrom = ref[3:]
+
+        if not working_chrom:
+            raise ValueError('Missing reference: %s' % ref)
+
+        tid = bam.gettid(working_chrom)
+
+        if not start:
+            start = 0
+        if not end:
+            end = bam.lengths[tid]
+
+        if not quiet and bam.filename:
+            eta = ETA(end - start)
+
+        for read in bam.fetch(working_chrom, start, end):
+            if not quiet and eta:
+                if callback:
+                    eta.print_status(read.pos - start, extra=callback(read))
+                else:
+                    eta.print_status(read.pos - start, extra='%s:%s %s' % (bam.getrname(read.tid), read.pos, read.qname))
+
+            yield read
+
+    if eta:
+        eta.done()
+
+
+def bam_batch_reads(bam, quiet=False):
+    '''
+    Batch mapping for the same reads (qname) together, this way
+    they can all be compared/converted together.
+    '''
+    reads = []
+    last = None
+    for read in bam_iter(bam, quiet=quiet):
+        if last and read.qname != last:
+            yield reads
+            reads = []
+        last = read.qname
+        reads.append(read)
+
+    if reads:
+        yield reads
+
+
+bam_cigar = ['M', 'I', 'D', 'N', 'S', 'H', 'P', '=', 'X']
+bam_cigar_op = {
+    'M': 0,
+    'I': 1,
+    'D': 2,
+    'N': 3,
+    'S': 4,
+    'H': 5,
+    'P': 6,
+    '=': 7,
+    'X': 8,
+}
+
+
+def cigar_fromstr(s):
+    '''
+    >>> cigar_fromstr('10M5I20M')
+    [(0, 10), (1, 5), (0,
[... the dump is truncated here; it resumes mid-line with: "n True, ''" ...]
+
+
+def read_alignment_fragments_gen(read):
+    '''
+    Takes a read and returns the start and end positions for each match/mismatch region.
+    This will let us know where each read alignment "touches" the genome.
+    '''
+
+    for start, end in _read_alignment_fragments_gen(read.pos, read.cigar):
+        yield (start, end)
+
+
+def _read_alignment_fragments_gen(pos, cigar):
+    ''' Test-able version of read_alignment_fragments_gen
+    >>> list(_read_alignment_fragments_gen(1, cigar_fromstr('50M')))
+    [(1, 51)]
+
+    >>> list(_read_alignment_fragments_gen(1, cigar_fromstr('25M100N25M')))
+    [(1, 26), (126, 151)]
+
+    >>> list(_read_alignment_fragments_gen(1, cigar_fromstr('20M1D4M100N10M5I10M')))
+    [(1, 21), (22, 26), (126, 136), (136, 146)]
+    '''
+    ref_pos = pos
+
+    for op, length in cigar:
+        if op == 0:
+            yield (ref_pos, ref_pos + length)
+            ref_pos += length
+        elif op == 1:
+            pass
+        elif op == 2:
+            ref_pos += length
+        elif op == 3:
+            ref_pos += length
+        else:
+            raise ValueError("Unsupported CIGAR operation: %s" % op)
+
+
+def read_cigar_at_pos(cigar, qpos, is_del):
+    '''
+    Returns the CIGAR operation for a given read position
+
+    qpos is the 0-based index of the base in the read
+    '''
+    pos = 0
+    returnnext = False
+    for op, length in cigar:
+        if returnnext:
+            return op
+        if op == 0:
+            pos += length
+        elif op == 1:
+            pos += length
+        elif op == 2:
+            pass
+        elif op == 3:
+            pass
+        elif op == 4:
+            pos += length
+        elif op == 5:
+            pass
+        else:
+            raise ValueError("Unsupported CIGAR operation: %s" % op)
+
+        if pos > qpos:
+            return op
+        if is_del and pos == qpos:
+            returnnext = True
+
+    return None
+
+
+def cleancigar(cigar):
+    '''
+    Cleans a CIGAR alignment to remove zero length operations
+
+    >>> cigar_tostr(cleancigar(cigar_fromstr('31M0N52M18S')))
+    '83M18S'
+
+    '''
+    newcigar = []
+
+    changed = False
+    zero = False
+
+    for op, size in cigar:
+        if size > 0:
+            if zero and newcigar:
+                if newcigar[-1][0] == op:
+                    newsize = newcigar[-1][1] + size
+                    newcigar[-1] = (op, newsize)
+                zero = 0
+            else:
+                newcigar.append((op, size))
+        else:
+            changed = True
+            zero = True
+
+    if changed:
+        return newcigar
+
+    return None
+
+
+def read_cleancigar(read):
+    '''
+    Replaces the CIGAR string for a read to remove any operations that are zero length.
+
+    This happens to the read in-place
+    '''
+    if read.is_unmapped:
+        return False
+
+    newcigar = []
+
+    newcigar = cleancigar(read.cigar)
+
+    if newcigar:
+        read.cigar = newcigar
+        return True
+
+    return False
+
+
+def read_to_unmapped(read, ref=None):
+    '''
+    Converts a read from mapped to unmapped.
+
+    Sets the 'ZR' tag to indicate the original ref/pos/cigar (if ref is passed)
+    '''
+
+    newread = pysam.AlignedRead()
+
+    if ref:
+        tags = [('ZR', '%s:%s:%s' % (ref, read.pos, cigar_tostr(read.cigar)))]
+
+    newread.is_unmapped = True
+    newread.mapq = 0
+    newread.tlen = 0
+    newread.pos = -1
+    newread.pnext = -1
+    newread.rnext = -1
+    newread.tid = -1
+
+    newread.qname = read.qname
+
+    if read.is_paired:
+        newread.is_paired = True
+
+    if not read.is_unmapped and read.is_reverse:
+        newread.seq = ngsutils.support.revcomp(read.seq)
+        newread.qual = read.qual[::-1]
+    else:
+        newread.seq = read.seq
+        newread.qual = read.qual
+
+    newread.tags = tags
+
+    return newread
+
+
+if __name__ == '__main__':
+    import doctest
+    doctest.testmod()
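The doctest for cigar_fromstr is cut off in the dump above; together with the `bam_cigar_op` table its behavior is a simple mapping between CIGAR strings and (op, length) pairs. A self-contained sketch of the round trip, re-implemented here for illustration and matching the visible doctests:

    import re

    BAM_CIGAR = 'MIDNSHP=X'  # op codes 0..8, same order as bam_cigar above

    def cigar_fromstr(s):
        # '10M5I20M' -> [(0, 10), (1, 5), (0, 20)]
        return [(BAM_CIGAR.index(op), int(num))
                for num, op in re.findall(r'(\d+)([MIDNSHP=X])', s)]

    def cigar_tostr(cigar):
        # [(0, 10), (1, 5), (0, 20)] -> '10M5I20M'
        return ''.join('%d%s' % (num, BAM_CIGAR[op]) for op, num in cigar)

    assert cigar_fromstr('10M5I20M') == [(0, 10), (1, 5), (0, 20)]
    assert cigar_tostr(cigar_fromstr('20M1D4M100N10M5I10M')) == '20M1D4M100N10M5I10M'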
diff -r 000000000000 -r 4e4e4093d65d ngsutils/bam/__init__.pyc
Binary file ngsutils/bam/__init__.pyc has changed
diff -r 000000000000 -r 4e4e4093d65d ngsutils/bed/__init__.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsutils/bed/__init__.py	Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,282 @@
+import os
+import ngsutils.support.ngs_utils
+import pysam
+
+
+class BedStreamer(object):
+    '''
+    Streams BedRegions from a BED file
+
+    Note - this can only be used once! There is no mechanism to seek the stream.
+    '''
+
+    def __init__(self, fname=None, fileobj=None, quiet=False):
+        if not fname and not fileobj:
+            raise ValueError("You must specify either fname or fileobj!")
+
+        self.reader = ngsutils.support.gzip_reader(fname=fname, quiet=quiet, fileobj=fileobj)
+
+    def __iter__(self):
+        return self
+
+    def next(self):
+        try:
+            while True:
+                line = self.reader.next().strip()
+                if line and line[0] != '#':
+                    cols = line.split('\t')
+                    while len(cols) < 6:
+                        cols.append('')
+
+                    return BedRegion(*cols)
+        except:
+            raise StopIteration
+
+
+class BedFile(object):
+    '''
+    BED files are read in their entirety memory, in a series of bins. Each bin
+    is ~100kb in size. Each bin can then be iterated over.
+
+    This is less efficient than using a proper index, but in reality, this
+    usually isn't an issue. However, if the BED file has been Tabix indexed,
+    that index will be used for random access.
+
+    NOTE: This isn't very efficient, so perhaps this can be remodeled into a BedFile
+    and a BedFileIndex where the file is indexed only if random access is requested.
+    '''
+
+    _bin_const = 100000
+
+    def __init__(self, fname=None, fileobj=None, region=None):
+        self._bins = {}
+        self._bin_list = []
+        self._cur_bin_idx = 0
+        self._cur_bin_pos = 0
+        self._tellpos = 0
+        self._total = 0
+        self._length = 0
+        self.__tabix = None
+
+        self.filename = fname
+
+        if os.path.exists('%s.tbi' % fname):
+            self.__tabix = pysam.Tabixfile(fname)
+
+        if fileobj:
+            self.__readfile(fileobj)
+        elif fname:
+            with ngsutils.support.ngs_utils.gzip_opener(fname) as fobj:
+                self.__readfile(fobj)
+        elif region:
+            chrom, startend = region.split(':')
+            if '-' in startend:
+                start, end = [int(x) for x in startend.split('-')]
+            else:
+                start = int(startend)
+                end = start
+            start -= 1
+
+            self.__add_region(BedRegion(chrom, start, end))
+        else:
+            raise ValueError("Must specify either filename, fileobj, or region")
+
+    def __readfile(self, fobj):
+        for line in fobj:
+            line = line.strip()
+            if line and line[0] != '#':
+                cols = line.split('\t')
+                while len(cols) < 6:
+                    cols.append('')
+
+                region = BedRegion(*cols)
+                self.__add_region(region)
+
+        self._bin_list.sort()
+        for bin in self._bins:
+            self._bins[bin].sort()
+
+    def __add_region(self, region):
+        self._total += region.end - region.start
+        self._length += 1
+
+        startbin = region.start / BedFile._bin_const
+        endbin = region.end / BedFile._bin_const
+
+        for bin in xrange(startbin, endbin + 1):
+            if not (region.chrom, bin) in self._bins:
+                self._bin_list.append((region.chrom, bin))
+                self._bins[(region.chrom, bin)] = []
+            self._bins[(region.chrom, bin)].append(region)
+
+    def fetch(self, chrom, start, end, strand=None):
+        '''
+        For TABIX indexed BED files, find all regions w/in a range
+
+        For non-TABIX index BED files, use the calculated bins, and
+        output matching regions
+        '''
+
+        if self.__tabix:
+            for match in self.__tabix.fetch(chrom, start, end):
+                region = BedRegion(*match.split('\t'))
+                if not strand or (strand
[... the dump is truncated here; the diff resumes inside the BedFile class ...]
+
+    def close(self):
+        pass
+
+    @property
+    def length(self):
+        return self._length
+
+    @property
+    def total(self):
+        return self._total
+
+    def __iter__(self):
+        self._cur_bin_idx = 0
+        self._cur_bin_pos = 0
+        self._tellpos = 0
+        return self
+
+    def next(self):
+        if self._cur_bin_idx >= len(self._bin_list):
+            raise StopIteration
+
+        binvals = self._bins[self._bin_list[self._cur_bin_idx]]
+        while self._cur_bin_pos < len(binvals):
+            val = binvals[self._cur_bin_pos]
+            self._cur_bin_pos += 1
+
+            startbin = (val.chrom, val.start / BedFile._bin_const)
+            if startbin == self._bin_list[self._cur_bin_idx]:
+                self._tellpos += 1
+                return val
+
+        self._cur_bin_idx += 1
+        self._cur_bin_pos = 0
+        return self.next()
+
+
+class BedRegion(object):
+    def __init__(self, chrom, start, end, name='', score='', strand='', thickStart='', thickEnd='', rgb='', *args):
+        self.chrom = chrom
+        self.start = int(start)
+        self.end = int(end)
+        self.name = name
+
+        if score == '':
+            self.score = 0
+        else:
+            self.score = float(score)
+
+        if strand == '':
+            self.strand = None
+        else:
+            self.strand = strand
+
+        if thickStart == '':
+            self.thickStart = None
+        else:
+            self.thickStart = thickStart
+
+        if thickEnd == '':
+            self.thickEnd = None
+        else:
+            self.thickEnd = thickEnd
+
+        if rgb == '':
+            self.rgb = None
+        else:
+            self.rgb = rgb
+
+        self.extras = args
+
+    def clone(self, chrom=None, start=None, end=None, name=None, score=None, strand=None, thickStart=None, thickEnd=None, rgb=None, *args):
+        cols = []
+        cols.append(self.chrom if chrom is None else chrom)
+        cols.append(self.start if start is None else start)
+        cols.append(self.end if end is None else end)
+        cols.append(self.name if name is None else name)
+        cols.append(self.score if score is None else score)
+        cols.append(self.strand if strand is None else strand)
+        cols.append(self.thickStart if thickStart is None else thickStart)
+        cols.append(self.thickEnd if thickEnd is None else thickEnd)
+        cols.append(self.rgb if rgb is None else rgb)
+
+        for i, val in enumerate(self.extras):
+            if len(args) > i:
+                cols.append(args[i])
+            else:
+                cols.append(val)
+
+        return BedRegion(*cols)
+
+    @property
+    def score_int(self):
+        score = str(self.score)
+        if score[-2:] == '.0':
+            score = score[:-2]
+
+        return score
+
+    def __key(self):
+        return (self.chrom, self.start, self.end, self.strand, self.name)
+
+    def __lt__(self, other):
+        return self.__key() < other.__key()
+
+    def __gt__(self, other):
+        return self.__key() > other.__key()
+
+    def __eq__(self, other):
+        return self.__key() == other.__key()
+
+    def write(self, out):
+        out.write('%s\n' % self)
+
+    def __repr__(self):
+        outcols = []
+
+        if self.rgb:
+            outcols.append(self.rgb)
+        if self.thickEnd or outcols:
+            outcols.append(self.thickEnd if self.thickEnd else self.end)
+        if self.thickStart or outcols:
+            outcols.append(self.thickStart if self.thickStart else self.start)
+        if self.strand or outcols:
+            outcols.append(self.strand)
+        if self.score_int != '' or outcols:
+            outcols.append(self.score_int)
+        if self.name or outcols:
+            outcols.append(self.name)
+
+        outcols.append(self.end)
+        outcols.append(self.start)
+        outcols.append(self.chrom)
+
+        return '\t'.join([str(x) for x in outcols[::-1]])
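BedFile's ~100kb binning can be illustrated independently of the class: a region is registered in every bin its span overlaps, so a point query only has to scan one bin. A minimal sketch (integer division written as // so it also runs under Python 3; the repo code above uses Python 2's / on ints):

    from collections import defaultdict

    BIN = 100000  # same constant as BedFile._bin_const

    bins = defaultdict(list)

    def add_region(chrom, start, end):
        # register the region in every bin its span touches
        for b in range(start // BIN, end // BIN + 1):
            bins[(chrom, b)].append((start, end))

    def overlapping(chrom, pos):
        # only the one bin containing pos needs to be scanned
        return [(s, e) for s, e in bins[(chrom, pos // BIN)] if s <= pos < e]

    add_region('chr1', 99990, 100010)   # spans two bins
    print(overlapping('chr1', 100005))  # -> [(99990, 100010)]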
diff -r 000000000000 -r 4e4e4093d65d ngsutils/bed/__init__.pyc
Binary file ngsutils/bed/__init__.pyc has changed
diff -r 000000000000 -r 4e4e4093d65d ngsutils/support/__init__.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsutils/support/__init__.py	Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,249 @@
+import collections
+import gzip
+import os
+import sys
+import re
+try:
+    from eta import ETA
+except:
+    pass
+
+
+class FASTARead(collections.namedtuple('FASTARecord', 'name comment seq')):
+    def __repr__(self):
+        if self.comment:
+            return '>%s %s\n%s\n' % (self.name, self.comment, self.seq)
+        return '>%s\n%s\n' % (self.name, self.seq)
+
+    def subseq(self, start, end, comment=None):
+        if self.comment:
+            comment = '%s %s' % (self.comment, comment)
+
+        return FASTARead(self.name, comment, self.seq[start:end])
+
+    def clone(self, name=None, comment=None, seq=None):
+        n = name if name else self.name
+        c = comment if comment else self.comment
+        s = seq if seq else self.seq
+
+        return FASTARead(n, c, s)
+
+    def write(self, out):
+        out.write(repr(self))
+
+
+class FASTA(object):
+    def __init__(self, fname=None, fileobj=None, qual=False):
+        self.fname = fname
+        self.qual = qual
+        if fileobj:
+            self.fileobj = fileobj
+        else:
+            if self.fname == '-':
+                self.fileobj = sys.stdin
+            elif self.fname[-3:] == '.gz' or self.fname[-4:] == '.bgz':
+                self.fileobj = gzip.open(os.path.expanduser(self.fname))
+            else:
+                self.fileobj = open(os.path.expanduser(self.fname))
+
+        if not self.fileobj:
+            raise ValueError("Missing valid filename or fileobj")
+
+    def close(self):
+        if self.fileobj != sys.stdout:
+            self.fileobj.close()
+
+    def tell(self):
+        # always relative to uncompressed...
+        return self.fileobj.tell()
+
+    def seek(self, pos, whence=0):
+        self.fileobj.seek(pos, whence)
+
+    def fetch(self, quiet=False):
+        name = ''
+        comment = ''
+        seq = ''
+
+        if not quiet and self.fname and self.fname != '-':
+            eta = ETA(os.stat(self.fname).st_size, fileobj=self.fileobj)
+        else:
+            eta = None
+
+        for line in self.fileobj:
+            line = line.strip()
+            if not line:
+                continue
+            if line[0] == '#':
+                continue
+
+            if line[0] == '>':
+                if name and seq:
+                    if eta:
+                        eta.print_status(extra=name)
+                    yield FASTARead(name, comment, seq)
+
+                spl = re.split(r'[ \t]', line[1:], maxsplit=1)
+                name = spl[0]
+                if len(spl) > 1:
+                    comment = spl[1]
+                else:
+                    comment = ''
+                seq = ''
+
+            else:
+                if self.qual:
+                    seq = seq + ' ' + line
+                else:
+                    seq += line
+
+        if name and seq:
+            if eta:
+                eta.print_status(extra=name)
+            yield FASTARead(name, comment, seq)
+
+        if eta:
+            eta.done()
+
+
+def gzip_reader(fname, quiet=False, callback=None, done_callback=None, fileobj=None):
+    if fileobj:
+        f = fileobj
+    elif fname == '-':
+        f = sys.stdin
+    elif fname[-3:] == '.gz' or fname[-4:] == '.bgz':
+        f = gzip.open(os.path.expanduser(fname))
+    else:
+        f = open(os.path.expanduser(fname))
+
+    if quiet or fname == '-':
+        eta = None
+    else:
+        eta = ETA(os.stat(fname).st_size, fileobj=f)
+
+    for line in f:
+        if eta:
+            if callback:
+                extra = callback()
+            else:
+                extra = ''
+
+            eta.print_status(extra=extra)
+        yield line
+
+        if done_callback and done_callback():
+            break
+
+    if f != sys.stdin:
+        f.close()
+
+    if eta:
+        eta.done()
+
+
+class Symbolize(object):
+    'Converts strings to symbols - basically a cache of strings'
+    def __init__(self):
+        self.__cache = {}
+
+    def __getitem__(self, k):
+        if not k in self.__cache:
+            self.__cache[k] = k
+
+        return self.__cache[k]
+
+symbols = Symbolize()
+
+_compliments = {
+    'a': 't',
+    'A': 'T',
+    'c': 'g',
+    'C': 'G',
+    'g': 'c',
+    'G': 'C',
+    't': 'a',
+    'T': 'A',
+    'n': 'n',
+    'N': 'N'
+}
+
+
+def revcomp(seq):
+    '''
+    >>> revcomp('ATCGatcg')
+    'cgatCGAT'
+    '''
+    ret = []
+
+    for s in seq:
+        ret.append(_compliments[s])
+
+    ret.reverse()
+    return ''.join(ret)
+
+
+class Counts(object):
+    '''
+    Setup simple binning. Bins are continuous 0->max. Values are added to
+    bins and then means / distributions can be calculated.
+    '''
+    def __init__(self):
+        self.bins = []
+
+    def add(self, val):
+        while len(self.bins) <= val:
+            self.bins.append(0)
+        self.bins[val] += 1
+
+    def mean(self):
+        acc = 0
+        count = 0
+
+        for i, val in enumerate(self.bins):
+            acc += (i * val)
+            count += val
+
+        if count > 0:
+            return float(acc) / count
+
+    def max(self):
+        return len(self.bins) - 1
+
+
+def memoize(func):
+    if 'TESTING' in os.environ or 'DEBUG' in os.environ:
+        return func
+
+    __cache = {}
+
+    def inner(*args, **kwargs):
+        k = (args, tuple(kwargs.iteritems()))
+        if k not in __cache:
+            __cache[k] = func(*args, **kwargs)
+        return __cache[k]
+
+    inner.__doc__ = '(@memoized %s)\n%s' % (func.__name__, func.__doc__)
+    return inner
+
+
+def quoted_split(s, delim, quote_char='"'):
+    tokens = []
+
+    buf = ""
+    inquote = False
+
+    for c in s:
+        if inquote:
+            buf += c
+            if c == quote_char:
+                inquote = False
+        elif c == delim:
+            tokens.append(buf)
+            buf = ""
+        else:
+            buf += c
+            if c == quote_char:
+                inquote = True
+
+    if buf:
+        tokens.append(buf)
+
+    return tokens
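revcomp() walks the sequence through the `_compliments` table and then reverses the result. A compact equivalent that runs under both Python 2 and 3, shown here only to make the mapping explicit:

    _COMP = dict(zip('ACGTacgtNn', 'TGCAtgcaNn'))

    def revcomp(seq):
        # reverse complement, preserving case, as in the doctest above
        return ''.join(_COMP[base] for base in reversed(seq))

    assert revcomp('ATCGatcg') == 'cgatCGAT'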
diff -r 000000000000 -r 4e4e4093d65d ngsutils/support/__init__.pyc
Binary file ngsutils/support/__init__.pyc has changed
diff -r 000000000000 -r 4e4e4093d65d ngsutils/support/bgzip.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsutils/support/bgzip.py	Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,137 @@
+#!/usr/bin/env python
+'''
+Extract the blocks from a BGZip file.
+
+BAM files are stored as blocks in a bgzip archive. This class
+will load the bgzip archive and output the block information.
+'''
+
+import sys
+import os
+import struct
+
+
+class BGZip(object):
+    def __init__(self, fname):
+        self.fname = fname
+        self.pos = 0
+        self.fileobj = open(self.fname)
+        self.fsize = os.stat(self.fname).st_size
+        self.cur_chunk = 0
+
+        self.cpos = 0
+        self.cdata = 0
+
+    def close(self):
+        self.fileobj.close()
+
+    def seek(self, offset):
+        bgz_offset = offset >> 16
+        chunk_offset = offset & 0xFFFF
+
+        self.fileobj.seek(bgz_offset)
+        self.read_chunk()
+        self.chunk_pos = chunk_offset
+
+    def read(self, amount, whence=0):
+        if whence not in [0, 1]:
+            print "Bad Whence value!: %s" % whence
+            return
+
+        if whence == 0:
+            self.seek(0, 0)
+
+        ### read into chunk, if not enough data in chunk, read next chunk
+        ret = ''
+        while amount and self.pos < self.fsize:
+            if len(self.cdata) - self.cpos < amount:
+                ret += self.cdata[self.cpos:self.cpos + amount]
+                self.cpos += amount
+                return ret
+            else:
+                ret += self.cdata[self.cpos:]
+                amount = amount - len(ret)
+                self.read_chunk()
+        return ret
+
+    def read_chunk(self):
+        self.fileobj.seek(10)
+        id1, id2, cm, flg, mtime, xfl, os, xlen = self._read_fields('<BBBBIBBH')
+        subpos = 0
+        bsize = 0
+
+        while subpos < xlen:
+            si1, si2, slen = self._read_fields('<BBH')
+            if si1 == 66 and si2 == 67:
+                bsize, = self._read_fields('<H')
+            else:
+                self.fileobj.seek(slen, 1)
+                self.pos += slen
+
+            subpos += 6 + slen
+
+        cdata_size = bsize - xlen - 19
+        self.cdata = self.fileobj.read(cdata_size)  # inflate value
+        self.fileobj.seek(8)
+
+        self.cur_chunk += 1
+        self.cpos = 0
+
+    def dump(self):
+        self.fileobj.seek(0)
+        block_num = 0
+
+        while self.pos < self.fsize:
+            print "[BLOCK %s]" % block_num
+            # read header
+            id1, id2, cm, flg, mtime, xfl, os, xlen = self._read_fields('<BBBBIBBH')
+            print 'id1: %s' % id1
+            print 'id2: %s' % id2
+            print 'cm: %s' % cm
+            print 'flg: %s' % flg
+            print 'mtime: %s' % mtime
+            print 'xfl: %s' % xfl
+            print 'os: %s' % os
+            print 'xlen: %s' % xlen
+
+            # read subfields
+            subpos = 0
+            bsize = 0
+
+            while subpos < xlen:
+                si1, si2, slen = self._read_fields('<BBH')
+                print '    si1: %s' % si1
+                print '    si1: %s' % si2
+                print '    slen: %s' % slen
+                print '    data: [%s]' % slen
+
+                if si1 == 66 and si2 == 67:
+                    bsize, = self._read_fields('<H')
+                else:
+                    self.fileobj.seek(slen, 1)
+                    self.pos += slen
+
+                subpos += 6 + slen
+
+            cdata_size = bsize - xlen - 19
+
+            print 'bsize: %s' % bsize
+            print 'cdata: [%s]' % (cdata_size)
+
+            self.fileobj.seek(cdata_size, 1)
+            self.pos += cdata_size
+            crc, isize = self._read_fields('<II')
+
+            print "crc: %s" % crc
+            print "isize: %s" % isize
+            # print "POS: %s" % self.pos
+
+            block_num += 1
+
+    def _read_fields(self, field_types):
+        size = struct.calcsize(field_types)
+        self.pos += size
+        return struct.unpack(field_types, self.fileobj.read(size))
+
+
+if __name__ == '__main__':
+    print BGZip(sys.argv[1]).dump()
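The bit arithmetic in seek() above is the BGZF "virtual offset" convention, which is also why bam_iter() in ngsutils/bam/__init__.py shifts bam.tell() right by 16 bits to estimate file progress: the upper bits locate a compressed block in the file, and the low 16 bits index into that block's uncompressed data. A sketch of the encoding:

    def split_voffset(voffset):
        # -> (file offset of the compressed block,
        #     offset within the uncompressed block data)
        return voffset >> 16, voffset & 0xFFFF

    def make_voffset(block_offset, within_block):
        # low 16 bits hold the intra-block offset
        return (block_offset << 16) | within_block

    voffset = make_voffset(6553600, 42)
    assert split_voffset(voffset) == (6553600, 42)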
diff -r 000000000000 -r 4e4e4093d65d ngsutils/support/dbsnp.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsutils/support/dbsnp.py	Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,141 @@
+'''
+Support package for processing a dbSNP tabix dump from UCSC.
+'''
+
+import pysam
+import collections
+import sys
+from ngsutils.support import revcomp
+
+
+class SNPRecord(collections.namedtuple('SNPRecord', '''bin
+chrom
+chromStart
+chromEnd
+name
+score
+strand
+refNCBI
+refUCSC
+observed
+molType
+clazz
+valid
+avHet
+avHetSE
+func
+locType
+weight
+exceptions
+submitterCount
+submitters
+alleleFreqCount
+alleles
+alleleNs
+alleleFreqs
+bitfields
+''')):
+    __slots__ = ()
+
+    @property
+    def alleles(self):
+        alts = []
+        for alt in self.observed.split('/'):
+            if alt != '-' and self.strand == '-':
+                alt = revcomp(alt)
+
+            alts.append(alt)
+
+        return alts
+
+    @property
+    def snp_length(self):
+        return self.chromEnd - self.chromStart
+
+
+def autotype(ar, length=26):
+    out = []
+    for el in ar:
+        val = None
+        try:
+            val = int(el)
+        except:
+            try:
+                val = float(el)
+            except:
+                val = el
+
+        out.append(val)
+
+    if len(out) < 12:
+        raise TypeError("Invalid dbSNP file! We need at least 12 columns to work with.")
+
+    while len(out) < length:
+        out.append(None)
+    return out
+
+
+class DBSNP(object):
+    def __init__(self, fname):
+        self.dbsnp = pysam.Tabixfile(fname)
+        self.asTup = pysam.asTuple()
+
+    def fetch(self, chrom, pos):
+        'Note: pos is 0-based'
+
+        # Note: tabix the command uses 1-based positions, but
+        # pysam.Tabixfile uses 0-based positions
+
+        for tup in self.dbsnp.fetch(chrom, pos, pos + 1, parser=self.asTup):
+            snp = SNPRecord._make(autotype(tup))
+            if snp.chromStart == pos:
+                yield snp
+
+    def close(self):
+        self.dbsnp.close()
+
+    def dump(self, chrom, op, pos, base, snp, exit=True):
+        print ' ->', op, chrom, pos, base
+        print snp
+        print snp.alleles
+        if exit:
+            sys.exit(1)
+
+    def is_valid_variation(self, chrom, op, pos, seq, verbose=False):
+        for snp in self.fetch(chrom, pos):
+            if not '/' in snp.observed or snp.clazz not in ['single', 'mixed', 'in-del', 'insertion', 'deletion']:
+                # these are odd variations that we can't deal with... (microsatellites, tooLongToDisplay members, etc)
+                continue
+
+            if op == 0:
+                if snp.clazz in ['single', 'mixed'] and seq in snp.alleles:
+                    return True
+                # elif verbose:
+                #     for alt in snp.alleles:
+                #         if len(alt) > 1:
+                #             self.dump(chrom, op, pos, seq, snp, False)
+
+            elif op == 1:
+                if snp.clazz in ['insertion', 'mixed', 'in-del']:
+                    if seq in snp.alleles:
+                        if len(seq) > 1 and verbose:
+                            self.dump(chrom, op, pos, seq, snp, False)
+                        return True
+
+                    if verbose:
+                        if len(seq) > 1:
+                            self.dump(chrom, op, pos, seq, snp, False)
+                        else:
+                            for alt in snp.alleles:
+                                if len(alt) > 1:
+                                    self.dump(chrom, op, pos, seq, snp, False)
+
+            elif op == 2:
+                if snp.clazz in ['deletion', 'mixed', 'in-del']:
+                    if '-' in snp.alleles and seq in snp.alleles:
+                        if len(seq) > 1 and verbose:
+                            self.dump(chrom, op, pos, seq, snp, False)
+                        return True
+
+        return False
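This is the class the -mismatch_dbsnp filter uses to avoid counting known SNPs as mismatches. A usage sketch (the dump file name is a placeholder; `op` follows the CIGAR codes used elsewhere in this changeset, 0 = match/mismatch, 1 = insertion, 2 = deletion, and `pos` is 0-based):

    from ngsutils.support.dbsnp import DBSNP

    db = DBSNP('dbsnp.txt.bgz')  # placeholder: a Tabix-indexed UCSC dbSNP dump

    # A called mismatch at chr1:1014314 (0-based) with observed base 'G':
    if db.is_valid_variation('chr1', 0, 1014314, 'G'):
        print('known SNP - would not be counted as a mismatch')

    db.close()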
diff -r 000000000000 -r 4e4e4093d65d ngsutils/support/dbsnp.pyc
Binary file ngsutils/support/dbsnp.pyc has changed
diff -r 000000000000 -r 4e4e4093d65d ngsutils/support/llh.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsutils/support/llh.py	Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,55 @@
+'''
+Methods for calculating log-likelihoods for nucleotide frequencies
+'''
+import math
+import collections
+from ngsutils.support import memoize
+
+_default_background = {'A': 0.3, 'T': 0.3, 'C': 0.2, 'G': 0.2}
+
+NucleotideLogLikelihood = collections.namedtuple('NucleotideLogLikelihood', 'A C G T pseudo')
+
+
+@memoize
+def pseudo_count(N, bg):
+    '''
+    >>> pseudo_count(100, _default_background['A'])
+    3
+    >>> pseudo_count(100, _default_background['C'])
+    2
+    '''
+
+    return bg * math.sqrt(N)
+
+
+def calc_llh(A, C, G, T, bg=_default_background, pseudo='auto'):
+    if pseudo == 'auto':
+        N = A + C + G + T
+        Ap = float(A) + pseudo_count(N, bg['A'])
+        Cp = float(C) + pseudo_count(N, bg['C'])
+        Gp = float(G) + pseudo_count(N, bg['G'])
+        Tp = float(T) + pseudo_count(N, bg['T'])
+    elif pseudo:
+        Ap = float(A) + pseudo
+        Cp = float(C) + pseudo
+        Gp = float(G) + pseudo
+        Tp = float(T) + pseudo
+    else:
+        Ap = float(A)
+        Cp = float(C)
+        Gp = float(G)
+        Tp = float(T)
+
+    Np = Ap + Cp + Gp + Tp
+
+    freqA = Ap / Np
+    freqC = Cp / Np
+    freqG = Gp / Np
+    freqT = Tp / Np
+
+    return NucleotideLogLikelihood(math.log(freqA / bg['A']), math.log(freqC / bg['C']), math.log(freqG / bg['G']), math.log(freqT / bg['T']), pseudo)
+
+
+if __name__ == '__main__':
+    import doctest
+    doctest.testmod()
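Numerically, the 'auto' pseudocount is bg x sqrt(N), so N = 100 observations with counts A=60, C=20, G=10, T=10 are smoothed to 63, 22, 12, 12 before the log-ratios are taken; because the background frequencies sum to 1, the smoothed total is simply N + sqrt(N). A self-contained check of the same arithmetic:

    import math

    background = {'A': 0.3, 'C': 0.2, 'G': 0.2, 'T': 0.3}
    counts = {'A': 60, 'C': 20, 'G': 10, 'T': 10}

    N = sum(counts.values())
    total = N + math.sqrt(N)  # backgrounds sum to 1.0

    for base in 'ACGT':
        freq = (counts[base] + background[base] * math.sqrt(N)) / total
        llh = math.log(freq / background[base])
        print('%s: freq=%.3f llh=%+.3f' % (base, freq, llh))
    # A comes out enriched (llh ~ +0.65), G and T depleted (negative llh).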
diff -r 000000000000 -r 4e4e4093d65d ngsutils/support/ngs_utils.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsutils/support/ngs_utils.py	Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,225 @@
+#!/usr/bin/env python
+"""
+
+Common util classes / functions for the NGS project
+
+"""
+import sys
+import os
+import gzip
+import re
+import collections
+
+
+def format_number(n):
+    '''
+    >>> format_number(1000)
+    '1,000'
+    >>> format_number(1234567)
+    '1,234,567'
+    '''
+    ar = list(str(n))
+    for i in range(len(ar))[::-3][1:]:
+        ar.insert(i + 1, ',')
+    return ''.join(ar)
+
+
+def natural_sort(ar):
+    '''
+    >>> natural_sort('1 3 4 2 5'.split())
+    ['1', '2', '3', '4', '5']
+    >>> natural_sort('1 10 20 2 3 4'.split())
+    ['1', '2', '3', '4', '10', '20']
+    '''
+    to_sort = []
+    for item in ar:
+        spl = re.split('(\d+)', item)
+        l2 = []
+        for el in spl:
+            try:
+                n = int(el)
+            except:
+                n = el
+            l2.append(n)
+        to_sort.append((l2, item))
+
+    to_sort.sort()
+    return [x[1] for x in to_sort]
+
+
+def dictify(values, colnames):
+    """
+    Convert a list of values into a dictionary based upon given column names.
+
+    If the column name starts with an '@', the value is assumed to be a comma
+    separated list.
+
+    If the name starts with a '#', the value is assumed to be an int.
+
+    If the name starts with '@#', the value is assumed to a comma separated
+    list of ints.
+
+    """
+    d = {}
+    for i in xrange(len(colnames)):
+        key = colnames[i]
+        split = False
+        num = False
+
+        if key[0] == '@':
+            key = key[1:]
+            split = True
+        if key[0] == '#':
+            key = key[1:]
+            num = True
+
+        if i < len(values):
+            if num and split:
+                val = [int(x) for x in values[i].rstrip(',').split(',')]
+            elif num:
+                val = int(values[i])
+            elif split:
+                val = values[i].rstrip(',').split(',')
+            else:
+                val = values[i]
+
+            d[key] = val
+
+        else:
+            d[key] = None
+
+    return d
+
+
+def gzip_aware_open(fname):
+    if fname == '-':
+        f = sys.stdin
+    elif fname[-3:] == '.gz' or fname[-4:] == '.bgz':
+        f = gzip.open(os.path.expanduser(fname))
+    else:
+        f = open(os.path.expanduser(fname))
+    return f
+
+
+class gzip_opener:
+    '''
+    A Python 2.6 class to handle 'with' opening of text files that may
+    or may not be gzip compressed.
+    '''
+    def __init__(self, fname):
+        self.fname = fname
+
+    def __enter__(self):
+        self.f = gzip_aware_open(self.fname)
+        return self.f
+
+    def __exit__(self, type, value, traceback):
+        if self.f != sys.stdin:
+            self.f.close()
+        return False
+
+
+def filenames_to_uniq(names, new_delim='.'):
+    '''
+    Given a set of file names, produce a list of names consisting of the
+    uniq parts of the names. This works from the end of the name. Chunks of
+    the name are split on '.' and '-'.
+
+    For example:
+        A.foo.bar.txt
+        B.foo.bar.txt
+        returns: ['A','B']
+
+        AA.BB.foo.txt
+        CC.foo.txt
+        returns: ['AA.BB','CC']
+
+    >>> filenames_to_uniq('a.foo.bar.txt b.foo.bar.txt'.split())
+    ['a', 'b']
+    >>> filenames_to_uniq('a.b.foo.txt c.foo.txt'.split())
+    ['a.b', 'c']
+
+    '''
+    name_words = []
+    maxlen = 0
+    for name in names:
+        name_words.append(name.replace('.', ' ').replace('-', ' ').strip().split())
+        name_words[-1].reverse()
+        if len(name_words[-1]) > maxlen:
+            maxlen = len(name_words[-1])
+
+    common = [False, ] * maxlen
+    for i in xrange(maxlen):
+        last = None
+        same = True
+        for nameword in name_words:
+            if i >= len(nameword):
+                same = False
+                break
+            if not last:
+                last = nameword[i]
+            elif nameword[i] != last:
+                same = False
+                break
+        common[i] = same
+
+    newnames = []
+    for nameword in name_words:
+        nn = []
+        for (i, val) in enumerate(common):
+            if not val and i < len(nameword):
+                nn.append(nameword[i])
+        nn.reverse()
+        newnames.append(new_delim.join(nn))
+    return newnames
+
+
+def parse_args(argv, defaults=None, expected_argc=0):
+    opts = {}
+    if defaults:
+        opts.update(defaults)
+
+    args = []
+
+    i = 0
+    while i < len(argv):
+        if argv[i][0] == '-':
+            arg = argv[i].lstrip('-')
+            if '=' in arg:
+                k, v = arg.split('=', 2)
+                if k in defaults:
+                    if type(defaults[k]) == float:
+                        opts[k] = float(v)
+                    elif type(defaults[k]) == int:
+                        opts[k] = int(v)
+                    else:
+                        opts[k] = v
+            else:
+                opts[arg] = True
+        else:
+            args.append(argv[i])
+        i += 1
+
+    while len(args) < expected_argc:
+        args.append(None)
+    return opts, args
+
+
+class memoize(object):
+    'Simple memoizing decorator to cache results'
+    def __init__(self, func):
+        self.func = func
+        self.cache = {}
+
+    def __call__(self, *args):
+        if not isinstance(args, collections.Hashable):
+            # uncacheable. a list, for instance.
+            # better to not cache than blow up.
+            return self.func(*args)
+
+        if args in self.cache:
+            return self.cache[args]
+        else:
+            value = self.func(*args)
+            self.cache[args] = value
+            return value
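The '@'/'#' column-name prefixes in dictify are easiest to see in action. This usage sketch assumes the ngsutils package from this changeset is on the Python path (the module itself is Python 2):

    from ngsutils.support.ngs_utils import dictify, natural_sort

    row = dictify(['chr1', '100', '200', 'geneA', '1,2,3,'],
                  ['chrom', '#start', '#end', 'name', '@#counts'])
    # -> {'chrom': 'chr1', 'start': 100, 'end': 200,
    #     'name': 'geneA', 'counts': [1, 2, 3]}

    print(natural_sort(['chr10', 'chr2', 'chr1']))
    # -> ['chr1', 'chr2', 'chr10']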
diff -r 000000000000 -r 4e4e4093d65d ngsutils/support/ngs_utils.pyc
Binary file ngsutils/support/ngs_utils.pyc has changed
diff -r 000000000000 -r 4e4e4093d65d ngsutils/support/regions.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsutils/support/regions.py	Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,166 @@
+class RangeMatch(object):
+    '''
+    Simple genomic ranges. You can define chrom:start-end ranges, then ask if a
+    particular genomic coordinate maps to any of those ranges. This is less-
+    efficient than an R-Tree, but easier to code.
+    '''
+    def __init__(self, name):
+        self.ranges = {}
+        self.name = name
+
+    def add_range(self, chrom, strand, start, end):
+        if not chrom in self.ranges:
+            self.ranges[chrom] = {}
+
+        bin = start / 100000
+        if not bin in self.ranges[chrom]:
+            self.ranges[chrom][bin] = []
+        self.ranges[chrom][bin].insert(0, (start, end, strand))
+
+        if (end / 100000) != bin:
+            for bin in xrange(bin + 1, (end / 100000) + 1):
+                if not bin in self.ranges[chrom]:
+                    self.ranges[chrom][bin] = []
+                self.ranges[chrom][bin].insert(0, (start, end, strand))
+
+    def get_tag(self, chrom, strand, pos, ignore_strand=False):
+        '''
+        returns (region, is_reverse_orientation)
+        '''
+        if not chrom in self.ranges:
+            return None, False
+        bin = pos / 100000
+        if not bin in self.ranges[chrom]:
+            return None, False
+        for start, end, r_strand in self.ranges[chrom][bin]:
+            if pos >= start and pos <= end:
+                if ignore_strand or strand == r_strand:
+                    return self.name, False
+                return self.name, True
+        return None, False
+
+
+class RegionTagger(object):
+    def __init__(self, gtf, valid_chroms=None, only_first_fragment=True):
+        self.regions = []
+        self.counts = {}
+        self.only_first_fragment = only_first_fragment
+
+        coding = RangeMatch('coding')
+        exons = RangeMatch('other-exon')
+        utr_5 = RangeMatch('utr-5')
+        utr_3 = RangeMatch('utr-3')
+        introns = RangeMatch('intron')
+        promoters = RangeMatch('promoter')
+
+        for gene in gtf.genes:
+            if valid_chroms and not gene.chrom in valid_chroms:
+                continue
+            if gene.strand == '+':
+                promoters.add_range(gene.chrom, gene.strand, gene.start - 2000, gene.start)
+            else:
+                promoters.add_range(gene.chrom, gene.strand, gene.end, gene.end + 2000)
+
+            for transcript in gene.transcripts:
+                if transcript.has_cds:
+                    for start, end in transcript.cds:
+                        coding.add_range(gene.chrom, gene.strand, start, end)
+
+                # TODO: Fix this so that it iterates over exons in the 5'/3' UTRS
+                for s, e in transcript.utr_5:
+                    utr_5.add_range(gene.chrom, gene.strand, s, e)
+                for s, e in transcript.utr_3:
+                    utr_3.add_range(gene.chrom, gene.strand, s, e)
+
+                last_end = None
+                for start, end in transcript.exons:
+                    if last_end:
+                        introns.add_range(gene.chrom, gene.strand, last_end, start)
+                    exons.add_range(gene.chrom, gene.strand, start, end)
+                    last_end = end
+
+        self.regions.append(coding)
+        self.regions.append(utr_5)
+        self.regions.append(utr_3)
+        self.regions.append(exons)
+        self.regions.append(introns)
+        self.regions.append(promoters)
+
+        self.counts['coding'] = 0
+        self.counts['coding-rev'] = 0
+        self.counts['other-exon'] = 0
+        self.counts['utr-5'] = 0
+        self.counts['utr-3'] = 0
+        self.counts['utr-5-rev'] = 0
+        self.counts['utr-3-rev'] = 0
+        self.counts['intron'] = 0
+        self.counts['promoter'] = 0
+        self.counts['other-exon-rev'] = 0
+        self.counts['intron-rev'] = 0
+        self.counts['promoter-rev'] = 0
+        self.counts['junction'] = 0
+        self.counts['intergenic'] = 0
+        self.counts['mitochondrial'] = 0
+
+    def add_read(self, read, chrom):
+        if read.is_unmapped:
+            return
+
+        if self.only_first_fragment and read.is_paired and not read.is_read1:
+            return
+
+        tag = None
+        is_rev = False
+
+        strand = '-' if read.is_reverse else '+'
+
+        if chrom == 'chrM':
+            tag = 'mitochondrial'
+
+        if not tag:
+            for op, length in read.cigar:
+                if op == 3:
+                    tag = 'junction'
+                    break
+
+        if not tag:
+            for region in self.regions:
+                tag, is_rev = region.get_tag(chrom, strand, read.pos)
+                if tag:
+                    break
+
+        if not tag:
+            tag = 'intergenic'
+
+        if tag:
+            if is_rev:
+                self.counts['%s-rev' % tag] += 1
+            else:
+                self.counts[tag] += 1
+
+        return tag
+
+    def tag_region(self, chrom, start, end, strand):
+        tag = None
+        is_rev = False
+
+        if chrom == 'chrM' or chrom == 'M':
+            tag = 'mitochondrial'
+
+        if not tag:
+            for region in self.regions:
+                tag, is_rev = region.get_tag(chrom, strand, start)
+                if is_rev:
+                    tag = '%s-rev' % tag
+
+                if start != end:
+                    endtag, is_rev = region.get_tag(chrom, strand, end)
+                    if is_rev:
+                        endtag = '%s-rev' % endtag
+
+                if tag and endtag and endtag != tag:
+                    tag = '%s/%s' % (tag, endtag)
+
+        if not tag:
+            tag = 'intergenic'
\ No newline at end of file
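RangeMatch usage is straightforward; a short sketch (assuming the package is importable; under Python 2 the / 100000 binning above is integer division):

    from ngsutils.support.regions import RangeMatch

    exons = RangeMatch('exon')
    exons.add_range('chr1', '+', 1000, 2000)

    # get_tag returns (name, is_reverse): the region matches on either
    # strand, but opposite-strand hits are flagged as reverse-orientation.
    print(exons.get_tag('chr1', '+', 1500))  # -> ('exon', False)
    print(exons.get_tag('chr1', '-', 1500))  # -> ('exon', True)
    print(exons.get_tag('chr1', '+', 5000))  # -> (None, False)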
diff -r 000000000000 -r 4e4e4093d65d ngsutils/support/stats.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsutils/support/stats.py	Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,161 @@
+'''
+various statistical tests and methods...
+'''
+import math
+from ngsutils.support import memoize
+
+
+def median(vals):
+    '''
+    >>> median([1,2,3])
+    2
+    >>> median([1,2,3,4])
+    2.5
+    '''
+    vals.sort()
+
+    if len(vals) % 2 == 1:
+        return vals[len(vals) / 2]
+    else:
+        a = vals[(len(vals) / 2) - 1]
+        b = vals[(len(vals) / 2)]
+        return float(a + b) / 2
+
+
+def mean_stdev(l):
+    '''
+    >>> mean_stdev([1,2,2,2])
+    (1.75, 0.5)
+    >>> mean_stdev([2,2,2,2])
+    (2.0, 0.0)
+    '''
+
+    acc = 0
+    for el in l:
+        acc += el
+
+    mean = float(acc) / len(l)
+    acc = 0
+    for el in l:
+        acc += (el - mean) ** 2
+
+    if len(l) > 2:
+        stdev = math.sqrt(float(acc) / (len(l) - 1))
+    else:
+        stdev = 0.0
+
+    return (mean, stdev)
+
+
+def counts_median(d):
+    '''
+    Calculate the median from counts stored in a dictionary
+    >>> counts_median({ 1: 4, 2: 1, 3: 4 })
+    2
+    >>> counts_median({ 1: 4, 3: 4 })
+    2
+
+    '''
+    count = 0
+    for k in d:
+        count += d[k]
+
+    if count == 0:
+        return 0
+
+    acc = 0.0
+    last = 0
+    for k in sorted(d):
+        if last:
+            return (last + k) / 2
+        acc += d[k]
+        if acc / count == 0.5:
+            last = k
+        elif acc / count > 0.5:
+            return k
+
+
+def counts_mean_stdev(d):
+    '''
+
+    calc mean / stdev when data is stored as counts in a dictionary
+
+    Ex:
+        { 1: 4, 2: 1, 3: 4 } = [1, 1, 1, 1, 2, 3, 3, 3, 3]
+
+    >>> counts_mean_stdev({ 1: 4, 2: 1, 3: 4 })
+    (2.0, 1.0)
+
+    '''
+
+    acc = 0
+    count = 0
+    for k in d:
+        acc += k * d[k]
+        count += d[k]
+
+    mean = float(acc) / count
+
+    acc = 0
+    for k in d:
+        acc += (((k - mean) ** 2) * d[k])
+
+    if count > 2:
+        stdev = math.sqrt(float(acc) / (count - 1))
+    else:
+        stdev = 0.0
+
+    return (mean, stdev)
+
+
+@memoize
+def poisson_prob(x, mean):
+    '''
+    Return the probability that you could get x counts in
+    a Poisson test with a mean value.
+
+    prob(x) = sum(i=1..x){poisson(i)}
+
+    >>> poisson_prob(6,10)
+    0.1300960209527205
+    >>> poisson_prob(8,10)
+    0.33277427882095645
+    '''
+    acc = 0.0
+    for i in xrange(1, x + 1):
+        acc += poisson_func(i, mean)
+    return acc
+
+
+@memoize
+def poisson_func(mu, lambd):
+    '''
+    This is the Poisson distribution function
+
+    p(mu) = (lambda^mu * e^(-lambda)) / (mu!)
+
+    mu is a count
+    lambd is the mean
+
+    >>> poisson_func(1,10)
+    0.00045399929762484856
+    >>> poisson_func(2,10)
+    0.0022699964881242427
+    >>> poisson_func(3,10)
+    0.007566654960414142
+    '''
+    return (lambd ** mu) * (math.exp(-1 * lambd)) / _factorial(mu)
+
+
+@memoize
+def _factorial(x):
+    '''
+    >>> _factorial(1)
+    1
+    >>> _factorial(2)
+    2
+    >>> _factorial(3)
+    6
+    '''
+    return math.factorial(x)
+
+
+if __name__ == '__main__':
+    import doctest
+    doctest.testmod()
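The counts_* helpers operate on histograms stored as {value: count} dictionaries, so file-wide summaries never materialize the full list of values. For example (assuming the package is importable; the module is Python 2):

    from ngsutils.support.stats import counts_mean_stdev, poisson_prob

    # {1: 4, 2: 1, 3: 4} stands for [1, 1, 1, 1, 2, 3, 3, 3, 3]
    print('%s' % (counts_mean_stdev({1: 4, 2: 1, 3: 4}),))  # -> (2.0, 1.0)

    # Cumulative Poisson probability of 1..x events with mean 10
    # (note the sum starts at i=1, so P(0) is excluded):
    print('%s' % poisson_prob(6, 10))  # -> ~0.13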
diff -r 000000000000 -r 4e4e4093d65d test-data/ngsutils_bam_filter_input1.bam
Binary file test-data/ngsutils_bam_filter_input1.bam has changed

diff -r 000000000000 -r 4e4e4093d65d test-data/ngsutils_bam_filter_result1.bam
Binary file test-data/ngsutils_bam_filter_result1.bam has changed

diff -r 000000000000 -r 4e4e4093d65d test-data/ngsutils_bam_filter_result2.bam
Binary file test-data/ngsutils_bam_filter_result2.bam has changed

diff -r 000000000000 -r 4e4e4093d65d test-data/ngsutils_bam_filter_result3.bam
Binary file test-data/ngsutils_bam_filter_result3.bam has changed

diff -r 000000000000 -r 4e4e4093d65d test-data/ngsutils_bam_filter_result4.bam
Binary file test-data/ngsutils_bam_filter_result4.bam has changed
diff -r 000000000000 -r 4e4e4093d65d tool_dependencies.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,6 @@
+<?xml version="1.0"?>
+<tool_dependency>
+    <package name="pysam" version="0.7.7">
+        <repository changeset_revision="0a5141bdf9d0" name="package_pysam_0_7_7" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" />
+    </package>
+</tool_dependency>