Mercurial > repos > drosofff > lumpy
comparison pairend_distro.py @ 0:8b3daa745d9b draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/lumpy commit c0bfc4b2215705e1b5fd1d4e60b1d72e5da13c92
| author | drosofff |
|---|---|
| date | Tue, 06 Dec 2016 05:46:28 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:8b3daa745d9b |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 # (c) 2012 - Ryan M. Layer | |
| 3 # Hall Laboratory | |
| 4 # Quinlan Laboratory | |
| 5 # Department of Computer Science | |
| 6 # Department of Biochemistry and Molecular Genetics | |
| 7 # Department of Public Health Sciences and Center for Public Health Genomics, | |
| 8 # University of Virginia | |
| 9 # rl6sf@virginia.edu | |
| 10 | |
| 11 import sys | |
| 12 import numpy as np | |
| 13 from operator import itemgetter | |
| 14 from optparse import OptionParser | |
| 15 | |
| 16 # some constants for sam/bam field ids | |
| 17 SAM_FLAG = 1 | |
| 18 SAM_REFNAME = 2 | |
| 19 SAM_MATE_REFNAME = 6 | |
| 20 SAM_ISIZE = 8 | |
| 21 | |
| 22 parser = OptionParser() | |
| 23 | |
| 24 parser.add_option("-r", | |
| 25 "--read_length", | |
| 26 type="int", | |
| 27 dest="read_length", | |
| 28 help="Read length") | |
| 29 | |
| 30 parser.add_option("-X", | |
| 31 dest="X", | |
| 32 type="int", | |
| 33 help="Number of stdevs from mean to extend") | |
| 34 | |
| 35 parser.add_option("-N", | |
| 36 dest="N", | |
| 37 type="int", | |
| 38 help="Number to sample") | |
| 39 | |
| 40 parser.add_option("-o", | |
| 41 dest="output_file", | |
| 42 help="Output file") | |
| 43 | |
| 44 parser.add_option("-m", | |
| 45 dest="mads", | |
| 46 type="int", | |
| 47 default=10, | |
| 48 help="Outlier cutoff in # of median absolute deviations (unscaled, upper only)") | |
| 49 | |
| 50 def unscaled_upper_mad(xs): | |
| 51 """Return a tuple consisting of the median of xs followed by the | |
| 52 unscaled median absolute deviation of the values in xs that lie | |
| 53 above the median. | |
| 54 """ | |
| 55 med = np.median(xs) | |
| 56 return med, np.median(xs[xs > med] - med) | |
| 57 | |
| 58 | |
| 59 (options, args) = parser.parse_args() | |
| 60 | |
| 61 if not options.read_length: | |
| 62 parser.error('Read length not given') | |
| 63 | |
| 64 if not options.X: | |
| 65 parser.error('X not given') | |
| 66 | |
| 67 if not options.N: | |
| 68 parser.error('N not given') | |
| 69 | |
| 70 if not options.output_file: | |
| 71 parser.error('Output file not given') | |
| 72 | |
| 73 | |
| 74 required = 97 | |
| 75 restricted = 3484 | |
| 76 flag_mask = required | restricted | |
| 77 | |
| 78 L = [] | |
| 79 c = 0 | |
| 80 | |
| 81 for l in sys.stdin: | |
| 82 if c >= options.N: | |
| 83 break | |
| 84 | |
| 85 A = l.rstrip().split('\t') | |
| 86 flag = int(A[SAM_FLAG]) | |
| 87 refname = A[SAM_REFNAME] | |
| 88 mate_refname = A[SAM_MATE_REFNAME] | |
| 89 isize = int(A[SAM_ISIZE]) | |
| 90 | |
| 91 want = mate_refname == "=" and flag & flag_mask == required and isize >= 0 | |
| 92 if want: | |
| 93 c += 1 | |
| 94 L.append(isize) | |
| 95 | |
| 96 # warn if very few elements in distribution | |
| 97 min_elements = 1000 | |
| 98 if len(L) < min_elements: | |
| 99 sys.stderr.write("Warning: only %s elements in distribution (min: %s)\n" % (len(L), min_elements)) | |
| 100 mean = "NA" | |
| 101 stdev = "NA" | |
| 102 | |
| 103 else: | |
| 104 # Remove outliers | |
| 105 L = np.array(L) | |
| 106 L.sort() | |
| 107 med, umad = unscaled_upper_mad(L) | |
| 108 upper_cutoff = med + options.mads * umad | |
| 109 L = L[L < upper_cutoff] | |
| 110 new_len = len(L) | |
| 111 removed = c - new_len | |
| 112 sys.stderr.write("Removed %d outliers with isize >= %d\n" % | |
| 113 (removed, upper_cutoff)) | |
| 114 c = new_len | |
| 115 | |
| 116 mean = np.mean(L) | |
| 117 stdev = np.std(L) | |
| 118 | |
| 119 start = options.read_length | |
| 120 end = int(mean + options.X*stdev) | |
| 121 | |
| 122 H = [0] * (end - start + 1) | |
| 123 s = 0 | |
| 124 | |
| 125 for x in L: | |
| 126 if (x >= start) and (x <= end): | |
| 127 j = int(x - start) | |
| 128 H[j] = H[ int(x - start) ] + 1 | |
| 129 s += 1 | |
| 130 | |
| 131 f = open(options.output_file, 'w') | |
| 132 | |
| 133 for i in range(end - start): | |
| 134 o = str(i) + "\t" + str(float(H[i])/float(s)) + "\n" | |
| 135 f.write(o) | |
| 136 | |
| 137 | |
| 138 f.close() | |
| 139 | |
| 140 print('mean:' + str(mean) + '\tstdev:' + str(stdev)) |
