comparison pairend_distro.py @ 0:796552c157de draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/lumpy-sv commit d06124e8a097f3f665b4955281f40fe811eaee64
author artbio
date Mon, 24 Jul 2017 08:03:17 -0400
parents
children 1ed8619a5611
comparison
equal deleted inserted replaced
-1:000000000000 0:796552c157de
1 #!/usr/bin/env python
2 # (c) 2012 - Ryan M. Layer
3 # Hall Laboratory
4 # Quinlan Laboratory
5 # Department of Computer Science
6 # Department of Biochemistry and Molecular Genetics
7 # Department of Public Health Sciences and Center for Public Health Genomics,
8 # University of Virginia
9 # rl6sf@virginia.edu
10
11 import sys
12 import numpy as np
13 from operator import itemgetter
14 from optparse import OptionParser
15
16 # some constants for sam/bam field ids
17 SAM_FLAG = 1
18 SAM_REFNAME = 2
19 SAM_MATE_REFNAME = 6
20 SAM_ISIZE = 8
21
22 parser = OptionParser()
23
24 parser.add_option("-r",
25 "--read_length",
26 type="int",
27 dest="read_length",
28 help="Read length")
29
30 parser.add_option("-X",
31 dest="X",
32 type="int",
33 help="Number of stdevs from mean to extend")
34
35 parser.add_option("-N",
36 dest="N",
37 type="int",
38 help="Number to sample")
39
40 parser.add_option("-o",
41 dest="output_file",
42 help="Output file")
43
44 parser.add_option("-m",
45 dest="mads",
46 type="int",
47 default=10,
48 help="Outlier cutoff in # of median absolute deviations (unscaled, upper only)")
49
50 def unscaled_upper_mad(xs):
51 """Return a tuple consisting of the median of xs followed by the
52 unscaled median absolute deviation of the values in xs that lie
53 above the median.
54 """
55 med = np.median(xs)
56 return med, np.median(xs[xs > med] - med)
57
58
59 (options, args) = parser.parse_args()
60
61 if not options.read_length:
62 parser.error('Read length not given')
63
64 if not options.X:
65 parser.error('X not given')
66
67 if not options.N:
68 parser.error('N not given')
69
70 if not options.output_file:
71 parser.error('Output file not given')
72
73
74 required = 97
75 restricted = 3484
76 flag_mask = required | restricted
77
78 L = []
79 c = 0
80
81 for l in sys.stdin:
82 if c >= options.N:
83 break
84
85 A = l.rstrip().split('\t')
86 flag = int(A[SAM_FLAG])
87 refname = A[SAM_REFNAME]
88 mate_refname = A[SAM_MATE_REFNAME]
89 isize = int(A[SAM_ISIZE])
90
91 want = mate_refname == "=" and flag & flag_mask == required and isize >= 0
92 if want:
93 c += 1
94 L.append(isize)
95
96 # warn if very few elements in distribution
97 min_elements = 1000
98 if len(L) < min_elements:
99 sys.stderr.write("Warning: only %s elements in distribution (min: %s)\n" % (len(L), min_elements))
100 mean = "NA"
101 stdev = "NA"
102
103 else:
104 # Remove outliers
105 L = np.array(L)
106 L.sort()
107 med, umad = unscaled_upper_mad(L)
108 upper_cutoff = med + options.mads * umad
109 L = L[L < upper_cutoff]
110 new_len = len(L)
111 removed = c - new_len
112 sys.stderr.write("Removed %d outliers with isize >= %d\n" %
113 (removed, upper_cutoff))
114 c = new_len
115
116 mean = np.mean(L)
117 stdev = np.std(L)
118
119 start = options.read_length
120 end = int(mean + options.X*stdev)
121
122 H = [0] * (end - start + 1)
123 s = 0
124
125 for x in L:
126 if (x >= start) and (x <= end):
127 j = int(x - start)
128 H[j] = H[ int(x - start) ] + 1
129 s += 1
130
131 f = open(options.output_file, 'w')
132
133 for i in range(end - start):
134 o = str(i) + "\t" + str(float(H[i])/float(s)) + "\n"
135 f.write(o)
136
137
138 f.close()
139
140 print('mean:' + str(mean) + '\tstdev:' + str(stdev))