comparison utilities/computeFraglen.py @ 0:6e75a84e9338 draft

planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
author thondeboer
date Tue, 15 May 2018 02:39:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:6e75a84e9338
1 #
2 #
3 # Compute Fragment Length Model for genReads.py
4 # computeFraglen.py
5 #
6 #
7 # Usage: samtools view normal.bam | python computeFraglen.py
8 #
9 #
10
11 import sys
12 import fileinput
13 import cPickle as pickle
14 import numpy as np
15
16 FILTER_MAPQUAL = 10 # only consider reads that are mapped with at least this mapping quality
17 FILTER_MINREADS = 100 # only consider fragment lengths that have at least this many read pairs supporting it
18 FILTER_MEDDEV_M = 10 # only consider fragment lengths this many median deviations above the median
19
20 def quick_median(countDict):
21 midPoint = sum(countDict.values())/2
22 mySum = 0
23 myInd = 0
24 sk = sorted(countDict.keys())
25 while mySum < midPoint:
26 mySum += countDict[sk[myInd]]
27 if mySum >= midPoint:
28 break
29 myInd += 1
30 return myInd
31
32 def median_deviation_from_median(countDict):
33 myMedian = quick_median(countDict)
34 deviations = {}
35 for k in sorted(countDict.keys()):
36 d = abs(k-myMedian)
37 deviations[d] = countDict[k]
38 return quick_median(deviations)
39
40 if len(sys.argv) != 1:
41 print "Usage: samtools view normal.bam | python computeFraglen.py"
42 exit(1)
43
44 all_tlens = {}
45 PRINT_EVERY = 100000
46 BREAK_AFTER = 1000000
47 i = 0
48 for line in fileinput.input():
49 splt = line.strip().split('\t')
50 samFlag = int(splt[1])
51 myRef = splt[2]
52 mapQual = int(splt[4])
53 mateRef = splt[6]
54 myTlen = abs(int(splt[8]))
55
56 if samFlag&1 and samFlag&64 and mapQual > FILTER_MAPQUAL: # if read is paired, and is first in pair, and is confidently mapped...
57 if mateRef == '=' or mateRef == myRef: # and mate is mapped to same reference
58 if myTlen not in all_tlens:
59 all_tlens[myTlen] = 0
60 all_tlens[myTlen] += 1
61 i += 1
62 if i%PRINT_EVERY == 0:
63 print '---',i, quick_median(all_tlens), median_deviation_from_median(all_tlens)
64 #for k in sorted(all_tlens.keys()):
65 # print k, all_tlens[k]
66
67 #if i > BREAK_AFTER:
68 # break
69
70
71 med = quick_median(all_tlens)
72 mdm = median_deviation_from_median(all_tlens)
73
74 outVals = []
75 outProbs = []
76 for k in sorted(all_tlens.keys()):
77 if k > 0 and k < med + FILTER_MEDDEV_M * mdm:
78 if all_tlens[k] >= FILTER_MINREADS:
79 print k, all_tlens[k]
80 outVals.append(k)
81 outProbs.append(all_tlens[k])
82 countSum = float(sum(outProbs))
83 outProbs = [n/countSum for n in outProbs]
84
85 print '\nsaving model...'
86 pickle.dump([outVals, outProbs],open('fraglen.p','wb'))
87
88