annotate utilities/computeFraglen.py @ 1:362e0b0f7024 draft

planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
author thondeboer
date Tue, 15 May 2018 15:32:13 -0400
parents 6e75a84e9338
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
1 #
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
2 #
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
3 # Compute Fragment Length Model for genReads.py
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
4 # computeFraglen.py
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
5 #
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
6 #
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
7 # Usage: samtools view normal.bam | python computeFraglen.py
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
8 #
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
9 #
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
10
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
11 import sys
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
12 import fileinput
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
13 import cPickle as pickle
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
14 import numpy as np
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
15
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
16 FILTER_MAPQUAL = 10 # only consider reads that are mapped with at least this mapping quality
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
17 FILTER_MINREADS = 100 # only consider fragment lengths that have at least this many read pairs supporting it
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
18 FILTER_MEDDEV_M = 10 # only consider fragment lengths this many median deviations above the median
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
19
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
20 def quick_median(countDict):
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
21 midPoint = sum(countDict.values())/2
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
22 mySum = 0
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
23 myInd = 0
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
24 sk = sorted(countDict.keys())
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
25 while mySum < midPoint:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
26 mySum += countDict[sk[myInd]]
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
27 if mySum >= midPoint:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
28 break
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
29 myInd += 1
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
30 return myInd
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
31
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
32 def median_deviation_from_median(countDict):
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
33 myMedian = quick_median(countDict)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
34 deviations = {}
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
35 for k in sorted(countDict.keys()):
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
36 d = abs(k-myMedian)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
37 deviations[d] = countDict[k]
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
38 return quick_median(deviations)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
39
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
40 if len(sys.argv) != 1:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
41 print "Usage: samtools view normal.bam | python computeFraglen.py"
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
42 exit(1)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
43
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
44 all_tlens = {}
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
45 PRINT_EVERY = 100000
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
46 BREAK_AFTER = 1000000
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
47 i = 0
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
48 for line in fileinput.input():
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
49 splt = line.strip().split('\t')
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
50 samFlag = int(splt[1])
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
51 myRef = splt[2]
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
52 mapQual = int(splt[4])
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
53 mateRef = splt[6]
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
54 myTlen = abs(int(splt[8]))
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
55
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
56 if samFlag&1 and samFlag&64 and mapQual > FILTER_MAPQUAL: # if read is paired, and is first in pair, and is confidently mapped...
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
57 if mateRef == '=' or mateRef == myRef: # and mate is mapped to same reference
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
58 if myTlen not in all_tlens:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
59 all_tlens[myTlen] = 0
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
60 all_tlens[myTlen] += 1
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
61 i += 1
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
62 if i%PRINT_EVERY == 0:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
63 print '---',i, quick_median(all_tlens), median_deviation_from_median(all_tlens)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
64 #for k in sorted(all_tlens.keys()):
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
65 # print k, all_tlens[k]
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
66
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
67 #if i > BREAK_AFTER:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
68 # break
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
69
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
70
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
71 med = quick_median(all_tlens)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
72 mdm = median_deviation_from_median(all_tlens)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
73
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
74 outVals = []
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
75 outProbs = []
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
76 for k in sorted(all_tlens.keys()):
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
77 if k > 0 and k < med + FILTER_MEDDEV_M * mdm:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
78 if all_tlens[k] >= FILTER_MINREADS:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
79 print k, all_tlens[k]
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
80 outVals.append(k)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
81 outProbs.append(all_tlens[k])
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
82 countSum = float(sum(outProbs))
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
83 outProbs = [n/countSum for n in outProbs]
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
84
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
85 print '\nsaving model...'
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
86 pickle.dump([outVals, outProbs],open('fraglen.p','wb'))
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
87
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
88