diff utilities/computeFraglen.py @ 0:6e75a84e9338 draft

planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
author thondeboer
date Tue, 15 May 2018 02:39:53 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/utilities/computeFraglen.py	Tue May 15 02:39:53 2018 -0400
@@ -0,0 +1,88 @@
+#
+#
+#      Compute Fragment Length Model for genReads.py
+#                  computeFraglen.py
+#
+#
+#      Usage: samtools view normal.bam | python computeFraglen.py
+#
+#
+
+import sys
+import fileinput
+import cPickle as pickle
+import numpy as np
+
+FILTER_MAPQUAL  = 10	# only consider reads that are mapped with at least this mapping quality
+FILTER_MINREADS = 100	# only consider fragment lengths that have at least this many read pairs supporting it
+FILTER_MEDDEV_M = 10	# only consider fragment lengths this many median deviations above the median
+
+def quick_median(countDict):
+	midPoint = sum(countDict.values())/2
+	mySum    = 0
+	myInd    = 0
+	sk       = sorted(countDict.keys())
+	while mySum < midPoint:
+		mySum += countDict[sk[myInd]]
+		if mySum >= midPoint:
+			break
+		myInd += 1
+	return myInd
+
+def median_deviation_from_median(countDict):
+	myMedian = quick_median(countDict)
+	deviations = {}
+	for k in sorted(countDict.keys()):
+		d = abs(k-myMedian)
+		deviations[d] = countDict[k]
+	return quick_median(deviations)
+
+if len(sys.argv) != 1:
+	print "Usage: samtools view normal.bam | python computeFraglen.py"
+	exit(1)
+
+all_tlens = {}
+PRINT_EVERY = 100000
+BREAK_AFTER = 1000000
+i = 0
+for line in fileinput.input():
+	splt = line.strip().split('\t')
+	samFlag = int(splt[1])
+	myRef   = splt[2]
+	mapQual = int(splt[4])
+	mateRef = splt[6]
+	myTlen  = abs(int(splt[8]))
+
+	if samFlag&1 and samFlag&64 and mapQual > FILTER_MAPQUAL:	# if read is paired, and is first in pair, and is confidently mapped...
+		if mateRef == '=' or mateRef == myRef:					# and mate is mapped to same reference
+			if myTlen not in all_tlens:
+				all_tlens[myTlen] = 0
+			all_tlens[myTlen] += 1
+			i += 1
+			if i%PRINT_EVERY == 0:
+				print '---',i, quick_median(all_tlens), median_deviation_from_median(all_tlens)
+				#for k in sorted(all_tlens.keys()):
+				#	print k, all_tlens[k]
+
+			#if i > BREAK_AFTER:
+			#	break
+
+
+med = quick_median(all_tlens)
+mdm = median_deviation_from_median(all_tlens)
+
+outVals  = []
+outProbs = []
+for k in sorted(all_tlens.keys()):
+	if k > 0 and k < med + FILTER_MEDDEV_M * mdm:
+		if all_tlens[k] >= FILTER_MINREADS:
+			print k, all_tlens[k]
+			outVals.append(k)
+			outProbs.append(all_tlens[k])
+countSum = float(sum(outProbs))
+outProbs = [n/countSum for n in outProbs]
+
+print '\nsaving model...'
+pickle.dump([outVals, outProbs],open('fraglen.p','wb'))
+
+