Mercurial > repos > thondeboer > neat_genreads
comparison utilities/computeFraglen.py @ 0:6e75a84e9338 draft
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
| author | thondeboer |
|---|---|
| date | Tue, 15 May 2018 02:39:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:6e75a84e9338 |
|---|---|
| 1 # | |
| 2 # | |
| 3 # Compute Fragment Length Model for genReads.py | |
| 4 # computeFraglen.py | |
| 5 # | |
| 6 # | |
| 7 # Usage: samtools view normal.bam | python computeFraglen.py | |
| 8 # | |
| 9 # | |
| 10 | |
| 11 import sys | |
| 12 import fileinput | |
| 13 import cPickle as pickle | |
| 14 import numpy as np | |
| 15 | |
| 16 FILTER_MAPQUAL = 10 # only consider reads that are mapped with at least this mapping quality | |
| 17 FILTER_MINREADS = 100 # only consider fragment lengths that have at least this many read pairs supporting it | |
| 18 FILTER_MEDDEV_M = 10 # only consider fragment lengths this many median deviations above the median | |
| 19 | |
| 20 def quick_median(countDict): | |
| 21 midPoint = sum(countDict.values())/2 | |
| 22 mySum = 0 | |
| 23 myInd = 0 | |
| 24 sk = sorted(countDict.keys()) | |
| 25 while mySum < midPoint: | |
| 26 mySum += countDict[sk[myInd]] | |
| 27 if mySum >= midPoint: | |
| 28 break | |
| 29 myInd += 1 | |
| 30 return myInd | |
| 31 | |
| 32 def median_deviation_from_median(countDict): | |
| 33 myMedian = quick_median(countDict) | |
| 34 deviations = {} | |
| 35 for k in sorted(countDict.keys()): | |
| 36 d = abs(k-myMedian) | |
| 37 deviations[d] = countDict[k] | |
| 38 return quick_median(deviations) | |
| 39 | |
| 40 if len(sys.argv) != 1: | |
| 41 print "Usage: samtools view normal.bam | python computeFraglen.py" | |
| 42 exit(1) | |
| 43 | |
| 44 all_tlens = {} | |
| 45 PRINT_EVERY = 100000 | |
| 46 BREAK_AFTER = 1000000 | |
| 47 i = 0 | |
| 48 for line in fileinput.input(): | |
| 49 splt = line.strip().split('\t') | |
| 50 samFlag = int(splt[1]) | |
| 51 myRef = splt[2] | |
| 52 mapQual = int(splt[4]) | |
| 53 mateRef = splt[6] | |
| 54 myTlen = abs(int(splt[8])) | |
| 55 | |
| 56 if samFlag&1 and samFlag&64 and mapQual > FILTER_MAPQUAL: # if read is paired, and is first in pair, and is confidently mapped... | |
| 57 if mateRef == '=' or mateRef == myRef: # and mate is mapped to same reference | |
| 58 if myTlen not in all_tlens: | |
| 59 all_tlens[myTlen] = 0 | |
| 60 all_tlens[myTlen] += 1 | |
| 61 i += 1 | |
| 62 if i%PRINT_EVERY == 0: | |
| 63 print '---',i, quick_median(all_tlens), median_deviation_from_median(all_tlens) | |
| 64 #for k in sorted(all_tlens.keys()): | |
| 65 # print k, all_tlens[k] | |
| 66 | |
| 67 #if i > BREAK_AFTER: | |
| 68 # break | |
| 69 | |
| 70 | |
| 71 med = quick_median(all_tlens) | |
| 72 mdm = median_deviation_from_median(all_tlens) | |
| 73 | |
| 74 outVals = [] | |
| 75 outProbs = [] | |
| 76 for k in sorted(all_tlens.keys()): | |
| 77 if k > 0 and k < med + FILTER_MEDDEV_M * mdm: | |
| 78 if all_tlens[k] >= FILTER_MINREADS: | |
| 79 print k, all_tlens[k] | |
| 80 outVals.append(k) | |
| 81 outProbs.append(all_tlens[k]) | |
| 82 countSum = float(sum(outProbs)) | |
| 83 outProbs = [n/countSum for n in outProbs] | |
| 84 | |
| 85 print '\nsaving model...' | |
| 86 pickle.dump([outVals, outProbs],open('fraglen.p','wb')) | |
| 87 | |
| 88 |
