Mercurial > repos > thondeboer > neat_genreads
comparison utilities/computeGC.py @ 0:6e75a84e9338 draft
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
| author | thondeboer |
|---|---|
| date | Tue, 15 May 2018 02:39:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:6e75a84e9338 |
|---|---|
| 1 # | |
| 2 # | |
| 3 # computeGC.py | |
| 4 # Compute GC and coverage model for genReads.py | |
| 5 # | |
| 6 # Takes output file from bedtools genomecov to generate GC/coverage model | |
| 7 # | |
| 8 # Usage: bedtools genomecov -d -ibam normal.bam -g reference.fa | |
| 9 # python computeGC.py -r reference.fa -i genomecovfile -W [sliding window length] -o path/to/output_name.p | |
| 10 # | |
| 11 # | |
| 12 | |
| 13 | |
| 14 import time | |
| 15 import sys | |
| 16 import argparse | |
| 17 import numpy as np | |
| 18 import cPickle as pickle | |
| 19 | |
| 20 parser = argparse.ArgumentParser(description='computeGC.py') | |
| 21 parser.add_argument('-i', type=str, required=True, metavar='<str>', help="input.genomecov") | |
| 22 parser.add_argument('-r', type=str, required=True, metavar='<str>', help="reference.fa") | |
| 23 parser.add_argument('-w', type=int, required=True, metavar='<int>', help="sliding window length") | |
| 24 parser.add_argument('-o', type=str, required=True, metavar='<str>', help="output.p") | |
| 25 args = parser.parse_args() | |
| 26 | |
| 27 (IN_GCB, REF_FILE, WINDOW_SIZE, OUT_P) = (args.i, args.r, args.w, args.o) | |
| 28 | |
| 29 GC_BINS = {n:[] for n in range(WINDOW_SIZE+1)} | |
| 30 | |
| 31 print 'reading ref...' | |
| 32 allRefs = {} | |
| 33 f = open(REF_FILE,'r') | |
| 34 for line in f: | |
| 35 if line[0] == '>': | |
| 36 refName = line.strip()[1:] | |
| 37 allRefs[refName] = [] | |
| 38 print refName | |
| 39 #if refName == 'chr2': | |
| 40 # break | |
| 41 else: | |
| 42 allRefs[refName].append(line.strip()) | |
| 43 f.close() | |
| 44 | |
| 45 print 'capitalizing ref...' | |
| 46 for k in sorted(allRefs.keys()): | |
| 47 print k | |
| 48 allRefs[k] = ''.join(allRefs[k]) | |
| 49 allRefs[k] = allRefs[k].upper() | |
| 50 | |
| 51 print 'reading genomecov file...' | |
| 52 tt = time.time() | |
| 53 f = open(IN_GCB,'r') | |
| 54 currentLine = 0 | |
| 55 currentRef = None | |
| 56 currentCov = 0 | |
| 57 linesProcessed = 0 | |
| 58 PRINT_EVERY = 1000000 | |
| 59 STOP_AFTER = 1000000 | |
| 60 for line in f: | |
| 61 splt = line.strip().split('\t') | |
| 62 if linesProcessed%PRINT_EVERY == 0: | |
| 63 print linesProcessed | |
| 64 linesProcessed += 1 | |
| 65 | |
| 66 if currentLine == 0: | |
| 67 currentRef = splt[0] | |
| 68 sPos = int(splt[1])-1 | |
| 69 | |
| 70 if currentRef not in allRefs: | |
| 71 continue | |
| 72 | |
| 73 currentLine += 1 | |
| 74 currentCov += int(splt[2]) | |
| 75 | |
| 76 if currentLine == WINDOW_SIZE: | |
| 77 currentLine = 0 | |
| 78 seq = allRefs[currentRef][sPos:sPos+WINDOW_SIZE] | |
| 79 if 'N' not in seq: | |
| 80 gc_count = seq.count('G') + seq.count('C') | |
| 81 GC_BINS[gc_count].append(currentCov) | |
| 82 currentCov = 0 | |
| 83 | |
| 84 #if linesProcessed >= STOP_AFTER: | |
| 85 # break | |
| 86 | |
| 87 f.close() | |
| 88 | |
| 89 runningTot = 0 | |
| 90 allMean = 0.0 | |
| 91 for k in sorted(GC_BINS.keys()): | |
| 92 if len(GC_BINS[k]) == 0: | |
| 93 print '{0:0.2%}'.format(k/float(WINDOW_SIZE)), 0.0, 0 | |
| 94 GC_BINS[k] = 0 | |
| 95 else: | |
| 96 myMean = np.mean(GC_BINS[k]) | |
| 97 myLen = len(GC_BINS[k]) | |
| 98 print '{0:0.2%}'.format(k/float(WINDOW_SIZE)), myMean, myLen | |
| 99 allMean += myMean * myLen | |
| 100 runningTot += myLen | |
| 101 GC_BINS[k] = myMean | |
| 102 | |
| 103 avgCov = allMean/float(runningTot) | |
| 104 print 'AVERAGE COVERAGE =',avgCov | |
| 105 | |
| 106 y_out = [] | |
| 107 for k in sorted(GC_BINS.keys()): | |
| 108 GC_BINS[k] /= avgCov | |
| 109 y_out.append(GC_BINS[k]) | |
| 110 | |
| 111 print 'saving model...' | |
| 112 pickle.dump([range(WINDOW_SIZE+1),y_out],open(OUT_P,'wb')) | |
| 113 | |
| 114 print time.time()-tt,'(sec)' | |
| 115 |
