neat_genreads / genReads.py @ 10:7d10b55965c9 (draft, default, tip)
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty

author:   thondeboer
date:     Wed, 16 May 2018 17:02:51 -0400
parents:  6e75a84e9338
children: (none)
#!/usr/bin/env python
# encoding: utf-8

"""
////////////////////////////////////////////////////////////////////////////////
///                                                                          ///
///   genReads.py                                                            ///
///   VERSION 2.0: HARDER, BETTER, FASTER, STRONGER!                         ///
///                                                                          ///
///   Variant and read simulator for benchmarking NGS workflows              ///
///                                                                          ///
///   Written by: Zach Stephens                                              ///
///   For:        DEPEND Research Group, UIUC                                ///
///   Date:       May 29, 2015                                               ///
///   Contact:    zstephe2@illinois.edu                                      ///
///                                                                          ///
////////////////////////////////////////////////////////////////////////////////
"""

import os
import sys
import copy
import random
import re
import time
import bisect
import cPickle as pickle
import numpy as np
#import matplotlib.pyplot as mpl
import argparse

# absolute path to this script
SIM_PATH = '/'.join(os.path.realpath(__file__).split('/')[:-1])
sys.path.append(SIM_PATH+'/py/')

from inputChecking import requiredField, checkFileOpen, checkDir, isInRange
from refFunc import indexRef, readRef, getAllRefRegions, partitionRefRegions, ALLOWED_NUCL
from vcfFunc import parseVCF
from OutputFileWriter import OutputFileWriter
from probability import DiscreteDistribution, mean_ind_of_weighted_list
from SequenceContainer import SequenceContainer, ReadContainer, parseInputMutationModel

# if coverage val for a given window/position is below this value, consider it effectively zero.
LOW_COV_THRESH = 50

"""//////////////////////////////////////////////////
////////////    PARSE INPUT ARGUMENTS    ////////////
//////////////////////////////////////////////////"""

parser = argparse.ArgumentParser(description='NEAT-genReads V2.0')
parser.add_argument('-r',  type=str,   required=True,  metavar='<str>',                     help="* ref.fa")
parser.add_argument('-R',  type=int,   required=True,  metavar='<int>',                     help="* read length")
parser.add_argument('-o',  type=str,   required=True,  metavar='<str>',                     help="* output prefix")
parser.add_argument('-c',  type=float, required=False, metavar='<float>', default=10.,      help="average coverage")
parser.add_argument('-e',  type=str,   required=False, metavar='<str>',   default=None,     help="sequencing error model")
parser.add_argument('-E',  type=float, required=False, metavar='<float>', default=-1,       help="rescale avg sequencing error rate to this")
parser.add_argument('-p',  type=int,   required=False, metavar='<int>',   default=2,        help="ploidy")
parser.add_argument('-t',  type=str,   required=False, metavar='<str>',   default=None,     help="bed file containing targeted regions")
parser.add_argument('-to', type=float, required=False, metavar='<float>', default=0.00,     help="off-target coverage scalar")
parser.add_argument('-m',  type=str,   required=False, metavar='<str>',   default=None,     help="mutation model pickle file")
parser.add_argument('-M',  type=float, required=False, metavar='<float>', default=-1,       help="rescale avg mutation rate to this")
parser.add_argument('-Mb', type=str,   required=False, metavar='<str>',   default=None,     help="bed file containing positional mut rates")
parser.add_argument('-N',  type=int,   required=False, metavar='<int>',   default=-1,       help="below this qual, replace base-calls with 'N's")
#parser.add_argument('-s', type=str,   required=False, metavar='<str>',   default=None,     help="input sample model")
parser.add_argument('-v',  type=str,   required=False, metavar='<str>',   default=None,     help="input VCF file")
parser.add_argument('--pe',       nargs=2, type=int, required=False, metavar=('<int>','<int>'), default=(None,None), help='paired-end fragment length mean and std')
parser.add_argument('--pe-model', type=str,   required=False, metavar='<str>',   default=None,  help='empirical fragment length distribution')
#parser.add_argument('--cancer',  required=False, action='store_true', default=False,           help='produce tumor/normal datasets')
#parser.add_argument('-cm',       type=str,   required=False, metavar='<str>',   default=None,  help="cancer mutation model directory")
#parser.add_argument('-cp',       type=float, required=False, metavar='<float>', default=0.8,   help="tumor sample purity")
parser.add_argument('--gc-model', type=str,   required=False, metavar='<str>',   default=None,  help='empirical GC coverage bias distribution')
parser.add_argument('--job',      nargs=2, type=int, required=False, metavar=('<int>','<int>'), default=(0,0), help='job IDs for generating reads in parallel')
parser.add_argument('--nnr',      required=False, action='store_true', default=False,           help='save non-N ref regions (for parallel jobs)')
parser.add_argument('--bam',      required=False, action='store_true', default=False,           help='output golden BAM file')
parser.add_argument('--vcf',      required=False, action='store_true', default=False,           help='output golden VCF file')
parser.add_argument('--rng',      type=int, required=False, metavar='<int>', default=-1,        help='rng seed value; identical RNG value should produce identical runs of the program, so things like read locations, variant positions, error positions, etc. should all be the same.')
parser.add_argument('--gz',       required=False, action='store_true', default=False,           help='gzip output FQ and VCF')
parser.add_argument('--no-fastq', required=False, action='store_true', default=False,           help='bypass fastq generation')
args = parser.parse_args()

# required args
(REFERENCE, READLEN, OUT_PREFIX) = (args.r, args.R, args.o)
# various dataset parameters
(COVERAGE, PLOIDS, INPUT_BED, SE_MODEL, SE_RATE, MUT_MODEL, MUT_RATE, MUT_BED, INPUT_VCF) = (args.c, args.p, args.t, args.e, args.E, args.m, args.M, args.Mb, args.v)
# cancer params (disabled currently)
#(CANCER, CANCER_MODEL, CANCER_PURITY) = (args.cancer, args.cm, args.cp)
(CANCER, CANCER_MODEL, CANCER_PURITY) = (False, None, 0.8)
(OFFTARGET_SCALAR) = (args.to)
# important flags
(SAVE_BAM, SAVE_VCF, GZIPPED_OUT, NO_FASTQ) = (args.bam, args.vcf, args.gz, args.no_fastq)

ONLY_VCF = (NO_FASTQ and SAVE_VCF and not(SAVE_BAM))
if ONLY_VCF:
    print 'Only producing VCF output, that should speed things up a bit...'

# sequencing model parameters
(FRAGMENT_SIZE, FRAGMENT_STD) = args.pe
FRAGLEN_MODEL = args.pe_model
GC_BIAS_MODEL = args.gc_model
N_MAX_QUAL    = args.N

# if user specified no fastq, no bam, no vcf, then inform them of their wasteful ways and exit
if NO_FASTQ == True and SAVE_BAM == False and SAVE_VCF == False:
    print '\nError: No files will be written when --no-fastq is specified without --vcf or --bam.'
    exit(1)
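# a hedged usage sketch (hypothetical file names; any optional argument above
# may be omitted): simulate 20x coverage of 101bp paired-end reads, with
# golden BAM and golden VCF output, from a reference fasta:
#
#     python genReads.py -r hg19.fa -R 101 -o mySim -c 20 --pe 300 30 --bam --vcf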
# if user specified mean/std, use artificial fragment length distribution, otherwise use
# the empirical model specified. If neither, then we're doing single-end reads.
PAIRED_END = False
PAIRED_END_ARTIFICIAL = False
if FRAGMENT_SIZE != None and FRAGMENT_STD != None:
    PAIRED_END = True
    PAIRED_END_ARTIFICIAL = True
elif FRAGLEN_MODEL != None:
    PAIRED_END = True
    PAIRED_END_ARTIFICIAL = False

(MYJOB, NJOBS) = args.job
if MYJOB == 0:
    MYJOB = 1
    NJOBS = 1
SAVE_NON_N = args.nnr

RNG_SEED = args.rng
if RNG_SEED == -1:
    RNG_SEED = random.randint(1,99999999)
random.seed(RNG_SEED)


"""************************************************
****          INPUT ERROR CHECKING
************************************************"""

checkFileOpen(REFERENCE,'ERROR: could not open reference',required=True)
checkFileOpen(INPUT_VCF,'ERROR: could not open input VCF',required=False)
checkFileOpen(INPUT_BED,'ERROR: could not open input BED',required=False)
requiredField(OUT_PREFIX,'ERROR: no output prefix provided')
if (FRAGMENT_SIZE == None and FRAGMENT_STD != None) or (FRAGMENT_SIZE != None and FRAGMENT_STD == None):
    print '\nError: --pe argument takes 2 space-separated arguments.\n'
    exit(1)


"""************************************************
****           LOAD INPUT MODELS
************************************************"""

#    mutation models
#
MUT_MODEL = parseInputMutationModel(MUT_MODEL,1)
if CANCER:
    CANCER_MODEL = parseInputMutationModel(CANCER_MODEL,2)
if MUT_RATE < 0.:
    MUT_RATE = None

#    sequencing error model
#
if SE_RATE < 0.:
    SE_RATE = None
if SE_MODEL == None:
    print 'Using default sequencing error model.'
    SE_MODEL = SIM_PATH+'/models/errorModel_toy.p'
    SE_CLASS = ReadContainer(READLEN,SE_MODEL,SE_RATE)
else:
    # probably need to do some sanity checking
    SE_CLASS = ReadContainer(READLEN,SE_MODEL,SE_RATE)

#    GC-bias model
#
if GC_BIAS_MODEL == None:
    print 'Using default gc-bias model.'
    GC_BIAS_MODEL = SIM_PATH+'/models/gcBias_toy.p'
    [GC_SCALE_COUNT, GC_SCALE_VAL] = pickle.load(open(GC_BIAS_MODEL,'rb'))
    GC_WINDOW_SIZE = GC_SCALE_COUNT[-1]
else:
    [GC_SCALE_COUNT, GC_SCALE_VAL] = pickle.load(open(GC_BIAS_MODEL,'rb'))
    GC_WINDOW_SIZE = GC_SCALE_COUNT[-1]

#    fragment length distribution
#
if PAIRED_END and not(PAIRED_END_ARTIFICIAL):
    print 'Using empirical fragment length distribution.'
    [potential_vals, potential_prob] = pickle.load(open(FRAGLEN_MODEL,'rb'))
    FRAGLEN_VALS = []
    FRAGLEN_PROB = []
    for i in xrange(len(potential_vals)):
        if potential_vals[i] > READLEN:
            FRAGLEN_VALS.append(potential_vals[i])
            FRAGLEN_PROB.append(potential_prob[i])

    # should probably add some validation and sanity-checking code here...
    FRAGLEN_DISTRIBUTION = DiscreteDistribution(FRAGLEN_PROB,FRAGLEN_VALS)
    FRAGMENT_SIZE = FRAGLEN_VALS[mean_ind_of_weighted_list(FRAGLEN_PROB)]

#    Indicate not writing FASTQ reads
#
if NO_FASTQ:
    print 'Bypassing FASTQ generation...'
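# note on model file formats (inferred from the pickle.load() calls above, so
# treat this as an assumption rather than a documented spec): a fragment-length
# model is a two-element list [values, weights], and a GC-bias model is
# [GC_SCALE_COUNT, GC_SCALE_VAL] with GC_SCALE_COUNT[-1] taken as the GC window
# size. a compatible fraglen model could be written like so (hypothetical numbers):
#
#     frag_lens  = [250, 300, 350]      # observed insert sizes
#     frag_probs = [0.25, 0.50, 0.25]   # their relative frequencies
#     pickle.dump([frag_lens, frag_probs], open('fraglen_custom.p','wb'))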
"""************************************************
****          HARD-CODED CONSTANTS
************************************************"""

# target window size for read sampling. how many times bigger than read/frag length
WINDOW_TARGET_SCALE = 100
# sub-window size for read sampling windows. this is basically the finest resolution
# that can be obtained for targeted region boundaries and GC% bias
SMALL_WINDOW        = 20
# is the mutation model constant throughout the simulation? If so we can save a lot of time
CONSTANT_MUT_MODEL  = True


"""************************************************
****             DEFAULT MODELS
************************************************"""

# fragment length distribution: normal distribution that goes out to +- 6 standard deviations
if PAIRED_END and PAIRED_END_ARTIFICIAL:
    print 'Using artificial fragment length distribution. mean='+str(FRAGMENT_SIZE)+', std='+str(FRAGMENT_STD)
    if FRAGMENT_STD == 0:
        FRAGLEN_DISTRIBUTION = DiscreteDistribution([1],[FRAGMENT_SIZE],degenerateVal=FRAGMENT_SIZE)
    else:
        potential_vals = range(FRAGMENT_SIZE-6*FRAGMENT_STD,FRAGMENT_SIZE+6*FRAGMENT_STD+1)
        FRAGLEN_VALS   = []
        for i in xrange(len(potential_vals)):
            if potential_vals[i] > READLEN:
                FRAGLEN_VALS.append(potential_vals[i])
        FRAGLEN_PROB = [np.exp(-(((n-float(FRAGMENT_SIZE))**2)/(2*(FRAGMENT_STD**2)))) for n in FRAGLEN_VALS]
        FRAGLEN_DISTRIBUTION = DiscreteDistribution(FRAGLEN_PROB,FRAGLEN_VALS)
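# worked example of the model above: --pe 300 30 yields candidate lengths
# 120..480 (mean +- 6 std, values <= READLEN discarded), weighted by
# exp(-(n-300)^2/(2*30^2)); n=300 gets weight 1.0, n=330 gets exp(-0.5) ~ 0.61,
# and n=480 gets exp(-18) ~ 1.5e-8. DiscreteDistribution is assumed to
# normalize these relative weights into a proper probability distribution.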
"""************************************************
****        MORE INPUT ERROR CHECKING
************************************************"""

isInRange(READLEN,          10,1000000, 'Error: -R must be between 10 and 1,000,000')
isInRange(COVERAGE,          0,1000000, 'Error: -c must be between 0 and 1,000,000')
isInRange(PLOIDS,            1,100,     'Error: -p must be between 1 and 100')
isInRange(OFFTARGET_SCALAR,  0,1,       'Error: -to must be between 0 and 1')
if MUT_RATE != -1 and MUT_RATE != None:
    isInRange(MUT_RATE,      0,0.3,     'Error: -M must be between 0 and 0.3')
if SE_RATE != -1 and SE_RATE != None:
    isInRange(SE_RATE,       0,0.3,     'Error: -E must be between 0 and 0.3')
if NJOBS != 1:
    isInRange(NJOBS,         1,1000,    'Error: --job must be between 1 and 1,000')
    isInRange(MYJOB,         1,1000,    'Error: --job must be between 1 and 1,000')
    isInRange(MYJOB,         1,NJOBS,   'Error: job id must be less than or equal to number of jobs')
if N_MAX_QUAL != -1:
    isInRange(N_MAX_QUAL,    1,40,      'Error: -N must be between 1 and 40')


"""************************************************
****                 MAIN()
************************************************"""

def main():

    # index reference
    refIndex = indexRef(REFERENCE)
    if PAIRED_END:
        N_HANDLING = ('random',FRAGMENT_SIZE)
    else:
        N_HANDLING = ('ignore',READLEN)
    indices_by_refName = {refIndex[n][0]:n for n in xrange(len(refIndex))}

    # parse input variants, if present
    inputVariants = []
    if INPUT_VCF != None:
        if CANCER:
            (sampNames, inputVariants) = parseVCF(INPUT_VCF,tumorNormal=True,ploidy=PLOIDS)
            tumorInd  = sampNames.index('TUMOR')
            normalInd = sampNames.index('NORMAL')
        else:
            (sampNames, inputVariants) = parseVCF(INPUT_VCF,ploidy=PLOIDS)
        for k in sorted(inputVariants.keys()):
            inputVariants[k].sort()

    # parse input targeted regions, if present
    inputRegions = {}
    refList      = [n[0] for n in refIndex]
    if INPUT_BED != None:
        with open(INPUT_BED,'r') as f:
            for line in f:
                [myChr,pos1,pos2] = line.strip().split('\t')[:3]
                if myChr not in inputRegions:
                    inputRegions[myChr] = [-1]
                inputRegions[myChr].extend([int(pos1),int(pos2)])
        # some validation
        nInBedOnly = 0
        nInRefOnly = 0
        for k in refList:
            if k not in inputRegions:
                nInRefOnly += 1
        for k in inputRegions.keys():
            if not k in refList:
                nInBedOnly += 1
                del inputRegions[k]
        if nInRefOnly > 0:
            print 'Warning: Reference contains sequences not found in targeted regions BED file.'
        if nInBedOnly > 0:
            print 'Warning: Targeted regions BED file contains sequence names not found in reference (regions ignored).'
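    # inputRegions encodes each chromosome's targeted intervals as a flat sorted
    # boundary list with a -1 sentinel, so interval membership reduces to a parity
    # check via bisect. e.g. BED lines (chr1,1000,2000) and (chr1,5000,6000) give
    # [-1, 1000, 2000, 5000, 6000]:
    #     bisect.bisect(boundaries, 1500) == 2   (even --> on-target)
    #     bisect.bisect(boundaries, 3000) == 3   (odd  --> off-target)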
    # parse input mutation rate rescaling regions, if present
    mutRateRegions = {}
    mutRateValues  = {}
    if MUT_BED != None:
        with open(MUT_BED,'r') as f:
            for line in f:
                [myChr,pos1,pos2,metaData] = line.strip().split('\t')[:4]
                mutStr = re.findall(r"MUT_RATE=.*?(?=;)",metaData+';')
                (pos1,pos2) = (int(pos1),int(pos2))
                if len(mutStr) and (pos2-pos1) > 1:
                    # mutRate = #_mutations / length_of_region, let's bound it by a reasonable amount
                    mutRate = max([0.0,min([float(mutStr[0][9:]),0.3])])
                    if myChr not in mutRateRegions:
                        mutRateRegions[myChr] = [-1]
                        mutRateValues[myChr]  = [0.0]
                    mutRateRegions[myChr].extend([pos1,pos2])
                    mutRateValues[myChr].extend([mutRate*(pos2-pos1)]*2)

    # initialize output files (part I)
    bamHeader = None
    if SAVE_BAM:
        bamHeader = [copy.deepcopy(refIndex)]
    vcfHeader = None
    if SAVE_VCF:
        vcfHeader = [REFERENCE]

    # if processing jobs in parallel, precompute the independent regions that can be processed separately
    if NJOBS > 1:
        parallelRegionList  = getAllRefRegions(REFERENCE,refIndex,N_HANDLING,saveOutput=SAVE_NON_N)
        (myRefs, myRegions) = partitionRefRegions(parallelRegionList,refIndex,MYJOB,NJOBS)
        if not len(myRegions):
            print 'This job id has no regions to process, exiting...'
            exit(1)
        for i in xrange(len(refIndex)-1,-1,-1):    # delete references not used in our job
            if not refIndex[i][0] in myRefs:
                del refIndex[i]
        # if value of NJOBS is too high, let's change it to the maximum possible, to avoid output filename confusion
        corrected_nJobs = min([NJOBS,sum([len(n) for n in parallelRegionList.values()])])
    else:
        corrected_nJobs = 1
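    # e.g. a large simulation can be split across 4 independent processes by
    # launching this script 4 times with --job 1 4, --job 2 4, etc. (hypothetical
    # invocation); each job writes output with a job-specific suffix via the
    # jobTuple below, and the per-job outputs are merged in a separate step.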
    # initialize output files (part II)
    if CANCER:
        OFW = OutputFileWriter(OUT_PREFIX+'_normal',paired=PAIRED_END,BAM_header=bamHeader,VCF_header=vcfHeader,gzipped=GZIPPED_OUT,noFASTQ=NO_FASTQ)
        OFW_CANCER = OutputFileWriter(OUT_PREFIX+'_tumor',paired=PAIRED_END,BAM_header=bamHeader,VCF_header=vcfHeader,gzipped=GZIPPED_OUT,jobTuple=(MYJOB,corrected_nJobs),noFASTQ=NO_FASTQ)
    else:
        OFW = OutputFileWriter(OUT_PREFIX,paired=PAIRED_END,BAM_header=bamHeader,VCF_header=vcfHeader,gzipped=GZIPPED_OUT,jobTuple=(MYJOB,corrected_nJobs),noFASTQ=NO_FASTQ)
    OUT_PREFIX_NAME = OUT_PREFIX.split('/')[-1]


    """************************************************
    ****        LET'S GET THIS PARTY STARTED...
    ************************************************"""

    readNameCount    = 1    # keep track of the number of reads we've sampled, for read-names
    unmapped_records = []

    for RI in xrange(len(refIndex)):

        # read in reference sequence and notate blocks of Ns
        (refSequence,N_regions) = readRef(REFERENCE,refIndex[RI],N_HANDLING)

        # if we're processing jobs in parallel only take the regions relevant for the current job
        if NJOBS > 1:
            for i in xrange(len(N_regions['non_N'])-1,-1,-1):
                if not (refIndex[RI][0],N_regions['non_N'][i][0],N_regions['non_N'][i][1]) in myRegions:
                    del N_regions['non_N'][i]

        # count total bp we'll be spanning so we can get an idea of how far along we are (for printing progress indicators)
        total_bp_span   = sum([n[1]-n[0] for n in N_regions['non_N']])
        currentProgress = 0
        currentPercent  = 0

        # prune invalid input variants, e.g. variants that:
        #    - try to delete or alter any N characters
        #    - don't match the reference base at their specified position
        #    - have an alt allele containing anything other than allowed characters
        validVariants = []
        nSkipped = [0,0,0]
        if refIndex[RI][0] in inputVariants:
            for n in inputVariants[refIndex[RI][0]]:
                span = (n[0],n[0]+len(n[1]))
                rseq = str(refSequence[span[0]-1:span[1]-1])    # -1 because going from VCF coords to array coords
                anyBadChr = any((nn not in ALLOWED_NUCL) for nn in [item for sublist in n[2] for item in sublist])
                if rseq != n[1]:
                    nSkipped[0] += 1
                    continue
                elif 'N' in rseq:
                    nSkipped[1] += 1
                    continue
                elif anyBadChr:
                    nSkipped[2] += 1
                    continue
                #if bisect.bisect(N_regions['big'],span[0])%2 or bisect.bisect(N_regions['big'],span[1])%2:
                #    continue
                validVariants.append(n)
            print 'found',len(validVariants),'valid variants for '+refIndex[RI][0]+' in input VCF...'
            if any(nSkipped):
                print sum(nSkipped),'variants skipped...'
                print ' - ['+str(nSkipped[0])+'] ref allele does not match reference'
                print ' - ['+str(nSkipped[1])+'] attempting to insert into N-region'
                print ' - ['+str(nSkipped[2])+'] alt allele contains non-ACGT characters'

        # add large random structural variants
        #
        #    TBD!!!

        # determine sampling windows based on read length, large N regions, and structural mutations.
        # in order to obtain uniform coverage, windows should overlap by:
        #    - READLEN, if single-end reads
        #    - FRAGMENT_SIZE (mean), if paired-end reads
        # ploidy is fixed per large sampling window,
        # coverage distributions due to GC% and targeted regions are specified within these windows
        samplingWindows  = []
        ALL_VARIANTS_OUT = {}
        sequences        = None
        if PAIRED_END:
            targSize = WINDOW_TARGET_SCALE*FRAGMENT_SIZE
            overlap  = FRAGMENT_SIZE
            overlap_minWindowSize = max(FRAGLEN_DISTRIBUTION.values) + 10
        else:
            targSize = WINDOW_TARGET_SCALE*READLEN
            overlap  = READLEN
            overlap_minWindowSize = READLEN + 10
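        # e.g. with WINDOW_TARGET_SCALE = 100: paired-end reads with mean fragment
        # length 300 give ~30,000bp sampling windows overlapping by 300bp, while
        # 100bp single-end reads give ~10,000bp windows overlapping by 100bp.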
        print '--------------------------------'
        if ONLY_VCF:
            print 'generating vcf...'
        else:
            print 'sampling reads...'
        tt = time.time()
        for i in xrange(len(N_regions['non_N'])):
            (pi,pf) = N_regions['non_N'][i]
            nTargWindows = max([1,(pf-pi)/targSize])
            bpd = int((pf-pi)/float(nTargWindows))
            #bpd += GC_WINDOW_SIZE - bpd%GC_WINDOW_SIZE

            #print len(refSequence), (pi,pf), nTargWindows
            #print structuralVars

            # if for some reason our region is too small to process, skip it! (sorry)
            if nTargWindows == 1 and (pf-pi) < overlap_minWindowSize:
                #print 'Does this ever happen?'
                continue

            start = pi
            end   = min([start+bpd,pf])
            #print '------------------RAWR:', (pi,pf), nTargWindows, bpd

            varsFromPrevOverlap       = []
            varsCancerFromPrevOverlap = []
            vindFromPrev   = 0
            isLastTime     = False
            havePrinted100 = False

            while True:

                # which inserted variants are in this window?
                varsInWindow = []
                updated      = False
                for j in xrange(vindFromPrev,len(validVariants)):
                    vPos = validVariants[j][0]
                    if vPos > start and vPos < end:    # update: changed >= to >, so variant cannot be inserted in first position
                        varsInWindow.append(tuple([vPos-1]+list(validVariants[j][1:])))    # vcf --> array coords
                    if vPos >= end-overlap-1 and updated == False:
                        updated = True
                        vindFromPrev = j
                    if vPos >= end:
                        break

                # determine which structural variants will affect our sampling window positions
                structuralVars = []
                for n in varsInWindow:
                    bufferNeeded = max([max([abs(len(n[1])-len(alt_allele)),1]) for alt_allele in n[2]])    # change: added abs() so that insertions are also buffered.
                    structuralVars.append((n[0]-1,bufferNeeded))    # -1 because going from VCF coords to array coords

                # adjust end-position of window based on inserted structural mutations
                buffer_added = 0
                keepGoing = True
                while keepGoing:
                    keepGoing = False
                    for n in structuralVars:
                        # adding "overlap" here to prevent SVs from being introduced in overlap regions
                        # (which can cause problems if random mutations from the previous window land on top of them)
                        delta = (end-1) - (n[0] + n[1]) - 2 - overlap
                        if delta < 0:
                            #print 'DELTA:', delta, 'END:', end, '-->',
                            buffer_added = -delta
                            end += buffer_added
                            ####print end
                            keepGoing = True
                            break

                next_start = end-overlap
                next_end   = min([next_start+bpd,pf])
                if next_end-next_start < bpd:
                    end = next_end
                    isLastTime = True

                # print progress indicator
                ####print 'PROCESSING WINDOW:',(start,end), [buffer_added], 'next:', (next_start,next_end)
                currentProgress += end-start
                newPercent = int((currentProgress*100)/float(total_bp_span))
                if newPercent > currentPercent:
                    if newPercent <= 99 or (newPercent == 100 and not havePrinted100):
                        sys.stdout.write(str(newPercent)+'% ')
                        sys.stdout.flush()
                    currentPercent = newPercent
                    if currentPercent == 100:
                        havePrinted100 = True

                skip_this_window = False

                # compute coverage modifiers
                coverage_avg = None
                coverage_dat = [GC_WINDOW_SIZE,GC_SCALE_VAL,[]]
                if INPUT_BED == None:
                    coverage_dat[2] = [1.0]*(end-start)
                else:
                    if refIndex[RI][0] not in inputRegions:
                        coverage_dat[2] = [OFFTARGET_SCALAR]*(end-start)
                    else:
                        for j in xrange(start,end):
                            if not(bisect.bisect(inputRegions[refIndex[RI][0]],j)%2):
                                coverage_dat[2].append(1.0)
                            else:
                                coverage_dat[2].append(OFFTARGET_SCALAR)
                #print len(coverage_dat[2]), sum(coverage_dat[2])
                if sum(coverage_dat[2]) < LOW_COV_THRESH:
                    coverage_avg = 0.0
                    skip_this_window = True

                # check for small window sizes
                if (end-start) < overlap_minWindowSize:
                    skip_this_window = True
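                # coverage_dat is [GC window size, GC scale values, per-base target scalars];
                # e.g. with -to 0.1, a window half inside a targeted region gets per-base
                # scalars of 1.0 on-target and 0.1 off-target. init_coverage() (below)
                # combines these scalars with the GC-bias model into coverage_avg.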
                if skip_this_window:
                    # skip window, save cpu time
                    start = next_start
                    end   = next_end
                    if isLastTime:
                        break
                    if end >= pf:
                        isLastTime = True
                    continue

                #### skip windows if we can
                ####skip_this_window = False
                ####if INPUT_BED != None and OFFTARGET_SCALAR < 1.0e-12:
                ####    if refIndex[RI][0] in inputRegions:
                ####        skip_this_window = True
                ####    else:
                ####        skip_this_window = True
                ####if skip_this_window:
                ####    # prepare indices of next window
                ####    start = next_start
                ####    end   = next_end
                ####    if isLastTime:
                ####        break
                ####    if end >= pf:
                ####        isLastTime = True
                ####    continue

                #### if computing only VCF, we can skip this...
                ####if ONLY_VCF:
                ####    coverage_dat = None
                ####    coverage_avg = None
                ####else:
                ####    # pre-compute gc-bias and targeted sequencing coverage modifiers
                ####    nSubWindows  = (end-start)/GC_WINDOW_SIZE
                ####    coverage_dat = (GC_WINDOW_SIZE,[])
                ####    for j in xrange(nSubWindows):
                ####        rInd = start + j*GC_WINDOW_SIZE
                ####        if INPUT_BED == None:
                ####            tCov = True
                ####        elif refIndex[RI][0] in inputRegions:
                ####            tCov = not(bisect.bisect(inputRegions[refIndex[RI][0]],rInd)%2) or not(bisect.bisect(inputRegions[refIndex[RI][0]],rInd+GC_WINDOW_SIZE)%2)
                ####        else:
                ####            tCov = False
                ####        if tCov:
                ####            tScl = 1.0
                ####        else:
                ####            tScl = OFFTARGET_SCALAR
                ####        gc_v = refSequence[rInd:rInd+GC_WINDOW_SIZE].count('G') + refSequence[rInd:rInd+GC_WINDOW_SIZE].count('C')
                ####        gScl = GC_SCALE_VAL[gc_v]
                ####        coverage_dat[1].append(1.0*tScl*gScl)
                ####    coverage_avg = np.mean(coverage_dat[1])

                #### pre-compute mutation rate tracks
                #### PROVIDED MUTATION RATES OVERRIDE AVERAGE VALUE

                # construct sequence data that we will sample reads from
                if sequences == None:
                    sequences = SequenceContainer(start,refSequence[start:end],PLOIDS,overlap,READLEN,[MUT_MODEL]*PLOIDS,MUT_RATE,onlyVCF=ONLY_VCF)
                else:
                    sequences.update(start,refSequence[start:end],PLOIDS,overlap,READLEN,[MUT_MODEL]*PLOIDS,MUT_RATE)

                # some inserted variant debugging...
                ####print '\n',start,end,end-overlap,'\n',varsFromPrevOverlap,'\n',varsInWindow,'\n'

                # insert variants
                sequences.insert_mutations(varsFromPrevOverlap + varsInWindow)
                all_inserted_variants = sequences.random_mutations()
                #print all_inserted_variants

                # init coverage
                if sum(coverage_dat[2]) >= LOW_COV_THRESH:
                    if PAIRED_END:
                        coverage_avg = sequences.init_coverage(tuple(coverage_dat),fragDist=FRAGLEN_DISTRIBUTION)
                    else:
                        coverage_avg = sequences.init_coverage(tuple(coverage_dat))

                # unused cancer stuff
                if CANCER:
                    tumor_sequences = SequenceContainer(start,refSequence[start:end],PLOIDS,overlap,READLEN,[CANCER_MODEL]*PLOIDS,MUT_RATE,coverage_dat)
                    tumor_sequences.insert_mutations(varsCancerFromPrevOverlap + all_inserted_variants)
                    all_cancer_variants = tumor_sequences.random_mutations()

                # which variants do we need to keep for next time (because of window overlap)?
                varsFromPrevOverlap       = []
                varsCancerFromPrevOverlap = []
                for n in all_inserted_variants:
                    if n[0] >= end-overlap-1:
                        varsFromPrevOverlap.append(n)
                if CANCER:
                    for n in all_cancer_variants:
                        if n[0] >= end-overlap-1:
                            varsCancerFromPrevOverlap.append(n)

                # if we're only producing VCF, no need to go through the hassle of generating reads
                if ONLY_VCF:
                    pass
                else:
                    if PAIRED_END:
                        readsToSample = int(((end-start)*float(COVERAGE)*coverage_avg)/(2*READLEN))+1
                    else:
                        readsToSample = int(((end-start)*float(COVERAGE)*coverage_avg)/(READLEN))+1

                    # if coverage is so low that no reads are to be sampled, skip region
                    # (i.e., remove buffer of +1 reads we add to every window)
                    if readsToSample == 1 and sum(coverage_dat[2]) < LOW_COV_THRESH:
                        readsToSample = 0
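                    # back-of-envelope example: a 10,000bp window at -c 30 with
                    # coverage_avg 1.0 and 100bp reads yields int(10000*30/(2*100))+1
                    # = 1501 pairs (paired-end) or int(10000*30/100)+1 = 3001 reads
                    # (single-end).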
                    # sample reads
                    ASDF2_TT = time.time()
                    for i in xrange(readsToSample):

                        isUnmapped = []
                        if PAIRED_END:
                            myFraglen  = FRAGLEN_DISTRIBUTION.sample()
                            myReadData = sequences.sample_read(SE_CLASS,myFraglen)
                            if myReadData == None:    # skip if we failed to find a valid position to sample read
                                continue
                            if myReadData[0][0] == None:
                                isUnmapped.append(True)
                            else:
                                isUnmapped.append(False)
                                myReadData[0][0] += start    # adjust mapping position based on window start
                            if myReadData[1][0] == None:
                                isUnmapped.append(True)
                            else:
                                isUnmapped.append(False)
                                myReadData[1][0] += start
                        else:
                            myReadData = sequences.sample_read(SE_CLASS)
                            if myReadData == None:    # skip if we failed to find a valid position to sample read
                                continue
                            if myReadData[0][0] == None:    # unmapped read (lives in large insertion)
                                isUnmapped = [True]
                            else:
                                isUnmapped = [False]
                                myReadData[0][0] += start    # adjust mapping position based on window start

                        if NJOBS > 1:
                            myReadName = OUT_PREFIX_NAME+'-j'+str(MYJOB)+'-'+refIndex[RI][0]+'-r'+str(readNameCount)
                        else:
                            myReadName = OUT_PREFIX_NAME+'-'+refIndex[RI][0]+'-'+str(readNameCount)
                        readNameCount += len(myReadData)

                        # if desired, replace all low-quality bases with Ns
                        if N_MAX_QUAL > -1:
                            for j in xrange(len(myReadData)):
                                myReadString = [n for n in myReadData[j][2]]
                                for k in xrange(len(myReadData[j][3])):
                                    adjusted_qual = ord(myReadData[j][3][k])-SE_CLASS.offQ
                                    if adjusted_qual <= N_MAX_QUAL:
                                        myReadString[k] = 'N'
                                myReadData[j][2] = ''.join(myReadString)

                        # if the read (or read + mate for PE) is unmapped, put it at the end of the bam file
                        if all(isUnmapped):
                            if PAIRED_END:
                                unmapped_records.append((myReadName+'/1',myReadData[0],109))
                                unmapped_records.append((myReadName+'/2',myReadData[1],157))
                            else:
                                unmapped_records.append((myReadName+'/1',myReadData[0],4))
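                        # SAM flag values used below, decoded as sums of standard SAM bits:
                        #    99/147 = properly-paired mapped pair (first/second in pair)
                        #   105/149 = pair in which the second mate is unmapped
                        #   101/153 = pair in which the first mate is unmapped
                        #   109/157 = both mates unmapped; 4 = unmapped single-end read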
                        # write read data out to FASTQ and BAM files, bypass FASTQ if option specified
                        myRefIndex = indices_by_refName[refIndex[RI][0]]
                        if len(myReadData) == 1:
                            if NO_FASTQ != True:
                                OFW.writeFASTQRecord(myReadName,myReadData[0][2],myReadData[0][3])
                            if SAVE_BAM:
                                if isUnmapped[0] == False:
                                    OFW.writeBAMRecord(myRefIndex, myReadName+'/1', myReadData[0][0], myReadData[0][1], myReadData[0][2], myReadData[0][3], samFlag=0)
                        elif len(myReadData) == 2:
                            if NO_FASTQ != True:
                                OFW.writeFASTQRecord(myReadName,myReadData[0][2],myReadData[0][3],read2=myReadData[1][2],qual2=myReadData[1][3])
                            if SAVE_BAM:
                                if isUnmapped[0] == False and isUnmapped[1] == False:
                                    OFW.writeBAMRecord(myRefIndex, myReadName+'/1', myReadData[0][0], myReadData[0][1], myReadData[0][2], myReadData[0][3], samFlag=99,  matePos=myReadData[1][0])
                                    OFW.writeBAMRecord(myRefIndex, myReadName+'/2', myReadData[1][0], myReadData[1][1], myReadData[1][2], myReadData[1][3], samFlag=147, matePos=myReadData[0][0])
                                elif isUnmapped[0] == False and isUnmapped[1] == True:
                                    OFW.writeBAMRecord(myRefIndex, myReadName+'/1', myReadData[0][0], myReadData[0][1], myReadData[0][2], myReadData[0][3], samFlag=105, matePos=myReadData[0][0])
                                    OFW.writeBAMRecord(myRefIndex, myReadName+'/2', myReadData[0][0], myReadData[1][1], myReadData[1][2], myReadData[1][3], samFlag=149, matePos=myReadData[0][0], alnMapQual=0)
                                elif isUnmapped[0] == True and isUnmapped[1] == False:
                                    OFW.writeBAMRecord(myRefIndex, myReadName+'/1', myReadData[1][0], myReadData[0][1], myReadData[0][2], myReadData[0][3], samFlag=101, matePos=myReadData[1][0], alnMapQual=0)
                                    OFW.writeBAMRecord(myRefIndex, myReadName+'/2', myReadData[1][0], myReadData[1][1], myReadData[1][2], myReadData[1][3], samFlag=153, matePos=myReadData[1][0])
                        else:
                            print '\nError: Unexpected number of reads generated...\n'
                            exit(1)
                    #print 'READS:',time.time()-ASDF2_TT

                if not isLastTime:
                    OFW.flushBuffers(bamMax=next_start)
                else:
                    OFW.flushBuffers(bamMax=end+1)

                # tally up all the variants that got successfully introduced
                for n in all_inserted_variants:
                    ALL_VARIANTS_OUT[n] = True

                # prepare indices of next window
                start = next_start
                end   = next_end
                if isLastTime:
                    break
                if end >= pf:
                    isLastTime = True

        if currentPercent != 100 and not havePrinted100:
            print '100%'
        else:
            print ''
        if ONLY_VCF:
            print 'VCF generation completed in',
        else:
            print 'Read sampling completed in',
        print int(time.time()-tt),'(sec)'

        # write all output variants for this reference
        if SAVE_VCF:
            print 'Writing output VCF...'
            for k in sorted(ALL_VARIANTS_OUT.keys()):
                currentRef = refIndex[RI][0]
                myID   = '.'
                myQual = '.'
                myFilt = 'PASS'
                # k[0] + 1 because we're going back to 1-based vcf coords
                OFW.writeVCFRecord(currentRef, str(int(k[0])+1), myID, k[1], k[2], myQual, myFilt, k[4])
            #break

    # write unmapped reads to bam file
    if SAVE_BAM and len(unmapped_records):
        print 'writing unmapped reads to bam file...'
        for umr in unmapped_records:
            if PAIRED_END:
                OFW.writeBAMRecord(-1, umr[0], 0, umr[1][1], umr[1][2], umr[1][3], samFlag=umr[2], matePos=0, alnMapQual=0)
            else:
                OFW.writeBAMRecord(-1, umr[0], 0, umr[1][1], umr[1][2], umr[1][3], samFlag=umr[2], alnMapQual=0)

    # close output files
    OFW.closeFiles()
    if CANCER:
        OFW_CANCER.closeFiles()


if __name__ == '__main__':
    main()