Mercurial > repos > thondeboer > neat_genreads
annotate py/vcfFunc.py @ 7:fc1c7b6fb7b6 draft
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
| author | thondeboer | 
|---|---|
| date | Tue, 15 May 2018 18:12:29 -0400 | 
| parents | 6e75a84e9338 | 
| children | 
| rev | line source | 
|---|---|
| 0 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 1 import sys | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 2 import time | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 3 import os | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 4 import re | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 5 import random | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 6 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 7 INCLUDE_HOMS = False | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 8 INCLUDE_FAIL = False | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 9 CHOOSE_RANDOM_PLOID_IF_NO_GT_FOUND = True | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 10 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 11 def parseLine(splt,colDict,colSamp): | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 12 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 13 # check if we want to proceed.. | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 14 ra = splt[colDict['REF']] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 15 aa = splt[colDict['ALT']] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 16 # enough columns? | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 17 if len(splt) != len(colDict): | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 18 return None | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 19 # exclude homs / filtered? | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 20 if not(INCLUDE_HOMS) and (aa == '.' or aa == '' or aa == ra): | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 21 return None | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 22 if not(INCLUDE_FAIL) and (splt[colDict['FILTER']] != 'PASS' and splt[colDict['FILTER']] != '.'): | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 23 return None | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 24 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 25 # default vals | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 26 alt_alleles = [aa] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 27 alt_freqs = [] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 28 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 29 gt_perSamp = [] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 30 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 31 # any alt alleles? | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 32 alt_split = aa.split(',') | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 33 if len(alt_split) > 1: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 34 alt_alleles = alt_split | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 35 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 36 # check INFO for AF | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 37 af = None | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 38 if 'INFO' in colDict and ';AF=' in ';'+splt[colDict['INFO']]: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 39 info = splt[colDict['INFO']]+';' | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 40 af = re.findall(r"AF=.*?(?=;)",info)[0][3:] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 41 if af != None: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 42 af_splt = af.split(',') | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 43 while(len(af_splt) < len(alt_alleles)): # are we lacking enough AF values for some reason? | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 44 af_splt.append(af_splt[-1]) # phone it in. | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 45 if len(af_splt) != 0 and af_splt[0] != '.' and af_splt[0] != '': # missing data, yay | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 46 alt_freqs = [float(n) for n in af_splt] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 47 else: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 48 alt_freqs = [None]*max([len(alt_alleles),1]) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 49 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 50 gt_perSamp = None | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 51 # if available (i.e. we simulated it) look for WP in info | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 52 if len(colSamp) == 0 and 'INFO' in colDict and 'WP=' in splt[colDict['INFO']]: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 53 info = splt[colDict['INFO']]+';' | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 54 gt_perSamp = [re.findall(r"WP=.*?(?=;)",info)[0][3:]] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 55 else: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 56 # if no sample columns, check info for GT | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 57 if len(colSamp) == 0 and 'INFO' in colDict and 'GT=' in splt[colDict['INFO']]: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 58 info = splt[colDict['INFO']]+';' | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 59 gt_perSamp = [re.findall(r"GT=.*?(?=;)",info)[0][3:]] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 60 elif len(colSamp): | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 61 fmt = ':'+splt[colDict['FORMAT']]+':' | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 62 if ':GT:' in fmt: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 63 gtInd = fmt.split(':').index('GT') | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 64 gt_perSamp = [splt[colSamp[iii]].split(':')[gtInd-1] for iii in xrange(len(colSamp))] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 65 for i in xrange(len(gt_perSamp)): | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 66 gt_perSamp[i] = gt_perSamp[i].replace('.','0') | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 67 if gt_perSamp == None: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 68 gt_perSamp = [None]*max([len(colSamp),1]) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 69 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 70 return (alt_alleles, alt_freqs, gt_perSamp) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 71 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 72 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 73 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 74 def parseVCF(vcfPath,tumorNormal=False,ploidy=2): | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 75 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 76 tt = time.time() | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 77 print '--------------------------------' | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 78 sys.stdout.write('reading input VCF...\n') | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 79 sys.stdout.flush() | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 80 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 81 colDict = {} | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 82 colSamp = [] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 83 nSkipped = 0 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 84 nSkipped_becauseHash = 0 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 85 allVars = {} # [ref][pos] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 86 sampNames = [] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 87 alreadyPrintedWarning = False | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 88 f = open(vcfPath,'r') | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 89 for line in f: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 90 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 91 if line[0] != '#': | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 92 if len(colDict) == 0: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 93 print '\n\nERROR: VCF has no header?\n'+VCF_FILENAME+'\n\n' | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 94 f.close() | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 95 exit(1) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 96 splt = line[:-1].split('\t') | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 97 plOut = parseLine(splt,colDict,colSamp) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 98 if plOut == None: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 99 nSkipped += 1 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 100 else: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 101 (aa, af, gt) = plOut | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 102 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 103 # make sure at least one allele somewhere contains the variant | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 104 if tumorNormal: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 105 gtEval = gt[:2] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 106 else: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 107 gtEval = gt[:1] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 108 if None in gtEval: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 109 if CHOOSE_RANDOM_PLOID_IF_NO_GT_FOUND: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 110 if not alreadyPrintedWarning: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 111 print 'Warning: Found variants without a GT field, assuming heterozygous...' | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 112 alreadyPrintedWarning = True | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 113 for i in xrange(len(gtEval)): | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 114 tmp = ['0']*ploidy | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 115 tmp[random.randint(0,ploidy-1)] = '1' | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 116 gtEval[i] = '/'.join(tmp) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 117 else: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 118 # skip because no GT field was found | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 119 nSkipped += 1 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 120 continue | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 121 isNonReference = False | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 122 for gtVal in gtEval: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 123 if gtVal != None: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 124 if '1' in gtVal: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 125 isNonReference = True | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 126 if not isNonReference: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 127 # skip if no genotype actually contains this variant | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 128 nSkipped += 1 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 129 continue | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 130 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 131 chrom = splt[0] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 132 pos = int(splt[1]) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 133 ref = splt[3] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 134 # skip if position is <= 0 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 135 if pos <= 0: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 136 nSkipped += 1 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 137 continue | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 138 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 139 # hash variants to avoid inserting duplicates (there are some messy VCFs out there...) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 140 if chrom not in allVars: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 141 allVars[chrom] = {} | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 142 if pos not in allVars[chrom]: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 143 allVars[chrom][pos] = (pos,ref,aa,af,gtEval) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 144 else: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 145 nSkipped_becauseHash += 1 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 146 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 147 else: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 148 if line[1] != '#': | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 149 cols = line[1:-1].split('\t') | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 150 for i in xrange(len(cols)): | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 151 if 'FORMAT' in colDict: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 152 colSamp.append(i) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 153 colDict[cols[i]] = i | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 154 if len(colSamp): | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 155 sampNames = cols[-len(colSamp):] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 156 if len(colSamp) == 1: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 157 pass | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 158 elif len(colSamp) == 2 and tumorNormal: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 159 print 'Detected 2 sample columns in input VCF, assuming tumor/normal.' | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 160 else: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 161 print 'Warning: Multiple sample columns present in input VCF. By default genReads uses only the first column.' | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 162 else: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 163 sampNames = ['Unknown'] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 164 if tumorNormal: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 165 #tumorInd = sampNames.index('TUMOR') | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 166 #normalInd = sampNames.index('NORMAL') | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 167 if 'NORMAL' not in sampNames or 'TUMOR' not in sampNames: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 168 print '\n\nERROR: Input VCF must have a "NORMAL" and "TUMOR" column.\n' | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 169 f.close() | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 170 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 171 varsOut = {} | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 172 for r in allVars.keys(): | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 173 varsOut[r] = [allVars[r][k] for k in sorted(allVars[r].keys())] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 174 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 175 print 'found',sum([len(n) for n in allVars.values()]),'valid variants in input vcf.' | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 176 print ' *',nSkipped,'variants skipped: (qual filtered / ref genotypes / invalid syntax)' | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 177 print ' *',nSkipped_becauseHash,'variants skipped due to multiple variants found per position' | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 178 print '--------------------------------' | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 179 return (sampNames, varsOut) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 180 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 181 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 182 | 
