comparison tests/compute-site.py @ 5:31361191d2d2

Uploaded tarball. Version 1.1: Stranded output, slightly different handling of minor allele ties and 0 coverage sites, revised help text, added test datasets.
author nick
date Thu, 12 Sep 2013 11:34:23 -0400
parents
children
comparison
equal deleted inserted replaced
4:898eb3daab43 5:31361191d2d2
1 #!/usr/bin/python
2 import os
3 import sys
4 from optparse import OptionParser
5
6 OPT_DEFAULTS = {'freq_thres':0, 'covg_thres':0}
7 BASES = ['A', 'C', 'G', 'T']
8 USAGE = ("Usage: %prog (options) 'variant_str' ('variant_str2' (etc))\n"
9 +" cat variants.txt > %prog (options)")
10
11 def main():
12
13 parser = OptionParser(usage=USAGE)
14 parser.add_option('-f', '--freq-thres', dest='freq_thres', type='float',
15 default=OPT_DEFAULTS.get('freq_thres'),
16 help=('Frequency threshold for counting alleles, given in percentage: -f 1 '
17 +'= 1% frequency. Default is %default%.'))
18 parser.add_option('-c', '--covg-thres', dest='covg_thres', type='int',
19 default=OPT_DEFAULTS.get('covg_thres'),
20 help=('Coverage threshold. Each site must be supported by at least this '
21 +'many reads on each strand. Otherwise the site will not be printed in '
22 +'the output. The default is %default reads per strand.'))
23 (options, args) = parser.parse_args()
24 freq_thres = options.freq_thres
25 covg_thres = options.covg_thres
26
27 if len(sys.argv) > 1 and '-h' in sys.argv[1][0:3]:
28 script_name = os.path.basename(sys.argv[0])
29 print """USAGE:
30 $ """+script_name+""" [sample column text]
31 $ """+script_name+""" '+A=10,+G=2,-A=20,-G=41,'
32 $ """+script_name+""" '0/1:10,1:0.25,0.025:-A=29,-AG=1,-G=10,'
33 $ """+script_name+""" '+A=10,+G=2,-A=20,-G=41,' '0/1:10,1:0.25,0.025:-A=29,-AG=1,-G=10,'
34 Or invoke with no arguments to use interactively. It will read from stdin, so
35 just paste one sample per line."""
36 sys.exit(0)
37
38 if len(args) > 0:
39 stdin = False
40 samples = args
41 else:
42 stdin = True
43 samples = sys.stdin
44 print "Reading from standard input.."
45
46 for sample in samples:
47 print ''
48 sample = sample.split(':')[-1]
49 if not sample:
50 continue
51 print sample
52 counts_dict = parse_counts(sample)
53 compute_stats(counts_dict, freq_thres, covg_thres)
54
55
56 def parse_counts(sample_str):
57 counts_dict = {}
58 counts = sample_str.split(',')
59 for count in counts:
60 if '=' not in count:
61 continue
62 (var, reads) = count.split('=')
63 if var[1:] in BASES:
64 counts_dict[var] = int(reads)
65 return counts_dict
66
67
68 def compute_stats(counts_dict, freq_thres, covg_thres):
69 # totals for A, C, G, T
70 counts_unstranded = {}
71 for base in BASES:
72 counts_unstranded[base] = 0
73 for strand in '+-':
74 counts_unstranded[base] += counts_dict.get(strand+base, 0)
75 print '+- '+str(counts_unstranded)
76
77 # get counts for each strand
78 plus_counts = get_stranded_counts(counts_dict, '+')
79 minus_counts = get_stranded_counts(counts_dict, '-')
80 counts_lists = {'+':plus_counts, '-':minus_counts}
81 print ' + '+str(plus_counts)
82 print ' - '+str(minus_counts)
83
84 # stranded coverage threshold
85 coverages = {}
86 failing_strands = {}
87 for strand in '+-':
88 coverages[strand] = 0
89 for count in counts_lists[strand].values():
90 coverages[strand] += count
91 if coverages[strand] < covg_thres:
92 failing_strands[strand] = coverages[strand]
93 sys.stdout.write(strand+'coverage: '+str(coverages[strand])+"\t")
94 coverages['+-'] = coverages['+'] + coverages['-']
95 sys.stdout.write("+-coverage: "+str(coverages['+-'])+"\n")
96
97 if failing_strands:
98 for strand in failing_strands:
99 print ('coverage on '+strand+' strand too low ('
100 +str(failing_strands[strand])+' < '+str(covg_thres)+")")
101 return
102
103 # apply frequency threshold
104 for strand in counts_lists:
105 strand_counts = counts_lists[strand]
106 for variant in strand_counts.keys():
107 # print (variant+" freq: "+str(strand_counts[variant])+"/"
108 # +str(coverages[strand])+" = "
109 # +str(strand_counts[variant]/float(coverages[strand])))
110 if strand_counts[variant]/float(coverages[strand]) < freq_thres:
111 strand_counts.pop(variant)
112 plus_variants = sorted(plus_counts.keys())
113 minus_variants = sorted(minus_counts.keys())
114 if plus_variants == minus_variants:
115 strand_bias = False
116 print "no strand bias: +"+str(plus_variants)+" == -"+str(minus_variants)
117 sys.stdout.write("alleles: "+str(len(plus_variants))+"\t")
118 else:
119 strand_bias = True
120 print " strand bias: +"+str(plus_variants)+" != -"+str(minus_variants)
121 sys.stdout.write("alleles: 0\t")
122
123 variants_sorted = sort_variants(counts_unstranded)
124 if len(variants_sorted) >= 1:
125 sys.stdout.write("major: "+variants_sorted[0]+"\t")
126 minor = '.'
127 if len(variants_sorted) == 2:
128 minor = variants_sorted[1]
129 elif len(variants_sorted) > 2:
130 if (counts_unstranded.get(variants_sorted[1]) ==
131 counts_unstranded.get(variants_sorted[2])):
132 minor = 'N'
133 else:
134 minor = variants_sorted[1]
135
136 sys.stdout.write("minor: "+minor+"\tfreq: ")
137 print round(float(counts_unstranded.get(minor, 0))/coverages['+-'], 5)
138
139
140 def get_stranded_counts(unstranded_counts, strand):
141 stranded_counts = {}
142 for variant in unstranded_counts:
143 if variant[0] == strand:
144 stranded_counts[variant[1:]] = unstranded_counts[variant]
145 return stranded_counts
146
147 def sort_variants(variant_counts):
148 """Sort the list of variants based on their counts. Returns a list of just
149 the variants, no counts."""
150 variants = variant_counts.keys()
151 var_del = []
152 for variant in variants:
153 if variant_counts.get(variant) == 0:
154 var_del.append(variant)
155 for variant in var_del:
156 variants.remove(variant)
157 variants.sort(reverse=True, key=lambda variant: variant_counts.get(variant,0))
158 return variants
159
160 if __name__ == "__main__":
161 main()