annotate tests/compute-site.py @ 5:31361191d2d2

Uploaded tarball. Version 1.1: Stranded output, slightly different handling of minor allele ties and 0 coverage sites, revised help text, added test datasets.
author nick
date Thu, 12 Sep 2013 11:34:23 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
5
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
1 #!/usr/bin/python
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
2 import os
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
3 import sys
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
4 from optparse import OptionParser
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
5
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
6 OPT_DEFAULTS = {'freq_thres':0, 'covg_thres':0}
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
7 BASES = ['A', 'C', 'G', 'T']
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
8 USAGE = ("Usage: %prog (options) 'variant_str' ('variant_str2' (etc))\n"
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
9 +" cat variants.txt > %prog (options)")
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
10
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
11 def main():
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
12
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
13 parser = OptionParser(usage=USAGE)
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
14 parser.add_option('-f', '--freq-thres', dest='freq_thres', type='float',
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
15 default=OPT_DEFAULTS.get('freq_thres'),
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
16 help=('Frequency threshold for counting alleles, given in percentage: -f 1 '
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
17 +'= 1% frequency. Default is %default%.'))
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
18 parser.add_option('-c', '--covg-thres', dest='covg_thres', type='int',
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
19 default=OPT_DEFAULTS.get('covg_thres'),
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
20 help=('Coverage threshold. Each site must be supported by at least this '
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
21 +'many reads on each strand. Otherwise the site will not be printed in '
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
22 +'the output. The default is %default reads per strand.'))
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
23 (options, args) = parser.parse_args()
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
24 freq_thres = options.freq_thres
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
25 covg_thres = options.covg_thres
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
26
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
27 if len(sys.argv) > 1 and '-h' in sys.argv[1][0:3]:
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
28 script_name = os.path.basename(sys.argv[0])
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
29 print """USAGE:
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
30 $ """+script_name+""" [sample column text]
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
31 $ """+script_name+""" '+A=10,+G=2,-A=20,-G=41,'
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
32 $ """+script_name+""" '0/1:10,1:0.25,0.025:-A=29,-AG=1,-G=10,'
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
33 $ """+script_name+""" '+A=10,+G=2,-A=20,-G=41,' '0/1:10,1:0.25,0.025:-A=29,-AG=1,-G=10,'
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
34 Or invoke with no arguments to use interactively. It will read from stdin, so
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
35 just paste one sample per line."""
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
36 sys.exit(0)
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
37
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
38 if len(args) > 0:
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
39 stdin = False
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
40 samples = args
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
41 else:
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
42 stdin = True
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
43 samples = sys.stdin
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
44 print "Reading from standard input.."
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
45
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
46 for sample in samples:
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
47 print ''
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
48 sample = sample.split(':')[-1]
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
49 if not sample:
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
50 continue
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
51 print sample
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
52 counts_dict = parse_counts(sample)
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
53 compute_stats(counts_dict, freq_thres, covg_thres)
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
54
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
55
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
56 def parse_counts(sample_str):
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
57 counts_dict = {}
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
58 counts = sample_str.split(',')
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
59 for count in counts:
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
60 if '=' not in count:
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
61 continue
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
62 (var, reads) = count.split('=')
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
63 if var[1:] in BASES:
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
64 counts_dict[var] = int(reads)
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
65 return counts_dict
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
66
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
67
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
68 def compute_stats(counts_dict, freq_thres, covg_thres):
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
69 # totals for A, C, G, T
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
70 counts_unstranded = {}
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
71 for base in BASES:
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
72 counts_unstranded[base] = 0
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
73 for strand in '+-':
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
74 counts_unstranded[base] += counts_dict.get(strand+base, 0)
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
75 print '+- '+str(counts_unstranded)
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
76
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
77 # get counts for each strand
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
78 plus_counts = get_stranded_counts(counts_dict, '+')
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
79 minus_counts = get_stranded_counts(counts_dict, '-')
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
80 counts_lists = {'+':plus_counts, '-':minus_counts}
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
81 print ' + '+str(plus_counts)
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
82 print ' - '+str(minus_counts)
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
83
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
84 # stranded coverage threshold
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
85 coverages = {}
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
86 failing_strands = {}
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
87 for strand in '+-':
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
88 coverages[strand] = 0
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
89 for count in counts_lists[strand].values():
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
90 coverages[strand] += count
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
91 if coverages[strand] < covg_thres:
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
92 failing_strands[strand] = coverages[strand]
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
93 sys.stdout.write(strand+'coverage: '+str(coverages[strand])+"\t")
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
94 coverages['+-'] = coverages['+'] + coverages['-']
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
95 sys.stdout.write("+-coverage: "+str(coverages['+-'])+"\n")
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
96
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
97 if failing_strands:
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
98 for strand in failing_strands:
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
99 print ('coverage on '+strand+' strand too low ('
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
100 +str(failing_strands[strand])+' < '+str(covg_thres)+")")
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
101 return
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
102
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
103 # apply frequency threshold
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
104 for strand in counts_lists:
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
105 strand_counts = counts_lists[strand]
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
106 for variant in strand_counts.keys():
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
107 # print (variant+" freq: "+str(strand_counts[variant])+"/"
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
108 # +str(coverages[strand])+" = "
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
109 # +str(strand_counts[variant]/float(coverages[strand])))
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
110 if strand_counts[variant]/float(coverages[strand]) < freq_thres:
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
111 strand_counts.pop(variant)
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
112 plus_variants = sorted(plus_counts.keys())
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
113 minus_variants = sorted(minus_counts.keys())
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
114 if plus_variants == minus_variants:
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
115 strand_bias = False
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
116 print "no strand bias: +"+str(plus_variants)+" == -"+str(minus_variants)
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
117 sys.stdout.write("alleles: "+str(len(plus_variants))+"\t")
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
118 else:
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
119 strand_bias = True
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
120 print " strand bias: +"+str(plus_variants)+" != -"+str(minus_variants)
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
121 sys.stdout.write("alleles: 0\t")
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
122
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
123 variants_sorted = sort_variants(counts_unstranded)
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
124 if len(variants_sorted) >= 1:
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
125 sys.stdout.write("major: "+variants_sorted[0]+"\t")
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
126 minor = '.'
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
127 if len(variants_sorted) == 2:
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
128 minor = variants_sorted[1]
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
129 elif len(variants_sorted) > 2:
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
130 if (counts_unstranded.get(variants_sorted[1]) ==
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
131 counts_unstranded.get(variants_sorted[2])):
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
132 minor = 'N'
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
133 else:
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
134 minor = variants_sorted[1]
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
135
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
136 sys.stdout.write("minor: "+minor+"\tfreq: ")
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
137 print round(float(counts_unstranded.get(minor, 0))/coverages['+-'], 5)
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
138
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
139
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
140 def get_stranded_counts(unstranded_counts, strand):
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
141 stranded_counts = {}
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
142 for variant in unstranded_counts:
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
143 if variant[0] == strand:
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
144 stranded_counts[variant[1:]] = unstranded_counts[variant]
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
145 return stranded_counts
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
146
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
147 def sort_variants(variant_counts):
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
148 """Sort the list of variants based on their counts. Returns a list of just
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
149 the variants, no counts."""
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
150 variants = variant_counts.keys()
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
151 var_del = []
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
152 for variant in variants:
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
153 if variant_counts.get(variant) == 0:
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
154 var_del.append(variant)
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
155 for variant in var_del:
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
156 variants.remove(variant)
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
157 variants.sort(reverse=True, key=lambda variant: variant_counts.get(variant,0))
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
158 return variants
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
159
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
160 if __name__ == "__main__":
31361191d2d2 Uploaded tarball.
nick
parents:
diff changeset
161 main()