| 
22
 | 
     1 #!/usr/bin/python
 | 
| 
 | 
     2 
 | 
| 
 | 
     3 import re
 | 
| 
 | 
     4 import sys
 | 
| 
 | 
     5 import optparse
 | 
| 
 | 
     6 import csv
 | 
| 
 | 
     7 import re
 | 
| 
 | 
     8 import pprint
 | 
| 
 | 
     9 from decimal import *
 | 
| 
 | 
    10 from rpy import *
 | 
| 
 | 
    11 
 | 
| 
 | 
    12 def main():
 | 
| 
 | 
    13 	csv.field_size_limit(1000000000)
 | 
| 
 | 
    14 
 | 
| 
 | 
    15 	parser = optparse.OptionParser()
 | 
| 
 | 
    16 	parser.add_option('-v', '--sample_vcf', dest = 'sample_vcf', action = 'store', type = 'string', default = None, help = "Sample VCF from GATK Unified Genotyper")
 | 
| 
 | 
    17 	parser.add_option('-l', '--loess_span', dest = 'loess_span', action = 'store', type = 'float', default = .01, help = "Loess span")
 | 
| 
 | 
    18 	parser.add_option('-d', '--d_yaxis', dest = 'd_yaxis', action = 'store', type = 'float', default = .7, help = "y-axis upper limit for dot plot")  
 | 
| 
 | 
    19 	parser.add_option('-y', '--h_yaxis', dest = 'h_yaxis', action = 'store', type = 'int', default = 5, help = "y-axis upper limit for histogram plot")   
 | 
| 
 | 
    20 	parser.add_option('-c', '--points_color', dest = 'points_color', action = 'store', type = 'string', default = "gray27", help = "Color for data points") 
 | 
| 
 | 
    21 	parser.add_option('-k', '--loess_color', dest = 'loess_color', action = 'store', type = 'string', default = "red", help = "Color for loess regression line")        
 | 
| 
 | 
    22 	parser.add_option('-z', '--standardize', dest = 'standardize', default= 'true', help = "Standardize X-axis")
 | 
| 
 | 
    23 	parser.add_option('-b', '--break_file', dest = 'break_file', action = 'store', type = 'string', default = 'C.elegans', help = "File defining the breaks per chromosome")
 | 
| 
 | 
    24 	parser.add_option('-x', '--bin_size', dest = 'bin_size', action = 'store', type = 'int', default = 1000000, help = "Size of histogram bins, default is 1mb")
 | 
| 
 | 
    25 	parser.add_option('-n', '--normalize_bins', dest = 'normalize_bins', default= 'true', help = "Normalize histograms")
 | 
| 
 | 
    26 
 | 
| 
 | 
    27 
 | 
| 
 | 
    28 	parser.add_option('-o', '--output', dest = 'output', action = 'store', type = 'string', default = None, help = "Output file name")
 | 
| 
 | 
    29 	parser.add_option('-s', '--location_plot_output', dest = 'location_plot_output', action = 'store', type = 'string', default = "SNP_Mapping_Plot.pdf", help = "Output file name of SNP plots by chromosomal location")
 | 
| 
 | 
    30 
 | 
| 
 | 
    31 	(options, args) = parser.parse_args()
 | 
| 
 | 
    32 
 | 
| 
 | 
    33 	vcf_info = parse_vcf(sample_vcf = options.sample_vcf)
 | 
| 
 | 
    34 
 | 
| 
 | 
    35 	output_vcf_info(output = options.output, vcf_info = vcf_info)
 | 
| 
 | 
    36 	
 | 
| 
 | 
    37 	#output plot with all ratios
 | 
| 
 | 
    38 	rounded_bin_size = int(round((float(options.bin_size) / 1000000), 1) * 1000000)
 | 
| 
 | 
    39 	
 | 
| 
 | 
    40 	normalized_histogram_bins_per_mb = calculate_normalized_histogram_bins_per_xbase(vcf_info = vcf_info, xbase = rounded_bin_size, normalize_bins = options.normalize_bins)
 | 
| 
 | 
    41 	normalized_histogram_bins_per_5kb = calculate_normalized_histogram_bins_per_xbase(vcf_info = vcf_info, xbase = (rounded_bin_size / 2), normalize_bins = options.normalize_bins)
 | 
| 
 | 
    42 
 | 
| 
 | 
    43 	break_dict = parse_breaks(break_file = options.break_file)
 | 
| 
 | 
    44 
 | 
| 
 | 
    45 	output_scatter_plots_by_location(location_plot_output = options.location_plot_output, vcf_info = vcf_info, loess_span=options.loess_span, d_yaxis=options.d_yaxis, h_yaxis=options.h_yaxis, points_color=options.points_color, loess_color=options.loess_color, standardize =options.standardize, normalized_hist_per_mb = normalized_histogram_bins_per_mb, normalized_hist_per_5kb = normalized_histogram_bins_per_5kb, breaks = break_dict, rounded_bin_size = rounded_bin_size)
 | 
| 
 | 
    46 
 | 
| 
 | 
    47 
 | 
| 
 | 
    48 def skip_headers(reader = None, i_file = None):
 | 
| 
 | 
    49 	# count headers
 | 
| 
 | 
    50 	comment = 0
 | 
| 
 | 
    51 	while reader.next()[0].startswith('#'):
 | 
| 
 | 
    52 		comment = comment + 1
 | 
| 
 | 
    53 	
 | 
| 
 | 
    54 	# skip headers
 | 
| 
 | 
    55 	i_file.seek(0)
 | 
| 
 | 
    56 	for i in range(0, comment):
 | 
| 
 | 
    57 		reader.next()
 | 
| 
 | 
    58 
 | 
| 
 | 
    59 def parse_breaks(break_file = None):
 | 
| 
 | 
    60 	if break_file == 'C.elegans':
 | 
| 
 | 
    61 		break_dict = { 'I' : 16 , 'II' : 16,  'III' : 14, 'IV' : 18, 'V' : 21, 'X' : 18 }
 | 
| 
 | 
    62 		return break_dict
 | 
| 
 | 
    63 	elif break_file == 'Arabidopsis':
 | 
| 
 | 
    64 		break_dict = { '1' : 31 , '2' : 20,  '3' : 24, '4' : 19, '5' : 27 }
 | 
| 
 | 
    65 		return break_dict
 | 
| 
 | 
    66 	else:
 | 
| 
 | 
    67 		i_file = open(break_file, 'rU')
 | 
| 
 | 
    68 		break_dict = {}
 | 
| 
 | 
    69 		reader = csv.reader(i_file, delimiter = '\t')
 | 
| 
 | 
    70 		for row in reader:
 | 
| 
 | 
    71 			chromosome = row[0].upper()
 | 
| 
 | 
    72 			chromosome = re.sub("chr", "", chromosome, flags = re.IGNORECASE)
 | 
| 
 | 
    73 			chromosome = re.sub("CHROMOSOME_", "", chromosome, flags = re.IGNORECASE)
 | 
| 
 | 
    74 			break_count = row[1]
 | 
| 
 | 
    75 			break_dict[chromosome] = int(break_count)
 | 
| 
 | 
    76 		return break_dict
 | 
| 
 | 
    77 
 | 
| 
 | 
    78 
 | 
| 
 | 
    79 def location_comparer(location_1, location_2):
 | 
| 
 | 
    80 	chr_loc_1 = location_1.split(':')[0]
 | 
| 
 | 
    81 	pos_loc_1 = int(location_1.split(':')[1])
 | 
| 
 | 
    82 
 | 
| 
 | 
    83 	chr_loc_2 = location_2.split(':')[0]
 | 
| 
 | 
    84 	pos_loc_2 = int(location_2.split(':')[1])
 | 
| 
 | 
    85 
 | 
| 
 | 
    86 	if chr_loc_1 == chr_loc_2:
 | 
| 
 | 
    87 		if pos_loc_1 < pos_loc_2:
 | 
| 
 | 
    88 			return -1
 | 
| 
 | 
    89 		elif pos_loc_1 == pos_loc_1:
 | 
| 
 | 
    90 			return 0
 | 
| 
 | 
    91 		elif pos_loc_1 > pos_loc_2:
 | 
| 
 | 
    92 			return 1
 | 
| 
 | 
    93 	elif chr_loc_1 < chr_loc_2:
 | 
| 
 | 
    94 		return -1
 | 
| 
 | 
    95 	elif chr_loc_1 > chr_loc_2:
 | 
| 
 | 
    96 		return 1
 | 
| 
 | 
    97 
 | 
| 
 | 
    98 def output_vcf_info(output = None, vcf_info = None):
 | 
| 
 | 
    99 	o_file = open(output, 'wb')
 | 
| 
 | 
   100 	writer = csv.writer(o_file, delimiter = '\t')
 | 
| 
 | 
   101 
 | 
| 
 | 
   102 	writer.writerow(["#Chr\t", "Pos\t", "Alt Count\t", "Ref Count\t", "Read Depth\t", "Ratio\t"])
 | 
| 
 | 
   103 
 | 
| 
 | 
   104 	location_sorted_vcf_info_keys = sorted(vcf_info.keys(), cmp=location_comparer)
 | 
| 
 | 
   105 
 | 
| 
 | 
   106 	for location in location_sorted_vcf_info_keys:
 | 
| 
 | 
   107 		alt_allele_count, ref_allele_count, read_depth, ratio = vcf_info[location]
 | 
| 
 | 
   108 		
 | 
| 
 | 
   109 		location_info = location.split(':')
 | 
| 
 | 
   110 		chromosome = location_info[0]
 | 
| 
 | 
   111 		position = location_info[1]
 | 
| 
 | 
   112 
 | 
| 
 | 
   113 		writer.writerow([chromosome, position, alt_allele_count, ref_allele_count, read_depth, ratio])
 | 
| 
 | 
   114 
 | 
| 
 | 
   115 	o_file.close()
 | 
| 
 | 
   116 
 | 
| 
 | 
   117 def output_scatter_plots_by_location(location_plot_output = None, vcf_info = None, loess_span="", d_yaxis="", h_yaxis="", points_color="", loess_color="", standardize=None, normalized_hist_per_mb = None, normalized_hist_per_5kb = None, breaks = None, rounded_bin_size = 1000000):
 | 
| 
 | 
   118 	positions = {}
 | 
| 
 | 
   119 	current_chr = ""
 | 
| 
 | 
   120 	prev_chr = ""
 | 
| 
 | 
   121 
 | 
| 
 | 
   122 	x_label = "Location (Mb)"
 | 
| 
 | 
   123 	filtered_label = ''
 | 
| 
 | 
   124 
 | 
| 
 | 
   125 	location_sorted_vcf_info_keys = sorted(vcf_info.keys(), cmp=location_comparer)
 | 
| 
 | 
   126 	
 | 
| 
 | 
   127 	break_unit = Decimal(rounded_bin_size) / Decimal(1000000)
 | 
| 
 | 
   128 	max_breaks = max(breaks.values())
 | 
| 
 | 
   129 
 | 
| 
 | 
   130 	try:
 | 
| 
 | 
   131 		r.pdf(location_plot_output, 8, 8)
 | 
| 
 | 
   132 	
 | 
| 
 | 
   133 		for location in location_sorted_vcf_info_keys:
 | 
| 
 | 
   134 			current_chr = location.split(':')[0]
 | 
| 
 | 
   135 			position = location.split(':')[1]
 | 
| 
 | 
   136 
 | 
| 
 | 
   137 			alt_allele_count, ref_allele_count, read_depth, ratio = vcf_info[location]
 | 
| 
 | 
   138 		
 | 
| 
 | 
   139 			if prev_chr != current_chr:
 | 
| 
 | 
   140 				if prev_chr != "":
 | 
| 
 | 
   141 					hist_dict_mb = get_hist_dict_by_chr(normalized_hist_per_xbase = normalized_hist_per_mb, chr = prev_chr)
 | 
| 
 | 
   142 					hist_dict_5kb = get_hist_dict_by_chr(normalized_hist_per_xbase = normalized_hist_per_5kb, chr = prev_chr)
 | 
| 
 | 
   143 					
 | 
| 
 | 
   144 					plot_data(chr_dict = positions, hist_dict_mb = hist_dict_mb, hist_dict_5kb = hist_dict_5kb, chr = prev_chr + filtered_label, x_label = "Location (Mb)", divide_position = True, draw_secondary_grid_lines = True, loess_span=loess_span, d_yaxis=d_yaxis, h_yaxis=h_yaxis, points_color=points_color, loess_color=loess_color, breaks = breaks[prev_chr], standardize=standardize, max_breaks = max_breaks, break_unit = break_unit)
 | 
| 
 | 
   145 				
 | 
| 
 | 
   146 				prev_chr = current_chr
 | 
| 
 | 
   147 				positions = {}
 | 
| 
 | 
   148 		
 | 
| 
 | 
   149 			positions[position] = ratio
 | 
| 
 | 
   150 
 | 
| 
 | 
   151 		hist_dict_mb = get_hist_dict_by_chr(normalized_hist_per_xbase = normalized_hist_per_mb, chr = current_chr)
 | 
| 
 | 
   152 		hist_dict_5kb = get_hist_dict_by_chr(normalized_hist_per_xbase = normalized_hist_per_5kb, chr = current_chr)
 | 
| 
 | 
   153 							
 | 
| 
 | 
   154 		plot_data(chr_dict = positions, hist_dict_mb = hist_dict_mb, hist_dict_5kb = hist_dict_5kb, chr = current_chr + filtered_label, x_label = "Location (Mb)", divide_position = True, draw_secondary_grid_lines = True, loess_span=loess_span, d_yaxis=d_yaxis, h_yaxis=h_yaxis, points_color=points_color, loess_color=loess_color, breaks = breaks[current_chr], standardize=standardize, max_breaks = max_breaks, break_unit = break_unit)
 | 
| 
 | 
   155 
 | 
| 
 | 
   156 		r.dev_off()
 | 
| 
 | 
   157 		
 | 
| 
 | 
   158 	except Exception as inst:
 | 
| 
 | 
   159         	print inst
 | 
| 
 | 
   160         	print "There was an error creating the location plot pdf... Please try again"
 | 
| 
 | 
   161 
 | 
| 
 | 
   162 def get_hist_dict_by_chr(normalized_hist_per_xbase = None, chr = ''):
 | 
| 
 | 
   163 	hist_dict = {}	
 | 
| 
 | 
   164 
 | 
| 
 | 
   165 	for location in normalized_hist_per_xbase:
 | 
| 
 | 
   166 		chromosome = location.split(':')[0]		
 | 
| 
 | 
   167 		if chromosome == chr:
 | 
| 
 | 
   168 			position = int(location.split(':')[1])
 | 
| 
 | 
   169 			hist_dict[position] = normalized_hist_per_xbase[location]
 | 
| 
 | 
   170 	
 | 
| 
 | 
   171 	max_location = max(hist_dict.keys(), key=int)
 | 
| 
 | 
   172 	for i in range(1, max_location):
 | 
| 
 | 
   173 		if i not in hist_dict:
 | 
| 
 | 
   174 			hist_dict[i] = 0	
 | 
| 
 | 
   175 	
 | 
| 
 | 
   176 	return hist_dict	
 | 
| 
 | 
   177 
 | 
| 
 | 
   178 def plot_data(chr_dict =  None, hist_dict_mb = None, hist_dict_5kb = None, chr = "", x_label = "", divide_position = False, draw_secondary_grid_lines = False, loess_span=None, d_yaxis=None, h_yaxis=None, points_color="", loess_color="", breaks = None, standardize= None, max_breaks = 1, break_unit = 1):
 | 
| 
 | 
   179 	ratios = "c("
 | 
| 
 | 
   180 	positions = "c("
 | 
| 
 | 
   181 	
 | 
| 
 | 
   182 	for position in chr_dict:
 | 
| 
 | 
   183 		ratio = chr_dict[position]
 | 
| 
 | 
   184 		if divide_position:
 | 
| 
 | 
   185 		       	position = float(position) / 1000000.0
 | 
| 
 | 
   186 	        positions = positions + str(position) + ", "
 | 
| 
 | 
   187 		ratios = ratios + str(ratio) + ", "
 | 
| 
 | 
   188 
 | 
| 
 | 
   189 	if len(ratios) == 2:
 | 
| 
 | 
   190 		ratios = ratios + ")"
 | 
| 
 | 
   191 	else:
 | 
| 
 | 
   192 		ratios = ratios[0:len(ratios) - 2] + ")"
 | 
| 
 | 
   193 
 | 
| 
 | 
   194 	if len(positions) == 2:
 | 
| 
 | 
   195 		positions = positions + ")"
 | 
| 
 | 
   196 	else:
 | 
| 
 | 
   197 		positions = positions[0:len(positions) - 2] + ")"
 | 
| 
 | 
   198 
 | 
| 
 | 
   199 	r("x <- " + positions)
 | 
| 
 | 
   200 	r("y <- " + ratios)
 | 
| 
 | 
   201 
 | 
| 
 | 
   202 	hist_mb_values = "c("
 | 
| 
 | 
   203     	for position in sorted(hist_dict_mb):
 | 
| 
 | 
   204 		hist_mb_values = hist_mb_values + str(hist_dict_mb[position]) + ", "
 | 
| 
 | 
   205 	
 | 
| 
 | 
   206 	if len(hist_mb_values) == 2:
 | 
| 
 | 
   207 		hist_mb_values = hist_mb_values + ")"
 | 
| 
 | 
   208 	else:
 | 
| 
 | 
   209 		hist_mb_values = hist_mb_values[0:len(hist_mb_values) - 2] + ")"
 | 
| 
 | 
   210 
 | 
| 
 | 
   211 	hist_5kb_values = "c("
 | 
| 
 | 
   212 	for position in sorted(hist_dict_5kb):
 | 
| 
 | 
   213 		hist_5kb_values = hist_5kb_values + str(hist_dict_5kb[position]) + ", "	
 | 
| 
 | 
   214 
 | 
| 
 | 
   215 	if len(hist_5kb_values) == 2:
 | 
| 
 | 
   216 		hist_5kb_values = hist_5kb_values + ")"
 | 
| 
 | 
   217 	else:
 | 
| 
 | 
   218 		hist_5kb_values = hist_5kb_values[0:len(hist_5kb_values) - 2] + ")"
 | 
| 
 | 
   219 
 | 
| 
 | 
   220 	r("xz <- " + hist_mb_values)
 | 
| 
 | 
   221 	r("yz <- " + hist_5kb_values)
 | 
| 
 | 
   222 
 | 
| 
 | 
   223 
 | 
| 
 | 
   224 	max_break_str = str(max_breaks)
 | 
| 
 | 
   225 	break_unit_str = str(Decimal(break_unit)) 	
 | 
| 
 | 
   226 	half_break_unit_str = str(Decimal(break_unit) / Decimal(2))
 | 
| 
 | 
   227 	break_penta_unit_str = str(Decimal(break_unit) * Decimal(5))
 | 
| 
 | 
   228 
 | 
| 
 | 
   229 	if (standardize=='true'):  
 | 
| 
 | 
   230 		r("plot(x, y, ,cex=0.60, xlim=c(0," + max_break_str + "), main='LG " + chr + "', xlab= '" + x_label + "', ylim = c(0, %f " %d_yaxis + "), ylab='Ratios of mapping strain alleles/total reads (at SNP positions)', pch=18, col='"+ points_color +"')")
 | 
| 
 | 
   231 		r("lines(loess.smooth(x, y, span = %f "%loess_span + "), lwd=5, col='"+ loess_color +"')")
 | 
| 
 | 
   232 		r("axis(1, at=seq(0, " + max_break_str + ", by=" + break_unit_str + "), labels=FALSE, tcl=-0.5)")
 | 
| 
 | 
   233 		r("axis(1, at=seq(0, " + max_break_str + ", by=" + half_break_unit_str + "), labels=FALSE, tcl=-0.25)")
 | 
| 
 | 
   234 		r("axis(2, at=seq(floor(min(y)), 1, by=0.1), labels=FALSE, tcl=-0.2)")
 | 
| 
 | 
   235 	elif (standardize=='false'):
 | 
| 
 | 
   236 		r("plot(x, y, cex=0.60, main='LG " + chr + "', xlab= '" + x_label + "', ylim = c(0, %f " %d_yaxis + "), ylab='Ratios of mapping strain alleles/total reads (at SNP positions)', pch=18, col='"+ points_color +"')")
 | 
| 
 | 
   237 		r("lines(loess.smooth(x, y, span = %f "%loess_span + "), lwd=5, col='"+ loess_color +"')")    
 | 
| 
 | 
   238 		r("axis(1, at=seq(0, as.integer( ' " + str(breaks) + " '), by= " + break_unit_str + "), labels=FALSE, tcl=-0.5)")
 | 
| 
 | 
   239 		r("axis(1, at=seq(0, as.integer( ' " + str(breaks) + " '), by= " + half_break_unit_str + "), labels=FALSE, tcl=-0.25)")	
 | 
| 
 | 
   240 		r("axis(2, at=seq(floor(min(y)), 1, by=0.1), labels=FALSE, tcl=-0.2)")
 | 
| 
 | 
   241 
 | 
| 
 | 
   242 	if draw_secondary_grid_lines:
 | 
| 
 | 
   243 		r("abline(h = seq(floor(min(y)), 1, by=0.1), v = seq(floor(min(x)), length(x), by= 1), col='gray')")
 | 
| 
 | 
   244 	else:
 | 
| 
 | 
   245 		r("grid(lty = 1, col = 'gray')")
 | 
| 
 | 
   246 
 | 
| 
 | 
   247 	if (standardize=='true'):
 | 
| 
 | 
   248 		r("barplot(xz, xlim=c(0, " + max_break_str + "), ylim = c(0, " + str(h_yaxis) + "), yaxp=c(0, " + str(h_yaxis) + ", 1), space = 0, col='darkgray', width = " + break_unit_str + ", xlab='Location (Mb)', ylab='Normalized frequency of pure parental alleles ', main='LG " + chr + "')")
 | 
| 
 | 
   249 		r("barplot(yz, space = 0, add=TRUE, width = " + half_break_unit_str + ", col=rgb(1, 0, 0, 1))")	
 | 
| 
 | 
   250 		r("axis(1, hadj = 1, at=seq(0, " + max_break_str + ", by= " + break_unit_str + "), labels=FALSE, tcl=-0.5)")
 | 
| 
 | 
   251 		r("axis(1, at=seq(0, " + max_break_str + ", by= " + break_penta_unit_str + "), labels=TRUE, tcl=-0.5)")
 | 
| 
 | 
   252 		r("axis(1, at=seq(0, " + max_break_str + ", by= " + half_break_unit_str + "), labels=FALSE, tcl=-0.25)")
 | 
| 
 | 
   253 	elif (standardize=='false'):
 | 
| 
 | 
   254 		r("barplot(xz, ylim = c(0, " + str(h_yaxis) + "), yaxp=c(0, " + str(h_yaxis) + ", 1), space = 0, col='darkgray', width = 1, xlab='Location (Mb)', ylab='Normalized frequency of pure parental alleles ', main='LG " + chr + "')")
 | 
| 
 | 
   255 		r("barplot(yz, space = 0, add=TRUE, width = 0.5, col=rgb(1, 0, 0, 1))")	
 | 
| 
 | 
   256 		r("axis(1, at=seq(0, as.integer( ' " + str(breaks) + " '), by= " + break_unit_str + "), labels=FALSE, tcl=-0.5)")
 | 
| 
 | 
   257 		r("axis(1, at=seq(0, as.integer( ' " + str(breaks) + " '), by= " + break_penta_unit_str + "), labels=TRUE, tcl=-0.5)")
 | 
| 
 | 
   258 		r("axis(1, at=seq(0, as.integer( ' " + str(breaks) + " '), by= " + half_break_unit_str + "), labels=FALSE, tcl=-0.25)")
 | 
| 
 | 
   259 
 | 
| 
 | 
   260 
 | 
| 
 | 
   261 		
 | 
| 
 | 
   262 def calculate_normalized_histogram_bins_per_xbase(vcf_info = None, xbase = 1000000, normalize_bins = None):
 | 
| 
 | 
   263 	normalized_histogram_bins_per_xbase = {}
 | 
| 
 | 
   264 
 | 
| 
 | 
   265 	ref_snp_count_per_xbase = get_ref_snp_count_per_xbase(vcf_info = vcf_info, xbase = xbase)
 | 
| 
 | 
   266 	#print "ref_snp_count_per_xbase: " + str(ref_snp_count_per_xbase) 
 | 
| 
 | 
   267 	mean_zero_snp_count_per_chromosome = get_mean_zero_snp_count_per_chromosome(vcf_info = vcf_info, xbase = xbase)
 | 
| 
 | 
   268 	#print "mean_zero_snp_count_per_chromosome: " + str(mean_zero_snp_count_per_chromosome)
 | 
| 
 | 
   269 	zero_snp_count_per_xbase = get_zero_snp_count_per_xbase(vcf_info = vcf_info, xbase = xbase)
 | 
| 
 | 
   270 	#print "zero_snp_count_per_xbase: " + str(zero_snp_count_per_xbase)
 | 
| 
 | 
   271 
 | 
| 
 | 
   272 	for location in ref_snp_count_per_xbase:
 | 
| 
 | 
   273 		chromosome = location.split(':')[0]
 | 
| 
 | 
   274 		mean_zero_snp_count = mean_zero_snp_count_per_chromosome[chromosome]
 | 
| 
 | 
   275 		ref_snp_count = ref_snp_count_per_xbase[location]
 | 
| 
 | 
   276 
 | 
| 
 | 
   277 		zero_snp_count = 0
 | 
| 
 | 
   278 		if location in zero_snp_count_per_xbase:
 | 
| 
 | 
   279 			zero_snp_count = zero_snp_count_per_xbase[location]	
 | 
| 
 | 
   280 
 | 
| 
 | 
   281 		if normalize_bins == 'true':							
 | 
| 
 | 
   282 			if zero_snp_count == 0 or ref_snp_count == 0:
 | 
| 
 | 
   283 				normalized_histogram_bins_per_xbase[location] = 0
 | 
| 
 | 
   284 			elif zero_snp_count == ref_snp_count:
 | 
| 
 | 
   285 				normalized_histogram_bins_per_xbase[location] = 0					
 | 
| 
 | 
   286 			else:				
 | 
| 
 | 
   287 				normalized_histogram_bins_per_xbase[location] = (Decimal(zero_snp_count) / (Decimal(ref_snp_count)-Decimal(zero_snp_count))) * Decimal(mean_zero_snp_count)					
 | 
| 
 | 
   288 		else:
 | 
| 
 | 
   289 			normalized_histogram_bins_per_xbase[location] = zero_snp_count
 | 
| 
 | 
   290 
 | 
| 
 | 
   291 	return normalized_histogram_bins_per_xbase
 | 
| 
 | 
   292 
 | 
| 
 | 
   293 
 | 
| 
 | 
   294 def get_ref_snp_count_per_xbase(vcf_info = None, xbase = 1000000):
 | 
| 
 | 
   295 	ref_snps_per_xbase = {}
 | 
| 
 | 
   296 
 | 
| 
 | 
   297 	for location in vcf_info:
 | 
| 
 | 
   298 		location_info = location.split(':')
 | 
| 
 | 
   299 
 | 
| 
 | 
   300 		chromosome = location_info[0].upper()
 | 
| 
 | 
   301 		chromosome = re.sub("chr", "", chromosome, flags = re.IGNORECASE)
 | 
| 
 | 
   302 		chromosome = re.sub("CHROMOSOME_", "", chromosome, flags = re.IGNORECASE)
 | 
| 
 | 
   303 
 | 
| 
 | 
   304 		position = location_info[1]
 | 
| 
 | 
   305 		xbase_position = (int(position) / xbase) + 1
 | 
| 
 | 
   306 
 | 
| 
 | 
   307 		location = chromosome + ":" + str(xbase_position)
 | 
| 
 | 
   308 		if location in ref_snps_per_xbase:
 | 
| 
 | 
   309 			ref_snps_per_xbase[location] = ref_snps_per_xbase[location] + 1
 | 
| 
 | 
   310 		else:
 | 
| 
 | 
   311 			ref_snps_per_xbase[location] = 1
 | 
| 
 | 
   312 
 | 
| 
 | 
   313 	return ref_snps_per_xbase
 | 
| 
 | 
   314 
 | 
| 
 | 
   315 
 | 
| 
 | 
   316 
 | 
| 
 | 
   317 def get_mean_zero_snp_count_per_chromosome(vcf_info, xbase = 1000000):
 | 
| 
 | 
   318 	sample_snp_count_per_xbase = {}
 | 
| 
 | 
   319 	
 | 
| 
 | 
   320 	for location in vcf_info:
 | 
| 
 | 
   321 		alt_allele_count, ref_allele_count, read_depth, ratio = vcf_info[location]
 | 
| 
 | 
   322 			
 | 
| 
 | 
   323 		location_info = location.split(':')
 | 
| 
 | 
   324 		chromosome = location_info[0]
 | 
| 
 | 
   325 		position = location_info[1]
 | 
| 
 | 
   326 		xbase_position = (int(position) / xbase) + 1
 | 
| 
 | 
   327 		xbase_location = chromosome + ":" + str(xbase_position)
 | 
| 
 | 
   328 		
 | 
| 
 | 
   329 		if int(alt_allele_count) == 0:
 | 
| 
 | 
   330 			if xbase_location in sample_snp_count_per_xbase:
 | 
| 
 | 
   331 				sample_snp_count_per_xbase[xbase_location] = sample_snp_count_per_xbase[xbase_location] + 1
 | 
| 
 | 
   332 			else:
 | 
| 
 | 
   333 				sample_snp_count_per_xbase[xbase_location] = 1
 | 
| 
 | 
   334 
 | 
| 
 | 
   335 		elif int(alt_allele_count) != 0 and xbase_location not in sample_snp_count_per_xbase:		
 | 
| 
 | 
   336 			sample_snp_count_per_xbase[xbase_location] = 0
 | 
| 
 | 
   337 
 | 
| 
 | 
   338 	mean_zero_snp_count_per_chromosome = {}
 | 
| 
 | 
   339 	for location in sample_snp_count_per_xbase:
 | 
| 
 | 
   340 		chromosome = location.split(':')[0]
 | 
| 
 | 
   341 		sample_count = sample_snp_count_per_xbase[location]
 | 
| 
 | 
   342 		if chromosome in mean_zero_snp_count_per_chromosome:
 | 
| 
 | 
   343 			mean_zero_snp_count_per_chromosome[chromosome].append(sample_count)
 | 
| 
 | 
   344 		else:
 | 
| 
 | 
   345 			mean_zero_snp_count_per_chromosome[chromosome] = [sample_count]
 | 
| 
 | 
   346 
 | 
| 
 | 
   347 	for chromosome in mean_zero_snp_count_per_chromosome:
 | 
| 
 | 
   348 		summa = sum(mean_zero_snp_count_per_chromosome[chromosome])
 | 
| 
 | 
   349 		count = len(mean_zero_snp_count_per_chromosome[chromosome])
 | 
| 
 | 
   350 
 | 
| 
 | 
   351 		mean_zero_snp_count_per_chromosome[chromosome] = Decimal(summa) / Decimal(count)
 | 
| 
 | 
   352 
 | 
| 
 | 
   353 	return mean_zero_snp_count_per_chromosome
 | 
| 
 | 
   354 
 | 
| 
 | 
   355 
 | 
| 
 | 
   356 def get_zero_snp_count_per_xbase(vcf_info = None, xbase = 1000000):
 | 
| 
 | 
   357 	zero_snp_count_per_xbase = {}
 | 
| 
 | 
   358 
 | 
| 
 | 
   359 	for location in vcf_info:
 | 
| 
 | 
   360 		alt_allele_count, ref_allele_count, read_depth, ratio = vcf_info[location]
 | 
| 
 | 
   361 			
 | 
| 
 | 
   362 		location_info = location.split(':')
 | 
| 
 | 
   363 		chromosome = location_info[0]
 | 
| 
 | 
   364 		position = location_info[1]
 | 
| 
 | 
   365 		xbase_position = (int(position) / xbase) + 1
 | 
| 
 | 
   366 		xbase_location = chromosome + ":" + str(xbase_position)
 | 
| 
 | 
   367 		
 | 
| 
 | 
   368 		if int(alt_allele_count) == 0:
 | 
| 
 | 
   369 			if xbase_location in zero_snp_count_per_xbase:
 | 
| 
 | 
   370 				zero_snp_count_per_xbase[xbase_location] = zero_snp_count_per_xbase[xbase_location] + 1
 | 
| 
 | 
   371 			else:
 | 
| 
 | 
   372 				zero_snp_count_per_xbase[xbase_location] = 1
 | 
| 
 | 
   373 
 | 
| 
 | 
   374 		elif int(alt_allele_count) != 0 and xbase_location not in zero_snp_count_per_xbase:
 | 
| 
 | 
   375 			zero_snp_count_per_xbase[xbase_location] = 0
 | 
| 
 | 
   376 
 | 
| 
 | 
   377 	return zero_snp_count_per_xbase
 | 
| 
 | 
   378 
 | 
| 
 | 
   379 
 | 
| 
 | 
   380 def parse_vcf(sample_vcf = None):
 | 
| 
 | 
   381 	i_file = open(sample_vcf, 'rU')
 | 
| 
 | 
   382 	reader = csv.reader(i_file, delimiter = '\t', quoting = csv.QUOTE_NONE)	
 | 
| 
 | 
   383 
 | 
| 
 | 
   384 	skip_headers(reader = reader, i_file = i_file)
 | 
| 
 | 
   385 	vcf_info = {}
 | 
| 
 | 
   386 
 | 
| 
 | 
   387 	for row in reader:
 | 
| 
 | 
   388 		chromosome = row[0].upper()
 | 
| 
 | 
   389 		chromosome = re.sub("chr", "", chromosome, flags = re.IGNORECASE)
 | 
| 
 | 
   390 		chromosome = re.sub("CHROMOSOME_", "", chromosome, flags = re.IGNORECASE)
 | 
| 
 | 
   391 
 | 
| 
 | 
   392 		if chromosome != 'MTDNA':
 | 
| 
 | 
   393 			position = row[1]
 | 
| 
 | 
   394 			#ref_allele = row[2]
 | 
| 
 | 
   395 			#read_depth = row[3]
 | 
| 
 | 
   396 			#read_bases = row[4]
 | 
| 
 | 
   397 
 | 
| 
 | 
   398 			vcf_format_info = row[8].split(":")
 | 
| 
 | 
   399 			vcf_allele_freq_data = row[9] 
 | 
| 
 | 
   400 			
 | 
| 
 | 
   401 			read_depth_data_index = vcf_format_info.index("DP")
 | 
| 
 | 
   402 			read_depth = vcf_allele_freq_data.split(":")[read_depth_data_index]
 | 
| 
 | 
   403 
 | 
| 
 | 
   404 			ref_and_alt_counts_data_index = vcf_format_info.index("AD")
 | 
| 
 | 
   405 			ref_and_alt_counts = vcf_allele_freq_data.split(":")[ref_and_alt_counts_data_index]	
 | 
| 
 | 
   406 			ref_allele_count = ref_and_alt_counts.split(",")[0]
 | 
| 
 | 
   407 			alt_allele_count = ref_and_alt_counts.split(",")[1]
 | 
| 
 | 
   408 
 | 
| 
 | 
   409 			location = chromosome + ":" + position
 | 
| 
 | 
   410 	
 | 
| 
 | 
   411 			if (Decimal(read_depth)!=0):	
 | 
| 
 | 
   412 				getcontext().prec = 6	
 | 
| 
 | 
   413 				ratio = Decimal(alt_allele_count) / Decimal(read_depth)
 | 
| 
 | 
   414 					
 | 
| 
 | 
   415 				vcf_info[location] = (alt_allele_count, ref_allele_count, read_depth, ratio)
 | 
| 
 | 
   416 			
 | 
| 
 | 
   417 				#debug line
 | 
| 
 | 
   418 				#print chromosome, position, read_depth, ref_allele_count, alt_allele_count, ratio, id
 | 
| 
 | 
   419 
 | 
| 
 | 
   420 	i_file.close()
 | 
| 
 | 
   421 
 | 
| 
 | 
   422 	return vcf_info
 | 
| 
 | 
   423 
 | 
| 
 | 
   424 def parse_read_bases(read_bases = None, alt_allele = None):
 | 
| 
 | 
   425 	read_bases = re.sub('\$', '', read_bases)
 | 
| 
 | 
   426 	read_bases = re.sub('\^[^\s]', '', read_bases)
 | 
| 
 | 
   427 
 | 
| 
 | 
   428 	ref_allele_matches = re.findall("\.|\,", read_bases)
 | 
| 
 | 
   429 	ref_allele_count = len(ref_allele_matches)
 | 
| 
 | 
   430 
 | 
| 
 | 
   431 	alt_allele_matches = re.findall(alt_allele, read_bases, flags = re.IGNORECASE)
 | 
| 
 | 
   432 	alt_allele_count = len(alt_allele_matches)
 | 
| 
 | 
   433 
 | 
| 
 | 
   434 	#debug line
 | 
| 
 | 
   435 	#print read_bases, alt_allele, alt_allele_count, ref_allele_count
 | 
| 
 | 
   436 
 | 
| 
 | 
   437 	return ref_allele_count, alt_allele_count
 | 
| 
 | 
   438 
 | 
| 
 | 
   439 if __name__ == "__main__":
 | 
| 
 | 
   440 	main()
 |