Repository 'snp_mapping_using_wgs'
hg clone https://toolshed.g2.bx.psu.edu/repos/gregory-minevich/snp_mapping_using_wgs

Changeset 10:7d6bccd5f88c (2012-06-14)
Previous changeset 9:d4aa63bc2ef6 (2012-03-27) Next changeset 11:cb4c388fb155 (2012-06-14)
Commit message:
Uploaded
modified:
SNP_Mapping.py
b
diff -r d4aa63bc2ef6 -r 7d6bccd5f88c SNP_Mapping.py
--- a/SNP_Mapping.py Tue Mar 27 11:33:24 2012 -0400
+++ b/SNP_Mapping.py Thu Jun 14 20:34:20 2012 -0400
[
b'@@ -5,6 +5,7 @@\n import optparse\n import csv\n import re\n+import pprint\n from decimal import *\n from rpy import *\n \n@@ -16,10 +17,13 @@\n \tparser.add_option(\'-v\', \'--haw_vcf\', dest = \'haw_vcf\', action = \'store\', type = \'string\', default = None, help = "vcf file of Hawaiian SNPs")\n \tparser.add_option(\'-l\', \'--loess_span\', dest = \'loess_span\', action = \'store\', type = \'float\', default = .01, help = "Loess span")\n \tparser.add_option(\'-d\', \'--d_yaxis\', dest = \'d_yaxis\', action = \'store\', type = \'float\', default = .7, help = "y-axis upper limit for dot plot")  \n-\tparser.add_option(\'-y\', \'--h_yaxis\', dest = \'h_yaxis\', action = \'store\', type = \'int\', default = 500, help = "y-axis upper limit for dot plot")   \n+\tparser.add_option(\'-y\', \'--h_yaxis\', dest = \'h_yaxis\', action = \'store\', type = \'int\', default = 5, help = "y-axis upper limit for histogram plot")   \n \tparser.add_option(\'-c\', \'--points_color\', dest = \'points_color\', action = \'store\', type = \'string\', default = "gray27", help = "Color for data points") \n \tparser.add_option(\'-k\', \'--loess_color\', dest = \'loess_color\', action = \'store\', type = \'string\', default = "red", help = "Color for loess regression line")        \n-\tparser.add_option(\'-z\', \'--standardize\', dest = \'standardize\', default= \'false\', help = "Standardize X-axis")\n+\tparser.add_option(\'-z\', \'--standardize\', dest = \'standardize\', default= \'true\', help = "Standardize X-axis")\n+\tparser.add_option(\'-b\', \'--break_file\', dest = \'break_file\', action = \'store\', type = \'string\', default = \'C.elegans\', help = "File defining the breaks per chromosome")\n+\tparser.add_option(\'-x\', \'--bin_size\', dest = \'bin_size\', action = \'store\', type = \'int\', default = 1000000, help = "Size of histogram bins, default is 1mb")\n+\tparser.add_option(\'-n\', \'--do_not_normalize_bin\', dest = \'do_not_normalize_bin\', action = \'store_true\', help = "Do not Normalize histograms")\n \n \tparser.add_option(\'-o\', \'--output\', dest = \'output\', action = \'store\', type = \'string\', default = None, help = "Output file name")\n \tparser.add_option(\'-s\', \'--location_plot_output\', dest = \'location_plot_output\', action = \'store\', type = \'string\', default = "SNP_Mapping_Plot.pdf", help = "Output file name of SNP plots by chromosomal location")\n@@ -30,10 +34,18 @@\n \n \thaw_snps = build_haw_snp_dictionary(haw_vcf = options.haw_vcf)\t\n \tpileup_info = parse_pileup(sample_pileup = options.sample_pileup, haw_snps = haw_snps)\n-\toutput_pileup_info(output = options.output, pileup_info = pileup_info)\n \n+\toutput_pileup_info(output = options.output, pileup_info = pileup_info)\n+\t\n \t#output plot with all ratios\n-\toutput_scatter_plots_by_location(location_plot_output = options.location_plot_output, pileup_info = pileup_info, loess_span=options.loess_span, d_yaxis=options.d_yaxis, h_yaxis=options.h_yaxis, points_color=options.points_color, loess_color=options.loess_color, standardize =options.standardize)\n+\trounded_bin_size = int(round((float(options.bin_size) / 1000000), 1) * 1000000)\n+\t\n+\tnormalized_histogram_bins_per_mb = calculate_normalized_histogram_bins_per_xbase(pileup_info = pileup_info, xbase = rounded_bin_size, do_not_normalize = options.do_not_normalize_bin)\n+\tnormalized_histogram_bins_per_5kb = calculate_normalized_histogram_bins_per_xbase(pileup_info = pileup_info, xbase = (rounded_bin_size / 2), do_not_normalize = options.do_not_normalize_bin)\n+\t\n+\tbreak_dict = parse_breaks(break_file = options.break_file)\n+\n+\toutput_scatter_plots_by_location(location_plot_output = options.location_plot_output, pileup_info = pileup_info, loess_span=options.loess_span, d_yaxis=options.d_yaxis, h_yaxis=options.h_yaxis, points_color=options.points_color, loess_color=options.loess_color, standardize =options.standardize, normalized_hist_per_mb = normalized_histogram_bins_per_mb, normalized_hist_per_5kb = normalized_histogram_bins_per_5kb, breaks = break_dict, rounded_bin_size = rounded_bin_size)\n \n \t#For plotting with map units on the X-axis instead of physical distance)\n \t#output_scatter_plots_by_m'..b'hromosome = location_info[0].upper()\n+\t\tchromosome = re.sub("chr", "", chromosome, flags = re.IGNORECASE)\n+\t\tchromosome = re.sub("CHROMOSOME_", "", chromosome, flags = re.IGNORECASE)\n+\n+\t\tposition = location_info[1]\n+\t\txbase_position = (int(position) / xbase) + 1\n+\n+\t\tlocation = chromosome + ":" + str(xbase_position)\n+\t\tif location in ref_snps_per_xbase:\n+\t\t\tref_snps_per_xbase[location] = ref_snps_per_xbase[location] + 1\n+\t\telse:\n+\t\t\tref_snps_per_xbase[location] = 1\n+\n+\treturn ref_snps_per_xbase\n+\n+def get_mean_zero_snp_count_per_chromosome(pileup_info, xbase = 1000000):\n+\tsample_snp_count_per_xbase = {}\n+\t\n+\tfor location in pileup_info:\n+\t\talt_allele_count, ref_allele_count, read_depth, ratio, haw_snp_id, mapping_unit = pileup_info[location]\n+\t\t\t\n+\t\tlocation_info = location.split(\':\')\n+\t\tchromosome = location_info[0]\n+\t\tposition = location_info[1]\n+\t\txbase_position = (int(position) / xbase) + 1\n+\t\txbase_location = chromosome + ":" + str(xbase_position)\n+\t\t\n+\t\tif alt_allele_count == 0:\n+\t\t\tif xbase_location in sample_snp_count_per_xbase:\n+\t\t\t\tsample_snp_count_per_xbase[xbase_location] = sample_snp_count_per_xbase[xbase_location] + 1\n+\t\t\telse:\n+\t\t\t\tsample_snp_count_per_xbase[xbase_location] = 1\n+\t\t\n+\t\telif alt_allele_count != 0 and xbase_location not in sample_snp_count_per_xbase:\n+\t\t\tsample_snp_count_per_xbase[xbase_location] = 0\n+\n+\tmean_zero_snp_count_per_chromosome = {}\n+\tfor location in sample_snp_count_per_xbase:\n+\t\tchromosome = location.split(\':\')[0]\n+\t\tsample_count = sample_snp_count_per_xbase[location]\n+\t\tif chromosome in mean_zero_snp_count_per_chromosome:\n+\t\t\tmean_zero_snp_count_per_chromosome[chromosome].append(sample_count)\n+\t\telse:\n+\t\t\tmean_zero_snp_count_per_chromosome[chromosome] = [sample_count]\n+\n+\tfor chromosome in mean_zero_snp_count_per_chromosome:\n+\t\tsumma = sum(mean_zero_snp_count_per_chromosome[chromosome])\n+\t\tcount = len(mean_zero_snp_count_per_chromosome[chromosome])\n+\t\t\n+\t\tmean_zero_snp_count_per_chromosome[chromosome] = Decimal(summa) / Decimal(count)\n+\n+\treturn mean_zero_snp_count_per_chromosome\n+\n+def get_zero_snp_count_per_xbase(pileup_info = None, xbase = 1000000):\n+\tzero_snp_count_per_xbase = {}\n+\n+\tfor location in pileup_info:\n+\t\talt_allele_count, ref_allele_count, read_depth, ratio, haw_snp_id, mapping_unit = pileup_info[location]\n+\t\t\t\n+\t\tlocation_info = location.split(\':\')\n+\t\tchromosome = location_info[0]\n+\t\tposition = location_info[1]\n+\t\txbase_position = (int(position) / xbase) + 1\n+\t\txbase_location = chromosome + ":" + str(xbase_position)\n+\t\t\n+\t\tif alt_allele_count == 0:\n+\t\t\tif xbase_location in zero_snp_count_per_xbase:\n+\t\t\t\tzero_snp_count_per_xbase[xbase_location] = zero_snp_count_per_xbase[xbase_location] + 1\n+\t\t\telse:\n+\t\t\t\tzero_snp_count_per_xbase[xbase_location] = 1\n+\t\t\n+\t\telif alt_allele_count != 0 and xbase_location not in zero_snp_count_per_xbase:\n+\t\t\tzero_snp_count_per_xbase[xbase_location] = 0\n+\n+\treturn zero_snp_count_per_xbase\n \n def parse_pileup(sample_pileup = None, haw_snps = None):\n \ti_file = open(sample_pileup, \'rU\')\n@@ -309,14 +461,16 @@\n \t\tif location in haw_snps:\n \t\t\talt_allele, haw_snp_id, mapping_unit = haw_snps[location]\n \t\t\tref_allele_count, alt_allele_count = parse_read_bases(read_bases = read_bases, alt_allele = alt_allele)\n-\t\t\n-\t\t\tgetcontext().prec = 6\t\n-\t\t\tratio = Decimal(alt_allele_count) / Decimal(read_depth)\n+\t\n+\t\t\tif Decimal(read_depth!=0):\t\t\n+\t\t\t\tgetcontext().prec = 6\t\n+\t\t\t\tratio = Decimal(alt_allele_count) / Decimal(read_depth)\n+\t\t\t\t#ratio = Decimal(alt_allele_count) / Decimal(ref_allele_count)\t\n+\t\n+\t\t\t\tpileup_info[location] = (alt_allele_count, ref_allele_count, read_depth, ratio, haw_snp_id, mapping_unit)\n \t\t\n-\t\t\tpileup_info[location] = (alt_allele_count, ref_allele_count, read_depth, ratio, haw_snp_id, mapping_unit)\n-\t\t\n-\t\t\t#debug line\n-\t\t\t#print chromosome, position, read_depth, ref_allele_count, alt_allele_count, ratio, id\n+\t\t\t\t#debug line\n+\t\t\t\t#print chromosome, position, read_depth, ref_allele_count, alt_allele_count, ratio, id\n \n \ti_file.close()\n \n'