Galaxy |

Changeset 15:6dc4cc8da6b4 (2012-06-25)

Previous changeset 14:837e392903de (2012-06-25) Next changeset 16:a1f826e440d4 (2012-06-25)

Commit message:
Uploaded

added:
SNP_Mapping.py

diff -r 837e392903de -r 6dc4cc8da6b4 SNP_Mapping.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/SNP_Mapping.py Mon Jun 25 16:00:53 2012 -0400

[

b'@@ -0,0 +1,497 @@\n+#!/usr/bin/python\n+\n+import re\n+import sys\n+import optparse\n+import csv\n+import re\n+import pprint\n+from decimal import *\n+from rpy import *\n+\n+def main():\n+\tcsv.field_size_limit(1000000000)\n+\n+\tparser = optparse.OptionParser()\n+\tparser.add_option(\'-p\', \'--sample_pileup\', dest = \'sample_pileup\', action = \'store\', type = \'string\', default = None, help = "Sample pileup from mpileup")\n+\tparser.add_option(\'-v\', \'--haw_vcf\', dest = \'haw_vcf\', action = \'store\', type = \'string\', default = None, help = "vcf file of Hawaiian SNPs")\n+\tparser.add_option(\'-l\', \'--loess_span\', dest = \'loess_span\', action = \'store\', type = \'float\', default = .01, help = "Loess span")\n+\tparser.add_option(\'-d\', \'--d_yaxis\', dest = \'d_yaxis\', action = \'store\', type = \'float\', default = .7, help = "y-axis upper limit for dot plot") \n+\tparser.add_option(\'-y\', \'--h_yaxis\', dest = \'h_yaxis\', action = \'store\', type = \'int\', default = 5, help = "y-axis upper limit for histogram plot") \n+\tparser.add_option(\'-c\', \'--points_color\', dest = \'points_color\', action = \'store\', type = \'string\', default = "gray27", help = "Color for data points") \n+\tparser.add_option(\'-k\', \'--loess_color\', dest = \'loess_color\', action = \'store\', type = \'string\', default = "red", help = "Color for loess regression line") \n+\tparser.add_option(\'-z\', \'--standardize\', dest = \'standardize\', default= \'true\', help = "Standardize X-axis")\n+\tparser.add_option(\'-b\', \'--break_file\', dest = \'break_file\', action = \'store\', type = \'string\', default = \'C.elegans\', help = "File defining the breaks per chromosome")\n+\tparser.add_option(\'-x\', \'--bin_size\', dest = \'bin_size\', action = \'store\', type = \'int\', default = 1000000, help = "Size of histogram bins, default is 1mb")\n+\t#parser.add_option(\'-n\', \'--normalize_bins\', dest = \'normalize_bins\', action = \'store_true\', help = "Normalize histograms")\n+\tparser.add_option(\'-n\', \'--normalize_bins\', dest = \'normalize_bins\', default= \'true\', help = "Normalize histograms")\n+\n+\n+\tparser.add_option(\'-o\', \'--output\', dest = \'output\', action = \'store\', type = \'string\', default = None, help = "Output file name")\n+\tparser.add_option(\'-s\', \'--location_plot_output\', dest = \'location_plot_output\', action = \'store\', type = \'string\', default = "SNP_Mapping_Plot.pdf", help = "Output file name of SNP plots by chromosomal location")\n+\n+\t#For plotting with map units on the X-axis instead of physical distance\n+\t#parser.add_option(\'-u\', \'--mpu_plot_output\', dest = \'mpu_plot_output\', action = \'store\', type = \'string\', default = None, help = "Output file name of SNP plots by map unit location")\n+\t(options, args) = parser.parse_args()\n+\n+\thaw_snps = build_haw_snp_dictionary(haw_vcf = options.haw_vcf)\t\n+\tpileup_info = parse_pileup(sample_pileup = options.sample_pileup, haw_snps = haw_snps)\n+\n+\toutput_pileup_info(output = options.output, pileup_info = pileup_info)\n+\t\n+\t#output plot with all ratios\n+\trounded_bin_size = int(round((float(options.bin_size) / 1000000), 1) * 1000000)\n+\t\n+\tnormalized_histogram_bins_per_mb = calculate_normalized_histogram_bins_per_xbase(pileup_info = pileup_info, xbase = rounded_bin_size, normalize_bins = options.normalize_bins)\n+\tnormalized_histogram_bins_per_5kb = calculate_normalized_histogram_bins_per_xbase(pileup_info = pileup_info, xbase = (rounded_bin_size / 2), normalize_bins = options.normalize_bins)\n+\n+\tbreak_dict = parse_breaks(break_file = options.break_file)\n+\n+\toutput_scatter_plots_by_location(location_plot_output = options.location_plot_output, pileup_info = pileup_info, loess_span=options.loess_span, d_yaxis=options.d_yaxis, h_yaxis=options.h_yaxis, points_color=options.points_color, loess_color=options.loess_color, standardize =options.standardize, normalized_hist_per_mb = normalized_histogram_bins_per_mb, normalized_hist_per_5kb = normalized_histogram_bins_per_5kb, breaks = break_dict, rounded_bin_size = rounded_bin_size)\n+\n+\t#For plotting with map units on the X-axis instead of physical distance)\n+\t#output_scatter_plots_'..b'ele_count, read_depth, ratio, haw_snp_id, mapping_unit = pileup_info[location]\n+\t\t\t\n+\t\tlocation_info = location.split(\':\')\n+\t\tchromosome = location_info[0]\n+\t\tposition = location_info[1]\n+\t\txbase_position = (int(position) / xbase) + 1\n+\t\txbase_location = chromosome + ":" + str(xbase_position)\n+\t\t\n+\t\tif alt_allele_count == 0:\n+\t\t\tif xbase_location in sample_snp_count_per_xbase:\n+\t\t\t\tsample_snp_count_per_xbase[xbase_location] = sample_snp_count_per_xbase[xbase_location] + 1\n+\t\t\telse:\n+\t\t\t\tsample_snp_count_per_xbase[xbase_location] = 1\n+\t\t\n+\t\telif alt_allele_count != 0 and xbase_location not in sample_snp_count_per_xbase:\n+\t\t\tsample_snp_count_per_xbase[xbase_location] = 0\n+\n+\tmean_zero_snp_count_per_chromosome = {}\n+\tfor location in sample_snp_count_per_xbase:\n+\t\tchromosome = location.split(\':\')[0]\n+\t\tsample_count = sample_snp_count_per_xbase[location]\n+\t\tif chromosome in mean_zero_snp_count_per_chromosome:\n+\t\t\tmean_zero_snp_count_per_chromosome[chromosome].append(sample_count)\n+\t\telse:\n+\t\t\tmean_zero_snp_count_per_chromosome[chromosome] = [sample_count]\n+\n+\tfor chromosome in mean_zero_snp_count_per_chromosome:\n+\t\tsumma = sum(mean_zero_snp_count_per_chromosome[chromosome])\n+\t\tcount = len(mean_zero_snp_count_per_chromosome[chromosome])\n+\t\t\n+\t\tmean_zero_snp_count_per_chromosome[chromosome] = Decimal(summa) / Decimal(count)\n+\n+\treturn mean_zero_snp_count_per_chromosome\n+\n+def get_zero_snp_count_per_xbase(pileup_info = None, xbase = 1000000):\n+\tzero_snp_count_per_xbase = {}\n+\n+\tfor location in pileup_info:\n+\t\talt_allele_count, ref_allele_count, read_depth, ratio, haw_snp_id, mapping_unit = pileup_info[location]\n+\t\t\t\n+\t\tlocation_info = location.split(\':\')\n+\t\tchromosome = location_info[0]\n+\t\tposition = location_info[1]\n+\t\txbase_position = (int(position) / xbase) + 1\n+\t\txbase_location = chromosome + ":" + str(xbase_position)\n+\t\t\n+\t\tif alt_allele_count == 0:\n+\t\t\tif xbase_location in zero_snp_count_per_xbase:\n+\t\t\t\tzero_snp_count_per_xbase[xbase_location] = zero_snp_count_per_xbase[xbase_location] + 1\n+\t\t\telse:\n+\t\t\t\tzero_snp_count_per_xbase[xbase_location] = 1\n+\t\t\n+\t\telif alt_allele_count != 0 and xbase_location not in zero_snp_count_per_xbase:\n+\t\t\tzero_snp_count_per_xbase[xbase_location] = 0\n+\n+\treturn zero_snp_count_per_xbase\n+\n+def parse_pileup(sample_pileup = None, haw_snps = None):\n+\ti_file = open(sample_pileup, \'rU\')\n+\treader = csv.reader(i_file, delimiter = \'\\t\', quoting = csv.QUOTE_NONE)\t\n+\n+\tpileup_info = {}\n+\n+\tfor row in reader:\n+\t\tchromosome = row[0].upper()\n+\t\tchromosome = re.sub("chr", "", chromosome, flags = re.IGNORECASE)\n+\t\tchromosome = re.sub("CHROMOSOME_", "", chromosome, flags = re.IGNORECASE)\n+\n+\t\tposition = row[1]\n+\t\tref_allele = row[2]\n+\t\tread_depth = row[3]\n+\t\tread_bases = row[4]\n+\n+\t\tlocation = chromosome + ":" + position\n+\t\tif location in haw_snps:\n+\t\t\talt_allele, haw_snp_id, mapping_unit = haw_snps[location]\n+\t\t\tref_allele_count, alt_allele_count = parse_read_bases(read_bases = read_bases, alt_allele = alt_allele)\n+\t\n+\t\t\tif Decimal(read_depth!=0):\t\t\n+\t\t\t\tgetcontext().prec = 6\t\n+\t\t\t\tratio = Decimal(alt_allele_count) / Decimal(read_depth)\n+\t\t\t\t#ratio = Decimal(alt_allele_count) / Decimal(ref_allele_count)\t\n+\t\n+\t\t\t\tpileup_info[location] = (alt_allele_count, ref_allele_count, read_depth, ratio, haw_snp_id, mapping_unit)\n+\t\t\n+\t\t\t\t#debug line\n+\t\t\t\t#print chromosome, position, read_depth, ref_allele_count, alt_allele_count, ratio, id\n+\n+\ti_file.close()\n+\n+\treturn pileup_info\n+\n+def parse_read_bases(read_bases = None, alt_allele = None):\n+\tread_bases = re.sub(\'\\$\', \'\', read_bases)\n+\tread_bases = re.sub(\'\\^[^\\s]\', \'\', read_bases)\n+\n+\tref_allele_matches = re.findall("\\.|\\,", read_bases)\n+\tref_allele_count = len(ref_allele_matches)\n+\n+\talt_allele_matches = re.findall(alt_allele, read_bases, flags = re.IGNORECASE)\n+\talt_allele_count = len(alt_allele_matches)\n+\n+\t#debug line\n+\t#print read_bases, alt_allele, alt_allele_count, ref_allele_count\n+\n+\treturn ref_allele_count, alt_allele_count\n+\n+if __name__ == "__main__":\n+\tmain()\n'