Galaxy |

Changeset 31:e4d90981c7a1 (2014-05-09)

Previous changeset 30:e71fd2c217a1 (2014-05-09) Next changeset 32:11da66cb7216 (2014-05-09)

Commit message:
Uploaded

added:
SNP_Mapping.py

diff -r e71fd2c217a1 -r e4d90981c7a1 SNP_Mapping.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/SNP_Mapping.py Fri May 09 15:41:01 2014 -0400

[

b'@@ -0,0 +1,458 @@\n+#!/usr/bin/python\n+\n+import re\n+import sys\n+import optparse\n+import csv\n+import re\n+import pprint\n+from decimal import *\n+from rpy import *\n+\n+def main():\n+\tcsv.field_size_limit(1000000000)\n+\n+\tparser = optparse.OptionParser()\n+\tparser.add_option(\'-v\', \'--sample_vcf\', dest = \'sample_vcf\', action = \'store\', type = \'string\', default = None, help = "Sample VCF from GATK Unified Genotyper")\n+\tparser.add_option(\'-l\', \'--loess_span\', dest = \'loess_span\', action = \'store\', type = \'float\', default = .1, help = "Loess span")\n+\tparser.add_option(\'-d\', \'--d_yaxis\', dest = \'d_yaxis\', action = \'store\', type = \'float\', default = 1, help = "y-axis upper limit for dot plot") \n+\tparser.add_option(\'-y\', \'--h_yaxis\', dest = \'h_yaxis\', action = \'store\', type = \'int\', default = 0, help = "y-axis upper limit for histogram plot") \n+\tparser.add_option(\'-c\', \'--points_color\', dest = \'points_color\', action = \'store\', type = \'string\', default = "gray27", help = "Color for data points") \n+\tparser.add_option(\'-k\', \'--loess_color\', dest = \'loess_color\', action = \'store\', type = \'string\', default = "red", help = "Color for loess regression line") \n+\tparser.add_option(\'-z\', \'--standardize\', dest = \'standardize\', default= \'true\', help = "Standardize X-axis")\n+\tparser.add_option(\'-b\', \'--break_file\', dest = \'break_file\', action = \'store\', type = \'string\', default = \'C.elegans\', help = "File defining the breaks per chromosome")\n+\tparser.add_option(\'-x\', \'--bin_size\', dest = \'bin_size\', action = \'store\', type = \'int\', default = 1000000, help = "Size of histogram bins, default is 1mb")\n+\tparser.add_option(\'-n\', \'--normalize_bins\', dest = \'normalize_bins\', default= \'true\', help = "Normalize histograms")\n+\n+\n+\tparser.add_option(\'-o\', \'--output\', dest = \'output\', action = \'store\', type = \'string\', default = None, help = "Output file name")\n+\tparser.add_option(\'-s\', \'--location_plot_output\', dest = \'location_plot_output\', action = \'store\', type = \'string\', default = "SNP_Mapping_Plot.pdf", help = "Output file name of SNP plots by chromosomal location")\n+\n+\t(options, args) = parser.parse_args()\n+\n+\tvcf_info = parse_vcf(sample_vcf = options.sample_vcf)\n+\n+\toutput_vcf_info(output = options.output, vcf_info = vcf_info)\n+\t\n+\trounded_bin_size = int(round((float(options.bin_size) / 1000000), 1) * 1000000)\n+\t\n+\tnormalized_histogram_bins_per_mb = calculate_normalized_histogram_bins_per_xbase(vcf_info = vcf_info, xbase = rounded_bin_size, normalize_bins = options.normalize_bins)\n+\tmax_y_hist_mb = normalized_histogram_bins_per_mb[max(normalized_histogram_bins_per_mb, key = lambda x: normalized_histogram_bins_per_mb.get(x) )]\n+\t\n+\tnormalized_histogram_bins_per_5kb = calculate_normalized_histogram_bins_per_xbase(vcf_info = vcf_info, xbase = (rounded_bin_size / 2), normalize_bins = options.normalize_bins)\n+\tmax_y_hist_5kb = normalized_histogram_bins_per_5kb[max(normalized_histogram_bins_per_5kb, key = lambda x: normalized_histogram_bins_per_5kb.get(x) )]\n+\t\n+\tmax_y_hist_overall = myround(max(max_y_hist_mb, max_y_hist_5kb) + int(round(round(max(max_y_hist_mb, max_y_hist_5kb)) * .1)))\n+\n+\tbreak_dict = parse_breaks(break_file = options.break_file)\n+\n+\toutput_scatter_plots_by_location(location_plot_output = options.location_plot_output, vcf_info = vcf_info, loess_span=options.loess_span, d_yaxis=options.d_yaxis, h_yaxis=options.h_yaxis, points_color=options.points_color, loess_color=options.loess_color, standardize =options.standardize, normalized_hist_per_mb = normalized_histogram_bins_per_mb, normalized_hist_per_5kb = normalized_histogram_bins_per_5kb, breaks = break_dict, rounded_bin_size = rounded_bin_size, max_y_hist_overall = max_y_hist_overall)\n+\n+\n+def myround(x, base=5):\n+ return int(base * round(float(x)/base))\n+\n+def skip_headers(reader = None, i_file = None):\n+\t# count headers\n+\tcomment = 0\n+\twhile reader.next()[0].startswith(\'#\'):\n+\t\tcomment = comment + 1\n+\t\n+\t# skip headers\n+\ti_file.seek(0)\n+\tfor i in range(0, comment):\n+\t\treader.next()\n+'..b'e) + 1\n+\t\txbase_location = chromosome + ":" + str(xbase_position)\n+\t\t\n+\t\tif int(alt_allele_count) == 0:\n+\t\t\tif xbase_location in sample_snp_count_per_xbase:\n+\t\t\t\tsample_snp_count_per_xbase[xbase_location] = sample_snp_count_per_xbase[xbase_location] + 1\n+\t\t\telse:\n+\t\t\t\tsample_snp_count_per_xbase[xbase_location] = 1\n+\n+\t\telif int(alt_allele_count) != 0 and xbase_location not in sample_snp_count_per_xbase:\t\t\n+\t\t\tsample_snp_count_per_xbase[xbase_location] = 0\n+\n+\tmean_zero_snp_count_per_chromosome = {}\n+\tfor location in sample_snp_count_per_xbase:\n+\t\tchromosome = location.split(\':\')[0]\n+\t\tsample_count = sample_snp_count_per_xbase[location]\n+\t\tif chromosome in mean_zero_snp_count_per_chromosome:\n+\t\t\tmean_zero_snp_count_per_chromosome[chromosome].append(sample_count)\n+\t\telse:\n+\t\t\tmean_zero_snp_count_per_chromosome[chromosome] = [sample_count]\n+\n+\tfor chromosome in mean_zero_snp_count_per_chromosome:\n+\t\tsumma = sum(mean_zero_snp_count_per_chromosome[chromosome])\n+\t\tcount = len(mean_zero_snp_count_per_chromosome[chromosome])\n+\n+\t\tmean_zero_snp_count_per_chromosome[chromosome] = Decimal(summa) / Decimal(count)\n+\n+\treturn mean_zero_snp_count_per_chromosome\n+\n+\n+def get_zero_snp_count_per_xbase(vcf_info = None, xbase = 1000000):\n+\tzero_snp_count_per_xbase = {}\n+\n+\tfor location in vcf_info:\n+\t\talt_allele_count, ref_allele_count, read_depth, ratio = vcf_info[location]\n+\t\t\t\n+\t\tlocation_info = location.split(\':\')\n+\t\tchromosome = location_info[0]\n+\t\tposition = location_info[1]\n+\t\txbase_position = (int(position) / xbase) + 1\n+\t\txbase_location = chromosome + ":" + str(xbase_position)\n+\t\t\n+\t\tif int(alt_allele_count) == 0:\n+\t\t\tif xbase_location in zero_snp_count_per_xbase:\n+\t\t\t\tzero_snp_count_per_xbase[xbase_location] = zero_snp_count_per_xbase[xbase_location] + 1\n+\t\t\telse:\n+\t\t\t\tzero_snp_count_per_xbase[xbase_location] = 1\n+\n+\t\telif int(alt_allele_count) != 0 and xbase_location not in zero_snp_count_per_xbase:\n+\t\t\tzero_snp_count_per_xbase[xbase_location] = 0\n+\n+\treturn zero_snp_count_per_xbase\n+\n+\n+def parse_vcf(sample_vcf = None):\n+\ti_file = open(sample_vcf, \'rU\')\n+\treader = csv.reader(i_file, delimiter = \'\\t\', quoting = csv.QUOTE_NONE)\t\n+\n+\tskip_headers(reader = reader, i_file = i_file)\n+\tvcf_info = {}\n+\n+\tfor row in reader:\n+\t\tchromosome = row[0].upper()\n+\t\tchromosome = re.sub("CHROMOSOME_", "", chromosome, flags = re.IGNORECASE)\n+\t\tchromosome = re.sub("chr", "", chromosome, flags = re.IGNORECASE)\n+\n+\n+\t\tif chromosome != \'MTDNA\':\n+\t\t\tposition = row[1]\n+\t\t\t#ref_allele = row[2]\n+\t\t\t#read_depth = row[3]\n+\t\t\t#read_bases = row[4]\n+\n+\t\t\tvcf_format_info = row[8].split(":")\n+\t\t\tvcf_allele_freq_data = row[9] \n+\t\t\t\n+\t\t\tread_depth_data_index = vcf_format_info.index("DP")\n+\t\t\tread_depth = vcf_allele_freq_data.split(":")[read_depth_data_index]\n+\n+\t\t\tref_and_alt_counts_data_index = vcf_format_info.index("AD")\n+\t\t\tref_and_alt_counts = vcf_allele_freq_data.split(":")[ref_and_alt_counts_data_index]\t\n+\t\t\tref_allele_count = ref_and_alt_counts.split(",")[0]\n+\t\t\talt_allele_count = ref_and_alt_counts.split(",")[1]\n+\n+\t\t\tlocation = chromosome + ":" + position\n+\t\n+\t\t\tif (Decimal(read_depth)!=0):\t\n+\t\t\t\tgetcontext().prec = 6\t\n+\t\t\t\tratio = Decimal(alt_allele_count) / Decimal(read_depth)\n+\t\t\t\t\t\n+\t\t\t\tvcf_info[location] = (alt_allele_count, ref_allele_count, read_depth, ratio)\n+\t\t\t\n+\t\t\t\t#debug line\n+\t\t\t\t#print chromosome, position, read_depth, ref_allele_count, alt_allele_count, ratio, id\n+\n+\ti_file.close()\n+\n+\treturn vcf_info\n+\n+def parse_read_bases(read_bases = None, alt_allele = None):\n+\tread_bases = re.sub(\'\\$\', \'\', read_bases)\n+\tread_bases = re.sub(\'\\^[^\\s]\', \'\', read_bases)\n+\n+\tref_allele_matches = re.findall("\\.|\\,", read_bases)\n+\tref_allele_count = len(ref_allele_matches)\n+\n+\talt_allele_matches = re.findall(alt_allele, read_bases, flags = re.IGNORECASE)\n+\talt_allele_count = len(alt_allele_matches)\n+\n+\t#debug line\n+\t#print read_bases, alt_allele, alt_allele_count, ref_allele_count\n+\n+\treturn ref_allele_count, alt_allele_count\n+\n+if __name__ == "__main__":\n+\tmain()\n'