Repository 'snp_mapping_using_wgs'
hg clone https://toolshed.g2.bx.psu.edu/repos/gregory-minevich/snp_mapping_using_wgs

Changeset 37:6841f9837250 (2014-09-19)
Previous changeset 36:ca390f9c6b29 (2014-05-31) Next changeset 38:7fb9d1e732a0 (2014-09-19)
Commit message:
Deleted selected files
removed:
SNP_Mapping.py
b
diff -r ca390f9c6b29 -r 6841f9837250 SNP_Mapping.py
--- a/SNP_Mapping.py Sat May 31 10:21:44 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
b'@@ -1,458 +0,0 @@\n-#!/usr/bin/python\n-\n-import re\n-import sys\n-import optparse\n-import csv\n-import re\n-import pprint\n-from decimal import *\n-from rpy import *\n-\n-def main():\n-\tcsv.field_size_limit(1000000000)\n-\n-\tparser = optparse.OptionParser()\n-\tparser.add_option(\'-v\', \'--sample_vcf\', dest = \'sample_vcf\', action = \'store\', type = \'string\', default = None, help = "Sample VCF from GATK Unified Genotyper")\n-\tparser.add_option(\'-l\', \'--loess_span\', dest = \'loess_span\', action = \'store\', type = \'float\', default = .1, help = "Loess span")\n-\tparser.add_option(\'-d\', \'--d_yaxis\', dest = \'d_yaxis\', action = \'store\', type = \'float\', default = 1, help = "y-axis upper limit for dot plot")  \n-\tparser.add_option(\'-y\', \'--h_yaxis\', dest = \'h_yaxis\', action = \'store\', type = \'int\', default = 0, help = "y-axis upper limit for histogram plot")   \n-\tparser.add_option(\'-c\', \'--points_color\', dest = \'points_color\', action = \'store\', type = \'string\', default = "gray27", help = "Color for data points") \n-\tparser.add_option(\'-k\', \'--loess_color\', dest = \'loess_color\', action = \'store\', type = \'string\', default = "red", help = "Color for loess regression line")        \n-\tparser.add_option(\'-z\', \'--standardize\', dest = \'standardize\', default= \'true\', help = "Standardize X-axis")\n-\tparser.add_option(\'-b\', \'--break_file\', dest = \'break_file\', action = \'store\', type = \'string\', default = \'C.elegans\', help = "File defining the breaks per chromosome")\n-\tparser.add_option(\'-x\', \'--bin_size\', dest = \'bin_size\', action = \'store\', type = \'int\', default = 1000000, help = "Size of histogram bins, default is 1mb")\n-\tparser.add_option(\'-n\', \'--normalize_bins\', dest = \'normalize_bins\', default= \'true\', help = "Normalize histograms")\n-\n-\n-\tparser.add_option(\'-o\', \'--output\', dest = \'output\', action = \'store\', type = \'string\', default = None, help = "Output file name")\n-\tparser.add_option(\'-s\', \'--location_plot_output\', dest = \'location_plot_output\', action = \'store\', type = \'string\', default = "SNP_Mapping_Plot.pdf", help = "Output file name of SNP plots by chromosomal location")\n-\n-\t(options, args) = parser.parse_args()\n-\n-\tvcf_info = parse_vcf(sample_vcf = options.sample_vcf)\n-\n-\toutput_vcf_info(output = options.output, vcf_info = vcf_info)\n-\t\n-\trounded_bin_size = int(round((float(options.bin_size) / 1000000), 1) * 1000000)\n-\t\n-\tnormalized_histogram_bins_per_mb = calculate_normalized_histogram_bins_per_xbase(vcf_info = vcf_info, xbase = rounded_bin_size, normalize_bins = options.normalize_bins)\n-\tmax_y_hist_mb = normalized_histogram_bins_per_mb[max(normalized_histogram_bins_per_mb, key = lambda x: normalized_histogram_bins_per_mb.get(x) )]\n-\t\n-\tnormalized_histogram_bins_per_5kb = calculate_normalized_histogram_bins_per_xbase(vcf_info = vcf_info, xbase = (rounded_bin_size / 2), normalize_bins = options.normalize_bins)\n-\tmax_y_hist_5kb = normalized_histogram_bins_per_5kb[max(normalized_histogram_bins_per_5kb, key = lambda x: normalized_histogram_bins_per_5kb.get(x) )]\n-\t\n-\tmax_y_hist_overall = myround(max(max_y_hist_mb, max_y_hist_5kb) + int(round(round(max(max_y_hist_mb, max_y_hist_5kb)) * .1)))\n-\n-\tbreak_dict = parse_breaks(break_file = options.break_file)\n-\n-\toutput_scatter_plots_by_location(location_plot_output = options.location_plot_output, vcf_info = vcf_info, loess_span=options.loess_span, d_yaxis=options.d_yaxis, h_yaxis=options.h_yaxis, points_color=options.points_color, loess_color=options.loess_color, standardize =options.standardize, normalized_hist_per_mb = normalized_histogram_bins_per_mb, normalized_hist_per_5kb = normalized_histogram_bins_per_5kb, breaks = break_dict, rounded_bin_size = rounded_bin_size, max_y_hist_overall = max_y_hist_overall)\n-\n-\n-def myround(x, base=10):\n-    return int(base * round(float(x)/base))\n-\n-def skip_headers(reader = None, i_file = None):\n-\t# count headers\n-\tcomment = 0\n-\twhile reader.next()[0].startswith(\'#\'):\n-\t\tcomment = comment + 1\n-\t\n-\t# skip headers\n-\ti_file.seek(0)\n-\tfor i in range(0, comment):\n-\t\treader.next()\n'..b'e) + 1\n-\t\txbase_location = chromosome + ":" + str(xbase_position)\n-\t\t\n-\t\tif int(alt_allele_count) == 0:\n-\t\t\tif xbase_location in sample_snp_count_per_xbase:\n-\t\t\t\tsample_snp_count_per_xbase[xbase_location] = sample_snp_count_per_xbase[xbase_location] + 1\n-\t\t\telse:\n-\t\t\t\tsample_snp_count_per_xbase[xbase_location] = 1\n-\n-\t\telif int(alt_allele_count) != 0 and xbase_location not in sample_snp_count_per_xbase:\t\t\n-\t\t\tsample_snp_count_per_xbase[xbase_location] = 0\n-\n-\tmean_zero_snp_count_per_chromosome = {}\n-\tfor location in sample_snp_count_per_xbase:\n-\t\tchromosome = location.split(\':\')[0]\n-\t\tsample_count = sample_snp_count_per_xbase[location]\n-\t\tif chromosome in mean_zero_snp_count_per_chromosome:\n-\t\t\tmean_zero_snp_count_per_chromosome[chromosome].append(sample_count)\n-\t\telse:\n-\t\t\tmean_zero_snp_count_per_chromosome[chromosome] = [sample_count]\n-\n-\tfor chromosome in mean_zero_snp_count_per_chromosome:\n-\t\tsumma = sum(mean_zero_snp_count_per_chromosome[chromosome])\n-\t\tcount = len(mean_zero_snp_count_per_chromosome[chromosome])\n-\n-\t\tmean_zero_snp_count_per_chromosome[chromosome] = Decimal(summa) / Decimal(count)\n-\n-\treturn mean_zero_snp_count_per_chromosome\n-\n-\n-def get_zero_snp_count_per_xbase(vcf_info = None, xbase = 1000000):\n-\tzero_snp_count_per_xbase = {}\n-\n-\tfor location in vcf_info:\n-\t\talt_allele_count, ref_allele_count, read_depth, ratio = vcf_info[location]\n-\t\t\t\n-\t\tlocation_info = location.split(\':\')\n-\t\tchromosome = location_info[0]\n-\t\tposition = location_info[1]\n-\t\txbase_position = (int(position) / xbase) + 1\n-\t\txbase_location = chromosome + ":" + str(xbase_position)\n-\t\t\n-\t\tif int(alt_allele_count) == 0:\n-\t\t\tif xbase_location in zero_snp_count_per_xbase:\n-\t\t\t\tzero_snp_count_per_xbase[xbase_location] = zero_snp_count_per_xbase[xbase_location] + 1\n-\t\t\telse:\n-\t\t\t\tzero_snp_count_per_xbase[xbase_location] = 1\n-\n-\t\telif int(alt_allele_count) != 0 and xbase_location not in zero_snp_count_per_xbase:\n-\t\t\tzero_snp_count_per_xbase[xbase_location] = 0\n-\n-\treturn zero_snp_count_per_xbase\n-\n-\n-def parse_vcf(sample_vcf = None):\n-\ti_file = open(sample_vcf, \'rU\')\n-\treader = csv.reader(i_file, delimiter = \'\\t\', quoting = csv.QUOTE_NONE)\t\n-\n-\tskip_headers(reader = reader, i_file = i_file)\n-\tvcf_info = {}\n-\n-\tfor row in reader:\n-\t\tchromosome = row[0].upper()\n-\t\tchromosome = re.sub("CHROMOSOME_", "", chromosome, flags = re.IGNORECASE)\n-\t\tchromosome = re.sub("chr", "", chromosome, flags = re.IGNORECASE)\n-\n-\n-\t\tif chromosome != \'MTDNA\':\n-\t\t\tposition = row[1]\n-\t\t\t#ref_allele = row[2]\n-\t\t\t#read_depth = row[3]\n-\t\t\t#read_bases = row[4]\n-\n-\t\t\tvcf_format_info = row[8].split(":")\n-\t\t\tvcf_allele_freq_data = row[9] \n-\t\t\t\n-\t\t\tread_depth_data_index = vcf_format_info.index("DP")\n-\t\t\tread_depth = vcf_allele_freq_data.split(":")[read_depth_data_index]\n-\n-\t\t\tref_and_alt_counts_data_index = vcf_format_info.index("AD")\n-\t\t\tref_and_alt_counts = vcf_allele_freq_data.split(":")[ref_and_alt_counts_data_index]\t\n-\t\t\tref_allele_count = ref_and_alt_counts.split(",")[0]\n-\t\t\talt_allele_count = ref_and_alt_counts.split(",")[1]\n-\n-\t\t\tlocation = chromosome + ":" + position\n-\t\n-\t\t\tif (Decimal(read_depth)!=0):\t\n-\t\t\t\tgetcontext().prec = 6\t\n-\t\t\t\tratio = Decimal(alt_allele_count) / Decimal(read_depth)\n-\t\t\t\t\t\n-\t\t\t\tvcf_info[location] = (alt_allele_count, ref_allele_count, read_depth, ratio)\n-\t\t\t\n-\t\t\t\t#debug line\n-\t\t\t\t#print chromosome, position, read_depth, ref_allele_count, alt_allele_count, ratio, id\n-\n-\ti_file.close()\n-\n-\treturn vcf_info\n-\n-def parse_read_bases(read_bases = None, alt_allele = None):\n-\tread_bases = re.sub(\'\\$\', \'\', read_bases)\n-\tread_bases = re.sub(\'\\^[^\\s]\', \'\', read_bases)\n-\n-\tref_allele_matches = re.findall("\\.|\\,", read_bases)\n-\tref_allele_count = len(ref_allele_matches)\n-\n-\talt_allele_matches = re.findall(alt_allele, read_bases, flags = re.IGNORECASE)\n-\talt_allele_count = len(alt_allele_matches)\n-\n-\t#debug line\n-\t#print read_bases, alt_allele, alt_allele_count, ref_allele_count\n-\n-\treturn ref_allele_count, alt_allele_count\n-\n-if __name__ == "__main__":\n-\tmain()\n'