Previous changeset 2:bff397ce7794 (2015-12-16) Next changeset 4:8bccc05371b0 (2015-12-16) |
Commit message:
Deleted selected files |
removed:
vcfs2fasta/vcfs2fasta.py |
b |
diff -r bff397ce7794 -r 13ff9dbcd580 vcfs2fasta/vcfs2fasta.py --- a/vcfs2fasta/vcfs2fasta.py Wed Dec 16 07:23:27 2015 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
b'@@ -1,415 +0,0 @@\n-#!/usr/bin/env python\n-\'\'\'\n-Merge SNP data from multiple VCF files into a single fasta file.\n-\n-Created on 5 Oct 2015\n-\n-@author: alex\n-\'\'\'\n-import argparse\n-from collections import OrderedDict\n-import glob\n-import itertools\n-import logging\n-import os\n-\n-from Bio import SeqIO\n-from bintrees import FastRBTree\n-\n-# Try importing the matplotlib and numpy for stats.\n-try:\n- from matplotlib import pyplot as plt\n- import numpy\n- can_stats = True\n-except ImportError:\n- can_stats = False\n-\n-import vcf\n-\n-from phe.variant_filters import IUPAC_CODES\n-\n-\n-def plot_stats(pos_stats, total_samples, plots_dir="plots", discarded={}):\n- if not os.path.exists(plots_dir):\n- os.makedirs(plots_dir)\n-\n- for contig in pos_stats:\n-\n- plt.style.use(\'ggplot\')\n-\n- x = numpy.array([pos for pos in pos_stats[contig] if pos not in discarded.get(contig, [])])\n- y = numpy.array([ float(pos_stats[contig][pos]["mut"]) / total_samples for pos in pos_stats[contig] if pos not in discarded.get(contig, []) ])\n-\n- f, (ax1, ax2, ax3, ax4) = plt.subplots(4, sharex=True, sharey=True)\n- f.set_size_inches(12, 15)\n- ax1.plot(x, y, \'ro\')\n- ax1.set_title("Fraction of samples with SNPs")\n- plt.ylim(0, 1.1)\n-\n- y = numpy.array([ float(pos_stats[contig][pos]["N"]) / total_samples for pos in pos_stats[contig] if pos not in discarded.get(contig, [])])\n- ax2.plot(x, y, \'bo\')\n- ax2.set_title("Fraction of samples with Ns")\n-\n- y = numpy.array([ float(pos_stats[contig][pos]["mix"]) / total_samples for pos in pos_stats[contig] if pos not in discarded.get(contig, [])])\n- ax3.plot(x, y, \'go\')\n- ax3.set_title("Fraction of samples with mixed bases")\n-\n- y = numpy.array([ float(pos_stats[contig][pos]["gap"]) / total_samples for pos in pos_stats[contig] if pos not in discarded.get(contig, [])])\n- ax4.plot(x, y, \'yo\')\n- ax4.set_title("Fraction of samples with uncallable genotype (gap)")\n-\n- plt.savefig(os.path.join(plots_dir, "%s.png" % contig), dpi=100)\n-\n-def get_mixture(record, threshold):\n- mixtures = {}\n- try:\n- if len(record.samples[0].data.AD) > 1:\n-\n- total_depth = sum(record.samples[0].data.AD)\n- # Go over all combinations of touples.\n- for comb in itertools.combinations(range(0, len(record.samples[0].data.AD)), 2):\n- i = comb[0]\n- j = comb[1]\n-\n- alleles = list()\n-\n- if 0 in comb:\n- alleles.append(str(record.REF))\n-\n- if i != 0:\n- alleles.append(str(record.ALT[i - 1]))\n- mixture = record.samples[0].data.AD[i]\n- if j != 0:\n- alleles.append(str(record.ALT[j - 1]))\n- mixture = record.samples[0].data.AD[j]\n-\n- ratio = float(mixture) / total_depth\n- if ratio == 1.0:\n- logging.debug("This is only designed for mixtures! %s %s %s %s", record, ratio, record.samples[0].data.AD, record.FILTER)\n-\n- if ratio not in mixtures:\n- mixtures[ratio] = []\n- mixtures[ratio].append(alleles.pop())\n-\n- elif ratio >= threshold:\n- try:\n- code = IUPAC_CODES[frozenset(alleles)]\n- if ratio not in mixtures:\n- mixtures[ratio] = []\n- mixtures[ratio].append(code)\n- except KeyError:\n- logging.warn("Could not retrieve IUPAC code for %s from %s", alleles, record)\n- except AttributeError:\n- mixtures = {}\n-\n- return mixtures\n-\n-def print_stats(stats, pos_stats, total_vars):\n- for contig in stats:\n- for sample, info in stats[contig].items():\n- print "%s,%i,%i" % (sample, len(info.get("n_pos", [])), total_vars)\n-\n'..b'n_ratio = float(len(sample_stats[contig][sample]["n_pos"])) / len(avail_pos[contig])\n- if sample_n_ratio > args.sample_Ns:\n- for pos in sample_stats[contig][sample]["n_pos"]:\n- pos_stats[contig][pos]["N"] -= 1\n-\n- logging.info("Removing %s due to high Ns in sample: %s", sample , sample_n_ratio)\n-\n- delete_samples.append(sample)\n-\n- samples = [sample for sample in samples if sample not in delete_samples]\n- snp_positions = []\n- with open(args.out, "w") as fp:\n-\n- for sample in samples:\n- sample_seq = ""\n- for contig in contigs:\n- if contig in avail_pos:\n- if args.reference:\n- positions = xrange(1, len(args.reference[contig]) + 1)\n- else:\n- positions = avail_pos[contig].keys()\n- for pos in positions:\n- if pos in avail_pos[contig]:\n- if not args.column_Ns or float(pos_stats[contig][pos]["N"]) / len(samples) < args.column_Ns and \\\n- float(pos_stats[contig][pos]["-"]) / len(samples) < args.column_Ns:\n- sample_seq += all_data[contig][sample][pos]\n- else:\n- if contig not in discarded:\n- discarded[contig] = []\n- discarded[contig].append(pos)\n- elif args.reference:\n- sample_seq += args.reference[contig][pos - 1]\n- elif args.reference:\n- sample_seq += args.reference[contig]\n-\n- fp.write(">%s\\n%s\\n" % (sample, sample_seq))\n- # Do the same for reference data.\n- ref_snps = ""\n-\n- for contig in contigs:\n- if contig in avail_pos:\n- if args.reference:\n- positions = xrange(1, len(args.reference[contig]) + 1)\n- else:\n- positions = avail_pos[contig].keys()\n- for pos in positions:\n- if pos in avail_pos[contig]:\n- if not args.column_Ns or float(pos_stats[contig][pos]["N"]) / len(samples) < args.column_Ns and \\\n- float(pos_stats[contig][pos]["-"]) / len(samples) < args.column_Ns:\n-\n- ref_snps += str(avail_pos[contig][pos])\n- snp_positions.append((contig, pos,))\n- elif args.reference:\n- ref_snps += args.reference[contig][pos - 1]\n- elif args.reference:\n- ref_snps += args.reference[contig]\n-\n- fp.write(">reference\\n%s\\n" % ref_snps)\n-\n- if can_stats and args.with_stats:\n- with open(args.with_stats, "wb") as fp:\n- fp.write("contig\\tposition\\tmutations\\tn_frac\\n")\n- for values in snp_positions:\n- fp.write("%s\\t%s\\t%s\\t%s\\n" % (values[0],\n- values[1],\n- float(pos_stats[values[0]][values[1]]["mut"]) / len(args.input),\n- float(pos_stats[values[0]][values[1]]["N"]) / len(args.input)))\n- plot_stats(pos_stats, len(samples), discarded=discarded, plots_dir=os.path.abspath(args.plots_dir))\n- # print_stats(sample_stats, pos_stats, total_vars=len(avail_pos[contig]))\n-\n- total_discarded = 0\n- for _, i in discarded.items():\n- total_discarded += len(i)\n- logging.info("Discarded total of %i poor quality columns", float(total_discarded) / len(args.input))\n- return 0\n-\n-if __name__ == \'__main__\':\n- import time\n-\n-# with PyCallGraph(output=graphviz):\n-# T0 = time.time()\n- r = main()\n-# T1 = time.time()\n-\n-# print "Time taken: %i" % (T1 - T0)\n- exit(r)\n' |