Repository 'vcfs2fasta'
hg clone https://toolshed.g2.bx.psu.edu/repos/ulfschaefer/vcfs2fasta

Changeset 3:13ff9dbcd580 (2015-12-16)
Previous changeset 2:bff397ce7794 (2015-12-16) Next changeset 4:8bccc05371b0 (2015-12-16)
Commit message:
Deleted selected files
removed:
vcfs2fasta/vcfs2fasta.py
b
diff -r bff397ce7794 -r 13ff9dbcd580 vcfs2fasta/vcfs2fasta.py
--- a/vcfs2fasta/vcfs2fasta.py Wed Dec 16 07:23:27 2015 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
b'@@ -1,415 +0,0 @@\n-#!/usr/bin/env python\n-\'\'\'\n-Merge SNP data from multiple VCF files into a single fasta file.\n-\n-Created on 5 Oct 2015\n-\n-@author: alex\n-\'\'\'\n-import argparse\n-from collections import OrderedDict\n-import glob\n-import itertools\n-import logging\n-import os\n-\n-from Bio import SeqIO\n-from bintrees import FastRBTree\n-\n-# Try importing the matplotlib and numpy for stats.\n-try:\n-    from matplotlib import pyplot as plt\n-    import numpy\n-    can_stats = True\n-except ImportError:\n-    can_stats = False\n-\n-import vcf\n-\n-from phe.variant_filters import IUPAC_CODES\n-\n-\n-def plot_stats(pos_stats, total_samples, plots_dir="plots", discarded={}):\n-    if not os.path.exists(plots_dir):\n-        os.makedirs(plots_dir)\n-\n-    for contig in pos_stats:\n-\n-        plt.style.use(\'ggplot\')\n-\n-        x = numpy.array([pos for pos in pos_stats[contig] if pos not in discarded.get(contig, [])])\n-        y = numpy.array([ float(pos_stats[contig][pos]["mut"]) / total_samples for pos in pos_stats[contig] if pos not in discarded.get(contig, []) ])\n-\n-        f, (ax1, ax2, ax3, ax4) = plt.subplots(4, sharex=True, sharey=True)\n-        f.set_size_inches(12, 15)\n-        ax1.plot(x, y, \'ro\')\n-        ax1.set_title("Fraction of samples with SNPs")\n-        plt.ylim(0, 1.1)\n-\n-        y = numpy.array([ float(pos_stats[contig][pos]["N"]) / total_samples for pos in pos_stats[contig] if pos not in discarded.get(contig, [])])\n-        ax2.plot(x, y, \'bo\')\n-        ax2.set_title("Fraction of samples with Ns")\n-\n-        y = numpy.array([ float(pos_stats[contig][pos]["mix"]) / total_samples for pos in pos_stats[contig] if pos not in discarded.get(contig, [])])\n-        ax3.plot(x, y, \'go\')\n-        ax3.set_title("Fraction of samples with mixed bases")\n-\n-        y = numpy.array([ float(pos_stats[contig][pos]["gap"]) / total_samples for pos in pos_stats[contig] if pos not in discarded.get(contig, [])])\n-        ax4.plot(x, y, \'yo\')\n-        ax4.set_title("Fraction of samples with uncallable genotype (gap)")\n-\n-        plt.savefig(os.path.join(plots_dir, "%s.png" % contig), dpi=100)\n-\n-def get_mixture(record, threshold):\n-    mixtures = {}\n-    try:\n-        if len(record.samples[0].data.AD) > 1:\n-\n-            total_depth = sum(record.samples[0].data.AD)\n-            # Go over all combinations of touples.\n-            for comb in itertools.combinations(range(0, len(record.samples[0].data.AD)), 2):\n-                i = comb[0]\n-                j = comb[1]\n-\n-                alleles = list()\n-\n-                if 0 in comb:\n-                    alleles.append(str(record.REF))\n-\n-                if i != 0:\n-                    alleles.append(str(record.ALT[i - 1]))\n-                    mixture = record.samples[0].data.AD[i]\n-                if j != 0:\n-                    alleles.append(str(record.ALT[j - 1]))\n-                    mixture = record.samples[0].data.AD[j]\n-\n-                ratio = float(mixture) / total_depth\n-                if ratio == 1.0:\n-                    logging.debug("This is only designed for mixtures! %s %s %s %s", record, ratio, record.samples[0].data.AD, record.FILTER)\n-\n-                    if ratio not in mixtures:\n-                        mixtures[ratio] = []\n-                    mixtures[ratio].append(alleles.pop())\n-\n-                elif ratio >= threshold:\n-                    try:\n-                        code = IUPAC_CODES[frozenset(alleles)]\n-                        if ratio not in mixtures:\n-                            mixtures[ratio] = []\n-                            mixtures[ratio].append(code)\n-                    except KeyError:\n-                        logging.warn("Could not retrieve IUPAC code for %s from %s", alleles, record)\n-    except AttributeError:\n-        mixtures = {}\n-\n-    return mixtures\n-\n-def print_stats(stats, pos_stats, total_vars):\n-    for contig in stats:\n-        for sample, info in stats[contig].items():\n-            print "%s,%i,%i" % (sample, len(info.get("n_pos", [])), total_vars)\n-\n'..b'n_ratio = float(len(sample_stats[contig][sample]["n_pos"])) / len(avail_pos[contig])\n-                if sample_n_ratio > args.sample_Ns:\n-                    for pos in sample_stats[contig][sample]["n_pos"]:\n-                        pos_stats[contig][pos]["N"] -= 1\n-\n-                    logging.info("Removing %s due to high Ns in sample: %s", sample , sample_n_ratio)\n-\n-                    delete_samples.append(sample)\n-\n-        samples = [sample for sample in samples if sample not in delete_samples]\n-    snp_positions = []\n-    with open(args.out, "w") as fp:\n-\n-        for sample in samples:\n-            sample_seq = ""\n-            for contig in contigs:\n-                if contig in avail_pos:\n-                    if args.reference:\n-                        positions = xrange(1, len(args.reference[contig]) + 1)\n-                    else:\n-                        positions = avail_pos[contig].keys()\n-                    for pos in positions:\n-                        if pos in avail_pos[contig]:\n-                            if not args.column_Ns or float(pos_stats[contig][pos]["N"]) / len(samples) < args.column_Ns and \\\n-                                float(pos_stats[contig][pos]["-"]) / len(samples) < args.column_Ns:\n-                                sample_seq += all_data[contig][sample][pos]\n-                            else:\n-                                if contig not in discarded:\n-                                    discarded[contig] = []\n-                                discarded[contig].append(pos)\n-                        elif args.reference:\n-                            sample_seq += args.reference[contig][pos - 1]\n-                elif args.reference:\n-                    sample_seq += args.reference[contig]\n-\n-            fp.write(">%s\\n%s\\n" % (sample, sample_seq))\n-        # Do the same for reference data.\n-        ref_snps = ""\n-\n-        for contig in contigs:\n-            if contig in avail_pos:\n-                if args.reference:\n-                    positions = xrange(1, len(args.reference[contig]) + 1)\n-                else:\n-                    positions = avail_pos[contig].keys()\n-                for pos in positions:\n-                    if pos in avail_pos[contig]:\n-                        if not args.column_Ns or float(pos_stats[contig][pos]["N"]) / len(samples) < args.column_Ns and \\\n-                                float(pos_stats[contig][pos]["-"]) / len(samples) < args.column_Ns:\n-\n-                            ref_snps += str(avail_pos[contig][pos])\n-                            snp_positions.append((contig, pos,))\n-                    elif args.reference:\n-                        ref_snps += args.reference[contig][pos - 1]\n-            elif args.reference:\n-                    ref_snps += args.reference[contig]\n-\n-        fp.write(">reference\\n%s\\n" % ref_snps)\n-\n-    if can_stats and args.with_stats:\n-        with open(args.with_stats, "wb") as fp:\n-            fp.write("contig\\tposition\\tmutations\\tn_frac\\n")\n-            for values in snp_positions:\n-                fp.write("%s\\t%s\\t%s\\t%s\\n" % (values[0],\n-                                             values[1],\n-                                             float(pos_stats[values[0]][values[1]]["mut"]) / len(args.input),\n-                                             float(pos_stats[values[0]][values[1]]["N"]) / len(args.input)))\n-        plot_stats(pos_stats, len(samples), discarded=discarded, plots_dir=os.path.abspath(args.plots_dir))\n-    # print_stats(sample_stats, pos_stats, total_vars=len(avail_pos[contig]))\n-\n-    total_discarded = 0\n-    for _, i in discarded.items():\n-        total_discarded += len(i)\n-    logging.info("Discarded total of %i poor quality columns", float(total_discarded) / len(args.input))\n-    return 0\n-\n-if __name__ == \'__main__\':\n-    import time\n-\n-#     with PyCallGraph(output=graphviz):\n-#     T0 = time.time()\n-    r = main()\n-#     T1 = time.time()\n-\n-#     print "Time taken: %i" % (T1 - T0)\n-    exit(r)\n'