Commit message:
Imported from capsule None |
added:
bedClass.py filter.py filter.xml test-data/test.small.vcf test-data/test_filter_quality_9_DP_2000_lt.vcf test-data/test_filter_quality_9_NS_360_gt.vcf tools.py vcfClass.py vcfPytools.py |
b |
diff -r 000000000000 -r da1a6f33b504 bedClass.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bedClass.py Mon Jan 27 09:29:09 2014 -0500 |
[ |
@@ -0,0 +1,80 @@ +#!/usr/bin/python + +import os.path +import sys + +class bed: + def __init__(self): + self.numberTargets = 0 + self.referenceSequences = {} + self.referenceSequenceList = [] + + def openBed(self, filename): + if filename == "stdin": self.filehandle = sys.stdin + else: + try: self.filehandle = open(filename,"r") + except IOError: + print >> sys.stderr, "Failed to find file: ",filename + exit(1) + +# Get a bed record. + def getRecord(self): + self.record = self.filehandle.readline() + if not self.record: return False + + self.numberTargets = self.numberTargets + 1 + self.ref = "" + self.start = 0 + self.end = 0 + +# bed file should be 0-based, half-open, so the start coordinate +# must be that in the bed file plus one. + entries = self.record.rstrip("\n").split("\t") + self.referenceSequence = entries[0] + +# Add the reference sequence to the dictionary. If it didn't previously +# exist append the reference sequence to the end of the list as well. +# This ensures that the order in which the reference sequences appeared +# in the header can be preserved. + if self.referenceSequence not in self.referenceSequences: + self.referenceSequences[self.referenceSequence] = True + self.referenceSequenceList.append(self.referenceSequence) + + try: self.start = int(entries[1]) + 1 + except: + text = "start position need is not an integer" + self.generalError(text, "start", entries[1]) + + try: self.end = int(entries[2]) + except: + text = "end position need is not an integer" + self.generalError(text, "end", entries[2]) + +# Check that the record is a valid interval. + if self.end - self.start < 0: + print >> sys.stderr, "Invalid target interval:\n\t", self.record + exit(1) + + return True + +# Parse through the bed file until the correct reference sequence is +# encountered and the end position is greater than or equal to that requested. + def parseBed(self, referenceSequence, position): + success = True + if self.referenceSequence != referenceSequence: + while self.referenceSequence != referenceSequence and success: success = self.getRecord() + + while self.referenceSequence == referenceSequence and self.end < position and success: success = self.getRecord() + + return success + +# Close the bed file. + def closeBed(self, filename): + self.filehandle.close() + +# Define error messages for different handled errors. + def generalError(self, text, field, fieldValue): + print >> sys.stderr, "\nError encountered when attempting to read:" + if field != "": print >> sys.stderr, "\t", field, ": ", fieldValue + print >> sys.stderr, "\n", text + exit(1) |
b |
diff -r 000000000000 -r da1a6f33b504 filter.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filter.py Mon Jan 27 09:29:09 2014 -0500 |
[ |
@@ -0,0 +1,162 @@ +#!/usr/bin/python + +import os.path +import sys +import optparse + +import vcfClass +from vcfClass import * + +import tools +from tools import * + +if __name__ == "__main__": + main() + +def filterFail(text, file): + print >> sys.stderr, text + if file != None: os.remove(file) + exit(1) + +def main(): + +# Parse the command line options + usage = "Usage: vcfPytools.py filter [options]" + parser = optparse.OptionParser(usage = usage) + parser.add_option("-i", "--in", + action="store", type="string", + dest="vcfFile", help="input vcf file") + parser.add_option("-o", "--out", + action="store", type="string", + dest="output", help="output vcf file") + parser.add_option("-q", "--quality", + action="store", type="int", + dest="quality", help="filter out SNPs with qualities lower than selected value") + parser.add_option("-n", "--info", + action="append", type="string", nargs=3, + dest="infoFilters", help="filter based on entries in the info string") + parser.add_option("-r", "--remove-genotypes", + action="store_true", default=False, + dest="removeGeno", help="remove the genotype strings from the vcf file") + parser.add_option("-m", "--mark-as-pass", + action="store_true", default=False, + dest="markPass", help="Mark all records as having passed filters") + + (options, args) = parser.parse_args() + +# Check that a single vcf file is given. + if options.vcfFile == None: + parser.print_help() + print >> sys.stderr, "\nInput vcf file (-i, --input) is required for vcf filtering." + exit(1) + +# The --mark-as-pass option can only be used if no actual filters +# have been specified. + if options.markPass and options.infoFilters: + print >> sys.stderr, "--mark-as-pass cannot be used in conjunction with filters." + exit(1) + +# Set the output file to stdout if no output file was specified. + outputFile, writeOut = setOutput(options.output) # tools.py + + v = vcf() # Define vcf object. + +# Open the vcf file. + v.openVcf(options.vcfFile) + +# Read in the header information. + v.parseHeader(options.vcfFile, writeOut) + taskDescriptor = "##vcfPytools=" + if options.infoFilters: + taskDescriptor += "filtered using the following filters: " + for filter, value, logic in options.infoFilters: taskDescriptor += str(filter) + str(value) + "," + taskDescriptor = taskDescriptor.rstrip(",") + if options.markPass: taskDescriptor += "marked all records as PASS" + + writeHeader(outputFile, v, options.removeGeno, taskDescriptor) + +# Check that specified filters from the info field are either integers or floats. + if options.infoFilters: + v.processInfo = True # Process the info string + filters = {} + filterValues = {} + filterLogic = {} + for filter, value, logic in options.infoFilters: + filterName = str(filter) + str(value) + if "-" in filter or "-" in value or "-" in logic: + print >> sys.stderr, "\n--info (-n) requires three arguments, for example:" + print >> sys.stderr, "\t--info DP 5 lt: filter records with DP less than (lt) 5.\n" + print >> sys.stderr, "allowed logic arguments:\n\tgt: greater than\n\tlt: less than." + print >> sys.stderr, "\nError in:", filter + exit(1) + if logic != "gt" and logic != "lt": + print >> sys.stderr, "\nfilter logic not recognised." + print >> sys.stderr, "allowed logic arguments:\n\tgt: greater than\n\tlt: less than." + print >> sys.stderr, "\nError in:", filter + exit(1) + if v.infoHeaderTags.has_key(filter): + if v.infoHeaderTags[filter][1].lower() == "integer": + try: + filters[filterName] = filter + filterValues[filterName] = int(value) + filterLogic[filterName] = logic + #filterLogic[filterName] = logic + except ValueError: + text = "Filter " + filter + " requires an integer entry, not " + str(type(value)) + filterFail(text, options.output) + + if v.infoHeaderTags[filter][1].lower() == "float": + try: + filters[filterName] = filter + filterValues[filterName] = float(value) + filterLogic[filterName] = logic + #filters[filterName] = float(value) + #filterLogic[filterName] = logic + except ValueError: + text = "Filter " + filter + " requires an float entry, not " + str(type(value)) + filterFail(text, options.output) + + else: + text = "Filter " + filter + " has no explanation in the header. Unknown type for the entry." + filterFail(text, options.output) + +# Parse the vcf file and check if any of the filters are failed. If +# so, build up a string of failed filters. + while v.getRecord(): + filterString = "" + +# Mark the record as "PASS" if --mark-as-pass was applied. + if options.markPass: v.filters = "PASS" + +# Check for quality filtering. + if options.quality != None: + if v.quality < options.quality: + filterString = filterString + ";" + "Q" + str(options.quality) if filterString != "" else "Q" + str(options.quality) + +# Check for filtering on info string filters. + if options.infoFilters: + for filterName, filter in filters.iteritems(): + value = filterValues[filterName] + logic = filterLogic[filterName] + if v.infoTags.has_key(filter): + if type(value) == int: + if logic == "lt" and int(v.infoTags[filter]) < value: + filterString = filterString + ";" + filter + str(value) if filterString != "" else filter + str(value) + if logic == "gt" and int(v.infoTags[filter]) > value: + filterString = filterString + ";" + filter + str(value) if filterString != "" else filter + str(value) + elif type(value) == float: + if logic == "lt" and float(v.infoTags[filter]) < value: + filterString = filterString + ";" + filter + str(value) if filterString != "" else filter + str(value) + if logic == "gt" and float(v.infoTags[filter]) > value: + filterString = filterString + ";" + filter + str(value) if filterString != "" else filter + str(value) + + filterString = "PASS" if filterString == "" else filterString + v.filters = filterString + record = v.buildRecord(options.removeGeno) + outputFile.write(record) + +# Close the vcf files. + v.closeVcf(options.vcfFile) + +# Terminate the program. + return 0 |
b |
diff -r 000000000000 -r da1a6f33b504 filter.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filter.xml Mon Jan 27 09:29:09 2014 -0500 |
[ |
@@ -0,0 +1,64 @@ +<tool id="vcf_filter" name="Filter" version="1.0.0"> + <description>a VCF file</description> + <command interpreter="python"> + vcfPytools.py + filter + --in=$input1 + --out=$output1 + --quality=$quality + #for $i in $info_filter: + --info ${i.info} + #end for + $remove_genotypes + $mark_as_pass + </command> + <inputs> + <param name="input1" label="VCF file" type="data" format="vcf" /> + <param name="quality" label="Filter by quality" type="integer" value='' help="Filter out SNPs with qualities lower than selected value" /> + <repeat name="info_filter" title="Filter based on entries in the info string"> + <param name="info" label="Filter" type="text" value='' help='This option takes three values: the info string tag, the cutoff value and whether to filter out those records with less than (lt) or greater than (gt) this value. For example: DP 10 lt ' /> + </repeat> + <param name="remove_genotypes" label="Remove the genotype strings" type="boolean" truevalue="--remove-genotypes" falsevalue="" checked="False" /> + <param name="mark_as_pass" label="Mark all records as having passed filters" type="boolean" truevalue="--mark-as-pass" falsevalue="" checked="False" /> + </inputs> + <tests> + <test> + <param name="input1" value="test.small.vcf" ftype="vcf" /> + <param name="quality" value="9" /> + <param name="info" value="NS 360 gt"/> + <param name="remove_genotypes" value="" /> + <param name="mark_as_pass" value="" /> + <output name="output" file="test_filter_quality_9_NS_360_gt.vcf" lines_diff="6" ftype="vcf" /> + </test> + <test> + <param name="input1" value="test.small.vcf" ftype="vcf" /> + <param name="quality" value="9" /> + <param name="info" value="DP 2000 lt"/> + <param name="remove_genotypes" value="" /> + <param name="mark_as_pass" value="" /> + <output name="output" file="test_filter_quality_9_DP_2000_lt.vcf" lines_diff="6" ftype="vcf" /> + </test> + </tests> + <outputs> + <data format="vcf" name="output1" label="${tool.name} ${on_string}" /> + </outputs> + <help> + +**What it does** + +This tool uses vcfPytools_' filter command + +.. _vcfPytools: https://github.com/AlistairNWard/vcfPytools + +Quality option will check the variant quality for each record and if it is below the defined value, the filter field will be populated with the filter entry Q[value]. + +Any value in the info string can be used for filtering by using the 'Filter by info' option. This option takes three values: the info string tag, the cutoff value and whether to filter out those records with less than (lt) or greater than (gt) this value. For example: + + DP 10 lt + +would filter out all varianta with a depth (DP) less than 10 and the filter field would be populated with DP10. + +This option can be defined as many times as required. + + </help> +</tool> |
b |
diff -r 000000000000 -r da1a6f33b504 test-data/test.small.vcf --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test.small.vcf Mon Jan 27 09:29:09 2014 -0500 |
b |
b'@@ -0,0 +1,1000 @@\n+##fileformat=VCFv4.0\n+##fileDate=20110215\n+##source=freeBayes version 0.4.1\n+##reference=/d2/data/references/build_37/human_reference_v37.fa\n+##phasing=none\n+##commandline="/share/software/freebayes/bin/freebayes --stdin --no-filters --min-alternate-count 2 --posterior-integration-depth 100 --pvar 0.01 --vcf /d1/data/1000G/20101123/filteringTests/AFR/mosaik/noFilters/Pipeline/none//freebayes/freebayes.20:0-1000000.baq.20110210.vcf --fasta-reference /d2/data/references/build_37/human_reference_v37.fa --region 20:0..1000000"\n+##vcfPytools=merge freebayes.20:0-1000000.baq.20110210.vcf, freebayes.20:1000000-2000000.baq.20110210.vcf, freebayes.20:2000000-3000000.baq.20110210.vcf, freebayes.20:3000000-4000000.baq.20110210.vcf, freebayes.20:4000000-5000000.baq.20110210.vcf, freebayes.20:5000000-6000000.baq.20110210.vcf, freebayes.20:6000000-7000000.baq.20110210.vcf, freebayes.20:7000000-8000000.baq.20110210.vcf, freebayes.20:8000000-9000000.baq.20110210.vcf, freebayes.20:9000000-10000000.baq.20110210.vcf, freebayes.20:10000000-11000000.baq.20110210.vcf, freebayes.20:11000000-12000000.baq.20110210.vcf, freebayes.20:12000000-13000000.baq.20110210.vcf, freebayes.20:13000000-14000000.baq.20110210.vcf, freebayes.20:14000000-15000000.baq.20110210.vcf, freebayes.20:15000000-16000000.baq.20110210.vcf, freebayes.20:16000000-17000000.baq.20110210.vcf, freebayes.20:17000000-18000000.baq.20110210.vcf, freebayes.20:18000000-19000000.baq.20110210.vcf, freebayes.20:19000000-20000000.baq.20110210.vcf, freebayes.20:20000000-21000000.baq.20110210.vcf, freebayes.20:21000000-22000000.baq.20110210.vcf, freebayes.20:22000000-23000000.baq.20110210.vcf, freebayes.20:23000000-24000000.baq.20110210.vcf, freebayes.20:24000000-25000000.baq.20110210.vcf, freebayes.20:25000000-26000000.baq.20110210.vcf, freebayes.20:26000000-27000000.baq.20110210.vcf, freebayes.20:27000000-28000000.baq.20110210.vcf, freebayes.20:28000000-29000000.baq.20110210.vcf, freebayes.20:29000000-30000000.baq.20110210.vcf, freebayes.20:30000000-31000000.baq.20110210.vcf, freebayes.20:31000000-32000000.baq.20110210.vcf, freebayes.20:32000000-33000000.baq.20110210.vcf, freebayes.20:33000000-34000000.baq.20110210.vcf, freebayes.20:34000000-35000000.baq.20110210.vcf, freebayes.20:35000000-36000000.baq.20110210.vcf, freebayes.20:36000000-37000000.baq.20110210.vcf, freebayes.20:37000000-38000000.baq.20110210.vcf, freebayes.20:38000000-39000000.baq.20110210.vcf, freebayes.20:39000000-40000000.baq.20110210.vcf, freebayes.20:40000000-41000000.baq.20110210.vcf, freebayes.20:41000000-42000000.baq.20110210.vcf, freebayes.20:42000000-43000000.baq.20110210.vcf, freebayes.20:43000000-44000000.baq.20110210.vcf, freebayes.20:44000000-45000000.baq.20110210.vcf, freebayes.20:45000000-46000000.baq.20110210.vcf, freebayes.20:46000000-47000000.baq.20110210.vcf, freebayes.20:47000000-48000000.baq.20110210.vcf, freebayes.20:48000000-49000000.baq.20110210.vcf, freebayes.20:49000000-50000000.baq.20110210.vcf, freebayes.20:50000000-51000000.baq.20110210.vcf, freebayes.20:51000000-52000000.baq.20110210.vcf, freebayes.20:52000000-53000000.baq.20110210.vcf, freebayes.20:53000000-54000000.baq.20110210.vcf, freebayes.20:54000000-55000000.baq.20110210.vcf, freebayes.20:55000000-56000000.baq.20110210.vcf, freebayes.20:56000000-57000000.baq.20110210.vcf, freebayes.20:57000000-58000000.baq.20110210.vcf, freebayes.20:58000000-59000000.baq.20110210.vcf, freebayes.20:59000000-60000000.baq.20110210.vcf, freebayes.20:60000000-61000000.baq.20110210.vcf, freebayes.20:61000000-62000000.baq.20110210.vcf, freebayes.20:62000000-63000000.baq.20110210.vcf, freebayes.20:63000000-63025520.baq.20110210.vcf\n+##vcfPytools=\n+##INFO=<ID=AB,Number=1,Type=Float,Description="Allele balance at heterozygous sites: a number between 0 and 1 representing the ratio of reads showing the reference allele to all reads, considering only reads from individuals called as heterozygous">\n+##INFO=<ID=DEL,Number=1,Type=Integer,Description="Len'..b'74;RA=2141;AA=3;SR=1118|1023;SA=1|2;SB=0.33333;AB=0.75;ABR=6;ABA=2;RUN=2;SNP;TV\n+20\t124059\t.\tC\tT\t37.8\tPASS\tNS=358;DP=1984;AC=3;AN=716;AF=0.0041899;RA=1971;AA=7;SR=971|1000;SA=5|2;SB=0.71429;AB=0.44444;ABR=4;ABA=5;RUN=1;SNP;TS;CpG\n+20\t124060\t.\tG\tA\t34.6\tPASS\tNS=359;DP=1991;AC=1;AN=718;AF=0.0013928;RA=1983;AA=3;SR=995|988;SA=2|1;SB=0.66667;AB=0.5;ABR=3;ABA=3;RUN=2;SNP;TS;CpG\n+20\t124137\t.\tC\tA\t0.0701\tPASS\tNS=357;DP=1933;AC=1;AN=714;AF=0.0014006;RA=1922;AA=4;SR=990|932;SA=0|4;SB=0;AB=0.5;ABR=1;ABA=1;RUN=2;SNP;TV\n+20\t124148\t.\tA\tG\t0.344\tPASS\tNS=358;DP=1957;AC=2;AN=716;AF=0.0027933;RA=1947;AA=6;SR=989|958;SA=1|5;SB=0.16667;AB=0.8;ABR=12;ABA=3;RUN=1;SNP;TS\n+20\t124229\t.\tT\tC\t0.672\tPASS\tNS=356;DP=1979;AC=1;AN=712;AF=0.0014045;RA=1972;AA=4;SR=1025|947;SA=1|3;SB=0.25;AB=0.8;ABR=8;ABA=2;RUN=1;SNP;TS\n+20\t124230\t.\tA\tG\t0.0816\tPASS\tNS=356;DP=1970;AC=1;AN=712;AF=0.0014045;RA=1959;AA=4;SR=1026|933;SA=1|3;SB=0.25;AB=0.81818;ABR=9;ABA=2;RUN=1;SNP;TS\n+20\t124249\t.\tA\tG\t99.0\tPASS\tNS=359;DP=1969;AC=41;AN=718;AF=0.057103;RA=1864;AA=101;SR=974|890;SA=55|46;SB=0.54455;AB=0.51977;ABR=92;ABA=85;RUN=1;SNP;TS;CpG\n+20\t124302\t.\tA\tC\t99.0\tPASS\tNS=358;DP=2007;AC=319;AN=716;AF=0.44553;RA=1185;AA=819;SR=577|608;SA=392|427;SB=0.47863;AB=0.51934;ABR=376;ABA=348;RUN=2;SNP;TV\n+20\t124335\t.\tC\tT\t99.0\tPASS\tNS=361;DP=2035;AC=4;AN=722;AF=0.0055402;RA=2013;AA=15;SR=929|1084;SA=10|5;SB=0.66667;AB=0.3913;ABR=9;ABA=14;RUN=1;SNP;TS\n+20\t124412\t.\tC\tT\t99.0\tPASS\tNS=357;DP=2005;AC=307;AN=714;AF=0.42997;RA=1207;AA=795;SR=602|605;SA=400|395;SB=0.50314;AB=0.51286;ABR=379;ABA=360;RUN=1;SNP;TS\n+20\t124511\t.\tT\tA\t0.647\tPASS\tNS=365;DP=2141;AC=1;AN=730;AF=0.0013699;RA=2133;AA=4;SR=1077|1056;SA=2|2;SB=0.5;AB=0.66667;ABR=4;ABA=2;RUN=2;SNP;TV\n+20\t124607\t.\tA\tG\t2.28\tPASS\tNS=357;DP=1872;AC=1;AN=714;AF=0.0014006;RA=1865;AA=2;SR=930|935;SA=1|1;SB=0.5;AB=0.75;ABR=6;ABA=2;RUN=1;SNP;TS\n+20\t124737\t.\tA\tC\t99.0\tPASS\tNS=367;DP=2024;AC=4;AN=734;AF=0.0054496;RA=2003;AA=16;SR=1046|957;SA=11|5;SB=0.6875;AB=0.26316;ABR=5;ABA=14;RUN=1;SNP;TV\n+20\t124827\t.\tT\tC\t1.7\tPASS\tNS=367;DP=2059;AC=1;AN=734;AF=0.0013624;RA=2048;AA=4;SR=987|1061;SA=2|2;SB=0.5;AB=0.6;ABR=3;ABA=2;RUN=2;SNP;TS\n+20\t124848\t.\tA\tG\t36.5\tPASS\tNS=364;DP=2077;AC=1;AN=728;AF=0.0013736;RA=2060;AA=6;SR=966|1094;SA=2|4;SB=0.33333;AB=0.57143;ABR=4;ABA=3;RUN=1;SNP;TS\n+20\t124878\t.\tG\tA\t0.22\tPASS\tNS=361;DP=2060;AC=1;AN=722;AF=0.001385;RA=2049;AA=4;SR=974|1075;SA=3|1;SB=0.75;AB=0.77778;ABR=7;ABA=2;RUN=1;SNP;TS\n+20\t124920\t.\tA\tG\t76.0\tPASS\tNS=361;DP=1956;AC=1;AN=722;AF=0.001385;RA=1949;AA=6;SR=984|965;SA=2|4;SB=0.33333;AB=0.14286;ABR=1;ABA=6;RUN=1;SNP;TS;CpG\n+20\t124942\t.\tA\tT\t1.44\tPASS\tNS=362;DP=2015;AC=1;AN=724;AF=0.0013812;RA=2007;AA=4;SR=1107|900;SA=1|3;SB=0.25;AB=0.6;ABR=3;ABA=2;RUN=1;SNP;TV\n+20\t124998\t.\tT\tC\t99.0\tPASS\tNS=366;DP=2204;AC=29;AN=732;AF=0.039617;RA=2131;AA=66;SR=1132|999;SA=36|30;SB=0.54545;AB=0.54902;ABR=56;ABA=46;RUN=1;SNP;TS\n+20\t125096\t.\tC\tG\t4.18\tPASS\tNS=363;DP=2095;AC=1;AN=726;AF=0.0013774;RA=2082;AA=2;SR=1071|1011;SA=2|0;SB=1;AB=0.75;ABR=6;ABA=2;RUN=1;SNP;TV\n+20\t125109\t.\tA\tT\t0.0858\tPASS\tNS=365;DP=2091;AC=2;AN=730;AF=0.0027397;RA=2080;AA=7;SR=1103|977;SA=7|0;SB=1;AB=0.66667;ABR=6;ABA=3;RUN=8;SNP;TV\n+20\t125121\t.\tC\tT\t99.0\tPASS\tNS=364;DP=2075;AC=208;AN=728;AF=0.28571;RA=1472;AA=599;SR=826|646;SA=335|264;SB=0.55927;AB=0.49868;ABR=377;ABA=378;RUN=1;SNP;TS\n+20\t125179\t.\tC\tA\t99.0\tPASS\tNS=368;DP=2155;AC=7;AN=736;AF=0.0095109;RA=2127;AA=26;SR=1123|1004;SA=16|10;SB=0.61538;AB=0.36842;ABR=14;ABA=24;RUN=2;SNP;TV\n+20\t125266\t.\tT\tA\t3.47\tPASS\tNS=364;DP=2138;AC=1;AN=728;AF=0.0013736;RA=2123;AA=5;SR=907|1216;SA=1|4;SB=0.2;AB=0.6;ABR=3;ABA=2;RUN=1;SNP;TV\n+20\t125381\t.\tA\tG\t4.69\tPASS\tNS=362;DP=2102;AC=1;AN=724;AF=0.0013812;RA=2094;AA=3;SR=919|1175;SA=1|2;SB=0.33333;AB=0.66667;ABR=4;ABA=2;RUN=1;SNP;TS\n+20\t125450\t.\tG\tA\t0.364\tPASS\tNS=359;DP=1881;AC=1;AN=718;AF=0.0013928;RA=1868;AA=5;SR=862|1006;SA=1|4;SB=0.2;AB=0.71429;ABR=5;ABA=2;RUN=2;SNP;TS\n+20\t125470\t.\tA\tG\t0.602\tPASS\tNS=360;DP=1870;AC=1;AN=720;AF=0.0013889;RA=1855;AA=5;SR=909|946;SA=3|2;SB=0.6;AB=0.75;ABR=6;ABA=2;RUN=1;SNP;TS\n' |
b |
diff -r 000000000000 -r da1a6f33b504 test-data/test_filter_quality_9_DP_2000_lt.vcf --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_filter_quality_9_DP_2000_lt.vcf Mon Jan 27 09:29:09 2014 -0500 |
b |
b'@@ -0,0 +1,1001 @@\n+##fileformat=VCFv4.0\n+##fileDate=20110215\n+##source=freeBayes version 0.4.1\n+##reference=/d2/data/references/build_37/human_reference_v37.fa\n+##phasing=none\n+##commandline="/share/software/freebayes/bin/freebayes --stdin --no-filters --min-alternate-count 2 --posterior-integration-depth 100 --pvar 0.01 --vcf /d1/data/1000G/20101123/filteringTests/AFR/mosaik/noFilters/Pipeline/none//freebayes/freebayes.20:0-1000000.baq.20110210.vcf --fasta-reference /d2/data/references/build_37/human_reference_v37.fa --region 20:0..1000000"\n+##vcfPytools=merge freebayes.20:0-1000000.baq.20110210.vcf, freebayes.20:1000000-2000000.baq.20110210.vcf, freebayes.20:2000000-3000000.baq.20110210.vcf, freebayes.20:3000000-4000000.baq.20110210.vcf, freebayes.20:4000000-5000000.baq.20110210.vcf, freebayes.20:5000000-6000000.baq.20110210.vcf, freebayes.20:6000000-7000000.baq.20110210.vcf, freebayes.20:7000000-8000000.baq.20110210.vcf, freebayes.20:8000000-9000000.baq.20110210.vcf, freebayes.20:9000000-10000000.baq.20110210.vcf, freebayes.20:10000000-11000000.baq.20110210.vcf, freebayes.20:11000000-12000000.baq.20110210.vcf, freebayes.20:12000000-13000000.baq.20110210.vcf, freebayes.20:13000000-14000000.baq.20110210.vcf, freebayes.20:14000000-15000000.baq.20110210.vcf, freebayes.20:15000000-16000000.baq.20110210.vcf, freebayes.20:16000000-17000000.baq.20110210.vcf, freebayes.20:17000000-18000000.baq.20110210.vcf, freebayes.20:18000000-19000000.baq.20110210.vcf, freebayes.20:19000000-20000000.baq.20110210.vcf, freebayes.20:20000000-21000000.baq.20110210.vcf, freebayes.20:21000000-22000000.baq.20110210.vcf, freebayes.20:22000000-23000000.baq.20110210.vcf, freebayes.20:23000000-24000000.baq.20110210.vcf, freebayes.20:24000000-25000000.baq.20110210.vcf, freebayes.20:25000000-26000000.baq.20110210.vcf, freebayes.20:26000000-27000000.baq.20110210.vcf, freebayes.20:27000000-28000000.baq.20110210.vcf, freebayes.20:28000000-29000000.baq.20110210.vcf, freebayes.20:29000000-30000000.baq.20110210.vcf, freebayes.20:30000000-31000000.baq.20110210.vcf, freebayes.20:31000000-32000000.baq.20110210.vcf, freebayes.20:32000000-33000000.baq.20110210.vcf, freebayes.20:33000000-34000000.baq.20110210.vcf, freebayes.20:34000000-35000000.baq.20110210.vcf, freebayes.20:35000000-36000000.baq.20110210.vcf, freebayes.20:36000000-37000000.baq.20110210.vcf, freebayes.20:37000000-38000000.baq.20110210.vcf, freebayes.20:38000000-39000000.baq.20110210.vcf, freebayes.20:39000000-40000000.baq.20110210.vcf, freebayes.20:40000000-41000000.baq.20110210.vcf, freebayes.20:41000000-42000000.baq.20110210.vcf, freebayes.20:42000000-43000000.baq.20110210.vcf, freebayes.20:43000000-44000000.baq.20110210.vcf, freebayes.20:44000000-45000000.baq.20110210.vcf, freebayes.20:45000000-46000000.baq.20110210.vcf, freebayes.20:46000000-47000000.baq.20110210.vcf, freebayes.20:47000000-48000000.baq.20110210.vcf, freebayes.20:48000000-49000000.baq.20110210.vcf, freebayes.20:49000000-50000000.baq.20110210.vcf, freebayes.20:50000000-51000000.baq.20110210.vcf, freebayes.20:51000000-52000000.baq.20110210.vcf, freebayes.20:52000000-53000000.baq.20110210.vcf, freebayes.20:53000000-54000000.baq.20110210.vcf, freebayes.20:54000000-55000000.baq.20110210.vcf, freebayes.20:55000000-56000000.baq.20110210.vcf, freebayes.20:56000000-57000000.baq.20110210.vcf, freebayes.20:57000000-58000000.baq.20110210.vcf, freebayes.20:58000000-59000000.baq.20110210.vcf, freebayes.20:59000000-60000000.baq.20110210.vcf, freebayes.20:60000000-61000000.baq.20110210.vcf, freebayes.20:61000000-62000000.baq.20110210.vcf, freebayes.20:62000000-63000000.baq.20110210.vcf, freebayes.20:63000000-63025520.baq.20110210.vcf\n+##vcfPytools=\n+##vcfPytools=filtered using the following filters: DP2000\n+##INFO=<ID=REPEAT,Number=1,Type=String,Description="Description of the local repeat structures flanking the current position">\n+##INFO=<ID=DEL,Number=1,Type=Integer,Description="Length of deletion allele, if present">\n+##INFO=<ID=INS,Number=1,Ty'..b'3;SA=1|2;SB=0.33333;AB=0.75;ABR=6;ABA=2;RUN=2;SNP;TV\n+20\t124059\t.\tC\tT\t37.8\tDP2000\tNS=358;DP=1984;AC=3;AN=716;AF=0.0041899;RA=1971;AA=7;SR=971|1000;SA=5|2;SB=0.71429;AB=0.44444;ABR=4;ABA=5;RUN=1;SNP;TS;CpG\n+20\t124060\t.\tG\tA\t34.6\tDP2000\tNS=359;DP=1991;AC=1;AN=718;AF=0.0013928;RA=1983;AA=3;SR=995|988;SA=2|1;SB=0.66667;AB=0.5;ABR=3;ABA=3;RUN=2;SNP;TS;CpG\n+20\t124137\t.\tC\tA\t0.0701\tQ9;DP2000\tNS=357;DP=1933;AC=1;AN=714;AF=0.0014006;RA=1922;AA=4;SR=990|932;SA=0|4;SB=0;AB=0.5;ABR=1;ABA=1;RUN=2;SNP;TV\n+20\t124148\t.\tA\tG\t0.344\tQ9;DP2000\tNS=358;DP=1957;AC=2;AN=716;AF=0.0027933;RA=1947;AA=6;SR=989|958;SA=1|5;SB=0.16667;AB=0.8;ABR=12;ABA=3;RUN=1;SNP;TS\n+20\t124229\t.\tT\tC\t0.672\tQ9;DP2000\tNS=356;DP=1979;AC=1;AN=712;AF=0.0014045;RA=1972;AA=4;SR=1025|947;SA=1|3;SB=0.25;AB=0.8;ABR=8;ABA=2;RUN=1;SNP;TS\n+20\t124230\t.\tA\tG\t0.0816\tQ9;DP2000\tNS=356;DP=1970;AC=1;AN=712;AF=0.0014045;RA=1959;AA=4;SR=1026|933;SA=1|3;SB=0.25;AB=0.81818;ABR=9;ABA=2;RUN=1;SNP;TS\n+20\t124249\t.\tA\tG\t99.0\tDP2000\tNS=359;DP=1969;AC=41;AN=718;AF=0.057103;RA=1864;AA=101;SR=974|890;SA=55|46;SB=0.54455;AB=0.51977;ABR=92;ABA=85;RUN=1;SNP;TS;CpG\n+20\t124302\t.\tA\tC\t99.0\tPASS\tNS=358;DP=2007;AC=319;AN=716;AF=0.44553;RA=1185;AA=819;SR=577|608;SA=392|427;SB=0.47863;AB=0.51934;ABR=376;ABA=348;RUN=2;SNP;TV\n+20\t124335\t.\tC\tT\t99.0\tPASS\tNS=361;DP=2035;AC=4;AN=722;AF=0.0055402;RA=2013;AA=15;SR=929|1084;SA=10|5;SB=0.66667;AB=0.3913;ABR=9;ABA=14;RUN=1;SNP;TS\n+20\t124412\t.\tC\tT\t99.0\tPASS\tNS=357;DP=2005;AC=307;AN=714;AF=0.42997;RA=1207;AA=795;SR=602|605;SA=400|395;SB=0.50314;AB=0.51286;ABR=379;ABA=360;RUN=1;SNP;TS\n+20\t124511\t.\tT\tA\t0.647\tQ9\tNS=365;DP=2141;AC=1;AN=730;AF=0.0013699;RA=2133;AA=4;SR=1077|1056;SA=2|2;SB=0.5;AB=0.66667;ABR=4;ABA=2;RUN=2;SNP;TV\n+20\t124607\t.\tA\tG\t2.28\tQ9;DP2000\tNS=357;DP=1872;AC=1;AN=714;AF=0.0014006;RA=1865;AA=2;SR=930|935;SA=1|1;SB=0.5;AB=0.75;ABR=6;ABA=2;RUN=1;SNP;TS\n+20\t124737\t.\tA\tC\t99.0\tPASS\tNS=367;DP=2024;AC=4;AN=734;AF=0.0054496;RA=2003;AA=16;SR=1046|957;SA=11|5;SB=0.6875;AB=0.26316;ABR=5;ABA=14;RUN=1;SNP;TV\n+20\t124827\t.\tT\tC\t1.7\tQ9\tNS=367;DP=2059;AC=1;AN=734;AF=0.0013624;RA=2048;AA=4;SR=987|1061;SA=2|2;SB=0.5;AB=0.6;ABR=3;ABA=2;RUN=2;SNP;TS\n+20\t124848\t.\tA\tG\t36.5\tPASS\tNS=364;DP=2077;AC=1;AN=728;AF=0.0013736;RA=2060;AA=6;SR=966|1094;SA=2|4;SB=0.33333;AB=0.57143;ABR=4;ABA=3;RUN=1;SNP;TS\n+20\t124878\t.\tG\tA\t0.22\tQ9\tNS=361;DP=2060;AC=1;AN=722;AF=0.001385;RA=2049;AA=4;SR=974|1075;SA=3|1;SB=0.75;AB=0.77778;ABR=7;ABA=2;RUN=1;SNP;TS\n+20\t124920\t.\tA\tG\t76.0\tDP2000\tNS=361;DP=1956;AC=1;AN=722;AF=0.001385;RA=1949;AA=6;SR=984|965;SA=2|4;SB=0.33333;AB=0.14286;ABR=1;ABA=6;RUN=1;SNP;TS;CpG\n+20\t124942\t.\tA\tT\t1.44\tQ9\tNS=362;DP=2015;AC=1;AN=724;AF=0.0013812;RA=2007;AA=4;SR=1107|900;SA=1|3;SB=0.25;AB=0.6;ABR=3;ABA=2;RUN=1;SNP;TV\n+20\t124998\t.\tT\tC\t99.0\tPASS\tNS=366;DP=2204;AC=29;AN=732;AF=0.039617;RA=2131;AA=66;SR=1132|999;SA=36|30;SB=0.54545;AB=0.54902;ABR=56;ABA=46;RUN=1;SNP;TS\n+20\t125096\t.\tC\tG\t4.18\tQ9\tNS=363;DP=2095;AC=1;AN=726;AF=0.0013774;RA=2082;AA=2;SR=1071|1011;SA=2|0;SB=1;AB=0.75;ABR=6;ABA=2;RUN=1;SNP;TV\n+20\t125109\t.\tA\tT\t0.0858\tQ9\tNS=365;DP=2091;AC=2;AN=730;AF=0.0027397;RA=2080;AA=7;SR=1103|977;SA=7|0;SB=1;AB=0.66667;ABR=6;ABA=3;RUN=8;SNP;TV\n+20\t125121\t.\tC\tT\t99.0\tPASS\tNS=364;DP=2075;AC=208;AN=728;AF=0.28571;RA=1472;AA=599;SR=826|646;SA=335|264;SB=0.55927;AB=0.49868;ABR=377;ABA=378;RUN=1;SNP;TS\n+20\t125179\t.\tC\tA\t99.0\tPASS\tNS=368;DP=2155;AC=7;AN=736;AF=0.0095109;RA=2127;AA=26;SR=1123|1004;SA=16|10;SB=0.61538;AB=0.36842;ABR=14;ABA=24;RUN=2;SNP;TV\n+20\t125266\t.\tT\tA\t3.47\tQ9\tNS=364;DP=2138;AC=1;AN=728;AF=0.0013736;RA=2123;AA=5;SR=907|1216;SA=1|4;SB=0.2;AB=0.6;ABR=3;ABA=2;RUN=1;SNP;TV\n+20\t125381\t.\tA\tG\t4.69\tQ9\tNS=362;DP=2102;AC=1;AN=724;AF=0.0013812;RA=2094;AA=3;SR=919|1175;SA=1|2;SB=0.33333;AB=0.66667;ABR=4;ABA=2;RUN=1;SNP;TS\n+20\t125450\t.\tG\tA\t0.364\tQ9;DP2000\tNS=359;DP=1881;AC=1;AN=718;AF=0.0013928;RA=1868;AA=5;SR=862|1006;SA=1|4;SB=0.2;AB=0.71429;ABR=5;ABA=2;RUN=2;SNP;TS\n+20\t125470\t.\tA\tG\t0.602\tQ9;DP2000\tNS=360;DP=1870;AC=1;AN=720;AF=0.0013889;RA=1855;AA=5;SR=909|946;SA=3|2;SB=0.6;AB=0.75;ABR=6;ABA=2;RUN=1;SNP;TS\n' |
b |
diff -r 000000000000 -r da1a6f33b504 test-data/test_filter_quality_9_NS_360_gt.vcf --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_filter_quality_9_NS_360_gt.vcf Mon Jan 27 09:29:09 2014 -0500 |
b |
b'@@ -0,0 +1,1001 @@\n+##fileformat=VCFv4.0\n+##fileDate=20110215\n+##source=freeBayes version 0.4.1\n+##reference=/d2/data/references/build_37/human_reference_v37.fa\n+##phasing=none\n+##commandline="/share/software/freebayes/bin/freebayes --stdin --no-filters --min-alternate-count 2 --posterior-integration-depth 100 --pvar 0.01 --vcf /d1/data/1000G/20101123/filteringTests/AFR/mosaik/noFilters/Pipeline/none//freebayes/freebayes.20:0-1000000.baq.20110210.vcf --fasta-reference /d2/data/references/build_37/human_reference_v37.fa --region 20:0..1000000"\n+##vcfPytools=merge freebayes.20:0-1000000.baq.20110210.vcf, freebayes.20:1000000-2000000.baq.20110210.vcf, freebayes.20:2000000-3000000.baq.20110210.vcf, freebayes.20:3000000-4000000.baq.20110210.vcf, freebayes.20:4000000-5000000.baq.20110210.vcf, freebayes.20:5000000-6000000.baq.20110210.vcf, freebayes.20:6000000-7000000.baq.20110210.vcf, freebayes.20:7000000-8000000.baq.20110210.vcf, freebayes.20:8000000-9000000.baq.20110210.vcf, freebayes.20:9000000-10000000.baq.20110210.vcf, freebayes.20:10000000-11000000.baq.20110210.vcf, freebayes.20:11000000-12000000.baq.20110210.vcf, freebayes.20:12000000-13000000.baq.20110210.vcf, freebayes.20:13000000-14000000.baq.20110210.vcf, freebayes.20:14000000-15000000.baq.20110210.vcf, freebayes.20:15000000-16000000.baq.20110210.vcf, freebayes.20:16000000-17000000.baq.20110210.vcf, freebayes.20:17000000-18000000.baq.20110210.vcf, freebayes.20:18000000-19000000.baq.20110210.vcf, freebayes.20:19000000-20000000.baq.20110210.vcf, freebayes.20:20000000-21000000.baq.20110210.vcf, freebayes.20:21000000-22000000.baq.20110210.vcf, freebayes.20:22000000-23000000.baq.20110210.vcf, freebayes.20:23000000-24000000.baq.20110210.vcf, freebayes.20:24000000-25000000.baq.20110210.vcf, freebayes.20:25000000-26000000.baq.20110210.vcf, freebayes.20:26000000-27000000.baq.20110210.vcf, freebayes.20:27000000-28000000.baq.20110210.vcf, freebayes.20:28000000-29000000.baq.20110210.vcf, freebayes.20:29000000-30000000.baq.20110210.vcf, freebayes.20:30000000-31000000.baq.20110210.vcf, freebayes.20:31000000-32000000.baq.20110210.vcf, freebayes.20:32000000-33000000.baq.20110210.vcf, freebayes.20:33000000-34000000.baq.20110210.vcf, freebayes.20:34000000-35000000.baq.20110210.vcf, freebayes.20:35000000-36000000.baq.20110210.vcf, freebayes.20:36000000-37000000.baq.20110210.vcf, freebayes.20:37000000-38000000.baq.20110210.vcf, freebayes.20:38000000-39000000.baq.20110210.vcf, freebayes.20:39000000-40000000.baq.20110210.vcf, freebayes.20:40000000-41000000.baq.20110210.vcf, freebayes.20:41000000-42000000.baq.20110210.vcf, freebayes.20:42000000-43000000.baq.20110210.vcf, freebayes.20:43000000-44000000.baq.20110210.vcf, freebayes.20:44000000-45000000.baq.20110210.vcf, freebayes.20:45000000-46000000.baq.20110210.vcf, freebayes.20:46000000-47000000.baq.20110210.vcf, freebayes.20:47000000-48000000.baq.20110210.vcf, freebayes.20:48000000-49000000.baq.20110210.vcf, freebayes.20:49000000-50000000.baq.20110210.vcf, freebayes.20:50000000-51000000.baq.20110210.vcf, freebayes.20:51000000-52000000.baq.20110210.vcf, freebayes.20:52000000-53000000.baq.20110210.vcf, freebayes.20:53000000-54000000.baq.20110210.vcf, freebayes.20:54000000-55000000.baq.20110210.vcf, freebayes.20:55000000-56000000.baq.20110210.vcf, freebayes.20:56000000-57000000.baq.20110210.vcf, freebayes.20:57000000-58000000.baq.20110210.vcf, freebayes.20:58000000-59000000.baq.20110210.vcf, freebayes.20:59000000-60000000.baq.20110210.vcf, freebayes.20:60000000-61000000.baq.20110210.vcf, freebayes.20:61000000-62000000.baq.20110210.vcf, freebayes.20:62000000-63000000.baq.20110210.vcf, freebayes.20:63000000-63025520.baq.20110210.vcf\n+##vcfPytools=\n+##vcfPytools=filtered using the following filters: NS360\n+##INFO=<ID=REPEAT,Number=1,Type=String,Description="Description of the local repeat structures flanking the current position">\n+##INFO=<ID=DEL,Number=1,Type=Integer,Description="Length of deletion allele, if present">\n+##INFO=<ID=INS,Number=1,Typ'..b'023;SA=1|2;SB=0.33333;AB=0.75;ABR=6;ABA=2;RUN=2;SNP;TV\n+20\t124059\t.\tC\tT\t37.8\tPASS\tNS=358;DP=1984;AC=3;AN=716;AF=0.0041899;RA=1971;AA=7;SR=971|1000;SA=5|2;SB=0.71429;AB=0.44444;ABR=4;ABA=5;RUN=1;SNP;TS;CpG\n+20\t124060\t.\tG\tA\t34.6\tPASS\tNS=359;DP=1991;AC=1;AN=718;AF=0.0013928;RA=1983;AA=3;SR=995|988;SA=2|1;SB=0.66667;AB=0.5;ABR=3;ABA=3;RUN=2;SNP;TS;CpG\n+20\t124137\t.\tC\tA\t0.0701\tQ9\tNS=357;DP=1933;AC=1;AN=714;AF=0.0014006;RA=1922;AA=4;SR=990|932;SA=0|4;SB=0;AB=0.5;ABR=1;ABA=1;RUN=2;SNP;TV\n+20\t124148\t.\tA\tG\t0.344\tQ9\tNS=358;DP=1957;AC=2;AN=716;AF=0.0027933;RA=1947;AA=6;SR=989|958;SA=1|5;SB=0.16667;AB=0.8;ABR=12;ABA=3;RUN=1;SNP;TS\n+20\t124229\t.\tT\tC\t0.672\tQ9\tNS=356;DP=1979;AC=1;AN=712;AF=0.0014045;RA=1972;AA=4;SR=1025|947;SA=1|3;SB=0.25;AB=0.8;ABR=8;ABA=2;RUN=1;SNP;TS\n+20\t124230\t.\tA\tG\t0.0816\tQ9\tNS=356;DP=1970;AC=1;AN=712;AF=0.0014045;RA=1959;AA=4;SR=1026|933;SA=1|3;SB=0.25;AB=0.81818;ABR=9;ABA=2;RUN=1;SNP;TS\n+20\t124249\t.\tA\tG\t99.0\tPASS\tNS=359;DP=1969;AC=41;AN=718;AF=0.057103;RA=1864;AA=101;SR=974|890;SA=55|46;SB=0.54455;AB=0.51977;ABR=92;ABA=85;RUN=1;SNP;TS;CpG\n+20\t124302\t.\tA\tC\t99.0\tPASS\tNS=358;DP=2007;AC=319;AN=716;AF=0.44553;RA=1185;AA=819;SR=577|608;SA=392|427;SB=0.47863;AB=0.51934;ABR=376;ABA=348;RUN=2;SNP;TV\n+20\t124335\t.\tC\tT\t99.0\tNS360\tNS=361;DP=2035;AC=4;AN=722;AF=0.0055402;RA=2013;AA=15;SR=929|1084;SA=10|5;SB=0.66667;AB=0.3913;ABR=9;ABA=14;RUN=1;SNP;TS\n+20\t124412\t.\tC\tT\t99.0\tPASS\tNS=357;DP=2005;AC=307;AN=714;AF=0.42997;RA=1207;AA=795;SR=602|605;SA=400|395;SB=0.50314;AB=0.51286;ABR=379;ABA=360;RUN=1;SNP;TS\n+20\t124511\t.\tT\tA\t0.647\tQ9;NS360\tNS=365;DP=2141;AC=1;AN=730;AF=0.0013699;RA=2133;AA=4;SR=1077|1056;SA=2|2;SB=0.5;AB=0.66667;ABR=4;ABA=2;RUN=2;SNP;TV\n+20\t124607\t.\tA\tG\t2.28\tQ9\tNS=357;DP=1872;AC=1;AN=714;AF=0.0014006;RA=1865;AA=2;SR=930|935;SA=1|1;SB=0.5;AB=0.75;ABR=6;ABA=2;RUN=1;SNP;TS\n+20\t124737\t.\tA\tC\t99.0\tNS360\tNS=367;DP=2024;AC=4;AN=734;AF=0.0054496;RA=2003;AA=16;SR=1046|957;SA=11|5;SB=0.6875;AB=0.26316;ABR=5;ABA=14;RUN=1;SNP;TV\n+20\t124827\t.\tT\tC\t1.7\tQ9;NS360\tNS=367;DP=2059;AC=1;AN=734;AF=0.0013624;RA=2048;AA=4;SR=987|1061;SA=2|2;SB=0.5;AB=0.6;ABR=3;ABA=2;RUN=2;SNP;TS\n+20\t124848\t.\tA\tG\t36.5\tNS360\tNS=364;DP=2077;AC=1;AN=728;AF=0.0013736;RA=2060;AA=6;SR=966|1094;SA=2|4;SB=0.33333;AB=0.57143;ABR=4;ABA=3;RUN=1;SNP;TS\n+20\t124878\t.\tG\tA\t0.22\tQ9;NS360\tNS=361;DP=2060;AC=1;AN=722;AF=0.001385;RA=2049;AA=4;SR=974|1075;SA=3|1;SB=0.75;AB=0.77778;ABR=7;ABA=2;RUN=1;SNP;TS\n+20\t124920\t.\tA\tG\t76.0\tNS360\tNS=361;DP=1956;AC=1;AN=722;AF=0.001385;RA=1949;AA=6;SR=984|965;SA=2|4;SB=0.33333;AB=0.14286;ABR=1;ABA=6;RUN=1;SNP;TS;CpG\n+20\t124942\t.\tA\tT\t1.44\tQ9;NS360\tNS=362;DP=2015;AC=1;AN=724;AF=0.0013812;RA=2007;AA=4;SR=1107|900;SA=1|3;SB=0.25;AB=0.6;ABR=3;ABA=2;RUN=1;SNP;TV\n+20\t124998\t.\tT\tC\t99.0\tNS360\tNS=366;DP=2204;AC=29;AN=732;AF=0.039617;RA=2131;AA=66;SR=1132|999;SA=36|30;SB=0.54545;AB=0.54902;ABR=56;ABA=46;RUN=1;SNP;TS\n+20\t125096\t.\tC\tG\t4.18\tQ9;NS360\tNS=363;DP=2095;AC=1;AN=726;AF=0.0013774;RA=2082;AA=2;SR=1071|1011;SA=2|0;SB=1;AB=0.75;ABR=6;ABA=2;RUN=1;SNP;TV\n+20\t125109\t.\tA\tT\t0.0858\tQ9;NS360\tNS=365;DP=2091;AC=2;AN=730;AF=0.0027397;RA=2080;AA=7;SR=1103|977;SA=7|0;SB=1;AB=0.66667;ABR=6;ABA=3;RUN=8;SNP;TV\n+20\t125121\t.\tC\tT\t99.0\tNS360\tNS=364;DP=2075;AC=208;AN=728;AF=0.28571;RA=1472;AA=599;SR=826|646;SA=335|264;SB=0.55927;AB=0.49868;ABR=377;ABA=378;RUN=1;SNP;TS\n+20\t125179\t.\tC\tA\t99.0\tNS360\tNS=368;DP=2155;AC=7;AN=736;AF=0.0095109;RA=2127;AA=26;SR=1123|1004;SA=16|10;SB=0.61538;AB=0.36842;ABR=14;ABA=24;RUN=2;SNP;TV\n+20\t125266\t.\tT\tA\t3.47\tQ9;NS360\tNS=364;DP=2138;AC=1;AN=728;AF=0.0013736;RA=2123;AA=5;SR=907|1216;SA=1|4;SB=0.2;AB=0.6;ABR=3;ABA=2;RUN=1;SNP;TV\n+20\t125381\t.\tA\tG\t4.69\tQ9;NS360\tNS=362;DP=2102;AC=1;AN=724;AF=0.0013812;RA=2094;AA=3;SR=919|1175;SA=1|2;SB=0.33333;AB=0.66667;ABR=4;ABA=2;RUN=1;SNP;TS\n+20\t125450\t.\tG\tA\t0.364\tQ9\tNS=359;DP=1881;AC=1;AN=718;AF=0.0013928;RA=1868;AA=5;SR=862|1006;SA=1|4;SB=0.2;AB=0.71429;ABR=5;ABA=2;RUN=2;SNP;TS\n+20\t125470\t.\tA\tG\t0.602\tQ9\tNS=360;DP=1870;AC=1;AN=720;AF=0.0013889;RA=1855;AA=5;SR=909|946;SA=3|2;SB=0.6;AB=0.75;ABR=6;ABA=2;RUN=1;SNP;TS\n' |
b |
diff -r 000000000000 -r da1a6f33b504 tools.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools.py Mon Jan 27 09:29:09 2014 -0500 |
[ |
b'@@ -0,0 +1,188 @@\n+#!/usr/bin/python\n+\n+import os.path\n+import sys\n+import vcfPytools\n+from vcfPytools import __version__\n+\n+# Determine whether to output to a file or stdout.\n+def setOutput(output):\n+ if output == None:\n+ outputFile = sys.stdout\n+ writeOut = False\n+ else:\n+ output = os.path.abspath(output)\n+ outputFile = open(output, \'w\')\n+ writeOut = True\n+\n+ return outputFile, writeOut\n+\n+# Determine which file has priority for writing out records.\n+def setVcfPriority(priorityFile, vcfFiles):\n+ if priorityFile == None: priority = 0\n+ elif priorityFile == vcfFiles[0]: priority = 1\n+ elif priorityFile == vcfFiles[1]: priority = 2\n+ elif priorityFile.lower() == "merge": priority = 3\n+ else:\n+ print >> sys.stderr, "vcf file give priority must be one of the two input vcf files or merge."\n+ exit(1)\n+\n+ return priority\n+\n+# If the union or intersection of two vcf files is being performed\n+# and the output vcf file is to contain the information from both\n+# files, the headers need to be merged to ensure that all info and\n+# format entries have an explanation.\n+def mergeHeaders(v1, v2, v3):\n+\n+# If either file does not have a header, terminate the program.\n+# In order to merge the headers, the different fields must be\n+# checked to ensure the files are compatible.\n+ if not v1.hasHeader or not v2.hasHeader:\n+ print >> sys.stderr, "Both vcf files must have a header in order to merge data sets."\n+ exit(1)\n+\n+ v3.infoHeaderTags = v1.infoHeaderTags.copy()\n+ v3.formatHeaderTags = v1.formatHeaderTags.copy()\n+ v3.numberDataSets = v1.numberDataSets\n+ v3.includedDataSets = v1.includedDataSets.copy()\n+ v3.headerText = v1.headerText\n+ v3.headerTitles = v1.headerTitles\n+ v3.infoHeaderString = v1.infoHeaderString.copy()\n+ v3.formatHeaderString = v1.formatHeaderString.copy()\n+\n+# Merge the info field descriptions.\n+ for tag in v2.infoHeaderTags:\n+ if v1.infoHeaderTags.has_key(tag):\n+ if v1.infoHeaderTags[tag][0] != v2.infoHeaderTags[tag][0] or \\\n+ v1.infoHeaderTags[tag][1] != v2.infoHeaderTags[tag][1]:\n+ print v1.infoHeaderTags[tag][0]\n+ print v1.infoHeaderTags[tag][1]\n+ print v1.infoHeaderTags[tag][2]\n+ print >> sys.stderr, "Input vcf files have different definitions for " + tag + " field."\n+ exit(1)\n+ else: v3.infoHeaderTags[tag] = v2.infoHeaderTags[tag]\n+\n+# Merge the format field descriptions.\n+ for tag in v2.formatHeaderTags:\n+ if v1.formatHeaderTags.has_key(tag):\n+ if v1.formatHeaderTags[tag][0] != v2.formatHeaderTags[tag][0] or \\\n+ v1.formatHeaderTags[tag][1] != v2.formatHeaderTags[tag][1]:\n+ print >> sys.stderr, "Input vcf files have different definitions for " + tag + " field."\n+ exit(1)\n+ else: v3.formatHeaderTags[tag] = v2.formatHeaderTags[tag]\n+\n+# Now check to see if the vcf files contain information from multiple\n+# records themselves and create an ordered list in which the data\n+# will appear in the file. For instance, of the first file has\n+# already got two sets of data and is being intersected with a file\n+# with one set of data, the order of data in the new vcf file will be\n+# the two sets from the first file followed by the second, e.g.\n+# AB=3/2/4, where the 3 and 2 are from the first file and the 4 is the\n+# value of AC from the second vcf. The header will have a ##FILE for\n+# each of the three files, so the origin if the data can be recovered.\n+ if v1.numberDataSets == 0:\n+ v3.includedDataSets[v3.numberDataSets + 1] = v1.filename\n+ v3.numberDataSets += 1\n+ if v2.numberDataSets == 0:\n+ v3.includedDataSets[v3.numberDataSets + 1] = v2.filename\n+ v3.numberDataSets += 1\n+ else:\n+ for i in range(1, v2.numberDataSets + 1):\n+ v3.includedDataSets[v3.numberDataSets + 1] = v2.includedDataSets[i]\n+ v3.numberDataSets += 1\n+\n+# If either of the input files contain multiple data sets (e.g. multiple\n+# vcf files have undergone intersection or union calculations and all\n+# inform'..b'OR:"\n+ print >> sys.stderr, "input vcf file(s) contain data sets from multiple vcf files."\n+ print >> sys.stderr, "Further intersection or union operations must include --priority-file merge"\n+ print >> sys.stderr, "Other tools may be incompatible with this format."\n+ exit(1)\n+\n+# Write the header to file.\n+def writeHeader (outputFile, v, removeGenotypes, taskDescriptor):\n+ if not v.hasHeader: \n+ v.headerText = "##fileformat=VCFv4.0\\n##source=vcfPytools " + __version__ + "\\n"\n+ v.headerTitles = "#CHROM\\tPOS\\tID\\tREF\\tALT\\tQUAL\\tFILTER\\tINFO\\n"\n+ outputFile.write(v.headerText) if v.headerText != "" else None\n+ print >> outputFile, taskDescriptor\n+ for tag in v.infoHeaderString: print >> outputFile, v.infoHeaderString[tag]\n+ for tag in v.formatHeaderString: print >> outputFile, v.formatHeaderString[tag]\n+\n+# Write out a list of files indicating which data set belongs to which file.\n+ if v.numberDataSets != 0:\n+ for i in range(1, v.numberDataSets + 1):\n+ print >> outputFile, "##FILE=<ID=" + str(i) + ",\\"" + v.includedDataSets[i] + "\\">"\n+\n+ if removeGenotypes:\n+ line = v.headerTitles.rstrip("\\n").split("\\t")\n+ newHeaderTitles = line[0]\n+ for i in range(1,8):\n+ newHeaderTitles = newHeaderTitles + "\\t" + line[i]\n+ newHeaderTitles = newHeaderTitles + "\\n"\n+ outputFile.write( newHeaderTitles )\n+ else:\n+ outputFile.write( v.headerTitles )\n+\n+# Check that the two reference sequence lists are identical.\n+# If there are a different number or order, the results may\n+# not be as expected.\n+def checkReferenceSequenceLists(list1, list2):\n+ errorMessage = False\n+ if len(list1) != len(list2):\n+ print >> sys.stderr, "WARNING: Input files contain a different number of reference sequences."\n+ errorMessage = True\n+ elif list1 != list2:\n+ print >> sys.stderr, "WARNING: Input files contain different or differently ordered reference sequences."\n+ errorMessage = True\n+ if errorMessage:\n+ print >> sys.stderr, "Results may not be as expected."\n+ print >> sys.stderr, "Ensure that input files have the same reference sequences in the same order."\n+ print >> sys.stderr, "Reference sequence lists observed were:\\n\\t", list1, "\\n\\t", list2\n+\n+# Write out a vcf record to file. The record written depends on the\n+# value of \'priority\' and could therefore be the record from either\n+# of the vcf files, or a combination of them.\n+\n+def writeVcfRecord(priority, v1, v2, outputFile):\n+ if priority == 0:\n+ if v1.quality >= v2.quality: outputFile.write(v1.record)\n+ else: outputFile.write(v2.record)\n+ elif priority == 1: outputFile.write(v1.record)\n+ elif priority == 2: outputFile.write(v2.record)\n+ elif priority == 3:\n+\n+# Define the missing entry values (depends on the number of data sets\n+# in the file).\n+ info = ""\n+ missingEntry1 = missingEntry2 = "."\n+ for i in range(1, v1.numberDataSets): missingEntry1 += "/."\n+ for i in range(1, v2.numberDataSets): missingEntry2 += "/."\n+ secondList = v2.infoTags.copy()\n+\n+# Build up the info field.\n+ for tag in v1.infoTags:\n+ if secondList.has_key(tag):\n+ if v1.infoHeaderTags[tag][1].lower() != "flag": info += tag + "=" + v1.infoTags[tag] + "/" + v2.infoTags[tag] + ";"\n+ del secondList[tag]\n+ else: \n+ if v1.infoHeaderTags[tag][1].lower() != "flag": info += tag + "=" + v1.infoTags[tag] + "/" + missingEntry2 + ";"\n+\n+# Now include the info tags that are not populated in the first vcf file.\n+ for tag in secondList:\n+ if v2.infoHeaderTags[tag][1].lower() != "flag": info += tag + "=" + missingEntry1 + "/" + v2.infoTags[tag] + ";"\n+\n+# Build the complete record.\n+ info = info.rstrip(";")\n+ record = v1.referenceSequence + "\\t" + str(v1.position) + "\\t" + v1.rsid + "\\t" + v1.ref + "\\t" + \\\n+ v1.alt + "/" + v2.alt + "\\t" + v1.quality + "/" + v2.quality + "\\t.\\t" + info\n+ print >> outputFile, record\n+ else:\n+ print >> sys.sterr, "Unknown file priority."\n+ exit(1)\n' |
b |
diff -r 000000000000 -r da1a6f33b504 vcfClass.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/vcfClass.py Mon Jan 27 09:29:09 2014 -0500 |
[ |
b'@@ -0,0 +1,440 @@\n+#!/usr/bin/python\n+\n+import os.path\n+import sys\n+import re\n+\n+class vcf:\n+ def __init__(self):\n+\n+# Header info.\n+ self.filename = ""\n+ self.hasHeader = True\n+ self.headerText = ""\n+ self.headerTitles = ""\n+ self.vcfFormat = ""\n+ #self.headerInfoText = ""\n+ #self.headerFormatText = ""\n+\n+# Store the info and format tags as well as the lines that describe\n+# them in a dictionary.\n+ self.numberDataSets = 0\n+ self.includedDataSets = {}\n+ self.infoHeaderTags = {}\n+ self.infoHeaderString = {}\n+ self.formatHeaderTags = {}\n+ self.formatHeaderString = {}\n+\n+# Genotype information.\n+ self.genotypes = False\n+ self.infoField = {}\n+\n+# Reference sequence information.\n+ self.referenceSequences = {}\n+ self.referenceSequenceList = []\n+ self.referenceSequence = ""\n+\n+# Record information.\n+ self.position = -1\n+ self.samplesList = []\n+\n+# Determine which fields to process.\n+ self.processInfo = False\n+ self.processGenotypes = False\n+ self.dbsnpVcf = False\n+ self.hapmapVcf = False\n+\n+# Open a vcf file.\n+ def openVcf(self, filename):\n+ if filename == "stdin":\n+ self.filehandle = sys.stdin\n+ self.filename = "stdin"\n+ else:\n+ try: self.filehandle = open(filename,"r")\n+ except IOError:\n+ print >> sys.stderr, "Failed to find file: ",filename\n+ exit(1)\n+ self.filename = os.path.abspath(filename)\n+\n+# Parse the vcf header.\n+ def parseHeader(self, filename, writeOut):\n+ while self.getHeaderLine(filename, writeOut):\n+ continue\n+\n+# Determine the type of information in the header line.\n+ def getHeaderLine(self, filename, writeOut):\n+ self.headerLine = self.filehandle.readline().rstrip("\\n")\n+ if self.headerLine.startswith("##fileformat"): success = self.getvcfFormat()\n+ if self.headerLine.startswith("##INFO"): success = self.headerInfo(writeOut, "info")\n+ elif self.headerLine.startswith("##FORMAT"): success = self.headerInfo(writeOut, "format")\n+ elif self.headerLine.startswith("##FILE"): success = self.headerFiles(writeOut)\n+ elif self.headerLine.startswith("##"): success = self.headerAdditional()\n+ elif self.headerLine.startswith("#"): success = self.headerTitleString(filename, writeOut)\n+ else: success = self.noHeader(filename, writeOut)\n+\n+ return success\n+\n+# Read VCF format\n+ def getvcfFormat(self):\n+ try:\n+ self.vcfFormat = self.headerLine.split("=",1)[1]\n+ self.vcfFormat = float( self.vcfFormat.split("VCFv",1)[1] )## Extract the version number rather than the whole string\n+ except IndexError:\n+ print >> sys.stderr, "\\nError parsing the fileformat"\n+ print >> sys.stderr, "The following fileformat header is wrongly formatted: ", self.headerLine\n+ exit(1)\n+ return True\n+\n+\n+# Read information on an info field from the header line.\n+ def headerInfo(self, writeOut, lineType):\n+ tag = self.headerLine.split("=",1)\n+ tagID = (tag[1].split("ID=",1))[1].split(",",1)\n+\n+# Check if this info field has already been defined.\n+ if (lineType == "info" and self.infoHeaderTags.has_key(tagID[0])) or (lineType == "format" and self.formatHeaderTags.has_key(tagID[0])):\n+ print >> sys.stderr, "Info tag \\"", tagID[0], "\\" is defined multiple times in the header."\n+ exit(1)\n+\n+# Determine the number of entries, entry type and description.\n+ tagNumber = (tagID[1].split("Number=",1))[1].split(",",1)\n+ tagType = (tagNumber[1].split("Type=",1))[1].split(",",1)\n+ try: tagDescription = ( ( (tagType[1].split("Description=\\"",1))[1] ).split("\\">") )[0]\n+ except IndexError: tagDescription = ""\n+ tagID = tagID[0]; tagNumber = tagNumber[0]; tagType = tagType[0]\n+\n+# Check that the number of fields associated with the tag is either\n+# an integer or a \'.\' to indicate variable number of entries.\n+ if tagNumber == ".": tagNumber = "variable"\n+ else:\n+ if self.vcfFormat<4.1:\n+\n+ try:\n+ tagNum'..b'lues != 0 and infoType == "Flag":\n+ print >> sys.stderr, "ERROR"\n+ exit(1)\n+ else:\n+ fields = self.infoTags[tag].split(",")\n+ if len(fields) != numberValues:\n+ text = "Unexpected number of entries"\n+ self.generalError(text, "information tag", tag)\n+\n+ for i in range(infoNumber):\n+ try: result.append(fields[i])\n+ except IndexError:\n+ text = "Insufficient values. Expected: " + self.infoHeaderTags[tag][0]\n+ self.generalError(text, "tag:", tag)\n+ else: numberValues = 0\n+\n+ else:\n+ text = "information field does not have a definition in the header"\n+ self.generalError(text, "tag", tag)\n+\n+ return numberValues, infoType, result\n+\n+# Get the genotype information.\n+ def getGenotypeInfo(self, sample, tag):\n+ result = []\n+ if self.formatHeaderTags.has_key(tag):\n+ infoNumber = self.formatHeaderTags[tag][0]\n+ infoType = self.formatHeaderTags[tag][1]\n+ numberValues = infoNumber\n+\n+ if self.genotypeFields[sample] == "." and len(self.genotypeFields[sample]) == 1:\n+ numberValues = 0\n+ result = "."\n+ else:\n+ if self.genotypeFields[sample].has_key(tag):\n+ if tag == "GT":\n+ if len(self.genotypeFields[sample][tag]) != 3 and len(self.genotypeFields[sample][tag]) != 1:\n+ text = "Unexected number of characters in genotype (GT) field"\n+ self.generalError(text, "sample", sample)\n+\n+# If a diploid call, check whether or not the genotype is phased.\n+ elif len(self.genotypeFields[sample][tag]) == 3:\n+ self.phased = True if self.genotypeFields[sample][tag][1] == "|" else False\n+ result.append( self.genotypeFields[sample][tag][0] )\n+ result.append( self.genotypeFields[sample][tag][2] )\n+ elif len(self.genotypeFields[sample][tag]) == 3:\n+ result.append( self.genotypeFields[sample][tag][0] )\n+ else:\n+ fields = self.genotypeFields[sample][tag].split(",")\n+ if len(fields) != numberValues:\n+ text = "Unexpected number of characters in " + tag + " field"\n+ self.generalError(text, "sample", sample)\n+\n+ for i in range(infoNumber): result.append(fields[i])\n+ else:\n+ text = "genotype field does not have a definition in the header"\n+ self.generalError(text, "tag", tag)\n+\n+ return numberValues, result\n+\n+# Parse the dbsnp entry. If the entry conforms to the required variant type,\n+# return the dbsnp rsid value, otherwise ".".\n+ def getDbsnpInfo(self):\n+\n+# First check that the variant class (VC) is listed as SNP.\n+ vc = self.info.split("VC=",1)\n+ if vc[1].find(";") != -1: snp = vc[1].split(";",1)\n+ else:\n+ snp = []\n+ snp.append(vc[1])\n+\n+ if snp[0].lower() == "snp": rsid = self.rsid\n+ else: rsid = "."\n+\n+ return rsid\n+\n+# Build a new vcf record.\n+ def buildRecord(self, removeGenotypes):\n+ record = self.referenceSequence + "\\t" + \\\n+ str(self.position) + "\\t" + \\\n+ self.rsid + "\\t" + \\\n+ self.ref + "\\t" + \\\n+ self.alt + "\\t" + \\\n+ str(self.quality) + "\\t" + \\\n+ self.filters + "\\t" + \\\n+ self.info\n+\n+ if self.hasGenotypes and not removeGenotypes: record += self.genotypeString\n+\n+ record += "\\n"\n+\n+ return record\n+\n+# Close the vcf file.\n+ def closeVcf(self, filename):\n+ self.filehandle.close()\n+\n+# Define error messages for different handled errors.\n+ def generalError(self, text, field, fieldValue):\n+ print >> sys.stderr, "\\nError encountered when attempting to read:"\n+ print >> sys.stderr, "\\treference sequence :\\t", self.referenceSequence\n+ print >> sys.stderr, "\\tposition :\\t\\t", self.position\n+ if field != "": print >> sys.stderr, "\\t", field, ":\\t", fieldValue\n+ print >> sys.stderr, "\\n", text\n+ exit(1)\n' |
b |
diff -r 000000000000 -r da1a6f33b504 vcfPytools.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/vcfPytools.py Mon Jan 27 09:29:09 2014 -0500 |
[ |
@@ -0,0 +1,82 @@ +#!/usr/bin/python + +import os.path +import sys + +__author__ = "alistair ward" +__version__ = "version 0.26" +__date__ = "february 2011" + +def main(): + usage = "Usage: vcfPytools.py [tool] [options]\n\n" + \ + "Available tools:\n" + \ + " annotate:\n\tAnnotate the vcf file with membership in other vcf files.\n" + \ + " extract:\n\tExtract vcf records from a region.\n" + \ + " filter:\n\tFilter the vcf file.\n" + \ + " intersect:\n\tGenerate the intersection of two vcf files.\n" + \ + " merge:\n\tMerge a list of vcf files.\n" + \ + " multi:\n\tFind the intersections and unique fractions of multiple vcf files.\n" + \ + " sort:\n\tSort a vcf file.\n" + \ + " stats:\n\tGenerate statistics from a vcf file.\n" + \ + " union:\n\tGenerate the union of two vcf files.\n" + \ + " unique:\n\tGenerate the unique fraction from two vcf files.\n" + \ + " validate:\n\tValidate the input vcf file.\n\n" + \ + "vcfPytools.py [tool] --help for information on a specific tool." + +# Determine the requested tool. + + if len(sys.argv) > 1: + tool = sys.argv[1] + else: + print >> sys.stderr, usage + exit(1) + + if tool == "annotate": + import annotate + success = annotate.main() + elif tool == "extract": + import extract + success = extract.main() + elif tool == "filter": + import filter + success = filter.main() + elif tool == "intersect": + import intersect + success = intersect.main() + elif tool == "multi": + import multi + success = multi.main() + elif tool == "merge": + import merge + success = merge.main() + elif tool == "sort": + import sort + success = sort.main() + elif tool == "stats": + import stats + success = stats.main() + elif tool == "union": + import union + success = union.main() + elif tool == "unique": + import unique + success = unique.main() + elif tool == "test": + import test + success = test.main() + elif tool == "validate": + import validate + success = validate.main() + elif tool == "--help" or tool == "-h" or tool == "?": + print >> sys.stderr, usage + else: + print >> sys.stderr, "Unknown tool: ",tool + print >> sys.stderr, "\n", usage + exit(1) + +# If program completed properly, terminate. + + if success == 0: exit(0) + +if __name__ == "__main__": + main() |