Mercurial > repos > nick > allele_counts
changeset 5:31361191d2d2
Uploaded tarball.
Version 1.1: Stranded output, slightly different handling of minor allele ties and 0 coverage sites, revised help text, added test datasets.
author | nick |
---|---|
date | Thu, 12 Sep 2013 11:34:23 -0400 |
parents | 898eb3daab43 |
children | df3b28364cd2 |
files | allele-counts.py allele-counts.xml tests/artificial-nofilt.csv.out tests/artificial-nofilt.vcf.in tests/artificial-samples.csv.out tests/artificial-samples.vcf.in tests/artificial.csv.out tests/artificial.vcf.in tests/compute-site.py tests/errors.vcf.in tests/real-mit-s.csv.out tests/real-mit-s.vcf.in tests/real-mit.csv.out tests/real-mit.vcf.in tests/real-nofilt.csv.out tests/real-nofilt.vcf.in tests/real.csv.out tests/real.vcf.in tests/run-tests.py |
diffstat | 19 files changed, 837 insertions(+), 136 deletions(-) [+] |
line wrap: on
line diff
--- a/allele-counts.py Tue Jun 04 00:16:29 2013 -0400 +++ b/allele-counts.py Thu Sep 12 11:34:23 2013 -0400 @@ -1,25 +1,25 @@ #!/usr/bin/python # This parses the output of Dan's "Naive Variant Detector" (previously, # "BAM Coverage"). It was forked from the code of "bam-coverage.py". +# Or, to put it briefly, +# cat variants.vcf | grep -v '^#' | cut -f 10 | cut -d ':' -f 4 | tr ',=' '\t:' # # New in this version: # Made header line customizable # - separate from internal column labels, which are used as dict keys -# -# TODO: -# - test handling of -c 0 (and -f 0?) -# - should it technically handle data lines that start with a '#'? import os import sys +import random from optparse import OptionParser COLUMNS = ['sample', 'chr', 'pos', 'A', 'C', 'G', 'T', 'coverage', 'alleles', 'major', 'minor', 'freq'] #, 'bias'] COLUMN_LABELS = ['SAMPLE', 'CHR', 'POS', 'A', 'C', 'G', 'T', 'CVRG', 'ALLELES', 'MAJOR', 'MINOR', 'MINOR.FREQ.PERC.'] #, 'STRAND.BIAS'] CANONICAL_VARIANTS = ['A', 'C', 'G', 'T'] -USAGE = """Usage: cat variants.vcf | %prog [options] > alleles.csv - %prog [options] -i variants.vcf -o alleles.csv""" +USAGE = """Usage: %prog [options] -i variants.vcf -o alleles.csv + cat variants.vcf | %prog [options] > alleles.csv""" OPT_DEFAULTS = {'infile':'-', 'outfile':'-', 'freq_thres':1.0, 'covg_thres':100, - 'print_header':False, 'stdin':False} + 'print_header':False, 'stdin':False, 'stranded':False, 'no_filter':False, + 'debug_loc':'', 'seed':''} DESCRIPTION = """This will parse the VCF output of Dan's "Naive Variant Caller" (aka "BAM Coverage") Galaxy tool. For each position reported, it counts the number of reads of each base, determines the major allele, minor allele (second most frequent variant), and number of alleles above a threshold. So currently it only considers SNVs (ACGT), including in the coverage figure. By default it reads from stdin and prints to stdout.""" EPILOG = """Requirements: The input VCF must report the variants for each strand. @@ -40,27 +40,35 @@ help='Print output data to this file instead of stdout.') parser.add_option('-f', '--freq-thres', dest='freq_thres', type='float', default=defaults.get('freq_thres'), - help='Frequency threshold for counting alleles, given in percentage: -f 1 = 1% frequency. Default is %default%.') + help=('Frequency threshold for counting alleles, given in percentage: -f 1 ' + +'= 1% frequency. Default is %default%.')) parser.add_option('-c', '--covg-thres', dest='covg_thres', type='int', default=defaults.get('covg_thres'), - help='Coverage threshold. Each site must be supported by at least this many reads on each strand. Otherwise the site will not be printed in the output. The default is %default reads per strand.') + help=('Coverage threshold. Each site must be supported by at least this ' + +'many reads on each strand. Otherwise the site will not be printed in ' + +'the output. The default is %default reads per strand.')) + parser.add_option('-n', '--no-filter', dest='no_filter', action='store_const', + const=not(defaults.get('no_filter')), default=defaults.get('no_filter'), + help=('Operate without a frequency threshold or coverage threshold. ' + +'Equivalent to "-c 0 -f 0".')) parser.add_option('-H', '--header', dest='print_header', action='store_const', const=not(defaults.get('print_header')), default=defaults.get('print_header'), - help='Print header line. This is a #-commented line with the column labels. Off by default.') - parser.add_option('-d', '--debug', dest='debug', action='store_true', - default=False, - help='Turn on debug mode. You must also specify a single site to process in a final argument using UCSC coordinate format.') + help=('Print header line. This is a #-commented line with the column ' + +'labels. Off by default.')) + parser.add_option('-s', '--stranded', dest='stranded', action='store_const', + const=not(defaults.get('stranded')), default=defaults.get('stranded'), + help='Report variant counts by strand, in separate columns. Off by default.') + parser.add_option('-r', '--rand-seed', dest='seed', + default=defaults.get('seed'), help=('Seed for random number generator.')) + parser.add_option('-d', '--debug', dest='debug_loc', + default=defaults.get('debug_loc'), + help=('Turn on debug mode and specify a single site to process using UCSC ' + +'coordinate format. You can also append a sample ID after another ":" ' + +'to restrict it further.')) (options, args) = parser.parse_args() - # read in positional arguments - arguments = {} - if options.debug: - if len(args) >= 1: - arguments['print_loc'] = args[0] - args.remove(args[0]) - - return (options, arguments) + return (options, args) def main(): @@ -72,19 +80,26 @@ print_header = options.print_header freq_thres = options.freq_thres / 100.0 covg_thres = options.covg_thres - debug = options.debug + stranded = options.stranded + debug_loc = options.debug_loc + seed = options.seed + + if options.no_filter: + freq_thres = 0 + covg_thres = 0 - if debug: - print_loc = args.get('print_loc') - if print_loc: - if ':' in print_loc: - (print_chr, print_pos) = print_loc.split(':') - else: - print_pos = print_loc - else: - sys.stderr.write("Warning: No site coordinate found in arguments. " - +"Turning off debug mode.\n") - debug = False + if seed: + random.seed(seed) + + debug = False + print_sample = '' + if debug_loc: + debug = True + coords = debug_loc.split(':') + print_chr = coords[0] + print_pos = '' + if len(coords) > 1: print_pos = coords[1] + if len(coords) > 2: print_sample = coords[2] # set infile_handle to either stdin or the input file if infile == OPT_DEFAULTS.get('infile'): @@ -105,12 +120,19 @@ except IOError, e: fail('Error: The given output filename '+outfile+' could not be opened.') + # Take care of column names, print header if len(COLUMNS) != len(COLUMN_LABELS): - fail('Error: Internal column names do not match column labels.') + fail('Error: Internal column names list do not match column labels list.') + if stranded: + COLUMNS[3:7] = ['+A', '+C', '+G', '+T', '-A', '-C', '-G', '-T'] + COLUMN_LABELS[3:7] = ['+A', '+C', '+G', '+T', '-A', '-C', '-G', '-T'] if print_header: outfile_handle.write('#'+'\t'.join(COLUMN_LABELS)+"\n") - # main loop: process and print one line at a time + # main loop + # each iteration processes one VCF line and prints one or more output lines + # one VCF line = one site, one or more samples + # one output line = one site, one sample sample_names = [] for line in infile_handle: line = line.rstrip('\r\n') @@ -128,19 +150,24 @@ site_data = read_site(line, sample_names, CANONICAL_VARIANTS) if debug: - if site_data['pos'] != print_pos: + if print_pos != site_data['pos']: + continue + if print_chr != site_data['chr'] and print_chr != '': continue - try: - if site_data['chr'] != print_chr: - continue - except NameError, e: - pass # No chr specified. Just go ahead and print the line. + if print_sample != '': + for sample in site_data['samples'].keys(): + if sample.lower() != print_sample.lower(): + site_data['samples'].pop(sample, None) + if len(site_data['samples']) == 0: + sys.stderr.write("Error: Sample '"+print_sample+"' not found.\n") + sys.exit(1) + site_summary = summarize_site(site_data, sample_names, CANONICAL_VARIANTS, - freq_thres, covg_thres, debug=debug) + freq_thres, covg_thres, stranded, debug=debug) if debug and site_summary[0]['print']: - print line.split('\t')[9].split(':')[-1] + print line.split('\t')[9].split(':')[-1] print_site(outfile_handle, site_summary, COLUMNS) @@ -158,8 +185,27 @@ def read_site(line, sample_names, canonical): """Read in a line, parse the variants into a data structure, and return it. The line should be actual site data, not a header line, so check beforehand. - Notes: - - The line is assumed to have been chomped.""" + Only the variants in 'canonical' will be read; all others are ignored. + Note: the line is assumed to have been chomped. + The returned data is stored in a dict, with the following structure: + { + 'chr': 'chr1', + 'pos': '2617', + 'samples': { + 'THYROID': { + '+A': 32, + '-A': 45, + '-G': 1, + }, + 'BLOOD': { + '+A': 2, + '-C': 1, + '+G': 37, + '-G': 42, + }, + }, + } + """ site = {} fields = line.split('\t') @@ -212,7 +258,7 @@ def summarize_site(site, sample_names, canonical, freq_thres, covg_thres, - debug=False): + stranded=False, debug=False): """Take the raw data from the VCF line and transform it into the summary data to be printed in the output format.""" @@ -221,9 +267,6 @@ sample = {'print':False} variants = site['samples'].get(sample_name) - if not variants: - site_summary.append(sample) - continue sample['sample'] = sample_name sample['chr'] = site['chr'] @@ -240,31 +283,49 @@ elif variant[0] == '-': covg_minus += variants[variant] # stranded coverage threshold - if coverage <= 0 or covg_plus < covg_thres or covg_minus < covg_thres: + if covg_plus < covg_thres or covg_minus < covg_thres: site_summary.append(sample) continue else: sample['print'] = True - # get an ordered list of read counts for all variants (either strand) - ranked_bases = get_read_counts(variants, 0, strands='+-', debug=debug) + # get an ordered list of read counts for all variants (both strands) + bases = get_read_counts(variants, '+-') + ranked_bases = process_read_counts(bases, sort=True, debug=debug) + + # prepare stranded or unstranded lists of base counts + base_count_lists = [] + if stranded: + strands = ['+', '-'] + base_count_lists.append(get_read_counts(variants, '+')) + base_count_lists.append(get_read_counts(variants, '-')) + else: + strands = [''] + base_count_lists.append(ranked_bases) - # record read counts into dict for this sample - for base in ranked_bases: - sample[base[0]] = base[1] - # fill in any zeros - for variant in canonical: - if not sample.has_key(variant): - sample[variant] = 0 + # record read counts into output dict + # If stranded, this will loop twice, once for each strand, and prepend '+' + # or '-' to the base name. If not stranded, it will loop once, and prepend + # nothing (''). + for (strand, base_count_list) in zip(strands, base_count_lists): + for base_count in base_count_list: + sample[strand+base_count[0]] = base_count[1] + # fill in any zeros + for base in canonical: + if not sample.has_key(strand+base): + sample[strand+base] = 0 - sample['alleles'] = count_alleles(variants, freq_thres, debug=debug) + sample['alleles'] = count_alleles(variants, freq_thres, debug=debug) - # set minor allele to N if there's a tie for 2nd + # If there's a tie for 2nd, randomly choose one to be 2nd if len(ranked_bases) >= 3 and ranked_bases[1][1] == ranked_bases[2][1]: - ranked_bases[1] = ('N', 0) - sample['alleles'] = 1 if sample['alleles'] else 0 + swap = random.choice([True,False]) + if swap: + tmp_base = ranked_bases[1] + ranked_bases[1] = ranked_bases[2] + ranked_bases[2] = tmp_base - if debug: print ranked_bases + if debug: print "ranked +-: "+str(ranked_bases) sample['coverage'] = coverage try: @@ -283,67 +344,63 @@ return site_summary -def print_site(filehandle, site, columns): - """Print the output lines for one site (one per sample). - filehandle must be open.""" - for sample in site: - if sample['print']: - fields = [str(sample.get(column)) for column in columns] - filehandle.write('\t'.join(fields)+"\n") +def get_read_counts(stranded_counts, strands='+-'): + """Do a simple sum of the read counts per variant, on the specified strands. + Arguments: + stranded_counts: Dict of the stranded variants (keys) and their read counts + (values). + strands: Which strand(s) to count. Can be '+', '-', or '+-' for both (default). + Return value: + summed_counts: A list of the alleles and their read counts. The elements are + tuples (variant, read count).""" + + variants = stranded_counts.keys() + + summed_counts = {} + for variant in variants: + strand = variant[0] + base = variant[1:] + if strand in strands: + summed_counts[base] = stranded_counts[variant] + summed_counts.get(base, 0) + + return summed_counts.items() -def get_read_counts(variant_counts, freq_thres, strands='+-', debug=False): - """Count the number of reads for each base, and create a ranked list of - alleles passing the frequency threshold. +def process_read_counts(variant_counts, freq_thres=0, sort=False, debug=False): + """Process a list of read counts by frequency filtering and/or sorting. Arguments: - variant_counts: Dict of the stranded variants (keys) and their read counts (values). + variant_counts: List of the non-stranded variants and their read counts. The + elements are tuples (variant, read count). freq_thres: The frequency threshold each allele needs to pass to be included. - strands: Which strand(s) to count. Can be '+', '-', or '+-' for both (default). - variants: A list of the variants of interest. Other types of variants will not - be included in the returned list. If no list is given, all variants found in - the variant_counts will be used. + sort: Whether to sort the list in descending order of read counts. Return value: - ranked_bases: A list of the alleles and their read counts. The elements are - tuples (base, read count). The alleles are listed in descending order of - frequency, and only those passing the threshold are included.""" - - # Get list of all variants from variant_counts list - variants = [variant[1:] for variant in variant_counts] - # deduplicate via a dict - variant_dict = dict((variant, 1) for variant in variants) - variants = variant_dict.keys() - - ranked_bases = [] - for variant in variants: - reads = 0 - for strand in strands: - reads += variant_counts.get(strand+variant, 0) - ranked_bases.append((variant, reads)) + variant_counts: A list of the processed alleles and their read counts. The + elements are tuples (variant, read count).""" # get coverage for the specified strands coverage = 0 for variant in variant_counts: - if variant[0] in strands: - coverage += variant_counts.get(variant, 0) - # if debug: print "strands: "+strands+', covg: '+str(coverage) + coverage += variant[1] - if coverage < 1: + if coverage <= 0: return [] # sort the list of alleles by read count - ranked_bases.sort(reverse=True, key=lambda base: base[1]) + if sort: + variant_counts.sort(reverse=True, key=lambda variant: variant[1]) if debug: - print strands+' coverage: '+str(coverage)+', freq_thres: '+str(freq_thres) - for base in ranked_bases: - print (base[0]+': '+str(base[1])+'/'+str(float(coverage))+' = '+ - str(base[1]/float(coverage))) + print 'coverage: '+str(coverage)+', freq_thres: '+str(freq_thres) + for variant in variant_counts: + print (variant[0]+': '+str(variant[1])+'/'+str(float(coverage))+' = '+ + str(variant[1]/float(coverage))) # remove bases below the frequency threshold - ranked_bases = [base for base in ranked_bases - if base[1]/float(coverage) >= freq_thres] + if freq_thres > 0: + variant_counts = [variant for variant in variant_counts + if variant[1]/float(coverage) >= freq_thres] - return ranked_bases + return variant_counts def count_alleles(variant_counts, freq_thres, debug=False): @@ -354,16 +411,19 @@ is zero.""" allele_count = 0 - alleles_plus = get_read_counts(variant_counts, freq_thres, debug=debug, - strands='+') - alleles_minus = get_read_counts(variant_counts, freq_thres, debug=debug, - strands='-') + alleles_plus = get_read_counts(variant_counts, '+') + alleles_plus = process_read_counts(alleles_plus, freq_thres=freq_thres, + sort=False, debug=debug) + alleles_minus = get_read_counts(variant_counts, '-') + alleles_minus = process_read_counts(alleles_minus, freq_thres=freq_thres, + sort=False, debug=debug) if debug: print '+ '+str(alleles_plus) print '- '+str(alleles_minus) - # check if each strand reports the same set of alleles + # Check if each strand reports the same set of alleles. + # Sorting by base is to compare lists without regard to order (as sets). alleles_plus_sorted = sorted([base[0] for base in alleles_plus if base[1]]) alleles_minus_sorted = sorted([base[0] for base in alleles_minus if base[1]]) if alleles_plus_sorted == alleles_minus_sorted: @@ -372,9 +432,19 @@ return allele_count +def print_site(filehandle, site, columns): + """Print the output lines for one site (one per sample). + filehandle must be open.""" + for sample in site: + if sample['print']: + fields = [str(sample.get(column)) for column in columns] + filehandle.write('\t'.join(fields)+"\n") + + def fail(message): sys.stderr.write(message+'\n') sys.exit(1) + if __name__ == "__main__": main() \ No newline at end of file
--- a/allele-counts.xml Tue Jun 04 00:16:29 2013 -0400 +++ b/allele-counts.xml Thu Sep 12 11:34:23 2013 -0400 @@ -1,10 +1,12 @@ -<tool id="allele_counts_1" version="1.0" name="Count alleles"> - <description>and minor allele frequencies</description> - <command interpreter="python">allele-counts.py -i $input -o $output -f $freq -c $covg $header</command> +<tool id="allele_counts_1" version="1.1" name="Variant Annotator"> + <description> process variant counts</description> + <command interpreter="python">allele-counts.py -i $input -o $output -f $freq -c $covg $header $stranded $nofilt</command> <inputs> <param name="input" type="data" format="vcf" label="Input variants from Naive Variants Detector"/> <param name="freq" type="float" value="1.0" min="0" max="100" label="Minor allele frequency threshold (in percent)"/> - <param name="covg" type="integer" value="10" min="0" label="Coverage threshold (per strand)"/> + <param name="covg" type="integer" value="10" min="0" label="Coverage threshold (in reads per strand)"/> + <param name="nofilt" type="boolean" truevalue="-n" falsevalue="" checked="False" label="Do not filter sites or alleles" /> + <param name="stranded" type="boolean" truevalue="-s" falsevalue="" checked="False" label="Output stranded base counts" /> <param name="header" type="boolean" truevalue="-H" falsevalue="" checked="True" label="Write header line" /> </inputs> <outputs> @@ -21,40 +23,51 @@ **What it does** -This tool parses variant counts from a special VCF file (normally the output of the **Naive Variant Detector** tool). It counts simple (ACGT) variants, calculates numbers of alleles, and calculates minor allele frequency. It applies filters based on coverage, strand bias, and minor allele frequency cutoffs. +This tool parses variant counts from a special VCF file. It counts simple variants, calculates numbers of alleles, and calculates minor allele frequency. It can apply filters based on coverage, strand bias, and minor allele frequency cutoffs. ----- +.. class:: infomark + +**Input Format** + .. class:: warningmark -**Note** +**Note:** variants that are not A/C/G/T SNVs will be ignored! -The VCF must have a certain genotype field in the sample columns, giving the read count of each type of variant. Also, the variant data **must be stranded**. The **Naive Variant Detector** tool produces this type of VCF. +The input VCF should be like the output of the **Naive Variant Detector** tool (using the stranded option). The sample column(s) must give the read count for each variant **on each strand**. Below is an example of a valid sample column entry (the important part is after the last colon):: + + 0/0:1:0.02:+T=27,+G=1,-T=22, ----- .. class:: infomark -**Output columns** +**Output** -Each row represents one site in one sample. 12 fields give information about that site:: +Each row represents one site in one sample. For unstranded output, 12 fields give information about that site:: - 1. SAMPLE - Sample names (from VCF sample column labels) + 1. SAMPLE - Sample name (from VCF sample column labels) 2. CHR - Chromosome of the site 3. POS - Chromosomal coordinate of the site 4. A - Number of reads supporting an 'A' - 5. C - ditto, for 'C' - 6. G - ditto, for 'G' - 7. T - ditto, for 'T' + 5. C - 'C' reads + 6. G - 'G' reads + 7. T - 'T' reads 8. CVRG - Total (number of reads supporting one of the four bases above) 9. ALLELES - Number of qualifying alleles - 10. MAJOR - Major allele base - 11. MINOR - Minor allele base (2nd most prevalent variant) + 10. MAJOR - Major allele + 11. MINOR - Minor allele (2nd most prevalent variant) 12. MINOR.FREQ.PERC. - Frequency of minor allele +For stranded output, instead of using 4 columns to report read counts per base, 8 are used to report the stranded counts per base:: + + 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 + SAMPLE CHR POS +A +C +G +T -A -C -G -T CVRG ALLELES MAJOR MINOR MINOR.FREQ.PERC. + **Example** -This is the header line, followed by some example data lines. Note that some samples and/or sites will not be included in the output, if they fall below the coverage threshold:: +Below is a header line, followed by some example data lines. Since the input contained three samples, the data for each site is reported on three consecutive lines. However, if a sample fell below the coverage threshold at that site, the line will be omitted:: #SAMPLE CHR POS A C G T CVRG ALLELES MAJOR MINOR MINOR.FREQ.PERC. BLOOD_1 chr20 99 0 101 1 2 104 1 C T 0.01923 @@ -69,19 +82,17 @@ **Site printing and allele tallying requirements** -Each line is printed only when the site is covered by the threshold number of reads **on each strand**. If coverage of either strand is below the threshold, the line (sample + site combination) is omitted. +Coverage threshold: -**N.B.**: This means the total coverage for each printed site will be at least twice the number you give in the "coverage threshold" option. +If a coverage threshold is used, the number of reads **on each strand** must be at or above the threshold. If either strand is below the threshold, the line will be omitted. **N.B.** this means the total coverage for each printed site will be at least twice the number you give in the "coverage threshold" option. Also, since only simple variants are counted, a site with 100 reads, all supporting a deletion variant, would not be printed. -Also, reads supporting a variant outside the canonical 4 nucleotides will not count towards the coverage requirement. For instance, a site/sample line with 100x coverage, all of which support a deletion variant, will not be printed. +Frequency threshold: -Alleles are only counted (in column 9) if they meet or exceed the minor allele frequency threshold. So a site/sample line with types of variants, 96% A, 3.3% C, and 0.7% G, will count as 2 alleles (at 1% threshold). - -Strand bias: the alleles passing the threshold on each strand have to match (though not in order). Otherwise, the allele count will be 0. So a site/sample line whose + strand shows 70% A, 27% C, and 3% G, and - strand shows 70% A and 30% C will have an allele count of 0. The minor allele and minor allele frequency, though, will always be reported\*. +If a frequency threshold is used, alleles are only counted (in the ALLELES column) if they meet or exceed this minor allele frequency threshold. -But in this version, there is no requirement that the strands show similar allele frequencies, as long as they both pass the threshold. +Strand bias: -\*One specific case will actually affect the reported minor allele identity and frequency. If there is a tie for the minor allele (between the 2nd and 3rd most common alleles), the minor allele will be reporated as 'N', and the frequency as 0.0. +The alleles passing the threshold on each strand must match (though not in order), or the allele count will be 0. So a site with A, C, G on the plus strand and A, G on the minus strand will get an allele count of zero, though the (strand-independent) major allele, minor allele, and minor allele frequency will still be reported. If there is a tie for the minor allele, one will be randomly chosen. </help>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/artificial-nofilt.csv.out Thu Sep 12 11:34:23 2013 -0400 @@ -0,0 +1,27 @@ +#SAMPLE CHR POS A C G T CVRG ALLELES MAJOR MINOR MINOR.FREQ.PERC. +THYROID chr1 0 30 0 0 0 30 1 A . 0.0 +THYROID chr1 10 30 0 2 0 32 2 A G 0.0625 +THYROID chr1 20 31 0 1 0 32 0 A G 0.03125 +THYROID chr1 30 21 0 4 0 25 2 A G 0.16 +THYROID chr1 40 22 0 3 0 25 0 A G 0.12 +THYROID chr1 50 3 0 0 0 3 1 A . 0.0 +THYROID chr1 60 2 0 2 0 4 2 A G 0.5 +THYROID chr1 70 1 0 3 0 4 0 G A 0.25 +THYROID chr1 80 104 0 3 0 107 0 A G 0.02804 +THYROID chr1 90 100 2 11 0 113 3 A G 0.09735 +THYROID chr1 100 100 1 11 0 112 0 A G 0.09821 +THYROID chr1 120 0 0 0 0 0 0 . . 0.0 +THYROID chr1 130 0 0 2 0 2 1 G . 0.0 +THYROID chr1 140 0 0 1 0 1 0 G . 0.0 +THYROID chr1 150 0 0 4 0 4 1 G . 0.0 +THYROID chr1 160 0 0 3 0 3 0 G . 0.0 +THYROID chr1 260 106 0 14 0 120 2 A G 0.11667 +THYROID chr1 300 2 0 2 76 80 3 T G 0.025 +THYROID chr1 310 12 0 12 76 100 3 T G 0.12 +THYROID chr1 320 12 0 12 56 80 3 T A 0.15 +THYROID chr1 330 7 0 7 66 80 3 T G 0.0875 +THYROID chr1 340 1 0 1 98 100 0 T G 0.01 +THYROID chr1 350 11 0 11 78 100 0 T A 0.11 +THYROID chr1 400 32 0 8 0 40 2 A G 0.2 +THYROID chr1 410 1 0 2 97 100 0 T G 0.02 +THYROID chr1 420 104 0 0 0 104 1 A . 0.0
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/artificial-nofilt.vcf.in Thu Sep 12 11:34:23 2013 -0400 @@ -0,0 +1,50 @@ +##fileformat=VCFv4.1 +##comment="ARGS=-H -r 1 -f 0 -c 0" +##comment="This is a test set of made-up sites, each created in order to test certain functionality. It's meant to be run with -f 10 -c 10" +##fileDate=19700101 +##source=Dan +##reference=file:///scratch/dan/galaxy/galaxy-central/database/files/002/dataset_0000.dat +##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed"> +##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed"> +##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> +##FORMAT=<ID=AC,Number=.,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed"> +##FORMAT=<ID=AF,Number=.,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed"> +##FORMAT=<ID=NC,Number=.,Type=String,Description="Nucleotide and indel counts"> +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT THYROID +# General note: the only data made consistent is the CHROM, POS, REF, ALT, and the variant data (after the ':'). The other stuff isn't supposed to be consistent. +# Simplest case, but POS 0 and no minor allele +chr1 0 . A . . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=15,-A=15, +# Simple, normal cases of A/G variants: above/below threshold x strand bias/no strand bias (2 x 2 = 4 cases) +chr1 10 . A G . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=15,+G=1,-A=15,-G=1, +chr1 20 . A G . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=15,+G=1,-A=16, +chr1 30 . A G . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=10,+G=2,-A=11,-G=2, +chr1 40 . A G . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=11,+G=3,-A=11, +# Really low coverage +chr1 50 . A G . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=2,-A=1, +chr1 60 . A G . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=1,+G=1,-A=1,-G=1 +chr1 70 . A G . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=1,+G=1,-G=2, +chr1 80 . A G . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=102,+G=3,-A=2, +# Very low frequency tertiary allele +chr1 90 . A G . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=50,+G=5,+C=1,-A=50,-G=6,-C=1, +chr1 100 . A G . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=50,+G=5,-A=50,-G=6,-C=1, +# Only non-canonical alleles +chr1 120 . d1 . . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+d1=15,-d1=15, +# Same 4 cases, but with non-canonical major allele (d1) +chr1 130 . d1 G . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+d1=15,+G=1,-d1=15,-G=1, +chr1 140 . d1 G . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+d1=15,+G=1,-d1=16, +chr1 150 . d1 G . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+d1=10,+G=2,-d1=11,-G=2, +chr1 160 . d1 G . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+d1=11,+G=3,-d1=11, +# Test case where minor allele is above threshold on only one strand because of different coverage. Also, a long decimal minor allele frequency. +chr1 260 . A G . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=13,+G=7,-A=93,-G=7, +# Test case where minor alleles have equal frequency: Above/below threshold, +/- strand bias +chr1 300 . T G,A . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=1,+G=1,+T=38,-A=1,-G=1,-T=38, +chr1 310 . T G,A . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=6,+G=6,+T=38,-A=6,-G=6,-T=38, +chr1 320 . T G,A . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=6,+G=6,+T=38,-A=6,-G=6,-T=18, +chr1 330 . T G,A . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=6,+G=6,+T=38,-A=1,-G=1,-T=28, +chr1 340 . T G,A . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=1,+T=80,-G=1,-T=18, +chr1 350 . T G,A . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=11,+T=60,-G=11,-T=18, +# Case where + and - variants are interleaved with each other. Also, a long decimal result for the minor allele frequency. +chr1 400 . A N . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=16,-A=16,+G=4,-G=4, +# Test complex data in the ALT, INFO, and sample (before the ':') columns +chr1 410 . T G,A . . AC=1,1;AF=0.0111111111111,0.0111111111111 GT:AC:AF:NC 0/0:1,1:0.0111111111111,0.0111111111111:+A=1,+T=81,-T=16,-G=2, +chr1 420 . A . . . AC=;AF= GT:AC:AF:NC 0/0:::+A=82,-A=22, \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/artificial-samples.csv.out Thu Sep 12 11:34:23 2013 -0400 @@ -0,0 +1,13 @@ +BRAIN chr1 0 30 0 0 0 30 1 A . 0.0 +ARTERY chr1 0 0 0 30 0 30 1 G . 0.0 +THYROID chr1 0 0 30 0 0 30 1 C . 0.0 +BRAIN chr1 10 30 0 0 0 30 1 A . 0.0 +ARTERY chr1 10 30 0 2 0 32 1 A G 0.0625 +THYROID chr1 10 31 0 1 0 32 1 A G 0.03125 +BRAIN chr1 20 30 0 2 0 32 1 A G 0.0625 +ARTERY chr1 20 34 0 6 0 40 2 A G 0.15 +THYROID chr1 20 30 0 2 0 32 0 A G 0.0625 +BRAIN chr1 30 30 0 0 0 30 1 A . 0.0 +BRAIN chr1 40 0 0 0 30 30 1 T . 0.0 +ARTERY chr1 40 1 0 2 97 100 0 T G 0.02 +THYROID chr1 40 0 69 0 31 100 0 C T 0.31
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/artificial-samples.vcf.in Thu Sep 12 11:34:23 2013 -0400 @@ -0,0 +1,23 @@ +##fileformat=VCFv4.1 +##comment="ARGS=-f 10 -c 10" +##comment="This is a test set of made-up sites, in this case to test the handling of multiple read groups. It's meant to be run with -f 10 -c 10" +##fileDate=19700101 +##source=Dan +##reference=file:///scratch/dan/galaxy/galaxy-central/database/files/002/dataset_0000.dat +##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed"> +##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed"> +##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> +##FORMAT=<ID=AC,Number=.,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed"> +##FORMAT=<ID=AF,Number=.,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed"> +##FORMAT=<ID=NC,Number=.,Type=String,Description="Nucleotide and indel counts"> +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT BRAIN ARTERY THYROID +# naive case +chr1 0 . A G,C . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=15,-A=15, 0/0:1:1:+G=15,-G=15, 0/0:1:1:+C=15,-C=15, +# cases all with one allele, but different minor allele numbers below threshold +chr1 10 . A G . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=15,-A=15, 0/0:1:1:+A=15,+G=1,-A=15,-G=1, 0/0:1:1:+A=15,+G=1,-A=16, +# cases with different numbers of alleles +chr1 20 . A G . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=15,+G=1,-A=15,-G=1, 0/0:1:1:+A=17,+G=3,-A=17,-G=3, 0/0:1:1:+A=15,+G=2,-A=15, +# cases where some shouldn't be printed +chr1 30 . A G . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=15,-A=15, 0/0:1:1:+A=9,-A=8, 0/0:1:1:+A=15,-A=9, +# cases with some complex pre-':' data +chr1 40 . T A,G,C . . AC=1,1;AF=0.0111111111111,0.0111111111111 GT:AC:AF:NC 0/0:1:1:+T=15,-T=15, 0/0:1,1:0.0111111111111,0.0111111111111:+A=1,+T=81,-T=16,-G=2, 0/0:3,1:0.0394736842105,0.0131578947368:+C=52,+T=30,-C=17,-T=1, \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/artificial.csv.out Thu Sep 12 11:34:23 2013 -0400 @@ -0,0 +1,35 @@ +THYROID chr1 0 30 0 0 0 30 1 A . 0.0 +THYROID chr1 10 30 0 2 0 32 1 A G 0.0625 +THYROID chr1 20 31 0 1 0 32 1 A G 0.03125 +THYROID chr1 30 21 0 4 0 25 2 A G 0.16 +THYROID chr1 40 22 0 3 0 25 0 A G 0.12 +THYROID chr1 50 30 0 0 0 30 1 A . 0.0 +THYROID chr1 60 31 0 0 0 31 1 A . 0.0 +THYROID chr1 70 21 0 0 0 21 1 A . 0.0 +THYROID chr1 80 22 0 0 0 22 1 A . 0.0 +THYROID chr1 82 30 0 2 0 32 1 A G 0.0625 +THYROID chr1 84 31 0 1 0 32 1 A G 0.03125 +THYROID chr1 86 21 0 4 0 25 2 A G 0.16 +THYROID chr1 88 22 0 3 0 25 0 A G 0.12 +THYROID chr1 90 30 0 0 0 30 1 A . 0.0 +THYROID chr1 100 31 0 0 0 31 1 A . 0.0 +THYROID chr1 110 21 0 0 0 21 1 A . 0.0 +THYROID chr1 120 22 0 0 0 22 1 A . 0.0 +THYROID chr1 210 20 0 0 0 20 1 A . 0.0 +THYROID chr1 220 22 0 0 0 22 1 A . 0.0 +THYROID chr1 230 182 0 18 0 200 1 A G 0.09 +THYROID chr1 240 180 0 20 0 200 2 A G 0.1 +THYROID chr1 250 178 0 22 0 200 2 A G 0.11 +THYROID chr1 260 106 0 14 0 120 0 A G 0.11667 +THYROID chr1 300 2 0 2 76 80 1 T G 0.025 +THYROID chr1 310 12 0 12 76 100 3 T G 0.12 +THYROID chr1 320 12 0 12 56 80 3 T A 0.15 +THYROID chr1 330 7 0 7 66 80 0 T G 0.0875 +THYROID chr1 340 1 0 1 98 100 1 T G 0.01 +THYROID chr1 350 11 0 11 78 100 0 T A 0.11 +THYROID chr1 400 32 0 8 0 40 2 A G 0.2 +THYROID chr1 410 1 0 2 97 100 0 T G 0.02 +THYROID chr1 420 104 0 0 0 104 1 A . 0.0 +THYROID chr1 430 30 0 0 0 30 1 A . 0.0 +THYROID chr1 440 30 0 0 0 30 1 A . 0.0 +THYROID 27 1234567890 29 0 0 0 29 1 A . 0.0
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/artificial.vcf.in Thu Sep 12 11:34:23 2013 -0400 @@ -0,0 +1,72 @@ +##fileformat=VCFv4.1 +##comment="ARGS=-r 1 -f 10 -c 10" +##comment="This is a test set of made-up sites, each created in order to test certain functionality. It's meant to be run with -f 10 -c 10" +##fileDate=19700101 +##source=Dan +##reference=file:///scratch/dan/galaxy/galaxy-central/database/files/002/dataset_0000.dat +##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed"> +##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed"> +##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> +##FORMAT=<ID=AC,Number=.,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed"> +##FORMAT=<ID=AF,Number=.,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed"> +##FORMAT=<ID=NC,Number=.,Type=String,Description="Nucleotide and indel counts"> +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT THYROID +# General note: the only data made consistent is the CHROM, POS, REF, ALT, and the variant data (after the ':'). The other stuff isn't supposed to be consistent. +# Simplest case, but POS 0 and no minor allele +chr1 0 . A . . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=15,-A=15, +# Simple, normal cases of A/G variants: above/below threshold x strand bias/no strand bias (2 x 2 = 4 cases) +chr1 10 . A G . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=15,+G=1,-A=15,-G=1, +chr1 20 . A G . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=15,+G=1,-A=16, +chr1 30 . A G . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=10,+G=2,-A=11,-G=2, +chr1 40 . A G . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=11,+G=3,-A=11, +# Same 4 cases, but with minor allele = N +chr1 50 . A N . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=15,+N=1,-A=15,-N=1, +chr1 60 . A N . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=15,+N=1,-A=16, +chr1 70 . A N . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=10,+N=2,-A=11,-N=2, +chr1 80 . A N . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=11,+N=3,-A=11, +# Same 4 cases, but with an additional noncanonical minor allele d1 +chr1 82 . A d1 . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=15,+G=1,+d1=1,-A=15,-G=1,-d1=1, +chr1 84 . A d1 . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=15,+G=1,+d1=1,-A=16, +chr1 86 . A d1 . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=10,+G=2,+d1=1,-A=11,-G=2,-d1=1, +chr1 88 . A d1 . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=11,+G=3,+d1=1,-A=11, +# Same 4 cases, but with minor allele = d1 (non-canonical) +chr1 90 . A d1 . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=15,+d1=1,-A=15,-d1=1, +chr1 100 . A d1 . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=15,+d1=1,-A=16, +chr1 110 . A d1 . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=10,+d1=2,-A=11,-d1=2, +chr1 120 . A d1 . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=11,+d1=3,-A=11, +# Same 4 cases, but with MAJOR allele = d1 +chr1 130 . d1 G . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+d1=15,+G=1,-d1=15,-G=1, +chr1 140 . d1 G . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+d1=15,+G=1,-d1=16, +chr1 150 . d1 G . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+d1=10,+G=2,-d1=11,-G=2, +chr1 160 . d1 G . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+d1=11,+G=3,-d1=11, +# Test edge cases where freq == freq_thres and/or covg == covg_thres +chr1 200 . A . . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=9,-A=9, +chr1 210 . A . . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=10,-A=10, +chr1 220 . A . . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=11,-A=11, +chr1 230 . A G . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=91,+G=9,-A=91,-G=9, +chr1 240 . A G . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=90,+G=10,-A=90,-G=10, +chr1 250 . A G . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=89,+G=11,-A=89,-G=11, +# Test case where minor allele is above threshold on only one strand because of different coverage. Also, a long decimal minor allele frequency. +chr1 260 . A G . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=13,+G=7,-A=93,-G=7, +# Test case where minor alleles have equal frequency: Above/below threshold, +/- strand bias +chr1 300 . T G,A . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=1,+G=1,+T=38,-A=1,-G=1,-T=38, +chr1 310 . T G,A . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=6,+G=6,+T=38,-A=6,-G=6,-T=38, +chr1 320 . T G,A . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=6,+G=6,+T=38,-A=6,-G=6,-T=18, +chr1 330 . T G,A . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=6,+G=6,+T=38,-A=1,-G=1,-T=28, +chr1 340 . T G,A . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=1,+T=80,-G=1,-T=18, +chr1 350 . T G,A . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=11,+T=60,-G=11,-T=18, +# Case where + and - variants are interleaved with each other. Also, a long decimal result for the minor allele frequency. +chr1 400 . A N . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=16,-A=16,+G=4,-G=4, +# Test complex data in the ALT, INFO, and sample (before the ':') columns +chr1 410 . T G,A . . AC=1,1;AF=0.0111111111111,0.0111111111111 GT:AC:AF:NC 0/0:1,1:0.0111111111111,0.0111111111111:+A=1,+T=81,-T=16,-G=2, +chr1 420 . A . . . AC=;AF= GT:AC:AF:NC 0/0:::+A=82,-A=22, +# Test some other types of noncanonical variants (tie for 2nd and not) +chr1 430 . A N,GAA,d2 . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=15,+N=1,+d2=1,-A=15,-GAA=2 +chr1 440 . A N,GAA,d2 . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=15,+N=1,+d2=1,-A=15,-GAA=1 +# No canonical variants present +chr1 450 . A d1 . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+d1=15,-d1=20, +chr1 460 . A d1,N . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+d1=15,+N=2,-d1=20,-N=2, +# Catch some divide by zero errors +chr1 470 . A . . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=0, +# Test an unusual CHROM value and a long POS value +27 1234567890 . A N . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=15,+N=1,-A=14, \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/compute-site.py Thu Sep 12 11:34:23 2013 -0400 @@ -0,0 +1,161 @@ +#!/usr/bin/python +import os +import sys +from optparse import OptionParser + +OPT_DEFAULTS = {'freq_thres':0, 'covg_thres':0} +BASES = ['A', 'C', 'G', 'T'] +USAGE = ("Usage: %prog (options) 'variant_str' ('variant_str2' (etc))\n" + +" cat variants.txt > %prog (options)") + +def main(): + + parser = OptionParser(usage=USAGE) + parser.add_option('-f', '--freq-thres', dest='freq_thres', type='float', + default=OPT_DEFAULTS.get('freq_thres'), + help=('Frequency threshold for counting alleles, given in percentage: -f 1 ' + +'= 1% frequency. Default is %default%.')) + parser.add_option('-c', '--covg-thres', dest='covg_thres', type='int', + default=OPT_DEFAULTS.get('covg_thres'), + help=('Coverage threshold. Each site must be supported by at least this ' + +'many reads on each strand. Otherwise the site will not be printed in ' + +'the output. The default is %default reads per strand.')) + (options, args) = parser.parse_args() + freq_thres = options.freq_thres + covg_thres = options.covg_thres + + if len(sys.argv) > 1 and '-h' in sys.argv[1][0:3]: + script_name = os.path.basename(sys.argv[0]) + print """USAGE: + $ """+script_name+""" [sample column text] + $ """+script_name+""" '+A=10,+G=2,-A=20,-G=41,' + $ """+script_name+""" '0/1:10,1:0.25,0.025:-A=29,-AG=1,-G=10,' + $ """+script_name+""" '+A=10,+G=2,-A=20,-G=41,' '0/1:10,1:0.25,0.025:-A=29,-AG=1,-G=10,' +Or invoke with no arguments to use interactively. It will read from stdin, so +just paste one sample per line.""" + sys.exit(0) + + if len(args) > 0: + stdin = False + samples = args + else: + stdin = True + samples = sys.stdin + print "Reading from standard input.." + + for sample in samples: + print '' + sample = sample.split(':')[-1] + if not sample: + continue + print sample + counts_dict = parse_counts(sample) + compute_stats(counts_dict, freq_thres, covg_thres) + + +def parse_counts(sample_str): + counts_dict = {} + counts = sample_str.split(',') + for count in counts: + if '=' not in count: + continue + (var, reads) = count.split('=') + if var[1:] in BASES: + counts_dict[var] = int(reads) + return counts_dict + + +def compute_stats(counts_dict, freq_thres, covg_thres): + # totals for A, C, G, T + counts_unstranded = {} + for base in BASES: + counts_unstranded[base] = 0 + for strand in '+-': + counts_unstranded[base] += counts_dict.get(strand+base, 0) + print '+- '+str(counts_unstranded) + + # get counts for each strand + plus_counts = get_stranded_counts(counts_dict, '+') + minus_counts = get_stranded_counts(counts_dict, '-') + counts_lists = {'+':plus_counts, '-':minus_counts} + print ' + '+str(plus_counts) + print ' - '+str(minus_counts) + + # stranded coverage threshold + coverages = {} + failing_strands = {} + for strand in '+-': + coverages[strand] = 0 + for count in counts_lists[strand].values(): + coverages[strand] += count + if coverages[strand] < covg_thres: + failing_strands[strand] = coverages[strand] + sys.stdout.write(strand+'coverage: '+str(coverages[strand])+"\t") + coverages['+-'] = coverages['+'] + coverages['-'] + sys.stdout.write("+-coverage: "+str(coverages['+-'])+"\n") + + if failing_strands: + for strand in failing_strands: + print ('coverage on '+strand+' strand too low (' + +str(failing_strands[strand])+' < '+str(covg_thres)+")") + return + + # apply frequency threshold + for strand in counts_lists: + strand_counts = counts_lists[strand] + for variant in strand_counts.keys(): + # print (variant+" freq: "+str(strand_counts[variant])+"/" + # +str(coverages[strand])+" = " + # +str(strand_counts[variant]/float(coverages[strand]))) + if strand_counts[variant]/float(coverages[strand]) < freq_thres: + strand_counts.pop(variant) + plus_variants = sorted(plus_counts.keys()) + minus_variants = sorted(minus_counts.keys()) + if plus_variants == minus_variants: + strand_bias = False + print "no strand bias: +"+str(plus_variants)+" == -"+str(minus_variants) + sys.stdout.write("alleles: "+str(len(plus_variants))+"\t") + else: + strand_bias = True + print " strand bias: +"+str(plus_variants)+" != -"+str(minus_variants) + sys.stdout.write("alleles: 0\t") + + variants_sorted = sort_variants(counts_unstranded) + if len(variants_sorted) >= 1: + sys.stdout.write("major: "+variants_sorted[0]+"\t") + minor = '.' + if len(variants_sorted) == 2: + minor = variants_sorted[1] + elif len(variants_sorted) > 2: + if (counts_unstranded.get(variants_sorted[1]) == + counts_unstranded.get(variants_sorted[2])): + minor = 'N' + else: + minor = variants_sorted[1] + + sys.stdout.write("minor: "+minor+"\tfreq: ") + print round(float(counts_unstranded.get(minor, 0))/coverages['+-'], 5) + + +def get_stranded_counts(unstranded_counts, strand): + stranded_counts = {} + for variant in unstranded_counts: + if variant[0] == strand: + stranded_counts[variant[1:]] = unstranded_counts[variant] + return stranded_counts + +def sort_variants(variant_counts): + """Sort the list of variants based on their counts. Returns a list of just + the variants, no counts.""" + variants = variant_counts.keys() + var_del = [] + for variant in variants: + if variant_counts.get(variant) == 0: + var_del.append(variant) + for variant in var_del: + variants.remove(variant) + variants.sort(reverse=True, key=lambda variant: variant_counts.get(variant,0)) + return variants + +if __name__ == "__main__": + main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/errors.vcf.in Thu Sep 12 11:34:23 2013 -0400 @@ -0,0 +1,25 @@ +##fileformat=VCFv4.1 +##comment="ARGS=-f 10 -c 10" +##comment="This is a test set of made-up sites, each created in order to test a certain type of error handling. It's meant to be run with -f 10 -c 10" +##fileDate=19700101 +##source=Dan +##reference=file:///scratch/dan/galaxy/galaxy-central/database/files/002/dataset_0000.dat +##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed"> +##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed"> +##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> +##FORMAT=<ID=AC,Number=.,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed"> +##FORMAT=<ID=AF,Number=.,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed"> +##FORMAT=<ID=NC,Number=.,Type=String,Description="Nucleotide and indel counts"> +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT THYROID +# A correct control +chr1 0 . A . . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=15,-A=15, +# List some bases more than once +chr1 0 . A . . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=15,-A=15,+A=12, +# Comma omissions +chr1 0 . A . . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=15,-A=15 +chr1 0 . A . . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=15-A=15, +chr1 0 . A . . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=15,+G=1,+C=1,-A=15,-G=1, +chr1 0 . A . . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=15,+G=1+C=1,-A=15,-G=1, +# Wrong number of sample columns +chr1 0 . A . . . AC=;AF= GT:AC:AF:NC 0/0:1:1:+A=15,-A=15, 0/0:1:1:+A=15,-A=15, +chr1 0 . A . . . AC=;AF= GT:AC:AF:NC \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/real-mit-s.csv.out Thu Sep 12 11:34:23 2013 -0400 @@ -0,0 +1,12 @@ +#SAMPLE CHR POS +A +C +G +T -A -C -G -T CVRG ALLELES MAJOR MINOR MINOR.FREQ.PERC. +S1 chrM 2000 1 9095 1 0 7 5808 0 1 14913 1 C A 0.00054 +S3 chrM 2000 0 7933 0 4 10 5242 1 2 13192 1 C A 0.00076 +S1 chrM 3000 17399 7 22 8 10567 35 22 4 28064 0 A G 0.00157 +S2 chrM 3000 12535 3 24 2 7937 13 12 2 20528 1 A G 0.00175 +S3 chrM 3000 18981 7 29 6 11286 33 17 4 30363 0 A G 0.00152 +S4 chrM 3000 9254 1 15 2 6240 16 14 1 15543 0 A G 0.00187 +S1 chrM 4000 6134 2 1 3 6124 1 1 1 12267 1 A T 0.00033 +S1 chrM 7000 0 17 1 6216 4 9 2 7529 13778 0 T C 0.00189 +S2 chrM 7000 0 7 2 5104 0 9 4 6288 11414 1 T C 0.0014 +S3 chrM 7000 0 9 0 6446 4 4 10 7506 13979 1 T C 0.00093 +S3 chrM 8000 3 1 5023 1 1 0 5043 2 10074 1 G A 0.0004
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/real-mit-s.vcf.in Thu Sep 12 11:34:23 2013 -0400 @@ -0,0 +1,28 @@ +##fileformat=VCFv4.1 +##comment="ARGS=-f 0.2 -c 5000 -H -s" +##fileDate=20130521 +##source=Dan +##reference=file:///scratch/dan/galaxy/galaxy-central/database/files/002/dataset_2536.dat +##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed"> +##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed"> +##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> +##FORMAT=<ID=AC,Number=.,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed"> +##FORMAT=<ID=AF,Number=.,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed"> +##FORMAT=<ID=NC,Number=.,Type=String,Description="Nucleotide and indel counts"> +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT S1 S2 S3 S4 +chrM 1000 . ACTC TCTC,CCTC,GCTC,ATC,A . . AC=188032,682,8,8,1;AF=0.995310134556,0.00361003186568,4.23464148467e-05,4.23464148467e-05,5.29330185583e-06 GT:AC:AF:NC 1/1:12546,51,0,0,0:0.994688020297,0.00404344723698,0,0,0:+A=2,+C=20,+T=8020,-A=14,-C=31,-T=4526, 1/1:9970,38,1,1,0:0.995208624476,0.00379317228988,9.98203234178e-05,9.98203234178e-05,0:+A=1,+d1=1,+C=21,+T=6546,+G=1,-A=7,-C=17,-T=3424, 1/1:11372,35,0,2,0:0.995448179272,0.0030637254902,0,0.000175070028011,0:+d1=2,+C=10,+T=7199,-A=15,-C=25,-T=4173, 1/1:9115,39,0,0,0:0.995304651671,0.00425857174055,0,0,0:+C=13,+T=5817,-A=4,-C=26,-T=3298, +chrM 2000 . TAC CAC,AAC,GAC,T,TC,TGAC . . AC=209371,114,8,2,2,1;AF=0.999083807733,0.000543989158392,3.81746777819e-05,9.54366944547e-06,9.54366944547e-06,4.77183472273e-06 GT:AC:AF:NC 1/1:14903,8,1,0,0,0:0.999329444109,0.000536444712667,6.70555890834e-05,0,0,0:+A=1,+C=9095,+G=1,-A=7,-C=5808,-T=1, 1/1:11838,3,0,0,0,0:0.999324666554,0.000253250042208,0,0,0,0:+C=7242,+T=4,-A=3,-C=4596,-T=1, 1/1:13175,10,1,0,0,0:0.998711340206,0.000758035172832,7.58035172832e-05,0,0,0:+C=7933,+T=4,-A=10,-C=5242,-T=2,-G=1, 1/1:10921,3,2,0,0,1:0.999176578225,0.000274473924977,0.000182982616651,0,0,9.14913083257e-05:+A=1,+TG=1,+C=6732,+T=1,-A=2,-C=4189,-T=2,-G=2, +chrM 3000 . TC AC,GC,CC,T,TGC . . AC=386813,649,441,35,33;AF=0.996740346013,0.00167234421946,0.00113636949273,9.01880549786e-05,8.5034451837e-05 GT:AC:AF:NC 1/1:27966,44,42,5,6:0.996117542297,0.00156723063224,0.00149599287622,0.000178094390027,0.000213713268032:+A=17399,+d1=1,+C=7,+T=8,+G=22,-A=10567,-d1=4,-C=35,-G=22,-T=4,-TG=6, 1/1:20472,36,16,0,0:0.997272018706,0.00175370226033,0.000779423226812,0,0:+A=12535,+C=3,+T=2,+G=24,-A=7937,-C=13,-T=2,-G=12, 1/1:30267,46,40,5,3:0.996575680748,0.00151460274604,0.00131704586612,0.000164630733265,9.87784399592e-05:+A=18981,+d1=2,+C=7,+T=6,+G=29,-A=11286,-d1=3,-C=33,-G=17,-T=4,-TG=3, 1/1:15494,29,17,2,3:0.996526884487,0.00186519166452,0.00109338821713,0.000128633907898,0.000192950861847:+A=9254,+d1=2,+C=1,+T=2,+G=15,-A=6240,-TG=3,-C=16,-T=1,-G=14, +chrM 4000 . TA AA,GA,CA,T,TCA,TTA . . AC=138825,77,37,9,8,3;AF=0.998511134127,0.000553829334254,0.000266125783992,6.47332988089e-05,5.75407100524e-05,2.15777662696e-05 GT:AC:AF:NC 1/1:12258,2,3,1,1,2:0.998940591639,0.00016298590172,0.000244478852579,8.14929508598e-05,8.14929508598e-05,0.00016298590172:+A=6134,+d1=1,+C=2,+G=1,+TT=1,+T=3,+TC=1,-A=6124,-C=1,-TT=1,-G=1,-T=1, 1/1:8934,6,3,0,0,0:0.998546998994,0.000670615848888,0.000335307924444,0,0,0:+A=4382,+T=3,+G=5,-A=4552,-C=3,-T=1,-G=1, 1/1:10135,4,1,2,1,0:0.999014292755,0.000394282897979,9.85707244948e-05,0.00019714144899,9.85707244948e-05,0:+A=4966,+d1=1,+TC=1,+G=3,-A=5169,-d1=1,-C=1,-T=2,-G=1, 1/1:7342,4,1,0,0,0:0.999183451279,0.000544365813827,0.000136091453457,0,0,0:+A=3711,+T=1,-A=3631,-C=1,-G=4, +chrM 5000 . A G,T,C . . AC=28,27,10;AF=0.000251195421066,0.000242224156028,8.97126503808e-05 GT:AC:AF:NC 0/0:2,1,0:0.000190912562047,9.54562810233e-05,0:+A=3664,+T=1,-A=6809,-G=2, 0/0:7,1,2:0.000987724001693,0.000141103428813,0.000282206857627:+A=2337,+C=1,+T=1,+G=2,-A=4740,-C=1,-G=5, 0/0:2,3,0:0.00022411474675,0.000336172120126,0:+A=3194,+T=1,-A=5725,-T=2,-G=2, 0/0:1,1,0:0.000169865806013,0.000169865806013,0:+A=2022,+T=1,-A=3863,-G=1, +chrM 6000 . T C,A,TT,TCTTA,G . . AC=118279,31,7,1,1;AF=0.999459199108,0.000261950432218,5.91500975977e-05,8.45001394252e-06,8.45001394252e-06 GT:AC:AF:NC 1/1:9612,2,0,0,1:0.999168399168,0.0002079002079,0,0,0.00010395010395:+C=4845,+T=3,-A=2,-C=4767,-T=2,-G=1, 1/1:7771,0,1,0,0:0.99987133299,0,0.000128667009779,0,0:+C=3682,-C=4089,-TT=1, 1/1:9152,1,0,0,0:0.999563128003,0.000109217999126,0,0,0:+C=4640,+T=1,-A=1,-C=4512,-T=2, 1/1:5500,0,1,0,0:0.99981821487,0,0.000181785129976,0,0:+C=2707,-C=2793,-TT=1, +chrM 7000 . GTA TTA,CTA,ATA,GA,GACTACTA,G . . AC=177915,270,72,20,3,1;AF=0.997370840434,0.00151358866266,0.000403623643376,0.000112117678716,1.68176518073e-05,5.60588393578e-06 GT:AC:AF:NC 1/1:13745,26,4,1,0,0:0.997532476958,0.0018869293853,0.000290296828507,7.25742071268e-05,0,0:+d1=1,+C=17,+T=6216,+G=1,-A=4,-C=9,-T=7529,-G=2, 1/1:11392,16,0,1,0,0:0.997985107315,0.00140166447657,0,8.76040297854e-05,0,0:+d1=1,+C=7,+T=5104,+G=2,-C=9,-T=6288,-G=4, 1/1:13952,13,4,0,0,0:0.998068531368,0.000929966378139,0.000286143500966,0,0,0:+C=9,+T=6446,-A=4,-C=4,-T=7506,-G=10, 1/1:7740,12,1,0,0,0:0.998194480268,0.0015475883415,0.000128965695125,0,0,0:+C=8,+T=3471,-A=1,-C=4,-T=4269,-G=1, +chrM 8000 . TGA GGA,AGA,CGA,T . . AC=132300,33,7,2;AF=0.999350384482,0.000249271070959,5.28756817186e-05,1.51073376339e-05 GT:AC:AF:NC 1/1:10337,2,0,2:0.999226679555,0.000193330111165,0,0.000193330111165:+A=1,+d2=1,+T=1,+G=4832,-A=1,-d2=1,-T=3,-G=5505, 1/1:8347,3,0,0:0.999640718563,0.000359281437126,0,0:+A=2,+G=4258,-A=1,-G=4089, 1/1:10066,4,1,0:0.999205876514,0.000397061743101,9.92654357753e-05,0:+A=3,+C=1,+T=1,+G=5023,-A=1,-T=2,-G=5043, 1/1:4891,2,3,0:0.99897875817,0.000408496732026,0.000612745098039,0:+A=1,+C=1,+G=1746,-A=1,-C=2,-G=3145, +chrM 9000 . TACGC AACGC,GACGC,CACGC,T,TCGACGC . . AC=125089,77,46,8,2;AF=0.998842167463,0.000614849002667,0.000367312391204,6.38804158615e-05,1.59701039654e-05 GT:AC:AF:NC 1/1:10775,9,6,1,0:0.998517282921,0.000834028356964,0.000556018904643,9.26698174405e-05,0:+A=6306,+G=1,-A=4469,-C=6,-d4=1,-G=8, 1/1:8081,7,2,0,0:0.998764058831,0.00086515881844,0.00024718823384,0,0:+A=4777,+T=1,+G=3,-A=3304,-C=2,-G=4, 1/1:9475,4,3,2,0:0.998735111205,0.000421629598398,0.000316222198798,0.000210814799199,0:+A=5435,+G=1,-A=4040,-C=3,-d4=2,-T=3,-G=3, 1/1:5709,3,1,0,0:0.999299842465,0.000525118151584,0.000175039383861,0,0:+A=3234,+G=1,-A=2475,-C=1,-G=2, +#chrM 10000 . AGT GGT,TGT,AAGT,ACGT,A,CGT,AAGAGT,AACAGT,AT . . AC=88942,145,51,12,7,6,5,2,1;AF=0.997051734768,0.00162546942436,0.000571716832016,0.000134521607533,7.84709377277e-05,6.72608037666e-05,5.60506698055e-05,2.24202679222e-05,1.12101339611e-05 GT:AC:AF:NC 1/1:7632,14,4,3,1,1,0,0,0:0.996735013713,0.00182839232075,0.000522397805929,0.000391798354447,0.000130599451482,0.000130599451482,0,0,0:+A=1,+C=1,+AC=3,+G=2535,+T=11,-A=1,-AA=4,-d2=1,-T=3,-G=5097, 1/1:6053,5,2,1,0,0,1,0,0:0.998186015831,0.000824538258575,0.00032981530343,0.000164907651715,0,0,0.000164907651715,0,0:+AC=1,+T=5,+G=1674,-A=2,-AA=2,-G=4379,-AAGA=1, 1/1:7012,14,6,0,2,0,0,0,0:0.996730632552,0.00199004975124,0.000852878464819,0,0.000284292821606,0,0,0,0:+d2=1,+T=11,+G=2241,-A=1,-AA=6,-d2=1,-T=3,-G=4771, 1/1:4646,1,2,0,0,0,0,0,0:0.998710232158,0.000214961306965,0.000429922613929,0,0,0,0,0,0:+A=1,+T=1,+G=1337,-A=2,-AA=2,-G=3309, +#chrM 11000 . CCA ACA,GCA,TCA,CA,C . . AC=54,44,36,3,2;AF=0.000362343152385,0.00029524256861,0.00024156210159,2.01301751325e-05,1.3420116755e-05 GT:AC:AF:NC 0/0:7,3,1,0,0:0.000521920668058,0.000223680286311,7.45600954369e-05,0,0:+C=10229,+T=1,-A=7,-C=3172,-G=3, 0/0:3,4,2,0,0:0.000368233705659,0.000490978274211,0.000245489137106,0,0:+A=2,+C=6575,+T=2,+G=1,-A=1,-C=1563,-G=3, 0/0:9,3,2,0,0:0.000811615114077,0.000270538371359,0.000180358914239,0,0:+A=1,+C=7906,+T=2,-A=8,-C=3169,-G=3, 0/0:0,2,3,1,0:0,0.000220507166483,0.000330760749724,0.000110253583241,0:+d1=1,+C=7066,+T=2,-C=1998,-T=1,-G=2, +#chrM 12000 . AC CC,TC,GC,A . . AC=182586,84,23,1;AF=0.999146337459,0.000459664444955,0.000125860502785,5.47219577328e-06 GT:AC:AF:NC 1/1:13592,8,0,0:0.999264813998,0.000588148801647,0,0:+A=2,+C=7113,+T=5,-C=6479,-T=3, 1/1:9868,1,0,0:0.999797365755,0.000101317122594,0,0:+C=5182,-A=1,-C=4686,-T=1, 1/1:11289,6,3,0:0.998938147067,0.000530926466684,0.000265463233342,0:+A=2,+C=6123,+T=3,+G=1,-A=1,-C=5166,-T=3,-G=2, 1/1:9708,4,0,0:0.999382334775,0.000411776816965,0,0:+A=1,+C=4892,+T=2,-A=1,-C=4816,-T=2, +#chrM 13000 . A G,C,T . . AC=56125,9,6;AF=0.999358985773,0.000160253556739,0.000106835704492 GT:AC:AF:NC 1/1:3816,0,0:0.999214454046,0,0:+A=1,+G=1319,-A=2,-G=2497, 1/1:2814,1,0:0.999289772727,0.000355113636364,0:+G=986,-A=1,-C=1,-G=1828, 1/1:3479,0,0:0.999712643678,0,0:+A=1,+G=1285,-G=2194, 1/1:2679,0,0:1.0,0,0:+G=979,-G=1700, +#chrM 14000 . CTAA TTAA,GTAA,CAA,ATAA,C . . AC=147853,32,22,21,1;AF=0.997766290558,0.000215947740647,0.000148464071695,0.000141715704799,6.74836689521e-06 GT:AC:AF:NC 1/1:10992,3,2,4,0:0.996825972613,0.000272059490342,0.000181372993561,0.000362745987123,0:+A=1,+d1=1,+C=19,+T=7038,-A=3,-d1=1,-C=7,-T=3954,-G=3, 1/1:6853,2,2,0,0:0.997525473071,0.000291120815138,0.000291120815138,0,0:+d1=2,+C=9,+T=4609,-C=4,-T=2244,-G=2, 1/1:9277,2,0,5,0:0.998063474987,0.000215169445939,0,0.000537923614847,0:+A=2,+C=9,+T=5786,-A=3,-C=2,-T=3491,-G=2, 1/1:8125,1,3,2,0:0.997422047631,0.000122759636631,0.000368278909894,0.000245519273263,0:+d1=2,+C=9,+T=5241,-A=2,-d1=1,-C=6,-T=2884,-G=1, +#chrM 15000 . AA GA,TA,CA,A . . AC=132,29,29,4;AF=0.000611470633196,0.000134338245172,0.000134338245172,1.85294131272e-05 GT:AC:AF:NC 0/0:15,2,0,0:0.000956510649152,0.00012753475322,0,0:+A=8715,+T=1,+G=5,-A=6950,-T=1,-G=10, 0/0:13,1,0,0:0.00121985549404,9.38350380032e-05,0,0:+A=6102,+T=1,+G=6,-A=4541,-G=7, 0/0:1,2,5,1:7.86101721563e-05,0.000157220344313,0.000393050860781,7.86101721563e-05:+A=7378,+C=5,+T=2,-A=5334,-d1=1,-G=1, 0/0:6,0,0,0:0.000532765050613,0,0,0:+A=6215,+G=4,-A=5041,-G=2, +#chrM 16000 . AG GG,TG,CG,A . . AC=224750,90,3,1;AF=0.999244175707,0.000400142272808,1.33380757603e-05,4.44602525342e-06 GT:AC:AF:NC 1/1:15911,2,0,0:0.99949745587,0.000125636032414,0,0:+A=3,+T=1,+G=7156,-A=3,-T=1,-G=8755, 1/1:11311,2,0,0:0.999381516169,0.000176709666019,0,0:+A=3,+G=4973,-A=2,-T=2,-G=6338, 1/1:14157,2,0,0:0.999012066897,0.000141133300402,0,0:+A=5,+T=1,+G=6766,-A=7,-T=1,-G=7391, 1/1:11039,3,0,0:0.999185372918,0.000271542360608,0,0:+A=4,+T=1,+G=4959,-A=2,-T=2,-G=6080,
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/real-mit.csv.out Thu Sep 12 11:34:23 2013 -0400 @@ -0,0 +1,12 @@ +#SAMPLE CHR POS A C G T CVRG ALLELES MAJOR MINOR MINOR.FREQ.PERC. +S1 chrM 2000 8 14903 1 1 14913 1 C A 0.00054 +S3 chrM 2000 10 13175 1 6 13192 1 C A 0.00076 +S1 chrM 3000 27966 42 44 12 28064 0 A G 0.00157 +S2 chrM 3000 20472 16 36 4 20528 1 A G 0.00175 +S3 chrM 3000 30267 40 46 10 30363 0 A G 0.00152 +S4 chrM 3000 15494 17 29 3 15543 0 A G 0.00187 +S1 chrM 4000 12258 3 2 4 12267 1 A T 0.00033 +S1 chrM 7000 4 26 3 13745 13778 0 T C 0.00189 +S2 chrM 7000 0 16 6 11392 11414 1 T C 0.0014 +S3 chrM 7000 4 13 10 13952 13979 1 T C 0.00093 +S3 chrM 8000 4 1 10066 3 10074 1 G A 0.0004
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/real-mit.vcf.in Thu Sep 12 11:34:23 2013 -0400 @@ -0,0 +1,28 @@ +##fileformat=VCFv4.1 +##comment="ARGS=-f 0.2 -c 5000 -H" +##fileDate=20130521 +##source=Dan +##reference=file:///scratch/dan/galaxy/galaxy-central/database/files/002/dataset_2536.dat +##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed"> +##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed"> +##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> +##FORMAT=<ID=AC,Number=.,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed"> +##FORMAT=<ID=AF,Number=.,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed"> +##FORMAT=<ID=NC,Number=.,Type=String,Description="Nucleotide and indel counts"> +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT S1 S2 S3 S4 +chrM 1000 . ACTC TCTC,CCTC,GCTC,ATC,A . . AC=188032,682,8,8,1;AF=0.995310134556,0.00361003186568,4.23464148467e-05,4.23464148467e-05,5.29330185583e-06 GT:AC:AF:NC 1/1:12546,51,0,0,0:0.994688020297,0.00404344723698,0,0,0:+A=2,+C=20,+T=8020,-A=14,-C=31,-T=4526, 1/1:9970,38,1,1,0:0.995208624476,0.00379317228988,9.98203234178e-05,9.98203234178e-05,0:+A=1,+d1=1,+C=21,+T=6546,+G=1,-A=7,-C=17,-T=3424, 1/1:11372,35,0,2,0:0.995448179272,0.0030637254902,0,0.000175070028011,0:+d1=2,+C=10,+T=7199,-A=15,-C=25,-T=4173, 1/1:9115,39,0,0,0:0.995304651671,0.00425857174055,0,0,0:+C=13,+T=5817,-A=4,-C=26,-T=3298, +chrM 2000 . TAC CAC,AAC,GAC,T,TC,TGAC . . AC=209371,114,8,2,2,1;AF=0.999083807733,0.000543989158392,3.81746777819e-05,9.54366944547e-06,9.54366944547e-06,4.77183472273e-06 GT:AC:AF:NC 1/1:14903,8,1,0,0,0:0.999329444109,0.000536444712667,6.70555890834e-05,0,0,0:+A=1,+C=9095,+G=1,-A=7,-C=5808,-T=1, 1/1:11838,3,0,0,0,0:0.999324666554,0.000253250042208,0,0,0,0:+C=7242,+T=4,-A=3,-C=4596,-T=1, 1/1:13175,10,1,0,0,0:0.998711340206,0.000758035172832,7.58035172832e-05,0,0,0:+C=7933,+T=4,-A=10,-C=5242,-T=2,-G=1, 1/1:10921,3,2,0,0,1:0.999176578225,0.000274473924977,0.000182982616651,0,0,9.14913083257e-05:+A=1,+TG=1,+C=6732,+T=1,-A=2,-C=4189,-T=2,-G=2, +chrM 3000 . TC AC,GC,CC,T,TGC . . AC=386813,649,441,35,33;AF=0.996740346013,0.00167234421946,0.00113636949273,9.01880549786e-05,8.5034451837e-05 GT:AC:AF:NC 1/1:27966,44,42,5,6:0.996117542297,0.00156723063224,0.00149599287622,0.000178094390027,0.000213713268032:+A=17399,+d1=1,+C=7,+T=8,+G=22,-A=10567,-d1=4,-C=35,-G=22,-T=4,-TG=6, 1/1:20472,36,16,0,0:0.997272018706,0.00175370226033,0.000779423226812,0,0:+A=12535,+C=3,+T=2,+G=24,-A=7937,-C=13,-T=2,-G=12, 1/1:30267,46,40,5,3:0.996575680748,0.00151460274604,0.00131704586612,0.000164630733265,9.87784399592e-05:+A=18981,+d1=2,+C=7,+T=6,+G=29,-A=11286,-d1=3,-C=33,-G=17,-T=4,-TG=3, 1/1:15494,29,17,2,3:0.996526884487,0.00186519166452,0.00109338821713,0.000128633907898,0.000192950861847:+A=9254,+d1=2,+C=1,+T=2,+G=15,-A=6240,-TG=3,-C=16,-T=1,-G=14, +chrM 4000 . TA AA,GA,CA,T,TCA,TTA . . AC=138825,77,37,9,8,3;AF=0.998511134127,0.000553829334254,0.000266125783992,6.47332988089e-05,5.75407100524e-05,2.15777662696e-05 GT:AC:AF:NC 1/1:12258,2,3,1,1,2:0.998940591639,0.00016298590172,0.000244478852579,8.14929508598e-05,8.14929508598e-05,0.00016298590172:+A=6134,+d1=1,+C=2,+G=1,+TT=1,+T=3,+TC=1,-A=6124,-C=1,-TT=1,-G=1,-T=1, 1/1:8934,6,3,0,0,0:0.998546998994,0.000670615848888,0.000335307924444,0,0,0:+A=4382,+T=3,+G=5,-A=4552,-C=3,-T=1,-G=1, 1/1:10135,4,1,2,1,0:0.999014292755,0.000394282897979,9.85707244948e-05,0.00019714144899,9.85707244948e-05,0:+A=4966,+d1=1,+TC=1,+G=3,-A=5169,-d1=1,-C=1,-T=2,-G=1, 1/1:7342,4,1,0,0,0:0.999183451279,0.000544365813827,0.000136091453457,0,0,0:+A=3711,+T=1,-A=3631,-C=1,-G=4, +chrM 5000 . A G,T,C . . AC=28,27,10;AF=0.000251195421066,0.000242224156028,8.97126503808e-05 GT:AC:AF:NC 0/0:2,1,0:0.000190912562047,9.54562810233e-05,0:+A=3664,+T=1,-A=6809,-G=2, 0/0:7,1,2:0.000987724001693,0.000141103428813,0.000282206857627:+A=2337,+C=1,+T=1,+G=2,-A=4740,-C=1,-G=5, 0/0:2,3,0:0.00022411474675,0.000336172120126,0:+A=3194,+T=1,-A=5725,-T=2,-G=2, 0/0:1,1,0:0.000169865806013,0.000169865806013,0:+A=2022,+T=1,-A=3863,-G=1, +chrM 6000 . T C,A,TT,TCTTA,G . . AC=118279,31,7,1,1;AF=0.999459199108,0.000261950432218,5.91500975977e-05,8.45001394252e-06,8.45001394252e-06 GT:AC:AF:NC 1/1:9612,2,0,0,1:0.999168399168,0.0002079002079,0,0,0.00010395010395:+C=4845,+T=3,-A=2,-C=4767,-T=2,-G=1, 1/1:7771,0,1,0,0:0.99987133299,0,0.000128667009779,0,0:+C=3682,-C=4089,-TT=1, 1/1:9152,1,0,0,0:0.999563128003,0.000109217999126,0,0,0:+C=4640,+T=1,-A=1,-C=4512,-T=2, 1/1:5500,0,1,0,0:0.99981821487,0,0.000181785129976,0,0:+C=2707,-C=2793,-TT=1, +chrM 7000 . GTA TTA,CTA,ATA,GA,GACTACTA,G . . AC=177915,270,72,20,3,1;AF=0.997370840434,0.00151358866266,0.000403623643376,0.000112117678716,1.68176518073e-05,5.60588393578e-06 GT:AC:AF:NC 1/1:13745,26,4,1,0,0:0.997532476958,0.0018869293853,0.000290296828507,7.25742071268e-05,0,0:+d1=1,+C=17,+T=6216,+G=1,-A=4,-C=9,-T=7529,-G=2, 1/1:11392,16,0,1,0,0:0.997985107315,0.00140166447657,0,8.76040297854e-05,0,0:+d1=1,+C=7,+T=5104,+G=2,-C=9,-T=6288,-G=4, 1/1:13952,13,4,0,0,0:0.998068531368,0.000929966378139,0.000286143500966,0,0,0:+C=9,+T=6446,-A=4,-C=4,-T=7506,-G=10, 1/1:7740,12,1,0,0,0:0.998194480268,0.0015475883415,0.000128965695125,0,0,0:+C=8,+T=3471,-A=1,-C=4,-T=4269,-G=1, +chrM 8000 . TGA GGA,AGA,CGA,T . . AC=132300,33,7,2;AF=0.999350384482,0.000249271070959,5.28756817186e-05,1.51073376339e-05 GT:AC:AF:NC 1/1:10337,2,0,2:0.999226679555,0.000193330111165,0,0.000193330111165:+A=1,+d2=1,+T=1,+G=4832,-A=1,-d2=1,-T=3,-G=5505, 1/1:8347,3,0,0:0.999640718563,0.000359281437126,0,0:+A=2,+G=4258,-A=1,-G=4089, 1/1:10066,4,1,0:0.999205876514,0.000397061743101,9.92654357753e-05,0:+A=3,+C=1,+T=1,+G=5023,-A=1,-T=2,-G=5043, 1/1:4891,2,3,0:0.99897875817,0.000408496732026,0.000612745098039,0:+A=1,+C=1,+G=1746,-A=1,-C=2,-G=3145, +chrM 9000 . TACGC AACGC,GACGC,CACGC,T,TCGACGC . . AC=125089,77,46,8,2;AF=0.998842167463,0.000614849002667,0.000367312391204,6.38804158615e-05,1.59701039654e-05 GT:AC:AF:NC 1/1:10775,9,6,1,0:0.998517282921,0.000834028356964,0.000556018904643,9.26698174405e-05,0:+A=6306,+G=1,-A=4469,-C=6,-d4=1,-G=8, 1/1:8081,7,2,0,0:0.998764058831,0.00086515881844,0.00024718823384,0,0:+A=4777,+T=1,+G=3,-A=3304,-C=2,-G=4, 1/1:9475,4,3,2,0:0.998735111205,0.000421629598398,0.000316222198798,0.000210814799199,0:+A=5435,+G=1,-A=4040,-C=3,-d4=2,-T=3,-G=3, 1/1:5709,3,1,0,0:0.999299842465,0.000525118151584,0.000175039383861,0,0:+A=3234,+G=1,-A=2475,-C=1,-G=2, +#chrM 10000 . AGT GGT,TGT,AAGT,ACGT,A,CGT,AAGAGT,AACAGT,AT . . AC=88942,145,51,12,7,6,5,2,1;AF=0.997051734768,0.00162546942436,0.000571716832016,0.000134521607533,7.84709377277e-05,6.72608037666e-05,5.60506698055e-05,2.24202679222e-05,1.12101339611e-05 GT:AC:AF:NC 1/1:7632,14,4,3,1,1,0,0,0:0.996735013713,0.00182839232075,0.000522397805929,0.000391798354447,0.000130599451482,0.000130599451482,0,0,0:+A=1,+C=1,+AC=3,+G=2535,+T=11,-A=1,-AA=4,-d2=1,-T=3,-G=5097, 1/1:6053,5,2,1,0,0,1,0,0:0.998186015831,0.000824538258575,0.00032981530343,0.000164907651715,0,0,0.000164907651715,0,0:+AC=1,+T=5,+G=1674,-A=2,-AA=2,-G=4379,-AAGA=1, 1/1:7012,14,6,0,2,0,0,0,0:0.996730632552,0.00199004975124,0.000852878464819,0,0.000284292821606,0,0,0,0:+d2=1,+T=11,+G=2241,-A=1,-AA=6,-d2=1,-T=3,-G=4771, 1/1:4646,1,2,0,0,0,0,0,0:0.998710232158,0.000214961306965,0.000429922613929,0,0,0,0,0,0:+A=1,+T=1,+G=1337,-A=2,-AA=2,-G=3309, +#chrM 11000 . CCA ACA,GCA,TCA,CA,C . . AC=54,44,36,3,2;AF=0.000362343152385,0.00029524256861,0.00024156210159,2.01301751325e-05,1.3420116755e-05 GT:AC:AF:NC 0/0:7,3,1,0,0:0.000521920668058,0.000223680286311,7.45600954369e-05,0,0:+C=10229,+T=1,-A=7,-C=3172,-G=3, 0/0:3,4,2,0,0:0.000368233705659,0.000490978274211,0.000245489137106,0,0:+A=2,+C=6575,+T=2,+G=1,-A=1,-C=1563,-G=3, 0/0:9,3,2,0,0:0.000811615114077,0.000270538371359,0.000180358914239,0,0:+A=1,+C=7906,+T=2,-A=8,-C=3169,-G=3, 0/0:0,2,3,1,0:0,0.000220507166483,0.000330760749724,0.000110253583241,0:+d1=1,+C=7066,+T=2,-C=1998,-T=1,-G=2, +#chrM 12000 . AC CC,TC,GC,A . . AC=182586,84,23,1;AF=0.999146337459,0.000459664444955,0.000125860502785,5.47219577328e-06 GT:AC:AF:NC 1/1:13592,8,0,0:0.999264813998,0.000588148801647,0,0:+A=2,+C=7113,+T=5,-C=6479,-T=3, 1/1:9868,1,0,0:0.999797365755,0.000101317122594,0,0:+C=5182,-A=1,-C=4686,-T=1, 1/1:11289,6,3,0:0.998938147067,0.000530926466684,0.000265463233342,0:+A=2,+C=6123,+T=3,+G=1,-A=1,-C=5166,-T=3,-G=2, 1/1:9708,4,0,0:0.999382334775,0.000411776816965,0,0:+A=1,+C=4892,+T=2,-A=1,-C=4816,-T=2, +#chrM 13000 . A G,C,T . . AC=56125,9,6;AF=0.999358985773,0.000160253556739,0.000106835704492 GT:AC:AF:NC 1/1:3816,0,0:0.999214454046,0,0:+A=1,+G=1319,-A=2,-G=2497, 1/1:2814,1,0:0.999289772727,0.000355113636364,0:+G=986,-A=1,-C=1,-G=1828, 1/1:3479,0,0:0.999712643678,0,0:+A=1,+G=1285,-G=2194, 1/1:2679,0,0:1.0,0,0:+G=979,-G=1700, +#chrM 14000 . CTAA TTAA,GTAA,CAA,ATAA,C . . AC=147853,32,22,21,1;AF=0.997766290558,0.000215947740647,0.000148464071695,0.000141715704799,6.74836689521e-06 GT:AC:AF:NC 1/1:10992,3,2,4,0:0.996825972613,0.000272059490342,0.000181372993561,0.000362745987123,0:+A=1,+d1=1,+C=19,+T=7038,-A=3,-d1=1,-C=7,-T=3954,-G=3, 1/1:6853,2,2,0,0:0.997525473071,0.000291120815138,0.000291120815138,0,0:+d1=2,+C=9,+T=4609,-C=4,-T=2244,-G=2, 1/1:9277,2,0,5,0:0.998063474987,0.000215169445939,0,0.000537923614847,0:+A=2,+C=9,+T=5786,-A=3,-C=2,-T=3491,-G=2, 1/1:8125,1,3,2,0:0.997422047631,0.000122759636631,0.000368278909894,0.000245519273263,0:+d1=2,+C=9,+T=5241,-A=2,-d1=1,-C=6,-T=2884,-G=1, +#chrM 15000 . AA GA,TA,CA,A . . AC=132,29,29,4;AF=0.000611470633196,0.000134338245172,0.000134338245172,1.85294131272e-05 GT:AC:AF:NC 0/0:15,2,0,0:0.000956510649152,0.00012753475322,0,0:+A=8715,+T=1,+G=5,-A=6950,-T=1,-G=10, 0/0:13,1,0,0:0.00121985549404,9.38350380032e-05,0,0:+A=6102,+T=1,+G=6,-A=4541,-G=7, 0/0:1,2,5,1:7.86101721563e-05,0.000157220344313,0.000393050860781,7.86101721563e-05:+A=7378,+C=5,+T=2,-A=5334,-d1=1,-G=1, 0/0:6,0,0,0:0.000532765050613,0,0,0:+A=6215,+G=4,-A=5041,-G=2, +#chrM 16000 . AG GG,TG,CG,A . . AC=224750,90,3,1;AF=0.999244175707,0.000400142272808,1.33380757603e-05,4.44602525342e-06 GT:AC:AF:NC 1/1:15911,2,0,0:0.99949745587,0.000125636032414,0,0:+A=3,+T=1,+G=7156,-A=3,-T=1,-G=8755, 1/1:11311,2,0,0:0.999381516169,0.000176709666019,0,0:+A=3,+G=4973,-A=2,-T=2,-G=6338, 1/1:14157,2,0,0:0.999012066897,0.000141133300402,0,0:+A=5,+T=1,+G=6766,-A=7,-T=1,-G=7391, 1/1:11039,3,0,0:0.999185372918,0.000271542360608,0,0:+A=4,+T=1,+G=4959,-A=2,-T=2,-G=6080,
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/real-nofilt.csv.out Thu Sep 12 11:34:23 2013 -0400 @@ -0,0 +1,15 @@ +#SAMPLE CHR POS A C G T CVRG ALLELES MAJOR MINOR MINOR.FREQ.PERC. +THYROID chr1 246704250 29 0 0 0 29 1 A . 0.0 +THYROID chr1 246704257 0 0 0 71 71 1 T . 0.0 +THYROID chr1 246704268 104 0 0 0 104 1 A . 0.0 +THYROID chr1 246704269 0 0 0 105 105 1 T . 0.0 +THYROID chr1 246704363 0 72 3 0 75 0 C G 0.04 +THYROID chr1 246704437 5 130 0 0 135 0 C A 0.03704 +THYROID chr1 246707878 0 0 131 0 131 1 G . 0.0 +THYROID chr1 246714587 30 0 43 0 73 2 G A 0.41096 +THYROID chr1 246729215 1 0 1 88 90 0 T G 0.01111 +THYROID chr1 246729216 1 0 1 90 92 0 T G 0.01087 +THYROID chr1 246729378 16 7 0 0 23 0 A C 0.30435 +THYROID chr1 246729392 29 0 10 0 39 0 A G 0.25641 +THYROID chr7 91502881 0 0 0 26 26 1 T . 0.0 +THYROID chr7 91502897 7 36 0 0 43 0 C A 0.16279
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/real-nofilt.vcf.in Thu Sep 12 11:34:23 2013 -0400 @@ -0,0 +1,27 @@ +##fileformat=VCFv4.1 +##comment="ARGS=-H -r 1 -f 0 -c 0" +##comment="This is a test set of sites copied from real data." +##fileDate=19700101 +##source=Dan +##reference=file:///scratch/dan/galaxy/galaxy-central/database/files/002/dataset_0000.dat +##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed"> +##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed"> +##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> +##FORMAT=<ID=AC,Number=.,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed"> +##FORMAT=<ID=AF,Number=.,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed"> +##FORMAT=<ID=NC,Number=.,Type=String,Description="Nucleotide and indel counts"> +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT THYROID +chr1 246704250 . A N . . AC=1;AF=0.0333333333333 GT:AC:AF:NC 0/0:1:0.0333333333333:+A=15,+N=1,-A=14, +chr1 246704257 . T N . . AC=2;AF=0.027397260274 GT:AC:AF:NC 0/0:2:0.027397260274:+T=52,+N=2,-T=19, +chr1 246704268 . A . . . AC=;AF= GT:AC:AF:NC 0/0:::+A=82,-A=22, +chr1 246704269 . T . . . AC=;AF= GT:AC:AF:NC 0/0:::+T=83,-T=22, +chr1 246704363 . C G,N . . AC=3,1;AF=0.0394736842105,0.0131578947368 GT:AC:AF:NC 0/0:3,1:0.0394736842105,0.0131578947368:+C=52,+G=3,+N=1,-C=20, +chr1 246704437 . C A,N . . AC=5,1;AF=0.0367647058824,0.00735294117647 GT:AC:AF:NC 0/0:5,1:0.0367647058824,0.00735294117647:+C=72,-A=5,-C=58,-N=1, +chr1 246707878 . G GA . . AC=6;AF=0.043795620438 GT:AC:AF:NC 0/0:6:0.043795620438:+G=90,-G=41,-GA=6, +chr1 246714587 . G A . . AC=30;AF=0.41095890411 GT:AC:AF:NC 0/1:30:0.41095890411:+A=10,+G=2,-A=20,-G=41, +chr1 246729215 . T G,A . . AC=1,1;AF=0.0111111111111,0.0111111111111 GT:AC:AF:NC 0/0:1,1:0.0111111111111,0.0111111111111:+A=1,+T=81,-T=7,-G=1, +chr1 246729216 . T G,A . . AC=1,1;AF=0.0111111111111,0.0111111111111 GT:AC:AF:NC 0/0:1,1:0.0111111111111,0.0111111111111:+A=1,+T=81,-T=9,-G=1, +chr1 246729378 . A C . . AC=7;AF=0.304347826087 GT:AC:AF:NC 0/1:7:0.304347826087:-A=16,-C=7, +chr1 246729392 . A G,AG . . AC=10,1;AF=0.25,0.025 GT:AC:AF:NC 0/1:10,1:0.25,0.025:-A=29,-AG=1,-G=10, +chr7 91502881 . TC T . . AC=3;AF=0.103448275862 GT:AC:AF:NC 0/0:3:0.103448275862:+T=14,-d1=3,-T=12, +chr7 91502897 . C A . . AC=7;AF=0.162790697674 GT:AC:AF:NC 0/0:7:0.162790697674:+A=7,+C=17,-C=19, \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/real.csv.out Thu Sep 12 11:34:23 2013 -0400 @@ -0,0 +1,11 @@ +THYROID chr1 246704250 29 0 0 0 29 1 A . 0.0 +THYROID chr1 246704257 0 0 0 71 71 1 T . 0.0 +THYROID chr1 246704268 104 0 0 0 104 1 A . 0.0 +THYROID chr1 246704269 0 0 0 105 105 1 T . 0.0 +THYROID chr1 246704363 0 72 3 0 75 0 C G 0.04 +THYROID chr1 246704437 5 130 0 0 135 0 C A 0.03704 +THYROID chr1 246707878 0 0 131 0 131 1 G . 0.0 +THYROID chr1 246714587 30 0 43 0 73 2 G A 0.41096 +THYROID chr1 246729216 1 0 1 90 92 0 T G 0.01087 +THYROID chr7 91502881 0 0 0 26 26 1 T . 0.0 +THYROID chr7 91502897 7 36 0 0 43 0 C A 0.16279
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/real.vcf.in Thu Sep 12 11:34:23 2013 -0400 @@ -0,0 +1,27 @@ +##fileformat=VCFv4.1 +##comment="ARGS=-r 1 -f 1 -c 10" +##comment="This is a test set of sites copied from real data. It's meant to be run with -f 1 -c 10" +##fileDate=19700101 +##source=Dan +##reference=file:///scratch/dan/galaxy/galaxy-central/database/files/002/dataset_0000.dat +##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed"> +##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed"> +##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> +##FORMAT=<ID=AC,Number=.,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed"> +##FORMAT=<ID=AF,Number=.,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed"> +##FORMAT=<ID=NC,Number=.,Type=String,Description="Nucleotide and indel counts"> +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT THYROID +chr1 246704250 . A N . . AC=1;AF=0.0333333333333 GT:AC:AF:NC 0/0:1:0.0333333333333:+A=15,+N=1,-A=14, +chr1 246704257 . T N . . AC=2;AF=0.027397260274 GT:AC:AF:NC 0/0:2:0.027397260274:+T=52,+N=2,-T=19, +chr1 246704268 . A . . . AC=;AF= GT:AC:AF:NC 0/0:::+A=82,-A=22, +chr1 246704269 . T . . . AC=;AF= GT:AC:AF:NC 0/0:::+T=83,-T=22, +chr1 246704363 . C G,N . . AC=3,1;AF=0.0394736842105,0.0131578947368 GT:AC:AF:NC 0/0:3,1:0.0394736842105,0.0131578947368:+C=52,+G=3,+N=1,-C=20, +chr1 246704437 . C A,N . . AC=5,1;AF=0.0367647058824,0.00735294117647 GT:AC:AF:NC 0/0:5,1:0.0367647058824,0.00735294117647:+C=72,-A=5,-C=58,-N=1, +chr1 246707878 . G GA . . AC=6;AF=0.043795620438 GT:AC:AF:NC 0/0:6:0.043795620438:+G=90,-G=41,-GA=6, +chr1 246714587 . G A . . AC=30;AF=0.41095890411 GT:AC:AF:NC 0/1:30:0.41095890411:+A=10,+G=2,-A=20,-G=41, +chr1 246729215 . T G,A . . AC=1,1;AF=0.0111111111111,0.0111111111111 GT:AC:AF:NC 0/0:1,1:0.0111111111111,0.0111111111111:+A=1,+T=81,-T=7,-G=1, +chr1 246729216 . T G,A . . AC=1,1;AF=0.0111111111111,0.0111111111111 GT:AC:AF:NC 0/0:1,1:0.0111111111111,0.0111111111111:+A=1,+T=81,-T=9,-G=1, +chr1 246729378 . A C . . AC=7;AF=0.304347826087 GT:AC:AF:NC 0/1:7:0.304347826087:-A=16,-C=7, +chr1 246729392 . A G,AG . . AC=10,1;AF=0.25,0.025 GT:AC:AF:NC 0/1:10,1:0.25,0.025:-A=29,-AG=1,-G=10, +chr7 91502881 . TC T . . AC=3;AF=0.103448275862 GT:AC:AF:NC 0/0:3:0.103448275862:+T=14,-d1=3,-T=12, +chr7 91502897 . C A . . AC=7;AF=0.162790697674 GT:AC:AF:NC 0/0:7:0.162790697674:+A=7,+C=17,-C=19, \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/run-tests.py Thu Sep 12 11:34:23 2013 -0400 @@ -0,0 +1,54 @@ +#!/usr/bin/env python +import os +import sys +import subprocess + +DATASETS = [ + 'artificial', + 'artificial-samples', + 'artificial-nofilt', + 'real', + 'real-mit', + 'real-mit-s', + 'real-nofilt', +] +IN_EXT = '.vcf.in' +OUT_EXT = '.csv.out' +ARGS_KEY = '##comment="ARGS=' + +def main(): + + test_dir = os.path.dirname(os.path.relpath(sys.argv[0])) + if test_dir: + test_dir += os.sep + + for dataset in DATASETS: + infile = test_dir+dataset+IN_EXT + outfile = test_dir+dataset+OUT_EXT + + if not os.path.exists(infile): + sys.stderr.write("Error: file not found: "+infile+"\n") + continue + if not os.path.exists(outfile): + sys.stderr.write("Error: file not found: "+outfile+"\n") + continue + + options = read_options(infile) + script_cmd = 'allele-counts.py '+options+' -i '+infile + bash_cmd = 'diff '+outfile+' <('+script_cmd+')' + # print infile+":" + print script_cmd + subprocess.call(['bash', '-c', bash_cmd]) + + +def read_options(infile): + with open(infile, 'r') as infilehandle: + for line in infilehandle: + line.strip() + if ARGS_KEY == line[:len(ARGS_KEY)]: + return line[len(ARGS_KEY):-2] + return '' + + +if __name__ == '__main__': + main() \ No newline at end of file