# HG changeset patch # User Richard Burhans # Date 1373899655 14400 # Node ID 8997f2ca8c7a3dbf9f0e622efa3658ff5102edd4 # Parent 91e835060ad2f1dfa932f201264a5b2bf918fa5d Update to Miller Lab devshed revision bae0d3306d3b diff -r 91e835060ad2 -r 8997f2ca8c7a Population.py --- a/Population.py Mon Jun 03 12:29:29 2013 -0400 +++ b/Population.py Mon Jul 15 10:47:35 2013 -0400 @@ -1,12 +1,17 @@ #!/usr/bin/env python -from OrderedDict import OrderedDict +import OrderedDict +import base64 +import json +import zlib + +import sys class Individual(object): __slots__ = ['_column', '_name', '_alias'] def __init__(self, column, name, alias=None): - self._column = column + self._column = int(column) self._name = name self._alias = alias @@ -42,7 +47,7 @@ class Population(object): def __init__(self, name=None): - self._columns = OrderedDict() + self._columns = OrderedDict.OrderedDict() self._name = name @property @@ -87,8 +92,9 @@ def tag_list(self, delimiter=':'): entries = [] - for column, individual in self._columns.items(): - entry = '{0}{1}{2}'.format(column, delimiter, individual.name) + for column, individual in self._columns.iteritems(): + first_token = individual.name.split()[0] + entry = '{0}{1}{2}'.format(column, delimiter, first_token) entries.append(entry) return entries @@ -122,7 +128,58 @@ individual = Individual(column, name) self.add_individual(individual) + def from_wrapped_dict(self, wrapped_dict): + unwraped_dict = self.unwrap_dict(wrapped_dict) + for name, column in unwraped_dict.iteritems(): + individual = Individual(column, name) + self.add_individual(individual) + + def unwrap_dict(self, wrapped_dict): + decoded_value = self.decode_value(wrapped_dict) + decompressed_value = self.decompress_value(decoded_value) + def _decode_list(data): + rv = [] + for item in data: + if isinstance(item, unicode): + item = item.encode('utf-8') + elif isinstance(item, list): + item = _decode_list(item) + elif isinstance(item, dict): + item = _decode_dict(item) + rv.append(item) + return rv + def _decode_dict(data): + rv = {} + for key, value in data.iteritems(): + if isinstance(key, unicode): + key = key.encode('utf-8') + if isinstance(value, unicode): + value = value.encode('utf-8') + elif isinstance(value, list): + value = _decode_list(value) + elif isinstance(value, dict): + value = _decode_dict(value) + rv[key] = value + return rv + unwrapped_dict = json.loads(decompressed_value, object_hook=_decode_dict) + return unwrapped_dict + + def decode_value(self, value): + try: + return base64.b64decode(value) + except TypeError, message: + print >> sys.stderr, 'base64.b64decode: {0}: {1}'.format(message, value) + sys.exit(1) + + def decompress_value(self, value): + try: + return zlib.decompress(value) + except zlib.error, message: + print >> sys.stderr, 'zlib.decompress: {0}'.format(message) + sys.exit(1) + def individual_names(self): for column, individual in self._columns.items(): - yield individual.name + first_token = individual.name.split()[0] + yield first_token diff -r 91e835060ad2 -r 8997f2ca8c7a add_fst_column.py --- a/add_fst_column.py Mon Jun 03 12:29:29 2013 -0400 +++ b/add_fst_column.py Mon Jul 15 10:47:35 2013 -0400 @@ -1,49 +1,36 @@ #!/usr/bin/env python -# -# add_fst_column.py "$input" "$p1_input" "$p2_input" "$data_source.choice" "$data_source.min_value" "$retain" "$discard_fixed" "$biased" "$output" -# #for $individual, $individual_col in zip($input.dataset.metadata.individual_names, $input.dataset.metadata.individual_columns) -# #set $arg = '%s:%s' % ($individual_col, $individual) -# "$arg" -# #end for -# - +import gd_util import sys -import subprocess from Population import Population ################################################################################ -if len(sys.argv) < 12: - print >> sys.stderr, "Usage" - sys.exit(1) +if len(sys.argv) != 13: + gd_util.die('Usage') -input, p1_input, p2_input, input_type, genotypes, min_reads, min_qual, retain, discard_fixed, biased, output = sys.argv[1:12] -individual_metadata = sys.argv[12:] +input, p1_input, p2_input, input_type, data_source, min_reads, min_qual, retain, discard_fixed, biased, output, ind_arg = sys.argv[1:] p_total = Population() -p_total.from_tag_list(individual_metadata) +p_total.from_wrapped_dict(ind_arg) p1 = Population() p1.from_population_file(p1_input) if not p_total.is_superset(p1): - print >> sys.stderr, 'There is an individual in population 1 that is not in the SNP table' - sys.exit(1) + gd_util.die('There is an individual in population 1 that is not in the SNP table') p2 = Population() p2.from_population_file(p2_input) if not p_total.is_superset(p2): - print >> sys.stderr, 'There is an individual in population 2 that is not in the SNP table' - sys.exit(1) + gd_util.die('There is an individual in population 2 that is not in the SNP table') ################################################################################ prog = 'Fst_column' -args = [] -args.append(prog) +args = [ prog ] args.append(input) -args.append(genotypes) +args.append(data_source) args.append(min_reads) args.append(min_qual) args.append(retain) @@ -62,12 +49,8 @@ column = int(column) - 2 args.append('{0}:2'.format(column)) -fh = open(output, 'w') - -#print "args:", ' '.join(args) -p = subprocess.Popen(args, bufsize=-1, stdin=None, stdout=fh, stderr=sys.stderr) -rc = p.wait() -fh.close() +with open(output, 'w') as fh: + gd_util.run_program(prog, args, stdout=fh) sys.exit(0) diff -r 91e835060ad2 -r 8997f2ca8c7a add_fst_column.xml --- a/add_fst_column.xml Mon Jun 03 12:29:29 2013 -0400 +++ b/add_fst_column.xml Mon Jul 15 10:47:35 2013 -0400 @@ -2,22 +2,27 @@ : Compute a fixation index score for each SNP - add_fst_column.py "$input" "$p1_input" "$p2_input" + #import json + #import base64 + #import zlib + #set $ind_names = $input.dataset.metadata.individual_names + #set $ind_colms = $input.dataset.metadata.individual_columns + #set $ind_dict = dict(zip($ind_names, $ind_colms)) + #set $ind_json = json.dumps($ind_dict, separators=(',',':')) + #set $ind_comp = zlib.compress($ind_json, 9) + #set $ind_arg = base64.b64encode($ind_comp) + add_fst_column.py '$input' '$p1_input' '$p2_input' #if $input_type.choice == '0' - "gd_snp" "$input_type.data_source.choice" + 'gd_snp' '$input_type.data_source.choice' #if $input_type.data_source.choice == '0' - "$input_type.data_source.min_reads" "$input_type.data_source.min_qual" + '$input_type.data_source.min_reads' '$input_type.data_source.min_qual' #else if $input_type.data_source.choice == '1' - "0" "0" + '0' '0' #end if #else if $input_type.choice == '1' - "gd_genotype" "1" "0" "0" + 'gd_genotype' '1' '0' '0' #end if - "$retain" "$discard_fixed" "$biased" "$output" - #for $individual, $individual_col in zip($input.dataset.metadata.individual_names, $input.dataset.metadata.individual_columns) - #set $arg = '%s:%s' % ($individual_col, $individual) - "$arg" - #end for + '$retain' '$discard_fixed' '$biased' '$output' '$ind_arg' diff -r 91e835060ad2 -r 8997f2ca8c7a aggregate_gd_indivs.py --- a/aggregate_gd_indivs.py Mon Jun 03 12:29:29 2013 -0400 +++ b/aggregate_gd_indivs.py Mon Jul 15 10:47:35 2013 -0400 @@ -1,43 +1,38 @@ #!/usr/bin/env python +import gd_util import sys -import subprocess from Population import Population ################################################################################ -if len(sys.argv) < 6: - print >> sys.stderr, "Usage" - sys.exit(1) +if len(sys.argv) != 6: + gd_util.dir('Usage') -input, p1_input, output, input_type = sys.argv[1:5] -individual_metadata = sys.argv[5:] +input, p1_input, output, input_type, ind_arg = sys.argv[1:] p_total = Population() -p_total.from_tag_list(individual_metadata) +p_total.from_wrapped_dict(ind_arg) p1 = Population() p1.from_population_file(p1_input) if not p_total.is_superset(p1): - print >> sys.stderr, 'There is an individual in the population that is not in the SNP table' - sys.exit(1) + gd_util.die('There is an individual in the population that is not in the SNP table') ################################################################################ prog = 'aggregate' -args = [] -args.append(prog) +args = [ prog ] args.append(input) if input_type == 'gd_snp': - args.append('1') + args.append(1) elif input_type == 'gd_genotype': - args.append('0') + args.append(0) else: - print >> sys.stderr, "unknown input type:", input_type - sys.exit(1) + die('unknown input type: {0}'.format(input_type)) columns = p1.column_list() @@ -46,12 +41,8 @@ column = str(int(column) - 2) args.append(column) -fh = open(output, 'w') - -#print "args:", ' '.join(args) -p = subprocess.Popen(args, bufsize=-1, stdin=None, stdout=fh, stderr=sys.stderr) -rc = p.wait() -fh.close() +with open(output, 'w') as fh: + gd_util.run_program(prog, args, stdout=fh) sys.exit(0) diff -r 91e835060ad2 -r 8997f2ca8c7a aggregate_gd_indivs.xml --- a/aggregate_gd_indivs.xml Mon Jun 03 12:29:29 2013 -0400 +++ b/aggregate_gd_indivs.xml Mon Jul 15 10:47:35 2013 -0400 @@ -2,16 +2,22 @@ : Append summary columns for a population - aggregate_gd_indivs.py "$input" "$p1_input" "$output" + #import json + #import base64 + #import zlib + #set $ind_names = $input.dataset.metadata.individual_names + #set $ind_colms = $input.dataset.metadata.individual_columns + #set $ind_dict = dict(zip($ind_names, $ind_colms)) + #set $ind_json = json.dumps($ind_dict, separators=(',',':')) + #set $ind_comp = zlib.compress($ind_json, 9) + #set $ind_arg = base64.b64encode($ind_comp) + aggregate_gd_indivs.py '$input' '$p1_input' '$output' #if $input_type.choice == '0' - "gd_snp" + 'gd_snp' #else if $input_type.choice == '1' - "gd_genotype" + 'gd_genotype' #end if - #for $individual, $individual_col in zip($input.dataset.metadata.individual_names, $input.dataset.metadata.individual_columns) - #set $arg = '%s:%s' % ($individual_col, $individual) - "$arg" - #end for + '$ind_arg' diff -r 91e835060ad2 -r 8997f2ca8c7a average_fst.py --- a/average_fst.py Mon Jun 03 12:29:29 2013 -0400 +++ b/average_fst.py Mon Jul 15 10:47:35 2013 -0400 @@ -1,17 +1,15 @@ #!/usr/bin/env python +import gd_util import sys -import subprocess from Population import Population ################################################################################ -if len(sys.argv) < 12: - print >> sys.stderr, "Usage" - sys.exit(1) +if len(sys.argv) != 12: + gd_util.die('Usage') -input, p1_input, p2_input, input_type, data_source, min_total_count, discard_fixed, output, shuffles, p0_input = sys.argv[1:11] -individual_metadata = sys.argv[11:] +input, p1_input, p2_input, input_type, data_source, min_total_count, discard_fixed, output, shuffles, p0_input, ind_arg = sys.argv[1:] try: shuffle_count = int(shuffles) @@ -19,34 +17,30 @@ shuffle_count = 0 p_total = Population() -p_total.from_tag_list(individual_metadata) +p_total.from_wrapped_dict(ind_arg) p1 = Population() p1.from_population_file(p1_input) if not p_total.is_superset(p1): - print >> sys.stderr, 'There is an individual in population 1 that is not in the SNP table' - sys.exit(1) + gd_util.die('There is an individual in population 1 that is not in the SNP table') p2 = Population() p2.from_population_file(p2_input) if not p_total.is_superset(p2): - print >> sys.stderr, 'There is an individual in population 2 that is not in the SNP table' - sys.exit(1) + gd_util.die('There is an individual in population 2 that is not in the SNP table') p0 = None if shuffle_count > 0: p0 = Population() p0.from_population_file(p0_input) if not p_total.is_superset(p0): - print >> sys.stderr, 'There is an individual in population 0 that is not in the SNP table' - sys.exit(1) + gd_util.die('There is an individual in population 0 that is not in the SNP table') ################################################################################ prog = 'Fst_ave' -args = [] -args.append(prog) +args = [ prog ] args.append(input) args.append(data_source) args.append(min_total_count) @@ -72,12 +66,8 @@ column = int(column) - 2 args.append('{0}:0'.format(column)) -fh = open(output, 'w') - -#print "args:", ' '.join(args) -p = subprocess.Popen(args, bufsize=-1, stdin=None, stdout=fh, stderr=sys.stderr) -rc = p.wait() -fh.close() +with open(output, 'w') as fh: + gd_util.run_program(prog, args, stdout=fh) sys.exit(0) diff -r 91e835060ad2 -r 8997f2ca8c7a average_fst.xml --- a/average_fst.xml Mon Jun 03 12:29:29 2013 -0400 +++ b/average_fst.xml Mon Jul 15 10:47:35 2013 -0400 @@ -2,27 +2,33 @@ : Estimate the relative fixation index between two populations - average_fst.py "$input" "$p1_input" "$p2_input" + #import json + #import base64 + #import zlib + #set $ind_names = $input.dataset.metadata.individual_names + #set $ind_colms = $input.dataset.metadata.individual_columns + #set $ind_dict = dict(zip($ind_names, $ind_colms)) + #set $ind_json = json.dumps($ind_dict, separators=(',',':')) + #set $ind_comp = zlib.compress($ind_json, 9) + #set $ind_arg = base64.b64encode($ind_comp) + average_fst.py '$input' '$p1_input' '$p2_input' #if $input_type.choice == '0' - "gd_snp" "$input_type.data_source.choice" + 'gd_snp' '$input_type.data_source.choice' #if $input_type.data_source.choice == '0' - "$input_type.data_source.min_value" + '$input_type.data_source.min_value' #else if $input_type.data_source.choice == '1' - "1" + '1' #end if #else if $input_type.choice == '1' - "gd_genotype" "1" "1" + 'gd_genotype' '1' '1' #end if - "$discard_fixed" "$output" + '$discard_fixed' '$output' #if $use_randomization.choice == '0' - "0" "/dev/null" + '0' '/dev/null' #else if $use_randomization.choice == '1' - "$use_randomization.shuffles" "$use_randomization.p0_input" + '$use_randomization.shuffles' '$use_randomization.p0_input' #end if - #for $individual, $individual_col in zip($input.dataset.metadata.individual_names, $input.dataset.metadata.individual_columns) - #set $arg = '%s:%s' % ($individual_col, $individual) - "$arg" - #end for + '$ind_arg' diff -r 91e835060ad2 -r 8997f2ca8c7a commits.log --- a/commits.log Mon Jun 03 12:29:29 2013 -0400 +++ b/commits.log Mon Jul 15 10:47:35 2013 -0400 @@ -1,3 +1,9 @@ + +:2fb0cd69fe08 +cathy 2013-07-08 15:21 +New versions of Rank Pathways and Rank Terms (code from Oscar, 16 May 2013). +Rank Pathways still needs updates in the help text and sample input/output, +and both tools need test datasets. :6255a0a7fad5 cathy 2013-05-10 15:45 diff -r 91e835060ad2 -r 8997f2ca8c7a coverage_distributions.py --- a/coverage_distributions.py Mon Jun 03 12:29:29 2013 -0400 +++ b/coverage_distributions.py Mon Jul 15 10:47:35 2013 -0400 @@ -1,60 +1,43 @@ #!/usr/bin/env python +import gd_util import os -import errno import sys -import shutil -import subprocess from Population import Population import gd_composite ################################################################################ -def mkdir_p(path): - try: - os.makedirs(path) - except OSError, e: - if e.errno <> errno.EEXIST: - raise +if len(sys.argv) < 7: + gd_util.die('Usage') -################################################################################ +input, data_source, output, extra_files_path, ind_arg = sys.argv[1:6] -if len(sys.argv) < 7: - print >> sys.stderr, "Usage" - sys.exit(1) - -input, data_source, output, extra_files_path = sys.argv[1:5] - -individual_metadata = [] population_info = [] p1_input = None all_individuals = False -for arg in sys.argv[5:]: +for arg in sys.argv[6:]: if arg == 'all_individuals': all_individuals = True elif len(arg) > 12 and arg[:12] == 'individuals:': p1_input = arg[12:] - elif len(arg) > 11: - if arg[:11] == 'population:': - file, name = arg[11:].split(':', 1) - population_info.append((file, name)) - elif arg[:11] == 'individual:': - individual_metadata.append(arg[11:]) + elif len(arg) > 11 and arg[:11] == 'population:': + file, name = arg[11:].split(':', 1) + population_info.append((file, name)) p_total = Population() -p_total.from_tag_list(individual_metadata) +p_total.from_wrapped_dict(ind_arg) ################################################################################ -mkdir_p(extra_files_path) +gd_util.mkdir_p(extra_files_path) ################################################################################ prog = 'coverage' -args = [] -args.append(prog) +args = [ prog ] args.append(input) args.append(data_source) @@ -72,8 +55,7 @@ population_list.append(this_pop) p1.from_population_file(p1_input) if not p_total.is_superset(p1): - print >> sys.stderr, 'There is an individual in the population that is not in the SNP table' - sys.exit(1) + gd_util.die('There is an individual in the population that is not in the SNP table') tags = p1.tag_list() else: tags = [] @@ -84,8 +66,7 @@ population_list.append(this_pop) population.from_population_file(population_file) if not p_total.is_superset(population): - print >> sys.stderr, 'There is an individual in the {} population that is not in the SNP table'.format(population_name) - sys.exit(1) + gd_util.die('There is an individual in the {} population that is not in the SNP table'.format(population_name)) columns = population.column_list() for column in columns: tags.append('{0}:{1}'.format(column, population_name)) @@ -95,55 +76,37 @@ ## text output coverage_file = 'coverage.txt' -fh = open(coverage_file, 'w') -#print "args:", ' '.join(args) -p = subprocess.Popen(args, bufsize=-1, stdin=None, stdout=fh, stderr=sys.stderr) -rc = p.wait() -fh.close() +with open(coverage_file, 'w') as fh: + gd_util.run_program(prog, args, stdout=fh) ## graphical output -fh = open(coverage_file) coverage2_file = 'coverage2.txt' -ofh = open(coverage2_file, 'w') - -for line in fh: - line = line.rstrip('\r\n') - elems = line.split('\t') - name = elems.pop(0) - values = [ elems[0] ] - for idx in range(1, len(elems)): - val = str(float(elems[idx]) - float(elems[idx-1])) - values.append(val) - print >> ofh, '{0}\t{1}'.format(name, '\t'.join(values)) - -fh.close() -ofh.close() +with open(coverage_file) as fh, open(coverage2_file, 'w') as ofh: + for line in fh: + line = line.rstrip('\r\n') + elems = line.split('\t') + name = elems.pop(0) + values = [ elems[0] ] + for idx in range(1, len(elems)): + val = str(float(elems[idx]) - float(elems[idx-1])) + values.append(val) + print >> ofh, '{0}\t{1}'.format(name, '\t'.join(values)) ################################################################################ -prog = 'R' +prog = 'Rscript' -args = [] -args.append(prog) -args.append('--vanilla') -args.append('--quiet') +args = [ prog ] _realpath = os.path.realpath(__file__) _script_dir = os.path.dirname(_realpath) r_script_file = os.path.join(_script_dir, 'coverage_plot.r') - -ifh = open(r_script_file) -ofh = open('/dev/null', 'w') -#print "args:", ' '.join(args) -p = subprocess.Popen(args, bufsize=-1, stdin=ifh, stdout=ofh, stderr=None) -rc = p.wait() -ifh.close() -ofh.close() +args.append(r_script_file) pdf_file = os.path.join(extra_files_path, 'coverage.pdf') -shutil.copy2('coverage.pdf', pdf_file) -os.remove('coverage.pdf') -os.remove(coverage2_file) +args.append(pdf_file) + +gd_util.run_program(prog, args) ################################################################################ @@ -159,7 +122,6 @@ info_page.add_output_parameter(out_pdf) info_page.add_output_parameter(out_txt) - if data_source == '0': data_source_value = 'sequence coverage' elif data_source == '1': @@ -176,12 +138,8 @@ misc_individuals = gd_composite.Parameter(name='Individuals', value=tags, display_type=gd_composite.DisplayTagList()) info_page.add_misc(misc_individuals) - - - with open (output, 'w') as ofh: print >> ofh, info_page.render() - sys.exit(0) diff -r 91e835060ad2 -r 8997f2ca8c7a coverage_distributions.xml --- a/coverage_distributions.xml Mon Jun 03 12:29:29 2013 -0400 +++ b/coverage_distributions.xml Mon Jul 15 10:47:35 2013 -0400 @@ -2,22 +2,27 @@ : Examine sequence coverage for SNPs - coverage_distributions.py "$input" "0" "$output" "$output.files_path" + #import json + #import base64 + #import zlib + #set $ind_names = $input.dataset.metadata.individual_names + #set $ind_colms = $input.dataset.metadata.individual_columns + #set $ind_dict = dict(zip($ind_names, $ind_colms)) + #set $ind_json = json.dumps($ind_dict, separators=(',',':')) + #set $ind_comp = zlib.compress($ind_json, 9) + #set $ind_arg = base64.b64encode($ind_comp) + coverage_distributions.py '$input' '0' '$output' '$output.files_path' '$ind_arg' #if $individuals.choice == '0' - "all_individuals" + 'all_individuals' #else if $individuals.choice == '1' #set $arg = 'individuals:%s' % str($individuals.p1_input) - "$arg" + '$arg' #else if $individuals.choice == '2' #for $population in $individuals.populations #set $arg = 'population:%s:%s' % (str($population.p_input), str($population.p_input.name)) - "$arg" + '$arg' #end for #end if - #for $individual, $individual_col in zip($input.dataset.metadata.individual_names, $input.dataset.metadata.individual_columns) - #set $individual_arg = 'individual:%s:%s' % ($individual_col, $individual) - "$individual_arg" - #end for diff -r 91e835060ad2 -r 8997f2ca8c7a coverage_plot.r --- a/coverage_plot.r Mon Jun 03 12:29:29 2013 -0400 +++ b/coverage_plot.r Mon Jul 15 10:47:35 2013 -0400 @@ -1,19 +1,22 @@ -x <- read.table('coverage2.txt', skip=1, sep='\t') +args <- commandArgs(TRUE); +output_file <- args[1]; + +x <- read.table('coverage2.txt', skip=1, sep='\t'); -individuals <- dim(x)[1] -max_cov <- dim(x)[2] - 2 -max_val <- max(x[-1]) / 100 -colors <- rainbow(individuals) +individuals <- dim(x)[1]; +max_cov <- dim(x)[2] - 2; +max_val <- max(x[-1]) / 100; +colors <- rainbow(individuals); -line_width = 3 -xt = t(x) +line_width = 3; +xt = t(x); -xvals <- c(0:max_cov) -values <- as.numeric(as.vector(xt[,1][-1]))/100 +xvals <- c(0:max_cov); +values <- as.numeric(as.vector(xt[,1][-1]))/100; -pdf(file='coverage.pdf', onefile=TRUE, width=10, height=6); +pdf(file=output_file, onefile=TRUE, width=10, height=6); -plot(xvals, values, type='l', ylim=c(0, max_val), xlim=c(0, max_cov), col=colors[1], lwd=line_width, xlab="Coverage", ylab="Proportion") +plot(xvals, values, type='l', ylim=c(0, max_val), xlim=c(0, max_cov), col=colors[1], lwd=line_width, xlab="Coverage", ylab="Proportion"); if (individuals > 1) { for (i in 2:individuals) { @@ -23,9 +26,7 @@ } -names <- as.vector(t(x[1])) -legend(x='topright', legend=names, fill=colors, bty='n') +names <- as.vector(t(x[1])); +legend(x='topright', legend=names, fill=colors, bty='n'); -dev.off() - - +dev.off(); diff -r 91e835060ad2 -r 8997f2ca8c7a diversity_pi.py --- a/diversity_pi.py Mon Jun 03 12:29:29 2013 -0400 +++ b/diversity_pi.py Mon Jul 15 10:47:35 2013 -0400 @@ -1,54 +1,29 @@ #!/usr/bin/env python +import gd_util import sys -import subprocess from Population import Population -def run_program(prog, args, stdout_file=None, space_to_tab=False): - #print "args: ", ' '.join(args) - p = subprocess.Popen(args, bufsize=-1, executable=prog, stdin=None, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - (stdoutdata, stderrdata) = p.communicate() - rc = p.returncode - - if stdout_file is not None: - with open(stdout_file, 'w') as ofh: - lines = stdoutdata.split('\n') - for line in lines: - line = line.strip() - if line: - if space_to_tab: - line = line.replace(' ', '\t') - print >> ofh, line - - if rc != 0: - print >> sys.stderr, "FAILED: rc={0}: {1}".format(rc, ' '.join(args)) - print >> sys.stderr, stderrdata - sys.exit(1) - ################################################################################ -if len(sys.argv) < 7: - print >> sys.stderr, "Usage" - sys.exit(1) +if len(sys.argv) != 7: + gd_util.die('Usage') -snp_input, coverage_input, indiv_input, min_coverage, output = sys.argv[1:6] -individual_metadata = sys.argv[6:] +snp_input, coverage_input, indiv_input, min_coverage, output, ind_arg = sys.argv[1:] p_total = Population() -p_total.from_tag_list(individual_metadata) +p_total.from_wrapped_dict(ind_arg) p1 = Population() p1.from_population_file(indiv_input) if not p_total.is_superset(p1): - print >> sys.stderr, 'There is an individual in the population individuals that is not in the SNP table' - sys.exit(1) + gd_util.die('There is an individual in the population individuals that is not in the SNP table') ################################################################################ prog = 'mt_pi' args = [ prog ] - args.append(snp_input) args.append(coverage_input) args.append(min_coverage) @@ -56,5 +31,8 @@ for tag in p1.tag_list(): args.append(tag) -run_program(prog, args, stdout_file=output) +with open(output, 'w') as fh: + gd_util.run_program(prog, args, stdout=fh) + sys.exit(0) + diff -r 91e835060ad2 -r 8997f2ca8c7a diversity_pi.xml --- a/diversity_pi.xml Mon Jun 03 12:29:29 2013 -0400 +++ b/diversity_pi.xml Mon Jul 15 10:47:35 2013 -0400 @@ -2,11 +2,16 @@ &pi; - diversity_pi.py "$input" "$coverage_input" "$indiv_input" "$min_coverage" "$output" - #for $individual, $individual_col in zip($input.dataset.metadata.individual_names, $input.dataset.metadata.individual_columns) - #set $arg = '%s:%s' % ($individual_col, $individual) - "$arg" - #end for + #import json + #import base64 + #import zlib + #set $ind_names = $input.dataset.metadata.individual_names + #set $ind_colms = $input.dataset.metadata.individual_columns + #set $ind_dict = dict(zip($ind_names, $ind_colms)) + #set $ind_json = json.dumps($ind_dict, separators=(',',':')) + #set $ind_comp = zlib.compress($ind_json, 9) + #set $ind_arg = base64.b64encode($ind_comp) + diversity_pi.py '$input' '$coverage_input' '$indiv_input' '$min_coverage' '$output' '$ind_arg' diff -r 91e835060ad2 -r 8997f2ca8c7a dpmix.py --- a/dpmix.py Mon Jun 03 12:29:29 2013 -0400 +++ b/dpmix.py Mon Jul 15 10:47:35 2013 -0400 @@ -1,9 +1,8 @@ #!/usr/bin/env python -import errno +import gd_util import sys import os -import subprocess from Population import Population import gd_composite from dpmix_plot import make_dpmix_plot @@ -11,87 +10,70 @@ ################################################################################ -def mkdir_p(path): - try: - os.makedirs(path) - except OSError, e: - if e.errno <> errno.EEXIST: - raise - -def run_program(prog, args, stdout_file=None, space_to_tab=False): - #print "args: ", ' '.join(args) - p = subprocess.Popen(args, bufsize=-1, executable=prog, stdin=None, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - (stdoutdata, stderrdata) = p.communicate() - rc = p.returncode - - if stdout_file is not None: - with open(stdout_file, 'w') as ofh: - lines = stdoutdata.split('\n') - for line in lines: - line = line.strip() - if line: - if space_to_tab: - line = line.replace(' ', '\t') - print >> ofh, line - - if rc != 0: - print >> sys.stderr, "FAILED: rc={0}: {1}".format(rc, ' '.join(args)) - print >> sys.stderr, stderrdata - sys.exit(1) - -################################################################################ - -if len(sys.argv) < 16: +if len(sys.argv) != 22: print "usage" sys.exit(1) -input, input_type, data_source, switch_penalty, ap1_input, ap2_input, p_input, output, output2, output2_dir, dbkey, ref_column, galaxy_data_index_dir, heterochromatin_loc_file = sys.argv[1:15] -individual_metadata = sys.argv[15:] +input, input_type, data_source, switch_penalty, ap1_input, ap1_name, ap2_input, ap2_name, ap3_input, ap3_name, p_input, output, output2, output2_dir, dbkey, ref_column, galaxy_data_index_dir, heterochromatin_loc_file, ind_arg, het_arg, add_logs = sys.argv[1:] + +if ap3_input == '/dev/null': + populations = 2 +else: + populations = 3 chrom = 'all' -add_logs = '0' -loc_path = os.path.join(galaxy_data_index_dir, heterochromatin_loc_file) -location_file = LocationFile(loc_path) -heterochrom_path = location_file.get_values_if_exists(dbkey) -if heterochrom_path is None: +if het_arg == 'use_installed': + loc_path = os.path.join(galaxy_data_index_dir, heterochromatin_loc_file) + location_file = LocationFile(loc_path) + heterochrom_path = location_file.get_values_if_exists(dbkey) + if heterochrom_path is None: + heterochrom_path = '/dev/null' +elif het_arg == 'use_none': heterochrom_path = '/dev/null' +else: + heterochrom_path = het_arg population_list = [] p_total = Population() -p_total.from_tag_list(individual_metadata) +p_total.from_wrapped_dict(ind_arg) ap1 = Population(name='Ancestral population 1') ap1.from_population_file(ap1_input) population_list.append(ap1) if not p_total.is_superset(ap1): - print >> sys.stderr, 'There is an individual in ancestral population 1 that is not in the SNP table' - sys.exit(1) + gd_util.die('There is an individual in ancestral population 1 that is not in the SNP table') ap2 = Population(name='Ancestral population 2') ap2.from_population_file(ap2_input) population_list.append(ap2) if not p_total.is_superset(ap2): - print >> sys.stderr, 'There is an individual in ancestral population 2 that is not in the SNP table' - sys.exit(1) + gd_util.die('There is an individual in ancestral population 2 that is not in the SNP table') + +if populations == 3: + ap3 = Population(name='Ancestral population 3') + ap3.from_population_file(ap3_input) + population_list.append(ap3) + if not p_total.is_superset(ap3): + gd_util.die('There is an individual in ancestral population 3 that is not in the SNP table') p = Population(name='Potentially admixed') p.from_population_file(p_input) population_list.append(p) if not p_total.is_superset(p): - print >> sys.stderr, 'There is an individual in the population that is not in the SNP table' - sys.exit(1) + gd_util.die('There is an individual in the population that is not in the SNP table') -mkdir_p(output2_dir) +gd_util.mkdir_p(output2_dir) ################################################################################ # Create tabular file ################################################################################ -misc_file = os.path.join(output2_dir, 'misc.txt') +misc_file = os.path.join(output2_dir, 'summary.txt') prog = 'dpmix' + args = [ prog ] args.append(input) args.append(ref_column) @@ -104,33 +86,64 @@ columns = ap1.column_list() for column in columns: + col = int(column) + name = ap1.individual_with_column(column).name + first_token = name.split()[0] if input_type == 'gd_genotype': - args.append('{0}:1:{1}'.format(int(column) - 2, ap1.individual_with_column(column).name)) - else: - args.append('{0}:1:{1}'.format(column, ap1.individual_with_column(column).name)) + col -= 2 + args.append('{0}:1:{1}'.format(col, first_token)) columns = ap2.column_list() for column in columns: + col = int(column) + name = ap2.individual_with_column(column).name + first_token = name.split()[0] if input_type == 'gd_genotype': - args.append('{0}:2:{1}'.format(int(column) - 2, ap2.individual_with_column(column).name)) - else: - args.append('{0}:2:{1}'.format(column, ap2.individual_with_column(column).name)) + col -= 2 + args.append('{0}:2:{1}'.format(col, first_token)) + +if populations == 3: + columns = ap3.column_list() + for column in columns: + col = int(column) + name = ap3.individual_with_column(column).name + first_token = name.split()[0] + if input_type == 'gd_genotype': + col -= 2 + args.append('{0}:3:{1}'.format(col, first_token)) columns = p.column_list() for column in columns: + col = int(column) + name = p.individual_with_column(column).name + first_token = name.split()[0] if input_type == 'gd_genotype': - args.append('{0}:0:{1}'.format(int(column) - 2, p.individual_with_column(column).name)) - else: - args.append('{0}:0:{1}'.format(column, p.individual_with_column(column).name)) + col -= 2 + args.append('{0}:0:{1}'.format(col, first_token)) -run_program(None, args, stdout_file=output, space_to_tab=True) +with open(output, 'w') as fh: + gd_util.run_program(prog, args, stdout=fh) ################################################################################ # Create pdf file ################################################################################ -pdf_file = os.path.join(output2_dir, 'dpmix.pdf') -make_dpmix_plot(dbkey, output, pdf_file, galaxy_data_index_dir) +if populations == 3: + state2name = { + 0:'heterochromatin', + 1:ap1_name, + 2:ap2_name, + 3:ap3_name + } +else: + state2name = { + 0:'heterochromatin', + 1:ap1_name, + 2:ap2_name + } + +pdf_file = os.path.join(output2_dir, 'picture.pdf') +make_dpmix_plot(dbkey, output, pdf_file, galaxy_data_index_dir, state2name=state2name, populations=populations) ################################################################################ # Create html @@ -142,8 +155,8 @@ display_file = gd_composite.DisplayFile() display_value = gd_composite.DisplayValue() -out_pdf = gd_composite.Parameter(name='dpmix.pdf', value='dpmix.pdf', display_type=display_file) -out_misc = gd_composite.Parameter(name='misc.txt', value='misc.txt', display_type=display_file) +out_pdf = gd_composite.Parameter(name='picture.pdf', value='picture.pdf', display_type=display_file) +out_misc = gd_composite.Parameter(name='summary.txt', value='summary.txt', display_type=display_file) info_page.add_output_parameter(out_pdf) info_page.add_output_parameter(out_misc) @@ -168,4 +181,3 @@ sys.exit(0) - diff -r 91e835060ad2 -r 8997f2ca8c7a dpmix.xml --- a/dpmix.xml Mon Jun 03 12:29:29 2013 -0400 +++ b/dpmix.xml Mon Jul 15 10:47:35 2013 -0400 @@ -1,18 +1,37 @@ - : Map genomic intervals resembling specified ancestral populations + : Map genomic intervals resembling specified source populations - dpmix.py "$input" + #import json + #import base64 + #import zlib + #set $ind_names = $input.dataset.metadata.individual_names + #set $ind_colms = $input.dataset.metadata.individual_columns + #set $ind_dict = dict(zip($ind_names, $ind_colms)) + #set $ind_json = json.dumps($ind_dict, separators=(',',':')) + #set $ind_comp = zlib.compress($ind_json, 9) + #set $ind_arg = base64.b64encode($ind_comp) + dpmix.py '$input' #if $input_type.choice == '0' - "gd_snp" "$input_type.data_source" + 'gd_snp' '$input_type.data_source' #else if $input_type.choice == '1' - "gd_genotype" "1" + 'gd_genotype' '1' #end if - "$switch_penalty" "$ap1_input" "$ap2_input" "$p_input" "$output" "$output2" "$output2.files_path" "$input.dataset.metadata.dbkey" "$input.dataset.metadata.ref" "$GALAXY_DATA_INDEX_DIR" "gd.heterochromatic.loc" - #for $individual, $individual_col in zip($input.dataset.metadata.individual_names, $input.dataset.metadata.individual_columns) - #set $arg = '%s:%s' % ($individual_col, $individual) - "$arg" - #end for + #if $third_pop.choice == '0' + #set $ap3_arg = '/dev/null' + #set $ap3_name_arg = '' + #else if $third_pop.choice == '1' + #set $ap3_arg = $third_pop.ap3_input + #set $ap3_name_arg = $third_pop.ap3_input.name + #end if + #if $user_het.choice == '0' + #set $het_arg = 'use_installed' + #else if $user_het.choice == '1' + #set $het_arg = $user_het.het_file + #else if $user_het.choice == '2' + #set $het_arg = 'use_none' + #end if + '$switch_penalty' '$ap1_input' '$ap1_input.name' '$ap2_input' '$ap2_input.name' '$ap3_arg' '$ap3_name_arg' '$p_input' '$output' '$output2' '$output2.files_path' '$input.dataset.metadata.dbkey' '$input.dataset.metadata.ref' '$GALAXY_DATA_INDEX_DIR' 'gd.heterochromatic.loc' '$ind_arg' '$het_arg' '1' @@ -38,11 +57,43 @@ - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -88,27 +139,37 @@ **What it does** -The user specifies two "ancestral" populations (i.e., sources for -chromosomes) and a set of potentially admixed individuals, and chooses -between the sequence coverage or the estimated genotypes to measure -the similarity of genomic intervals in admixed individuals to the two -classes of ancestral chromosomes. The user also picks a "genotype switch penalty", -typically between 10 and 100. For each potentially admixed individual, -the program divides the genome into three "genotypes": (0) homozygous -for the first ancestral population (i.e., both chromosomes from that -population), (1) heterozygous, or (2) homozygous for the second ancestral -population. Parts of a chromosome that are labeled as "heterochromatic" -are given the non-genotype "3". Smaller values of the switch penalty -(corresponding to more ancient admixture events) generally lead to the -reconstruction of more frequent changes between genotypes. +The user specifies two or three source populations (i.e., sources +for chromosomes) and a set of potentially admixed individuals, and +chooses between the sequence coverage or the estimated genotypes to +measure the similarity of genomic intervals in admixed individuals to +the three classes of source chromosomes. The user also specifies a +"switch penalty", controlling the strength of evidence needed to switch +between source populations as the the program scans along a chromosome. +Choice of picksan appropriate value depends on the number of SNPs and, to +a lesser extent, on the time since the admixture events. With several +million SNPs genome-wide, reasonable values might fall between 10 +and 100. If there are 3 source populatons, then for each potentially +admixed individual the program divides the genome into six "genotypes": + +1. homozygous for the first source population (i.e., both chromosomes from that population), +2. homozygous for the second source population, +3. homozygous for the third source population, +4. heterozygous for the first and second populations (i.e., one chromosome from each), +5. heterozygous for the first and third populations, or +6. heterozygous for the second and third populations. + +Parts of a reference chromosome that are labeled as "heterochromatic" +are given the "non-genotype" 0. With two source populations, only +"genotypes" 1, 2 and 3 are possible, where 3 now means heterozygous in +the two source populations. There are two output datasets generated. A tabular dataset with chromosome, start, stop, and pairs of columns containing the "genotypes" from above and label from the admixed individual. The second dataset is a composite dataset with general information from the run and a link to a pdf which -graphically shows the ancestral population along each of the chromosomes. +graphically shows the source population along each of the chromosomes. The second link is to a text file with summary information of the "genotypes" over the whole genome. - diff -r 91e835060ad2 -r 8997f2ca8c7a dpmix_plot.py --- a/dpmix_plot.py Mon Jun 03 12:29:29 2013 -0400 +++ b/dpmix_plot.py Mon Jul 15 10:47:35 2013 -0400 @@ -37,6 +37,7 @@ individuals = [] data = {} chrom_len = {} + used_states = [] with open(input_file) as fh: for line in fh: @@ -47,6 +48,9 @@ p1, p2, state = map(int, elems[1:4]) id = elems[4] + if state not in used_states: + used_states.append(state) + if chrom not in chroms: chroms.append(chrom) @@ -60,7 +64,7 @@ if p2 > chrom_len.setdefault(chrom, 0): chrom_len[chrom] = p2 - return chroms, individuals, data, chrom_len + return chroms, individuals, data, chrom_len, used_states def check_chroms(chroms, chrom_len, dbkey): error = 0 @@ -112,15 +116,43 @@ patch2 = make_rectangle(p1, p2, top_color, bottom=0.5) return [patch1, patch2] -def make_state_rectangle(p1, p2, state, chrom, individual): +def make_state_rectangle_2pop(p1, p2, state, chrom, individual): + p1_color = 'r' + p2_color = 'g' + heterochromatin_color = '#c7c7c7' + if state == 0: - return [ make_rectangle(p1, p2, 'r') ] + return [ make_rectangle(p1, p2, heterochromatin_color) ] elif state == 1: - return make_split_rectangle(p1, p2, 'r', 'g') + return [ make_rectangle(p1, p2, p1_color) ] elif state == 2: - return [ make_rectangle(p1, p2, 'g') ] + return [ make_rectangle(p1, p2, p2_color) ] elif state == 3: - return [ make_rectangle(p1, p2, '#c7c7c7') ] + return make_split_rectangle(p1, p2, p1_color, p2_color) + else: + print >> sys.stderr, "Unknown state: {0}: {1} {2} {3} {4}".format(state, chrom, p1, p2, state, individual) + sys.exit(1) + +def make_state_rectangle_3pop(p1, p2, state, chrom, individual): + p1_color = 'r' + p2_color = 'g' + p3_color = 'b' + heterochromatin_color = '#c7c7c7' + + if state == 0: + return [ make_rectangle(p1, p2, heterochromatin_color) ] + if state == 1: + return [ make_rectangle(p1, p2, p1_color) ] + if state == 2: + return [ make_rectangle(p1, p2, p2_color) ] + if state == 3: + return [ make_rectangle(p1, p2, p3_color) ] + if state == 4: + return make_split_rectangle(p1, p2, p1_color, p2_color) + if state == 5: + return make_split_rectangle(p1, p2, p1_color, p3_color) + if state == 6: + return make_split_rectangle(p1, p2, p2_color, p3_color) else: print >> sys.stderr, "Unknown state: {0}: {1} {2} {3} {4}".format(state, chrom, p1, p2, state, individual) sys.exit(1) @@ -195,9 +227,16 @@ ################################################################################ -def make_dpmix_plot(input_dbkey, input_file, output_file, galaxy_data_index_dir): +def make_dpmix_plot(input_dbkey, input_file, output_file, galaxy_data_index_dir, state2name=None, populations=3): fs_chrom_len = build_chrom_len_dict(input_dbkey, galaxy_data_index_dir) - chroms, individuals, data, chrom_len = parse_input_file(input_file) + chroms, individuals, data, chrom_len, used_states = parse_input_file(input_file) + + if populations == 3: + make_state_rectangle = make_state_rectangle_3pop + elif populations == 2: + make_state_rectangle = make_state_rectangle_2pop + else: + pass for chrom in chrom_len.keys(): if chrom in fs_chrom_len: @@ -214,7 +253,26 @@ ind_height = 0.25 total_height = 0.0 - at_top = True + + ## make a legend + ## only print out states that are + ## 1) in the data + ## - AND - + ## 2) in the state2name map + ## here, we only calculate the space needed + legend_states = [] + if state2name is not None: + for state in used_states: + if state in state2name: + legend_states.append(state) + + if legend_states: + total_height += len(legend_states) * (ind_space + ind_height) + total_height += (top_space - ind_space) + at_top = False + else: + at_top = True + for chrom in chroms: if at_top: total_height += (top_space + chrom_height) @@ -235,7 +293,29 @@ fig = plt.figure(figsize=(width, height)) - at_top = True + if legend_states: + at_top = True + for state in sorted(legend_states): + if at_top: + bottom -= (top_space + ind_height)/height + at_top = False + else: + bottom -= (ind_space + ind_height)/height + ## add code here to draw legend + # [left, bottom, width, height] + ax1 = fig.add_axes([0.0, bottom, 0.09, ind_height/height]) + plt.axis('off') + ax1.set_xlim(0, 1) + ax1.set_ylim(0, 1) + for patch in make_state_rectangle(0, 1, state, 'legend', state2name[state]): + ax1.add_patch(patch) + + ax2 = fig.add_axes([0.10, bottom, 0.88, ind_height/height], frame_on=False) + plt.axis('off') + plt.text(0.0, 0.5, state2name[state], fontsize=10, ha='left', va='center') + else: + at_top = True + for_webb = False for chrom in chroms: @@ -283,6 +363,7 @@ else: plt.axis('off') for p1, p2, state in sorted(data[chrom][individual]): + #for patch in make_state_rectangle(p1, p2, state, chrom, individual): for patch in make_state_rectangle(p1, p2, state, chrom, individual): ax2.add_patch(patch) @@ -295,3 +376,6 @@ make_dpmix_plot(input_dbkey, input_file, output_file, galaxy_data_index_dir) sys.exit(0) +## notes +# 1) pass in a state to name mapping +# 2) only print out names for states which exist in the data, and are in the state to name mapping diff -r 91e835060ad2 -r 8997f2ca8c7a draw_variants.py --- a/draw_variants.py Mon Jun 03 12:29:29 2013 -0400 +++ b/draw_variants.py Mon Jul 15 10:47:35 2013 -0400 @@ -1,54 +1,29 @@ #!/usr/bin/env python +import gd_util import sys -import subprocess from Population import Population -def run_program(prog, args, stdout_file=None, space_to_tab=False): - #print "args: ", ' '.join(args) - p = subprocess.Popen(args, bufsize=-1, executable=prog, stdin=None, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - (stdoutdata, stderrdata) = p.communicate() - rc = p.returncode - - if stdout_file is not None: - with open(stdout_file, 'w') as ofh: - lines = stdoutdata.split('\n') - for line in lines: - line = line.strip() - if line: - if space_to_tab: - line = line.replace(' ', '\t') - print >> ofh, line - - if rc != 0: - print >> sys.stderr, "FAILED: rc={0}: {1}".format(rc, ' '.join(args)) - print >> sys.stderr, stderrdata - sys.exit(1) - ################################################################################ -if len(sys.argv) < 10: - print >> sys.stderr, "Usage" - sys.exit(1) +if len(sys.argv) != 10: + gd_util.die('Usage') -snp_input, indel_input, coverage_input, annotation_input, indiv_input, ref_name, min_coverage, output = sys.argv[1:9] -individual_metadata = sys.argv[9:] +snp_input, indel_input, coverage_input, annotation_input, indiv_input, ref_name, min_coverage, output, ind_arg = sys.argv[1:] p_total = Population() -p_total.from_tag_list(individual_metadata) +p_total.from_wrapped_dict(ind_arg) p1 = Population() p1.from_population_file(indiv_input) if not p_total.is_superset(p1): - print >> sys.stderr, 'There is an individual in the population individuals that is not in the SNP table' - sys.exit(1) + gd_util.die('There is an individual in the population individuals that is not in the SNP table') ################################################################################ prog = 'mk_Ji' args = [ prog ] - args.append(snp_input) args.append(indel_input) args.append(coverage_input) @@ -59,7 +34,8 @@ for tag in p1.tag_list(): args.append(tag) -run_program(prog, args, stdout_file='mk_Ji.out') +with open('mk_Ji.out', 'w') as fh: + gd_util.run_program(prog, args, stdout=fh) ################################################################################ @@ -67,12 +43,14 @@ args = [ prog ] args.append('-w') -args.append('3') +args.append(3) args.append('-s') -args.append('0.3') +args.append(0.3) args.append('-g') -args.append('0.2') +args.append(0.2) args.append('mk_Ji.out') -run_program(prog, args, stdout_file=output) +with open(output, 'w') as fh: + gd_util.run_program(prog, args, stdout=fh) + sys.exit(0) diff -r 91e835060ad2 -r 8997f2ca8c7a draw_variants.xml --- a/draw_variants.xml Mon Jun 03 12:29:29 2013 -0400 +++ b/draw_variants.xml Mon Jul 15 10:47:35 2013 -0400 @@ -2,11 +2,16 @@ variants - draw_variants.py "$input" "$indel_input" "$coverage_input" "$annotation_input" "$indiv_input" "$ref_name" "$min_coverage" "$output" - #for $individual, $individual_col in zip($input.dataset.metadata.individual_names, $input.dataset.metadata.individual_columns) - #set $arg = '%s:%s' % ($individual_col, $individual) - "$arg" - #end for + #import json + #import base64 + #import zlib + #set $ind_names = $input.dataset.metadata.individual_names + #set $ind_colms = $input.dataset.metadata.individual_columns + #set $ind_dict = dict(zip($ind_names, $ind_colms)) + #set $ind_json = json.dumps($ind_dict, separators=(',',':')) + #set $ind_comp = zlib.compress($ind_json, 9) + #set $ind_arg = base64.b64encode($ind_comp) + draw_variants.py '$input' '$indel_input' '$coverage_input' '$annotation_input' '$indiv_input' '$ref_name' '$min_coverage' '$output' '$ind_arg' diff -r 91e835060ad2 -r 8997f2ca8c7a filter_gd_snp.py --- a/filter_gd_snp.py Mon Jun 03 12:29:29 2013 -0400 +++ b/filter_gd_snp.py Mon Jul 15 10:47:35 2013 -0400 @@ -1,25 +1,11 @@ #!/usr/bin/env python +import gd_util import sys -import subprocess from Population import Population ################################################################################ -def convert_non_negative_int(string_value): - try: - val = int(string_value) - except: - print >> sys.stderr, '"%s" is not an integer' % string_value - sys.exit(1) - - if val < 0: - print >> sys.stderr, '"%d" is negative' % val - sys.exit(1) - - return val - - def convert_percent(string_value): if string_value.endswith('%'): val = convert_non_negative_int(string_value[:-1]) @@ -32,51 +18,66 @@ return str(val) +def convert_non_negative_int(string_value): + try: + val = int(string_value) + except: + print >> sys.stderr, '"%s" is not an integer' % string_value + sys.exit(1) + + if val < 0: + print >> sys.stderr, '"%d" is negative' % val + sys.exit(1) + + return val + ################################################################################ -if len(sys.argv) < 9: - print >> sys.stderr, "Usage" - sys.exit(1) +if len(sys.argv) != 13: + gd_util.die('Usage') -input, p1_input, output, lo, hi, lo_ind, lo_ind_qual = sys.argv[1:8] -individual_metadata = sys.argv[8:] +input, output, ref_chrom_col, min_spacing, lo_genotypes, p1_input, input_type, lo_coverage, hi_coverage, low_ind_cov, low_quality, ind_arg = sys.argv[1:] p_total = Population() -p_total.from_tag_list(individual_metadata) +p_total.from_wrapped_dict(ind_arg) p1 = Population() p1.from_population_file(p1_input) if not p_total.is_superset(p1): - print >> sys.stderr, 'There is an individual in the population that is not in the SNP table' - sys.exit(1) + gd_util.die('There is an individual in the population that is not in the SNP table') + +lo_coverage = convert_percent(lo_coverage) +hi_coverage = convert_percent(hi_coverage) -lo = convert_percent(lo) -hi = convert_percent(hi) +if input_type == 'gd_snp': + type_arg = 1 +elif input_type == 'gd_genotype': + type_arg = 0 +else: + gd_util.die('unknown input_type: {0}'.format(input_type)) ################################################################################ prog = 'filter_snps' -args = [] -args.append(prog) -args.append(input) -args.append(lo) -args.append(hi) -args.append(lo_ind) -args.append(lo_ind_qual) +args = [ prog ] +args.append(input) # file containing a Galaxy table +args.append(type_arg) # 1 for a gd_snp file, 0 for gd_genotype +args.append(lo_coverage) # lower bound on total coverage (< 0 means interpret as percentage) +args.append(hi_coverage) # upper bound on total coveraae (< 0 means interpret as percentage) +args.append(low_ind_cov) # lower bound on individual coverage +args.append(low_quality) # lower bound on individual quality value +args.append(lo_genotypes) # lower bound on the number of defined genotypes +args.append(min_spacing) # lower bound on the spacing between SNPs +args.append(ref_chrom_col) # reference-chromosome column (base-1); ref position in next column columns = p1.column_list() - for column in sorted(columns): - args.append(column) + args.append(column) # the starting columns (base-1) for the chosen individuals -fh = open(output, 'w') - -#print "args:", ' '.join(args) -p = subprocess.Popen(args, bufsize=-1, stdin=None, stdout=fh, stderr=sys.stderr) -rc = p.wait() -fh.close() +with open(output, 'w') as fh: + gd_util.run_program(prog, args, stdout=fh) sys.exit(0) diff -r 91e835060ad2 -r 8997f2ca8c7a filter_gd_snp.xml --- a/filter_gd_snp.xml Mon Jun 03 12:29:29 2013 -0400 +++ b/filter_gd_snp.xml Mon Jul 15 10:47:35 2013 -0400 @@ -1,39 +1,70 @@ - - : Discard some SNPs based on coverage or quality + + : Discard some SNPs based on coverage, quality or spacing - filter_gd_snp.py "$input" "$p1_input" "$output" "$lo_coverage" "$hi_coverage" "$low_ind_cov" "$lo_quality" - #for $individual, $individual_col in zip($input.dataset.metadata.individual_names, $input.dataset.metadata.individual_columns) - #set $arg = '%s:%s' % ($individual_col, $individual) - "$arg" - #end for + #import json + #import base64 + #import zlib + #set $ind_names = $input.dataset.metadata.individual_names + #set $ind_colms = $input.dataset.metadata.individual_columns + #set $ind_dict = dict(zip($ind_names, $ind_colms)) + #set $ind_json = json.dumps($ind_dict, separators=(',',':')) + #set $ind_comp = zlib.compress($ind_json, 9) + #set $ind_arg = base64.b64encode($ind_comp) + filter_gd_snp.py '$input' '$output' + #if str($input.dataset.metadata.dbkey) == '?' + '0' + #else + '$input.dataset.metadata.ref' + #end if + '$min_spacing' '$lo_genotypes' '$input_type.p1_input' + #if $input_type.choice == '0' + 'gd_snp' '$input_type.lo_coverage' '$input_type.hi_coverage' '$input_type.low_ind_cov' '$input_type.lo_quality' + #else if $input_type.choice == '1' + 'gd_genotype' '0' '0' '0' '0' + #end if + '$ind_arg' - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + @@ -52,10 +83,11 @@ **Dataset formats** -The input datasets are in gd_snp_ and gd_indivs_ formats. -The output dataset is in gd_snp_ format. (`Dataset missing?`_) +The input datasets are in gd_snp_, gd_genotype_, and gd_indivs_ formats. +The output dataset is in gd_snp_ or gd_genotype_ format. (`Dataset missing?`_) .. _gd_snp: ./static/formatHelp.html#gd_snp +.. _gd_genotype: ./static/formatHelp.html#gd_genotype .. _gd_indivs: ./static/formatHelp.html#gd_indivs .. _Dataset missing?: ./static/formatHelp.html @@ -63,19 +95,25 @@ **What it does** -The user specifies that some of the individuals in a gd_snp dataset form a -"population", by supplying a list that has been previously created using the -Specify Individuals tool. SNPs are then discarded if their total coverage -for the population is too low or too high, or if their coverage or quality -score for any individual in the population is too low. +For a gd_snp dataset, the user specifies that some of the individuals +form a "population", by supplying a list that has been previously created +using the Specify Individuals tool. SNPs are then discarded if their +total coverage for the population is too low or too high, or if their +coverage or quality score for any individual in the population is too low. The upper and lower bounds on total population coverage can be specified -either as read counts or as percentiles (e.g. "5%", with no decimal places). -For percentile bounds the SNPs are ranked by read count, so for example, a -lower bound of "10%" means that the least-covered 10% of the SNPs will be -discarded, while an upper bound of, say, "80%" will discard all SNPs above -the 80% mark, i.e. the top 20%. The threshold for the lower bound on -individual coverage can only be specified as a plain read count. +either as read counts or as percentiles (e.g. "5%", with no decimal +places). For percentile bounds the SNPs are ranked by read count, so +for example, a lower bound of "10%" means that the least-covered 10% +of the SNPs will be discarded, while an upper bound of, say, "80%" will +discard all SNPs above the 80% mark, i.e. the top 20%. The threshold +for the lower bound on individual coverage can only be specified as a +plain read count. + +For either a gd_snp or gd_genotype dataset, the user can specify a +minimum number of defined genotypes (i.e., not -1) and/or a minimum +spacing relative to the reference sequence. An error is reported if the +user requests a minimum spacing but no reference sequence is available. ----- diff -r 91e835060ad2 -r 8997f2ca8c7a find_intervals.xml --- a/find_intervals.xml Mon Jun 03 12:29:29 2013 -0400 +++ b/find_intervals.xml Mon Jul 15 10:47:35 2013 -0400 @@ -49,7 +49,7 @@ - + diff -r 91e835060ad2 -r 8997f2ca8c7a gd_util.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gd_util.py Mon Jul 15 10:47:35 2013 -0400 @@ -0,0 +1,77 @@ +#!/usr/bin/env python + +import base64 +import errno +import os +import subprocess +import sys +import zlib + +################################################################################ + +def die(message): + print >> sys.stderr, message + sys.exit(1) + +################################################################################ + +def mkdir_p(path): + try: + os.makedirs(path) + except OSError, e: + if e.errno <> errno.EEXIST: + raise + +################################################################################ + +def run_program(prog, args, stdout=subprocess.PIPE, stderr=subprocess.PIPE): + kwargs = { + "bufsize": -1, + "close_fds": False, + "creationflags": 0, + "cwd": None, + "env": None, + "executable": prog, + "preexec_fn": None, + "shell": False, + "startupinfo": None, + "stderr": stderr, + "stdin": None, + "stdout": stdout, + "universal_newlines": False + } + + str_args = [str(x) for x in args] + + p = subprocess.Popen(str_args, **kwargs) + (stdoutdata, stderrdata) = p.communicate() + rc = p.returncode + + if rc != 0: + die('FAILED:\n{0}\nCOMMAND:\n{1}'.format(stderrdata, ' '.join(str_args))) + + return stdoutdata, stderrdata + +################################################################################ + +def unwrap_string(string): + try: + decoded_string = base64.b64decode(string) + except TypeError, message: + die('base64.b64decode: {0}: {1}'.format(message, string)) + + try: + return zlib.decompress(decoded_string) + except zlib.error, message: + die('zlib.decompress: {0}'.format(message)) + +################################################################################ + +def wrap_string(string, level=9): + try: + compressed_string = zlib.compress(string, level) + except zlib.error, message: + die('zlib.compress: {0}'.format(message)) + return base64.b64encode(compressed_string) + +################################################################################ diff -r 91e835060ad2 -r 8997f2ca8c7a genome_diversity/src/Makefile --- a/genome_diversity/src/Makefile Mon Jun 03 12:29:29 2013 -0400 +++ b/genome_diversity/src/Makefile Mon Jul 15 10:47:35 2013 -0400 @@ -29,7 +29,7 @@ $(CC) $(CFLAGS) $^ -o $@ dpmix: dpmix.c lib.c - $(CC) $(CFLAGS) $^ -o $@ + $(CC) $(CFLAGS) $^ -lm -o $@ eval2pct: eval2pct.c lib.c $(CC) $(CFLAGS) $^ -o $@ diff -r 91e835060ad2 -r 8997f2ca8c7a genome_diversity/src/dpmix.c --- a/genome_diversity/src/dpmix.c Mon Jun 03 12:29:29 2013 -0400 +++ b/genome_diversity/src/dpmix.c Mon Jul 15 10:47:35 2013 -0400 @@ -1,30 +1,59 @@ -/* dpmix -- admixture using dynamic programming +/* dpmix -- admixture using dynamic programming; 2 or 3 source populations * -* argv{1] = a Galaxy SNP table. For each of several individuals, the table -* has four columns (#A, #B, genotype, quality) -- SNPs on the same -* chromosome must appear together, and in order of position +* argv{1] = a Galaxy SNP table. For each individual, the table may have +* four columns (#A, #B, genotype, quality), or only one +* (genotype). SNPs on the same chromosome must appear together, +* and in order of position * argv[2] = column with the chromosome name (position is the next column) * argv[3] = "all" or e.g., "chr20" -* argv[4] = 1 if ancestral allele frequencies are estimated from SAMtools -* genotypes; 0 means use read-coverage data. -* argv[5] = 1 to add logarithms of probabilities, allowing unobserve alleles, +* argv[4] = 1 if source-pop allele frequencies are estimated from genotypes; +* 0 means use read-coverage data. +* argv[5] = 1 to add logarithms of probabilities; * 0 to simply add probabilities * argv[6] = switch penalty (>= 0) -* argv[7] = file giving heterochromatic intervals ('-' means that no file is -* given) +* argv[7] = file giving heterochromatic intervals ('-' = no file is given) * argv[8] = file name for additional output -* argv[9], argv[10], ..., have the form "13:1:Peter", "13:2:Paul" or -* "13:0:Mary", meaning that the 13th and 14th columns (base 1) -* give the allele counts for an individual that is in ancestral -* population 1, ancestral population 2, or is a potentially admixed -* individual, resp. +* argv[9], argv[10], ... = like "13:1:Peter", "13:2:Paul", "13:3:Sam" +* or "13:0:Mary", meaning that the 13th and 14th columns (base 1) +* give the allele counts for an individual that is in source +* population 1, source population 2, source population 3, +* or is a potentially admixed individual, resp. Even when only the +* genotype is given, it is found in column 15. +* optional last arguments = "debug" What it does on Galaxy -The user specifies two "ancestral" populations (i.e., sources for chromosomes) and a set of potentially admixed individuals, and chooses between the sequence coverage or the estimated genotypes to measure the similarity of genomic intervals in admixed individuals to the two classes of ancestral chromosomes. The user also picks a "switch penalty", typically between 10 and 100. For each potentially admixed individual, the program divides the genome into three "genotypes": (0) homozygous for the second ancestral population (i.e., both chromosomes from that population), (1) heterozygous, or (2) homozygous for the second ancestral population. Parts of a reference chromosome that are labeled as "heterochromatic" are given the non-genotype, 3. Smaller values of the switch penalty (corresponding to more ancient admixture events) generally lead to the reconstruction of more frequent changes between genotypes. +The user specifies two or three source populations (i.e., sources for chromosomes) and a set of potentially admixed individuals, and chooses between the sequence coverage or the estimated genotypes to measure the similarity of genomic intervals in admixed individuals to the source chromosomes. The user also specifies a "switch penalty", controlling the strength of evidence needed to switch between source populations as the the program scans along a chromosome. + +Choice of an appropriate switch penalty depends on the number of SNPs and the time since the admixture event(s), and it is not always easy to determine a good setting ahead of time. One approach is to try, say, 10.0, and look at the resulting picture. If the admixture blocks are shorter than anticipated, based on expectations about the number of generations since the admixture event(s) (i.e., the implied admixture events are too ancient), then try a larger value. Conversely, if the blocks are longer than anticipated (i.e., the implied timing of admixture is too recent), then lower the penalty. In any case, it is prudent to base conclusions on results consistently obtained with a range of switch penalities. + +If there are 3 source populatons, then for each potentially admixed individual the program divides each potentially admixed genome into six "genotypes": + +(1) homozygous for the first source population (i.e., both chromosomes from that population), +(2) homozygous for the second source population, +(3) homozygous for the third source population, +(4) heterozygous for the first and second populations (i.e., one chromosome from each), +(5) heterozygous for the first and third populations, or +(6) heterozygous for the second and third populations. + +Parts of a reference chromosome that are labeled as "heterochromatic" are given the "non-genotype" 0. With two source populations, there are only three possible "genotypes". + +There are two Galaxy history-items generated. The first is a tabular dataset with chromosome, start, stop, and pairs of columns containing the "genotypes" as described above and the label from the admixed individual. It is used to draw a pciture and can be consulted for details. The second item is a composite of (1) a link to a pdf which graphically shows the inferred source population for each potentially admixed individual along each chromosome, and (2) a link to a text file that describes the run and summarizes the extent of predicted admixture in each individual. + +For certain genome assemblies, Galaxy stores a "heterochromatin file", which specifies the heterochromatic intervals on each chromosome, as well as telling the chromosome lengths. For other genome assemblies, the user can supply this information. Here are the first few lines of a heterochromatin file for human assembly hg19: +chr1 121485434 142535434 +chr1 249250621 249250621 +chr2 90545103 95326171 +chr2 243199373 243199373 +This gives the centromeres and lengths for chromosomes 1 and 2. Chromosomal addresses begin at 0, and the stated end position is 1 beyond the last address. For instance the interval described by the second line has length 0; it tells Galaxy that chromosome 1 is 249,250,621 bp in length. The entries for an acrocentric chromosome looks like the following: +chr22 0 16050000 +chr22 51304566 51304566 +The file must be ordered by chromosome name (starting with autosomes), and by position within a chromosome. */ #include "lib.h" -//#include +#include + +//#define ABT // maximum length of a line from the table #define MOST 50000 @@ -32,8 +61,8 @@ // we create a linked list of "events" on a chromosome -- mostly SNPs, but // also ends of hetorochomatic intervals struct snp { - double F1, F2; // reference allele frequencies in the two populations - int pos, *g, // position and an array of admixed genotypes + double F1, F2, F3; // ref. allele frequencies in the three populations + int pos, *g, // position; genotypes of admixed individuals type; // 0 = SNP, 1 = start of het. interval, 2 = end struct snp *prev; // we keep the list in order of decreasing pos } *last; @@ -41,11 +70,11 @@ // array of potentially admixed individuals struct admixed { char *name; - int gcol, ge20, gt02; - long long x[4]; // number of reference bp in each state + int gcol; + double x[7]; // number of reference bp in each state } A[MOST]; -// information about "ancestral" individuals, namely column and population +// information about source-population individuals struct ances { int col, pop; char *name; @@ -58,56 +87,75 @@ } H[MOST]; // global variables -int *B[4], // backpointer to state at the previous SNP (or event) +int *B[7], // backpointer to state at the previous SNP (or event) *P; // chromosome position -int nH, nI, nG, genotypes, nsnp, debug, chr_col, logs; +int nH, nI, nG, genotypes, nsnp, debug, chr_col, logs, last_snp_pos, pop3; char this_chr[100]; double switch_penalty; char buf[MOST], *status; FILE *fp, *out; -// probability of producing genotype g in admixture state s -// given reference allele frequencies f1 and f2 in the ancestral populations -double score (double f1, double f2, int g, int s) { +#define LOG_ZERO -4.0 +// probability of producing genotype g in admixture state s = 1 to 6 +// given source-population ref. allele frequencies f1, f2, and f3 +double score (double f1, double f2, double f3, int g, int s) { double p; - if (s == 2) { // homozygous for the first ancestral population + if (g < 0 || g > 2) + fatalf("bad genotype %d", g); + if (s == 1) { // homozygous for the first source population if (g == 2) p = f1*f1; else if (g == 0) p = (1.0-f1)*(1.0-f1); else p = 2.0*f1*(1.0-f1); - } else if (s == 0) { // homozygous for the second ancestral population + } else if (s == 2) { // homozygous for the second source population if (g == 2) p = f2*f2; else if (g == 0) p = (1.0-f2)*(1.0-f2); else p = 2.0*f2*(1.0-f2); - } else { // one chromosome from each ancestral population - if (s != 1) - fatalf("bad state %d", s); + } else if (s == 3) { // homozygous for the third source population + if (g == 2) + p = f3*f3; + else if (g == 0) + p = (1.0-f3)*(1.0-f3); + else + p = 2.0*f3*(1.0-f3); + } else if (s == 4) { // one chromosome each from source pops 1 and 2 if (g == 2) p = f1*f2; else if (g == 0) p = (1.0-f1)*(1.0-f2); else p = f1*(1.0-f2) + (1.0-f1)*f2; - } + } else if (s == 5) { // one chromosome each from source pops 1 and 3 + if (g == 2) + p = f1*f3; + else if (g == 0) + p = (1.0-f1)*(1.0-f3); + else + p = f1*(1.0-f3) + (1.0-f1)*f3; + } else if (s == 6) { // one chromosome each from source pops 2 and 3 + if (g == 2) + p = f2*f3; + else if (g == 0) + p = (1.0-f2)*(1.0-f3); + else + p = f2*(1.0-f3) + (1.0-f2)*f3; + } else + fatalf("bad state %d", s); if (p < 0.0) - fatalf("%f %f %d %d => %f", f1, f2, g, s, p); + fatalf("%f %f %f %d %d => %f", f1, f2, f3, g, s, p); if (!logs) return p; -#ifdef NEVER if (p == 0.0) - return -5.0; - p = log(p); - if (p < -5.0) - p = -5.0; + return LOG_ZERO; + p = MAX(log(p), LOG_ZERO); return p; -#endif fatal("dpmix: cannot happen"); } @@ -115,31 +163,38 @@ static char tmp[MOST]; char *s, *z = "\t\n"; int i = chr_col; + static int autosome_warning = 0; strcpy(tmp, buf); s = strtok(tmp, z); while (--i > 0) s = strtok(NULL, z); + if (!autosome_warning && strncmp(s, "chr", 3) == 0 && + !isdigit(s[3])) { + fprintf(out, + "WARNING: results assume diploid (non-sex) chromosomes\n\n"); + autosome_warning = 1; + } return s; } /* Process the a-th potentially admixed individual. * We think of a graph with nodes (event, state) for each event (SNP or * end-point of a heterochromatic interval on the current chromosome) and state -* = 0, 1, 2, 3 (corresponding to genotypes 0, 1, and 2, plus 3 = -* heterochromatin); for events other than the last one, there are edges from -* each (event, state) to (event+1, k) for 0 <= k <= 3. An edge (event, j) to +* = 0 through 7 (corresponding to genotypes 1 to 6, plus 0 = +* heterochromatin); for events where state != 0, there are 7 edges from +* each (event, state) to (event+1, k) for 0 <= k <= 6. An edge (event, j) to * (event+1, k) has penalty 0 if j = k and penalty switch_penalty otherwise. -* The bonus at SNP node (event, state) for 0 <= state <= 2 is the probability +* The bonus at SNP node (event, state) for 1 <= state <= 6 is the probability * of generating the genotype observed in the a-th potentially admixed -* individual given the allele frequences in the two ancestral populations and +* individual given the allele frequences in the source populations and * the assumed admixture state in this region of the chromosome. The score of a * path is the sum of the node bonuses minus the sum of the edge penalties. * * Working backwards through the events, we compute the maximum path score, -* from[state], from (event,state) back to the closest admixed interval. -* To force paths to reach state 3 at an event signalling the start of a -* heterochromatic interval (type = 1), but to avoid state 3 at other events, +* from[state], from (event,state) back to the closest heterochromatin interval. +* To force paths to reach state 0 at an event signalling the start of a +* heterochromatic interval (type = 1), but to avoid state 0 at other events, * we assign huge but arbitrary negative scores (see "avoid", below). * At (event,state), B[event][state] is the backpointer to the state at * event+1 on an optimal path. Finally, we follow backpointers to partition @@ -147,71 +202,96 @@ */ void one_admix(int a) { int i, j, m, state, prev_pos, b; - double from[4], f[4], ff[4], avoid = -1000000.0; + double from[7], f[7], ff[7], avoid = -1000000.0; struct snp *p; // from[i] = highest score of a path from the current event // (usually a SNP) to the next (to the right) heterochromatic interval // or the end of the chromosome. The score of the path is the sum of // SNP scores minus (switch_penalty times number of state switches). - // We assume that the last two event on the chromosome are the start - // and end of a heterochromatic interval (possibly of length 0)/ - for (i = 0; i < 4; ++i) + // We assume that the last two events on the chromosome are the start + // and end of a heterochromatic interval (possibly of length 0) + for (i = 0; i < 7; ++i) from[i] = 0; for (i = nsnp-1, p = last; i >= 0 && p != NULL; --i, p = p->prev) { if (p->type == 0 && p->g[a] == -1) { // no genotype // no state change at this SNP - for (state = 0; state < 4; ++state) + for (state = 0; state < 7; ++state) B[state][i] = state; continue; } +#ifdef ABT +fprintf(stderr, "%d: ABT genotype=%d, KHO %f,%f YRI %f,%f\n", + p->pos, p->g[a], p->F1, 1.0-p->F1, p->F2, 1.0-p->F2); +#endif - for (state = 0; state < 4; ++state) { + for (state = 0; state < 7; ++state) { // find highest path-score from this event onward - for (m = j = 0; j < 4; ++j) { + for (m = j = 0; j < 7; ++j) { f[j] = from[j]; if (j != state) f[j] -= switch_penalty; - //if (abs(j-state) == 2) - //from[j] -= switch_penalty; if (f[j] > f[m]) m = j; } B[state][i] = m; ff[state] = f[m]; - if (state < 3 && p->type == 0) + if (state > 0 && p->type == 0) +{ ff[state] += - score(p->F1, p->F2, p->g[a], state); + score(p->F1, p->F2, p->F3, p->g[a], state); +#ifdef ABT +if (state == 2) + fprintf(stderr, " YRI:"); +if (state == 4) + fprintf(stderr, " hetero:"); +if (state == 1) + fprintf(stderr, " KHO:"); +if (pop3 || state == 1 || state == 2 || state == 4) +fprintf(stderr, " score=%f ff=%f B = %d\n", + score(p->F1, p->F2, p->F3, p->g[a], state), ff[state], m); +#endif +} } if (p->type == 1) { // start of heterochomatic interval. Force paths - // reaching this point to go through state 3 - from[3] = 0; - from[0] = from[1] = from[2] = avoid; + // reaching this point to go through state 0 + from[0] = 0; + for (j = 1; j < 7; ++j) + from[j] = avoid; } else { - for (j = 0; j < 3; ++j) + for (j = 1; j < 7; ++j) from[j] = ff[j]; - from[3] = avoid; + from[0] = avoid; } - if (debug) - fprintf(stderr, "%d: %f(%d) %f(%d) %f(%d) %f(%d)\n", - i, from[0], B[0][i], from[1], B[1][i], from[2], - B[2][i], from[3], B[3][i]); + if (debug) { + fprintf(stderr, "%d:", i); + for (j = 0; j < 7; ++j) { + if (pop3 || j == 3 || j == 5 || j == 6) + fprintf(stderr, " %f(%d)", from[j], B[j][i]); + } + putc('\n', stderr); + } } // find the best initial state - for (state = 0, j = 1; j < 4; ++j) + for (state = 0, j = 1; j < 7; ++j) if (from[j] > from[state]) state = j; +#ifdef ABT +fprintf(stderr, "start backtrace in state %d\n", state); +#endif + // trace back to find the switch points // A[a].x[state] records the total length of intervals in each state for (prev_pos = i = 0; i < nsnp; ++i) { if ((b = B[state][i]) != state) { if (prev_pos < P[i+1]-1) printf("%s\t%d\t%d\t%d\t%s\n", - this_chr, prev_pos, P[i+1], state, A[a].name); - A[a].x[state] += (P[i+1]-prev_pos); + this_chr, prev_pos, P[i+1], + (state==4 && !pop3 ? 3 : state), A[a].name); + A[a].x[state] += (double)(P[i+1]-prev_pos); prev_pos = P[i+1]; state = b; } @@ -224,7 +304,7 @@ struct snp *new = ckalloc(sizeof(struct snp)); int i; - new->F1 = new->F2 = 0.0; + new->F1 = new->F2 = new->F3 = 0.0; new->pos = b; new->type = type; new->g = ckalloc(nG*sizeof(int)); @@ -243,9 +323,11 @@ */ void one_chr() { char *s, *z = "\t\n"; - int X[MOST], n, i, g, A1, B1, A2, B2, a, do_read, p, pos, het; + int X[MOST], n, i, g, A1, B1, A2, B2, A3, B3, a, do_read, p, pos, het; + // int j, k, tied; struct snp *new; - double F1, F2; + double F1, F2, F3; + // doublewinner, try; strcpy(this_chr, get_chr_name()); nsnp = 0; @@ -279,17 +361,11 @@ // should we discard this SNP? if (pos == -1) // SNP not mapped to the reference continue; -/* - for (i = 0; i < nG && X[A[i].gcol] >= 0; ++i) - ; - if (i < nG) // genotype of admixed individual not called - continue; -*/ // add SNP to a "backward pointing" linked list, recording the - // major allele frequencies in the two reference populations - // and genotypes in the potential admixed individuals - for (i = A1 = B1 = A2 = B2 = 0; i < nI; ++i) { + // major allele frequencies in the source populations and + // genotypes in the potential admixed individuals + for (i = A1 = B1 = A2 = B2 = A3 = B3 = 0; i < nI; ++i) { n = C[i].col; p = C[i].pop; if (genotypes) { @@ -304,6 +380,9 @@ } else if (p == 2) { A2 += g; B2 += (2 - g); + } else if (p == 3) { + A3 += g; + B3 += (2 - g); } } else { // use read counts if (p == 1) { @@ -322,43 +401,38 @@ new->pos = X[chr_col+1]; new->F1 = F1 = (double)A1/(double)(A1+B1); new->F2 = F2 = (double)A2/(double)(A2+B2); + new->F3 = F3 = (double)A3/(double)(A3+B3); new->type = 0; new->g = ckalloc(nG*sizeof(int)); - for (i = 0; i < nG; ++i) { + for (i = 0; i < nG; ++i) g = new->g[i] = X[A[i].gcol]; - if (score(F1, F2, g, 2) >= score(F1, F2, g, 0)) - A[i].ge20++; - else - A[i].gt02++; - } if (F1 < 0.0 || F1 > 1.0) fatalf("F1 = %f (A1 = %d, B1 = %d) at snp %d", F1, A1, B1, nsnp); if (F2 < 0.0 || F2 > 1.0) fatalf("F2 = %f (A2 = %d, B2 = %d) at snp %d", F2, A2, B2, nsnp); + if (F3 < 0.0 || F3 > 1.0) + fatalf("F3 = %f (A2 = %d, B2 = %d) at snp %d", + F3, A3, B3, nsnp); new->prev = last; last = new; } - // insert heterochomatin intervals that follow all SN + + // insert heterochomatin intervals that follow all SNPs while (het < nH && same_string(this_chr, H[het].chr)) { add_het(H[het].b, 1); add_het(H[het].e, 2); nsnp += 2; ++het; } -/* -printf("nsnp = %d\n", nsnp); -for (i = nsnp-1, new = last; i >= 0 && new != NULL; --i, new = new->prev) { -printf("%d %d ", new->pos, new->type); -printf("%g %g ", new->F1, new->F2); -for (a = 0; a < nG; ++a) -printf("%d", new->g[a]); -putchar('\n'); -} -//exit(0); -printf("\nbacktrace\n"); -*/ + // make sure the picture is drawn to at least the last SNP + if (last->type == 0) { + i = last->pos + 1; + add_het(i, 1); + add_het(i, 2); + nsnp += 2; + } // allocate arrays for the DP analysis P = ckalloc(nsnp*sizeof(int)); // position of each event @@ -366,7 +440,7 @@ --i, new = new->prev) P[i] = new->pos; - for (i = 0; i < 4; ++i) { // space for back-pointers + for (i = 0; i < 7; ++i) { // space for back-pointers B[i] = ckalloc((nsnp+1)*sizeof(int)); B[i][nsnp] = 0; } @@ -383,14 +457,14 @@ free(new); } free(P); - for (i = 0; i < 4; ++i) + for (i = 0; i < 7; ++i) free(B[i]); } int main(int argc, char **argv) { - int n, i, j, k, saw[3]; + int n, i, j, k, saw[4]; long long het_len, ref_len; - float N; + double N, tot[7], keep[7], xx, yy; char nam[100], *chr; if (argc < 9) @@ -406,8 +480,6 @@ genotypes = atoi(argv[4]); logs = atoi(argv[5]); - if (logs) - fatal("logarithms of probabilities -- under development"); //if (logs) switch_penalty = log(switch_penalty); switch_penalty = atof(argv[6]); @@ -423,12 +495,12 @@ fatal("Too many heterochromatic intervals"); if (sscanf(buf, "%s %d %d", nam, &i, &j) != 3) fatalf("gagging: %s", buf); + if (nH > 0 && !same_string(nam, H[nH-1].chr)) + ref_len += H[nH-1].e; H[nH].chr = copy_string(nam); H[nH].b = i; H[nH].e = j; // assumes last event per chrom. is a het. interval - if (nH > 0 && !same_string(nam, H[nH-1].chr)) - ref_len += j; het_len += (j - i); ++nH; } @@ -437,12 +509,12 @@ ref_len += H[nH-1].e; // populations must be disjoint - saw[1] = saw[2] = 0; + saw[0] = saw[1] = saw[2] = saw[3] = 0; for (i = 9; i < argc; ++i) { if (sscanf(argv[i], "%d:%d:%s", &j, &k, nam) != 3) fatalf("not like 13:2:fred : %s", argv[i]); - if (k < 0 || k > 2) - fatalf("not population 0, 1 or 2: %s", argv[i]); + if (k < 0 || k > 3) + fatalf("not population 0, 1, 2 or 3: %s", argv[i]); saw[k] = 1; // seen this individual (i.e., column) before?? @@ -455,7 +527,7 @@ fatal("Too many admixed individuals"); A[nG].name = copy_string(nam); A[nG++].gcol = j+2; - } else { // in an ancestral population + } else { // in a source population if (nI >= MOST) fatal("Too many ancestral individuals"); C[nI].col = j; @@ -469,15 +541,27 @@ fatal("first reference population is empty"); if (saw[2] == 0) fatal("second reference population is empty"); + pop3 = saw[3]; // start the output file of text - for (k = 1; k <= 2; ++k) { - fprintf(out, "state %d agrees with:", k == 1 ? 2 : 0); + for (k = 1; k <= 3; ++k) { + if (k == 3 && !pop3) + break; + fprintf(out, "source population %d (state %d):", k, k); for (i = 0; i < nI; ++i) if (C[i].pop == k) fprintf(out, " %s", C[i].name); - putc('\n', out); + fprintf(out, "\n\n"); } + if (pop3) { + fprintf(out, "state 4 is heterozygous for populations 1 and 2\n"); + fprintf(out, + "state 5 is heterozygous for populations 1 and 3\n"); + fprintf(out, + "state 6 is heterozygous for populations 2 and 3\n"); + } else + fprintf(out, "state 3 is heterozygous for populations 1 and 2\n"); + fprintf(out, "\nswitch penalty = %2.2f\n", switch_penalty); putc('\n', out); fp = ckopen(argv[1], "r"); @@ -493,26 +577,71 @@ if (status != NULL) one_chr(); } - for (i = 0; i < nG; ++i) { - fprintf(out, - "%s: %d SNPs where state 2 is at least as likely as state 0\n", - A[i].name, A[i].ge20); - fprintf(out, - "%s: %d SNPs where state 0 is more likely than state 2\n\n", - A[i].name, A[i].gt02); - } - // write fractions in each state to the output text file if (ref_len) fprintf(out, "%lld of %lld reference bp (%1.1f%%) are heterochromatin\n\n", het_len, ref_len, 100.0*(float)het_len/(float)ref_len); + // write fractions in each state to the output text file + for (j = 0; j < 7; ++j) + tot[j] = 0.0; + fprintf(out, "individual:"); + fprintf(out, "\tstate 1\tstate 2\tstate 3"); + if (pop3) + fprintf(out, "\tstate 4\tstate 5\tstate 6"); + fprintf(out, "\t pop 1\t pop 2"); + if (pop3) + fprintf(out, "\t pop 3"); + putc('\n', out); for (i = 0; i < nG; ++i) { - N = (float)(A[i].x[0] + A[i].x[1] + A[i].x[2])/100.0; - fprintf(out, "%s: 0 = %1.1f%%, 1 = %1.1f%%, 2 = %1.1f%%\n", - A[i].name, (float)A[i].x[0]/N, (float)A[i].x[1]/N, - (float)A[i].x[2]/N); + N = A[i].x[1] + A[i].x[2] + A[i].x[4]; + if (pop3) + N += A[i].x[3] + A[i].x[5] + A[i].x[6]; + N /= 100.0; + fprintf(out, "%s:", A[i].name); + if (strlen(A[i].name) < 7) + putc('\t', out); + for (j = 1; j < 7; ++j) + if (pop3 || j == 1 || j == 2 || j == 4) { + tot[j] += (keep[j] = A[i].x[j]); + fprintf(out, "\t %5.1f%%", keep[j]/N); + } + keep[1] += 0.5*keep[4]; + keep[2] += 0.5*keep[4]; + if (pop3) { + keep[1] += 0.5*keep[5]; + keep[2] += 0.5*keep[6]; + keep[3] += 0.5*(keep[5]+keep[6]); + } + + fprintf(out, "\t %5.1f%%\t %5.1f%%", keep[1]/N, keep[2]/N); + if (pop3) + fprintf(out, "\t %5.1f%%", keep[3]/N); + + putc('\n', out); + } + if (nG > 1) { + fprintf(out, "\naverage: "); + N = tot[1] + tot[2] + tot[4]; + if (pop3) + N += (tot[3] + tot[5] + tot[6]); + N /= 100.0; + for (j = 1; j < 7; ++j) { + if (pop3 || j == 1 || j == 2 || j == 4) + fprintf(out, "\t %5.1f%%", tot[j]/N); + } + xx = tot[1] + 0.5*tot[4]; + yy = tot[2] + 0.5*tot[4]; + if (pop3) { + xx += 0.5*tot[5]; + yy += 0.5*tot[6]; + } + fprintf(out, "\t %5.1f%%\t %5.1f%%", xx/N, yy/N); + if (pop3) + fprintf(out, "\t %5.1f%%", + (tot[3] + 0.5*tot[5] + 0.5*tot[6])/N); + putc('\n', out); } return 0; diff -r 91e835060ad2 -r 8997f2ca8c7a genome_diversity/src/filter_snps.c --- a/genome_diversity/src/filter_snps.c Mon Jun 03 12:29:29 2013 -0400 +++ b/genome_diversity/src/filter_snps.c Mon Jul 15 10:47:35 2013 -0400 @@ -1,15 +1,26 @@ -/* pop -- add four columns (allele counts, genotype, maximum quality) for a -* specified population to a Galaxy SNP table, or enforce bounds +/* filter_snps -- enforce constraints on a file of SNPs. * * argv[1] = file containing a Galaxy table -* argv[2] = lower bound on total coverage (> 0 means interpret as percentage) -* argv[3] = upper bound on total coverae (> 0 means interpret as percentage) -* argv[4] = lower bound on individual coverage -* argv[5] = lower bound on individual quality value -* argv[6] ... are the starting columns (base-1) for the chosen individuals +* argv[2] = 1 for a gd_snp file, 0 for gd_genotype +* argv[3] = lower bound on total coverage (< 0 means interpret as percentage) +* argv[4] = upper bound on total coveraae (< 0 means interpret as percentage; +* 0 means no bound) +* argv[5] = lower bound on individual coverage +* argv[6] = lower bound on individual quality value +* argv[7] = lower bound on the number of defined genotypes +* argv[8] = lower bound on the spacing between SNPs +* argv[9] = reference-chromosome column (base-1); ref position in next column +* If argv[8] == 0, then argv[7] must be 0. +* argv[10] ... are the starting columns (base-1) for the chosen individuals. +* If argc == 10, then only the lower bound on spacing between SNPs is +* enforced. What it does on Galaxy -The user specifies that some of the individuals in a gd_snp dataset form a "population", by supplying a list that has been previously created using the Specify Individuals tool. SNPs are then discarded if their total coverage for the population is too low or too high, or if their coverage or quality score for any individual in the population is too low. +For a gd_snp dataset, the user specifies that some of the individuals form a "population", by supplying a list that has been previously created using the Specify Individuals tool. SNPs are then discarded if their total coverage for the population is too low or too high, or if their coverage or quality score for any individual in the population is too low. + +The upper and lower bounds on total population coverage can be specified either as read counts or as percentiles (e.g. "5%", with no decimal places). For percentile bounds the SNPs are ranked by read count, so for example, a lower bound of "10%" means that the least-covered 10% of the SNPs will be discarded, while an upper bound of, say, "80%" will discard all SNPs above the 80% mark, i.e. the top 20%. The threshold for the lower bound on individual coverage can only be specified as a plain read count. + +For either a gd_snp or gd_genotype dataset, the user can specify a minimum number of defined genotypes (i.e., not -1) and/or a minimum spacing relative to the reference sequence. An error is reported if the user requests a minimum spacing but no reference sequence is available. */ @@ -18,11 +29,14 @@ // most characters allowed in a row of the table #define MOST 50000 -char buf[MOST]; +// larger than any possible coverage +#define INFINITE_COVERAGE 100000000 + +char buf[MOST], chr[100], old_chr[100]; // column for the relevant individuals/groups int col[MOST], *T; -int nI, lo, hi, X[MOST]; +int nI, lo, hi, min_space, min_geno, chr_col, pos, old_pos, gd_snp, X[MOST]; void get_X() { char *p, *z = "\t\n", trash[MOST]; @@ -31,8 +45,13 @@ strcpy(trash, buf); // set X[i] = atoi(i-th word of s), i is base 0 for (i = 1, p = strtok(trash, z); p != NULL; - ++i, p = strtok(NULL, z)) + ++i, p = strtok(NULL, z)) { + if (chr_col && i == chr_col) + strcpy(chr, p); X[i] = atoi(p); + if (chr_col && i == chr_col+1) + pos = X[i]; + } } int compar(int *a, int *b) { @@ -75,57 +94,71 @@ fclose(fp); } +void OK() { + if (chr_col == 0 || !same_string(chr, old_chr) || + pos >= old_pos + min_space) { + printf("%s", buf); + if (chr_col) { + strcpy(old_chr, chr); + old_pos = pos; + } + } +} int main(int argc, char **argv) { FILE *fp; - char *p; - int m, i, A, B, G, Q, indiv, qual, g, q; + int m, i, cov, tot_cov, indiv, qual, ngeno; - if (argc < 3) - fatalf("args: SNP-table low high col1 col2 ..."); + if (argc < 10) + fatalf("args: SNP-table gd_snp low-tot high-tot low-cov low-qual low-genotype low-space chr-col col1 col2 ..."); - for (i = 6, nI = 0; i < argc; ++i, ++nI) + for (i = 10, nI = 0; i < argc; ++i, ++nI) col[nI] = atoi(argv[i]); - lo = atoi(argv[2]); - hi = atoi(argv[3]); + gd_snp = atoi(argv[2]); + lo = atoi(argv[3]); + hi = atoi(argv[4]); if (hi == 0) - hi = 100000000; + hi = INFINITE_COVERAGE; if (lo < 0 || hi < 0) find_lo(argv[1]); - indiv = atoi(argv[4]); - qual = atoi(argv[5]); + indiv = atoi(argv[5]); + qual = atoi(argv[6]); + min_geno = atoi(argv[7]); + min_space = atoi(argv[8]); + chr_col = atoi(argv[9]); + + // reality checks + if (!gd_snp && + (lo != 0 || hi != INFINITE_COVERAGE || indiv != 0 || qual != 0)) + fatal("cannot bound coverage or quality in gd_genotype file"); + if (chr_col == 0 && min_space != 0) + fatalf("Specification of minimum spacing requires a reference sequence"); if (indiv < 0 || qual < 0) fatalf("percentiles not implemented for individuals"); - + + // scan the SNPs fp = ckopen(argv[1], "r"); while (fgets(buf, MOST, fp)) { if (buf[0] == '#') continue; get_X(); - for (i = A = B = Q = 0, G = -1; i < nI; ++i) { + for (i = tot_cov = ngeno = 0; i < nI; ++i) { m = col[i]; - if (X[m]+X[m+1] < indiv || (q = X[m+3]) < qual) - break; - A += X[m]; - B += X[m+1]; - g = X[m+2]; - if (g != -1) { - if (G == -1) // first time - G = g; - else if (G != g) - G = 1; + if (gd_snp) { + cov = (X[m]+X[m+1]); + if (cov < indiv || X[m+3] < qual) + break; + tot_cov += cov; } - Q = MAX(Q, q); + if (X[m+2] != -1) + ++ngeno; } if (i < nI) // check bounds on the population's individuals continue; - if (lo == -1 && hi == -1 && indiv == -1 && qual == -1) { - // add columns - if ((p = strchr(buf, '\n')) != NULL) - *p = '\0'; - printf("%s\t%d\t%d\t%d\t%d\n", buf, A, B, G, Q); - } else if (A+B >= lo && (hi == -1 || A+B <= hi)) - // coverage meets the population-level restrictions - printf("%s", buf); + if (gd_snp && (tot_cov < lo || tot_cov > hi)) + continue; + if (ngeno >= min_geno) + // OK, except possibly for lower bound on spacing + OK(); } return 0; diff -r 91e835060ad2 -r 8997f2ca8c7a make_gd_file.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/make_gd_file.py Mon Jul 15 10:47:35 2013 -0400 @@ -0,0 +1,187 @@ +#!/usr/bin/env python + +import base64 +import json +import math +import re +import sys + +identifier_regex = re.compile('[0-9A-Z_a-z]+$') + +def unwrap_column_names(string): + column_names = [] + string = unwrap_string(string) + for line in string.split('\n'): + line = line.strip() + if is_identifier(line): + column_names.append(line) + else: + die('invalid column name format: {}'.format(line)) + return column_names + +def unwrap_string(string): + try: + decoded = base64.b64decode(string) + except: + die('invalid base64 string: {}'.format(string)) + return decoded + +def is_identifier(string): + match = identifier_regex.match(string) + if match: + return True + else: + return False + +def read_individual_names(filename): + tokens = [] + names = [] + with open(filename) as fh: + for line in fh: + line = line.rstrip('\r\n') + elems = line.split() + + columns = len(elems) + if columns == 0: + continue + + first_token = elems[0] + + if columns == 1: + name = first_token + else: + keywords = ' '.join(elems[1:]) + name = ' '.join([first_token, keywords]) + + if first_token not in tokens: + tokens.append(first_token) + names.append(name) + else: + die('duplicate first column entry in Names dataset: {}'.format(first_token)) + return names + +def fold_line(line, maxlen, prefix): + prefix_len = len(prefix) + + lines = [] + + while len(line) > maxlen: + split_points = [] + state = 0 + for i in range(maxlen - prefix_len): + c = line[i] + if state == 0: + if c == '"': + state = 1 + elif c in [ '{', ':', ',', '}', '[', ']' ]: + split_points.append(i) + elif state == 1: + if c == '"': + state = 0 + elif c == '\\': + state = 2 + elif state == 2: + state = 1 + idx = split_points[-1] + lines.append('{0}{1}'.format(prefix, line[:idx+1])) + line = line[idx+1:] + + lines.append('{0}{1}'.format(prefix, line)) + + return lines + +def die(message): + print >> sys.stderr, message + sys.exit(1) + +################################################################################ + +type_to_columns = { + 'gd_snp':4, + 'gd_genotype':1 +} + +if len(sys.argv) != 12: + print >> sys.stderr, 'Usage' + sys.exit(1) + +input, scaffold_col, pos_col, ref_col, rPos_col, preamble_arg, names, species_arg, dbkey, output_type, output = sys.argv[1:12] + +preamble_column_names = unwrap_column_names(preamble_arg) +first_individual_column = len(preamble_column_names) + 1 + +individual_names = read_individual_names(names) + +species = unwrap_string(species_arg) +if not is_identifier(species): + die('invalid species format: {}'.format(species)) + +if not output_type in type_to_columns: + die('unknown output type: {}'.format(output_type)) +columns_per_individual = type_to_columns[output_type] + +jdict = {} + +column_names = preamble_column_names[:] +for i in range(1, len(individual_names) + 1): + if output_type == 'gd_snp': + column_names.append('{}A'.format(i)) + column_names.append('{}B'.format(i)) + column_names.append('{}G'.format(i)) + column_names.append('{}Q'.format(i)) + elif output_type == 'gd_genotype': + column_names.append('{}G'.format(i)) + else: + die('unknown output type: {}'.format(output_type)) + +jdict['column_names'] = column_names + +individuals = [] + +for pos, individual in enumerate(individual_names): + col = first_individual_column + pos * columns_per_individual + individuals.append([individual, col]) + +jdict['individuals'] = individuals + +jdict['scaffold'] = int(scaffold_col) +jdict['pos'] = int(pos_col) +jdict['ref'] = int(ref_col) +jdict['rPos'] = int(rPos_col) + +jdict['species'] = species +jdict['dbkey'] = dbkey + +json_string = json.dumps(jdict, separators=(',',':'), sort_keys=True) + +min_cols = len(column_names) +pos_col = int(pos_col) - 1 +rPos_col = int(rPos_col) - 1 + +def is_int(string): + try: + int(string) + return True + except ValueError: + return False + +with open(output, 'w') as ofh: + lines = fold_line(json_string, 200, '#') + for line in lines: + print >> ofh, line + + with open(input) as fh: + line_number = 0 + for line in fh: + line_number += 1 + if line[0] == '#': + continue + line = line.rstrip('\r\n') + elems = line.split('\t') + if len(elems) < min_cols: + die('Too few columns on line {0} of input file. Expecting {1}, saw {2}.'.format(line_number, min_cols, len(elems))) + if not is_int(elems[pos_col]): + die('bad pos on line {0} column {1} of input file: {2}'.format(line_number, pos_col+1, elems[pos_col])) + if not is_int(elems[rPos_col]): + die('bad rPos on line {0} column {1} of input file: {2}'.format(line_number, rPos_col+1, elems[rPos_col])) + print >> ofh, line diff -r 91e835060ad2 -r 8997f2ca8c7a make_gd_file.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/make_gd_file.xml Mon Jul 15 10:47:35 2013 -0400 @@ -0,0 +1,101 @@ + + : Build a gd_snp or gd_genotype file + + + #import base64 + #set $preamble_arg = base64.b64encode(str($preamble_names)) + #set $species_arg = base64.b64encode(str($species)) + make_gd_file.py '$input' '$scaffold_col' '$pos_col' '$ref_col' '$rPos_col' '$preamble_arg' '$names' '$species_arg' '$dbkey' '$output_type' '$output' + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**Dataset formats** + +The input datasets are in tabular_ and text_ formats. +The output dataset is in gd_snp_ or gd_genotype_ format. (`Dataset missing?`_) + +.. _tabular: ./static/formatHelp.html#tab +.. _text: ./static/formatHelp.html#text +.. _gd_snp: ./static/formatHelp.html#gd_snp +.. _gd_genotype: ./static/formatHelp.html#gd_genotype +.. _Dataset missing?: ./static/formatHelp.html + +----- + +**What it does** + +This tool simplifies the job of creating a Galaxy file with format gd_snp +or gd_genotype. Often, the most complex part of preparing one of these +files is to specify how individuals are related to columns of the table, +a task facilitated by this command. Each gd_snp or gd_genotype file +typically has columnns giving: + +1. scaffold/contig name +2. zero-based position in the scaffold/contig +3. reference species chromosome +4. zero-based position in the reference species chromosome + +The user needs to specify the columns containing these data. Columns are +numbered starting with 1. The user also specifies brief column names for +these columns. When the focus species and the reference species are the +same, the scaffold/contig name and reference species chromosome columns +will be identical, as will the position in the scaffold/contig and +position in the reference species chromosome columns. + +To inform Galaxy about the correpondence between individuals and columns +of the table, the user directs the tool to a history item that lists +the individuals in order. Each line starts with unique name for the +individuals (no embedded space or tab character), followed by an arbitrary +(possibly empty) set of words that are helpful for specifying groups +of individuals. + + diff -r 91e835060ad2 -r 8997f2ca8c7a multiple_to_gd_genotype.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/multiple_to_gd_genotype.py Mon Jul 15 10:47:35 2013 -0400 @@ -0,0 +1,513 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# multiple_to_gd_genotype.py +# +# Copyright 2013 Oscar Reina +# +# This program is free software; you can redistribute it and/or modify +# it under the pathways of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, +# MA 02110-1301, USA. + +import argparse +import base64 +import json +import os +import sys + +def fold_line(line, maxlen=200, prefix="#"): + """ + format hader to a 200 char max. + """ + line_len = len(line) + prefix_len = len(prefix) + + if line_len + prefix_len <= maxlen: + return '%s%s' % (prefix, line) + + lines = [] + start_idx = 0 + offset = 0 + + while start_idx < line_len - 1: + last_idx = start_idx + idx = start_idx + start = start_idx + + while idx != -1 and idx < maxlen + prefix_len + offset - 1: + last_idx = idx + idx = line.find(',', start) + start = idx+1 + + if idx == -1: + lines.append('%s%s' % (prefix, line[start_idx:])) + break + + lines.append('%s%s' % (prefix, line[start_idx:last_idx+1])) + start_idx = last_idx + 1 + offset = last_idx + 1 + + return '\n'.join(lines) + + +def formthdr(lPops,dbkey,species): + """ + returns a formated metadata for a gd_genotype file from a paramSet + dictionary and a list (lPops) of individuals + """ + clmnsVals=', '.join(['"%sG"'%(x+1) for x in range(len(lPops))])#"'1G', '2G', ..." + indvdls='], ['.join(['"%s", %s'%(lPops[x],x+5) for x in range(len(lPops))])#['DU23M01 Duroc domestic breed Europe', 5], ['DU23M02 Duroc domestic breed Europe', 6], ... + obj='{"rPos": 2, "column_names": ["chr", "pos", "A", "B", %s], "scaffold": 1, "pos": 2, "dbkey": "%s", "individuals": [[%s]], "ref": 1, "species": "%s"}'%(clmnsVals,dbkey,indvdls,species) + json_value = json.loads(obj) + hdr = fold_line(json.dumps(json_value, separators=(',',':'), sort_keys=True)) + #~ + return hdr + + +def formthdr_gdsnp(lPops,dbkey,species): + """ + returns a formated metadata for a gd_genotype file from a paramSet + dictionary and a list (lPops) of individuals + """ + clmnsVals=', '.join(['"%sA","%sB","%sG","%sQ"'%((x+1),(x+1),(x+1),(x+1)) for x in range(len(lPops))])#"'1G', '2G', ..." + indvdls='], ['.join(['"%s", %s'%(lPops[x],(((x+1)*4)+2)) for x in range(len(lPops))])#['DU23M01 Duroc domestic breed Europe', 5], ['DU23M02 Duroc domestic breed Europe', 9], ... + obj='{"rPos": 2, "column_names": ["chr", "pos", "A", "B", "Q", %s], "scaffold": 1, "pos": 2, "dbkey": "%s", "individuals": [[%s]], "ref": 1, "species": "%s"}'%(clmnsVals,dbkey,indvdls,species) + json_value = json.loads(obj) + hdr = fold_line(json.dumps(json_value, separators=(',',':'), sort_keys=True)) + #~ + return hdr + + +def selAnc(SNPs): + """ + returns the ancestral and derived snps, and an gd_genotype-encoded + list of SNPs/nucleotides + """ + dIUPAC={'AC':'M','AG':'R','AT':'W','CG':'S','CT':'Y','GT':'K'}#'N':'N','A':'A','T':'T','C':'C','G':'G', + dNtsCnts={} + for eSNP in SNPs: + if eSNP!='N': + try: + dNtsCnts[eSNP]+=1 + except: + dNtsCnts[eSNP]=1 + #~ + nAlleles=len(dNtsCnts.keys()) + assert nAlleles<=3 + if nAlleles==3: + nonCons=[x for x in dNtsCnts.keys() if x in set(['A','T','C','G'])] + cons=[x for x in dNtsCnts.keys() if x not in nonCons] + assert len(nonCons)==2 and len(cons)==1 and dIUPAC[''.join(sorted(nonCons))]==''.join(cons) + ancd=nonCons[0] + dervd=nonCons[1] + if dNtsCnts[dervd]>dNtsCnts[ancd]: + ancd,dervd=dervd,ancd + elif nAlleles==2: + cons=set(dNtsCnts.keys()).intersection(set(dIUPAC.values())) + assert len(cons)<=1 + if len(cons)==0: + ancd,dervd=dNtsCnts.keys() + if dNtsCnts[dervd]>dNtsCnts[ancd]: + ancd,dervd=dervd,ancd + else: + dervd=''.join(cons) + ancd=''.join([x for x in dNtsCnts.keys() if x!=dervd]) + else:#<=1 + ancd=''.join(dNtsCnts.keys()) + dervd='N' + #~ + lSNPsEncoded=[] + for eSNP in SNPs: + if eSNP=='N': + lSNPsEncoded.append('-1') + elif eSNP==ancd: + lSNPsEncoded.append('2') + elif eSNP==dervd: + lSNPsEncoded.append('0') + else: + lSNPsEncoded.append('1') + #~ + return ancd,dervd,lSNPsEncoded + + + +def from_csv(in_csv,outgd_gentp,dbkey,species): + """ + returns a gd_genotype file format from csv file (saved in excel). + The input must consist of a set of rows with columns splited by a + comma. The first row must contain the names of the individuals. For + the other rows, the first of column must contain the chromosome and + position of the snp. The other columns must contain any kind of + fstat or genepop allele valid encoding, with at most 2 alleles. Also, + the user may input IUPAC valid nucleotide symbols. The program will + assume that the mosts common is the ancestral. + ------------- The file starts here ---------------- + ,Ind_1,Ind_2,Ind_3,Ind_4,Ind_5,Ind_6,Ind_7 + chr1 12334123,A,T,T,A,T,T,W + chr2 1232654,C,G,G,C,N,S,N + chr3 3356367,T,G,G,G,G,K,K + chr4 95673,A,C,C,A,C,M,M + chr5 45896,T,C,Y,Y,Y,C,T + ... + or + ... + chr6 2354,22,11,21,00,12,12,22 + ------------- The file ends here ------------------- + """ + infoLn=True + slf=open(outgd_gentp,'w') + for echl in open(in_csv,'r'): + if echl.strip(): + if infoLn: + lPops=[x for x in echl.splitlines()[0].split(',') if x.strip()] + hdr=formthdr(lPops,dbkey,species) + slf.write('%s\n'%hdr) + infoLn=False + else: + lsplitd=echl.splitlines()[0].split(',') + lchrpox,SNPs=lsplitd[0],[x for x in lsplitd[1:] if x.strip()] + lchrpox='\t'.join(lchrpox.strip().split()) + if SNPs[0].isdigit(): + lSNPsEncoded=[] + for snp in SNPs: + cnt=0 + for ep in snp: + ep=int(ep) + assert ep<=2 + cnt+=ep + cnt=4-cnt + if cnt==4: + frmtdSNP='-1' + else: + frmtdSNP=str(cnt) + lSNPsEncoded.append(frmtdSNP) + ancd,dervd='N','N' + else: + ancd,dervd,lSNPsEncoded=selAnc(SNPs) + outfrmtdDat='%s\t%s\t%s\t%s'%(lchrpox,ancd,dervd,'\t'.join(lSNPsEncoded)) + slf.write('%s\n'%outfrmtdDat) + #~ + slf.close() + return 0 + + +def from_fstat(in_fstat,outgd_gentp,dbkey,species): + """ + returns a gd_genotype file format from fstat file. Ignores pops + structures and alleles other than the combinations of the alleles + encoded by 0, 1, and 2 up to 9 digits. The first line contains 4 + numbers separated by any number of spaces: number of samples, np, + number of loci, nl, highest number used to label an allele, nu + (?=2), and 1 if the code for alleles is a one digit number (1-2), a + 2 if code for alleles is a 2 digit number (01-02) or a 3 if code for + alleles is a 3 digit number (001-002). Followed by nl lines: each + containing the name of a locus, in the order they will appear in the + rest of the file On line nl+2, a series of numbers as follow: 1 0102 + 0101 0101 0201 0 0101 first number: identifies the sample to which + the individual belongs second: the genotype of the individual at the + first locus, coded with a 2 digits number for each allele third: the + genotype at the second locus, until locus nl is entered. Missing + genotypes are encoded with 0 (0001 or 0100 are not a valid format, + so both alleles at a locus have to be known, otherwise, the genotype + is considered as missing) no empty lines are needed between samples + number of spaces between genotypes can be anything. Numbering of + samples need not be sequential the number of samples np needs to be + the same as the largest sample identifier. Samples need not to be + ordered nu needs to be equal to the largest code given to an allele + (even if there are less than nu alleles). Ancestral is taken as 01, + derived 02. In all cases ancestral and derived SNPs are returned as + N. + ------------- The file starts here ---------------- + 7 5 2 1 + chr1 12334123 + chr2 1232654 + chr3 3356367 + chr4 95673 + chr5 45896 + Ind_1 22 22 21 11 22 + Ind_2 22 22 11 12 22 + Ind_3 22 11 22 21 22 + Ind_4 22 21 22 21 22 + Ind_5 22 22 22 21 22 + Ind_6 22 22 22 22 22 + Ind_7 22 22 21 21 22 + ------------- The file ends here ------------------- + """ + dChrPos,lPops,lChrPos,addPop={},[],[],False + clines=-1 + for echl in open(in_fstat,'r'): + clines+=1 + if echl.strip(): + if clines==0: + nSmpls,nLoci,nUsed,nDigs=[x for x in echl.splitlines()[0].split() if x.strip()] + nLoci=int(nLoci) + elif clines<=nLoci: + addPop=True + lchrpox='\t'.join(echl.strip().split()) + lChrPos.append(lchrpox) + elif addPop: + lsplitd=echl.splitlines()[0].split() + pop,SNPs=lsplitd[0],[x for x in lsplitd[1:] if x.strip()] + pop=pop.strip() + for x in range(nLoci): + snp=SNPs[x] + cnt=0 + for ep in snp: + ep=int(ep) + assert ep<=2 + cnt+=ep + cnt=4-cnt + if cnt==4: + frmtdSNP='-1' + else: + frmtdSNP=str(cnt) + try: + dChrPos[lChrPos[x]].append(frmtdSNP) + except: + dChrPos[lChrPos[x]]=[frmtdSNP] + #~ + lPops.append(pop) + #~ + hdr=formthdr(lPops,dbkey,species) + outfrmtdDat=['%s\t%s\t%s\t%s'%(x,'N','N','\t'.join(dChrPos[x])) for x in lChrPos] + #~ + slf=open(outgd_gentp,'w') + slf.write('\n'.join([hdr,'\n'.join(outfrmtdDat)])) + slf.close() + return 0 + + +def from_genepop(in_genepop,outgd_gentp,dbkey,species): + """ + returns a gd_genotype file format from genepop file . Ignores pops + structures and alleles other than the combinations of the alleles + encoded by 00, 01, and 02. The second line must contain the chromosome + and position of the SNPs separated by an space or a tab. Each loci + should be separated by a comma. Alternatively, they may be given one + per line. They may be given one per line, or on the same line but + separated by commas. The name of individuals are defined as + everything on the left of a comma, and their genotypes following the + same order of the SNPs in the second line. Ancestral is taken as 01, + derived 02. In all cases ancestral and derived SNPs are returned as N + ------------- The file starts here ---------------- + Microsat on Chiracus radioactivus, a pest species + chr1 23123, chr2 90394, chr3 90909, chr3 910909, chr4 10909 + POP + AA2, 0201 0111 0102 0000 0101 + AA1, 0201 0201 0202 0000 0101 + A10, 0201 0201 0101 0000 0101 + A11, 0201 0202 0102 0000 0102 + A12, 0202 0201 0101 0000 0101 + A11, 0101 0101 0101 0000 0101 + A12, 0202 0201 0201 0000 0101 + A11, 0201 0201 0101 0000 0101 + Pop + AF1, 0000 0000 0000 0000 0101 + AF2, 0201 0101 0102 0000 0101 + AF3, 0202 0201 0202 0000 0101 + AF4, 0201 0101 0000 0000 0101 + AF5, 0201 0101 0202 0000 0101 + AF6, 0101 0101 0102 0000 0101 + AF7, 0201 0100 0000 0000 0101 + AF8, 0101 0100 0000 0000 0201 + AF9, 0201 0200 0000 0000 0101 + AF10, 0101 0202 0202 0000 0101 + pop + C211, 0101 0202 0202 0000 0101 + C211, 0101 0101 0202 0000 0101 + C21E, 0101 0102 0202 0000 0101 + C21B, 0101 0101 0102 0000 0201 + C21C, 0201 0101 0202 0000 0101 + C21D, 0201 0101 0202 0000 0201 + ------------- The file ends here ------------------- + """ + dChrPos,lPops,lChrPos,addPop,addDat={},[],[],False,True + clines=-1 + for echl in open(in_genepop,'r'): + clines+=1 + if echl.strip(): + if echl.strip() in set(['pop','POP','Pop']): + addDat,addPop=False,True + elif addDat and clines>0: + lchrpox=['\t'.join(x.split()) for x in echl.split(',') if x.strip()] + lChrPos.extend(lchrpox) + elif addPop: + pop,SNPs=echl.splitlines()[0].split(',') + pop=pop.strip() + SNPs=[x for x in SNPs.split() if x.strip()] + for x in range(len(SNPs)): + snp=SNPs[x] + cnt=0 + for ep in snp: + ep=int(ep) + assert ep<=2 + cnt+=ep + cnt=4-cnt + if cnt==4: + frmtdSNP='-1' + else: + frmtdSNP=str(cnt) + try: + dChrPos[lChrPos[x]].append(frmtdSNP) + except: + dChrPos[lChrPos[x]]=[frmtdSNP] + #~ + lPops.append(pop) + #~ + hdr=formthdr(lPops,dbkey,species) + outfrmtdDat=['%s\t%s\t%s\t%s'%(x,'N','N','\t'.join(dChrPos[x])) for x in lChrPos] + #~ + slf=open(outgd_gentp,'w') + slf.write('\n'.join([hdr,'\n'.join(outfrmtdDat)])) + slf.close() + return 0 + + +def from_vcf(inVCF,outgd_gentp,dbkey,species): + """ + returns a gd_genotype file format from vcf a file + """ + slf=open(outgd_gentp,'w') + paramSet,addPop,adinfo=False,False,False + lPops=[] + for echl in open(inVCF,'r'): + if echl.strip(): + if not paramSet: + if echl.find('##')==0: + pass + elif echl.find('#')==0: + paramSet={} + all_params=[x for x in echl.split() if x.strip()] + clmn=-1 + for eparam in all_params: + clmn+=1 + if eparam=='#CHROM': + paramSet['chr']=clmn + elif eparam=='POS': + paramSet['pos']=clmn + elif eparam=='REF': + paramSet['A']=clmn + elif eparam=='ALT': + paramSet['B']=clmn + elif eparam=='QUAL': + paramSet['qual']=clmn + elif eparam=='FORMAT': + paramSet['frmt']=clmn + addPop=True + elif addPop: + lPops.append(eparam) + paramSet[eparam]=clmn + if paramSet: + hdr=formthdr(lPops,dbkey,species) + slf.write('%s\n'%hdr) + else: + all_vals=[x for x in echl.split() if x.strip()] + frmt=[x for x in all_vals[paramSet['frmt']].split(':') if x.strip()] + clmn=-1 + gtclmn,adclmn,qulclmn=False,False,False + for p in frmt: + clmn+=1 + if p=='GT': + gtclmn=clmn + elif p=='AD': + adclmn=clmn + adinfo=True + elif p=='GQ': + qulclmn=clmn + #~ + if adinfo: + outptInfo=[all_vals[paramSet['chr']],all_vals[paramSet['pos']],all_vals[paramSet['A']],all_vals[paramSet['B']],all_vals[paramSet['qual']]] + for ePop in lPops: + gntyp=all_vals[paramSet[ePop]].replace('|','/').split(':')[gtclmn].split('/') + encdGntyp,adA,adB,qual='-1','0','0','-1' + #~ + if set(gntyp)!=set(['.']): + encdGntyp=2-sum([int(x) for x in gntyp]) + if adclmn: + try: + adA,adB=all_vals[paramSet[ePop]].split(':')[adclmn].split(',') + except: + pass + if qulclmn: + try: + qual=all_vals[paramSet[ePop]].split(':')[qulclmn] + except: + pass + outptInfo.extend([adA,adB,str(encdGntyp),qual]) + slf.write('%s\n'%'\t'.join(outptInfo)) + #~ + else: + outptInfo=[all_vals[paramSet['chr']],all_vals[paramSet['pos']],all_vals[paramSet['A']],all_vals[paramSet['B']]] + for ePop in lPops: + gntyp=all_vals[paramSet[ePop]].replace('|','/').split(':')[gtclmn].split('/') + try: + encdGntyp=2-sum([int(x) for x in gntyp]) + except: + encdGntyp=-1 + outptInfo.append(str(encdGntyp)) + #~ + slf.write('%s\n'%'\t'.join(outptInfo)) + slf.close() + #~ + if adinfo: + hdr=formthdr_gdsnp(lPops,dbkey,species) + slf=open('%stmp'%outgd_gentp,'w') + slf.write('%s\n'%hdr) + appnd=False + for echl in open(outgd_gentp,'r'): + if appnd: + slf.write(echl) + else: + if echl[0]!='#': + appnd=True + slf.close() + #~ + os.system('mv %stmp %s'%(outgd_gentp,outgd_gentp)) + #~ + return 0 + + +def main(): + #~ + parser = argparse.ArgumentParser(description='Returns the count of genes in KEGG categories and their statistical overrrepresentation, from a list of genes and an background file (i.e. plane text with ENSEMBLT and KEGG pathways).') + parser.add_argument('--input',metavar='input TXT file',type=str,help='the input file with the table in VCF format.',required=True) + parser.add_argument('--output',metavar='output TXT file',type=str,help='the output file with the table in gd_genotype format.',required=True) + parser.add_argument('--dbkey',metavar='string',type=str,help='the input reference species dbkey (i.e. susScr3).',required=True) + parser.add_argument('--species',metavar='string',type=str,help='the input reference species name (i.e. int).',required=True) + parser.add_argument('--format',metavar='string',type=str,help='format of the input file (i.e. vcf, genepop, fstat, csv).',required=True) + + args = parser.parse_args() + + infile = args.input + outgd_gentp = args.output + dbkey = args.dbkey + species = base64.b64decode(args.species) + frmat = args.format + + #~ + if frmat=='vcf': + from_vcf(infile,outgd_gentp,dbkey,species) + elif frmat=='genepop': + from_genepop(infile,outgd_gentp,dbkey,species) + elif frmat=='fstat': + from_fstat(infile,outgd_gentp,dbkey,species) + elif frmat=='csv': + from_csv(infile,outgd_gentp,dbkey,species) + + #~ + return 0 + +if __name__ == '__main__': + main() + diff -r 91e835060ad2 -r 8997f2ca8c7a multiple_to_gd_genotype.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/multiple_to_gd_genotype.xml Mon Jul 15 10:47:35 2013 -0400 @@ -0,0 +1,60 @@ + + : to gd_genotype + + + #import base64 + #set species_arg = base64.b64encode(str($species)) + multiple_to_gd_genotype.py --input '$input' --output '$output' --dbkey '$dbkey' --species '$species_arg' --format '$input_format' + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**Dataset formats** + +----- + +**What it does** + +----- + +**Example** + + + diff -r 91e835060ad2 -r 8997f2ca8c7a nucleotide_diversity_pi.py --- a/nucleotide_diversity_pi.py Mon Jun 03 12:29:29 2013 -0400 +++ b/nucleotide_diversity_pi.py Mon Jul 15 10:47:35 2013 -0400 @@ -1,49 +1,23 @@ #!/usr/bin/env python +import gd_util import sys -import subprocess from Population import Population ################################################################################ -def run_program(prog, args, stdout_file=None, space_to_tab=False): - #print "args: ", ' '.join(args) - p = subprocess.Popen(args, bufsize=-1, executable=prog, stdin=None, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - (stdoutdata, stderrdata) = p.communicate() - rc = p.returncode +if len(sys.argv) != 7: + gd_util.die('Usage') - if stdout_file is not None: - with open(stdout_file, 'w') as ofh: - lines = stdoutdata.split('\n') - for line in lines: - line = line.strip() - if line: - if space_to_tab: - line = line.replace(' ', '\t') - print >> ofh, line - - if rc != 0: - print >> sys.stderr, "FAILED: rc={0}: {1}".format(rc, ' '.join(args)) - print >> sys.stderr, stderrdata - sys.exit(1) - -################################################################################ - -if len(sys.argv) < 8: - print >> sys.stderr, "Usage" - sys.exit(1) - -gd_saps_file, gd_snps_file, covered_intervals_file, gd_indivs_file, output_file = sys.argv[1:6] -individual_metadata = sys.argv[6:] +gd_saps_file, gd_snps_file, covered_intervals_file, gd_indivs_file, output_file, ind_arg = sys.argv[1:] p_total = Population() -p_total.from_tag_list(individual_metadata) +p_total.from_wrapped_dict(ind_arg) p1 = Population() p1.from_population_file(gd_indivs_file) if not p_total.is_superset(p1): - print >> sys.stderr, 'There is an individual in the population individuals that is not in the SNP table' - sys.exit(1) + gd_util.die('There is an individual in the population individuals that is not in the SNP table') ################################################################################ @@ -56,9 +30,10 @@ columns = p1.column_list() for column in columns: - args.append('{0}'.format(column)) + args.append(column) -run_program(None, args, stdout_file=output_file) +with open(output_file, 'w') as fh: + gd_util.run_program(prog, args, stdout=fh) sys.exit(0) diff -r 91e835060ad2 -r 8997f2ca8c7a nucleotide_diversity_pi.xml --- a/nucleotide_diversity_pi.xml Mon Jun 03 12:29:29 2013 -0400 +++ b/nucleotide_diversity_pi.xml Mon Jul 15 10:47:35 2013 -0400 @@ -2,11 +2,16 @@ : &pi; and &theta; - nucleotide_diversity_pi.py "$saps" "$snps" "$intervals" "$indivs" "$output" - #for $individual_name, $individual_col in zip($snps.dataset.metadata.individual_names, $snps.dataset.metadata.individual_columns) - #set $arg = '%s:%s' % ($individual_col, $individual_name) - "$arg" - #end for + #import json + #import base64 + #import zlib + #set $ind_names = $snps.dataset.metadata.individual_names + #set $ind_colms = $snps.dataset.metadata.individual_columns + #set $ind_dict = dict(zip($ind_names, $ind_colms)) + #set $ind_json = json.dumps($ind_dict, separators=(',',':')) + #set $ind_comp = zlib.compress($ind_json, 9) + #set $ind_arg = base64.b64encode($ind_comp) + nucleotide_diversity_pi.py '$saps' '$snps' '$intervals' '$indivs' '$output' '$ind_arg' diff -r 91e835060ad2 -r 8997f2ca8c7a pca.py --- a/pca.py Mon Jun 03 12:29:29 2013 -0400 +++ b/pca.py Mon Jul 15 10:47:35 2013 -0400 @@ -1,39 +1,12 @@ #!/usr/bin/env python -import errno +import gd_util import os +import re import shutil -import subprocess import sys from BeautifulSoup import BeautifulSoup import gd_composite -import re - -################################################################################ - -def mkdir_p(path): - try: - os.makedirs(path) - except OSError, e: - if e.errno <> errno.EEXIST: - raise - -################################################################################ - -def run_program(prog, args, stdout_file=None): - #print "args: ", ' '.join(args) - p = subprocess.Popen(args, bufsize=-1, executable=prog, stdin=None, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - (stdoutdata, stderrdata) = p.communicate() - rc = p.returncode - - if stdout_file is not None: - with open(stdout_file, 'w') as ofh: - print >> ofh, stdoutdata - - if rc != 0: - print >> sys.stderr, "FAILED: rc={0}: {1}".format(rc, ' '.join(args)) - print >> sys.stderr, stderrdata - sys.exit(1) ################################################################################ @@ -106,15 +79,7 @@ args.append('-p') args.append(par_file) - #print "args: ", ' '.join(args) - p = subprocess.Popen(args, bufsize=-1, stdin=None, stdout=subprocess.PIPE, stderr=sys.stderr) - (stdoutdata, stderrdata) = p.communicate() - rc = p.returncode - - if rc != 0: - print >> sys.stderr, "FAILED: rc={0}: {1}".format(rc, ' '.join(args)) - print >> sys.stderr, stderrdata - sys.exit(1) + stdoutdata, stderrdata = gd_util.run_program(prog, args) stats = [] @@ -142,7 +107,7 @@ args.append(':'.join(population_names)) args.append('-x') - run_program(None, args) + gd_util.run_program(prog, args) def do_eval2pct(eval_file, explained_file): prog = 'eval2pct' @@ -150,16 +115,8 @@ args = [ prog ] args.append(eval_file) - with open(explained_file, 'w') as ofh: - #print "args:", ' '.join(args) - p = subprocess.Popen(args, bufsize=-1, stdin=None, stdout=ofh, stderr=subprocess.PIPE) - (stdoutdata, stderrdata) = p.communicate() - rc = p.returncode - - if rc != 0: - print >> sys.stderr, "FAILED: rc={0}: {1}".format(rc, ' '.join(args)) - print >> sys.stderr, stderrdata - sys.exit(1) + with open(explained_file, 'w') as fh: + gd_util.run_program(prog, args, stdout=fh) def do_coords2admix(coords_file): prog = 'coords2admix' @@ -167,16 +124,8 @@ args = [ prog ] args.append(coords_file) - with open('fake', 'w') as ofh: - #print "args:", ' '.join(args) - p = subprocess.Popen(args, bufsize=-1, stdin=None, stdout=ofh, stderr=subprocess.PIPE) - (stdoutdata, stderrdata) = p.communicate() - rc = p.returncode - - if rc != 0: - print >> sys.stderr, "FAILED: rc={0}: {1}".format(rc, ' '.join(args)) - print >> sys.stderr, stderrdata - sys.exit(1) + with open('fake', 'w') as fh: + gd_util.run_program(prog, args, stdout=fh) shutil.copy2('fake', coords_file) @@ -203,41 +152,55 @@ ################################################################################ if len(sys.argv) != 5: - print "usage" - sys.exit(1) + gd_util.die('Usage') input, input_files_path, output, output_files_path = sys.argv[1:5] +gd_util.mkdir_p(output_files_path) -mkdir_p(output_files_path) +################################################################################ ped_file = os.path.join(input_files_path, 'admix.ped') geno_file = os.path.join(output_files_path, 'admix.geno') do_ped2geno(ped_file, geno_file) +################################################################################ + map_file = os.path.join(input_files_path, 'admix.map') snp_file = os.path.join(output_files_path, 'admix.snp') do_map2snp(map_file, snp_file) +################################################################################ + ind_file = os.path.join(output_files_path, 'admix.ind') population_names, name_map = make_ind_file(ind_file, input) +################################################################################ + par_file = os.path.join(output_files_path, 'par.admix') evec_file = os.path.join(output_files_path, 'coordinates.txt') eval_file = os.path.join(output_files_path, 'admix.eval') make_par_file(par_file, geno_file, snp_file, ind_file, evec_file, eval_file) +################################################################################ + smartpca_stats = do_smartpca(par_file) fix_names(name_map, [ind_file, evec_file]) +################################################################################ + do_ploteig(evec_file, population_names) plot_file = 'coordinates.txt.1:2.{0}.pdf'.format(':'.join(population_names)) output_plot_file = os.path.join(output_files_path, 'PCA.pdf') shutil.copy2(plot_file, output_plot_file) os.unlink(plot_file) +################################################################################ + do_eval2pct(eval_file, os.path.join(output_files_path, 'explained.txt')) os.unlink(eval_file) +################################################################################ + do_coords2admix(evec_file) ################################################################################ diff -r 91e835060ad2 -r 8997f2ca8c7a pca.xml --- a/pca.xml Mon Jun 03 12:29:29 2013 -0400 +++ b/pca.xml Mon Jul 15 10:47:35 2013 -0400 @@ -2,7 +2,7 @@ : Principal Components Analysis of genotype data - pca.py "$input" "$input.extra_files_path" "$output" "$output.files_path" + pca.py '$input' '$input.extra_files_path' '$output' '$output.files_path' diff -r 91e835060ad2 -r 8997f2ca8c7a phylogenetic_tree.py --- a/phylogenetic_tree.py Mon Jun 03 12:29:29 2013 -0400 +++ b/phylogenetic_tree.py Mon Jul 15 10:47:35 2013 -0400 @@ -1,154 +1,63 @@ #!/usr/bin/env python +import gd_util import os -import errno import sys -import subprocess -import shutil from Population import Population import gd_composite ################################################################################ -def mkdir_p(path): - try: - os.makedirs(path) - except OSError, e: - if e.errno <> errno.EEXIST: - raise - -################################################################################ +if len(sys.argv) != 12: + gd_util.die('Usage') -# -# phylogenetic_tree.py "$input" "$output" "$output.files_path" -# -# #if $input_type.choice == '0' -# "gd_snp" -# #if $input_type.data_source.choice == '0' -# "sequence_coverage" -# "$input_type.data_source.minimum_coverage" -# "$input_type.data_source.minimum_quality" -# #else if $input_type.data_source.choice == '1' -# "estimated_genotype" -# #else if $input_type.choice == '1' -# "gd_genotype" -# #end if -# -# #if $individuals.choice == '0' -# "all_individuals" -# #else if $individuals.choice == '1' -# "$individuals.p1_input" -# #end if -# -# #if ((str($input.metadata.scaffold) == str($input.metadata.ref)) and (str($input.metadata.pos) == str($input.metadata.rPos))) or (str($include_reference) == '0') -# "none" -# #else -# "$input.metadata.dbkey" -# #end if -# -# #set $draw_tree_options = ''.join(str(x) for x in [$branch_style, $scale_style, $length_style, $layout_style]) -# #if $draw_tree_options == '' -# "" -# #else -# "-$draw_tree_options" -# #end if -# -# #for $individual_name, $individual_col in zip($input.dataset.metadata.individual_names, $input.dataset.metadata.individual_columns) -# #set $arg = '%s:%s' % ($individual_col, $individual_name) -# "$arg" -# #end for -# - -################################################################################ - -# if len(sys.argv) < 11: -# print >> sys.stderr, "Usage" -# sys.exit(1) -# -# input, p1_input, output, extra_files_path, minimum_coverage, minimum_quality, dbkey, data_source, draw_tree_options = sys.argv[1:10] -# -# individual_metadata = sys.argv[10:] -# -# # note: TEST THIS -# if dbkey in ['', '?', 'None']: -# dbkey = 'none' -# -# p_total = Population() -# p_total.from_tag_list(individual_metadata) - -if len(sys.argv) < 5: - print >> sys.stderr, 'Usage' - sys.exit(1) - -input, output, extra_files_path, input_type = sys.argv[1:5] -args = sys.argv[5:] - -data_source = '1' -minimum_coverage = '0' -minimum_quality = '0' +input, output, extra_files_path, input_type, data_source_arg, minimum_coverage, minimum_quality, p1_input, dbkey, draw_tree_options, ind_arg = sys.argv[1:] if input_type == 'gd_snp': - data_source_arg = args.pop(0) if data_source_arg == 'sequence_coverage': - data_source = '0' - minimum_coverage = args.pop(0) - minimum_quality = args.pop(0) + data_source = 0 elif data_source_arg == 'estimated_genotype': - pass + data_source = 1 else: - print >> sys.stderr, 'Unsupported data_source:', data_source_arg - sys.exit(1) + gd_util.die('Unsupported data_source: {0}'.format(data_source_arg)) elif input_type == 'gd_genotype': - pass + data_source = 1 + minimum_coverage = 0 + minimum_quality = 0 else: - print >> sys.stderr, 'Unsupported input_type:', input_type - sys.exit(1) - -p1_input, dbkey, draw_tree_options = args[:3] + gd_util.die('Unsupported input_type:: {0}'.format(input_type)) # note: TEST THIS if dbkey in ['', '?', 'None']: dbkey = 'none' -individual_metadata = args[3:] +p_total = Population() +p_total.from_wrapped_dict(ind_arg) -p_total = Population() -p_total.from_tag_list(individual_metadata) - -################################################################################ - -mkdir_p(extra_files_path) +if p1_input == "all_individuals": + tags = p_total.tag_list() +else: + p1 = Population() + p1.from_population_file(p1_input) + if not p_total.is_superset(p1): + gd_util.die('There is an individual in the population that is not in the SNP table') + tags = p1.tag_list() ################################################################################ -def run_program(prog, args, ofh): - #print "args: ", ' '.join(args) - p = subprocess.Popen(args, bufsize=-1, executable=prog, stdin=None, stdout=ofh, stderr=subprocess.PIPE) - (stdoutdata, stderrdata) = p.communicate() - rc = p.returncode - ofh.close() - - if rc != 0: - #print >> sys.stderr, "FAILED: rc={0}: {1}".format(rc, ' '.join(args)) - print >> sys.stderr, stderrdata - sys.exit(1) - -################################################################################ - +gd_util.mkdir_p(extra_files_path) phylip_outfile = os.path.join(extra_files_path, 'distance_matrix.phylip') newick_outfile = os.path.join(extra_files_path, 'phylogenetic_tree.newick') ps_outfile = 'tree.ps' pdf_outfile = os.path.join(extra_files_path, 'tree.pdf') +informative_snp_file = os.path.join(extra_files_path, 'informative_snps.txt') +mega_distance_matrix_file = os.path.join(extra_files_path, 'mega_distance_matrix.txt') ################################################################################ -informative_snp_file = os.path.join(extra_files_path, 'informative_snps.txt') -mega_distance_matrix_file = os.path.join(extra_files_path, 'mega_distance_matrix.txt') - prog = 'dist_mat' -args = [] -args.append(prog) +args = [ prog ] args.append(input) args.append(minimum_coverage) args.append(minimum_quality) @@ -157,67 +66,53 @@ args.append(informative_snp_file) args.append(mega_distance_matrix_file) -if p1_input == "all_individuals": - tags = p_total.tag_list() -else: - p1 = Population() - p1.from_population_file(p1_input) - if not p_total.is_superset(p1): - print >> sys.stderr, 'There is an individual in the population that is not in the SNP table' - sys.exit(1) - tags = p1.tag_list() - for tag in tags: if input_type == 'gd_genotype': column, name = tag.split(':') tag = '{0}:{1}'.format(int(column) - 2, name) args.append(tag) -fh = open(phylip_outfile, 'w') -run_program(None, args, fh) +with open(phylip_outfile, 'w') as fh: + gd_util.run_program(prog, args, stdout=fh) ################################################################################ prog = 'quicktree' -args = [] -args.append(prog) +args = [ prog ] args.append('-in') args.append('m') args.append('-out') args.append('t') args.append(phylip_outfile) -fh = open(newick_outfile, 'w') -run_program(None, args, fh) +with open(newick_outfile, 'w') as fh: + gd_util.run_program(prog, args, stdout=fh) ################################################################################ prog = 'draw_tree' -args = [] -args.append(prog) +args = [ prog ] + if draw_tree_options: args.append(draw_tree_options) + args.append(newick_outfile) -fh = open(ps_outfile, 'w') -run_program(None, args, fh) +with open(ps_outfile, 'w') as fh: + gd_util.run_program(prog, args, stdout=fh) ################################################################################ prog = 'ps2pdf' -args = [] -args.append(prog) +args = [ prog ] args.append('-dPDFSETTINGS=/prepress') args.append(ps_outfile) -args.append('-') +args.append(pdf_outfile) -fh = open(pdf_outfile, 'w') -run_program(None, args, fh) - -shutil.copyfile(pdf_outfile, output) +gd_util.run_program(prog, args) ################################################################################ @@ -248,9 +143,9 @@ in_include_ref = gd_composite.Parameter(description='Include reference sequence', value=include_ref_value, display_type=display_value) -if data_source == '0': +if data_source == 0: data_source_value = 'sequence coverage' -elif data_source == '1': +elif data_source == 1: data_source_value = 'estimated genotype' in_data_source = gd_composite.Parameter(description='Data source', value=data_source_value, display_type=display_value) diff -r 91e835060ad2 -r 8997f2ca8c7a phylogenetic_tree.xml --- a/phylogenetic_tree.xml Mon Jun 03 12:29:29 2013 -0400 +++ b/phylogenetic_tree.xml Mon Jul 15 10:47:35 2013 -0400 @@ -2,44 +2,45 @@ : Show genetic relationships among individuals - phylogenetic_tree.py "$input" "$output" "$output.files_path" - + #import json + #import base64 + #import zlib + #set $ind_names = $input.dataset.metadata.individual_names + #set $ind_colms = $input.dataset.metadata.individual_columns + #set $ind_dict = dict(zip($ind_names, $ind_colms)) + #set $ind_json = json.dumps($ind_dict, separators=(',',':')) + #set $ind_comp = zlib.compress($ind_json, 9) + #set $ind_arg = base64.b64encode($ind_comp) + phylogenetic_tree.py '$input' '$output' '$output.files_path' #if $input_type.choice == '0' - "gd_snp" + 'gd_snp' #if $input_type.data_source.choice == '0' - "sequence_coverage" - "$input_type.data_source.minimum_coverage" - "$input_type.data_source.minimum_quality" + 'sequence_coverage' + '$input_type.data_source.minimum_coverage' + '$input_type.data_source.minimum_quality' #else if $input_type.data_source.choice == '1' - "estimated_genotype" + 'estimated_genotype' '0' '0' #end if #else if $input_type.choice == '1' - "gd_genotype" - #end if - - #if $individuals.choice == '0' - "all_individuals" - #else if $individuals.choice == '1' - "$individuals.p1_input" + 'gd_genotype' 'estimated_genotype' '0' '0' #end if - + #if $individuals.choice == '0' + 'all_individuals' + #else if $individuals.choice == '1' + '$individuals.p1_input' + #end if #if ((str($input.metadata.scaffold) == str($input.metadata.ref)) and (str($input.metadata.pos) == str($input.metadata.rPos))) or (str($include_reference) == '0') - "none" + 'none' #else - "$input.metadata.dbkey" + '$input.metadata.dbkey' #end if - #set $draw_tree_options = ''.join(str(x) for x in [$branch_style, $scale_style, $length_style, $layout_style]) #if $draw_tree_options == '' - "" + '' #else - "-$draw_tree_options" + '-$draw_tree_options' #end if - - #for $individual_name, $individual_col in zip($input.dataset.metadata.individual_names, $input.dataset.metadata.individual_columns) - #set $arg = '%s:%s' % ($individual_col, $individual_name) - "$arg" - #end for + '$ind_arg' diff -r 91e835060ad2 -r 8997f2ca8c7a prepare_population_structure.py --- a/prepare_population_structure.py Mon Jun 03 12:29:29 2013 -0400 +++ b/prepare_population_structure.py Mon Jul 15 10:47:35 2013 -0400 @@ -1,16 +1,15 @@ #!/usr/bin/env python -import errno +import gd_util import os import shutil -import subprocess import sys from Population import Population import gd_composite ################################################################################ -def do_import(filename, files_path, min_reads, min_qual, min_spacing, tags, using_info, population_list): +def do_import(filename, files_path, min_reads, min_qual, min_spacing, using_info, population_list): info_page = gd_composite.InfoPage() info_page.set_title('Prepare to look for population structure Galaxy Composite Dataset') @@ -39,50 +38,29 @@ with open(filename, 'w') as ofh: print >> ofh, info_page.render() -def mkdir_p(path): - try: - os.makedirs(path) - except OSError, e: - if e.errno <> errno.EEXIST: - raise - -def die(message, exit=True): - print >> sys.stderr, message - if exit: - sys.exit(1) - ################################################################################ if len(sys.argv) < 10: - die("Usage") + gd_util.die('Usage') # parse command line -input_snp_filename, input_type, min_reads, min_qual, min_spacing, output_filename, output_files_path = sys.argv[1:8] -args = sys.argv[8:] +input_snp_filename, input_type, min_reads, min_qual, min_spacing, output_filename, output_files_path, ind_arg = sys.argv[1:9] +args = sys.argv[9:] -individual_metadata = [] population_files = [] -population_names = [] all_individuals = False for arg in args: if arg == 'all_individuals': all_individuals = True - elif len(arg) > 11: - tag = arg[:11] - value = arg[11:] - if tag == 'individual:': - individual_metadata.append(value) - elif tag == 'population:': - filename, name = value.split(':', 1) - population_files.append(filename) - population_names.append(name) + elif len(arg) > 11 and arg[:11] == 'population:': + file, name = arg[11:].split(':', 1) + population_files.append((file, name)) p_total = Population() -p_total.from_tag_list(individual_metadata) +p_total.from_wrapped_dict(ind_arg) individual_population = {} - population_list = [] if all_individuals: @@ -91,57 +69,50 @@ population_list.append(p1) else: p1 = Population() - for idx in range(len(population_files)): - population_file = population_files[idx] - population_name = population_names[idx] - this_pop = Population(population_name) - this_pop.from_population_file(population_file) + for file, name in population_files: + this_pop = Population(name) + this_pop.from_population_file(file) population_list.append(this_pop) - p1.from_population_file(population_file) - tags = p1.tag_list() - for tag in tags: + + for tag in this_pop.tag_list(): if tag not in individual_population: - individual_population[tag] = population_name + individual_population[tag] = name + + # add individuals from this file to p1 + p1.from_population_file(file) + if not p_total.is_superset(p1): - print >> sys.stderr, 'There is an individual in the population that is not in the SNP table' - sys.exit(1) + gd_util.die('There is an individual in the population that is not in the SNP table') -# run tool +################################################################################ + prog = 'admix_prep' -args = [] -args.append(prog) +args = [ prog ] args.append(input_snp_filename) args.append(min_reads) args.append(min_qual) args.append(min_spacing) -tags = p1.tag_list() -for tag in tags: +for tag in p1.tag_list(): if input_type == 'gd_genotype': - column, name = tag.split(':') + column, name = tag.split(':', 1) tag = '{0}:{1}'.format(int(column) - 2, name) args.append(tag) -#print "args:", ' '.join(args) -p = subprocess.Popen(args, bufsize=-1, stdin=None, stdout=subprocess.PIPE, stderr=sys.stderr) -(stdoutdata, stderrdata) = p.communicate() -rc = p.returncode +stdoutdata, stderrdata = gd_util.run_program(prog, args) +using_info = stdoutdata.rstrip('\r\n') -if rc != 0: - die('admix_prep failed: rc={0}'.format(rc)) +################################################################################ -using_info = stdoutdata.rstrip('\r\n') -mkdir_p(output_files_path) +gd_util.mkdir_p(output_files_path) + output_ped_filename = os.path.join(output_files_path, 'admix.ped') output_map_filename = os.path.join(output_files_path, 'admix.map') shutil.copy2('admix.ped', output_ped_filename) shutil.copy2('admix.map', output_map_filename) -do_import(output_filename, output_files_path, min_reads, min_qual, min_spacing, tags, using_info, population_list) -os.unlink('admix.ped') -os.unlink('admix.map') - +do_import(output_filename, output_files_path, min_reads, min_qual, min_spacing, using_info, population_list) sys.exit(0) diff -r 91e835060ad2 -r 8997f2ca8c7a prepare_population_structure.xml --- a/prepare_population_structure.xml Mon Jun 03 12:29:29 2013 -0400 +++ b/prepare_population_structure.xml Mon Jul 15 10:47:35 2013 -0400 @@ -1,26 +1,31 @@ - + : Filter and convert to the format needed for these tools - prepare_population_structure.py "$input" + #import json + #import base64 + #import zlib + #set $ind_names = $input.dataset.metadata.individual_names + #set $ind_colms = $input.dataset.metadata.individual_columns + #set $ind_dict = dict(zip($ind_names, $ind_colms)) + #set $ind_json = json.dumps($ind_dict, separators=(',',':')) + #set $ind_comp = zlib.compress($ind_json, 9) + #set $ind_arg = base64.b64encode($ind_comp) + prepare_population_structure.py '$input' #if $input_type.choice == '0' - "gd_snp" "$input_type.min_reads" "$input_type.min_qual" + 'gd_snp' '$input_type.min_reads' '$input_type.min_qual' #else if $input_type.choice == '1' - "gd_genotype" "0" "0" + 'gd_genotype' '0' '0' #end if - "$min_spacing" "$output" "$output.files_path" + '0' '$output' '$output.files_path' '$ind_arg' #if $individuals.choice == '0' - "all_individuals" + 'all_individuals' #else if $individuals.choice == '1' #for $population in $individuals.populations #set $pop_arg = 'population:%s:%s' % (str($population.p_input), str($population.p_input.name)) - "$pop_arg" + '$pop_arg' #end for #end if - #for $individual, $individual_col in zip($input.dataset.metadata.individual_names, $input.dataset.metadata.individual_columns) - #set $arg = 'individual:%s:%s' % ($individual_col, $individual) - "$arg" - #end for @@ -52,7 +57,10 @@ + diff -r 91e835060ad2 -r 8997f2ca8c7a rank_pathways.xml --- a/rank_pathways.xml Mon Jun 03 12:29:29 2013 -0400 +++ b/rank_pathways.xml Mon Jul 15 10:47:35 2013 -0400 @@ -1,28 +1,52 @@ - + : Assess the impact of a gene set on KEGG pathways - #if str($output_format) == 'a' + #if $rank_by.choice == 'pct' rank_pathways_pct.py - #else if str($output_format) == 'b' + --input '$rank_by.input1' + --columnENSEMBLT '$rank_by.t_col1' + --inBckgrndfile '$rank_by.input2' + --columnENSEMBLTBckgrnd '$rank_by.t_col2' + --columnKEGGBckgrnd '$rank_by.k_col2' + --statsTest '$rank_by.stat' + --output '$output' + #else if $rank_by.choice == 'paths' calclenchange.py + '--loc_file=${GALAXY_DATA_INDEX_DIR}/gd.rank.loc' + '--species=${rank_by.input.metadata.dbkey}' + '--input=${rank_by.input}' + '--output=${output}' + '--posKEGGclmn=${rank_by.kpath}' + '--KEGGgeneposcolmn=${rank_by.kgene}' #end if - "--loc_file=${GALAXY_DATA_INDEX_DIR}/gd.rank.loc" - "--species=${input.metadata.dbkey}" - "--input=${input}" - "--output=${output}" - "--posKEGGclmn=${kpath}" - "--KEGGgeneposcolmn=${kgene}" - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + @@ -31,11 +55,6 @@ - - - - - @@ -43,9 +62,10 @@ **Dataset formats** -The input and output datasets are in tabular_ format. +All of the input and output datasets are in tabular_ format. The input dataset must have columns with KEGG gene ID and pathways. -The output dataset is described below. +[Need to update this, since input columns now depend on the "Rank by" choice.] +The output datasets are described below. (`Dataset missing?`_) .. _tabular: ./static/formatHelp.html#tab @@ -56,7 +76,8 @@ **What it does** This tool produces a table ranking the pathways based on the percentage -of genes in an input dataset, out of the total in each pathway. +of genes in an input dataset, out of the total in each pathway +[please clarify w.r.t. query and background datasets]. Alternatively, the tool ranks the pathways based on the change in length and number of paths connecting sources and sinks. This change is calculated between graphs representing pathways with and without excluding @@ -65,14 +86,15 @@ Sinks are all the nodes representing the final reactants/products in the pathway. -If pathways are ranked by percentage of genes affected, the output is -a tabular dataset with the following columns: +If pathways are ranked by percentage of genes affected, the output contains +a row for each KEGG pathway, with the following columns: -1. number of genes in the pathway present in the input dataset -2. percentage of the total genes in the pathway included in the input dataset -3. rank of the frequency (from high freq to low freq) -4. Fisher probability of enrichment/depletion of pathway genes in the input dataset -5. name of the pathway +1. count: the number of genes in the query set that are in this pathway +2. representation: the percentage of this pathway's genes (from the background dataset) that appear in the query set +3. ranking of this pathway, based on its representation ("1" is highest) +4. probability of depletion of this pathway in the query dataset +5. probability of enrichment of this pathway in the query dataset +6. KEGG pathway If pathways are ranked by change in length and number of paths, the output is a tabular dataset with the following columns: @@ -97,20 +119,20 @@ Contig62_chr1_19011969_19012646 265 chr1 19012240 ENSCAFT00000000144 ENSCAFP00000000125 * 161 R 483960 probably damaging N etc. -- output ranked by percentage of genes affected:: +- output ranked by percentage of genes affected [need new sample output with more columns]:: - 3 0.25 1 cfa03450=Non-homologous end-joining - 1 0.25 1 cfa00750=Vitamin B6 metabolism - 2 0.2 3 cfa00290=Valine, leucine and isoleucine biosynthesis - 3 0.18 4 cfa00770=Pantothenate and CoA biosynthesis + 3 0.25 1 cfa03450=Non-homologous end-joining + 1 0.25 1 cfa00750=Vitamin B6 metabolism + 2 0.2 3 cfa00290=Valine, leucine and isoleucine biosynthesis + 3 0.18 4 cfa00770=Pantothenate and CoA biosynthesis etc. - output ranked by change in length and number of paths:: - 3.64 8.44 4.8 2 4 9 5 1 cfa00260=Glycine, serine and threonine metabolism - 7.6 9.6 2 1 3 5 2 2 cfa00240=Pyrimidine metabolism - 0.05 2.67 2.62 6 1 30 29 3 cfa00982=Drug metabolism - cytochrome P450 - -0.08 8.33 8.41 84 1 30 29 3 cfa00564=Glycerophospholipid metabolism + 3.64 8.44 4.8 2 4 9 5 1 cfa00260=Glycine, serine and threonine metabolism + 7.6 9.6 2 1 3 5 2 2 cfa00240=Pyrimidine metabolism + 0.05 2.67 2.62 6 1 30 29 3 cfa00982=Drug metabolism - cytochrome P450 + -0.08 8.33 8.41 84 1 30 29 3 cfa00564=Glycerophospholipid metabolism etc. diff -r 91e835060ad2 -r 8997f2ca8c7a rank_pathways_pct.py --- a/rank_pathways_pct.py Mon Jun 03 12:29:29 2013 -0400 +++ b/rank_pathways_pct.py Mon Jul 15 10:47:35 2013 -0400 @@ -1,12 +1,12 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # -# calcfreq.py +# KEGGFisher.py # -# Copyright 2011 Oscar Bedoya-Reina +# Copyright 2013 Oscar Reina # # This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by +# it under the pathways of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # @@ -20,119 +20,187 @@ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, # MA 02110-1301, USA. -import argparse,os,sys +import argparse +import os +import sys +from fisher import pvalue as fisher from decimal import Decimal,getcontext -from LocationFile import LocationFile -from fisher import pvalue +from math import lgamma,exp,factorial -#method to rank the the pthways by mut. freq. -def rankd(ltfreqs): - ordvals=sorted(ltfreqs)#sort and reverse freqs. +def binProb(SAPs_KEGG,NoSAPs_KEGG,SAPs_all,NoSAPs_all,CntKEGG_All,totalSAPs,pKEGG): + """ + Returns binomial probability. + """ + def comb(CntKEGG_All,k): + return factorial(CntKEGG_All) / Decimal(str(factorial(k)*factorial(CntKEGG_All-k))) + probLow = 0 + for k in range(0, SAPs_KEGG+1): + cp=Decimal(str(comb(CntKEGG_All,k))) + bp=Decimal(str(pKEGG**k)) + dp=Decimal(str(1.0-pKEGG))**Decimal(str(CntKEGG_All-k)) + probLow+=cp*bp*dp + #~ + probHigh = 0 + for k in range(int(SAPs_KEGG),CntKEGG_All+1): + cp=Decimal(str(comb(CntKEGG_All,k))) + bp=Decimal(str(pKEGG**k)) + dp=Decimal(str(1.0-pKEGG))**Decimal(str(CntKEGG_All-k)) + probHigh+=cp*bp*dp + return probLow,probHigh + +def gauss_hypergeom(X, CntKEGG_All, SAPs_all, totalSAPs): + CntKEGG_All,SAPs_all,totalSAPs + """ + Returns the probability of drawing X successes of SAPs_all marked items + in CntKEGG_All draws from a bin of totalSAPs total items + """ + def logchoose(ni, ki): + try: + lgn1 = lgamma(ni+1) + lgk1 = lgamma(ki+1) + lgnk1 = lgamma(ni-ki+1) + except ValueError: + raise ValueError + return lgn1 - (lgnk1 + lgk1) #~ - outrnk=[] - tmpFreq0,tmpCount,pval,tmpPthw=ordvals.pop()#the highest possible value - crank=1 - outrnk.append('\t'.join([str(tmpCount),str(tmpFreq0),str(crank),str(pval),tmpPthw])) - totalnvals=len(ordvals) + r1 = logchoose(SAPs_all, X) + try: + r2 = logchoose(totalSAPs-SAPs_all, CntKEGG_All-X) + except ValueError: + return 0 + r3 = logchoose(totalSAPs,CntKEGG_All) + return exp(r1 + r2 - r3) + +def hypergeo_sf(SAPs_KEGG,NoSAPs_KEGG,SAPs_all,NoSAPs_all,CntKEGG_All,totalSAPs,pKEGG): + """ + Runs Hypergeometric probability test + """ + s = 0 + t=0 + for i in range(SAPs_KEGG,min(SAPs_all,CntKEGG_All)+1): + s += max(gauss_hypergeom(i,CntKEGG_All,SAPs_all,totalSAPs), 0.0) + for i in range(0, SAPs_KEGG+1): + t += max(gauss_hypergeom(i,CntKEGG_All,SAPs_all,totalSAPs), 0.0) + return min(max(t,0.0), 1),min(max(s,0.0), 1) + +def fisherexct(SAPs_KEGG,NoSAPs_KEGG,SAPs_all,NoSAPs_all,CntKEGG_All,totalSAPs,pKEGG): + """ + Runs Fisher's exact test + """ + ftest=fisher(SAPs_KEGG,NoSAPs_KEGG,SAPs_all,NoSAPs_all) + probLow,probHigh=ftest.left_tail,ftest.right_tail + return probLow,probHigh + +def rtrnKEGGcENSEMBLc(inBckgrndfile,columnENSEMBLTBckgrnd,columnKEGGBckgrnd): + """ + """ + dKEGGTENSEMBLT={} + for eachl in open(inBckgrndfile,'r'): + if eachl.strip(): + ENSEMBLT=eachl.splitlines()[0].split('\t')[columnENSEMBLTBckgrnd] + KEGGTs=set(eachl.splitlines()[0].split('\t')[columnKEGGBckgrnd].split('.')) + KEGGTs=KEGGTs.difference(set(['','U','N'])) + for KEGGT in KEGGTs: + try: + dKEGGTENSEMBLT[KEGGT].add(ENSEMBLT) + except: + dKEGGTENSEMBLT[KEGGT]=set([ENSEMBLT]) + ENSEMBLTGinKEGG=set.union(*dKEGGTENSEMBLT.values()) + return dKEGGTENSEMBLT,ENSEMBLTGinKEGG + +def rtrnENSEMBLcSAPs(inSAPsfile,columnENSEMBLT,ENSEMBLTGinKEGG): + """ + returns a set of the ENSEMBLT codes present in the input list and + in the KEGG file + """ + sENSEMBLTSAPsinKEGG=set() + for eachl in open(inSAPsfile,'r'): + ENSEMBLT=eachl.splitlines()[0].split('\t')[columnENSEMBLT] + if ENSEMBLT in ENSEMBLTGinKEGG: + sENSEMBLTSAPsinKEGG.add(ENSEMBLT) + return sENSEMBLTSAPsinKEGG + +def rtrnCounts(dKEGGTENSEMBLT,ENSEMBLTGinKEGG,sENSEMBLTSAPsinKEGG,statsTest): + """ + returns a list of the ENSEMBLT codes present in the input list and + in the KEGG file. The pathways in this list are: 'Go Term','# Genes in + the KEGG Term','# Genes in the list and in the KEGG Term','Enrichement + of the KEGG Term for genes in the input list','Genes in the input list + present in the KEGG term' + """ + totalSAPs=len(ENSEMBLTGinKEGG) + SAPs_all=len(sENSEMBLTSAPsinKEGG) + NoSAPs_all=totalSAPs-SAPs_all + pKEGG=SAPs_all/float(totalSAPs) + #~ + lp=len(dKEGGTENSEMBLT) cnt=0 - while totalnvals>cnt: + #~ + if statsTest=='fisher': + ptest=fisherexct + elif statsTest=='hypergeometric': + ptest=hypergeo_sf + elif statsTest=='binomial': + ptest=binProb + #~ + ltfreqs=[] + for echKEGGT in dKEGGTENSEMBLT: cnt+=1 - tmpFreq,tmpCount,pval,tmpPthw=ordvals.pop() - if tmpFreq!=tmpFreq0: - crank=len(outrnk)+1 - tmpFreq0=tmpFreq - outrnk.append('\t'.join([str(tmpCount),str(tmpFreq),str(crank),str(pval),tmpPthw])) - return outrnk - + CntKEGG_All=len(dKEGGTENSEMBLT[echKEGGT]) + SAPs_KEGG=len(dKEGGTENSEMBLT[echKEGGT].intersection(sENSEMBLTSAPsinKEGG)) + NoSAPs_KEGG=CntKEGG_All-SAPs_KEGG + probLow,probHigh=ptest(SAPs_KEGG,NoSAPs_KEGG,SAPs_all,NoSAPs_all,CntKEGG_All,totalSAPs,pKEGG) + ltfreqs.append([(SAPs_KEGG/Decimal(CntKEGG_All)),SAPs_KEGG,probLow,probHigh,echKEGGT]) + #~ + ltfreqs.sort() + ltfreqs.reverse() + outl=[] + cper,crank=Decimal('2'),0 + #~ + getcontext().prec=2#set 2 decimal places + for perc,cnt_go,pvalLow,pvalHigh,goTerm in ltfreqs: + if perc-1]) - while True:#to correct names with more than one gene - try: - mgenes=sdGenes.pop() - pthwsAssotd=dKEGGcPthws.pop(mgenes) - mgenes=mgenes.split('.') - for eachg in mgenes: - dKEGGcPthws[eachg]=pthwsAssotd - except: - break - #~ Count genes + #~ + parser = argparse.ArgumentParser(description='Returns the count of genes in KEGG categories and their statistical overrrepresentation, from a list of genes and an background file (i.e. plane text with ENSEMBLT and KEGG pathways).') + parser.add_argument('--input',metavar='input TXT file',type=str,help='the input file with the table in txt format.',required=True) + parser.add_argument('--inBckgrndfile',metavar='input TXT file',type=str,help='the input file with the background table in txt format.',required=True) + parser.add_argument('--output',metavar='output TXT file',type=str,help='the output file with the table in txt format.',required=True) + parser.add_argument('--columnENSEMBLT',metavar='column number',type=int,help='column with the ENSEMBL transcript code in the input file.',required=True) + parser.add_argument('--columnENSEMBLTBckgrnd',metavar='column number',type=int,help='column with the ENSEMBL transcript code in the background file.',required=True) + parser.add_argument('--columnKEGGBckgrnd',metavar='column number',type=int,help='column with the KEGG pathways in the background file.',required=True) + parser.add_argument('--statsTest',metavar='input TXT file',type=str,help='statistical test to compare KEGG pathways (i.e. fisher, hypergeometric, binomial).',required=True) + + args = parser.parse_args() - location_file = LocationFile(locf) - prefix, kxml_dir_path, dict_file = location_file.get_values(species) - dPthContsTotls = {} - try: - with open(dict_file) as fh: - for line in fh: - line = line.rstrip('\r\n') - value, key = line.split('\t') - dPthContsTotls[key] = int(value) - except IOError, err: - print >> sys.stderr, 'Error opening dict file {0}: {1}'.format(dict_file, err.strerror) - sys.exit(1) - - dPthContsTmp=dict([(x,0) for x in dPthContsTotls.keys()])#create a list of genes - sdGenes=set(dKEGGcPthws.keys())#list of all genes - cntGens=0 - ltGens=len(sdGenes) - while cntGens> sys.stderr, "Error: pathway not found in database: '{0}'".format(eachP) - sys.exit(1) - cntGens+=1 - #~ Calculate Freqs. - ltfreqs=[] - cntAllKEGGinGnm=sum(dPthContsTotls.values()) - cntListKEGGinGnm=sum(dPthContsTmp.values()) - cntAllKEGGNOTinGnm=cntAllKEGGinGnm-cntListKEGGinGnm - for pthw in dPthContsTotls: - cntAllKEGGinpthw=Decimal(dPthContsTotls[pthw]) - try: - cntListKEGGinpthw=Decimal(dPthContsTmp[pthw]) - except: - cntListKEGGinpthw=Decimal('0') - cntAllKEGGNOTinpthw=cntAllKEGGinpthw-cntListKEGGinpthw - pval=pvalue(cntListKEGGinpthw,cntAllKEGGNOTinpthw,cntListKEGGinGnm,cntAllKEGGNOTinGnm) - - ltfreqs.append([(cntListKEGGinpthw/cntAllKEGGinpthw),cntListKEGGinpthw,Decimal(str(pval.two_tail))*1,pthw]) - #~ ltfreqs=[((Decimal(dPthContsTmp[x])/Decimal(dPthContsTotls[x])),Decimal(dPthContsTmp[x]),x) for x in dPthContsTotls] - tabllfreqs='\n'.join(rankd(ltfreqs)) - salef=open(outputf,'w') - salef.write(tabllfreqs) - salef.close() + inSAPsfile = args.input + inBckgrndfile = args.inBckgrndfile + saleKEGGPCount = args.output + columnENSEMBLT = args.columnENSEMBLT + columnENSEMBLTBckgrnd = args.columnENSEMBLTBckgrnd + columnKEGGBckgrnd = args.columnKEGGBckgrnd + statsTest = args.statsTest + columnENSEMBLT-=1 + columnENSEMBLTBckgrnd-=1 + columnKEGGBckgrnd=-1 + #~ + dKEGGTENSEMBLT,ENSEMBLTGinKEGG=rtrnKEGGcENSEMBLc(inBckgrndfile,columnENSEMBLTBckgrnd,columnKEGGBckgrnd) + sENSEMBLTSAPsinKEGG=rtrnENSEMBLcSAPs(inSAPsfile,columnENSEMBLT,ENSEMBLTGinKEGG) + outl=rtrnCounts(dKEGGTENSEMBLT,ENSEMBLTGinKEGG,sENSEMBLTSAPsinKEGG,statsTest) + #~ + saleKEGGPCount=open(saleKEGGPCount,'w') + saleKEGGPCount.write('\n'.join(outl)) + saleKEGGPCount.close() + #~ return 0 - if __name__ == '__main__': main() + diff -r 91e835060ad2 -r 8997f2ca8c7a rank_terms.py --- a/rank_terms.py Mon Jun 03 12:29:29 2013 -0400 +++ b/rank_terms.py Mon Jul 15 10:47:35 2013 -0400 @@ -1,132 +1,204 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GOFisher.py -# -# Copyright 2013 Oscar Reina -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, -# MA 02110-1301, USA. - -import argparse -import os -import sys -from fisher import pvalue -from decimal import Decimal,getcontext - -def rtrnGOcENSEMBLc(inExtnddfile,columnENSEMBLTExtndd,columnGOExtndd): - """ - """ - dGOTENSEMBLT={} - for eachl in open(inExtnddfile,'r'): - if eachl.strip(): - ENSEMBLT=eachl.splitlines()[0].split('\t')[columnENSEMBLTExtndd] - GOTs=set(eachl.splitlines()[0].split('\t')[columnGOExtndd].split('.')) - GOTs=GOTs.difference(set(['','U','N'])) - for GOT in GOTs: - try: - dGOTENSEMBLT[GOT].add(ENSEMBLT) - except: - dGOTENSEMBLT[GOT]=set([ENSEMBLT]) - #~ - ##dGOTENSEMBLT.pop('') - ENSEMBLTGinGO=set.union(*dGOTENSEMBLT.values()) - return dGOTENSEMBLT,ENSEMBLTGinGO - -def rtrnENSEMBLcSAPs(inSAPsfile,columnENSEMBLT,ENSEMBLTGinGO): - """ - returns a set of the ENSEMBLT codes present in the input list and - in the GO file - """ - sENSEMBLTSAPsinGO=set() - for eachl in open(inSAPsfile,'r'): - ENSEMBLT=eachl.splitlines()[0].split('\t')[columnENSEMBLT] - if ENSEMBLT in ENSEMBLTGinGO: - sENSEMBLTSAPsinGO.add(ENSEMBLT) - return sENSEMBLTSAPsinGO - -def rtrnCounts(dGOTENSEMBLT,ENSEMBLTGinGO,sENSEMBLTSAPsinGO): - """ - returns a list of the ENSEMBLT codes present in the input list and - in the GO file. The terms in this list are: 'Go Term','# Genes in - the GO Term','# Genes in the list and in the GO Term','Enrichement - of the GO Term for genes in the input list','Genes in the input list - present in the GO term' - """ - getcontext().prec=2#set 2 decimal places - SAPs_all=len(sENSEMBLTSAPsinGO) - NoSAPs_all=len(ENSEMBLTGinGO)-SAPs_all - #~ - lp=len(dGOTENSEMBLT) - cnt=0 - #~ - ltfreqs=[] - for echGOT in dGOTENSEMBLT: - cnt+=1 - ##print 'Running "%s", %s out of %s'%(echGOT,cnt,lp) - CntGO_All=len(dGOTENSEMBLT[echGOT]) - SAPs_GO=len(dGOTENSEMBLT[echGOT].intersection(sENSEMBLTSAPsinGO)) - NoSAPs_GO=CntGO_All-SAPs_GO - pval=pvalue(SAPs_GO,NoSAPs_GO,SAPs_all,NoSAPs_all) - #~ outl.append('\t'.join([str(x) for x in(str(pval.two_tail),CntGO_All,SAPs_GO,'.'.join(sorted(dGOTENSEMBLT[echGOT].intersection(sENSEMBLTSAPsinGO))),echGOT)])) - ltfreqs.append([(SAPs_GO/Decimal(CntGO_All)),SAPs_GO,Decimal(str(pval.two_tail))*1,echGOT]) - #~ - ltfreqs.sort() - ltfreqs.reverse() - outl=[] - cper,crank=Decimal('2'),0 - #~ - for perc,cnt_go,pval,goTerm in ltfreqs: - if perc +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, +# MA 02110-1301, USA. + +import argparse +import os +import sys +from fisher import pvalue as fisher +from decimal import Decimal,getcontext +from math import lgamma,exp,factorial + +def binProb(SAPs_GO,NoSAPs_GO,SAPs_all,NoSAPs_all,CntGO_All,totalSAPs,pGO): + """ + Returns binomial probability. + """ + def comb(CntGO_All,k): + return factorial(CntGO_All) / Decimal(str(factorial(k)*factorial(CntGO_All-k))) + probLow = 0 + for k in range(0, SAPs_GO+1): + cp=Decimal(str(comb(CntGO_All,k))) + bp=Decimal(str(pGO**k)) + dp=Decimal(str(1.0-pGO))**Decimal(str(CntGO_All-k)) + probLow+=cp*bp*dp + #~ + probHigh = 0 + for k in range(int(SAPs_GO),CntGO_All+1): + cp=Decimal(str(comb(CntGO_All,k))) + bp=Decimal(str(pGO**k)) + dp=Decimal(str(1.0-pGO))**Decimal(str(CntGO_All-k)) + probHigh+=cp*bp*dp + return probLow,probHigh + +def gauss_hypergeom(X, CntGO_All, SAPs_all, totalSAPs): + CntGO_All,SAPs_all,totalSAPs + """ + Returns the probability of drawing X successes of SAPs_all marked items + in CntGO_All draws from a bin of totalSAPs total items + """ + def logchoose(ni, ki): + try: + lgn1 = lgamma(ni+1) + lgk1 = lgamma(ki+1) + lgnk1 = lgamma(ni-ki+1) + except ValueError: + raise ValueError + return lgn1 - (lgnk1 + lgk1) + #~ + r1 = logchoose(SAPs_all, X) + try: + r2 = logchoose(totalSAPs-SAPs_all, CntGO_All-X) + except ValueError: + return 0 + r3 = logchoose(totalSAPs,CntGO_All) + return exp(r1 + r2 - r3) + +def hypergeo_sf(SAPs_GO,NoSAPs_GO,SAPs_all,NoSAPs_all,CntGO_All,totalSAPs,pGO): + """ + Runs Hypergeometric probability test + """ + s = 0 + t=0 + for i in range(SAPs_GO,min(SAPs_all,CntGO_All)+1): + s += max(gauss_hypergeom(i,CntGO_All,SAPs_all,totalSAPs), 0.0) + for i in range(0, SAPs_GO+1): + t += max(gauss_hypergeom(i,CntGO_All,SAPs_all,totalSAPs), 0.0) + return min(max(t,0.0), 1),min(max(s,0.0), 1) + +def fisherexct(SAPs_GO,NoSAPs_GO,SAPs_all,NoSAPs_all,CntGO_All,totalSAPs,pGO): + """ + Runs Fisher's exact test + """ + ftest=fisher(SAPs_GO,NoSAPs_GO,SAPs_all,NoSAPs_all) + probLow,probHigh=ftest.left_tail,ftest.right_tail + return probLow,probHigh + +def rtrnGOcENSEMBLc(inExtnddfile,columnENSEMBLTExtndd,columnGOExtndd): + """ + """ + dGOTENSEMBLT={} + for eachl in open(inExtnddfile,'r'): + if eachl.strip(): + ENSEMBLT=eachl.splitlines()[0].split('\t')[columnENSEMBLTExtndd] + GOTs=set(eachl.splitlines()[0].split('\t')[columnGOExtndd].split('.')) + GOTs=GOTs.difference(set(['','U','N'])) + for GOT in GOTs: + try: + dGOTENSEMBLT[GOT].add(ENSEMBLT) + except: + dGOTENSEMBLT[GOT]=set([ENSEMBLT]) + ENSEMBLTGinGO=set.union(*dGOTENSEMBLT.values()) + return dGOTENSEMBLT,ENSEMBLTGinGO + +def rtrnENSEMBLcSAPs(inSAPsfile,columnENSEMBLT,ENSEMBLTGinGO): + """ + returns a set of the ENSEMBLT codes present in the input list and + in the GO file + """ + sENSEMBLTSAPsinGO=set() + for eachl in open(inSAPsfile,'r'): + ENSEMBLT=eachl.splitlines()[0].split('\t')[columnENSEMBLT] + if ENSEMBLT in ENSEMBLTGinGO: + sENSEMBLTSAPsinGO.add(ENSEMBLT) + return sENSEMBLTSAPsinGO + +def rtrnCounts(dGOTENSEMBLT,ENSEMBLTGinGO,sENSEMBLTSAPsinGO,statsTest): + """ + returns a list of the ENSEMBLT codes present in the input list and + in the GO file. The terms in this list are: 'Go Term','# Genes in + the GO Term','# Genes in the list and in the GO Term','Enrichement + of the GO Term for genes in the input list','Genes in the input list + present in the GO term' + """ + totalSAPs=len(ENSEMBLTGinGO) + SAPs_all=len(sENSEMBLTSAPsinGO) + NoSAPs_all=totalSAPs-SAPs_all + pGO=SAPs_all/float(totalSAPs) + #~ + lp=len(dGOTENSEMBLT) + cnt=0 + #~ + if statsTest=='fisher': + ptest=fisherexct + elif statsTest=='hypergeometric': + ptest=hypergeo_sf + elif statsTest=='binomial': + ptest=binProb + #~ + ltfreqs=[] + for echGOT in dGOTENSEMBLT: + cnt+=1 + CntGO_All=len(dGOTENSEMBLT[echGOT]) + SAPs_GO=len(dGOTENSEMBLT[echGOT].intersection(sENSEMBLTSAPsinGO)) + NoSAPs_GO=CntGO_All-SAPs_GO + probLow,probHigh=ptest(SAPs_GO,NoSAPs_GO,SAPs_all,NoSAPs_all,CntGO_All,totalSAPs,pGO) + ltfreqs.append([(SAPs_GO/Decimal(CntGO_All)),SAPs_GO,probLow,probHigh,echGOT]) + #~ + ltfreqs.sort() + ltfreqs.reverse() + outl=[] + cper,crank=Decimal('2'),0 + #~ + getcontext().prec=2#set 2 decimal places + for perc,cnt_go,pvalLow,pvalHigh,goTerm in ltfreqs: + if perc + : Assess the enrichment/depletion of a gene set for GO terms #set $t_col1_0 = int(str($t_col1)) - 1 #set $t_col2_0 = int(str($t_col2)) - 1 #set $g_col2_0 = int(str($g_col2)) - 1 - rank_terms.py --input "$input1" --columnENSEMBLT $t_col1_0 --inExtnddfile "$input2" --columnENSEMBLTExtndd $t_col2_0 --columnGOExtndd $g_col2_0 --output "$output" + rank_terms.py --input "$input1" --columnENSEMBLT $t_col1_0 --inExtnddfile "$input2" --columnENSEMBLTExtndd $t_col2_0 --columnGOExtndd $g_col2_0 --statsTest "$stat" --output "$output" - + + + + + @@ -41,16 +45,17 @@ **What it does** Given a query set of genes from a larger background dataset, this tool -evaluates the statistical over- or under-representation of Gene Ontology -terms in the query set, using a two-tailed Fisher's exact test. +evaluates the over- or under-representation of Gene Ontology terms in the +query set, using the specified statistical test. The output contains a row for each GO term, with the following columns: 1. count: the number of genes in the query set that are in this GO category 2. representation: the percentage of this category's genes (from the background dataset) that appear in the query set 3. ranking of this term, based on its representation ("1" is highest) -4. Fisher probability of enrichment/depletion of this GO category in the query dataset -5. GO term +4. probability of depletion of this GO category in the query dataset +5. probability of enrichment of this GO category in the query dataset +6. GO term diff -r 91e835060ad2 -r 8997f2ca8c7a reorder.xml --- a/reorder.xml Mon Jun 03 12:29:29 2013 -0400 +++ b/reorder.xml Mon Jul 15 10:47:35 2013 -0400 @@ -2,7 +2,7 @@ individuals - reorder.py "$input" "$output" "$order" + reorder.py '$input' '$output' '$order' diff -r 91e835060ad2 -r 8997f2ca8c7a specify.py --- a/specify.py Mon Jun 03 12:29:29 2013 -0400 +++ b/specify.py Mon Jul 15 10:47:35 2013 -0400 @@ -1,147 +1,69 @@ #!/usr/bin/env python +import gd_util import sys -import base64 - -def parse_args(args): - if len(args) < 3: - usage() - - input_file, output_file = args[1:3] - - individuals = [] - checkboxes = [] - strings = [] - - for arg in args[3:]: - if ':' in arg: - arg_type, arg = arg.split(':', 1) - else: - print >> sys.stderr, "unknown argument:", arg - usage() - - if arg_type == 'individual': - individuals.append(arg) - elif arg_type == 'checkbox': - checkboxes.append(arg) - elif arg_type == 'string': - strings.append(arg) - else: - print >> sys.stderr, "unknown argument:", arg - usage() - - return input_file, output_file, individuals, checkboxes, strings +from Population import Population -def usage(): - print >> sys.stderr, "Usage: %s [ ...] [ ...] [ ...]" % (sys.argv[0]) - sys.exit(1) - -def parse_individuals(individuals): - ind_col2name = {} - ind_name2col = {} - - for individual in individuals: - if ':' in individual: - column, name = individual.split(':', 1) - else: - print >> sys.stderr, "invalid individual specification:", individual - usage() +################################################################################ - try: - column = int(column) - except: - print "individual column is not an integer:", individual - usage() - - if column not in ind_col2name: - ind_col2name[column] = name - else: - if ind_col2name[column] != name: - print "duplicate individual column:", name, column, ind_col2name[column] - usage() - - if name not in ind_name2col: - ind_name2col[name] = [column] - elif column not in ind_name2col[name]: - ind_name2col[name].append(column) - - return ind_col2name, ind_name2col - -def parse_checkboxes(checkboxes, ind_col2name): +def parse_string(str_arg, ind_token2col): columns = [] - for checkbox in checkboxes: - if ':' in checkbox: - column, name = checkbox.split(':', 1) - else: - print >> sys.stderr, "invalid checkbox specification:", checkbox - usage() + string = gd_util.unwrap_string(str_arg) + tokens = find_tokens(string, ind_token2col) - try: - column = int(column) - except: - print "checkbox column is not an integer:", checkbox - usage() - - if column not in ind_col2name: - print "individual not in SNP table:", name - usage() - - if column not in columns: - columns.append(column) + for token in tokens: + col = ind_token2col[token] + if col not in columns: + columns.append(col) return columns -def parse_strings(strings, ind_col2name, ind_name2col): - columns = [] - - for string in strings: - try: - decoded = base64.b64decode(string) - except: - print >> sys.stderr, "invalid base64 string:", string - usage() - - names = find_names(decoded, ind_name2col.keys()) - for name in names: - cols = ind_name2col[name] - if len(cols) == 1: - col = cols[0] - if col not in columns: - columns.append(col) - else: - print >> sys.stderr, "name with multiple columns:", name - usage() - - return columns - -def find_names(string, names): +def find_tokens(string, tokens): rv = [] - for name in names: - if name in string: - if name not in rv: - rv.append(name) + for token in tokens: + if token in string: + if token not in rv: + rv.append(token) return rv +################################################################################ +if len(sys.argv) != 6: + gd_util.die('Usage') +input, output, ind_arg, cb_arg, str_arg = sys.argv[1:] -input_file, output_file, individuals, checkboxes, strings = parse_args(sys.argv) -ind_col2name, ind_name2col = parse_individuals(individuals) -cb_cols = parse_checkboxes(checkboxes, ind_col2name) -str_cols = parse_strings(strings, ind_col2name, ind_name2col) +p_total = Population() +p_total.from_wrapped_dict(ind_arg) + +p_cb = Population() +p_cb.from_wrapped_dict(cb_arg) + +if not p_total.is_superset(p_cb): + gd_util.die('There is a checked individual that does not appear in the SNP table') + +################################################################################ -out_cols = cb_cols -for col in str_cols: - if col not in out_cols: - out_cols.append(col) +ind_col2name = {} +ind_token2col = {} +for col in p_total.column_list(): + individual = p_total.individual_with_column(col) + name = individual.name + ind_col2name[col] = name + first_token = name.split()[0] + if first_token not in ind_token2col: + ind_token2col[first_token] = col + else: + gd_util.die('duplicate first token: {0}'.format(first_token)) -with open(output_file, 'w') as fh: - for col in sorted(out_cols): - print >> fh, '\t'.join([str(x) for x in [col, ind_col2name[col], '']]) +out_cols = p_cb.column_list() +str_cols = parse_string(str_arg, ind_token2col) + +with open(output, 'w') as fh: + for col in sorted(ind_col2name.keys()): + if col in out_cols or col in str_cols: + print >> fh, '\t'.join([str(x) for x in [col, ind_col2name[col], '']]) sys.exit(0) - - - diff -r 91e835060ad2 -r 8997f2ca8c7a specify.xml --- a/specify.xml Mon Jun 03 12:29:29 2013 -0400 +++ b/specify.xml Mon Jul 15 10:47:35 2013 -0400 @@ -2,28 +2,37 @@ : Define a collection of individuals from a gd_snp dataset - specify.py "$input" "$output" - #for $individual, $individual_col in zip($input.dataset.metadata.individual_names, $input.dataset.metadata.individual_columns) - #set $individual_arg = 'individual:%s:%s' % ($individual_col, $individual) - "$individual_arg" - #end for - #if str($individuals).strip() != 'None' - #for $individual in str($individuals).split(',') - #set $individual_idx = $input.dataset.metadata.individual_names.index($individual) - #set $individual_col = str( $input.dataset.metadata.individual_columns[$individual_idx] ) - #set $cb_arg = 'checkbox:%s:%s' % ($individual_col, $individual) - "$cb_arg" - #end for + #import json + #import base64 + #import zlib + #set $ind_names = $input.dataset.metadata.individual_names + #set $ind_colms = $input.dataset.metadata.individual_columns + #set $ind_dict = dict(zip($ind_names, $ind_colms)) + #set $ind_json = json.dumps($ind_dict, separators=(',',':')) + #set $ind_comp = zlib.compress($ind_json, 9) + #set $ind_arg = base64.b64encode($ind_comp) + #set $cb_string = str($individuals).strip() + #if $cb_string != 'None' + #set $cb_dict = dict.fromkeys($cb_string.split('\t')) + #for $cb_name in $cb_dict: + #set $cb_idx = $input.dataset.metadata.individual_names.index($cb_name) + #set $cb_dict[$cb_name] = str($input.dataset.metadata.individual_columns[$cb_idx]) + #end for + #else + #set $cb_dict = dict() #end if - #if str($string).strip() != '' - #set str_arg = 'string:%s' % ( __import__('base64').b64encode( str($string) ) ) - "$str_arg" - #end if + #set $cb_json = json.dumps($cb_dict, separators=(',',':')) + #set $cb_comp = zlib.compress($cb_json, 9) + #set $cb_arg = base64.b64encode($cb_comp) + #set $str_string = str($string).strip() + #set $str_comp = zlib.compress($str_string, 9) + #set $str_arg = base64.b64encode($str_comp) + specify.py '$input' '$output' '$ind_arg' '$cb_arg' '$str_arg' - + @@ -32,7 +41,7 @@ #used to be "Individuals from ${input.hid}" - +