# HG changeset patch
# User iuc
# Date 1434643851 14400
# Node ID 497c6bb3b717e0ee768881bb6bfa1d307eb90a53
# Parent 2c9e5136b416d658892cc181f209ba2d0291acab
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/jbrowse commit 0887009a23d176b21536c9fd8a18c4fecc417d4f

diff -r 2c9e5136b416 -r 497c6bb3b717 blastxml_to_gapped_gff3.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/blastxml_to_gapped_gff3.py	Thu Jun 18 12:10:51 2015 -0400
@@ -0,0 +1,256 @@
+#!/usr/bin/env python
+import re
+import sys
+import copy
+import argparse
+from BCBio import GFF
+import logging
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger(name='blastxml2gff3')
+
+__author__ = "Eric Rasche"
+__version__ = "0.4.0"
+__maintainer__ = "Eric Rasche"
+__email__ = "esr@tamu.edu"
+
+__doc__ = """
+BlastXML files, when transformed to GFF3, do not normally show gaps in the
+blast hits. This tool aims to fill that "gap".
+"""
+
+
+def blastxml2gff3(blastxml, min_gap=3, trim=False, trim_end=False):
+    from Bio.Blast import NCBIXML
+    from Bio.Seq import Seq
+    from Bio.SeqRecord import SeqRecord
+    from Bio.SeqFeature import SeqFeature, FeatureLocation
+
+    blast_records = NCBIXML.parse(blastxml)
+    records = []
+    for record in blast_records:
+        rec = SeqRecord(Seq("ACTG"), id=record.query)
+        for hit in record.alignments:
+            for hsp in hit.hsps:
+                qualifiers = {
+                    "source": "blast",
+                    "score": hsp.expect,
+                    "accession": hit.accession,
+                    "hit_id": hit.hit_id,
+                    "length": hit.length,
+                    "hit_titles": hit.title.split(' >')
+                }
+                desc = hit.title.split(' >')[0]
+                qualifiers['description'] = desc[desc.index(' '):]
+
+                # This required a fair bit of sketching out/math to figure out
+                # the first time.
+                #
+                # The match start location must account for queries and
+                # subjects that start at locations other than 1.
+                parent_match_start = hsp.query_start - hsp.sbjct_start
+                # The end is the start + hit.length because the match itself
+                # may be longer than the parent feature, so we use the supplied
+                # subject/hit length to calculate the real ending of the target
+                # protein.
+                parent_match_end = hsp.query_start + hit.length + hsp.query.count('-')
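+                # Illustrative numbers (not from a real dataset): with
+                # hsp.query_start=10, hsp.sbjct_start=4, hit.length=100 and
+                # two gaps in the query, start = 10 - 4 = 6 and
+                # end = 10 + 100 + 2 = 112.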
+
+                # However, if the user requests that we trim the feature, then
+                # we need to cut the ``match`` start to 0 to match the parent feature.
+                # We'll also need to cut the end to match the query's end. It (maybe)
+                # should be the feature end? But we don't have access to that data, so
+                # we settle for this.
+                if trim:
+                    if parent_match_start < 1:
+                        parent_match_start = 0
+
+                if trim or trim_end:
+                    if parent_match_end > hsp.query_end:
+                        parent_match_end = hsp.query_end + 1
+
+                # The ``protein_match`` feature will hold one or more ``match_part``s
+                top_feature = SeqFeature(
+                    FeatureLocation(parent_match_start, parent_match_end),
+                    type="protein_match", strand=0,
+                    qualifiers=qualifiers
+                )
+
+                # Unlike the parent feature, ``match_part``s have sources.
+                part_qualifiers = {
+                    "source": "blast",
+                }
+                top_feature.sub_features = []
+                for start, end, cigar in generate_parts(hsp.query, hsp.match,
+                                                        hsp.sbjct,
+                                                        ignore_under=min_gap):
+                    part_qualifiers['Gap'] = cigar
+                    part_qualifiers['ID'] = hit.hit_id
+
+                    if trim:
+                        # If trimming, then we start relative to the
+                        # protein_match's start
+                        match_part_start = parent_match_start + start
+                    else:
+                        # Otherwise, we have to account for the subject start's location
+                        match_part_start = parent_match_start + hsp.sbjct_start + start - 1
+
+                    # We used to use hsp.align_length here, but that includes
+                    # gaps in the parent sequence.
+                    #
+                    # Furthermore, align_length gives calculation errors in odd
+                    # places, so we just use (end - start) for simplicity.
+                    match_part_end = match_part_start + (end - start)
+
+                    top_feature.sub_features.append(
+                        SeqFeature(
+                            FeatureLocation(match_part_start, match_part_end),
+                            type="match_part", strand=0,
+                            qualifiers=copy.deepcopy(part_qualifiers))
+                    )
+
+                rec.features.append(top_feature)
+        records.append(rec)
+    return records
+
+
+def __remove_query_gaps(query, match, subject):
+    """remove positions in all three based on gaps in query
+
+    In order to simplify math and calculations, we remove all of the gaps
+    based on gap locations in the query sequence::
+
+        Q:ACTG-ACTGACTG
+        S:ACTGAAC---CTG
+
+    will become::
+
+        Q:ACTGACTGACTG
+        S:ACTGAC---CTG
+
+    which greatly simplifies the process of identifying the correct location
+    for a match_part.
+    """
+    prev = 0
+    fq = ''
+    fm = ''
+    fs = ''
+    for position in re.finditer('-', query):
+        fq += query[prev:position.start()]
+        fm += match[prev:position.start()]
+        fs += subject[prev:position.start()]
+        prev = position.start() + 1
+    fq += query[prev:]
+    fm += match[prev:]
+    fs += subject[prev:]
+
+    return (fq, fm, fs)
+
+
+def generate_parts(query, match, subject, ignore_under=3):
+    region_q = []
+    region_m = []
+    region_s = []
+
+    (query, match, subject) = __remove_query_gaps(query, match, subject)
+
+    region_start = -1
+    region_end = -1
+    mismatch_count = 0
+    for i, (q, m, s) in enumerate(zip(query, match, subject)):
+
+        # A match column is anything other than a space; this covers
+        # identities and positives ('+') alike.
+        if m != ' ':
+            if region_start == -1:
+                region_start = i
+                # It's a new region; we need to reset or it's pre-seeded with
+                # spaces
+                region_q = []
+                region_m = []
+                region_s = []
+            region_end = i
+            mismatch_count = 0
+        else:
+            mismatch_count += 1
+
+        region_q.append(q)
+        region_m.append(m)
+        region_s.append(s)
+
+        if mismatch_count >= ignore_under and region_start != -1 and region_end != -1:
+            region_q = region_q[0:-ignore_under]
+            region_m = region_m[0:-ignore_under]
+            region_s = region_s[0:-ignore_under]
+            yield region_start, region_end + 1, \
+                cigar_from_string(region_q, region_m, region_s, strict_m=True)
+            region_q = []
+            region_m = []
+            region_s = []
+
+            region_start = -1
+            region_end = -1
+            mismatch_count = 0
+
+    yield region_start, region_end + 1, \
+        cigar_from_string(region_q, region_m, region_s, strict_m=True)
+
+
+def _qms_to_matches(query, match, subject, strict_m=True):
+    matchline = []
+
+    for (q, m, s) in zip(query, match, subject):
+        if m != ' ':
+            # Identities and positives ('+') both count as aligned pairs.
+            ret = '='
+        elif q == '-':
+            ret = 'D'
+        elif s == '-':
+            ret = 'I'
+        else:
+            ret = 'X'
+
+        if strict_m:
+            if ret == '=' or ret == 'X':
+                ret = 'M'
+
+        matchline.append(ret)
+    return matchline
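+
+
+# Worked example (illustrative strings, not from a real BLAST run):
+# cigar_from_string('ACTG-A', '| || |', 'AGTGCA') returns 'M4 D1 M1' --
+# four aligned columns, a one-column deletion in the query, then one more
+# aligned column.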
+def _matchline_to_cigar(matchline):
+    cigar_line = []
+    last_char = matchline[0]
+    count = 0
+    for char in matchline:
+        if char == last_char:
+            count += 1
+        else:
+            cigar_line.append("%s%s" % (last_char, count))
+            count = 1
+            last_char = char
+    cigar_line.append("%s%s" % (last_char, count))
+    return ' '.join(cigar_line)
+
+
+def cigar_from_string(query, match, subject, strict_m=True):
+    matchline = _qms_to_matches(query, match, subject, strict_m=strict_m)
+    if len(matchline) > 0:
+        return _matchline_to_cigar(matchline)
+    else:
+        return ""
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Convert Blast XML to gapped GFF3', epilog='')
+    parser.add_argument('blastxml', type=file, help='Blast XML Output')
+    parser.add_argument('--min_gap', type=int, help='Maximum gap size before generating a new match_part', default=3)
+    parser.add_argument('--trim', action='store_true', help='Trim blast hits to be only as long as the parent feature')
+    parser.add_argument('--trim_end', action='store_true', help='Cut blast results off at end of gene')
+    args = parser.parse_args()
+
+    result = blastxml2gff3(**vars(args))
+
+    GFF.write(result, sys.stdout)
diff -r 2c9e5136b416 -r 497c6bb3b717 gff3_rebase.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gff3_rebase.py	Thu Jun 18 12:10:51 2015 -0400
@@ -0,0 +1,108 @@
+#!/usr/bin/env python
+import sys
+import logging
+logging.basicConfig(level=logging.INFO)
+import argparse
+from BCBio import GFF
+from Bio.SeqFeature import FeatureLocation
+log = logging.getLogger(__name__)
+
+__author__ = "Eric Rasche"
+__version__ = "0.4.0"
+__maintainer__ = "Eric Rasche"
+__email__ = "esr@tamu.edu"
+
+
+def __get_features(child, interpro=False):
+    child_features = {}
+    for rec in GFF.parse(child):
+        for feature in rec.features:
+            parent_feature_id = rec.id
+            if interpro:
+                if feature.type == 'polypeptide':
+                    continue
+                if '_' in parent_feature_id:
+                    parent_feature_id = parent_feature_id[parent_feature_id.index('_') + 1:]
+
+            try:
+                child_features[parent_feature_id].append(feature)
+            except KeyError:
+                child_features[parent_feature_id] = [feature]
+    return child_features
+
+
+def __update_feature_location(feature, parent, protein2dna):
+    start = feature.location.start
+    end = feature.location.end
+    if protein2dna:
+        start *= 3
+        end *= 3
+
+    if parent.location.strand >= 0:
+        ns = parent.location.start + start
+        ne = parent.location.start + end
+        st = +1
+    else:
+        ns = parent.location.end - end
+        ne = parent.location.end - start
+        st = -1
+
+    # Don't let start/stops be less than zero. It's technically valid for them
+    # to be (at least in the model I'm working with) but it causes numerous
+    # issues.
+    #
+    # Instead, we'll replace with %3 to try and keep it in the same reading
+    # frame that it should be in.
+    if ns < 0:
+        ns %= 3
+    if ne < 0:
+        ne %= 3
+
+    feature.location = FeatureLocation(ns, ne, strand=st)
+
+    if hasattr(feature, 'sub_features'):
+        for subfeature in feature.sub_features:
+            __update_feature_location(subfeature, parent, protein2dna)
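+
+# Illustrative example (hypothetical coordinates): a child feature at [10, 20)
+# on a forward-strand parent starting at genome position 1000 maps, with
+# --protein2dna, to [1000 + 10*3, 1000 + 20*3) = [1030, 1060).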
+
+
+def rebase(parent, child, interpro=False, protein2dna=False):
+    child_features = __get_features(child, interpro=interpro)
+
+    for rec in GFF.parse(parent):
+        # TODO: replace with recursion in case it's matched against a
+        # non-parent feature. We're cheating a bit here right now...
+        replacement_features = []
+        for feature in rec.features:
+            if feature.id in child_features:
+                new_subfeatures = child_features[feature.id]
+                # TODO: update starts
+                fixed_subfeatures = []
+                for x in new_subfeatures:
+                    # Then update the location of the actual feature
+                    __update_feature_location(x, feature, protein2dna)
+
+                    if interpro:
+                        for y in ('status', 'Target'):
+                            try:
+                                del x.qualifiers[y]
+                            except KeyError:
+                                pass
+
+                    fixed_subfeatures.append(x)
+                replacement_features.extend(fixed_subfeatures)
+        # We do this so we don't include the original set of features that we
+        # were rebasing against in our result.
+        rec.features = replacement_features
+        GFF.write([rec], sys.stdout)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='rebase gff3 features against parent locations', epilog="")
+    parser.add_argument('parent', type=file, help='Parent GFF3 annotations')
+    parser.add_argument('child', help='Child GFF3 annotations to rebase against parent')
+    parser.add_argument('--interpro', action='store_true',
+                        help='Interpro specific modifications')
+    parser.add_argument('--protein2dna', action='store_true',
+                        help='Map protein translated results to original DNA data')
+    args = parser.parse_args()
+    rebase(**vars(args))
diff -r 2c9e5136b416 -r 497c6bb3b717 jbrowse.py
--- a/jbrowse.py	Mon May 04 17:21:38 2015 -0400
+++ b/jbrowse.py	Thu Jun 18 12:10:51 2015 -0400
@@ -1,9 +1,45 @@
 #!/usr/bin/env python
+from string import Template
 import os
-import shutil
 import argparse
 import subprocess
 import hashlib
+import tempfile
+import json
+import yaml
+import logging
+logging.basicConfig()
+log = logging.getLogger(__name__)
+
+# score comes from feature._parent.get('score') or feature.get('score')
+COLOR_FUNCTION_TEMPLATE = Template("""
+function(feature, variableName, glyphObject, track) {
+    var score = ${score};
+    ${opacity}
+    return 'rgba(${red}, ${green}, ${blue}, ' + opacity + ')';
+}
+""")
+
+# Opacity math
+BLAST_OPACITY_MATH = """
+var opacity = 0;
+if(score == 0.0) {
+    opacity = 1;
+} else{
+    opacity = (20 - Math.log10(score)) / 180;
+}
+"""
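+# e.g. an e-value of 1e-100 yields (20 - (-100)) / 180 ~= 0.67; hits at
+# 1e-160 or better saturate at 1.0, and a perfect e-value of 0 is
+# special-cased to full opacity.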
+
+BREWER_COLOUR_IDX = 0
+BREWER_COLOUR_SCHEMES = [
+    (228, 26, 28),
+    (55, 126, 184),
+    (77, 175, 74),
+    (152, 78, 163),
+    (255, 127, 0),
+]
 
 TN_TABLE = {
     'gff3': '--gff',
@@ -12,40 +48,298 @@
 #     'genbank': '--gbk',
 }
 
-
-def process_genome(jbrowse_dir, genome):
-    subprocess.check_output(['perl', 'bin/prepare-refseqs.pl', '--fasta', genome], cwd=jbrowse_dir)
+INSTALLED_TO = os.path.dirname(os.path.realpath(__file__))
 
-def process_annotations(jbrowse_dir, annot_file, annot_label, annot_format,
-                        **kwargs):
-    key = hashlib.md5(annot_file).hexdigest()
+class JbrowseConnector(object):
+
+    def __init__(self, jbrowse, jbrowse_dir, outdir, genome):
+        self.jbrowse = jbrowse
+        self.jbrowse_dir = jbrowse_dir
+        self.outdir = outdir
+        self.genome_path = genome
+        self.brewer_colour_idx = 0
+
+        self.clone_jbrowse(self.jbrowse, self.outdir)
+        self.process_genome()
+
+    def subprocess_check_call(self, command):
+        log.debug('cd %s && %s', self.jbrowse_dir, ' '.join(command))
+        subprocess.check_call(command, cwd=self.jbrowse_dir)
+
+    def _get_colours(self):
+        r, g, b = BREWER_COLOUR_SCHEMES[self.brewer_colour_idx]
+        self.brewer_colour_idx += 1
+        return r, g, b
+
+    def process_genome(self):
+        self.subprocess_check_call(['perl', 'bin/prepare-refseqs.pl',
+                                    '--fasta', self.genome_path])
+
+    def _add_json(self, json_data):
+        if len(json_data.keys()) == 0:
+            return
+
+        tmp = tempfile.NamedTemporaryFile(delete=False)
+        tmp.write(json.dumps(json_data))
+        tmp.close()
+        cmd = ['perl', 'bin/add-track-json.pl', tmp.name,
+               os.path.join('data', 'trackList.json')]
+        self.subprocess_check_call(cmd)
+        os.unlink(tmp.name)
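+
+    # The BlastXML path below is a two-step pipeline: blastxml_to_gapped_gff3.py
+    # converts hits to gapped GFF3 in protein space, then gff3_rebase.py maps
+    # them back onto the parent features' genomic coordinates.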
+    def add_blastxml(self, data, key, **kwargs):
+        gff3_unrebased = tempfile.NamedTemporaryFile(delete=False)
+        cmd = ['python', os.path.join(INSTALLED_TO, 'blastxml_to_gapped_gff3.py'),
+               '--trim_end', '--min_gap', str(kwargs['min_gap']), data]
+        subprocess.check_call(cmd, cwd=self.jbrowse_dir, stdout=gff3_unrebased)
+        gff3_unrebased.close()
+
+        gff3_rebased = tempfile.NamedTemporaryFile(delete=False)
+        cmd = ['python', os.path.join(INSTALLED_TO, 'gff3_rebase.py')]
+        if kwargs['protein']:
+            cmd.append('--protein2dna')
+        cmd.extend([kwargs['parent'], gff3_unrebased.name])
+        subprocess.check_call(cmd, cwd=self.jbrowse_dir, stdout=gff3_rebased)
+        gff3_rebased.close()
+
+        label = hashlib.md5(data).hexdigest()
+
+        red, green, blue = self._get_colours()
+        color_function = COLOR_FUNCTION_TEMPLATE.substitute({
+            'score': "feature._parent.get('score')",
+            'opacity': BLAST_OPACITY_MATH,
+            'red': red,
+            'green': green,
+            'blue': blue,
+        })
+
+        clientConfig = {
+            'label': 'description',
+            'color': color_function.replace('\n', ''),
+            'description': 'Hit_titles',
+        }
+        config = {'glyph': 'JBrowse/View/FeatureGlyph/Segments'}
+        if 'category' in kwargs:
+            config['category'] = kwargs['category']
+
+        cmd = ['perl', 'bin/flatfile-to-json.pl',
+               '--gff', gff3_rebased.name,
+               '--trackLabel', label,
+               '--key', key,
+               '--clientConfig', json.dumps(clientConfig),
+               '--config', json.dumps(config),
+               '--trackType', 'JBrowse/View/Track/CanvasFeatures'
+               ]
+
+        self.subprocess_check_call(cmd)
+        os.unlink(gff3_rebased.name)
+        os.unlink(gff3_unrebased.name)
+
+    def _min_max_gff(self, gff_file):
+        min_val = None
+        max_val = None
+        with open(gff_file, 'r') as handle:
+            for line in handle:
+                try:
+                    # Column 6 of GFF3 is the score
+                    value = float(line.split('\t')[5])
+                    if min_val is None or value < min_val:
+                        min_val = value
+                    if max_val is None or value > max_val:
+                        max_val = value
+                except Exception:
+                    pass
+        return min_val, max_val
+
+    def add_bigwig(self, data, key, **kwargs):
+        label = hashlib.md5(data).hexdigest()
+        dest = os.path.join('data', 'raw', os.path.basename(data))
+        cmd = ['ln', '-s', data, dest]
+        self.subprocess_check_call(cmd)
+
+        track_data = {
+            "label": label,
+            "urlTemplate": os.path.join('..', dest),
+            "bicolor_pivot": "zero",
+            "storeClass": "JBrowse/Store/SeqFeature/BigWig",
+            "type": "JBrowse/View/Track/Wiggle/Density",
+            "key": key,
+        }
+        track_data.update(kwargs)
+
+        if 'min' not in track_data and 'max' not in track_data \
+                and 'autoscale' not in track_data:
+            track_data['autoscale'] = 'local'
+
+        self._add_json(track_data)
 
-    cmd = ['perl', 'bin/flatfile-to-json.pl', TN_TABLE.get(annot_format, 'gff'),
-           annot_file, '--trackLabel', key, '--key', annot_label]
-    subprocess.check_output(cmd, cwd=jbrowse_dir)
 
+    def add_bam(self, data, key, **kwargs):
+        label = hashlib.md5(data).hexdigest()
+        dest = os.path.join('data', 'raw', os.path.basename(data))
+        # Symlink the BAM (and its index below) into data/raw rather than
+        # copying it.
+        cmd = ['ln', '-s', data, dest]
+        self.subprocess_check_call(cmd)
+
+        bai_source = kwargs['bam_index']
+        cmd = ['ln', '-s', bai_source, dest + '.bai']
+        self.subprocess_check_call(cmd)
+
+        track_data = {
+            "urlTemplate": os.path.join('..', dest),
+            "key": key,
+            "label": label,
+            "type": "JBrowse/View/Track/Alignments2",
+            "storeClass": "JBrowse/Store/SeqFeature/BAM",
+        }
+        if 'category' in kwargs:
+            track_data['category'] = kwargs['category']
+        self._add_json(track_data)
+
+        if kwargs.get('auto_snp', False):
+            track_data = {
+                "storeClass": "JBrowse/Store/SeqFeature/BAM",
+                "urlTemplate": os.path.join('..', dest),
+                "type": "JBrowse/View/Track/SNPCoverage",
+                "key": key + " - SNPs/Coverage",
+                "label": label + "_autosnp",
+            }
+            if 'category' in kwargs:
+                track_data['category'] = kwargs['category']
+            self._add_json(track_data)
+
+    def add_vcf(self, data, key, **kwargs):
+        label = hashlib.md5(data).hexdigest()
+        dest = os.path.join('data', 'raw', os.path.basename(data))
+        # Symlink, then bgzip and tabix-index the VCF in place.
+        cmd = ['ln', '-s', data, dest]
+        self.subprocess_check_call(cmd)
+        cmd = ['bgzip', dest]
+        self.subprocess_check_call(cmd)
+        cmd = ['tabix', '-p', 'vcf', dest + '.gz']
+        self.subprocess_check_call(cmd)
+
+        track_data = {
+            "key": key,
+            "label": label,
+            "urlTemplate": os.path.join('..', dest + '.gz'),
+            "type": "JBrowse/View/Track/HTMLVariants",
+            "storeClass": "JBrowse/Store/SeqFeature/VCFTabix",
+        }
+        track_data.update(kwargs)
+        self._add_json(track_data)
+
+    def add_features(self, data, key, format, **kwargs):
+        label = hashlib.md5(data).hexdigest()
+        cmd = ['perl', 'bin/flatfile-to-json.pl',
+               TN_TABLE.get(format, '--gff'), data,
+               '--trackLabel', label,
+               '--key', key]
+
+        config = {}
+        if 'category' in kwargs:
+            config['category'] = kwargs['category']
+
+        if kwargs.get('match', False):
+            clientConfig = {
+                'label': 'description',
+                'description': 'Hit_titles',
+            }
+
+            # Get min/max and build a scoring function, since JBrowse doesn't
+            # do this natively.
+            min_val, max_val = self._min_max_gff(data)
+
+            if min_val is not None and max_val is not None:
+                MIN_MAX_OPACITY_MATH = Template("""
+                var opacity = (score - ${min}) * (1/(${max} - ${min}));
+                """).substitute({
+                    'max': max_val,
+                    'min': min_val,
+                })
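+                # e.g. with min_val=2 and max_val=10, a feature scoring 6 is
+                # drawn at opacity (6 - 2) * (1 / (10 - 2)) = 0.5.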
+
+                red, green, blue = self._get_colours()
+                color_function = COLOR_FUNCTION_TEMPLATE.substitute({
+                    'score': "feature.get('score')",
+                    'opacity': MIN_MAX_OPACITY_MATH,
+                    'red': red,
+                    'green': green,
+                    'blue': blue,
+                })
+
+                clientConfig['color'] = color_function.replace('\n', '')
+
+            config['glyph'] = 'JBrowse/View/FeatureGlyph/Segments'
+
+            cmd += ['--clientConfig', json.dumps(clientConfig),
+                    '--trackType', 'JBrowse/View/Track/CanvasFeatures'
+                    ]
+
+        cmd.extend(['--config', json.dumps(config)])
+
+        self.subprocess_check_call(cmd)
+
+    def process_annotations(self, data, key, format, **kwargs):
+        if format in ('gff', 'gff3', 'bed'):
+            self.add_features(data, key, format, **kwargs)
+        elif format == 'bigwig':
+            self.add_bigwig(data, key, **kwargs)
+        elif format == 'bam':
+            self.add_bam(data, key, **kwargs)
+        elif format == 'blastxml':
+            self.add_blastxml(data, key, **kwargs)
+        elif format == 'vcf':
+            self.add_vcf(data, key, **kwargs)
+
+    def clone_jbrowse(self, jbrowse_dir, destination):
+        # JBrowse releases ship with some broken symlinks; unlike
+        # shutil.copytree, cp simply skips them.
+        cmd = ['mkdir', '-p', destination]
+        subprocess.check_call(cmd)
+        cmd = ['cp', '-r', jbrowse_dir, destination]
+        subprocess.check_call(cmd)
+        cmd = ['mkdir', '-p', os.path.join(destination, 'JBrowse-1.11.6',
+                                           'data', 'raw')]
+        subprocess.check_call(cmd)
+
+        # http://unix.stackexchange.com/a/38691/22785
+        # Remove any remaining broken symlinks. Note that no shell is
+        # involved here, so '{}' must not be quoted.
+        cmd = ['find', destination, '-type', 'l', '-xtype', 'l',
+               '-exec', 'rm', '{}', '+']
+        subprocess.check_call(cmd)
 
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description="", epilog="")
     parser.add_argument('genome', type=file, help='Input genome file')
-
-    parser.add_argument('--gff3', type=file, nargs='*', help='GFF3/BED/GBK File')
-    parser.add_argument('--gff3_format', choices=['gff3', 'gff', 'bed', 'gbk'], nargs='*', help='GFF3/BED/GBK Format')
-    parser.add_argument('--gff3_label', type=str, nargs='*', help='GFF3/BED/GBK Label')
+    parser.add_argument('yaml', type=file, help='Track Configuration')
 
     parser.add_argument('--jbrowse', help='Folder containing a jbrowse release')
     parser.add_argument('--outdir', help='Output directory', default='out')
     args = parser.parse_args()
 
-    jbrowse_dir = os.path.join(args.outdir, 'JBrowse-1.11.6')
-    shutil.copytree(args.jbrowse, jbrowse_dir)
+    jc = JbrowseConnector(
+        jbrowse=args.jbrowse,
+        jbrowse_dir=os.path.join(args.outdir, 'JBrowse-1.11.6'),
+        outdir=args.outdir,
+        genome=os.path.realpath(args.genome.name),
+    )
 
-    process_genome(jbrowse_dir, os.path.realpath(args.genome.name))
+    track_data = yaml.load(args.yaml)
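+    # Each entry in the YAML track list is expected to look roughly like
+    # this (values are illustrative, not from a real configuration):
+    #
+    #   - file: input.gff3
+    #     label: My track
+    #     ext: gff3
+    #     category: Annotation
+    #     options:
+    #       match: true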
+    for track in track_data:
+        path = os.path.realpath(track['file'])
+        extra = track.get('options', {})
+        if '__unused__' in extra:
+            del extra['__unused__']
 
-    for gff3, gff3_format, gff3_label in zip(args.gff3, args.gff3_format, args.gff3_label):
-        gff3_path = os.path.realpath(gff3.name)
-        process_annotations(jbrowse_dir, gff3_path, gff3_label, gff3_format)
+        for possible_partial_path in ('bam_index', 'parent'):
+            if possible_partial_path in extra:
+                extra[possible_partial_path] = os.path.realpath(
+                    extra[possible_partial_path])
+        extra['category'] = track.get('category', 'Default')
+
+        jc.process_annotations(path, track['label'], track['ext'], **extra)
 
     print """
@@ -54,6 +348,9 @@
 window.location=JBrowse-1.11.6/index.html
 
 Go to JBrowse
+Please note that JBrowse functions best on production Galaxy
+instances. The paste server used in development instances has issues
+serving the volumes of data regularly involved in JBrowse