comparison env/lib/python3.7/site-packages/lxml/html/diff.py @ 0:26e78fe6e8c4 draft

"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"

| author | shellac |
|---|---|
| date | Sat, 02 May 2020 07:14:21 -0400 |
| parents | |
| children | |
| old revision | new revision |
|---|---|
| -1:000000000000 | 0:26e78fe6e8c4 |
# cython: language_level=3

from __future__ import absolute_import

import difflib
from lxml import etree
from lxml.html import fragment_fromstring
import re

__all__ = ['html_annotate', 'htmldiff']

try:
    from html import escape as html_escape
except ImportError:
    from cgi import escape as html_escape
try:
    _unicode = unicode
except NameError:
    # Python 3
    _unicode = str
try:
    basestring
except NameError:
    # Python 3
    basestring = str

############################################################
## Annotation
############################################################

def default_markup(text, version):
    return '<span title="%s">%s</span>' % (
        html_escape(_unicode(version), 1), text)

def html_annotate(doclist, markup=default_markup):
    """
    doclist should be ordered from oldest to newest, like::

        >>> version1 = 'Hello World'
        >>> version2 = 'Goodbye World'
        >>> print(html_annotate([(version1, 'version 1'),
        ...                      (version2, 'version 2')]))
        <span title="version 2">Goodbye</span> <span title="version 1">World</span>

    The documents must be *fragments* (str/UTF8 or unicode), not
    complete documents

    The markup argument is a function to markup the spans of words.
    This function is called like markup('Hello', 'version 2'), and
    returns HTML.  The first argument is text and never includes any
    markup.  The default uses a span with a title:

        >>> print(default_markup('Some Text', 'by Joe'))
        <span title="by Joe">Some Text</span>
    """
    # The basic strategy we have is to split the documents up into
    # logical tokens (which are words with attached markup).  We then
    # do diffs of each of the versions to track when a token first
    # appeared in the document; the annotation attached to the token
    # is the version where it first appeared.
    tokenlist = [tokenize_annotated(doc, version)
                 for doc, version in doclist]
    cur_tokens = tokenlist[0]
    for tokens in tokenlist[1:]:
        html_annotate_merge_annotations(cur_tokens, tokens)
        cur_tokens = tokens

    # After we've tracked all the tokens, we can combine spans of text
    # that are adjacent and have the same annotation
    cur_tokens = compress_tokens(cur_tokens)
    # And finally add markup
    result = markup_serialize_tokens(cur_tokens, markup)
    return ''.join(result).strip()

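# --- Editor's illustration (not part of the original module; the
# ``_example_*`` helper name is invented): the ``markup`` argument to
# html_annotate() may be any callable taking (text, version) and returning
# an HTML string.  A minimal sketch of a custom markup function:

def _example_custom_markup():
    def class_markup(text, version):
        # ``text`` arrives already escaped; only the version needs escaping.
        return '<span class="annotated-%s">%s</span>' % (
            html_escape(_unicode(version), 1), text)
    # Returns roughly:
    # <span class="annotated-v2">Goodbye</span> <span class="annotated-v1">World</span>
    return html_annotate([('Hello World', 'v1'), ('Goodbye World', 'v2')],
                         markup=class_markup)
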
def tokenize_annotated(doc, annotation):
    """Tokenize a document and add an annotation attribute to each token
    """
    tokens = tokenize(doc, include_hrefs=False)
    for tok in tokens:
        tok.annotation = annotation
    return tokens

def html_annotate_merge_annotations(tokens_old, tokens_new):
    """Merge the annotations from tokens_old into tokens_new, when the
    tokens in the new document already existed in the old document.
    """
    s = InsensitiveSequenceMatcher(a=tokens_old, b=tokens_new)
    commands = s.get_opcodes()

    for command, i1, i2, j1, j2 in commands:
        if command == 'equal':
            eq_old = tokens_old[i1:i2]
            eq_new = tokens_new[j1:j2]
            copy_annotations(eq_old, eq_new)

def copy_annotations(src, dest):
    """
    Copy annotations from the tokens listed in src to the tokens in dest
    """
    assert len(src) == len(dest)
    for src_tok, dest_tok in zip(src, dest):
        dest_tok.annotation = src_tok.annotation

def compress_tokens(tokens):
    """
    Combine adjacent tokens when there is no HTML between the tokens,
    and they share an annotation
    """
    result = [tokens[0]]
    for tok in tokens[1:]:
        if (not result[-1].post_tags and
                not tok.pre_tags and
                result[-1].annotation == tok.annotation):
            compress_merge_back(result, tok)
        else:
            result.append(tok)
    return result

def compress_merge_back(tokens, tok):
    """ Merge tok into the last element of tokens (modifying the list of
    tokens in-place).  """
    last = tokens[-1]
    if type(last) is not token or type(tok) is not token:
        tokens.append(tok)
    else:
        text = _unicode(last)
        if last.trailing_whitespace:
            text += last.trailing_whitespace
        text += tok
        merged = token(text,
                       pre_tags=last.pre_tags,
                       post_tags=tok.post_tags,
                       trailing_whitespace=tok.trailing_whitespace)
        merged.annotation = last.annotation
        tokens[-1] = merged

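# --- Editor's illustration (not part of the original module; helper name
# invented): two adjacent tokens that share an annotation and have no
# markup between them collapse into a single token:

def _example_compress_tokens():
    t1 = token('Hello', trailing_whitespace=' ')
    t2 = token('World')
    t1.annotation = t2.annotation = 'version 1'
    merged = compress_tokens([t1, t2])
    # merged is a single-element list whose token equals 'Hello World'
    # and keeps the shared 'version 1' annotation.
    return merged
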
def markup_serialize_tokens(tokens, markup_func):
    """
    Serialize the list of tokens into a list of text chunks, calling
    markup_func around text to add annotations.
    """
    for token in tokens:
        for pre in token.pre_tags:
            yield pre
        html = token.html()
        html = markup_func(html, token.annotation)
        if token.trailing_whitespace:
            html += token.trailing_whitespace
        yield html
        for post in token.post_tags:
            yield post


############################################################
## HTML Diffs
############################################################

def htmldiff(old_html, new_html):
    ## FIXME: this should take parsed documents too, and use their body
    ## or other content.
    """ Do a diff of the old and new document.  The documents are HTML
    *fragments* (str/UTF8 or unicode), they are not complete documents
    (i.e., no <html> tag).

    Returns HTML with <ins> and <del> tags added around the
    appropriate text.

    Markup is generally ignored, with the markup from new_html
    preserved, and possibly some markup from old_html (though it is
    considered acceptable to lose some of the old markup).  Only the
    words in the HTML are diffed.  The exception is <img> tags, which
    are treated like words, and the href attribute of <a> tags, which
    is noted inside the tag itself when there are changes.
    """
    old_html_tokens = tokenize(old_html)
    new_html_tokens = tokenize(new_html)
    result = htmldiff_tokens(old_html_tokens, new_html_tokens)
    result = ''.join(result).strip()
    return fixup_ins_del_tags(result)

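# --- Editor's illustration (not part of the original module; helper name
# invented): typical htmldiff() usage on two small fragments.  The exact
# placement of the generated tags can vary, but inserted words come back
# wrapped in <ins> and removed words in <del>:

def _example_htmldiff():
    old = '<p>Here is some text.</p>'
    new = '<p>Here is <b>lots of</b> text!</p>'
    # Roughly: '<p>Here is <ins><b>lots of</b> text!</ins> <del>some text.</del></p>'
    return htmldiff(old, new)
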
def htmldiff_tokens(html1_tokens, html2_tokens):
    """ Does a diff on the tokens themselves, returning a list of text
    chunks (not tokens).
    """
    # There are several passes as we do the differences.  The tokens
    # isolate the portion of the content we care to diff; difflib does
    # all the actual hard work at that point.
    #
    # Then we must create a valid document from pieces of both the old
    # document and the new document.  We generally prefer to take
    # markup from the new document, and only do a best effort attempt
    # to keep markup from the old document; anything that we can't
    # resolve we throw away.  Also we try to put the deletes as close
    # to the location where we think they would have been -- because
    # we are only keeping the markup from the new document, it can be
    # fuzzy where in the new document the old text would have gone.
    # Again we just do a best effort attempt.
    s = InsensitiveSequenceMatcher(a=html1_tokens, b=html2_tokens)
    commands = s.get_opcodes()
    result = []
    for command, i1, i2, j1, j2 in commands:
        if command == 'equal':
            result.extend(expand_tokens(html2_tokens[j1:j2], equal=True))
            continue
        if command == 'insert' or command == 'replace':
            ins_tokens = expand_tokens(html2_tokens[j1:j2])
            merge_insert(ins_tokens, result)
        if command == 'delete' or command == 'replace':
            del_tokens = expand_tokens(html1_tokens[i1:i2])
            merge_delete(del_tokens, result)
    # If deletes were inserted directly as <del> then we'd have an
    # invalid document at this point.  Instead we put in special
    # markers, and when the complete diffed document has been created
    # we try to move the deletes around and resolve any problems.
    result = cleanup_delete(result)

    return result

def expand_tokens(tokens, equal=False):
    """Given a list of tokens, return a generator of the chunks of
    text for the data in the tokens.
    """
    for token in tokens:
        for pre in token.pre_tags:
            yield pre
        if not equal or not token.hide_when_equal:
            if token.trailing_whitespace:
                yield token.html() + token.trailing_whitespace
            else:
                yield token.html()
        for post in token.post_tags:
            yield post

def merge_insert(ins_chunks, doc):
    """ doc is the already-handled document (as a list of text chunks);
    here we add <ins>ins_chunks</ins> to the end of that.  """
    # Though we don't throw away unbalanced_start or unbalanced_end
    # (we assume there is accompanying markup later or earlier in the
    # document), we only put <ins> around the balanced portion.
    unbalanced_start, balanced, unbalanced_end = split_unbalanced(ins_chunks)
    doc.extend(unbalanced_start)
    if doc and not doc[-1].endswith(' '):
        # Fix up the case where the word before the insert didn't end with
        # a space
        doc[-1] += ' '
    doc.append('<ins>')
    if balanced and balanced[-1].endswith(' '):
        # We move space outside of </ins>
        balanced[-1] = balanced[-1][:-1]
    doc.extend(balanced)
    doc.append('</ins> ')
    doc.extend(unbalanced_end)

# These are sentinels to represent the start and end of a <del>
# segment, until we do the cleanup phase to turn them into proper
# markup:
class DEL_START:
    pass
class DEL_END:
    pass

class NoDeletes(Exception):
    """ Raised when the document no longer contains any pending deletes
    (DEL_START/DEL_END) """

def merge_delete(del_chunks, doc):
    """ Adds the text chunks in del_chunks to the document doc (another
    list of text chunks) with markers to show it is a delete.
    cleanup_delete later resolves these markers into <del> tags."""
    doc.append(DEL_START)
    doc.extend(del_chunks)
    doc.append(DEL_END)

def cleanup_delete(chunks):
    """ Cleans up any DEL_START/DEL_END markers in the document, replacing
    them with <del></del>.  To do this while keeping the document
    valid, it may need to drop some tags (either start or end tags).

    It may also move the del into adjacent tags to try to move it to a
    similar location where it was originally located (e.g., moving a
    delete into a preceding <div> tag, if the del looks like (DEL_START,
    'Text</div>', DEL_END)).
    """
    while 1:
        # Find a pending DEL_START/DEL_END, splitting the document
        # into stuff-preceding-DEL_START, stuff-inside, and
        # stuff-following-DEL_END
        try:
            pre_delete, delete, post_delete = split_delete(chunks)
        except NoDeletes:
            # Nothing found, we've cleaned up the entire doc
            break
        # The stuff-inside-DEL_START/END may not be well balanced
        # markup.  First we figure out what unbalanced portions there are:
        unbalanced_start, balanced, unbalanced_end = split_unbalanced(delete)
        # Then we move the span forward and/or backward based on these
        # unbalanced portions:
        locate_unbalanced_start(unbalanced_start, pre_delete, post_delete)
        locate_unbalanced_end(unbalanced_end, pre_delete, post_delete)
        doc = pre_delete
        if doc and not doc[-1].endswith(' '):
            # Fix up case where the word before us didn't have a trailing space
            doc[-1] += ' '
        doc.append('<del>')
        if balanced and balanced[-1].endswith(' '):
            # We move space outside of </del>
            balanced[-1] = balanced[-1][:-1]
        doc.extend(balanced)
        doc.append('</del> ')
        doc.extend(post_delete)
        chunks = doc
    return chunks

def split_unbalanced(chunks):
    """Return (unbalanced_start, balanced, unbalanced_end), where each is
    a list of text and tag chunks.

    unbalanced_start is a list of all the tags that are opened, but
    not closed in this span.  Similarly, unbalanced_end is a list of
    tags that are closed but were not opened.  Extracting these might
    mean some reordering of the chunks."""
    start = []
    end = []
    tag_stack = []
    balanced = []
    for chunk in chunks:
        if not chunk.startswith('<'):
            balanced.append(chunk)
            continue
        endtag = chunk[1] == '/'
        name = chunk.split()[0].strip('<>/')
        if name in empty_tags:
            balanced.append(chunk)
            continue
        if endtag:
            if tag_stack and tag_stack[-1][0] == name:
                balanced.append(chunk)
                name, pos, tag = tag_stack.pop()
                balanced[pos] = tag
            elif tag_stack:
                start.extend([tag for name, pos, tag in tag_stack])
                tag_stack = []
                end.append(chunk)
            else:
                end.append(chunk)
        else:
            tag_stack.append((name, len(balanced), chunk))
            balanced.append(None)
    start.extend(
        [chunk for name, pos, chunk in tag_stack])
    balanced = [chunk for chunk in balanced if chunk is not None]
    return start, balanced, end

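# --- Editor's illustration (not part of the original module; helper name
# invented): a chunk list with a dangling open tag and a stray close tag
# splits into the three parts described above:

def _example_split_unbalanced():
    # -> (['<b>'], ['Hello '], ['</i>'])
    return split_unbalanced(['<b>', 'Hello ', '</i>'])
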
def split_delete(chunks):
    """ Returns (stuff_before_DEL_START, stuff_inside_DEL_START_END,
    stuff_after_DEL_END).  Returns the first case found (there may be
    more DEL_STARTs in stuff_after_DEL_END).  Raises NoDeletes if
    there's no DEL_START found.  """
    try:
        pos = chunks.index(DEL_START)
    except ValueError:
        raise NoDeletes
    pos2 = chunks.index(DEL_END)
    return chunks[:pos], chunks[pos+1:pos2], chunks[pos2+1:]

def locate_unbalanced_start(unbalanced_start, pre_delete, post_delete):
    """ pre_delete and post_delete implicitly point to a place in the
    document (where the two were split).  This moves that point (by
    popping items from one and pushing them onto the other).  It moves
    the point to try to find a place where unbalanced_start applies.

    As an example::

        >>> unbalanced_start = ['<div>']
        >>> doc = ['<p>', 'Text', '</p>', '<div>', 'More Text', '</div>']
        >>> pre, post = doc[:3], doc[3:]
        >>> pre, post
        (['<p>', 'Text', '</p>'], ['<div>', 'More Text', '</div>'])
        >>> locate_unbalanced_start(unbalanced_start, pre, post)
        >>> pre, post
        (['<p>', 'Text', '</p>', '<div>'], ['More Text', '</div>'])

    As you can see, we moved the point so that the dangling <div> that
    we found will be effectively replaced by the div in the original
    document.  If this doesn't work out, we just throw away
    unbalanced_start without doing anything.
    """
    while 1:
        if not unbalanced_start:
            # We have totally succeeded in finding the position
            break
        finding = unbalanced_start[0]
        finding_name = finding.split()[0].strip('<>')
        if not post_delete:
            break
        next = post_delete[0]
        if next is DEL_START or not next.startswith('<'):
            # Reached a word, we can't move the delete text forward
            break
        if next[1] == '/':
            # Reached a closing tag, can we go further?  Maybe not...
            break
        name = next.split()[0].strip('<>')
        if name == 'ins':
            # Can't move into an insert
            break
        assert name != 'del', (
            "Unexpected delete tag: %r" % next)
        if name == finding_name:
            unbalanced_start.pop(0)
            pre_delete.append(post_delete.pop(0))
        else:
            # Found a tag that doesn't match
            break

def locate_unbalanced_end(unbalanced_end, pre_delete, post_delete):
    """ like locate_unbalanced_start, except handling end tags and
    possibly moving the point earlier in the document.  """
    while 1:
        if not unbalanced_end:
            # Success
            break
        finding = unbalanced_end[-1]
        finding_name = finding.split()[0].strip('<>/')
        if not pre_delete:
            break
        next = pre_delete[-1]
        if next is DEL_END or not next.startswith('</'):
            # A word or a start tag
            break
        name = next.split()[0].strip('<>/')
        if name == 'ins' or name == 'del':
            # Can't move into an insert or delete
            break
        if name == finding_name:
            unbalanced_end.pop()
            post_delete.insert(0, pre_delete.pop())
        else:
            # Found a tag that doesn't match
            break

class token(_unicode):
    """ Represents a diffable token, generally a word that is displayed to
    the user.  Opening tags are attached to this token when they are
    adjacent (pre_tags) and closing tags that follow the word
    (post_tags).  Some exceptions occur when there are empty tags
    adjacent to a word, so there may be close tags in pre_tags, or
    open tags in post_tags.

    We also keep track of whether the word was originally followed by
    whitespace, but store it separately so that a word with a trailing
    space still compares equal to the same word without one."""

    # When this is true, the token will be eliminated from the
    # displayed diff if no change has occurred:
    hide_when_equal = False

    def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=""):
        obj = _unicode.__new__(cls, text)

        if pre_tags is not None:
            obj.pre_tags = pre_tags
        else:
            obj.pre_tags = []

        if post_tags is not None:
            obj.post_tags = post_tags
        else:
            obj.post_tags = []

        obj.trailing_whitespace = trailing_whitespace

        return obj

    def __repr__(self):
        return 'token(%s, %r, %r, %r)' % (_unicode.__repr__(self), self.pre_tags,
                                          self.post_tags, self.trailing_whitespace)

    def html(self):
        return _unicode(self)

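# --- Editor's illustration (not part of the original module; helper name
# invented): a token is a str subclass, so it compares like its word; the
# surrounding markup and the trailing space live on attributes instead:

def _example_token():
    tok = token('Hello', pre_tags=['<b>'], post_tags=['</b>'],
                trailing_whitespace=' ')
    assert tok == 'Hello'
    return tok.pre_tags, tok.post_tags, tok.trailing_whitespace
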
class tag_token(token):

    """ Represents a token that is actually a tag.  Currently this is just
    the <img> tag, which takes up visible space just like a word but
    is only represented in a document by a tag.  """

    def __new__(cls, tag, data, html_repr, pre_tags=None,
                post_tags=None, trailing_whitespace=""):
        obj = token.__new__(cls, "%s: %s" % (tag, data),
                            pre_tags=pre_tags,
                            post_tags=post_tags,
                            trailing_whitespace=trailing_whitespace)
        obj.tag = tag
        obj.data = data
        obj.html_repr = html_repr
        return obj

    def __repr__(self):
        return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%r)' % (
            self.tag,
            self.data,
            self.html_repr,
            self.pre_tags,
            self.post_tags,
            self.trailing_whitespace)

    def html(self):
        return self.html_repr

class href_token(token):

    """ Represents the href in an anchor tag.  Unlike other words, we only
    show the href when it changes.  """

    hide_when_equal = True

    def html(self):
        return ' Link: %s' % self

def tokenize(html, include_hrefs=True):
    """
    Parse the given HTML and return token objects (words with attached tags).

    This parses only the content of a page; anything in the head is
    ignored, and the <head> and <body> elements are themselves
    optional.  The content is then parsed by lxml, which ensures the
    validity of the resulting parsed document (though lxml may make
    incorrect guesses when the markup is particularly bad).

    <ins> and <del> tags are also eliminated from the document, as
    that gets confusing.

    If include_hrefs is true, then the href attribute of <a> tags is
    included as a special kind of diffable token."""
    if etree.iselement(html):
        body_el = html
    else:
        body_el = parse_html(html, cleanup=True)
    # Then we split the document into text chunks for each tag, word, and end tag:
    chunks = flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs)
    # Finally re-joining them into token objects:
    return fixup_chunks(chunks)

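# --- Editor's illustration (not part of the original module; helper name
# and URL invented): tokenizing a small fragment yields word tokens plus an
# href_token for the link target (include_hrefs defaults to True); the
# <p>/<a> markup rides along in pre_tags/post_tags:

def _example_tokenize():
    # -> three tokens: 'Hello', 'link', and an href_token for
    #    'http://example.com'
    return tokenize('<p>Hello <a href="http://example.com">link</a></p>')
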
def parse_html(html, cleanup=True):
    """
    Parses an HTML fragment, returning an lxml element.  Note that the HTML will be
    wrapped in a <div> tag that was not in the original document.

    If cleanup is true, make sure there's no <head> or <body>, and get
    rid of any <ins> and <del> tags.
    """
    if cleanup:
        # This removes any extra markup or structure like <head>:
        html = cleanup_html(html)
    return fragment_fromstring(html, create_parent=True)

_body_re = re.compile(r'<body.*?>', re.I|re.S)
_end_body_re = re.compile(r'</body.*?>', re.I|re.S)
_ins_del_re = re.compile(r'</?(ins|del).*?>', re.I|re.S)

def cleanup_html(html):
    """ This 'cleans' the HTML, meaning that any page structure is removed
    (only the contents of <body> are used, if there is any <body>).
    Also <ins> and <del> tags are removed.  """
    match = _body_re.search(html)
    if match:
        html = html[match.end():]
    match = _end_body_re.search(html)
    if match:
        html = html[:match.start()]
    html = _ins_del_re.sub('', html)
    return html

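# --- Editor's illustration (not part of the original module; helper name
# invented): only the body content survives cleanup, and <ins>/<del>
# markup is stripped:

def _example_cleanup_html():
    page = '<html><body><p>Hi <ins>there</ins></p></body></html>'
    # -> '<p>Hi there</p>'
    return cleanup_html(page)
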
end_whitespace_re = re.compile(r'[ \t\n\r]$')

def split_trailing_whitespace(word):
    """
    This function takes a word, such as 'test\n\n' and returns ('test','\n\n')
    """
    stripped_length = len(word.rstrip())
    return word[0:stripped_length], word[stripped_length:]


def fixup_chunks(chunks):
    """
    This function takes a list of chunks and produces a list of tokens.
    """
    tag_accum = []
    cur_word = None
    result = []
    for chunk in chunks:
        if isinstance(chunk, tuple):
            if chunk[0] == 'img':
                src = chunk[1]
                tag, trailing_whitespace = split_trailing_whitespace(chunk[2])
                cur_word = tag_token('img', src, html_repr=tag,
                                     pre_tags=tag_accum,
                                     trailing_whitespace=trailing_whitespace)
                tag_accum = []
                result.append(cur_word)

            elif chunk[0] == 'href':
                href = chunk[1]
                cur_word = href_token(href, pre_tags=tag_accum, trailing_whitespace=" ")
                tag_accum = []
                result.append(cur_word)
            continue

        if is_word(chunk):
            chunk, trailing_whitespace = split_trailing_whitespace(chunk)
            cur_word = token(chunk, pre_tags=tag_accum, trailing_whitespace=trailing_whitespace)
            tag_accum = []
            result.append(cur_word)

        elif is_start_tag(chunk):
            tag_accum.append(chunk)

        elif is_end_tag(chunk):
            if tag_accum:
                tag_accum.append(chunk)
            else:
                assert cur_word, (
                    "Weird state, cur_word=%r, result=%r, chunks=%r of %r"
                    % (cur_word, result, chunk, chunks))
                cur_word.post_tags.append(chunk)
        else:
            assert False

    if not result:
        return [token('', pre_tags=tag_accum)]
    else:
        result[-1].post_tags.extend(tag_accum)

    return result

# All the tags in HTML that don't require end tags:
empty_tags = (
    'param', 'img', 'area', 'br', 'basefont', 'input',
    'base', 'meta', 'link', 'col')

block_level_tags = (
    'address',
    'blockquote',
    'center',
    'dir',
    'div',
    'dl',
    'fieldset',
    'form',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'hr',
    'isindex',
    'menu',
    'noframes',
    'noscript',
    'ol',
    'p',
    'pre',
    'table',
    'ul',
    )

block_level_container_tags = (
    'dd',
    'dt',
    'frameset',
    'li',
    'tbody',
    'td',
    'tfoot',
    'th',
    'thead',
    'tr',
    )

def flatten_el(el, include_hrefs, skip_tag=False):
    """ Takes an lxml element el, and generates all the text chunks for
    that tag.  Each start tag is a chunk, each word is a chunk, and each
    end tag is a chunk.

    If skip_tag is true, then the outermost container tag is
    not returned (just its contents)."""
    if not skip_tag:
        if el.tag == 'img':
            yield ('img', el.get('src'), start_tag(el))
        else:
            yield start_tag(el)
    if el.tag in empty_tags and not el.text and not len(el) and not el.tail:
        return
    start_words = split_words(el.text)
    for word in start_words:
        yield html_escape(word)
    for child in el:
        for item in flatten_el(child, include_hrefs=include_hrefs):
            yield item
    if el.tag == 'a' and el.get('href') and include_hrefs:
        yield ('href', el.get('href'))
    if not skip_tag:
        yield end_tag(el)
        end_words = split_words(el.tail)
        for word in end_words:
            yield html_escape(word)

split_words_re = re.compile(r'\S+(?:\s+|$)', re.U)

def split_words(text):
    """ Splits some text into words.  Includes trailing whitespace
    on each word when appropriate.  """
    if not text or not text.strip():
        return []

    words = split_words_re.findall(text)
    return words

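# --- Editor's illustration (not part of the original module; helper name
# invented): each word keeps whatever whitespace followed it, which later
# becomes the token's trailing_whitespace:

def _example_split_words():
    # -> ['Some   ', 'text\n', 'here']
    return split_words('Some   text\nhere')
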
start_whitespace_re = re.compile(r'^[ \t\n\r]')

def start_tag(el):
    """
    The text representation of the start tag for a tag.
    """
    return '<%s%s>' % (
        el.tag, ''.join([' %s="%s"' % (name, html_escape(value, True))
                         for name, value in el.attrib.items()]))

def end_tag(el):
    """ The text representation of an end tag for a tag.  Includes
    trailing whitespace when appropriate.  """
    if el.tail and start_whitespace_re.search(el.tail):
        extra = ' '
    else:
        extra = ''
    return '</%s>%s' % (el.tag, extra)

def is_word(tok):
    return not tok.startswith('<')

def is_end_tag(tok):
    return tok.startswith('</')

def is_start_tag(tok):
    return tok.startswith('<') and not tok.startswith('</')

def fixup_ins_del_tags(html):
    """ Given an html string, move any <ins> or <del> tags inside of any
    block-level elements, e.g. transform <ins><p>word</p></ins> to
    <p><ins>word</ins></p> """
    doc = parse_html(html, cleanup=False)
    _fixup_ins_del_tags(doc)
    html = serialize_html_fragment(doc, skip_outer=True)
    return html

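# --- Editor's illustration (not part of the original module; helper name
# invented): an <ins> wrapped around a block-level element is pushed
# inside it so the result stays valid HTML:

def _example_fixup_ins_del_tags():
    # -> '<p><ins>some text</ins></p>'
    return fixup_ins_del_tags('<ins><p>some text</p></ins>')
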
def serialize_html_fragment(el, skip_outer=False):
    """ Serialize a single lxml element as HTML.  The serialized form
    includes the element's tail.

    If skip_outer is true, then don't serialize the outermost tag
    """
    assert not isinstance(el, basestring), (
        "You should pass in an element, not a string like %r" % el)
    html = etree.tostring(el, method="html", encoding=_unicode)
    if skip_outer:
        # Get rid of the extra starting tag:
        html = html[html.find('>')+1:]
        # Get rid of the extra end tag:
        html = html[:html.rfind('<')]
        return html.strip()
    else:
        return html

def _fixup_ins_del_tags(doc):
    """fixup_ins_del_tags that works on an lxml document in-place
    """
    for tag in ['ins', 'del']:
        for el in doc.xpath('descendant-or-self::%s' % tag):
            if not _contains_block_level_tag(el):
                continue
            _move_el_inside_block(el, tag=tag)
            el.drop_tag()
            #_merge_element_contents(el)

def _contains_block_level_tag(el):
    """True if the element contains any block-level elements, like <p>, <td>, etc.
    """
    if el.tag in block_level_tags or el.tag in block_level_container_tags:
        return True
    for child in el:
        if _contains_block_level_tag(child):
            return True
    return False

def _move_el_inside_block(el, tag):
    """ helper for _fixup_ins_del_tags; actually takes the <ins> etc tags
    and moves them inside any block-level tags.  """
    for child in el:
        if _contains_block_level_tag(child):
            break
    else:
        # No block-level tags in any child
        children_tag = etree.Element(tag)
        children_tag.text = el.text
        el.text = None
        children_tag.extend(list(el))
        el[:] = [children_tag]
        return
    for child in list(el):
        if _contains_block_level_tag(child):
            _move_el_inside_block(child, tag)
            if child.tail:
                tail_tag = etree.Element(tag)
                tail_tag.text = child.tail
                child.tail = None
                el.insert(el.index(child)+1, tail_tag)
        else:
            child_tag = etree.Element(tag)
            el.replace(child, child_tag)
            child_tag.append(child)
    if el.text:
        text_tag = etree.Element(tag)
        text_tag.text = el.text
        el.text = None
        el.insert(0, text_tag)

def _merge_element_contents(el):
    """
    Removes an element, but merges its contents into its place, e.g.,
    given <p>Hi <i>there!</i></p>, if you remove the <i> element you get
    <p>Hi there!</p>
    """
    parent = el.getparent()
    text = el.text or ''
    if el.tail:
        if not len(el):
            text += el.tail
        else:
            if el[-1].tail:
                el[-1].tail += el.tail
            else:
                el[-1].tail = el.tail
    index = parent.index(el)
    if text:
        if index == 0:
            previous = None
        else:
            previous = parent[index-1]
        if previous is None:
            if parent.text:
                parent.text += text
            else:
                parent.text = text
        else:
            if previous.tail:
                previous.tail += text
            else:
                previous.tail = text
    parent[index:index+1] = el.getchildren()

class InsensitiveSequenceMatcher(difflib.SequenceMatcher):
    """
    Acts like SequenceMatcher, but tries not to find very small equal
    blocks amidst large spans of changes
    """

    threshold = 2

    def get_matching_blocks(self):
        size = min(len(self.a), len(self.b))
        threshold = min(self.threshold, size / 4)
        actual = difflib.SequenceMatcher.get_matching_blocks(self)
        return [item for item in actual
                if item[2] > threshold
                or not item[2]]

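# --- Editor's illustration (not part of the original module; helper name
# invented): with the default threshold, a lone matching word inside an
# otherwise changed region is ignored, so the change reads as one
# contiguous replace instead of being chopped up around tiny equalities:

def _example_insensitive_matcher():
    old = 'one two three four five six'.split()
    new = 'a b three c d e'.split()
    # Plain difflib would report the single-word match on 'three'; here it
    # is filtered out and only the zero-length terminal block remains.
    return InsensitiveSequenceMatcher(a=old, b=new).get_matching_blocks()
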
if __name__ == '__main__':
    from lxml.html import _diffcommand
    _diffcommand.main()