comparison env/lib/python3.7/site-packages/lxml/html/diff.py @ 0:26e78fe6e8c4 draft

"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"

| author | shellac |
|---|---|
| date | Sat, 02 May 2020 07:14:21 -0400 |
| parents | |
| children | |
| old revision | new revision |
|---|---|
| -1:000000000000 | 0:26e78fe6e8c4 |
# cython: language_level=3

from __future__ import absolute_import

import difflib
from lxml import etree
from lxml.html import fragment_fromstring
import re

__all__ = ['html_annotate', 'htmldiff']

try:
    from html import escape as html_escape
except ImportError:
    from cgi import escape as html_escape
try:
    _unicode = unicode
except NameError:
    # Python 3
    _unicode = str
try:
    basestring
except NameError:
    # Python 3
    basestring = str

############################################################
## Annotation
############################################################

def default_markup(text, version):
    return '<span title="%s">%s</span>' % (
        html_escape(_unicode(version), 1), text)

def html_annotate(doclist, markup=default_markup):
    """
    doclist should be ordered from oldest to newest, like::

        >>> version1 = 'Hello World'
        >>> version2 = 'Goodbye World'
        >>> print(html_annotate([(version1, 'version 1'),
        ...                      (version2, 'version 2')]))
        <span title="version 2">Goodbye</span> <span title="version 1">World</span>

    The documents must be *fragments* (str/UTF8 or unicode), not
    complete documents

    The markup argument is a function to markup the spans of words.
    This function is called like markup('Hello', 'version 2'), and
    returns HTML.  The first argument is text and never includes any
    markup.  The default uses a span with a title:

        >>> print(default_markup('Some Text', 'by Joe'))
        <span title="by Joe">Some Text</span>
    """
    # The basic strategy we have is to split the documents up into
    # logical tokens (which are words with attached markup).  We then
    # do diffs of each of the versions to track when a token first
    # appeared in the document; the annotation attached to the token
    # is the version where it first appeared.
    tokenlist = [tokenize_annotated(doc, version)
                 for doc, version in doclist]
    cur_tokens = tokenlist[0]
    for tokens in tokenlist[1:]:
        html_annotate_merge_annotations(cur_tokens, tokens)
        cur_tokens = tokens

    # After we've tracked all the tokens, we can combine spans of text
    # that are adjacent and have the same annotation
    cur_tokens = compress_tokens(cur_tokens)
    # And finally add markup
    result = markup_serialize_tokens(cur_tokens, markup)
    return ''.join(result).strip()

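# --- Editor's illustration (not part of the original module; the
# ``_example_*`` helper name is invented): the ``markup`` argument to
# html_annotate() may be any callable taking (text, version) and returning
# an HTML string.  A minimal sketch of a custom markup function:

def _example_custom_markup():
    def class_markup(text, version):
        # ``text`` arrives already escaped; only the version needs escaping.
        return '<span class="annotated-%s">%s</span>' % (
            html_escape(_unicode(version), 1), text)
    # Returns roughly:
    # <span class="annotated-v2">Goodbye</span> <span class="annotated-v1">World</span>
    return html_annotate([('Hello World', 'v1'), ('Goodbye World', 'v2')],
                         markup=class_markup)
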
def tokenize_annotated(doc, annotation):
    """Tokenize a document and add an annotation attribute to each token
    """
    tokens = tokenize(doc, include_hrefs=False)
    for tok in tokens:
        tok.annotation = annotation
    return tokens

def html_annotate_merge_annotations(tokens_old, tokens_new):
    """Merge the annotations from tokens_old into tokens_new, when the
    tokens in the new document already existed in the old document.
    """
    s = InsensitiveSequenceMatcher(a=tokens_old, b=tokens_new)
    commands = s.get_opcodes()

    for command, i1, i2, j1, j2 in commands:
        if command == 'equal':
            eq_old = tokens_old[i1:i2]
            eq_new = tokens_new[j1:j2]
            copy_annotations(eq_old, eq_new)

def copy_annotations(src, dest):
    """
    Copy annotations from the tokens listed in src to the tokens in dest
    """
    assert len(src) == len(dest)
    for src_tok, dest_tok in zip(src, dest):
        dest_tok.annotation = src_tok.annotation

def compress_tokens(tokens):
    """
    Combine adjacent tokens when there is no HTML between the tokens,
    and they share an annotation
    """
    result = [tokens[0]]
    for tok in tokens[1:]:
        if (not result[-1].post_tags and
                not tok.pre_tags and
                result[-1].annotation == tok.annotation):
            compress_merge_back(result, tok)
        else:
            result.append(tok)
    return result

def compress_merge_back(tokens, tok):
    """ Merge tok into the last element of tokens (modifying the list of
    tokens in-place).  """
    last = tokens[-1]
    if type(last) is not token or type(tok) is not token:
        tokens.append(tok)
    else:
        text = _unicode(last)
        if last.trailing_whitespace:
            text += last.trailing_whitespace
        text += tok
        merged = token(text,
                       pre_tags=last.pre_tags,
                       post_tags=tok.post_tags,
                       trailing_whitespace=tok.trailing_whitespace)
        merged.annotation = last.annotation
        tokens[-1] = merged

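# --- Editor's illustration (not part of the original module; helper name
# invented): two adjacent tokens that share an annotation and have no
# markup between them collapse into a single token:

def _example_compress_tokens():
    t1 = token('Hello', trailing_whitespace=' ')
    t2 = token('World')
    t1.annotation = t2.annotation = 'version 1'
    merged = compress_tokens([t1, t2])
    # merged is a single-element list whose token equals 'Hello World'
    # and keeps the shared 'version 1' annotation.
    return merged
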
def markup_serialize_tokens(tokens, markup_func):
    """
    Serialize the list of tokens into a list of text chunks, calling
    markup_func around text to add annotations.
    """
    for token in tokens:
        for pre in token.pre_tags:
            yield pre
        html = token.html()
        html = markup_func(html, token.annotation)
        if token.trailing_whitespace:
            html += token.trailing_whitespace
        yield html
        for post in token.post_tags:
            yield post


############################################################
## HTML Diffs
############################################################

def htmldiff(old_html, new_html):
    ## FIXME: this should take parsed documents too, and use their body
    ## or other content.
    """ Do a diff of the old and new document.  The documents are HTML
    *fragments* (str/UTF8 or unicode), they are not complete documents
    (i.e., no <html> tag).

    Returns HTML with <ins> and <del> tags added around the
    appropriate text.

    Markup is generally ignored, with the markup from new_html
    preserved, and possibly some markup from old_html (though it is
    considered acceptable to lose some of the old markup).  Only the
    words in the HTML are diffed.  The exception is <img> tags, which
    are treated like words, and the href attribute of <a> tags, which
    is noted inside the tag itself when there are changes.
    """
    old_html_tokens = tokenize(old_html)
    new_html_tokens = tokenize(new_html)
    result = htmldiff_tokens(old_html_tokens, new_html_tokens)
    result = ''.join(result).strip()
    return fixup_ins_del_tags(result)

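# --- Editor's illustration (not part of the original module; helper name
# invented): typical htmldiff() usage on two small fragments.  The exact
# placement of the generated tags can vary, but inserted words come back
# wrapped in <ins> and removed words in <del>:

def _example_htmldiff():
    old = '<p>Here is some text.</p>'
    new = '<p>Here is <b>lots of</b> text!</p>'
    # Roughly: '<p>Here is <ins><b>lots of</b> text!</ins> <del>some text.</del></p>'
    return htmldiff(old, new)
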
def htmldiff_tokens(html1_tokens, html2_tokens):
    """ Does a diff on the tokens themselves, returning a list of text
    chunks (not tokens).
    """
    # There are several passes as we do the differences.  The tokens
    # isolate the portion of the content we care to diff; difflib does
    # all the actual hard work at that point.
    #
    # Then we must create a valid document from pieces of both the old
    # document and the new document.  We generally prefer to take
    # markup from the new document, and only do a best effort attempt
    # to keep markup from the old document; anything that we can't
    # resolve we throw away.  Also we try to put the deletes as close
    # to the location where we think they would have been -- because
    # we are only keeping the markup from the new document, it can be
    # fuzzy where in the new document the old text would have gone.
    # Again we just do a best effort attempt.
    s = InsensitiveSequenceMatcher(a=html1_tokens, b=html2_tokens)
    commands = s.get_opcodes()
    result = []
    for command, i1, i2, j1, j2 in commands:
        if command == 'equal':
            result.extend(expand_tokens(html2_tokens[j1:j2], equal=True))
            continue
        if command == 'insert' or command == 'replace':
            ins_tokens = expand_tokens(html2_tokens[j1:j2])
            merge_insert(ins_tokens, result)
        if command == 'delete' or command == 'replace':
            del_tokens = expand_tokens(html1_tokens[i1:i2])
            merge_delete(del_tokens, result)
    # If deletes were inserted directly as <del> then we'd have an
    # invalid document at this point.  Instead we put in special
    # markers, and when the complete diffed document has been created
    # we try to move the deletes around and resolve any problems.
    result = cleanup_delete(result)

    return result

def expand_tokens(tokens, equal=False):
    """Given a list of tokens, return a generator of the chunks of
    text for the data in the tokens.
    """
    for token in tokens:
        for pre in token.pre_tags:
            yield pre
        if not equal or not token.hide_when_equal:
            if token.trailing_whitespace:
                yield token.html() + token.trailing_whitespace
            else:
                yield token.html()
        for post in token.post_tags:
            yield post

def merge_insert(ins_chunks, doc):
    """ doc is the already-handled document (as a list of text chunks);
    here we add <ins>ins_chunks</ins> to the end of that.  """
    # Though we don't throw away unbalanced_start or unbalanced_end
    # (we assume there is accompanying markup later or earlier in the
    # document), we only put <ins> around the balanced portion.
    unbalanced_start, balanced, unbalanced_end = split_unbalanced(ins_chunks)
    doc.extend(unbalanced_start)
    if doc and not doc[-1].endswith(' '):
        # Fix up the case where the word before the insert didn't end with
        # a space
        doc[-1] += ' '
    doc.append('<ins>')
    if balanced and balanced[-1].endswith(' '):
        # We move space outside of </ins>
        balanced[-1] = balanced[-1][:-1]
    doc.extend(balanced)
    doc.append('</ins> ')
    doc.extend(unbalanced_end)

# These are sentinels to represent the start and end of a <del>
# segment, until we do the cleanup phase to turn them into proper
# markup:
class DEL_START:
    pass
class DEL_END:
    pass

class NoDeletes(Exception):
    """ Raised when the document no longer contains any pending deletes
    (DEL_START/DEL_END) """

def merge_delete(del_chunks, doc):
    """ Adds the text chunks in del_chunks to the document doc (another
    list of text chunks) with markers to show it is a delete.
    cleanup_delete later resolves these markers into <del> tags."""
    doc.append(DEL_START)
    doc.extend(del_chunks)
    doc.append(DEL_END)

def cleanup_delete(chunks):
    """ Cleans up any DEL_START/DEL_END markers in the document, replacing
    them with <del></del>.  To do this while keeping the document
    valid, it may need to drop some tags (either start or end tags).

    It may also move the del into adjacent tags to try to move it to a
    similar location where it was originally located (e.g., moving a
    delete into a preceding <div> tag, if the del looks like (DEL_START,
    'Text</div>', DEL_END)).
    """
    while 1:
        # Find a pending DEL_START/DEL_END, splitting the document
        # into stuff-preceding-DEL_START, stuff-inside, and
        # stuff-following-DEL_END
        try:
            pre_delete, delete, post_delete = split_delete(chunks)
        except NoDeletes:
            # Nothing found, we've cleaned up the entire doc
            break
        # The stuff-inside-DEL_START/END may not be well balanced
        # markup.  First we figure out what unbalanced portions there are:
        unbalanced_start, balanced, unbalanced_end = split_unbalanced(delete)
        # Then we move the span forward and/or backward based on these
        # unbalanced portions:
        locate_unbalanced_start(unbalanced_start, pre_delete, post_delete)
        locate_unbalanced_end(unbalanced_end, pre_delete, post_delete)
        doc = pre_delete
        if doc and not doc[-1].endswith(' '):
            # Fix up case where the word before us didn't have a trailing space
            doc[-1] += ' '
        doc.append('<del>')
        if balanced and balanced[-1].endswith(' '):
            # We move space outside of </del>
            balanced[-1] = balanced[-1][:-1]
        doc.extend(balanced)
        doc.append('</del> ')
        doc.extend(post_delete)
        chunks = doc
    return chunks

def split_unbalanced(chunks):
    """Return (unbalanced_start, balanced, unbalanced_end), where each is
    a list of text and tag chunks.

    unbalanced_start is a list of all the tags that are opened, but
    not closed in this span.  Similarly, unbalanced_end is a list of
    tags that are closed but were not opened.  Extracting these might
    mean some reordering of the chunks."""
    start = []
    end = []
    tag_stack = []
    balanced = []
    for chunk in chunks:
        if not chunk.startswith('<'):
            balanced.append(chunk)
            continue
        endtag = chunk[1] == '/'
        name = chunk.split()[0].strip('<>/')
        if name in empty_tags:
            balanced.append(chunk)
            continue
        if endtag:
            if tag_stack and tag_stack[-1][0] == name:
                balanced.append(chunk)
                name, pos, tag = tag_stack.pop()
                balanced[pos] = tag
            elif tag_stack:
                start.extend([tag for name, pos, tag in tag_stack])
                tag_stack = []
                end.append(chunk)
            else:
                end.append(chunk)
        else:
            tag_stack.append((name, len(balanced), chunk))
            balanced.append(None)
    start.extend(
        [chunk for name, pos, chunk in tag_stack])
    balanced = [chunk for chunk in balanced if chunk is not None]
    return start, balanced, end

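# --- Editor's illustration (not part of the original module; helper name
# invented): a chunk list with a dangling open tag and a stray close tag
# splits into the three parts described above:

def _example_split_unbalanced():
    # -> (['<b>'], ['Hello '], ['</i>'])
    return split_unbalanced(['<b>', 'Hello ', '</i>'])
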
def split_delete(chunks):
    """ Returns (stuff_before_DEL_START, stuff_inside_DEL_START_END,
    stuff_after_DEL_END).  Returns the first case found (there may be
    more DEL_STARTs in stuff_after_DEL_END).  Raises NoDeletes if
    there's no DEL_START found.  """
    try:
        pos = chunks.index(DEL_START)
    except ValueError:
        raise NoDeletes
    pos2 = chunks.index(DEL_END)
    return chunks[:pos], chunks[pos+1:pos2], chunks[pos2+1:]

def locate_unbalanced_start(unbalanced_start, pre_delete, post_delete):
    """ pre_delete and post_delete implicitly point to a place in the
    document (where the two were split).  This moves that point (by
    popping items from one and pushing them onto the other).  It moves
    the point to try to find a place where unbalanced_start applies.

    As an example::

        >>> unbalanced_start = ['<div>']
        >>> doc = ['<p>', 'Text', '</p>', '<div>', 'More Text', '</div>']
        >>> pre, post = doc[:3], doc[3:]
        >>> pre, post
        (['<p>', 'Text', '</p>'], ['<div>', 'More Text', '</div>'])
        >>> locate_unbalanced_start(unbalanced_start, pre, post)
        >>> pre, post
        (['<p>', 'Text', '</p>', '<div>'], ['More Text', '</div>'])

    As you can see, we moved the point so that the dangling <div> that
    we found will be effectively replaced by the div in the original
    document.  If this doesn't work out, we just throw away
    unbalanced_start without doing anything.
    """
    while 1:
        if not unbalanced_start:
            # We have totally succeeded in finding the position
            break
        finding = unbalanced_start[0]
        finding_name = finding.split()[0].strip('<>')
        if not post_delete:
            break
        next = post_delete[0]
        if next is DEL_START or not next.startswith('<'):
            # Reached a word, we can't move the delete text forward
            break
        if next[1] == '/':
            # Reached a closing tag, can we go further?  Maybe not...
            break
        name = next.split()[0].strip('<>')
        if name == 'ins':
            # Can't move into an insert
            break
        assert name != 'del', (
            "Unexpected delete tag: %r" % next)
        if name == finding_name:
            unbalanced_start.pop(0)
            pre_delete.append(post_delete.pop(0))
        else:
            # Found a tag that doesn't match
            break

def locate_unbalanced_end(unbalanced_end, pre_delete, post_delete):
    """ like locate_unbalanced_start, except handling end tags and
    possibly moving the point earlier in the document.  """
    while 1:
        if not unbalanced_end:
            # Success
            break
        finding = unbalanced_end[-1]
        finding_name = finding.split()[0].strip('<>/')
        if not pre_delete:
            break
        next = pre_delete[-1]
        if next is DEL_END or not next.startswith('</'):
            # A word or a start tag
            break
        name = next.split()[0].strip('<>/')
        if name == 'ins' or name == 'del':
            # Can't move into an insert or delete
            break
        if name == finding_name:
            unbalanced_end.pop()
            post_delete.insert(0, pre_delete.pop())
        else:
            # Found a tag that doesn't match
            break

class token(_unicode):
    """ Represents a diffable token, generally a word that is displayed to
    the user.  Opening tags are attached to this token when they are
    adjacent (pre_tags) and closing tags that follow the word
    (post_tags).  Some exceptions occur when there are empty tags
    adjacent to a word, so there may be close tags in pre_tags, or
    open tags in post_tags.

    We also keep track of whether the word was originally followed by
    whitespace, but store it separately so that a word with a trailing
    space still compares equal to the same word without one."""

    # When this is true, the token will be eliminated from the
    # displayed diff if no change has occurred:
    hide_when_equal = False

    def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=""):
        obj = _unicode.__new__(cls, text)

        if pre_tags is not None:
            obj.pre_tags = pre_tags
        else:
            obj.pre_tags = []

        if post_tags is not None:
            obj.post_tags = post_tags
        else:
            obj.post_tags = []

        obj.trailing_whitespace = trailing_whitespace

        return obj

    def __repr__(self):
        return 'token(%s, %r, %r, %r)' % (_unicode.__repr__(self), self.pre_tags,
                                          self.post_tags, self.trailing_whitespace)

    def html(self):
        return _unicode(self)

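# --- Editor's illustration (not part of the original module; helper name
# invented): a token is a str subclass, so it compares like its word; the
# surrounding markup and the trailing space live on attributes instead:

def _example_token():
    tok = token('Hello', pre_tags=['<b>'], post_tags=['</b>'],
                trailing_whitespace=' ')
    assert tok == 'Hello'
    return tok.pre_tags, tok.post_tags, tok.trailing_whitespace
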
class tag_token(token):

    """ Represents a token that is actually a tag.  Currently this is just
    the <img> tag, which takes up visible space just like a word but
    is only represented in a document by a tag.  """

    def __new__(cls, tag, data, html_repr, pre_tags=None,
                post_tags=None, trailing_whitespace=""):
        obj = token.__new__(cls, "%s: %s" % (tag, data),
                            pre_tags=pre_tags,
                            post_tags=post_tags,
                            trailing_whitespace=trailing_whitespace)
        obj.tag = tag
        obj.data = data
        obj.html_repr = html_repr
        return obj

    def __repr__(self):
        return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%r)' % (
            self.tag,
            self.data,
            self.html_repr,
            self.pre_tags,
            self.post_tags,
            self.trailing_whitespace)

    def html(self):
        return self.html_repr

class href_token(token):

    """ Represents the href in an anchor tag.  Unlike other words, we only
    show the href when it changes.  """

    hide_when_equal = True

    def html(self):
        return ' Link: %s' % self

def tokenize(html, include_hrefs=True):
    """
    Parse the given HTML and return token objects (words with attached tags).

    This parses only the content of a page; anything in the head is
    ignored, and the <head> and <body> elements are themselves
    optional.  The content is then parsed by lxml, which ensures the
    validity of the resulting parsed document (though lxml may make
    incorrect guesses when the markup is particularly bad).

    <ins> and <del> tags are also eliminated from the document, as
    that gets confusing.

    If include_hrefs is true, then the href attribute of <a> tags is
    included as a special kind of diffable token."""
    if etree.iselement(html):
        body_el = html
    else:
        body_el = parse_html(html, cleanup=True)
    # Then we split the document into text chunks for each tag, word, and end tag:
    chunks = flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs)
    # Finally re-joining them into token objects:
    return fixup_chunks(chunks)

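# --- Editor's illustration (not part of the original module; helper name
# and URL invented): tokenizing a small fragment yields word tokens plus an
# href_token for the link target (include_hrefs defaults to True); the
# <p>/<a> markup rides along in pre_tags/post_tags:

def _example_tokenize():
    # -> three tokens: 'Hello', 'link', and an href_token for
    #    'http://example.com'
    return tokenize('<p>Hello <a href="http://example.com">link</a></p>')
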
def parse_html(html, cleanup=True):
    """
    Parses an HTML fragment, returning an lxml element.  Note that the HTML will be
    wrapped in a <div> tag that was not in the original document.

    If cleanup is true, make sure there's no <head> or <body>, and get
    rid of any <ins> and <del> tags.
    """
    if cleanup:
        # This removes any extra markup or structure like <head>:
        html = cleanup_html(html)
    return fragment_fromstring(html, create_parent=True)

_body_re = re.compile(r'<body.*?>', re.I|re.S)
_end_body_re = re.compile(r'</body.*?>', re.I|re.S)
_ins_del_re = re.compile(r'</?(ins|del).*?>', re.I|re.S)

def cleanup_html(html):
    """ This 'cleans' the HTML, meaning that any page structure is removed
    (only the contents of <body> are used, if there is any <body>).
    Also <ins> and <del> tags are removed.  """
    match = _body_re.search(html)
    if match:
        html = html[match.end():]
    match = _end_body_re.search(html)
    if match:
        html = html[:match.start()]
    html = _ins_del_re.sub('', html)
    return html

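# --- Editor's illustration (not part of the original module; helper name
# invented): only the body content survives cleanup, and <ins>/<del>
# markup is stripped:

def _example_cleanup_html():
    page = '<html><body><p>Hi <ins>there</ins></p></body></html>'
    # -> '<p>Hi there</p>'
    return cleanup_html(page)
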
end_whitespace_re = re.compile(r'[ \t\n\r]$')

def split_trailing_whitespace(word):
    """
    This function takes a word, such as 'test\n\n' and returns ('test','\n\n')
    """
    stripped_length = len(word.rstrip())
    return word[0:stripped_length], word[stripped_length:]


def fixup_chunks(chunks):
    """
    This function takes a list of chunks and produces a list of tokens.
    """
    tag_accum = []
    cur_word = None
    result = []
    for chunk in chunks:
        if isinstance(chunk, tuple):
            if chunk[0] == 'img':
                src = chunk[1]
                tag, trailing_whitespace = split_trailing_whitespace(chunk[2])
                cur_word = tag_token('img', src, html_repr=tag,
                                     pre_tags=tag_accum,
                                     trailing_whitespace=trailing_whitespace)
                tag_accum = []
                result.append(cur_word)

            elif chunk[0] == 'href':
                href = chunk[1]
                cur_word = href_token(href, pre_tags=tag_accum, trailing_whitespace=" ")
                tag_accum = []
                result.append(cur_word)
            continue

        if is_word(chunk):
            chunk, trailing_whitespace = split_trailing_whitespace(chunk)
            cur_word = token(chunk, pre_tags=tag_accum, trailing_whitespace=trailing_whitespace)
            tag_accum = []
            result.append(cur_word)

        elif is_start_tag(chunk):
            tag_accum.append(chunk)

        elif is_end_tag(chunk):
            if tag_accum:
                tag_accum.append(chunk)
            else:
                assert cur_word, (
                    "Weird state, cur_word=%r, result=%r, chunks=%r of %r"
                    % (cur_word, result, chunk, chunks))
                cur_word.post_tags.append(chunk)
        else:
            assert False

    if not result:
        return [token('', pre_tags=tag_accum)]
    else:
        result[-1].post_tags.extend(tag_accum)

    return result

# All the tags in HTML that don't require end tags:
empty_tags = (
    'param', 'img', 'area', 'br', 'basefont', 'input',
    'base', 'meta', 'link', 'col')

block_level_tags = (
    'address',
    'blockquote',
    'center',
    'dir',
    'div',
    'dl',
    'fieldset',
    'form',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'hr',
    'isindex',
    'menu',
    'noframes',
    'noscript',
    'ol',
    'p',
    'pre',
    'table',
    'ul',
    )

block_level_container_tags = (
    'dd',
    'dt',
    'frameset',
    'li',
    'tbody',
    'td',
    'tfoot',
    'th',
    'thead',
    'tr',
    )

def flatten_el(el, include_hrefs, skip_tag=False):
    """ Takes an lxml element el, and generates all the text chunks for
    that tag.  Each start tag is a chunk, each word is a chunk, and each
    end tag is a chunk.

    If skip_tag is true, then the outermost container tag is
    not returned (just its contents)."""
    if not skip_tag:
        if el.tag == 'img':
            yield ('img', el.get('src'), start_tag(el))
        else:
            yield start_tag(el)
    if el.tag in empty_tags and not el.text and not len(el) and not el.tail:
        return
    start_words = split_words(el.text)
    for word in start_words:
        yield html_escape(word)
    for child in el:
        for item in flatten_el(child, include_hrefs=include_hrefs):
            yield item
    if el.tag == 'a' and el.get('href') and include_hrefs:
        yield ('href', el.get('href'))
    if not skip_tag:
        yield end_tag(el)
        end_words = split_words(el.tail)
        for word in end_words:
            yield html_escape(word)

split_words_re = re.compile(r'\S+(?:\s+|$)', re.U)

def split_words(text):
    """ Splits some text into words.  Includes trailing whitespace
    on each word when appropriate.  """
    if not text or not text.strip():
        return []

    words = split_words_re.findall(text)
    return words

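# --- Editor's illustration (not part of the original module; helper name
# invented): each word keeps whatever whitespace followed it, which later
# becomes the token's trailing_whitespace:

def _example_split_words():
    # -> ['Some   ', 'text\n', 'here']
    return split_words('Some   text\nhere')
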
start_whitespace_re = re.compile(r'^[ \t\n\r]')

def start_tag(el):
    """
    The text representation of the start tag for a tag.
    """
    return '<%s%s>' % (
        el.tag, ''.join([' %s="%s"' % (name, html_escape(value, True))
                         for name, value in el.attrib.items()]))

def end_tag(el):
    """ The text representation of an end tag for a tag.  Includes
    trailing whitespace when appropriate.  """
    if el.tail and start_whitespace_re.search(el.tail):
        extra = ' '
    else:
        extra = ''
    return '</%s>%s' % (el.tag, extra)

def is_word(tok):
    return not tok.startswith('<')

def is_end_tag(tok):
    return tok.startswith('</')

def is_start_tag(tok):
    return tok.startswith('<') and not tok.startswith('</')

def fixup_ins_del_tags(html):
    """ Given an html string, move any <ins> or <del> tags inside of any
    block-level elements, e.g. transform <ins><p>word</p></ins> to
    <p><ins>word</ins></p> """
    doc = parse_html(html, cleanup=False)
    _fixup_ins_del_tags(doc)
    html = serialize_html_fragment(doc, skip_outer=True)
    return html

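# --- Editor's illustration (not part of the original module; helper name
# invented): an <ins> wrapped around a block-level element is pushed
# inside it so the result stays valid HTML:

def _example_fixup_ins_del_tags():
    # -> '<p><ins>some text</ins></p>'
    return fixup_ins_del_tags('<ins><p>some text</p></ins>')
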
def serialize_html_fragment(el, skip_outer=False):
    """ Serialize a single lxml element as HTML.  The serialized form
    includes the element's tail.

    If skip_outer is true, then don't serialize the outermost tag
    """
    assert not isinstance(el, basestring), (
        "You should pass in an element, not a string like %r" % el)
    html = etree.tostring(el, method="html", encoding=_unicode)
    if skip_outer:
        # Get rid of the extra starting tag:
        html = html[html.find('>')+1:]
        # Get rid of the extra end tag:
        html = html[:html.rfind('<')]
        return html.strip()
    else:
        return html

def _fixup_ins_del_tags(doc):
    """fixup_ins_del_tags that works on an lxml document in-place
    """
    for tag in ['ins', 'del']:
        for el in doc.xpath('descendant-or-self::%s' % tag):
            if not _contains_block_level_tag(el):
                continue
            _move_el_inside_block(el, tag=tag)
            el.drop_tag()
            #_merge_element_contents(el)

def _contains_block_level_tag(el):
    """True if the element contains any block-level elements, like <p>, <td>, etc.
    """
    if el.tag in block_level_tags or el.tag in block_level_container_tags:
        return True
    for child in el:
        if _contains_block_level_tag(child):
            return True
    return False

def _move_el_inside_block(el, tag):
    """ helper for _fixup_ins_del_tags; actually takes the <ins> etc tags
    and moves them inside any block-level tags.  """
    for child in el:
        if _contains_block_level_tag(child):
            break
    else:
        # No block-level tags in any child
        children_tag = etree.Element(tag)
        children_tag.text = el.text
        el.text = None
        children_tag.extend(list(el))
        el[:] = [children_tag]
        return
    for child in list(el):
        if _contains_block_level_tag(child):
            _move_el_inside_block(child, tag)
            if child.tail:
                tail_tag = etree.Element(tag)
                tail_tag.text = child.tail
                child.tail = None
                el.insert(el.index(child)+1, tail_tag)
        else:
            child_tag = etree.Element(tag)
            el.replace(child, child_tag)
            child_tag.append(child)
    if el.text:
        text_tag = etree.Element(tag)
        text_tag.text = el.text
        el.text = None
        el.insert(0, text_tag)

def _merge_element_contents(el):
    """
    Removes an element, but merges its contents into its place, e.g.,
    given <p>Hi <i>there!</i></p>, if you remove the <i> element you get
    <p>Hi there!</p>
    """
    parent = el.getparent()
    text = el.text or ''
    if el.tail:
        if not len(el):
            text += el.tail
        else:
            if el[-1].tail:
                el[-1].tail += el.tail
            else:
                el[-1].tail = el.tail
    index = parent.index(el)
    if text:
        if index == 0:
            previous = None
        else:
            previous = parent[index-1]
        if previous is None:
            if parent.text:
                parent.text += text
            else:
                parent.text = text
        else:
            if previous.tail:
                previous.tail += text
            else:
                previous.tail = text
    parent[index:index+1] = el.getchildren()

class InsensitiveSequenceMatcher(difflib.SequenceMatcher):
    """
    Acts like SequenceMatcher, but tries not to find very small equal
    blocks amidst large spans of changes
    """

    threshold = 2

    def get_matching_blocks(self):
        size = min(len(self.a), len(self.b))
        threshold = min(self.threshold, size / 4)
        actual = difflib.SequenceMatcher.get_matching_blocks(self)
        return [item for item in actual
                if item[2] > threshold
                or not item[2]]

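# --- Editor's illustration (not part of the original module; helper name
# invented): with the default threshold, a lone matching word inside an
# otherwise changed region is ignored, so the change reads as one
# contiguous replace instead of being chopped up around tiny equalities:

def _example_insensitive_matcher():
    old = 'one two three four five six'.split()
    new = 'a b three c d e'.split()
    # Plain difflib would report the single-word match on 'three'; here it
    # is filtered out and only the zero-length terminal block remains.
    return InsensitiveSequenceMatcher(a=old, b=new).get_matching_blocks()
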
if __name__ == '__main__':
    from lxml.html import _diffcommand
    _diffcommand.main()