comparison env/lib/python3.9/site-packages/lxml/html/diff.py @ 0:4f3585e2f14b draft default tip

"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author shellac
date Mon, 22 Mar 2021 18:12:50 +0000
1 # cython: language_level=3
2
3 from __future__ import absolute_import
4
5 import difflib
6 from lxml import etree
7 from lxml.html import fragment_fromstring
8 import re
9
10 __all__ = ['html_annotate', 'htmldiff']
11
12 try:
13 from html import escape as html_escape
14 except ImportError:
15 from cgi import escape as html_escape
16 try:
17 _unicode = unicode
18 except NameError:
19 # Python 3
20 _unicode = str
21 try:
22 basestring
23 except NameError:
24 # Python 3
25 basestring = str
26
27 ############################################################
28 ## Annotation
29 ############################################################
30
31 def default_markup(text, version):
32 return '<span title="%s">%s</span>' % (
33 html_escape(_unicode(version), 1), text)
34
35 def html_annotate(doclist, markup=default_markup):
36 """
37 doclist should be ordered from oldest to newest, like::
38
39 >>> version1 = 'Hello World'
40 >>> version2 = 'Goodbye World'
41 >>> print(html_annotate([(version1, 'version 1'),
42 ... (version2, 'version 2')]))
43 <span title="version 2">Goodbye</span> <span title="version 1">World</span>
44
45 The documents must be *fragments* (str/UTF8 or unicode), not
46 complete documents
47
48 The markup argument is a function to markup the spans of words.
49 This function is called like markup('Hello', 'version 2'), and
50 returns HTML. The first argument is text and never includes any
51 markup. The default uses a span with a title:
52
53 >>> print(default_markup('Some Text', 'by Joe'))
54 <span title="by Joe">Some Text</span>
55 """
56 # The basic strategy we have is to split the documents up into
57 # logical tokens (which are words with attached markup). We then
58 # do diffs of each of the versions to track when a token first
59 # appeared in the document; the annotation attached to the token
60 # is the version where it first appeared.
61 tokenlist = [tokenize_annotated(doc, version)
62 for doc, version in doclist]
63 cur_tokens = tokenlist[0]
64 for tokens in tokenlist[1:]:
65 html_annotate_merge_annotations(cur_tokens, tokens)
66 cur_tokens = tokens
67
68 # After we've tracked all the tokens, we can combine spans of text
69 # that are adjacent and have the same annotation
70 cur_tokens = compress_tokens(cur_tokens)
71 # And finally add markup
72 result = markup_serialize_tokens(cur_tokens, markup)
73 return ''.join(result).strip()
74
75 def tokenize_annotated(doc, annotation):
76 """Tokenize a document and add an annotation attribute to each token
77 """
78 tokens = tokenize(doc, include_hrefs=False)
79 for tok in tokens:
80 tok.annotation = annotation
81 return tokens
82
83 def html_annotate_merge_annotations(tokens_old, tokens_new):
84 """Merge the annotations from tokens_old into tokens_new, when the
85 tokens in the new document already existed in the old document.
86 """
87 s = InsensitiveSequenceMatcher(a=tokens_old, b=tokens_new)
88 commands = s.get_opcodes()
89
90 for command, i1, i2, j1, j2 in commands:
91 if command == 'equal':
92 eq_old = tokens_old[i1:i2]
93 eq_new = tokens_new[j1:j2]
94 copy_annotations(eq_old, eq_new)
95
96 def copy_annotations(src, dest):
97 """
98 Copy annotations from the tokens listed in src to the tokens in dest
99 """
100 assert len(src) == len(dest)
101 for src_tok, dest_tok in zip(src, dest):
102 dest_tok.annotation = src_tok.annotation
103
104 def compress_tokens(tokens):
105 """
106 Combine adjacent tokens when there is no HTML between the tokens,
107 and they share an annotation
108 """
109 result = [tokens[0]]
110 for tok in tokens[1:]:
111 if (not result[-1].post_tags and
112 not tok.pre_tags and
113 result[-1].annotation == tok.annotation):
114 compress_merge_back(result, tok)
115 else:
116 result.append(tok)
117 return result
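# Illustration (comments only, not part of the original module): compressing a
# small annotated fragment. Exact token boundaries come from tokenize(), so
# treat this sketch as approximate:
#
#   toks = tokenize_annotated('Hello World', 'v1')
#   compress_tokens(toks)
#   # -> a single token 'Hello World' annotated 'v1', because the two words
#   #    share an annotation and have no intervening markup.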
118
119 def compress_merge_back(tokens, tok):
120 """ Merge tok into the last element of tokens (modifying the list of
121 tokens in-place). """
122 last = tokens[-1]
123 if type(last) is not token or type(tok) is not token:
124 tokens.append(tok)
125 else:
126 text = _unicode(last)
127 if last.trailing_whitespace:
128 text += last.trailing_whitespace
129 text += tok
130 merged = token(text,
131 pre_tags=last.pre_tags,
132 post_tags=tok.post_tags,
133 trailing_whitespace=tok.trailing_whitespace)
134 merged.annotation = last.annotation
135 tokens[-1] = merged
136
137 def markup_serialize_tokens(tokens, markup_func):
138 """
139 Serialize the list of tokens into a list of text chunks, calling
140 markup_func around text to add annotations.
141 """
142 for token in tokens:
143 for pre in token.pre_tags:
144 yield pre
145 html = token.html()
146 html = markup_func(html, token.annotation)
147 if token.trailing_whitespace:
148 html += token.trailing_whitespace
149 yield html
150 for post in token.post_tags:
151 yield post
152
153
154 ############################################################
155 ## HTML Diffs
156 ############################################################
157
158 def htmldiff(old_html, new_html):
159 ## FIXME: this should take parsed documents too, and use their body
160 ## or other content.
161 """ Do a diff of the old and new document. The documents are HTML
162 *fragments* (str/UTF8 or unicode), they are not complete documents
163 (i.e., no <html> tag).
164
165 Returns HTML with <ins> and <del> tags added around the
166 appropriate text.
167
168 Markup is generally ignored, with the markup from new_html
169 preserved, and possibly some markup from old_html (though it is
170 considered acceptable to lose some of the old markup). Only the
171 words in the HTML are diffed. The exception is <img> tags, which
172 are treated like words, and the href attribute of <a> tags, which
173 are noted inside the tag itself when there are changes.
174 """
175 old_html_tokens = tokenize(old_html)
176 new_html_tokens = tokenize(new_html)
177 result = htmldiff_tokens(old_html_tokens, new_html_tokens)
178 result = ''.join(result).strip()
179 return fixup_ins_del_tags(result)
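# Usage sketch (comments only, not part of the original module); the exact
# markup produced can vary slightly between versions:
#
#   htmldiff('Hello World!', 'Goodbye World!')
#   # -> roughly '<ins>Goodbye</ins> <del>Hello</del> World!'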
180
181 def htmldiff_tokens(html1_tokens, html2_tokens):
182 """ Does a diff on the tokens themselves, returning a list of text
183 chunks (not tokens).
184 """
185 # There are several passes as we do the differences. The tokens
186 # isolate the portion of the content we care to diff; difflib does
187 # all the actual hard work at that point.
188 #
189 # Then we must create a valid document from pieces of both the old
190 # document and the new document. We generally prefer to take
191 # markup from the new document, and only do a best effort attempt
192 # to keep markup from the old document; anything that we can't
193 # resolve we throw away. Also we try to put the deletes as close
194 # to the location where we think they would have been -- because
195 # we are only keeping the markup from the new document, it can be
196 # fuzzy where in the new document the old text would have gone.
197 # Again we just do a best effort attempt.
198 s = InsensitiveSequenceMatcher(a=html1_tokens, b=html2_tokens)
199 commands = s.get_opcodes()
200 result = []
201 for command, i1, i2, j1, j2 in commands:
202 if command == 'equal':
203 result.extend(expand_tokens(html2_tokens[j1:j2], equal=True))
204 continue
205 if command == 'insert' or command == 'replace':
206 ins_tokens = expand_tokens(html2_tokens[j1:j2])
207 merge_insert(ins_tokens, result)
208 if command == 'delete' or command == 'replace':
209 del_tokens = expand_tokens(html1_tokens[i1:i2])
210 merge_delete(del_tokens, result)
211 # If deletes were inserted directly as <del> then we'd have an
212 # invalid document at this point. Instead we put in special
213 # markers, and when the complete diffed document has been created
214 # we try to move the deletes around and resolve any problems.
215 result = cleanup_delete(result)
216
217 return result
218
219 def expand_tokens(tokens, equal=False):
220 """Given a list of tokens, return a generator of the chunks of
221 text for the data in the tokens.
222 """
223 for token in tokens:
224 for pre in token.pre_tags:
225 yield pre
226 if not equal or not token.hide_when_equal:
227 if token.trailing_whitespace:
228 yield token.html() + token.trailing_whitespace
229 else:
230 yield token.html()
231 for post in token.post_tags:
232 yield post
233
234 def merge_insert(ins_chunks, doc):
235 """ doc is the already-handled document (as a list of text chunks);
236 here we add <ins>ins_chunks</ins> to the end of that. """
237 # Though we don't throw away unbalanced_start or unbalanced_end
238 # (we assume there is accompanying markup later or earlier in the
239 # document), we only put <ins> around the balanced portion.
240 unbalanced_start, balanced, unbalanced_end = split_unbalanced(ins_chunks)
241 doc.extend(unbalanced_start)
242 if doc and not doc[-1].endswith(' '):
243 # Fix up the case where the word before the insert didn't end with
244 # a space
245 doc[-1] += ' '
246 doc.append('<ins>')
247 if balanced and balanced[-1].endswith(' '):
248 # We move space outside of </ins>
249 balanced[-1] = balanced[-1][:-1]
250 doc.extend(balanced)
251 doc.append('</ins> ')
252 doc.extend(unbalanced_end)
253
254 # These are sentinels to represent the start and end of a <del>
255 # segment, until we do the cleanup phase to turn them into proper
256 # markup:
257 class DEL_START:
258 pass
259 class DEL_END:
260 pass
261
262 class NoDeletes(Exception):
263 """ Raised when the document no longer contains any pending deletes
264 (DEL_START/DEL_END) """
265
266 def merge_delete(del_chunks, doc):
267 """ Adds the text chunks in del_chunks to the document doc (another
268 list of text chunks) with markers to show it is a delete.
269 cleanup_delete later resolves these markers into <del> tags."""
270 doc.append(DEL_START)
271 doc.extend(del_chunks)
272 doc.append(DEL_END)
273
274 def cleanup_delete(chunks):
275 """ Cleans up any DEL_START/DEL_END markers in the document, replacing
276 them with <del></del>. To do this while keeping the document
277 valid, it may need to drop some tags (either start or end tags).
278
279 It may also move the del into adjacent tags to try to move it to a
280 similar location where it was originally located (e.g., moving a
281 delete into the preceding <div> tag, if the del looks like (DEL_START,
282 'Text</div>', DEL_END))."""
283 while 1:
284 # Find a pending DEL_START/DEL_END, splitting the document
285 # into stuff-preceding-DEL_START, stuff-inside, and
286 # stuff-following-DEL_END
287 try:
288 pre_delete, delete, post_delete = split_delete(chunks)
289 except NoDeletes:
290 # Nothing found, we've cleaned up the entire doc
291 break
292 # The stuff-inside-DEL_START/END may not be well balanced
293 # markup. First we figure out what unbalanced portions there are:
294 unbalanced_start, balanced, unbalanced_end = split_unbalanced(delete)
295 # Then we move the span forward and/or backward based on these
296 # unbalanced portions:
297 locate_unbalanced_start(unbalanced_start, pre_delete, post_delete)
298 locate_unbalanced_end(unbalanced_end, pre_delete, post_delete)
299 doc = pre_delete
300 if doc and not doc[-1].endswith(' '):
301 # Fix up case where the word before us didn't have a trailing space
302 doc[-1] += ' '
303 doc.append('<del>')
304 if balanced and balanced[-1].endswith(' '):
305 # We move space outside of </del>
306 balanced[-1] = balanced[-1][:-1]
307 doc.extend(balanced)
308 doc.append('</del> ')
309 doc.extend(post_delete)
310 chunks = doc
311 return chunks
312
313 def split_unbalanced(chunks):
314 """Return (unbalanced_start, balanced, unbalanced_end), where each is
315 a list of text and tag chunks.
316
317 unbalanced_start is a list of all the tags that are opened, but
318 not closed in this span. Similarly, unbalanced_end is a list of
319 tags that are closed but were not opened. Extracting these might
320 mean some reordering of the chunks."""
321 start = []
322 end = []
323 tag_stack = []
324 balanced = []
325 for chunk in chunks:
326 if not chunk.startswith('<'):
327 balanced.append(chunk)
328 continue
329 endtag = chunk[1] == '/'
330 name = chunk.split()[0].strip('<>/')
331 if name in empty_tags:
332 balanced.append(chunk)
333 continue
334 if endtag:
335 if tag_stack and tag_stack[-1][0] == name:
336 balanced.append(chunk)
337 name, pos, tag = tag_stack.pop()
338 balanced[pos] = tag
339 elif tag_stack:
340 start.extend([tag for name, pos, tag in tag_stack])
341 tag_stack = []
342 end.append(chunk)
343 else:
344 end.append(chunk)
345 else:
346 tag_stack.append((name, len(balanced), chunk))
347 balanced.append(None)
348 start.extend(
349 [chunk for name, pos, chunk in tag_stack])
350 balanced = [chunk for chunk in balanced if chunk is not None]
351 return start, balanced, end
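# Examples (comments only, not part of the original module):
#
#   split_unbalanced(['<p>', 'Hello', '</p>', '</div>'])
#   # -> ([], ['<p>', 'Hello', '</p>'], ['</div>'])   # dangling close tag
#   split_unbalanced(['<div>', 'Text'])
#   # -> (['<div>'], ['Text'], [])                    # dangling open tag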
352
353 def split_delete(chunks):
354 """ Returns (stuff_before_DEL_START, stuff_inside_DEL_START_END,
355 stuff_after_DEL_END). Returns the first case found (there may be
356 more DEL_STARTs in stuff_after_DEL_END). Raises NoDeletes if
357 there's no DEL_START found. """
358 try:
359 pos = chunks.index(DEL_START)
360 except ValueError:
361 raise NoDeletes
362 pos2 = chunks.index(DEL_END)
363 return chunks[:pos], chunks[pos+1:pos2], chunks[pos2+1:]
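# Example (comments only, not part of the original module):
#
#   split_delete(['a', DEL_START, 'b', DEL_END, 'c'])
#   # -> (['a'], ['b'], ['c'])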
364
365 def locate_unbalanced_start(unbalanced_start, pre_delete, post_delete):
366 """ pre_delete and post_delete implicitly point to a place in the
367 document (where the two were split). This moves that point (by
368 popping items from one and pushing them onto the other). It moves
369 the point to try to find a place where unbalanced_start applies.
370
371 As an example::
372
373 >>> unbalanced_start = ['<div>']
374 >>> doc = ['<p>', 'Text', '</p>', '<div>', 'More Text', '</div>']
375 >>> pre, post = doc[:3], doc[3:]
376 >>> pre, post
377 (['<p>', 'Text', '</p>'], ['<div>', 'More Text', '</div>'])
378 >>> locate_unbalanced_start(unbalanced_start, pre, post)
379 >>> pre, post
380 (['<p>', 'Text', '</p>', '<div>'], ['More Text', '</div>'])
381
382 As you can see, we moved the point so that the dangling <div> that
383 we found will be effectively replaced by the div in the original
384 document. If this doesn't work out, we just throw away
385 unbalanced_start without doing anything.
386 """
387 while 1:
388 if not unbalanced_start:
389 # We have totally succeeded in finding the position
390 break
391 finding = unbalanced_start[0]
392 finding_name = finding.split()[0].strip('<>')
393 if not post_delete:
394 break
395 next = post_delete[0]
396 if next is DEL_START or not next.startswith('<'):
397 # Reached a word, we can't move the delete text forward
398 break
399 if next[1] == '/':
400 # Reached a closing tag, can we go further? Maybe not...
401 break
402 name = next.split()[0].strip('<>')
403 if name == 'ins':
404 # Can't move into an insert
405 break
406 assert name != 'del', (
407 "Unexpected delete tag: %r" % next)
408 if name == finding_name:
409 unbalanced_start.pop(0)
410 pre_delete.append(post_delete.pop(0))
411 else:
412 # Found a tag that doesn't match
413 break
414
415 def locate_unbalanced_end(unbalanced_end, pre_delete, post_delete):
416 """ like locate_unbalanced_start, except handling end tags and
417 possibly moving the point earlier in the document. """
418 while 1:
419 if not unbalanced_end:
420 # Success
421 break
422 finding = unbalanced_end[-1]
423 finding_name = finding.split()[0].strip('<>/')
424 if not pre_delete:
425 break
426 next = pre_delete[-1]
427 if next is DEL_END or not next.startswith('</'):
428 # A word or a start tag
429 break
430 name = next.split()[0].strip('<>/')
431 if name == 'ins' or name == 'del':
432 # Can't move into an insert or delete
433 break
434 if name == finding_name:
435 unbalanced_end.pop()
436 post_delete.insert(0, pre_delete.pop())
437 else:
438 # Found a tag that doesn't match
439 break
440
441 class token(_unicode):
442 """ Represents a diffable token, generally a word that is displayed to
443 the user. Opening tags are attached to this token when they are
444 adjacent (pre_tags) and closing tags that follow the word
445 (post_tags). Some exceptions occur when there are empty tags
446 adjacent to a word, so there may be close tags in pre_tags, or
447 open tags in post_tags.
448
449 We also keep track of whether the word was originally followed by
450 whitespace, even though we do not want to treat the word as
451 equivalent to a similar word that does not have a trailing
452 space."""
453
454 # When this is true, the token will be eliminated from the
455 # displayed diff if no change has occurred:
456 hide_when_equal = False
457
458 def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=""):
459 obj = _unicode.__new__(cls, text)
460
461 if pre_tags is not None:
462 obj.pre_tags = pre_tags
463 else:
464 obj.pre_tags = []
465
466 if post_tags is not None:
467 obj.post_tags = post_tags
468 else:
469 obj.post_tags = []
470
471 obj.trailing_whitespace = trailing_whitespace
472
473 return obj
474
475 def __repr__(self):
476 return 'token(%s, %r, %r, %r)' % (_unicode.__repr__(self), self.pre_tags,
477 self.post_tags, self.trailing_whitespace)
478
479 def html(self):
480 return _unicode(self)
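# Illustration (comments only, not part of the original module): a token compares
# as its text while carrying the surrounding markup as attributes:
#
#   t = token('Hello', pre_tags=['<b>'], post_tags=['</b>'], trailing_whitespace=' ')
#   t == 'Hello'    # True
#   t.pre_tags      # ['<b>']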
481
482 class tag_token(token):
483
484 """ Represents a token that is actually a tag. Currently this is just
485 the <img> tag, which takes up visible space just like a word but
486 is only represented in a document by a tag. """
487
488 def __new__(cls, tag, data, html_repr, pre_tags=None,
489 post_tags=None, trailing_whitespace=""):
490 obj = token.__new__(cls, "%s: %s" % (tag, data),
491 pre_tags=pre_tags,
492 post_tags=post_tags,
493 trailing_whitespace=trailing_whitespace)
494 obj.tag = tag
495 obj.data = data
496 obj.html_repr = html_repr
497 return obj
498
499 def __repr__(self):
500 return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%r)' % (
501 self.tag,
502 self.data,
503 self.html_repr,
504 self.pre_tags,
505 self.post_tags,
506 self.trailing_whitespace)
507 def html(self):
508 return self.html_repr
509
510 class href_token(token):
511
512 """ Represents the href in an anchor tag. Unlike other words, we only
513 show the href when it changes. """
514
515 hide_when_equal = True
516
517 def html(self):
518 return ' Link: %s' % self
519
520 def tokenize(html, include_hrefs=True):
521 """
522 Parse the given HTML and return token objects (words with attached tags).
523
524 This parses only the content of a page; anything in the head is
525 ignored, and the <head> and <body> elements are themselves
526 optional. The content is then parsed by lxml, which ensures the
527 validity of the resulting parsed document (though lxml may make
528 incorrect guesses when the markup is particularly bad).
529
530 <ins> and <del> tags are also eliminated from the document, as
531 that gets confusing.
532
533 If include_hrefs is true, then the href attribute of <a> tags is
534 included as a special kind of diffable token."""
535 if etree.iselement(html):
536 body_el = html
537 else:
538 body_el = parse_html(html, cleanup=True)
539 # Then we split the document into text chunks for each tag, word, and end tag:
540 chunks = flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs)
541 # Finally re-joining them into token objects:
542 return fixup_chunks(chunks)
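# Illustration (comments only, not part of the original module):
#
#   toks = tokenize('<b>Hello</b> world')
#   # -> two tokens, 'Hello' (pre_tags holding the '<b>' chunk, post_tags the
#   #    closing '</b>' chunk) and 'world'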
543
544 def parse_html(html, cleanup=True):
545 """
546 Parses an HTML fragment, returning an lxml element. Note that the HTML will be
547 wrapped in a <div> tag that was not in the original document.
548
549 If cleanup is true, make sure there's no <head> or <body>, and get
550 rid of any <ins> and <del> tags.
551 """
552 if cleanup:
553 # This removes any extra markup or structure like <head>:
554 html = cleanup_html(html)
555 return fragment_fromstring(html, create_parent=True)
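# Example (comments only, not part of the original module):
#
#   parse_html('<p>Hi</p> there')
#   # -> an lxml <div> element wrapping the fragment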
556
557 _body_re = re.compile(r'<body.*?>', re.I|re.S)
558 _end_body_re = re.compile(r'</body.*?>', re.I|re.S)
559 _ins_del_re = re.compile(r'</?(ins|del).*?>', re.I|re.S)
560
561 def cleanup_html(html):
562 """ This 'cleans' the HTML, meaning that any page structure is removed
563 (only the contents of <body> are used, if there is any <body>).
564 Also <ins> and <del> tags are removed. """
565 match = _body_re.search(html)
566 if match:
567 html = html[match.end():]
568 match = _end_body_re.search(html)
569 if match:
570 html = html[:match.start()]
571 html = _ins_del_re.sub('', html)
572 return html
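# Example (comments only, not part of the original module):
#
#   cleanup_html('<html><body><p>Hi <ins>there</ins></p></body></html>')
#   # -> '<p>Hi there</p>'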
573
574
575 end_whitespace_re = re.compile(r'[ \t\n\r]$')
576
577 def split_trailing_whitespace(word):
578 """
579 This function takes a word, such as 'test\n\n' and returns ('test','\n\n')
580 """
581 stripped_length = len(word.rstrip())
582 return word[0:stripped_length], word[stripped_length:]
583
584
585 def fixup_chunks(chunks):
586 """
587 This function takes a list of chunks and produces a list of tokens.
588 """
589 tag_accum = []
590 cur_word = None
591 result = []
592 for chunk in chunks:
593 if isinstance(chunk, tuple):
594 if chunk[0] == 'img':
595 src = chunk[1]
596 tag, trailing_whitespace = split_trailing_whitespace(chunk[2])
597 cur_word = tag_token('img', src, html_repr=tag,
598 pre_tags=tag_accum,
599 trailing_whitespace=trailing_whitespace)
600 tag_accum = []
601 result.append(cur_word)
602
603 elif chunk[0] == 'href':
604 href = chunk[1]
605 cur_word = href_token(href, pre_tags=tag_accum, trailing_whitespace=" ")
606 tag_accum = []
607 result.append(cur_word)
608 continue
609
610 if is_word(chunk):
611 chunk, trailing_whitespace = split_trailing_whitespace(chunk)
612 cur_word = token(chunk, pre_tags=tag_accum, trailing_whitespace=trailing_whitespace)
613 tag_accum = []
614 result.append(cur_word)
615
616 elif is_start_tag(chunk):
617 tag_accum.append(chunk)
618
619 elif is_end_tag(chunk):
620 if tag_accum:
621 tag_accum.append(chunk)
622 else:
623 assert cur_word, (
624 "Weird state, cur_word=%r, result=%r, chunks=%r of %r"
625 % (cur_word, result, chunk, chunks))
626 cur_word.post_tags.append(chunk)
627 else:
628 assert False
629
630 if not result:
631 return [token('', pre_tags=tag_accum)]
632 else:
633 result[-1].post_tags.extend(tag_accum)
634
635 return result
636
637
638 # All the tags in HTML that don't require end tags:
639 empty_tags = (
640 'param', 'img', 'area', 'br', 'basefont', 'input',
641 'base', 'meta', 'link', 'col')
642
643 block_level_tags = (
644 'address',
645 'blockquote',
646 'center',
647 'dir',
648 'div',
649 'dl',
650 'fieldset',
651 'form',
652 'h1',
653 'h2',
654 'h3',
655 'h4',
656 'h5',
657 'h6',
658 'hr',
659 'isindex',
660 'menu',
661 'noframes',
662 'noscript',
663 'ol',
664 'p',
665 'pre',
666 'table',
667 'ul',
668 )
669
670 block_level_container_tags = (
671 'dd',
672 'dt',
673 'frameset',
674 'li',
675 'tbody',
676 'td',
677 'tfoot',
678 'th',
679 'thead',
680 'tr',
681 )
682
683
684 def flatten_el(el, include_hrefs, skip_tag=False):
685 """ Takes an lxml element el, and generates all the text chunks for
686 that tag. Each start tag is a chunk, each word is a chunk, and each
687 end tag is a chunk.
688
689 If skip_tag is true, then the outermost container tag is
690 not returned (just its contents)."""
691 if not skip_tag:
692 if el.tag == 'img':
693 yield ('img', el.get('src'), start_tag(el))
694 else:
695 yield start_tag(el)
696 if el.tag in empty_tags and not el.text and not len(el) and not el.tail:
697 return
698 start_words = split_words(el.text)
699 for word in start_words:
700 yield html_escape(word)
701 for child in el:
702 for item in flatten_el(child, include_hrefs=include_hrefs):
703 yield item
704 if el.tag == 'a' and el.get('href') and include_hrefs:
705 yield ('href', el.get('href'))
706 if not skip_tag:
707 yield end_tag(el)
708 end_words = split_words(el.tail)
709 for word in end_words:
710 yield html_escape(word)
711
712 split_words_re = re.compile(r'\S+(?:\s+|$)', re.U)
713
714 def split_words(text):
715 """ Splits some text into words. Includes trailing whitespace
716 on each word when appropriate. """
717 if not text or not text.strip():
718 return []
719
720 words = split_words_re.findall(text)
721 return words
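# Example (comments only, not part of the original module):
#
#   split_words('Hello,  world!\n')
#   # -> ['Hello,  ', 'world!\n']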
722
723 start_whitespace_re = re.compile(r'^[ \t\n\r]')
724
725 def start_tag(el):
726 """
727 The text representation of the start tag for a tag.
728 """
729 return '<%s%s>' % (
730 el.tag, ''.join([' %s="%s"' % (name, html_escape(value, True))
731 for name, value in el.attrib.items()]))
732
733 def end_tag(el):
734 """ The text representation of an end tag for a tag. Includes
735 trailing whitespace when appropriate. """
736 if el.tail and start_whitespace_re.search(el.tail):
737 extra = ' '
738 else:
739 extra = ''
740 return '</%s>%s' % (el.tag, extra)
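# Illustration (comments only, not part of the original module):
#
#   el = parse_html('<a href="/link">text</a> tail', cleanup=False)
#   start_tag(el[0])   # -> '<a href="/link">'
#   end_tag(el[0])     # -> '</a> '  (keeps a space because whitespace follows)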
741
742 def is_word(tok):
743 return not tok.startswith('<')
744
745 def is_end_tag(tok):
746 return tok.startswith('</')
747
748 def is_start_tag(tok):
749 return tok.startswith('<') and not tok.startswith('</')
750
751 def fixup_ins_del_tags(html):
752 """ Given an html string, move any <ins> or <del> tags inside of any
753 block-level elements, e.g. transform <ins><p>word</p></ins> to
754 <p><ins>word</ins></p> """
755 doc = parse_html(html, cleanup=False)
756 _fixup_ins_del_tags(doc)
757 html = serialize_html_fragment(doc, skip_outer=True)
758 return html
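# Example (comments only, not part of the original module):
#
#   fixup_ins_del_tags('<ins><p>word</p></ins>')
#   # -> '<p><ins>word</ins></p>'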
759
760 def serialize_html_fragment(el, skip_outer=False):
761 """ Serialize a single lxml element as HTML. The serialized form
762 includes the element's tail.
763
764 If skip_outer is true, then don't serialize the outermost tag
765 """
766 assert not isinstance(el, basestring), (
767 "You should pass in an element, not a string like %r" % el)
768 html = etree.tostring(el, method="html", encoding=_unicode)
769 if skip_outer:
770 # Get rid of the extra starting tag:
771 html = html[html.find('>')+1:]
772 # Get rid of the extra end tag:
773 html = html[:html.rfind('<')]
774 return html.strip()
775 else:
776 return html
777
778 def _fixup_ins_del_tags(doc):
779 """fixup_ins_del_tags that works on an lxml document in-place
780 """
781 for tag in ['ins', 'del']:
782 for el in doc.xpath('descendant-or-self::%s' % tag):
783 if not _contains_block_level_tag(el):
784 continue
785 _move_el_inside_block(el, tag=tag)
786 el.drop_tag()
787 #_merge_element_contents(el)
788
789 def _contains_block_level_tag(el):
790 """True if the element contains any block-level elements, like <p>, <td>, etc.
791 """
792 if el.tag in block_level_tags or el.tag in block_level_container_tags:
793 return True
794 for child in el:
795 if _contains_block_level_tag(child):
796 return True
797 return False
798
799 def _move_el_inside_block(el, tag):
800 """ helper for _fixup_ins_del_tags; actually takes the <ins> etc tags
801 and moves them inside any block-level tags. """
802 for child in el:
803 if _contains_block_level_tag(child):
804 break
805 else:
806 # No block-level tags in any child
807 children_tag = etree.Element(tag)
808 children_tag.text = el.text
809 el.text = None
810 children_tag.extend(list(el))
811 el[:] = [children_tag]
812 return
813 for child in list(el):
814 if _contains_block_level_tag(child):
815 _move_el_inside_block(child, tag)
816 if child.tail:
817 tail_tag = etree.Element(tag)
818 tail_tag.text = child.tail
819 child.tail = None
820 el.insert(el.index(child)+1, tail_tag)
821 else:
822 child_tag = etree.Element(tag)
823 el.replace(child, child_tag)
824 child_tag.append(child)
825 if el.text:
826 text_tag = etree.Element(tag)
827 text_tag.text = el.text
828 el.text = None
829 el.insert(0, text_tag)
830
831 def _merge_element_contents(el):
832 """
833 Removes an element, but merges its contents into its place, e.g.,
834 given <p>Hi <i>there!</i></p>, if you remove the <i> element you get
835 <p>Hi there!</p>
836 """
837 parent = el.getparent()
838 text = el.text or ''
839 if el.tail:
840 if not len(el):
841 text += el.tail
842 else:
843 if el[-1].tail:
844 el[-1].tail += el.tail
845 else:
846 el[-1].tail = el.tail
847 index = parent.index(el)
848 if text:
849 if index == 0:
850 previous = None
851 else:
852 previous = parent[index-1]
853 if previous is None:
854 if parent.text:
855 parent.text += text
856 else:
857 parent.text = text
858 else:
859 if previous.tail:
860 previous.tail += text
861 else:
862 previous.tail = text
863 parent[index:index+1] = el.getchildren()
864
865 class InsensitiveSequenceMatcher(difflib.SequenceMatcher):
866 """
867 Acts like SequenceMatcher, but tries not to find very small equal
868 blocks amidst large spans of changes
869 """
870
871 threshold = 2
872
873 def get_matching_blocks(self):
874 size = min(len(self.a), len(self.b))
875 threshold = min(self.threshold, size / 4)
876 actual = difflib.SequenceMatcher.get_matching_blocks(self)
877 return [item for item in actual
878 if item[2] > threshold
879 or not item[2]]
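# Illustration (comments only, not part of the original module): for sequences
# of eight or more tokens the threshold stays at 2, so an isolated one-token
# match inside an otherwise rewritten region is dropped:
#
#   old = ['one', 'two', 'three', 'same', 'four', 'five', 'six', 'seven']
#   new = ['uno', 'dos', 'tres', 'same', 'cuatro', 'cinco', 'seis', 'siete']
#   sm = InsensitiveSequenceMatcher(a=old, b=new)
#   # the lone 'same' match is filtered out, so get_opcodes() reports the
#   # whole range as a single 'replace'.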
880
881 if __name__ == '__main__':
882 from lxml.html import _diffcommand
883 _diffcommand.main()
884