comparison BeautifulSoup.py @ 0:2c498d40ecde

Uploaded
author miller-lab
date Mon, 09 Apr 2012 12:03:06 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:2c498d40ecde
1 """Beautiful Soup
2 Elixir and Tonic
3 "The Screen-Scraper's Friend"
4 http://www.crummy.com/software/BeautifulSoup/
5
6 Beautiful Soup parses a (possibly invalid) XML or HTML document into a
7 tree representation. It provides methods and Pythonic idioms that make
8 it easy to navigate, search, and modify the tree.
9
10 A well-formed XML/HTML document yields a well-formed data
11 structure. An ill-formed XML/HTML document yields a correspondingly
12 ill-formed data structure. If your document is only locally
13 well-formed, you can use this library to find and process the
14 well-formed part of it.
15
16 Beautiful Soup works with Python 2.2 and up. It has no external
17 dependencies, but you'll have more success at converting data to UTF-8
18 if you also install these three packages:
19
20 * chardet, for auto-detecting character encodings
21 http://chardet.feedparser.org/
22 * cjkcodecs and iconv_codec, which add more encodings to the ones supported
23 by stock Python.
24 http://cjkpython.i18n.org/
25
26 Beautiful Soup defines classes for two main parsing strategies:
27
28 * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
29 language that kind of looks like XML.
30
31 * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
32 or invalid. This class has web browser-like heuristics for
33 obtaining a sensible parse tree in the face of common HTML errors.
34
35 Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
36 the encoding of an HTML or XML document, and converting it to
37 Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
38
39 For more than you ever wanted to know about Beautiful Soup, see the
40 documentation:
41 http://www.crummy.com/software/BeautifulSoup/documentation.html
42
43 Here, have some legalese:
44
45 Copyright (c) 2004-2010, Leonard Richardson
46
47 All rights reserved.
48
49 Redistribution and use in source and binary forms, with or without
50 modification, are permitted provided that the following conditions are
51 met:
52
53 * Redistributions of source code must retain the above copyright
54 notice, this list of conditions and the following disclaimer.
55
56 * Redistributions in binary form must reproduce the above
57 copyright notice, this list of conditions and the following
58 disclaimer in the documentation and/or other materials provided
59 with the distribution.
60
61 * Neither the name of the the Beautiful Soup Consortium and All
62 Night Kosher Bakery nor the names of its contributors may be
63 used to endorse or promote products derived from this software
64 without specific prior written permission.
65
66 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
67 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
68 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
69 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
70 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
71 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
72 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
73 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
74 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
75 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
76 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
77
78 """
79 from __future__ import generators
80
81 __author__ = "Leonard Richardson (leonardr@segfault.org)"
82 __version__ = "3.2.0"
83 __copyright__ = "Copyright (c) 2004-2010 Leonard Richardson"
84 __license__ = "New-style BSD"
85
86 from sgmllib import SGMLParser, SGMLParseError
87 import codecs
88 import markupbase
89 import types
90 import re
91 import sgmllib
92 try:
93 from htmlentitydefs import name2codepoint
94 except ImportError:
95 name2codepoint = {}
96 try:
97 set
98 except NameError:
99 from sets import Set as set
100
# These hacks make Beautiful Soup able to parse XML with namespaces:
# the replacement patterns allow ':' (as well as '-', '_' and '.')
# inside tag and declaration names.
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match

# Default value of the 'encoding' argument of the __str__/render methods
# defined in this module.
DEFAULT_OUTPUT_ENCODING = "utf-8"
106
def _match_css_class(str):
    """Build a RE to match the given CSS class.

    The returned pattern matches a whitespace-separated attribute value
    in which the class name appears as a complete token followed by
    whitespace or the end of the value.  The class name is escaped, so
    characters that are special in regular expressions (".", "+", "(",
    ...) are matched literally instead of being interpreted.
    """
    # NOTE: the parameter keeps its historical name even though it
    # shadows the builtin; it is only used on the next line.
    return re.compile(r"(^|.*\s)%s($|\s)" % re.escape(str))
110
111 # First, the classes that represent markup elements.
112
class PageElement(object):
    """Contains the navigational information for some part of the page
    (either a tag or a piece of text).

    Each element carries two sets of links, maintained by setup(),
    insert() and extract():

    * parent, previousSibling, nextSibling -- position in the tree.
    * previous, next -- position in document order.
    """

    def setup(self, parent=None, previous=None):
        """Sets up the initial relations between this element and
        other elements.

        If the parent already has children, this element is linked as
        the next sibling of the parent's current last child.
        """
        self.parent = parent
        self.previous = previous
        self.next = None
        self.previousSibling = None
        self.nextSibling = None
        if self.parent and self.parent.contents:
            self.previousSibling = self.parent.contents[-1]
            self.previousSibling.nextSibling = self

    def replaceWith(self, replaceWith):
        """Replaces this element with the given one, in the same position
        under the same parent.

        The replacement may be one of this element's own siblings;
        insert() compensates for the index shift caused by moving it.
        """
        oldParent = self.parent
        myIndex = self.parent.index(self)
        self.extract()
        oldParent.insert(myIndex, replaceWith)

    def replaceWithChildren(self):
        """Replaces this element with its own children, keeping their
        order."""
        myParent = self.parent
        myIndex = self.parent.index(self)
        self.extract()
        # Insert back-to-front at a fixed index so that the children end
        # up in their original order.
        reversedChildren = list(self.contents)
        reversedChildren.reverse()
        for child in reversedChildren:
            myParent.insert(myIndex, child)

    def extract(self):
        """Destructively rips this element out of the tree.

        Returns the element itself, now without parent or siblings.
        """
        if self.parent:
            try:
                del self.parent.contents[self.parent.index(self)]
            except ValueError:
                # Deliberately best-effort: the parent does not list us.
                pass

        #Find the two elements that would be next to each other if
        #this element (and any children) hadn't been parsed. Connect
        #the two.
        lastChild = self._lastRecursiveChild()
        nextElement = lastChild.next

        if self.previous:
            self.previous.next = nextElement
        if nextElement:
            nextElement.previous = self.previous
        self.previous = None
        lastChild.next = None

        self.parent = None
        if self.previousSibling:
            self.previousSibling.nextSibling = self.nextSibling
        if self.nextSibling:
            self.nextSibling.previousSibling = self.previousSibling
        self.previousSibling = self.nextSibling = None
        return self

    def _lastRecursiveChild(self):
        "Finds the last element beneath this object to be parsed."
        lastChild = self
        while hasattr(lastChild, 'contents') and lastChild.contents:
            lastChild = lastChild.contents[-1]
        return lastChild

    def insert(self, position, newChild):
        """Inserts newChild into this element's contents at the given
        position, fixing up all sibling and document-order links.

        A plain string is wrapped in a NavigableString.  An element that
        already has a parent is extracted from it first.
        """
        if isinstance(newChild, basestring) \
            and not isinstance(newChild, NavigableString):
            newChild = NavigableString(newChild)

        position = min(position, len(self.contents))
        if hasattr(newChild, 'parent') and newChild.parent is not None:
            # We're 'inserting' an element that's already one
            # of this object's children.
            if newChild.parent is self:
                index = self.index(newChild)
                if index < position:
                    # We're moving it further down the list of this
                    # object's children. That means that when we
                    # extract this element, our target index will
                    # jump down one.
                    position = position - 1
            newChild.extract()

        newChild.parent = self
        previousChild = None
        if position == 0:
            newChild.previousSibling = None
            newChild.previous = self
        else:
            previousChild = self.contents[position-1]
            newChild.previousSibling = previousChild
            newChild.previousSibling.nextSibling = newChild
            newChild.previous = previousChild._lastRecursiveChild()
        if newChild.previous:
            newChild.previous.next = newChild

        newChildsLastElement = newChild._lastRecursiveChild()

        if position >= len(self.contents):
            newChild.nextSibling = None

            # The element following the new child in document order is
            # the next sibling of the nearest ancestor that has one.
            parent = self
            parentsNextSibling = None
            while not parentsNextSibling:
                parentsNextSibling = parent.nextSibling
                parent = parent.parent
                if not parent: # This is the last element in the document.
                    break
            if parentsNextSibling:
                newChildsLastElement.next = parentsNextSibling
            else:
                newChildsLastElement.next = None
        else:
            nextChild = self.contents[position]
            newChild.nextSibling = nextChild
            if newChild.nextSibling:
                newChild.nextSibling.previousSibling = newChild
            newChildsLastElement.next = nextChild

        if newChildsLastElement.next:
            newChildsLastElement.next.previous = newChildsLastElement
        self.contents.insert(position, newChild)

    def append(self, tag):
        """Appends the given tag to the contents of this tag."""
        self.insert(len(self.contents), tag)

    def findNext(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears after this Tag in the document."""
        return self._findOne(self.findAllNext, name, attrs, text, **kwargs)

    def findAllNext(self, name=None, attrs={}, text=None, limit=None,
                    **kwargs):
        """Returns all items that match the given criteria and appear
        after this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.nextGenerator,
                             **kwargs)

    def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears after this Tag in the document."""
        return self._findOne(self.findNextSiblings, name, attrs, text,
                             **kwargs)

    def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
                         **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear after this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.nextSiblingGenerator, **kwargs)
    fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x

    def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears before this Tag in the document."""
        return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)

    def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
                        **kwargs):
        """Returns all items that match the given criteria and appear
        before this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.previousGenerator,
                             **kwargs)
    fetchPrevious = findAllPrevious # Compatibility with pre-3.x

    def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears before this Tag in the document."""
        return self._findOne(self.findPreviousSiblings, name, attrs, text,
                             **kwargs)

    def findPreviousSiblings(self, name=None, attrs={}, text=None,
                             limit=None, **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear before this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.previousSiblingGenerator, **kwargs)
    fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x

    def findParent(self, name=None, attrs={}, **kwargs):
        """Returns the closest parent of this Tag that matches the given
        criteria."""
        # NOTE: We can't use _findOne because findParents takes a different
        # set of arguments.
        r = None
        # Keyword criteria are forwarded so that e.g. findParent(id="x")
        # actually filters on the attribute.
        l = self.findParents(name, attrs, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def findParents(self, name=None, attrs={}, limit=None, **kwargs):
        """Returns the parents of this Tag that match the given
        criteria."""

        return self._findAll(name, attrs, None, limit, self.parentGenerator,
                             **kwargs)
    fetchParents = findParents # Compatibility with pre-3.x

    #These methods do the real heavy lifting.

    def _findOne(self, method, name, attrs, text, **kwargs):
        """Calls the given findAll-style method with a limit of 1 and
        returns its single result, or None."""
        r = None
        l = method(name, attrs, text, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def _findAll(self, name, attrs, text, limit, generator, **kwargs):
        """Iterates over a generator looking for things that match.

        'name' may itself be a SoupStrainer; otherwise one is built from
        the criteria.  Returns a plain list on the two fast paths below
        and a ResultSet otherwise.
        """

        if isinstance(name, SoupStrainer):
            strainer = name
        # (Possibly) special case some findAll*(...) searches
        elif text is None and not limit and not attrs and not kwargs:
            # findAll*(True)
            if name is True:
                return [element for element in generator()
                        if isinstance(element, Tag)]
            # findAll*('tag-name')
            elif isinstance(name, basestring):
                return [element for element in generator()
                        if isinstance(element, Tag) and
                        element.name == name]
            else:
                strainer = SoupStrainer(name, attrs, text, **kwargs)
        # Build a SoupStrainer
        else:
            strainer = SoupStrainer(name, attrs, text, **kwargs)
        results = ResultSet(strainer)
        for i in generator():
            # The navigation generators end with a None; skip it (and
            # any other false element).
            if i:
                found = strainer.search(i)
                if found:
                    results.append(found)
                    if limit and len(results) >= limit:
                        break
        return results

    #These Generators can be used to navigate starting from both
    #NavigableStrings and Tags.  Each one yields the successive
    #elements along one kind of link and finishes by yielding None.
    def nextGenerator(self):
        i = self
        while i is not None:
            i = i.next
            yield i

    def nextSiblingGenerator(self):
        i = self
        while i is not None:
            i = i.nextSibling
            yield i

    def previousGenerator(self):
        i = self
        while i is not None:
            i = i.previous
            yield i

    def previousSiblingGenerator(self):
        i = self
        while i is not None:
            i = i.previousSibling
            yield i

    def parentGenerator(self):
        i = self
        while i is not None:
            i = i.parent
            yield i

    # Utility methods
    def substituteEncoding(self, str, encoding=None):
        """Replaces the %SOUP-ENCODING% placeholder in the given string
        with the encoding name ("utf-8" when none is given)."""
        encoding = encoding or "utf-8"
        return str.replace("%SOUP-ENCODING%", encoding)

    def toEncoding(self, s, encoding=None):
        """Encodes an object to a string in some encoding, or to Unicode.

        With a false 'encoding' the result is a Unicode string; otherwise
        it is the object's string form encoded with that encoding.
        """
        if isinstance(s, unicode):
            if encoding:
                s = s.encode(encoding)
        elif isinstance(s, str):
            if encoding:
                s = s.encode(encoding)
            else:
                s = unicode(s)
        else:
            if encoding:
                s = self.toEncoding(str(s), encoding)
            else:
                s = unicode(s)
        return s
423
class NavigableString(unicode, PageElement):
    """A Unicode string that is also a PageElement."""

    def __new__(cls, value):
        """Create a new NavigableString.

        When unpickling a NavigableString, this method is called with
        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
        passed in to the superclass's __new__ or the superclass won't know
        how to handle non-ASCII characters.
        """
        if not isinstance(value, unicode):
            return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
        return unicode.__new__(cls, value)

    def __getnewargs__(self):
        # The bare string rendering, even for subclasses that wrap
        # their __str__ output in markup.
        plain = NavigableString.__str__(self)
        return (plain,)

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper."""
        if attr != 'string':
            raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__.__name__, attr))
        return self

    def __unicode__(self):
        encoded = str(self)
        return encoded.decode(DEFAULT_OUTPUT_ENCODING)

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        # A false encoding means "give me the Unicode object itself".
        if not encoding:
            return self
        return self.encode(encoding)
458
class CData(NavigableString):
    """A string that renders itself wrapped in <![CDATA[ ... ]]>."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        body = NavigableString.__str__(self, encoding)
        return "<![CDATA[%s]]>" % body
463
class ProcessingInstruction(NavigableString):
    """A string that renders itself wrapped in <? ... ?>."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        text = self
        # Fill in the output encoding where the placeholder appears.
        if "%SOUP-ENCODING%" in text:
            text = self.substituteEncoding(text, encoding)
        rendered = self.toEncoding(text, encoding)
        return "<?%s?>" % rendered
470
class Comment(NavigableString):
    """A string that renders itself wrapped in <!-- ... -->."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        body = NavigableString.__str__(self, encoding)
        return "<!--%s-->" % body
474
class Declaration(NavigableString):
    """A string that renders itself wrapped in <! ... >."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        body = NavigableString.__str__(self, encoding)
        return "<!%s>" % body
478
class Tag(PageElement):

    """Represents a found HTML tag with its attributes and contents.

    Attributes are kept twice: 'attrs' is the ordered list of
    (name, value) pairs and 'attrMap' is a lazily built dictionary view
    of it (see _getAttrMap).  'contents' is the list of child elements.
    """

    def _invert(h):
        "Cheap function to invert a hash."
        i = {}
        for k,v in h.items():
            i[v] = k
        return i

    XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
                                      "quot" : '"',
                                      "amp" : "&",
                                      "lt" : "<",
                                      "gt" : ">" }

    XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)

    def _convertEntities(self, match):
        """Used in a call to re.sub to replace HTML, XML, and numeric
        entities with the appropriate Unicode characters. If HTML
        entities are being converted, any unrecognized entities are
        escaped."""
        x = match.group(1)
        if self.convertHTMLEntities and x in name2codepoint:
            return unichr(name2codepoint[x])
        elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
            if self.convertXMLEntities:
                return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
            else:
                return u'&%s;' % x
        elif len(x) > 0 and x[0] == '#':
            # Handle numeric entities
            if len(x) > 1 and x[1] == 'x':
                return unichr(int(x[2:], 16))
            else:
                return unichr(int(x[1:]))

        elif self.escapeUnrecognizedEntities:
            return u'&amp;%s;' % x
        else:
            return u'&%s;' % x

    def __init__(self, parser, name, attrs=None, parent=None,
                 previous=None):
        """Basic constructor.

        'parser' supplies isSelfClosingTag() and the entity-conversion
        flags; 'attrs' may be None, a dict or a list of (name, value)
        pairs.
        """

        # We don't actually store the parser object: that lets extracted
        # chunks be garbage-collected
        self.parserClass = parser.__class__
        self.isSelfClosing = parser.isSelfClosingTag(name)
        self.name = name
        if attrs is None:
            attrs = []
        elif isinstance(attrs, dict):
            attrs = attrs.items()
        self.attrs = attrs
        self.contents = []
        self.setup(parent, previous)
        self.hidden = False
        self.containsSubstitutions = False
        self.convertHTMLEntities = parser.convertHTMLEntities
        self.convertXMLEntities = parser.convertXMLEntities
        self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities

        # Convert any HTML, XML, or numeric entities in the attribute values.
        self.attrs = [(k, re.sub(r"&(#\d+|#x[0-9a-fA-F]+|\w+);",
                                 self._convertEntities,
                                 val))
                      for k, val in self.attrs]

    def getString(self):
        """Returns the only child if it is a NavigableString, else None."""
        if (len(self.contents) == 1
            and isinstance(self.contents[0], NavigableString)):
            return self.contents[0]

    def setString(self, string):
        """Replace the contents of the tag with a string"""
        self.clear()
        self.append(string)

    string = property(getString, setString)

    def getText(self, separator=u""):
        """Returns all strings beneath this tag, each stripped of
        surrounding whitespace, joined with the separator."""
        if not len(self.contents):
            return u""
        stopNode = self._lastRecursiveChild().next
        strings = []
        current = self.contents[0]
        while current is not stopNode:
            if isinstance(current, NavigableString):
                strings.append(current.strip())
            current = current.next
        return separator.join(strings)

    text = property(getText)

    def get(self, key, default=None):
        """Returns the value of the 'key' attribute for the tag, or
        the value given for 'default' if it doesn't have that
        attribute."""
        return self._getAttrMap().get(key, default)

    def clear(self):
        """Extract all children."""
        # Iterate over a copy: extract() mutates self.contents.
        for child in self.contents[:]:
            child.extract()

    def index(self, element):
        """Returns the position of the given child, compared by identity.

        Raises ValueError if the element is not a direct child.
        """
        for i, child in enumerate(self.contents):
            if child is element:
                return i
        raise ValueError("Tag.index: element not in tag")

    def has_key(self, key):
        """Tells whether the tag has an attribute called 'key'."""
        return key in self._getAttrMap()

    def __getitem__(self, key):
        """tag[key] returns the value of the 'key' attribute for the tag,
        and throws an exception if it's not there."""
        return self._getAttrMap()[key]

    def __iter__(self):
        "Iterating over a tag iterates over its contents."
        return iter(self.contents)

    def __len__(self):
        "The length of a tag is the length of its list of contents."
        return len(self.contents)

    def __contains__(self, x):
        return x in self.contents

    def __nonzero__(self):
        "A tag is non-None even if it has no contents."
        return True

    def __setitem__(self, key, value):
        """Setting tag[key] sets the value of the 'key' attribute for the
        tag."""
        attrMap = self._getAttrMap()
        attrMap[key] = value
        found = False
        for i in range(0, len(self.attrs)):
            if self.attrs[i][0] == key:
                self.attrs[i] = (key, value)
                found = True
        if not found:
            self.attrs.append((key, value))

    def __delitem__(self, key):
        "Deleting tag[key] deletes all 'key' attributes for the tag."
        # Bad HTML can define the same attribute multiple times, so
        # every occurrence is dropped.  The list is rebuilt in place
        # rather than removed from while iterating, which would skip
        # the entry following each removal.
        self.attrs[:] = [item for item in self.attrs if item[0] != key]
        attrMap = self._getAttrMap()
        if key in attrMap:
            del attrMap[key]

    def __call__(self, *args, **kwargs):
        """Calling a tag like a function is the same as calling its
        findAll() method. Eg. tag('a') returns a list of all the A tags
        found within this tag."""
        return self.findAll(*args, **kwargs)

    def __getattr__(self, tag):
        """Treats an unknown attribute as a search for a child tag:
        tag.fooTag and tag.foo both return self.find('foo').  Names
        starting with '__' raise AttributeError instead."""
        #print "Getattr %s.%s" % (self.__class__, tag)
        if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
            return self.find(tag[:-3])
        elif tag.find('__') != 0:
            return self.find(tag)
        raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__, tag))

    def __eq__(self, other):
        """Returns true iff this tag has the same name, the same attributes,
        and the same contents (recursively) as the given tag.

        NOTE: right now this will return false if two tags have the
        same attributes in a different order. Should this be fixed?"""
        if other is self:
            return True
        if (not hasattr(other, 'name')
            or not hasattr(other, 'attrs')
            or not hasattr(other, 'contents')
            or self.name != other.name
            or self.attrs != other.attrs
            or len(self) != len(other)):
            return False
        for i in range(0, len(self.contents)):
            if self.contents[i] != other.contents[i]:
                return False
        return True

    def __ne__(self, other):
        """Returns true iff this tag is not identical to the other tag,
        as defined in __eq__."""
        return not self == other

    def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Renders this tag as a string."""
        return self.__str__(encoding)

    def __unicode__(self):
        return self.__str__(None)

    # Matches '<', '>' and any '&' that does not start an entity.
    BARE_AMPERSAND_OR_BRACKET = re.compile(r"([<>]|"
                                           + r"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
                                           + r")")

    def _sub_entity(self, x):
        """Used with a regular expression to substitute the
        appropriate XML entity for an XML special character."""
        return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
                prettyPrint=False, indentLevel=0):
        """Returns a string or Unicode representation of this tag and
        its contents. To get Unicode, pass None for encoding.

        NOTE: since Python's HTML parser consumes whitespace, this
        method is not certain to reproduce the whitespace present in
        the original string."""

        encodedName = self.toEncoding(self.name, encoding)

        attrs = []
        if self.attrs:
            for key, val in self.attrs:
                fmt = '%s="%s"'
                if isinstance(val, basestring):
                    if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
                        val = self.substituteEncoding(val, encoding)

                    # The attribute value either:
                    #
                    # * Contains no embedded double quotes or single quotes.
                    #   No problem: we enclose it in double quotes.
                    # * Contains embedded single quotes. No problem:
                    #   double quotes work here too.
                    # * Contains embedded double quotes. No problem:
                    #   we enclose it in single quotes.
                    # * Embeds both single _and_ double quotes. This
                    #   can't happen naturally, but it can happen if
                    #   you modify an attribute value after parsing
                    #   the document. Now we have a bit of a
                    #   problem. We solve it by enclosing the
                    #   attribute in single quotes, and escaping any
                    #   embedded single quotes to XML entities.
                    if '"' in val:
                        fmt = "%s='%s'"
                        if "'" in val:
                            # TODO: replace with apos when
                            # appropriate.
                            # NOTE(review): "&squot;" is not one of the
                            # entities in XML_ENTITIES_TO_SPECIAL_CHARS;
                            # kept as-is to preserve existing output.
                            val = val.replace("'", "&squot;")

                    # Now we're okay w/r/t quotes. But the attribute
                    # value might also contain angle brackets, or
                    # ampersands that aren't part of entities. We need
                    # to escape those to XML entities too.
                    val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)

                attrs.append(fmt % (self.toEncoding(key, encoding),
                                    self.toEncoding(val, encoding)))
        close = ''
        closeTag = ''
        if self.isSelfClosing:
            close = ' /'
        else:
            closeTag = '</%s>' % encodedName

        indentTag, indentContents = 0, 0
        if prettyPrint:
            indentTag = indentLevel
            space = (' ' * (indentTag-1))
            indentContents = indentTag + 1
        contents = self.renderContents(encoding, prettyPrint, indentContents)
        if self.hidden:
            # A hidden tag contributes only its contents.
            s = contents
        else:
            s = []
            attributeString = ''
            if attrs:
                attributeString = ' ' + ' '.join(attrs)
            if prettyPrint:
                s.append(space)
            s.append('<%s%s%s>' % (encodedName, attributeString, close))
            if prettyPrint:
                s.append("\n")
            s.append(contents)
            if prettyPrint and contents and contents[-1] != "\n":
                s.append("\n")
            if prettyPrint and closeTag:
                s.append(space)
            s.append(closeTag)
            if prettyPrint and closeTag and self.nextSibling:
                s.append("\n")
            s = ''.join(s)
        return s

    def decompose(self):
        """Recursively destroys the contents of this tree."""
        self.extract()
        if len(self.contents) == 0:
            return
        current = self.contents[0]
        while current is not None:
            # Remember the successor before the links are cleared.
            next = current.next
            if isinstance(current, Tag):
                del current.contents[:]
            current.parent = None
            current.previous = None
            current.previousSibling = None
            current.next = None
            current.nextSibling = None
            current = next

    def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Renders this tag with pretty-printing turned on."""
        return self.__str__(encoding, True)

    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                       prettyPrint=False, indentLevel=0):
        """Renders the contents of this tag as a string in the given
        encoding. If encoding is None, returns a Unicode string."""
        s=[]
        for c in self:
            text = None
            if isinstance(c, NavigableString):
                text = c.__str__(encoding)
            elif isinstance(c, Tag):
                s.append(c.__str__(encoding, prettyPrint, indentLevel))
            if text and prettyPrint:
                text = text.strip()
            if text:
                if prettyPrint:
                    s.append(" " * (indentLevel-1))
                s.append(text)
                if prettyPrint:
                    s.append("\n")
        return ''.join(s)

    #Soup methods

    def find(self, name=None, attrs={}, recursive=True, text=None,
             **kwargs):
        """Return only the first child of this Tag matching the given
        criteria."""
        r = None
        l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
        if l:
            r = l[0]
        return r
    findChild = find

    def findAll(self, name=None, attrs={}, recursive=True, text=None,
                limit=None, **kwargs):
        """Extracts a list of Tag objects that match the given
        criteria.  You can specify the name of the Tag and any
        attributes you want the Tag to have.

        The value of a key-value pair in the 'attrs' map can be a
        string, a list of strings, a regular expression object, or a
        callable that takes a string and returns whether or not the
        string matches for some custom definition of 'matches'. The
        same is true of the tag name."""
        generator = self.recursiveChildGenerator
        if not recursive:
            generator = self.childGenerator
        return self._findAll(name, attrs, text, limit, generator, **kwargs)
    findChildren = findAll

    # Pre-3.x compatibility methods
    first = find
    fetch = findAll

    def fetchText(self, text=None, recursive=True, limit=None):
        return self.findAll(text=text, recursive=recursive, limit=limit)

    def firstText(self, text=None, recursive=True):
        return self.find(text=text, recursive=recursive)

    #Private methods

    def _getAttrMap(self):
        """Initializes a map representation of this tag's attributes,
        if not already initialized."""
        # Look in the instance dictionary directly: a plain getattr()
        # on a missing 'attrMap' would fall through to __getattr__ and
        # search the tree for a child tag named "attrMap".
        attrMap = self.__dict__.get('attrMap')
        if not attrMap:
            attrMap = {}
            for (key, value) in self.attrs:
                attrMap[key] = value
            self.attrMap = attrMap
        return attrMap

    #Generator methods
    def childGenerator(self):
        # Just use the iterator from the contents
        return iter(self.contents)

    def recursiveChildGenerator(self):
        """Yields every element beneath this tag in document order."""
        if not len(self.contents):
            # A bare return ends the generator.
            return
        stopNode = self._lastRecursiveChild().next
        current = self.contents[0]
        while current is not stopNode:
            yield current
            current = current.next
883
884
885 # Next, a couple classes to represent queries and their results.
886 class SoupStrainer:
887 """Encapsulates a number of ways of matching a markup element (tag or
888 text)."""
889
890 def __init__(self, name=None, attrs={}, text=None, **kwargs):
891 self.name = name
892 if isinstance(attrs, basestring):
893 kwargs['class'] = _match_css_class(attrs)
894 attrs = None
895 if kwargs:
896 if attrs:
897 attrs = attrs.copy()
898 attrs.update(kwargs)
899 else:
900 attrs = kwargs
901 self.attrs = attrs
902 self.text = text
903
904 def __str__(self):
905 if self.text:
906 return self.text
907 else:
908 return "%s|%s" % (self.name, self.attrs)
909
910 def searchTag(self, markupName=None, markupAttrs={}):
911 found = None
912 markup = None
913 if isinstance(markupName, Tag):
914 markup = markupName
915 markupAttrs = markup
916 callFunctionWithTagData = callable(self.name) \
917 and not isinstance(markupName, Tag)
918
919 if (not self.name) \
920 or callFunctionWithTagData \
921 or (markup and self._matches(markup, self.name)) \
922 or (not markup and self._matches(markupName, self.name)):
923 if callFunctionWithTagData:
924 match = self.name(markupName, markupAttrs)
925 else:
926 match = True
927 markupAttrMap = None
928 for attr, matchAgainst in self.attrs.items():
929 if not markupAttrMap:
930 if hasattr(markupAttrs, 'get'):
931 markupAttrMap = markupAttrs
932 else:
933 markupAttrMap = {}
934 for k,v in markupAttrs:
935 markupAttrMap[k] = v
936 attrValue = markupAttrMap.get(attr)
937 if not self._matches(attrValue, matchAgainst):
938 match = False
939 break
940 if match:
941 if markup:
942 found = markup
943 else:
944 found = markupName
945 return found
946
def search(self, markup):
    """Match a piece of markup against this strainer.

    markup may be an iterable of elements, a Tag, or a string
    (NavigableString or plain). Returns the matching object, or
    None when nothing matches. Raises Exception for any other kind
    of argument."""
    # Given a list of items, return the first text element in it
    # that matches.
    if hasattr(markup, "__iter__") and not isinstance(markup, Tag):
        for candidate in markup:
            if isinstance(candidate, NavigableString) \
                   and self.search(candidate):
                return candidate
        return None
    # A Tag has to match on name and attributes; Tags are ignored
    # altogether when we are searching for text.
    if isinstance(markup, Tag):
        if self.text:
            return None
        return self.searchTag(markup)
    # Text has to match the text criterion.
    if isinstance(markup, (NavigableString, basestring)):
        if self._matches(markup, self.text):
            return markup
        return None
    raise Exception("I don't know how to match against a %s"
                    % markup.__class__)
973
def _matches(self, markup, matchAgainst):
    """Decide whether markup satisfies the criterion matchAgainst.

    matchAgainst may be:
     * True -- anything that is not None matches;
     * a callable -- it is called with markup and its return value
       is the result;
     * an object with a 'match' attribute -- treated as a compiled
       regexp and applied with search();
     * a list-like or map-like object -- membership test;
     * anything else -- converted to the same string type as markup
       and compared for equality.

    Returns a true value on a match and a false value otherwise.
    The result is not always a bool: it can be a regexp match
    object or whatever a callable returned."""
    result = False
    if matchAgainst is True:
        result = markup is not None
    elif callable(matchAgainst):
        result = matchAgainst(markup)
    else:
        #Custom match methods take the tag as an argument, but all
        #other ways of matching match the tag name as a string.
        if isinstance(markup, Tag):
            markup = markup.name
        if markup and not isinstance(markup, basestring):
            markup = unicode(markup)
        #Now we know that markup is either a string, or None.
        if hasattr(matchAgainst, 'match'):
            # It's a regexp object.
            result = markup and matchAgainst.search(markup)
        elif hasattr(matchAgainst, '__iter__'): # list-like
            result = markup in matchAgainst
        elif hasattr(matchAgainst, 'items'):
            # Map-like object that is not iterable: match on its
            # keys. This branch used to call has_key() on markup,
            # which is a string (or None) and has no such method.
            result = markup in matchAgainst
        elif matchAgainst and isinstance(markup, basestring):
            if isinstance(markup, unicode):
                matchAgainst = unicode(matchAgainst)
            else:
                matchAgainst = str(matchAgainst)

        # Every failed test above falls back to plain equality.
        if not result:
            result = matchAgainst == markup
    return result
1005
class ResultSet(list):
    """A ResultSet is just a list that keeps track of the SoupStrainer
    that created it."""
    def __init__(self, source):
        # Initialise this list itself; the original initialised a
        # temporary empty list and left self untouched.
        list.__init__(self)
        # The object that produced these results.
        self.source = source
1012
1013 # Now, some helper functions.
1014
def buildTagMap(default, *args):
    """Fold any number of maps, lists, and scalars into one dictionary.

    A map contributes its own key/value pairs; every item of a list,
    and every scalar, becomes a key whose value is `default`. Later
    arguments override earlier ones. Used to build the
    SELF_CLOSING_TAGS, NESTABLE_TAGS, and NESTING_RESET_TAGS maps out
    of lists and partial maps."""
    tagMap = {}
    for piece in args:
        if hasattr(piece, 'items'):
            #A map: merge its pairs.
            tagMap.update(piece.items())
        elif hasattr(piece, '__iter__'):
            #A list: every item maps to the default.
            tagMap.update(dict.fromkeys(piece, default))
        else:
            #A scalar: it maps to the default.
            tagMap[piece] = default
    return tagMap
1033
1034 # Now, the parser classes.
1035
class BeautifulStoneSoup(Tag, SGMLParser):

    """This class contains the basic parser and search code. It defines
    a parser that knows nothing about tag behavior except for the
    following:

      You can't close a tag without closing all the tags it encloses.
      That is, "<foo><bar></foo>" actually means
      "<foo><bar></bar></foo>".

    [Another possible explanation is "<foo><bar /></foo>", but since
    this class defines no SELF_CLOSING_TAGS, it will never use that
    explanation.]

    This class is useful for parsing XML or made-up markup languages,
    or when BeautifulSoup makes an assumption counter to what you were
    expecting."""

    SELF_CLOSING_TAGS = {}
    NESTABLE_TAGS = {}
    RESET_NESTING_TAGS = {}
    QUOTE_TAGS = {}
    PRESERVE_WHITESPACE_TAGS = []

    # (regexp, replacement function) pairs applied to the markup
    # before it is handed to the parser; see __init__.
    MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
                       lambda x: x.group(1) + ' />'),
                      (re.compile(r'<!\s+([^<>]*)>'),
                       lambda x: '<!' + x.group(1) + '>')
                      ]

    ROOT_TAG_NAME = u'[document]'

    HTML_ENTITIES = "html"
    XML_ENTITIES = "xml"
    XHTML_ENTITIES = "xhtml"
    # TODO: This only exists for backwards-compatibility
    ALL_ENTITIES = XHTML_ENTITIES

    # Used when determining whether a text node is all whitespace and
    # can be replaced with a single space. A text node that contains
    # fancy Unicode spaces (usually non-breaking) should be left
    # alone.
    STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }

    def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
                 markupMassage=True, smartQuotesTo=XML_ENTITIES,
                 convertEntities=None, selfClosingTags=None, isHTML=False):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser.

        sgmllib will process most bad HTML, and the BeautifulSoup
        class has some tricks for dealing with some HTML that kills
        sgmllib, but Beautiful Soup can nonetheless choke or lose data
        if your data uses self-closing tags or declarations
        incorrectly.

        By default, Beautiful Soup uses regexes to sanitize input,
        avoiding the vast majority of these problems. If the problems
        don't apply to you, pass in False for markupMassage, and
        you'll get better performance.

        The default parser massage techniques fix the two most common
        instances of invalid HTML that choke sgmllib:

         <br/> (No space between name of closing tag and tag close)
         <! --Comment--> (Extraneous whitespace in declaration)

        You can pass in a custom list of (RE object, replace method)
        tuples to get Beautiful Soup to scrub your input the way you
        want.

        convertEntities may be one of HTML_ENTITIES, XML_ENTITIES or
        XHTML_ENTITIES; any true value also turns off smart-quote
        conversion. selfClosingTags adds per-instance self-closing
        tag names on top of SELF_CLOSING_TAGS."""

        self.parseOnlyThese = parseOnlyThese
        self.fromEncoding = fromEncoding
        self.smartQuotesTo = smartQuotesTo
        self.convertEntities = convertEntities
        # Set the rules for how we'll deal with the entities we
        # encounter
        if self.convertEntities:
            # It doesn't make sense to convert encoded characters to
            # entities even while you're converting entities to Unicode.
            # Just convert it all to Unicode.
            self.smartQuotesTo = None
            if convertEntities == self.HTML_ENTITIES:
                self.convertXMLEntities = False
                self.convertHTMLEntities = True
                self.escapeUnrecognizedEntities = True
            elif convertEntities == self.XHTML_ENTITIES:
                self.convertXMLEntities = True
                self.convertHTMLEntities = True
                self.escapeUnrecognizedEntities = False
            elif convertEntities == self.XML_ENTITIES:
                self.convertXMLEntities = True
                self.convertHTMLEntities = False
                self.escapeUnrecognizedEntities = False
        else:
            self.convertXMLEntities = False
            self.convertHTMLEntities = False
            self.escapeUnrecognizedEntities = False

        self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
        SGMLParser.__init__(self)

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        self.markup = markup
        self.markupMassage = markupMassage
        try:
            self._feed(isHTML=isHTML)
        except StopParsing:
            pass
        self.markup = None                 # The markup can now be GCed

    def convert_charref(self, name):
        """This method fixes a bug in Python's SGMLParser.

        Returns None unless name is a decimal number in the ASCII
        range; otherwise defers to convert_codepoint."""
        try:
            n = int(name)
        except ValueError:
            return
        if not 0 <= n <= 127 : # ASCII ends at 127, not 255
            return
        return self.convert_codepoint(n)

    def _feed(self, inDocumentEncoding=None, isHTML=False):
        """Convert self.markup to Unicode, apply the markup massage,
        reset the parser state and parse the whole document."""
        # Convert the document to Unicode.
        markup = self.markup
        if isinstance(markup, unicode):
            if not hasattr(self, 'originalEncoding'):
                self.originalEncoding = None
        else:
            dammit = UnicodeDammit\
                     (markup, [self.fromEncoding, inDocumentEncoding],
                      smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
            markup = dammit.unicode
            self.originalEncoding = dammit.originalEncoding
            self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
        if markup:
            if self.markupMassage:
                # Any true value that is not a list of fixes selects
                # the default fixes.
                if not hasattr(self.markupMassage, "__iter__"):
                    self.markupMassage = self.MARKUP_MASSAGE
                for fix, m in self.markupMassage:
                    markup = fix.sub(m, markup)
                # TODO: We get rid of markupMassage so that the
                # soup object can be deepcopied later on. Some
                # Python installations can't copy regexes. If anyone
                # was relying on the existence of markupMassage, this
                # might cause problems.
                del(self.markupMassage)
        self.reset()

        SGMLParser.feed(self, markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def __getattr__(self, methodName):
        """This method routes method call requests to either the SGMLParser
        superclass or the Tag superclass, depending on the method name."""

        if methodName.startswith('start_') or methodName.startswith('end_') \
               or methodName.startswith('do_'):
            return SGMLParser.__getattr__(self, methodName)
        elif not methodName.startswith('__'):
            return Tag.__getattr__(self, methodName)
        else:
            raise AttributeError(methodName)

    def isSelfClosingTag(self, name):
        """Returns true iff the given string is the name of a
        self-closing tag according to this parser."""
        return name in self.SELF_CLOSING_TAGS \
               or name in self.instanceSelfClosingTags

    def reset(self):
        """Re-initialise this object as the hidden root tag and clear
        all parser state."""
        Tag.__init__(self, self, self.ROOT_TAG_NAME)
        self.hidden = 1
        SGMLParser.reset(self)
        self.currentData = []
        self.currentTag = None
        self.tagStack = []
        self.quoteStack = []
        self.pushTag(self)

    def popTag(self):
        """Remove the innermost open tag from the stack and return the
        tag that is current afterwards."""
        tag = self.tagStack.pop()

        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        """Append tag to the contents of the current tag (if there is
        one) and make it the current tag."""
        if self.currentTag:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]

    def endData(self, containerClass=NavigableString):
        """Turn the text collected so far into one containerClass
        object and attach it to the current tag."""
        if self.currentData:
            currentData = u''.join(self.currentData)
            # Collapse a run of pure ASCII whitespace to one character,
            # unless an enclosing tag preserves whitespace.
            if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
                not set([tag.name for tag in self.tagStack]).intersection(
                    self.PRESERVE_WHITESPACE_TAGS)):
                if '\n' in currentData:
                    currentData = '\n'
                else:
                    currentData = ' '
            self.currentData = []
            # Top-level text is dropped when parseOnlyThese is set and
            # does not accept it.
            if self.parseOnlyThese and len(self.tagStack) <= 1 and \
                   (not self.parseOnlyThese.text or \
                    not self.parseOnlyThese.search(currentData)):
                return
            o = containerClass(currentData)
            o.setup(self.currentTag, self.previous)
            if self.previous:
                self.previous.next = o
            self.previous = o
            self.currentTag.contents.append(o)


    def _popToTag(self, name, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instance of
        the given tag."""
        if name == self.ROOT_TAG_NAME:
            return

        numPops = 0
        mostRecentTag = None
        # Index 0 is the root tag, which is never popped here.
        for i in range(len(self.tagStack)-1, 0, -1):
            if name == self.tagStack[i].name:
                numPops = len(self.tagStack)-i
                break
        if not inclusivePop:
            numPops = numPops - 1

        for i in range(0, numPops):
            mostRecentTag = self.popTag()
        return mostRecentTag

    def _smartPop(self, name):

        """We need to pop up to the previous tag of this type, unless
        one of this tag's nesting reset triggers comes between this
        tag and the previous tag of this type, OR unless this tag is a
        generic nesting trigger and another generic nesting trigger
        comes between this tag and the previous tag of this type.

        Examples:
         <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
         <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
         <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.

         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
         <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
         <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
        """

        nestingResetTriggers = self.NESTABLE_TAGS.get(name)
        isNestable = nestingResetTriggers is not None
        isResetNesting = name in self.RESET_NESTING_TAGS
        popTo = None
        inclusive = True
        for i in range(len(self.tagStack)-1, 0, -1):
            p = self.tagStack[i]
            if (not p or p.name == name) and not isNestable:
                #Non-nestable tags get popped to the top or to their
                #last occurrence.
                popTo = name
                break
            if (nestingResetTriggers is not None
                and p.name in nestingResetTriggers) \
                or (nestingResetTriggers is None and isResetNesting
                    and p.name in self.RESET_NESTING_TAGS):

                #If we encounter one of the nesting reset triggers
                #peculiar to this tag, or we encounter another tag
                #that causes nesting to reset, pop up to but not
                #including that tag.
                popTo = p.name
                inclusive = False
                break
        if popTo:
            self._popToTag(popTo, inclusive)

    def unknown_starttag(self, name, attrs, selfClosing=0):
        """Handle a start tag: create a Tag, link it into the tree and
        push it. Returns the new Tag, or None when the tag was treated
        as text or filtered out by parseOnlyThese."""
        if self.quoteStack:
            #This is not a real tag.
            attrs = ''.join([' %s="%s"' % (x, y) for x, y in attrs])
            self.handle_data('<%s%s>' % (name, attrs))
            return
        self.endData()

        if not self.isSelfClosingTag(name) and not selfClosing:
            self._smartPop(name)

        if self.parseOnlyThese and len(self.tagStack) <= 1 \
               and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
            return

        tag = Tag(self, name, attrs, self.currentTag, self.previous)
        if self.previous:
            self.previous.next = tag
        self.previous = tag
        self.pushTag(tag)
        if selfClosing or self.isSelfClosingTag(name):
            self.popTag()
        if name in self.QUOTE_TAGS:
            # Everything up to the matching end tag is literal text.
            self.quoteStack.append(name)
            self.literal = 1
        return tag

    def unknown_endtag(self, name):
        """Handle an end tag by popping the stack to the matching
        start tag, or by treating it as text inside a quote tag."""
        if self.quoteStack and self.quoteStack[-1] != name:
            #This is not a real end tag.
            self.handle_data('</%s>' % name)
            return
        self.endData()
        self._popToTag(name)
        if self.quoteStack and self.quoteStack[-1] == name:
            self.quoteStack.pop()
            self.literal = (len(self.quoteStack) > 0)

    def handle_data(self, data):
        """Collect a piece of text; endData joins the pieces."""
        self.currentData.append(data)

    def _toStringSubclass(self, text, subclass):
        """Adds a certain piece of text to the tree as a NavigableString
        subclass."""
        self.endData()
        self.handle_data(text)
        self.endData(subclass)

    def handle_pi(self, text):
        """Handle a processing instruction as a ProcessingInstruction
        object, possibly one with a %SOUP-ENCODING% slot into which an
        encoding will be plugged later."""
        if text[:3] == "xml":
            text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
        self._toStringSubclass(text, ProcessingInstruction)

    def handle_comment(self, text):
        "Handle comments as Comment objects."
        self._toStringSubclass(text, Comment)

    def handle_charref(self, ref):
        """Handle character references as data.

        ref is the text between '&#' and ';'. When entity conversion
        is on, decimal ('65') and hexadecimal ('x41') references are
        turned into the character they name. A reference that cannot
        be converted (malformed, or outside the range unichr accepts)
        is passed through as the literal text '&#ref;' instead of
        raising and aborting the parse."""
        data = None
        if self.convertEntities:
            try:
                if ref[:1] in ('x', 'X'):
                    codepoint = int(ref[1:], 16)
                else:
                    codepoint = int(ref)
                data = unichr(codepoint)
            except (ValueError, OverflowError):
                data = None
        if data is None:
            data = '&#%s;' % ref
        self.handle_data(data)

    def handle_entityref(self, ref):
        """Handle entity references as data, possibly converting known
        HTML and/or XML entity references to the corresponding Unicode
        characters."""
        data = None
        if self.convertHTMLEntities:
            try:
                data = unichr(name2codepoint[ref])
            except KeyError:
                pass

        if not data and self.convertXMLEntities:
                data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)

        if not data and self.convertHTMLEntities and \
            not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
                # TODO: We've got a problem here. We're told this is
                # an entity reference, but it's not an XML entity
                # reference or an HTML entity reference. Nonetheless,
                # the logical thing to do is to pass it through as an
                # unrecognized entity reference.
                #
                # Except: when the input is "&carol;" this function
                # will be called with input "carol". When the input is
                # "AT&T", this function will be called with input
                # "T". We have no way of knowing whether a semicolon
                # was present originally, so we don't know whether
                # this is an unknown entity or just a misplaced
                # ampersand.
                #
                # The more common case is a misplaced ampersand, so I
                # escape the ampersand and omit the trailing semicolon.
                data = "&amp;%s" % ref
        if not data:
            # This case is different from the one above, because we
            # haven't already gone through a supposedly comprehensive
            # mapping of entities to Unicode characters. We might not
            # have gone through any mapping at all. So the chances are
            # very high that this is a real entity, and not a
            # misplaced ampersand.
            data = "&%s;" % ref
        self.handle_data(data)

    def handle_decl(self, data):
        "Handle DOCTYPEs and the like as Declaration objects."
        self._toStringSubclass(data, Declaration)

    def parse_declaration(self, i):
        """Treat a bogus SGML declaration as raw data. Treat a CDATA
        declaration as a CData object.

        i is the offset of the declaration in self.rawdata; the
        return value is the offset at which parsing continues."""
        j = None
        if self.rawdata[i:i+9] == '<![CDATA[':
             k = self.rawdata.find(']]>', i)
             if k == -1:
                 # Unterminated section: take the rest of the input.
                 k = len(self.rawdata)
             data = self.rawdata[i+9:k]
             j = k+3
             self._toStringSubclass(data, CData)
        else:
            try:
                j = SGMLParser.parse_declaration(self, i)
            except SGMLParseError:
                toHandle = self.rawdata[i:]
                self.handle_data(toHandle)
                j = i + len(toHandle)
        return j
1466
class BeautifulSoup(BeautifulStoneSoup):

    """This parser knows the following facts about HTML:

    * Some tags have no closing tag and should be interpreted as being
      closed as soon as they are encountered.

    * The text inside some tags (ie. 'script') may contain tags which
      are not really part of the document and which should be parsed
      as text, not tags. If you want to parse the text as tags, you can
      always fetch it and parse it explicitly.

    * Tag nesting rules:

      Most tags can't be nested at all. For instance, the occurrence of
      a <p> tag should implicitly close the previous <p> tag.

       <p>Para1<p>Para2
        should be transformed into:
       <p>Para1</p><p>Para2

      Some tags can be nested arbitrarily. For instance, the occurrence
      of a <blockquote> tag should _not_ implicitly close the previous
      <blockquote> tag.

       Alice said: <blockquote>Bob said: <blockquote>Blah
        should NOT be transformed into:
       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah

      Some tags can be nested, but the nesting is reset by the
      interposition of other tags. For instance, a <tr> tag should
      implicitly close the previous <tr> tag within the same <table>,
      but not close a <tr> tag in another table.

       <table><tr>Blah<tr>Blah
        should be transformed into:
       <table><tr>Blah</tr><tr>Blah
        but,
       <tr>Blah<table><tr>Blah
        should NOT be transformed into
       <tr>Blah<table></tr><tr>Blah

    Differing assumptions about tag nesting rules are a major source
    of problems with the BeautifulSoup class. If BeautifulSoup is not
    treating as nestable a tag your page author treats as nestable,
    try ICantBelieveItsBeautifulSoup, MinimalSoup, or
    BeautifulStoneSoup before writing your own subclass."""

    def __init__(self, *args, **kwargs):
        """Same arguments as BeautifulStoneSoup, except that smart
        quotes become HTML entities unless the caller says otherwise
        and the document is always treated as HTML."""
        kwargs.setdefault('smartQuotesTo', self.HTML_ENTITIES)
        kwargs['isHTML'] = True
        BeautifulStoneSoup.__init__(self, *args, **kwargs)

    SELF_CLOSING_TAGS = buildTagMap(
        None,
        ('br' , 'hr', 'input', 'img', 'meta',
         'spacer', 'link', 'frame', 'base', 'col'))

    PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])

    QUOTE_TAGS = {'script' : None, 'textarea' : None}

    #The HTML standard lets each of these inline tags contain another
    #tag of the same type, and pages commonly do use them that way.
    NESTABLE_INLINE_TAGS = ('span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
                            'center')

    #The HTML standard lets each of these block tags contain another
    #tag of the same type, and pages commonly do use them that way.
    NESTABLE_BLOCK_TAGS = ('blockquote', 'div', 'fieldset', 'ins', 'del')

    #Lists can contain other lists, but there are restrictions.
    NESTABLE_LIST_TAGS = {
        'ol' : [],
        'ul' : [],
        'li' : ['ul', 'ol'],
        'dl' : [],
        'dd' : ['dl'],
        'dt' : ['dl'],
        }

    #Tables can contain other tables, but there are restrictions.
    NESTABLE_TABLE_TAGS = {
        'table' : [],
        'tr' : ['table', 'tbody', 'tfoot', 'thead'],
        'td' : ['tr'],
        'th' : ['tr'],
        'thead' : ['table'],
        'tbody' : ['table'],
        'tfoot' : ['table'],
        }

    NON_NESTABLE_BLOCK_TAGS = ('address', 'form', 'p', 'pre')

    #If one of these tags is encountered, all tags up to the next tag of
    #this type are popped.
    RESET_NESTING_TAGS = buildTagMap(
        None,
        NESTABLE_BLOCK_TAGS,
        'noscript',
        NON_NESTABLE_BLOCK_TAGS,
        NESTABLE_LIST_TAGS,
        NESTABLE_TABLE_TAGS)

    NESTABLE_TAGS = buildTagMap(
        [],
        NESTABLE_INLINE_TAGS,
        NESTABLE_BLOCK_TAGS,
        NESTABLE_LIST_TAGS,
        NESTABLE_TABLE_TAGS)

    # Used to detect the charset in a META tag; see start_meta
    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)

    def start_meta(self, attrs):
        """Beautiful Soup can detect a charset included in a META tag,
        try to convert the document to that charset, and re-parse the
        document from the beginning."""
        httpEquiv = None
        contentType = None
        contentTypeIndex = None
        needsSubstitution = False

        for position, (key, value) in enumerate(attrs):
            key = key.lower()
            if key == 'http-equiv':
                httpEquiv = value
            elif key == 'content':
                contentType = value
                contentTypeIndex = position

        charsetMatch = None
        if httpEquiv and contentType:  # It's an interesting meta tag.
            charsetMatch = self.CHARSET_RE.search(contentType)
        if charsetMatch:
            if (self.declaredHTMLEncoding is not None or
                self.originalEncoding == self.fromEncoding):
                # An HTML encoding was sniffed while converting
                # the document to Unicode, or an HTML encoding was
                # sniffed during a previous pass through the
                # document, or an encoding was specified
                # explicitly and it worked. Rewrite the meta tag.
                newAttr = self.CHARSET_RE.sub(
                    lambda m: m.group(1) + "%SOUP-ENCODING%", contentType)
                attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
                                           newAttr)
                needsSubstitution = True
            else:
                # This is our first pass through the document.
                # Go through it again with the encoding information.
                newCharset = charsetMatch.group(3)
                if newCharset and newCharset != self.originalEncoding:
                    self.declaredHTMLEncoding = newCharset
                    self._feed(self.declaredHTMLEncoding)
                    raise StopParsing
        tag = self.unknown_starttag("meta", attrs)
        if tag and needsSubstitution:
            tag.containsSubstitutions = True
1619
class StopParsing(Exception):
    """Raised to abandon the current parse; BeautifulStoneSoup.__init__
    catches it around its call to _feed."""
1622
class ICantBelieveItsBeautifulSoup(BeautifulSoup):

    """The BeautifulSoup class is oriented towards skipping over
    common HTML errors like unclosed tags. However, sometimes it makes
    errors of its own. For instance, consider this fragment:

     <b>Foo<b>Bar</b></b>

    This is perfectly valid (if bizarre) HTML. However, the
    BeautifulSoup class will implicitly close the first b tag when it
    encounters the second 'b'. It will think the author wrote
    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
    there's no real-world reason to bold something that's already
    bold. When it encounters '</b></b>' it will close two more 'b'
    tags, for a grand total of three tags closed instead of two. This
    can throw off the rest of your document structure. The same is
    true of a number of other tags, listed below.

    It's much more common for someone to forget to close a 'b' tag
    than to actually use nested 'b' tags, and the BeautifulSoup class
    handles the common case. This class handles the not-so-common
    case: where you can't believe someone wrote what they did, but
    it's valid HTML and BeautifulSoup screwed up by assuming it
    wouldn't be."""

    # Inline tags that this class, unlike BeautifulSoup, lets nest
    # inside themselves.
    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = (
        'em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
        'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
        'big')

    # Block tags that this class, unlike BeautifulSoup, lets nest
    # inside themselves.
    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ('noscript',)

    NESTABLE_TAGS = buildTagMap(
        [],
        BeautifulSoup.NESTABLE_TAGS,
        I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
        I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
1658
class MinimalSoup(BeautifulSoup):
    """The MinimalSoup class is for parsing HTML that contains
    pathologically bad markup. It makes no assumptions about tag
    nesting, but it does know which tags are self-closing, that
    <script> tags contain Javascript and should not be parsed, that
    META tags may contain encoding information, and so on.

    This also makes it better for subclassing than BeautifulStoneSoup
    or BeautifulSoup."""

    # buildTagMap takes the default value first. The original call,
    # buildTagMap('noscript'), passed the tag name as that default and
    # so produced an empty map.
    RESET_NESTING_TAGS = buildTagMap(None, 'noscript')
    NESTABLE_TAGS = {}
1671
class BeautifulSOAP(BeautifulStoneSoup):
    """This class will push a tag with only a single string child into
    the tag's parent as an attribute. The attribute's name is the tag
    name, and the value is the string child. An example should give
    the flavor of the change:

    <foo><bar>baz</bar></foo>
     =>
    <foo bar="baz"><bar>baz</bar></foo>

    You can then access fooTag['bar'] instead of fooTag.barTag.string.

    This is, of course, useful for scraping structures that tend to
    use subelements instead of attributes, such as SOAP messages. Note
    that it modifies its input, so don't print the modified version
    out.

    I'm not sure how many people really want to use this class; let me
    know if you do. Mainly I like the name."""

    def popTag(self):
        """Before popping, copy a tag whose only child is a string
        into its parent as an attribute, unless the parent already
        has an attribute of that name.

        Returns what BeautifulStoneSoup.popTag returns, so callers
        that use the result (such as _popToTag) behave the same as
        with the base class."""
        if len(self.tagStack) > 1:
            tag = self.tagStack[-1]
            parent = self.tagStack[-2]
            parent._getAttrMap()
            if (isinstance(tag, Tag) and len(tag.contents) == 1 and
                isinstance(tag.contents[0], NavigableString) and
                tag.name not in parent.attrMap):
                parent[tag.name] = tag.contents[0]
        return BeautifulStoneSoup.popTag(self)
1702
1703 #Enterprise class names! It has come to our attention that some people
1704 #think the names of the Beautiful Soup parser classes are too silly
1705 #and "unprofessional" for use in enterprise screen-scraping. We feel
1706 #your pain! For such-minded folk, the Beautiful Soup Consortium And
1707 #All-Night Kosher Bakery recommends renaming this file to
1708 #"RobustParser.py" (or, in cases of extreme enterprisiness,
1709 #"RobustParserBeanInterface.class") and using the following
1710 #enterprise-friendly class aliases:
class RobustXMLParser(BeautifulStoneSoup):
    """Enterprise-friendly alias for BeautifulStoneSoup."""
class RobustHTMLParser(BeautifulSoup):
    """Enterprise-friendly alias for BeautifulSoup."""
class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
    """Enterprise-friendly alias for ICantBelieveItsBeautifulSoup."""
class RobustInsanelyWackAssHTMLParser(MinimalSoup):
    """Enterprise-friendly alias for MinimalSoup."""
class SimplifyingSOAPParser(BeautifulSOAP):
    """Enterprise-friendly alias for BeautifulSOAP."""
1721
1722 ######################################################
1723 #
1724 # Bonus library: Unicode, Dammit
1725 #
1726 # This class forces XML data into a standard format (usually to UTF-8
1727 # or Unicode). It is heavily based on code from Mark Pilgrim's
1728 # Universal Feed Parser. It does not rewrite the XML or HTML to
1729 # reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
1730 # (XML) and BeautifulSoup.start_meta (HTML).
1731
1732 # Autodetects character encodings.
1733 # Download from http://chardet.feedparser.org/
1734 try:
1735 import chardet
1736 # import chardet.constants
1737 # chardet.constants._debug = 1
1738 except ImportError:
1739 chardet = None
1740
1741 # cjkcodecs and iconv_codec make Python know about more character encodings.
1742 # Both are available from http://cjkpython.i18n.org/
1743 # They're built in if you use Python 2.4.
1744 try:
1745 import cjkcodecs.aliases
1746 except ImportError:
1747 pass
1748 try:
1749 import iconv_codec
1750 except ImportError:
1751 pass
1752
1753 class UnicodeDammit:
1754 """A class for detecting the encoding of a *ML document and
1755 converting it to a Unicode string. If the source encoding is
1756 windows-1252, can replace MS smart quotes with their HTML or XML
1757 equivalents."""
1758
1759 # This dictionary maps commonly seen values for "charset" in HTML
1760 # meta tags to the corresponding Python codec names. It only covers
1761 # values that aren't in Python's aliases and can't be determined
1762 # by the heuristics in find_codec.
1763 CHARSET_ALIASES = { "macintosh" : "mac-roman",
1764 "x-sjis" : "shift-jis" }
1765
def __init__(self, markup, overrideEncodings=[],
             smartQuotesTo='xml', isHTML=False):
    """Convert markup to Unicode, trying candidate encodings in
    order: overrideEncodings, then the encodings reported by
    _detectEncoding, then chardet's guess (when chardet is
    installed), then utf-8 and windows-1252.

    The result is stored in self.unicode; it is left false when no
    candidate worked, in which case self.originalEncoding is None."""
    self.declaredHTMLEncoding = None
    self.markup, documentEncoding, sniffedEncoding = \
                 self._detectEncoding(markup, isHTML)
    self.smartQuotesTo = smartQuotesTo
    self.triedEncodings = []
    if markup == '' or isinstance(markup, unicode):
        # Nothing to decode.
        self.originalEncoding = None
        self.unicode = unicode(markup)
        return

    def tryEach(encodings, lastResult):
        # Try the encodings in order, stopping at the first success.
        # Returns the result of the last attempt, or lastResult
        # unchanged when there was nothing to try.
        for encoding in encodings:
            lastResult = self._convertFrom(encoding)
            if lastResult:
                break
        return lastResult

    converted = tryEach(overrideEncodings, None)
    if not converted:
        converted = tryEach((documentEncoding, sniffedEncoding), converted)

    # If no luck and we have auto-detection library, try that:
    if not converted and chardet and not isinstance(self.markup, unicode):
        converted = self._convertFrom(chardet.detect(self.markup)['encoding'])

    # As a last resort, try utf-8 and windows-1252:
    if not converted:
        converted = tryEach(("utf-8", "windows-1252"), converted)

    self.unicode = converted
    if not converted:
        self.originalEncoding = None
1799
def _subMSChar(self, orig):
    """Look up the replacement for a MS smart quote character.

    A tuple entry in MS_CHARS is turned into an XML numeric entity
    (from its second item) when smartQuotesTo is 'xml', and into a
    named entity (from its first item) otherwise. Any other entry,
    including a missing one (None), is returned as it is."""
    replacement = self.MS_CHARS.get(orig)
    if not isinstance(replacement, tuple):
        return replacement
    if self.smartQuotesTo == 'xml':
        return '&#x%s;' % replacement[1]
    return '&%s;' % replacement[0]
1810
def _convertFrom(self, proposed):
    """Try to decode self.markup using the proposed encoding.

    Returns the decoded markup on success, after storing it in
    self.markup and recording the codec in self.originalEncoding.
    Returns None when the codec is unknown, has already been tried,
    or the conversion raises."""
    codec = self.find_codec(proposed)
    if not codec or codec in self.triedEncodings:
        return None
    self.triedEncodings.append(codec)
    data = self.markup

    # Convert smart quotes to HTML if coming from an encoding
    # that might have them.
    if self.smartQuotesTo and codec.lower() in ("windows-1252",
                                                "iso-8859-1",
                                                "iso-8859-2"):
        smartQuoteRe = re.compile("([\x80-\x9f])")
        def replaceSmartQuote(match):
            return self._subMSChar(match.group(1))
        data = smartQuoteRe.sub(replaceSmartQuote, data)

    try:
        decoded = self._toUnicode(data, codec)
        self.markup = decoded
        self.originalEncoding = codec
    except Exception:
        # Best effort: any failure just means this codec is wrong.
        return None
    return self.markup
1838
def _toUnicode(self, data, encoding):
    '''Given a string and its encoding, decodes the string into Unicode.
    %encoding is a string recognized by encodings.aliases

    A leading Byte Order Mark, if present, is stripped and overrides
    the encoding that was passed in.'''

    # (BOM, encoding it implies, is it a UTF-16 BOM?). Order matters:
    # the UTF-16 marks are checked first, and they count only when the
    # data is at least four bytes long and the two bytes after the
    # mark are not both NUL.
    byteOrderMarks = (
        ('\xfe\xff', 'utf-16be', True),
        ('\xff\xfe', 'utf-16le', True),
        ('\xef\xbb\xbf', 'utf-8', False),
        ('\x00\x00\xfe\xff', 'utf-32be', False),
        ('\xff\xfe\x00\x00', 'utf-32le', False),
        )
    for mark, markEncoding, isUtf16 in byteOrderMarks:
        if data[:len(mark)] != mark:
            continue
        if isUtf16 and (len(data) < 4 or data[2:4] == '\x00\x00'):
            continue
        encoding = markEncoding
        data = data[len(mark):]
        break
    return unicode(data, encoding)
1863
1864 def _detectEncoding(self, xml_data, isHTML=False):
1865 """Given a document, tries to detect its XML encoding."""
1866 xml_encoding = sniffed_xml_encoding = None
1867 try:
1868 if xml_data[:4] == '\x4c\x6f\xa7\x94':
1869 # EBCDIC
1870 xml_data = self._ebcdic_to_ascii(xml_data)
1871 elif xml_data[:4] == '\x00\x3c\x00\x3f':
1872 # UTF-16BE
1873 sniffed_xml_encoding = 'utf-16be'
1874 xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
1875 elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
1876 and (xml_data[2:4] != '\x00\x00'):
1877 # UTF-16BE with BOM
1878 sniffed_xml_encoding = 'utf-16be'
1879 xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
1880 elif xml_data[:4] == '\x3c\x00\x3f\x00':
1881 # UTF-16LE
1882 sniffed_xml_encoding = 'utf-16le'
1883 xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
1884 elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
1885 (xml_data[2:4] != '\x00\x00'):
1886 # UTF-16LE with BOM
1887 sniffed_xml_encoding = 'utf-16le'
1888 xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
1889 elif xml_data[:4] == '\x00\x00\x00\x3c':
1890 # UTF-32BE
1891 sniffed_xml_encoding = 'utf-32be'
1892 xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
1893 elif xml_data[:4] == '\x3c\x00\x00\x00':
1894 # UTF-32LE
1895 sniffed_xml_encoding = 'utf-32le'
1896 xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
1897 elif xml_data[:4] == '\x00\x00\xfe\xff':
1898 # UTF-32BE with BOM
1899 sniffed_xml_encoding = 'utf-32be'
1900 xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
1901 elif xml_data[:4] == '\xff\xfe\x00\x00':
1902 # UTF-32LE with BOM
1903 sniffed_xml_encoding = 'utf-32le'
1904 xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
1905 elif xml_data[:3] == '\xef\xbb\xbf':
1906 # UTF-8 with BOM
1907 sniffed_xml_encoding = 'utf-8'
1908 xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
1909 else:
1910 sniffed_xml_encoding = 'ascii'
1911 pass
1912 except:
1913 xml_encoding_match = None
1914 xml_encoding_match = re.compile(
1915 '^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
1916 if not xml_encoding_match and isHTML:
1917 regexp = re.compile('<\s*meta[^>]+charset=([^>]*?)[;\'">]', re.I)
1918 xml_encoding_match = regexp.search(xml_data)
1919 if xml_encoding_match is not None:
1920 xml_encoding = xml_encoding_match.groups()[0].lower()
1921 if isHTML:
1922 self.declaredHTMLEncoding = xml_encoding
1923 if sniffed_xml_encoding and \
1924 (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
1925 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
1926 'utf-16', 'utf-32', 'utf_16', 'utf_32',
1927 'utf16', 'u16')):
1928 xml_encoding = sniffed_xml_encoding
1929 return xml_data, xml_encoding, sniffed_xml_encoding
1930
1931
def find_codec(self, charset):
    """Return a codec name that self._codec() accepts for `charset`.

    Candidates are tried in order: the entry for `charset` in
    self.CHARSET_ALIASES (or `charset` itself when it has no alias),
    `charset` with its hyphens removed, and `charset` with its
    hyphens turned into underscores. When none is accepted, or
    `charset` is empty, `charset` is returned unchanged."""
    found = self._codec(self.CHARSET_ALIASES.get(charset, charset))
    if found:
        return found
    if charset:
        for hyphen_substitute in ("", "_"):
            found = self._codec(charset.replace("-", hyphen_substitute))
            if found:
                return found
    return charset
1937
1938 def _codec(self, charset):
1939 if not charset: return charset
1940 codec = None
1941 try:
1942 codecs.lookup(charset)
1943 codec = charset
1944 except (LookupError, ValueError):
1945 pass
1946 return codec
1947
1948 EBCDIC_TO_ASCII_MAP = None
1949 def _ebcdic_to_ascii(self, s):
1950 c = self.__class__
1951 if not c.EBCDIC_TO_ASCII_MAP:
1952 emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
1953 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
1954 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
1955 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
1956 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
1957 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
1958 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
1959 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
1960 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
1961 201,202,106,107,108,109,110,111,112,113,114,203,204,205,
1962 206,207,208,209,126,115,116,117,118,119,120,121,122,210,
1963 211,212,213,214,215,216,217,218,219,220,221,222,223,224,
1964 225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
1965 73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
1966 82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
1967 90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
1968 250,251,252,253,254,255)
1969 import string
1970 c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
1971 ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
1972 return s.translate(c.EBCDIC_TO_ASCII_MAP)
1973
# Replacements for the characters \x80-\x9f, consumed by _subMSChar().
# A tuple is (HTML entity name, hexadecimal code point): the name is used
# for '&name;' output and the code point for '&#xHEX;' output. A plain
# string is substituted literally.
MS_CHARS = { '\x80' : ('euro', '20AC'),
             '\x81' : ' ',
             '\x82' : ('sbquo', '201A'),
             '\x83' : ('fnof', '192'),
             '\x84' : ('bdquo', '201E'),
             '\x85' : ('hellip', '2026'),
             '\x86' : ('dagger', '2020'),
             '\x87' : ('Dagger', '2021'),
             '\x88' : ('circ', '2C6'),
             '\x89' : ('permil', '2030'),
             '\x8A' : ('Scaron', '160'),
             '\x8B' : ('lsaquo', '2039'),
             '\x8C' : ('OElig', '152'),
             '\x8D' : '?',
             # No named entity here, so the "name" slot holds a numeric
             # reference body and both output forms give '&#x17D;'.
             '\x8E' : ('#x17D', '17D'),
             '\x8F' : '?',
             '\x90' : '?',
             '\x91' : ('lsquo', '2018'),
             '\x92' : ('rsquo', '2019'),
             '\x93' : ('ldquo', '201C'),
             '\x94' : ('rdquo', '201D'),
             '\x95' : ('bull', '2022'),
             '\x96' : ('ndash', '2013'),
             '\x97' : ('mdash', '2014'),
             '\x98' : ('tilde', '2DC'),
             '\x99' : ('trade', '2122'),
             '\x9a' : ('scaron', '161'),
             '\x9b' : ('rsaquo', '203A'),
             '\x9c' : ('oelig', '153'),
             '\x9d' : '?',
             '\x9e' : ('#x17E', '17E'),
             # U+0178; an empty code point here produced the malformed
             # reference '&#x;' in XML mode.
             '\x9f' : ('Yuml', '178'),}
2006
2007 #######################################################################
2008
2009
# Run as a script: read an HTML document from standard input and write
# the pretty-printed parse tree to standard output.
if __name__ == '__main__':
    import sys
    print(BeautifulSoup(sys.stdin).prettify())