comparison BeautifulSoup.py @ 32:03c22b722882

remove BeautifulSoup dependency
author Richard Burhans <burhans@bx.psu.edu>
date Fri, 20 Sep 2013 13:54:23 -0400
parents
children
comparison
equal deleted inserted replaced
31:a631c2f6d913 32:03c22b722882
1 """Beautiful Soup
2 Elixir and Tonic
3 "The Screen-Scraper's Friend"
4 http://www.crummy.com/software/BeautifulSoup/
5
6 Beautiful Soup parses a (possibly invalid) XML or HTML document into a
7 tree representation. It provides methods and Pythonic idioms that make
8 it easy to navigate, search, and modify the tree.
9
10 A well-formed XML/HTML document yields a well-formed data
11 structure. An ill-formed XML/HTML document yields a correspondingly
12 ill-formed data structure. If your document is only locally
13 well-formed, you can use this library to find and process the
14 well-formed part of it.
15
16 Beautiful Soup works with Python 2.2 and up. It has no external
17 dependencies, but you'll have more success at converting data to UTF-8
18 if you also install these three packages:
19
20 * chardet, for auto-detecting character encodings
21 http://chardet.feedparser.org/
22 * cjkcodecs and iconv_codec, which add more encodings to the ones supported
23 by stock Python.
24 http://cjkpython.i18n.org/
25
26 Beautiful Soup defines classes for two main parsing strategies:
27
28 * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
29 language that kind of looks like XML.
30
31 * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
32 or invalid. This class has web browser-like heuristics for
33 obtaining a sensible parse tree in the face of common HTML errors.
34
35 Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
36 the encoding of an HTML or XML document, and converting it to
37 Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
38
39 For more than you ever wanted to know about Beautiful Soup, see the
40 documentation:
41 http://www.crummy.com/software/BeautifulSoup/documentation.html
42
43 Here, have some legalese:
44
45 Copyright (c) 2004-2010, Leonard Richardson
46
47 All rights reserved.
48
49 Redistribution and use in source and binary forms, with or without
50 modification, are permitted provided that the following conditions are
51 met:
52
53 * Redistributions of source code must retain the above copyright
54 notice, this list of conditions and the following disclaimer.
55
56 * Redistributions in binary form must reproduce the above
57 copyright notice, this list of conditions and the following
58 disclaimer in the documentation and/or other materials provided
59 with the distribution.
60
61 * Neither the name of the the Beautiful Soup Consortium and All
62 Night Kosher Bakery nor the names of its contributors may be
63 used to endorse or promote products derived from this software
64 without specific prior written permission.
65
66 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
67 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
68 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
69 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
70 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
71 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
72 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
73 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
74 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
75 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
76 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
77
78 """
from __future__ import generators

__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "3.2.1"
__copyright__ = "Copyright (c) 2004-2012 Leonard Richardson"
__license__ = "New-style BSD"

from sgmllib import SGMLParser, SGMLParseError
import codecs
import markupbase
import types
import re
import sgmllib
try:
    # name2codepoint maps HTML entity names to Unicode code points; fall
    # back to an empty map if the module is unavailable.
    from htmlentitydefs import name2codepoint
except ImportError:
    name2codepoint = {}
try:
    # Very old Pythons (pre-2.4) have no builtin set; fall back to the
    # deprecated 'sets' module.
    set
except NameError:
    from sets import Set as set

#These hacks make Beautiful Soup able to parse XML with namespaces:
#sgmllib's default tag-name and declaration-name patterns reject ':',
#so we widen them to accept namespace-qualified names.
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match

# Encoding used when rendering the tree back out as a byte string.
DEFAULT_OUTPUT_ENCODING = "utf-8"
106
107 def _match_css_class(str):
108 """Build a RE to match the given CSS class."""
109 return re.compile(r"(^|.*\s)%s($|\s)" % str)
110
111 # First, the classes that represent markup elements.
112
class PageElement(object):
    """Contains the navigational information for some part of the page
    (either a tag or a piece of text)"""

    def _invert(h):
        "Cheap function to invert a hash."
        # Runs once, at class-definition time, to build the reverse map
        # below; deliberately not a method (takes no self).
        i = {}
        for k,v in h.items():
            i[v] = k
        return i

    # The five predefined XML entities and the literal characters they
    # stand for.
    XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
                                      "quot" : '"',
                                      "amp" : "&",
                                      "lt" : "<",
                                      "gt" : ">" }

    # Reverse mapping: literal character -> entity name.
    XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)

    def setup(self, parent=None, previous=None):
        """Sets up the initial relations between this element and
        other elements."""
        self.parent = parent
        self.previous = previous
        self.next = None
        self.previousSibling = None
        self.nextSibling = None
        if self.parent and self.parent.contents:
            # We are being appended after the parent's current last child.
            self.previousSibling = self.parent.contents[-1]
            self.previousSibling.nextSibling = self

    def replaceWith(self, replaceWith):
        # Remove this element from the tree and put replaceWith in its
        # old position under the same parent.
        oldParent = self.parent
        myIndex = self.parent.index(self)
        if hasattr(replaceWith, "parent")\
           and replaceWith.parent is self.parent:
            # We're replacing this element with one of its siblings.
            index = replaceWith.parent.index(replaceWith)
            if index and index < myIndex:
                # Furthermore, it comes before this element. That
                # means that when we extract it, the index of this
                # element will change.
                myIndex = myIndex - 1
        self.extract()
        oldParent.insert(myIndex, replaceWith)

    def replaceWithChildren(self):
        # Splice this element's children into its parent at this
        # element's position, then discard this element.
        myParent = self.parent
        myIndex = self.parent.index(self)
        self.extract()
        reversedChildren = list(self.contents)
        reversedChildren.reverse()
        # Inserting in reverse order at a fixed index preserves the
        # children's original document order.
        for child in reversedChildren:
            myParent.insert(myIndex, child)

    def extract(self):
        """Destructively rips this element out of the tree."""
        if self.parent:
            try:
                del self.parent.contents[self.parent.index(self)]
            except ValueError:
                pass

        #Find the two elements that would be next to each other if
        #this element (and any children) hadn't been parsed. Connect
        #the two.
        lastChild = self._lastRecursiveChild()
        nextElement = lastChild.next

        if self.previous:
            self.previous.next = nextElement
        if nextElement:
            nextElement.previous = self.previous
        self.previous = None
        lastChild.next = None

        self.parent = None
        if self.previousSibling:
            self.previousSibling.nextSibling = self.nextSibling
        if self.nextSibling:
            self.nextSibling.previousSibling = self.previousSibling
        self.previousSibling = self.nextSibling = None
        return self

    def _lastRecursiveChild(self):
        "Finds the last element beneath this object to be parsed."
        lastChild = self
        while hasattr(lastChild, 'contents') and lastChild.contents:
            lastChild = lastChild.contents[-1]
        return lastChild

    def insert(self, position, newChild):
        # Insert newChild into self.contents at the given position,
        # maintaining every parent/sibling/next/previous link. Statement
        # order here is load-bearing.
        if isinstance(newChild, basestring) \
            and not isinstance(newChild, NavigableString):
            newChild = NavigableString(newChild)

        position = min(position, len(self.contents))
        if hasattr(newChild, 'parent') and newChild.parent is not None:
            # We're 'inserting' an element that's already one
            # of this object's children.
            if newChild.parent is self:
                index = self.index(newChild)
                if index > position:
                    # Furthermore we're moving it further down the
                    # list of this object's children. That means that
                    # when we extract this element, our target index
                    # will jump down one.
                    position = position - 1
            newChild.extract()

        newChild.parent = self
        previousChild = None
        if position == 0:
            newChild.previousSibling = None
            newChild.previous = self
        else:
            previousChild = self.contents[position-1]
            newChild.previousSibling = previousChild
            newChild.previousSibling.nextSibling = newChild
            newChild.previous = previousChild._lastRecursiveChild()
        if newChild.previous:
            newChild.previous.next = newChild

        newChildsLastElement = newChild._lastRecursiveChild()

        if position >= len(self.contents):
            # Appending at the end: the successor in document order is
            # the nearest ancestor's next sibling (if any).
            newChild.nextSibling = None

            parent = self
            parentsNextSibling = None
            while not parentsNextSibling:
                parentsNextSibling = parent.nextSibling
                parent = parent.parent
                if not parent: # This is the last element in the document.
                    break
            if parentsNextSibling:
                newChildsLastElement.next = parentsNextSibling
            else:
                newChildsLastElement.next = None
        else:
            nextChild = self.contents[position]
            newChild.nextSibling = nextChild
            if newChild.nextSibling:
                newChild.nextSibling.previousSibling = newChild
            newChildsLastElement.next = nextChild

        if newChildsLastElement.next:
            newChildsLastElement.next.previous = newChildsLastElement
        self.contents.insert(position, newChild)

    def append(self, tag):
        """Appends the given tag to the contents of this tag."""
        self.insert(len(self.contents), tag)

    def findNext(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears after this Tag in the document."""
        return self._findOne(self.findAllNext, name, attrs, text, **kwargs)

    def findAllNext(self, name=None, attrs={}, text=None, limit=None,
                    **kwargs):
        """Returns all items that match the given criteria and appear
        after this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.nextGenerator,
                             **kwargs)

    def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears after this Tag in the document."""
        return self._findOne(self.findNextSiblings, name, attrs, text,
                             **kwargs)

    def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
                         **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear after this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.nextSiblingGenerator, **kwargs)
    fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x

    def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears before this Tag in the document."""
        return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)

    def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
                        **kwargs):
        """Returns all items that match the given criteria and appear
        before this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.previousGenerator,
                             **kwargs)
    fetchPrevious = findAllPrevious # Compatibility with pre-3.x

    def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears before this Tag in the document."""
        return self._findOne(self.findPreviousSiblings, name, attrs, text,
                             **kwargs)

    def findPreviousSiblings(self, name=None, attrs={}, text=None,
                             limit=None, **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear before this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.previousSiblingGenerator, **kwargs)
    fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x

    def findParent(self, name=None, attrs={}, **kwargs):
        """Returns the closest parent of this Tag that matches the given
        criteria."""
        # NOTE: We can't use _findOne because findParents takes a different
        # set of arguments.
        r = None
        l = self.findParents(name, attrs, 1)
        if l:
            r = l[0]
        return r

    def findParents(self, name=None, attrs={}, limit=None, **kwargs):
        """Returns the parents of this Tag that match the given
        criteria."""

        return self._findAll(name, attrs, None, limit, self.parentGenerator,
                             **kwargs)
    fetchParents = findParents # Compatibility with pre-3.x

    #These methods do the real heavy lifting.

    def _findOne(self, method, name, attrs, text, **kwargs):
        # Run a findAll*-style method with limit=1 and unwrap the result.
        r = None
        l = method(name, attrs, text, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def _findAll(self, name, attrs, text, limit, generator, **kwargs):
        "Iterates over a generator looking for things that match."

        if isinstance(name, SoupStrainer):
            strainer = name
        # (Possibly) special case some findAll*(...) searches
        elif text is None and not limit and not attrs and not kwargs:
            # findAll*(True)
            if name is True:
                return [element for element in generator()
                        if isinstance(element, Tag)]
            # findAll*('tag-name')
            elif isinstance(name, basestring):
                return [element for element in generator()
                        if isinstance(element, Tag) and
                        element.name == name]
            else:
                strainer = SoupStrainer(name, attrs, text, **kwargs)
        # Build a SoupStrainer
        else:
            strainer = SoupStrainer(name, attrs, text, **kwargs)
        results = ResultSet(strainer)
        g = generator()
        while True:
            try:
                i = g.next()
            except StopIteration:
                break
            # The navigation generators below yield a trailing None when
            # they run off the end of the document, hence this guard.
            if i:
                found = strainer.search(i)
                if found:
                    results.append(found)
                    if limit and len(results) >= limit:
                        break
        return results

    #These Generators can be used to navigate starting from both
    #NavigableStrings and Tags.
    #NOTE: each of them advances *before* yielding, so the element they
    #start from is skipped and a final None is yielded when the chain
    #runs out; _findAll filters that None out.
    def nextGenerator(self):
        i = self
        while i is not None:
            i = i.next
            yield i

    def nextSiblingGenerator(self):
        i = self
        while i is not None:
            i = i.nextSibling
            yield i

    def previousGenerator(self):
        i = self
        while i is not None:
            i = i.previous
            yield i

    def previousSiblingGenerator(self):
        i = self
        while i is not None:
            i = i.previousSibling
            yield i

    def parentGenerator(self):
        i = self
        while i is not None:
            i = i.parent
            yield i

    # Utility methods
    def substituteEncoding(self, str, encoding=None):
        # Replace the %SOUP-ENCODING% placeholder with the real encoding
        # (defaulting to utf-8). NOTE: the parameter shadows the builtin
        # 'str'; kept as-is to preserve the public signature.
        encoding = encoding or "utf-8"
        return str.replace("%SOUP-ENCODING%", encoding)

    def toEncoding(self, s, encoding=None):
        """Encodes an object to a byte string in the given encoding.
        If no encoding is given, returns a Unicode string instead."""
        if isinstance(s, unicode):
            if encoding:
                s = s.encode(encoding)
        elif isinstance(s, str):
            if encoding:
                s = s.encode(encoding)
            else:
                s = unicode(s)
        else:
            # Not a string at all: stringify first, then recurse.
            if encoding:
                s = self.toEncoding(str(s), encoding)
            else:
                s = unicode(s)
        return s

    # Matches angle brackets, or ampersands that are not already the
    # start of a character entity reference.
    BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
                                           + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
                                           + ")")

    def _sub_entity(self, x):
        """Used with a regular expression to substitute the
        appropriate XML entity for an XML special character."""
        return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
448
class NavigableString(unicode, PageElement):
    # A piece of text in the tree: a unicode subclass that also carries
    # the PageElement navigation links (parent, next, previous, ...).

    def __new__(cls, value):
        """Create a new NavigableString.

        When unpickling a NavigableString, this method is called with
        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
        passed in to the superclass's __new__ or the superclass won't know
        how to handle non-ASCII characters.
        """
        if isinstance(value, unicode):
            return unicode.__new__(cls, value)
        return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)

    def __getnewargs__(self):
        # Pickle support: reconstruct from the encoded byte string that
        # __str__ produces.
        return (NavigableString.__str__(self),)

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper."""
        if attr == 'string':
            return self
        else:
            raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)

    def __unicode__(self):
        # Round-trips through __str__ so outgoing XML entities are
        # substituted even in the Unicode rendering.
        return str(self).decode(DEFAULT_OUTPUT_ENCODING)

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        # Substitute outgoing XML entities.
        data = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, self)
        if encoding:
            return data.encode(encoding)
        else:
            return data
485
class CData(NavigableString):
    # A CDATA section; rendered wrapped in the CDATA markers.

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        inner = NavigableString.__str__(self, encoding)
        return "<![CDATA[%s]]>" % inner
490
class ProcessingInstruction(NavigableString):
    # A processing instruction; rendered wrapped in <? ... ?>.

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        text = self
        # Honor the encoding-substitution placeholder, if present.
        if "%SOUP-ENCODING%" in text:
            text = self.substituteEncoding(text, encoding)
        return "<?%s?>" % self.toEncoding(text, encoding)
497
class Comment(NavigableString):
    # An HTML/XML comment; rendered wrapped in <!-- ... -->.

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        body = NavigableString.__str__(self, encoding)
        return "<!--%s-->" % body
501
class Declaration(NavigableString):
    # A declaration (e.g. DOCTYPE); rendered wrapped in <! ... >.

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        body = NavigableString.__str__(self, encoding)
        return "<!%s>" % body
505
class Tag(PageElement):

    """Represents a found HTML tag with its attributes and contents."""

    def _convertEntities(self, match):
        """Used in a call to re.sub to replace HTML, XML, and numeric
        entities with the appropriate Unicode characters. If HTML
        entities are being converted, any unrecognized entities are
        escaped."""
        x = match.group(1)
        if self.convertHTMLEntities and x in name2codepoint:
            return unichr(name2codepoint[x])
        elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
            if self.convertXMLEntities:
                return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
            else:
                return u'&%s;' % x
        elif len(x) > 0 and x[0] == '#':
            # Handle numeric entities (decimal, or hexadecimal with #x).
            if len(x) > 1 and x[1] == 'x':
                return unichr(int(x[2:], 16))
            else:
                return unichr(int(x[1:]))

        elif self.escapeUnrecognizedEntities:
            return u'&amp;%s;' % x
        else:
            return u'&%s;' % x

    def __init__(self, parser, name, attrs=None, parent=None,
                 previous=None):
        "Basic constructor."

        # We don't actually store the parser object: that lets extracted
        # chunks be garbage-collected
        self.parserClass = parser.__class__
        self.isSelfClosing = parser.isSelfClosingTag(name)
        self.name = name
        if attrs is None:
            attrs = []
        elif isinstance(attrs, dict):
            # Attributes are stored internally as a list of (key, value)
            # pairs, not a dict, so duplicates survive.
            attrs = attrs.items()
        self.attrs = attrs
        self.contents = []
        self.setup(parent, previous)
        self.hidden = False
        self.containsSubstitutions = False
        self.convertHTMLEntities = parser.convertHTMLEntities
        self.convertXMLEntities = parser.convertXMLEntities
        self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities

        # Convert any HTML, XML, or numeric entities in the attribute values.
        # NOTE: the tuple-parameter lambda below is Python 2-only syntax.
        convert = lambda(k, val): (k,
                                   re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
                                          self._convertEntities,
                                          val))
        self.attrs = map(convert, self.attrs)

    def getString(self):
        # Defined only when this tag has exactly one child and it is a
        # string; otherwise falls through and returns None.
        if (len(self.contents) == 1
            and isinstance(self.contents[0], NavigableString)):
            return self.contents[0]

    def setString(self, string):
        """Replace the contents of the tag with a string"""
        self.clear()
        self.append(string)

    string = property(getString, setString)

    def getText(self, separator=u""):
        # Joins every (stripped) NavigableString beneath this tag, in
        # document order, walking the .next chain up to the element that
        # follows this tag's last descendant.
        if not len(self.contents):
            return u""
        stopNode = self._lastRecursiveChild().next
        strings = []
        current = self.contents[0]
        while current is not stopNode:
            if isinstance(current, NavigableString):
                strings.append(current.strip())
            current = current.next
        return separator.join(strings)

    text = property(getText)

    def get(self, key, default=None):
        """Returns the value of the 'key' attribute for the tag, or
        the value given for 'default' if it doesn't have that
        attribute."""
        return self._getAttrMap().get(key, default)

    def clear(self):
        """Extract all children."""
        # Iterate over a copy: extract() mutates self.contents.
        for child in self.contents[:]:
            child.extract()

    def index(self, element):
        # Identity-based lookup ('is', not '=='), unlike list.index: two
        # distinct but equal tags must not be confused.
        for i, child in enumerate(self.contents):
            if child is element:
                return i
        raise ValueError("Tag.index: element not in tag")

    def has_key(self, key):
        return self._getAttrMap().has_key(key)

    def __getitem__(self, key):
        """tag[key] returns the value of the 'key' attribute for the tag,
        and throws an exception if it's not there."""
        return self._getAttrMap()[key]

    def __iter__(self):
        "Iterating over a tag iterates over its contents."
        return iter(self.contents)

    def __len__(self):
        "The length of a tag is the length of its list of contents."
        return len(self.contents)

    def __contains__(self, x):
        return x in self.contents

    def __nonzero__(self):
        "A tag is non-None even if it has no contents."
        return True

    def __setitem__(self, key, value):
        """Setting tag[key] sets the value of the 'key' attribute for the
        tag."""
        # Ensure attrMap exists, then keep both the map and the ordered
        # attrs list in sync.
        self._getAttrMap()
        self.attrMap[key] = value
        found = False
        for i in range(0, len(self.attrs)):
            if self.attrs[i][0] == key:
                self.attrs[i] = (key, value)
                found = True
        if not found:
            self.attrs.append((key, value))
            self._getAttrMap()[key] = value

    def __delitem__(self, key):
        "Deleting tag[key] deletes all 'key' attributes for the tag."
        # NOTE(review): this removes items from self.attrs while
        # iterating over it; with duplicate keys in adjacent positions
        # some may be skipped. Preserved as historical behavior.
        for item in self.attrs:
            if item[0] == key:
                self.attrs.remove(item)
                #We don't break because bad HTML can define the same
                #attribute multiple times.
        self._getAttrMap()
        if self.attrMap.has_key(key):
            del self.attrMap[key]

    def __call__(self, *args, **kwargs):
        """Calling a tag like a function is the same as calling its
        findAll() method. Eg. tag('a') returns a list of all the A tags
        found within this tag."""
        return apply(self.findAll, args, kwargs)

    def __getattr__(self, tag):
        #print "Getattr %s.%s" % (self.__class__, tag)
        # soup.fooTag -> find('foo'); soup.foo -> find('foo'); dunder
        # names raise AttributeError as usual.
        if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
            return self.find(tag[:-3])
        elif tag.find('__') != 0:
            return self.find(tag)
        raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag)

    def __eq__(self, other):
        """Returns true iff this tag has the same name, the same attributes,
        and the same contents (recursively) as the given tag.

        NOTE: right now this will return false if two tags have the
        same attributes in a different order. Should this be fixed?"""
        if other is self:
            return True
        if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
            return False
        for i in range(0, len(self.contents)):
            if self.contents[i] != other.contents[i]:
                return False
        return True

    def __ne__(self, other):
        """Returns true iff this tag is not identical to the other tag,
        as defined in __eq__."""
        return not self == other

    def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Renders this tag as a string."""
        return self.__str__(encoding)

    def __unicode__(self):
        return self.__str__(None)

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
                prettyPrint=False, indentLevel=0):
        """Returns a string or Unicode representation of this tag and
        its contents. To get Unicode, pass None for encoding.

        NOTE: since Python's HTML parser consumes whitespace, this
        method is not certain to reproduce the whitespace present in
        the original string."""

        encodedName = self.toEncoding(self.name, encoding)

        attrs = []
        if self.attrs:
            for key, val in self.attrs:
                fmt = '%s="%s"'
                if isinstance(val, basestring):
                    if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
                        val = self.substituteEncoding(val, encoding)

                    # The attribute value either:
                    #
                    # * Contains no embedded double quotes or single quotes.
                    #   No problem: we enclose it in double quotes.
                    # * Contains embedded single quotes. No problem:
                    #   double quotes work here too.
                    # * Contains embedded double quotes. No problem:
                    #   we enclose it in single quotes.
                    # * Embeds both single _and_ double quotes. This
                    #   can't happen naturally, but it can happen if
                    #   you modify an attribute value after parsing
                    #   the document. Now we have a bit of a
                    #   problem. We solve it by enclosing the
                    #   attribute in single quotes, and escaping any
                    #   embedded single quotes to XML entities.
                    if '"' in val:
                        fmt = "%s='%s'"
                        if "'" in val:
                            # TODO: replace with apos when
                            # appropriate.
                            val = val.replace("'", "&squot;")

                    # Now we're okay w/r/t quotes. But the attribute
                    # value might also contain angle brackets, or
                    # ampersands that aren't part of entities. We need
                    # to escape those to XML entities too.
                    val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)

                attrs.append(fmt % (self.toEncoding(key, encoding),
                                    self.toEncoding(val, encoding)))
        close = ''
        closeTag = ''
        if self.isSelfClosing:
            close = ' /'
        else:
            closeTag = '</%s>' % encodedName

        indentTag, indentContents = 0, 0
        if prettyPrint:
            indentTag = indentLevel
            space = (' ' * (indentTag-1))
            indentContents = indentTag + 1
        contents = self.renderContents(encoding, prettyPrint, indentContents)
        if self.hidden:
            # Hidden tags (e.g. the soup object itself) render only
            # their contents, with no surrounding markup.
            s = contents
        else:
            s = []
            attributeString = ''
            if attrs:
                attributeString = ' ' + ' '.join(attrs)
            if prettyPrint:
                s.append(space)
            s.append('<%s%s%s>' % (encodedName, attributeString, close))
            if prettyPrint:
                s.append("\n")
            s.append(contents)
            if prettyPrint and contents and contents[-1] != "\n":
                s.append("\n")
            if prettyPrint and closeTag:
                s.append(space)
            s.append(closeTag)
            if prettyPrint and closeTag and self.nextSibling:
                s.append("\n")
            s = ''.join(s)
        return s

    def decompose(self):
        """Recursively destroys the contents of this tree."""
        self.extract()
        if len(self.contents) == 0:
            return
        current = self.contents[0]
        # Walk the .next chain, severing every link so the garbage
        # collector can reclaim the whole subtree.
        while current is not None:
            next = current.next
            if isinstance(current, Tag):
                del current.contents[:]
            current.parent = None
            current.previous = None
            current.previousSibling = None
            current.next = None
            current.nextSibling = None
            current = next

    def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
        return self.__str__(encoding, True)

    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                       prettyPrint=False, indentLevel=0):
        """Renders the contents of this tag as a string in the given
        encoding. If encoding is None, returns a Unicode string.."""
        s=[]
        for c in self:
            text = None
            if isinstance(c, NavigableString):
                text = c.__str__(encoding)
            elif isinstance(c, Tag):
                s.append(c.__str__(encoding, prettyPrint, indentLevel))
            if text and prettyPrint:
                text = text.strip()
            if text:
                if prettyPrint:
                    s.append(" " * (indentLevel-1))
                s.append(text)
                if prettyPrint:
                    s.append("\n")
        return ''.join(s)

    #Soup methods

    def find(self, name=None, attrs={}, recursive=True, text=None,
             **kwargs):
        """Return only the first child of this Tag matching the given
        criteria."""
        r = None
        l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
        if l:
            r = l[0]
        return r
    findChild = find

    def findAll(self, name=None, attrs={}, recursive=True, text=None,
                limit=None, **kwargs):
        """Extracts a list of Tag objects that match the given
        criteria. You can specify the name of the Tag and any
        attributes you want the Tag to have.

        The value of a key-value pair in the 'attrs' map can be a
        string, a list of strings, a regular expression object, or a
        callable that takes a string and returns whether or not the
        string matches for some custom definition of 'matches'. The
        same is true of the tag name."""
        generator = self.recursiveChildGenerator
        if not recursive:
            generator = self.childGenerator
        return self._findAll(name, attrs, text, limit, generator, **kwargs)
    findChildren = findAll

    # Pre-3.x compatibility methods
    first = find
    fetch = findAll

    def fetchText(self, text=None, recursive=True, limit=None):
        return self.findAll(text=text, recursive=recursive, limit=limit)

    def firstText(self, text=None, recursive=True):
        return self.find(text=text, recursive=recursive)

    #Private methods

    def _getAttrMap(self):
        """Initializes a map representation of this tag's attributes,
        if not already initialized."""
        # NOTE(review): getattr() here has no default, so a missing
        # attrMap is resolved via __getattr__ above, which does
        # self.find('attrMap') and presumably returns a falsy result --
        # fragile but long-standing behavior; verify before changing.
        if not getattr(self, 'attrMap'):
            self.attrMap = {}
            for (key, value) in self.attrs:
                self.attrMap[key] = value
        return self.attrMap

    #Generator methods
    def childGenerator(self):
        # Just use the iterator from the contents
        return iter(self.contents)

    def recursiveChildGenerator(self):
        # NOTE: 'raise StopIteration' inside a generator is Python 2
        # style (disallowed by PEP 479 in Python 3.7+).
        if not len(self.contents):
            raise StopIteration
        stopNode = self._lastRecursiveChild().next
        current = self.contents[0]
        while current is not stopNode:
            yield current
            current = current.next
886
887
888 # Next, a couple classes to represent queries and their results.
889 class SoupStrainer:
890 """Encapsulates a number of ways of matching a markup element (tag or
891 text)."""
892
893 def __init__(self, name=None, attrs={}, text=None, **kwargs):
894 self.name = name
895 if isinstance(attrs, basestring):
896 kwargs['class'] = _match_css_class(attrs)
897 attrs = None
898 if kwargs:
899 if attrs:
900 attrs = attrs.copy()
901 attrs.update(kwargs)
902 else:
903 attrs = kwargs
904 self.attrs = attrs
905 self.text = text
906
907 def __str__(self):
908 if self.text:
909 return self.text
910 else:
911 return "%s|%s" % (self.name, self.attrs)
912
913 def searchTag(self, markupName=None, markupAttrs={}):
914 found = None
915 markup = None
916 if isinstance(markupName, Tag):
917 markup = markupName
918 markupAttrs = markup
919 callFunctionWithTagData = callable(self.name) \
920 and not isinstance(markupName, Tag)
921
922 if (not self.name) \
923 or callFunctionWithTagData \
924 or (markup and self._matches(markup, self.name)) \
925 or (not markup and self._matches(markupName, self.name)):
926 if callFunctionWithTagData:
927 match = self.name(markupName, markupAttrs)
928 else:
929 match = True
930 markupAttrMap = None
931 for attr, matchAgainst in self.attrs.items():
932 if not markupAttrMap:
933 if hasattr(markupAttrs, 'get'):
934 markupAttrMap = markupAttrs
935 else:
936 markupAttrMap = {}
937 for k,v in markupAttrs:
938 markupAttrMap[k] = v
939 attrValue = markupAttrMap.get(attr)
940 if not self._matches(attrValue, matchAgainst):
941 match = False
942 break
943 if match:
944 if markup:
945 found = markup
946 else:
947 found = markupName
948 return found
949
    def search(self, markup):
        """Return the part of `markup` this strainer matches, or None.

        Accepts a list of elements, a Tag, or a (Navigable)string;
        anything else raises.  Note: a Tag is only considered when this
        strainer has no text criterion."""
        #print 'looking for %s in %s' % (self, markup)
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
        if hasattr(markup, "__iter__") \
                and not isinstance(markup, Tag):
            for element in markup:
                if isinstance(element, NavigableString) \
                       and self.search(element):
                    found = element
                    break
        # If it's a Tag, make sure its name or attributes match.
        # Don't bother with Tags if we're searching for text.
        elif isinstance(markup, Tag):
            if not self.text:
                found = self.searchTag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, NavigableString) or \
                 isinstance(markup, basestring):
            if self._matches(markup, self.text):
                found = markup
        else:
            raise Exception, "I don't know how to match against a %s" \
                  % markup.__class__
        return found
976
    def _matches(self, markup, matchAgainst):
        """Core matching primitive: test `markup` against one criterion.

        matchAgainst may be True (match any non-None markup), a
        callable predicate, a compiled regex, a list of names, a dict,
        or a plain string."""
        #print "Matching %s against %s" % (markup, matchAgainst)
        result = False
        if matchAgainst is True:
            result = markup is not None
        elif callable(matchAgainst):
            result = matchAgainst(markup)
        else:
            #Custom match methods take the tag as an argument, but all
            #other ways of matching match the tag name as a string.
            if isinstance(markup, Tag):
                markup = markup.name
            if markup and not isinstance(markup, basestring):
                markup = unicode(markup)
            #Now we know that chunk is either a string, or None.
            if hasattr(matchAgainst, 'match'):
                # It's a regexp object.
                result = markup and matchAgainst.search(markup)
            elif hasattr(matchAgainst, '__iter__'): # list-like
                result = markup in matchAgainst
            elif hasattr(matchAgainst, 'items'):
                # NOTE(review): this asks whether the markup *string*
                # has a has_key method -- it looks inverted (one would
                # expect matchAgainst.has_key(markup)), but it is kept
                # as-is since callers may rely on the existing
                # behavior; confirm before changing.
                result = markup.has_key(matchAgainst)
            elif matchAgainst and isinstance(markup, basestring):
                # Compare in the same string type as the markup.
                if isinstance(markup, unicode):
                    matchAgainst = unicode(matchAgainst)
                else:
                    matchAgainst = str(matchAgainst)

            if not result:
                result = matchAgainst == markup
        return result
1008
class ResultSet(list):
    """A ResultSet is just a list that keeps track of the SoupStrainer
    that created it."""
    def __init__(self, source):
        # Fix: the original called list.__init__([]), initializing a
        # throwaway list literal instead of this instance. Behavior was
        # unchanged only because list.__new__ already yields an empty
        # list; initialize self explicitly.
        list.__init__(self)
        # The SoupStrainer that produced this set of results.
        self.source = source
1015
1016 # Now, some helper functions.
1017
def buildTagMap(default, *args):
    """Turns a list of maps, lists, or scalars into a single map.
    Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
    NESTING_RESET_TAGS maps out of lists and partial maps."""
    built = {}
    for portion in args:
        if hasattr(portion, 'items'):
            # A map: merge its entries wholesale (later maps win).
            built.update(portion)
        elif hasattr(portion, '__iter__'):
            # A list: each element maps to the default value.
            built.update((key, default) for key in portion)
        else:
            # A scalar: map it to the default value.
            built[portion] = default
    return built
1036
1037 # Now, the parser classes.
1038
class BeautifulStoneSoup(Tag, SGMLParser):

    """This class contains the basic parser and search code. It defines
    a parser that knows nothing about tag behavior except for the
    following:

      You can't close a tag without closing all the tags it encloses.
      That is, "<foo><bar></foo>" actually means
      "<foo><bar></bar></foo>".

    [Another possible explanation is "<foo><bar /></foo>", but since
    this class defines no SELF_CLOSING_TAGS, it will never use that
    explanation.]

    This class is useful for parsing XML or made-up markup languages,
    or when BeautifulSoup makes an assumption counter to what you were
    expecting."""

    # Per-dialect tag behavior tables; subclasses (e.g. BeautifulSoup)
    # override these with HTML-specific values.
    SELF_CLOSING_TAGS = {}
    NESTABLE_TAGS = {}
    RESET_NESTING_TAGS = {}
    QUOTE_TAGS = {}
    PRESERVE_WHITESPACE_TAGS = []

    # Default (compiled regex, replacement) pairs applied to the markup
    # before parsing, fixing constructs known to choke sgmllib.
    MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
                       lambda x: x.group(1) + ' />'),
                      (re.compile('<!\s+([^<>]*)>'),
                       lambda x: '<!' + x.group(1) + '>')
                      ]

    ROOT_TAG_NAME = u'[document]'

    HTML_ENTITIES = "html"
    XML_ENTITIES = "xml"
    XHTML_ENTITIES = "xhtml"
    # TODO: This only exists for backwards-compatibility
    ALL_ENTITIES = XHTML_ENTITIES

    # Used when determining whether a text node is all whitespace and
    # can be replaced with a single space. A text node that contains
    # fancy Unicode spaces (usually non-breaking) should be left
    # alone.
    STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }

    def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
                 markupMassage=True, smartQuotesTo=XML_ENTITIES,
                 convertEntities=None, selfClosingTags=None, isHTML=False):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser.

        sgmllib will process most bad HTML, and the BeautifulSoup
        class has some tricks for dealing with some HTML that kills
        sgmllib, but Beautiful Soup can nonetheless choke or lose data
        if your data uses self-closing tags or declarations
        incorrectly.

        By default, Beautiful Soup uses regexes to sanitize input,
        avoiding the vast majority of these problems. If the problems
        don't apply to you, pass in False for markupMassage, and
        you'll get better performance.

        The default parser massage techniques fix the two most common
        instances of invalid HTML that choke sgmllib:

         <br/> (No space between name of closing tag and tag close)
         <! --Comment--> (Extraneous whitespace in declaration)

        You can pass in a custom list of (RE object, replace method)
        tuples to get Beautiful Soup to scrub your input the way you
        want."""

        self.parseOnlyThese = parseOnlyThese
        self.fromEncoding = fromEncoding
        self.smartQuotesTo = smartQuotesTo
        self.convertEntities = convertEntities
        # Set the rules for how we'll deal with the entities we
        # encounter
        if self.convertEntities:
            # It doesn't make sense to convert encoded characters to
            # entities even while you're converting entities to Unicode.
            # Just convert it all to Unicode.
            self.smartQuotesTo = None
            if convertEntities == self.HTML_ENTITIES:
                self.convertXMLEntities = False
                self.convertHTMLEntities = True
                self.escapeUnrecognizedEntities = True
            elif convertEntities == self.XHTML_ENTITIES:
                self.convertXMLEntities = True
                self.convertHTMLEntities = True
                self.escapeUnrecognizedEntities = False
            elif convertEntities == self.XML_ENTITIES:
                self.convertXMLEntities = True
                self.convertHTMLEntities = False
                self.escapeUnrecognizedEntities = False
        else:
            self.convertXMLEntities = False
            self.convertHTMLEntities = False
            self.escapeUnrecognizedEntities = False

        self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
        SGMLParser.__init__(self)

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        self.markup = markup
        self.markupMassage = markupMassage
        try:
            # StopParsing is raised by start_meta when the document is
            # re-fed with a newly discovered encoding; swallow it here.
            self._feed(isHTML=isHTML)
        except StopParsing:
            pass
        self.markup = None                 # The markup can now be GCed

    def convert_charref(self, name):
        """This method fixes a bug in Python's SGMLParser."""
        try:
            n = int(name)
        except ValueError:
            return
        if not 0 <= n <= 127 : # ASCII ends at 127, not 255
            return
        return self.convert_codepoint(n)

    def _feed(self, inDocumentEncoding=None, isHTML=False):
        """Convert the stored markup to Unicode, massage it, and run it
        through the SGML parser, building the tree."""
        # Convert the document to Unicode.
        markup = self.markup
        if isinstance(markup, unicode):
            if not hasattr(self, 'originalEncoding'):
                self.originalEncoding = None
        else:
            dammit = UnicodeDammit\
                     (markup, [self.fromEncoding, inDocumentEncoding],
                      smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
            markup = dammit.unicode
            self.originalEncoding = dammit.originalEncoding
            self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
        if markup:
            if self.markupMassage:
                # markupMassage=True selects the default fix list; any
                # other iterable is used as-is.
                if not hasattr(self.markupMassage, "__iter__"):
                    self.markupMassage = self.MARKUP_MASSAGE
                for fix, m in self.markupMassage:
                    markup = fix.sub(m, markup)
                # TODO: We get rid of markupMassage so that the
                # soup object can be deepcopied later on. Some
                # Python installations can't copy regexes. If anyone
                # was relying on the existence of markupMassage, this
                # might cause problems.
                del(self.markupMassage)
        self.reset()

        SGMLParser.feed(self, markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def __getattr__(self, methodName):
        """This method routes method call requests to either the SGMLParser
        superclass or the Tag superclass, depending on the method name."""
        #print "__getattr__ called on %s.%s" % (self.__class__, methodName)

        if methodName.startswith('start_') or methodName.startswith('end_') \
               or methodName.startswith('do_'):
            return SGMLParser.__getattr__(self, methodName)
        elif not methodName.startswith('__'):
            return Tag.__getattr__(self, methodName)
        else:
            raise AttributeError

    def isSelfClosingTag(self, name):
        """Returns true iff the given string is the name of a
        self-closing tag according to this parser."""
        return self.SELF_CLOSING_TAGS.has_key(name) \
               or self.instanceSelfClosingTags.has_key(name)

    def reset(self):
        """Reinitialize this object as the (hidden) root tag and reset
        all parser state."""
        Tag.__init__(self, self, self.ROOT_TAG_NAME)
        self.hidden = 1
        SGMLParser.reset(self)
        self.currentData = []
        self.currentTag = None
        self.tagStack = []
        self.quoteStack = []
        self.pushTag(self)

    def popTag(self):
        """Pop the top of the tag stack; the new top becomes currentTag."""
        tag = self.tagStack.pop()

        #print "Pop", tag.name
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        """Append `tag` to the current tag's contents and make it the
        new currentTag."""
        #print "Push", tag.name
        if self.currentTag:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]

    def endData(self, containerClass=NavigableString):
        """Flush accumulated character data into the tree as a
        containerClass node (NavigableString by default)."""
        if self.currentData:
            currentData = u''.join(self.currentData)
            # Collapse all-whitespace text to a single '\n' or ' ',
            # unless we are inside a whitespace-preserving tag.
            if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
                not set([tag.name for tag in self.tagStack]).intersection(
                    self.PRESERVE_WHITESPACE_TAGS)):
                if '\n' in currentData:
                    currentData = '\n'
                else:
                    currentData = ' '
            self.currentData = []
            # When parsing selectively, drop top-level text that the
            # strainer doesn't ask for.
            if self.parseOnlyThese and len(self.tagStack) <= 1 and \
                   (not self.parseOnlyThese.text or \
                    not self.parseOnlyThese.search(currentData)):
                return
            o = containerClass(currentData)
            o.setup(self.currentTag, self.previous)
            if self.previous:
                self.previous.next = o
            self.previous = o
            self.currentTag.contents.append(o)


    def _popToTag(self, name, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instance of
        the given tag."""
        #print "Popping to %s" % name
        if name == self.ROOT_TAG_NAME:
            return

        numPops = 0
        mostRecentTag = None
        for i in range(len(self.tagStack)-1, 0, -1):
            if name == self.tagStack[i].name:
                numPops = len(self.tagStack)-i
                break
        if not inclusivePop:
            numPops = numPops - 1

        for i in range(0, numPops):
            mostRecentTag = self.popTag()
        return mostRecentTag

    def _smartPop(self, name):

        """We need to pop up to the previous tag of this type, unless
        one of this tag's nesting reset triggers comes between this
        tag and the previous tag of this type, OR unless this tag is a
        generic nesting trigger and another generic nesting trigger
        comes between this tag and the previous tag of this type.

        Examples:
         <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
         <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
         <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.

         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
         <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
         <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
        """

        nestingResetTriggers = self.NESTABLE_TAGS.get(name)
        isNestable = nestingResetTriggers != None
        isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
        popTo = None
        inclusive = True
        # Walk the open-tag stack from the top down, looking for either
        # a previous tag of the same name or a nesting-reset trigger.
        for i in range(len(self.tagStack)-1, 0, -1):
            p = self.tagStack[i]
            if (not p or p.name == name) and not isNestable:
                #Non-nestable tags get popped to the top or to their
                #last occurrence.
                popTo = name
                break
            if (nestingResetTriggers is not None
                and p.name in nestingResetTriggers) \
                or (nestingResetTriggers is None and isResetNesting
                    and self.RESET_NESTING_TAGS.has_key(p.name)):

                #If we encounter one of the nesting reset triggers
                #peculiar to this tag, or we encounter another tag
                #that causes nesting to reset, pop up to but not
                #including that tag.
                popTo = p.name
                inclusive = False
                break
            # NOTE(review): this reassignment of the loop variable has
            # no effect on the iteration (the next pass rebinds p from
            # the stack); kept as-is from upstream.
            p = p.parent
        if popTo:
            self._popToTag(popTo, inclusive)

    def unknown_starttag(self, name, attrs, selfClosing=0):
        """sgmllib callback: open a new Tag node (and immediately close
        it if the tag is self-closing). Returns the new Tag, or None if
        a parse-only strainer rejected it."""
        #print "Start tag %s: %s" % (name, attrs)
        if self.quoteStack:
            #This is not a real tag.
            #print "<%s> is not real!" % name
            attrs = ''.join([' %s="%s"' % (x, y) for x, y in attrs])
            self.handle_data('<%s%s>' % (name, attrs))
            return
        self.endData()

        if not self.isSelfClosingTag(name) and not selfClosing:
            self._smartPop(name)

        # When parsing selectively, skip top-level tags the strainer
        # doesn't ask for.
        if self.parseOnlyThese and len(self.tagStack) <= 1 \
               and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
            return

        tag = Tag(self, name, attrs, self.currentTag, self.previous)
        if self.previous:
            self.previous.next = tag
        self.previous = tag
        self.pushTag(tag)
        if selfClosing or self.isSelfClosingTag(name):
            self.popTag()
        if name in self.QUOTE_TAGS:
            #print "Beginning quote (%s)" % name
            # Inside e.g. <script>, treat everything as literal text
            # until the matching end tag.
            self.quoteStack.append(name)
            self.literal = 1
        return tag

    def unknown_endtag(self, name):
        """sgmllib callback: close the most recent open tag of this name."""
        #print "End tag %s" % name
        if self.quoteStack and self.quoteStack[-1] != name:
            #This is not a real end tag.
            #print "</%s> is not real!" % name
            self.handle_data('</%s>' % name)
            return
        self.endData()
        self._popToTag(name)
        if self.quoteStack and self.quoteStack[-1] == name:
            self.quoteStack.pop()
            self.literal = (len(self.quoteStack) > 0)

    def handle_data(self, data):
        """sgmllib callback: buffer character data until endData()."""
        self.currentData.append(data)

    def _toStringSubclass(self, text, subclass):
        """Adds a certain piece of text to the tree as a NavigableString
        subclass."""
        self.endData()
        self.handle_data(text)
        self.endData(subclass)

    def handle_pi(self, text):
        """Handle a processing instruction as a ProcessingInstruction
        object, possibly one with a %SOUP-ENCODING% slot into which an
        encoding will be plugged later."""
        if text[:3] == "xml":
            text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
        self._toStringSubclass(text, ProcessingInstruction)

    def handle_comment(self, text):
        "Handle comments as Comment objects."
        self._toStringSubclass(text, Comment)

    def handle_charref(self, ref):
        "Handle character references as data."
        if self.convertEntities:
            data = unichr(int(ref))
        else:
            data = '&#%s;' % ref
        self.handle_data(data)

    def handle_entityref(self, ref):
        """Handle entity references as data, possibly converting known
        HTML and/or XML entity references to the corresponding Unicode
        characters."""
        data = None
        if self.convertHTMLEntities:
            try:
                data = unichr(name2codepoint[ref])
            except KeyError:
                pass

        if not data and self.convertXMLEntities:
            data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)

        if not data and self.convertHTMLEntities and \
            not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
            # TODO: We've got a problem here. We're told this is
            # an entity reference, but it's not an XML entity
            # reference or an HTML entity reference. Nonetheless,
            # the logical thing to do is to pass it through as an
            # unrecognized entity reference.
            #
            # Except: when the input is "&carol;" this function
            # will be called with input "carol". When the input is
            # "AT&T", this function will be called with input
            # "T". We have no way of knowing whether a semicolon
            # was present originally, so we don't know whether
            # this is an unknown entity or just a misplaced
            # ampersand.
            #
            # The more common case is a misplaced ampersand, so I
            # escape the ampersand and omit the trailing semicolon.
            data = "&amp;%s" % ref
        if not data:
            # This case is different from the one above, because we
            # haven't already gone through a supposedly comprehensive
            # mapping of entities to Unicode characters. We might not
            # have gone through any mapping at all. So the chances are
            # very high that this is a real entity, and not a
            # misplaced ampersand.
            data = "&%s;" % ref
        self.handle_data(data)

    def handle_decl(self, data):
        "Handle DOCTYPEs and the like as Declaration objects."
        self._toStringSubclass(data, Declaration)

    def parse_declaration(self, i):
        """Treat a bogus SGML declaration as raw data. Treat a CDATA
        declaration as a CData object."""
        j = None
        if self.rawdata[i:i+9] == '<![CDATA[':
            k = self.rawdata.find(']]>', i)
            if k == -1:
                # Unterminated CDATA: consume to end of input.
                k = len(self.rawdata)
            data = self.rawdata[i+9:k]
            j = k+3
            self._toStringSubclass(data, CData)
        else:
            try:
                j = SGMLParser.parse_declaration(self, i)
            except SGMLParseError:
                # Malformed declaration: keep it as literal text.
                toHandle = self.rawdata[i:]
                self.handle_data(toHandle)
                j = i + len(toHandle)
        return j
1469
class BeautifulSoup(BeautifulStoneSoup):

    """This parser knows the following facts about HTML:

    * Some tags have no closing tag and should be interpreted as being
      closed as soon as they are encountered.

    * The text inside some tags (ie. 'script') may contain tags which
      are not really part of the document and which should be parsed
      as text, not tags. If you want to parse the text as tags, you can
      always fetch it and parse it explicitly.

    * Tag nesting rules:

      Most tags can't be nested at all. For instance, the occurrence of
      a <p> tag should implicitly close the previous <p> tag.

       <p>Para1<p>Para2
        should be transformed into:
       <p>Para1</p><p>Para2

      Some tags can be nested arbitrarily. For instance, the occurrence
      of a <blockquote> tag should _not_ implicitly close the previous
      <blockquote> tag.

       Alice said: <blockquote>Bob said: <blockquote>Blah
        should NOT be transformed into:
       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah

      Some tags can be nested, but the nesting is reset by the
      interposition of other tags. For instance, a <tr> tag should
      implicitly close the previous <tr> tag within the same <table>,
      but not close a <tr> tag in another table.

       <table><tr>Blah<tr>Blah
        should be transformed into:
       <table><tr>Blah</tr><tr>Blah
       but,
       <tr>Blah<table><tr>Blah
        should NOT be transformed into
       <tr>Blah<table></tr><tr>Blah

    Differing assumptions about tag nesting rules are a major source
    of problems with the BeautifulSoup class. If BeautifulSoup is not
    treating as nestable a tag your page author treats as nestable,
    try ICantBelieveItsBeautifulSoup, MinimalSoup, or
    BeautifulStoneSoup before writing your own subclass."""

    def __init__(self, *args, **kwargs):
        """Same as BeautifulStoneSoup.__init__, but defaults
        smartQuotesTo to HTML entities and marks the input as HTML."""
        if not kwargs.has_key('smartQuotesTo'):
            kwargs['smartQuotesTo'] = self.HTML_ENTITIES
        kwargs['isHTML'] = True
        BeautifulStoneSoup.__init__(self, *args, **kwargs)

    SELF_CLOSING_TAGS = buildTagMap(None,
                                    ('br' , 'hr', 'input', 'img', 'meta',
                                    'spacer', 'link', 'frame', 'base', 'col'))

    PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])

    # Text inside these tags is treated as literal data, not markup.
    QUOTE_TAGS = {'script' : None, 'textarea' : None}

    #According to the HTML standard, each of these inline tags can
    #contain another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_INLINE_TAGS = ('span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
                            'center')

    #According to the HTML standard, these block tags can contain
    #another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_BLOCK_TAGS = ('blockquote', 'div', 'fieldset', 'ins', 'del')

    #Lists can contain other lists, but there are restrictions.
    NESTABLE_LIST_TAGS = { 'ol' : [],
                           'ul' : [],
                           'li' : ['ul', 'ol'],
                           'dl' : [],
                           'dd' : ['dl'],
                           'dt' : ['dl'] }

    #Tables can contain other tables, but there are restrictions.
    NESTABLE_TABLE_TAGS = {'table' : [],
                           'tr' : ['table', 'tbody', 'tfoot', 'thead'],
                           'td' : ['tr'],
                           'th' : ['tr'],
                           'thead' : ['table'],
                           'tbody' : ['table'],
                           'tfoot' : ['table'],
                           }

    NON_NESTABLE_BLOCK_TAGS = ('address', 'form', 'p', 'pre')

    #If one of these tags is encountered, all tags up to the next tag of
    #this type are popped.
    RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
                                     NON_NESTABLE_BLOCK_TAGS,
                                     NESTABLE_LIST_TAGS,
                                     NESTABLE_TABLE_TAGS)

    NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
                                NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)

    # Used to detect the charset in a META tag; see start_meta
    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)

    def start_meta(self, attrs):
        """Beautiful Soup can detect a charset included in a META tag,
        try to convert the document to that charset, and re-parse the
        document from the beginning."""
        httpEquiv = None
        contentType = None
        contentTypeIndex = None
        tagNeedsEncodingSubstitution = False

        # Scan the attribute list for http-equiv and content, noting
        # where 'content' sits so it can be rewritten in place.
        for i in range(0, len(attrs)):
            key, value = attrs[i]
            key = key.lower()
            if key == 'http-equiv':
                httpEquiv = value
            elif key == 'content':
                contentType = value
                contentTypeIndex = i

        if httpEquiv and contentType: # It's an interesting meta tag.
            match = self.CHARSET_RE.search(contentType)
            if match:
                if (self.declaredHTMLEncoding is not None or
                    self.originalEncoding == self.fromEncoding):
                    # An HTML encoding was sniffed while converting
                    # the document to Unicode, or an HTML encoding was
                    # sniffed during a previous pass through the
                    # document, or an encoding was specified
                    # explicitly and it worked. Rewrite the meta tag.
                    def rewrite(match):
                        return match.group(1) + "%SOUP-ENCODING%"
                    newAttr = self.CHARSET_RE.sub(rewrite, contentType)
                    attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
                                               newAttr)
                    tagNeedsEncodingSubstitution = True
                else:
                    # This is our first pass through the document.
                    # Go through it again with the encoding information.
                    newCharset = match.group(3)
                    if newCharset and newCharset != self.originalEncoding:
                        self.declaredHTMLEncoding = newCharset
                        # Re-parse from scratch with the discovered
                        # encoding; StopParsing aborts the current feed.
                        self._feed(self.declaredHTMLEncoding)
                        raise StopParsing
                    pass
        tag = self.unknown_starttag("meta", attrs)
        if tag and tagNeedsEncodingSubstitution:
            tag.containsSubstitutions = True
1622
class StopParsing(Exception):
    """Raised to abort an in-progress SGML feed -- e.g. by start_meta
    after it discovers a new document encoding and re-feeds the
    document from the beginning."""
1625
class ICantBelieveItsBeautifulSoup(BeautifulSoup):

    """The BeautifulSoup class is oriented towards skipping over
    common HTML errors like unclosed tags. However, sometimes it makes
    errors of its own. For instance, consider this fragment:

     <b>Foo<b>Bar</b></b>

    This is perfectly valid (if bizarre) HTML. However, the
    BeautifulSoup class will implicitly close the first b tag when it
    encounters the second 'b'. It will think the author wrote
    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
    there's no real-world reason to bold something that's already
    bold. When it encounters '</b></b>' it will close two more 'b'
    tags, for a grand total of three tags closed instead of two. This
    can throw off the rest of your document structure. The same is
    true of a number of other tags, listed below.

    It's much more common for someone to forget to close a 'b' tag
    than to actually use nested 'b' tags, and the BeautifulSoup class
    handles the common case. This class handles the not-so-common
    case: where you can't believe someone wrote what they did, but
    it's valid HTML and BeautifulSoup screwed up by assuming it
    wouldn't be."""

    # NOTE(review): 'strong' and 'big' each appear twice in this tuple;
    # harmless, since buildTagMap deduplicates via dict keys.
    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
     ('em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
      'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
      'big')

    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ('noscript',)

    # Extend the standard HTML nestability map with the tags above.
    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
1661
class MinimalSoup(BeautifulSoup):
    """The MinimalSoup class is for parsing HTML that contains
    pathologically bad markup. It makes no assumptions about tag
    nesting, but it does know which tags are self-closing, that
    <script> tags contain Javascript and should not be parsed, that
    META tags may contain encoding information, and so on.

    This also makes it better for subclassing than BeautifulStoneSoup
    or BeautifulSoup."""

    # NOTE(review): buildTagMap('noscript') passes 'noscript' as the
    # *default* with no tag arguments, so this evaluates to {} --
    # presumably buildTagMap(None, 'noscript') was intended. Kept
    # as-is (matches upstream); confirm before changing, since fixing
    # it would alter parsing behavior.
    RESET_NESTING_TAGS = buildTagMap('noscript')
    NESTABLE_TAGS = {}
1674
class BeautifulSOAP(BeautifulStoneSoup):
    """Parser variant that promotes simple subelements to attributes.

    Whenever a tag about to be popped has exactly one child, and that
    child is a plain string, the string is also copied onto the parent
    tag as an attribute named after the child tag:

      <foo><bar>baz</bar></foo>
      =>
      <foo bar="baz"><bar>baz</bar></foo>

    You can then access fooTag['bar'] instead of fooTag.barTag.string,
    which is handy for scraping structures that favor subelements over
    attributes, such as SOAP messages. Note that this modifies its
    input in place, so don't print the modified version out.

    I'm not sure how many people really want to use this class; let me
    know if you do. Mainly I like the name."""

    def popTag(self):
        """Promote a single-string child onto its parent as an
        attribute, then delegate to the normal popTag."""
        if len(self.tagStack) > 1:
            child = self.tagStack[-1]
            parent = self.tagStack[-2]
            parent._getAttrMap()
            promotable = (isinstance(child, Tag)
                          and len(child.contents) == 1
                          and isinstance(child.contents[0], NavigableString)
                          and not parent.attrMap.has_key(child.name))
            if promotable:
                parent[child.name] = child.contents[0]
        BeautifulStoneSoup.popTag(self)
1705
1706 #Enterprise class names! It has come to our attention that some people
1707 #think the names of the Beautiful Soup parser classes are too silly
1708 #and "unprofessional" for use in enterprise screen-scraping. We feel
1709 #your pain! For such-minded folk, the Beautiful Soup Consortium And
1710 #All-Night Kosher Bakery recommends renaming this file to
1711 #"RobustParser.py" (or, in cases of extreme enterprisiness,
1712 #"RobustParserBeanInterface.class") and using the following
1713 #enterprise-friendly class aliases:
class RobustXMLParser(BeautifulStoneSoup):
    """Enterprise-friendly alias for BeautifulStoneSoup."""
class RobustHTMLParser(BeautifulSoup):
    """Enterprise-friendly alias for BeautifulSoup."""
class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
    """Enterprise-friendly alias for ICantBelieveItsBeautifulSoup."""
class RobustInsanelyWackAssHTMLParser(MinimalSoup):
    """Enterprise-friendly alias for MinimalSoup."""
class SimplifyingSOAPParser(BeautifulSOAP):
    """Enterprise-friendly alias for BeautifulSOAP."""
1724
1725 ######################################################
1726 #
1727 # Bonus library: Unicode, Dammit
1728 #
1729 # This class forces XML data into a standard format (usually to UTF-8
1730 # or Unicode). It is heavily based on code from Mark Pilgrim's
1731 # Universal Feed Parser. It does not rewrite the XML or HTML to
1732 # reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
1733 # (XML) and BeautifulSoup.start_meta (HTML).
1734
1735 # Autodetects character encodings.
1736 # Download from http://chardet.feedparser.org/
1737 try:
1738 import chardet
1739 # import chardet.constants
1740 # chardet.constants._debug = 1
1741 except ImportError:
1742 chardet = None
1743
1744 # cjkcodecs and iconv_codec make Python know about more character encodings.
1745 # Both are available from http://cjkpython.i18n.org/
1746 # They're built in if you use Python 2.4.
1747 try:
1748 import cjkcodecs.aliases
1749 except ImportError:
1750 pass
1751 try:
1752 import iconv_codec
1753 except ImportError:
1754 pass
1755
1756 class UnicodeDammit:
1757 """A class for detecting the encoding of a *ML document and
1758 converting it to a Unicode string. If the source encoding is
1759 windows-1252, can replace MS smart quotes with their HTML or XML
1760 equivalents."""
1761
1762 # This dictionary maps commonly seen values for "charset" in HTML
1763 # meta tags to the corresponding Python codec names. It only covers
1764 # values that aren't in Python's aliases and can't be determined
1765 # by the heuristics in find_codec.
1766 CHARSET_ALIASES = { "macintosh" : "mac-roman",
1767 "x-sjis" : "shift-jis" }
1768
    def __init__(self, markup, overrideEncodings=[],
                 smartQuotesTo='xml', isHTML=False):
        """Sniff the markup's encoding and decode it to Unicode.

        Encodings are tried in priority order: caller-supplied
        overrideEncodings, then the encoding declared in the document,
        then the byte-pattern-sniffed encoding, then chardet's guess
        (if installed), then utf-8 and windows-1252 as last resorts.
        On success self.unicode holds the decoded text and
        self.originalEncoding the codec that worked; on total failure
        self.unicode is None.  The mutable default for
        overrideEncodings is safe here: it is only iterated, never
        mutated."""
        self.declaredHTMLEncoding = None
        self.markup, documentEncoding, sniffedEncoding = \
                     self._detectEncoding(markup, isHTML)
        self.smartQuotesTo = smartQuotesTo
        self.triedEncodings = []
        # Already-Unicode (or empty) input needs no detection at all.
        if markup == '' or isinstance(markup, unicode):
            self.originalEncoding = None
            self.unicode = unicode(markup)
            return

        u = None
        for proposedEncoding in overrideEncodings:
            u = self._convertFrom(proposedEncoding)
            if u: break
        if not u:
            for proposedEncoding in (documentEncoding, sniffedEncoding):
                u = self._convertFrom(proposedEncoding)
                if u: break

        # If no luck and we have auto-detection library, try that:
        if not u and chardet and not isinstance(self.markup, unicode):
            u = self._convertFrom(chardet.detect(self.markup)['encoding'])

        # As a last resort, try utf-8 and windows-1252:
        if not u:
            for proposed_encoding in ("utf-8", "windows-1252"):
                u = self._convertFrom(proposed_encoding)
                if u: break

        self.unicode = u
        if not u: self.originalEncoding = None
1802
1803 def _subMSChar(self, orig):
1804 """Changes a MS smart quote character to an XML or HTML
1805 entity."""
1806 sub = self.MS_CHARS.get(orig)
1807 if isinstance(sub, tuple):
1808 if self.smartQuotesTo == 'xml':
1809 sub = '&#x%s;' % sub[1]
1810 else:
1811 sub = '&%s;' % sub[0]
1812 return sub
1813
    def _convertFrom(self, proposed):
        """Try decoding self.markup with the proposed codec.

        Returns the Unicode markup on success, or None if the codec is
        unknown, was already tried, or failed to decode.  On success
        self.markup and self.originalEncoding are updated."""
        proposed = self.find_codec(proposed)
        if not proposed or proposed in self.triedEncodings:
            return None
        self.triedEncodings.append(proposed)
        markup = self.markup

        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if self.smartQuotesTo and proposed.lower() in("windows-1252",
                                                      "iso-8859-1",
                                                      "iso-8859-2"):
            markup = re.compile("([\x80-\x9f])").sub \
                     (lambda(x): self._subMSChar(x.group(1)),
                      markup)

        # Deliberately broad catch: any decode failure just means
        # "this wasn't the right encoding, try the next one".
        try:
            # print "Trying to convert document to %s" % proposed
            u = self._toUnicode(markup, proposed)
            self.markup = u
            self.originalEncoding = proposed
        except Exception, e:
            # print "That didn't work!"
            # print e
            return None
        #print "Correct encoding: %s" % proposed
        return self.markup
1841
1842 def _toUnicode(self, data, encoding):
1843 '''Given a string and its encoding, decodes the string into Unicode.
1844 %encoding is a string recognized by encodings.aliases'''
1845
1846 # strip Byte Order Mark (if present)
1847 if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
1848 and (data[2:4] != '\x00\x00'):
1849 encoding = 'utf-16be'
1850 data = data[2:]
1851 elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
1852 and (data[2:4] != '\x00\x00'):
1853 encoding = 'utf-16le'
1854 data = data[2:]
1855 elif data[:3] == '\xef\xbb\xbf':
1856 encoding = 'utf-8'
1857 data = data[3:]
1858 elif data[:4] == '\x00\x00\xfe\xff':
1859 encoding = 'utf-32be'
1860 data = data[4:]
1861 elif data[:4] == '\xff\xfe\x00\x00':
1862 encoding = 'utf-32le'
1863 data = data[4:]
1864 newdata = unicode(data, encoding)
1865 return newdata
1866
    def _detectEncoding(self, xml_data, isHTML=False):
        """Given a document, tries to detect its XML encoding.

        Returns a (xml_data, xml_encoding, sniffed_xml_encoding) tuple:
        the (possibly UTF-8-re-encoded) document, the encoding named in
        an XML declaration or HTML meta tag (if any), and the encoding
        sniffed from the document's leading bytes (if any).
        """
        xml_encoding = sniffed_xml_encoding = None
        try:
            # Sniff the byte-order mark / first-bytes signature and
            # normalize the document to UTF-8 so the declaration regex
            # below can match against a plain byte string.
            if xml_data[:4] == '\x4c\x6f\xa7\x94':
                # EBCDIC ("<?xm" in EBCDIC bytes)
                xml_data = self._ebcdic_to_ascii(xml_data)
            elif xml_data[:4] == '\x00\x3c\x00\x3f':
                # UTF-16BE
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
                    and (xml_data[2:4] != '\x00\x00'):
                # UTF-16BE with BOM
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x3f\x00':
                # UTF-16LE
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
                    (xml_data[2:4] != '\x00\x00'):
                # UTF-16LE with BOM
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\x00\x3c':
                # UTF-32BE
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x00\x00':
                # UTF-32LE
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\xfe\xff':
                # UTF-32BE with BOM
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\xff\xfe\x00\x00':
                # UTF-32LE with BOM
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
            elif xml_data[:3] == '\xef\xbb\xbf':
                # UTF-8 with BOM
                sniffed_xml_encoding = 'utf-8'
                xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
            else:
                sniffed_xml_encoding = 'ascii'
                pass
        except:
            # Best-effort sniffing: any decode error leaves xml_data as
            # passed in and falls through to the declaration scan below.
            xml_encoding_match = None
        # NOTE(review): the assignment inside the except clause above is
        # immediately overwritten here, so sniffing failures are not
        # distinguishable at this point.
        xml_encoding_match = re.compile(
            '^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
        if not xml_encoding_match and isHTML:
            # No XML declaration; fall back to an HTML <meta charset=...>.
            regexp = re.compile('<\s*meta[^>]+charset=([^>]*?)[;\'">]', re.I)
            xml_encoding_match = regexp.search(xml_data)
        if xml_encoding_match is not None:
            xml_encoding = xml_encoding_match.groups()[0].lower()
            if isHTML:
                self.declaredHTMLEncoding = xml_encoding
            if sniffed_xml_encoding and \
               (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
                                 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
                                 'utf-16', 'utf-32', 'utf_16', 'utf_32',
                                 'utf16', 'u16')):
                # A sniffed BOM beats a declaration that merely names a
                # UTF family, since the declaration cannot convey the
                # actual byte order.
                xml_encoding = sniffed_xml_encoding
        return xml_data, xml_encoding, sniffed_xml_encoding
1933
1934
1935 def find_codec(self, charset):
1936 return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
1937 or (charset and self._codec(charset.replace("-", ""))) \
1938 or (charset and self._codec(charset.replace("-", "_"))) \
1939 or charset
1940
1941 def _codec(self, charset):
1942 if not charset: return charset
1943 codec = None
1944 try:
1945 codecs.lookup(charset)
1946 codec = charset
1947 except (LookupError, ValueError):
1948 pass
1949 return codec
1950
    # Lazily-built translation table shared by all instances.
    EBCDIC_TO_ASCII_MAP = None
    def _ebcdic_to_ascii(self, s):
        """Translate the EBCDIC byte string *s* to its ASCII equivalent.

        The 256-entry translation table is built on first use and cached
        on the class (EBCDIC_TO_ASCII_MAP) for reuse.
        """
        c = self.__class__
        if not c.EBCDIC_TO_ASCII_MAP:
            # EBCDIC code point -> ASCII/Latin-1 code point, indexed 0-255.
            emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
                    16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
                    128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
                    144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
                    32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
                    38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
                    45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
                    186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
                    195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
                    201,202,106,107,108,109,110,111,112,113,114,203,204,205,
                    206,207,208,209,126,115,116,117,118,119,120,121,122,210,
                    211,212,213,214,215,216,217,218,219,220,221,222,223,224,
                    225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
                    73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
                    82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
                    90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
                    250,251,252,253,254,255)
            import string
            c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
                ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
        return s.translate(c.EBCDIC_TO_ASCII_MAP)
1976
    # Windows-1252 "smart punctuation" bytes (0x80-0x9F) mapped to
    # replacements. Tuple entries are (HTML entity name, hex codepoint)
    # and are rendered by _subMSChar; plain-string entries ('?', ' ')
    # are substituted verbatim.
    MS_CHARS = { '\x80' : ('euro', '20AC'),
                 '\x81' : ' ',
                 '\x82' : ('sbquo', '201A'),
                 '\x83' : ('fnof', '192'),
                 '\x84' : ('bdquo', '201E'),
                 '\x85' : ('hellip', '2026'),
                 '\x86' : ('dagger', '2020'),
                 '\x87' : ('Dagger', '2021'),
                 '\x88' : ('circ', '2C6'),
                 '\x89' : ('permil', '2030'),
                 '\x8A' : ('Scaron', '160'),
                 '\x8B' : ('lsaquo', '2039'),
                 '\x8C' : ('OElig', '152'),
                 '\x8D' : '?',
                 '\x8E' : ('#x17D', '17D'),
                 '\x8F' : '?',
                 '\x90' : '?',
                 '\x91' : ('lsquo', '2018'),
                 '\x92' : ('rsquo', '2019'),
                 '\x93' : ('ldquo', '201C'),
                 '\x94' : ('rdquo', '201D'),
                 '\x95' : ('bull', '2022'),
                 '\x96' : ('ndash', '2013'),
                 '\x97' : ('mdash', '2014'),
                 '\x98' : ('tilde', '2DC'),
                 '\x99' : ('trade', '2122'),
                 '\x9a' : ('scaron', '161'),
                 '\x9b' : ('rsaquo', '203A'),
                 '\x9c' : ('oelig', '153'),
                 '\x9d' : '?',
                 '\x9e' : ('#x17E', '17E'),
                 '\x9f' : ('Yuml', ''),}
2009
2010 #######################################################################
2011
2012
2013 #By default, act as an HTML pretty-printer.
2014 if __name__ == '__main__':
2015 import sys
2016 soup = BeautifulSoup(sys.stdin)
2017 print soup.prettify()