Mercurial > repos > shellac > guppy_basecaller
diff env/lib/python3.7/site-packages/bs4/builder/_lxml.py @ 5:9b1c78e6ba9c draft default tip
"planemo upload commit 6c0a8142489327ece472c84e558c47da711a9142"
| author | shellac |
|---|---|
| date | Mon, 01 Jun 2020 08:59:25 -0400 |
| parents | 79f47841a781 |
| children |
line wrap: on
line diff
--- a/env/lib/python3.7/site-packages/bs4/builder/_lxml.py Thu May 14 16:47:39 2020 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,332 +0,0 @@ -# Use of this source code is governed by the MIT license. -__license__ = "MIT" - -__all__ = [ - 'LXMLTreeBuilderForXML', - 'LXMLTreeBuilder', - ] - -try: - from collections.abc import Callable # Python 3.6 -except ImportError as e: - from collections import Callable - -from io import BytesIO -from io import StringIO -from lxml import etree -from bs4.element import ( - Comment, - Doctype, - NamespacedAttribute, - ProcessingInstruction, - XMLProcessingInstruction, -) -from bs4.builder import ( - FAST, - HTML, - HTMLTreeBuilder, - PERMISSIVE, - ParserRejectedMarkup, - TreeBuilder, - XML) -from bs4.dammit import EncodingDetector - -LXML = 'lxml' - -def _invert(d): - "Invert a dictionary." - return dict((v,k) for k, v in list(d.items())) - -class LXMLTreeBuilderForXML(TreeBuilder): - DEFAULT_PARSER_CLASS = etree.XMLParser - - is_xml = True - processing_instruction_class = XMLProcessingInstruction - - NAME = "lxml-xml" - ALTERNATE_NAMES = ["xml"] - - # Well, it's permissive by XML parser standards. - features = [NAME, LXML, XML, FAST, PERMISSIVE] - - CHUNK_SIZE = 512 - - # This namespace mapping is specified in the XML Namespace - # standard. - DEFAULT_NSMAPS = dict(xml='http://www.w3.org/XML/1998/namespace') - - DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS) - - # NOTE: If we parsed Element objects and looked at .sourceline, - # we'd be able to see the line numbers from the original document. - # But instead we build an XMLParser or HTMLParser object to serve - # as the target of parse messages, and those messages don't include - # line numbers. - # See: https://bugs.launchpad.net/lxml/+bug/1846906 - - def initialize_soup(self, soup): - """Let the BeautifulSoup object know about the standard namespace - mapping. - - :param soup: A `BeautifulSoup`. - """ - super(LXMLTreeBuilderForXML, self).initialize_soup(soup) - self._register_namespaces(self.DEFAULT_NSMAPS) - - def _register_namespaces(self, mapping): - """Let the BeautifulSoup object know about namespaces encountered - while parsing the document. - - This might be useful later on when creating CSS selectors. - - :param mapping: A dictionary mapping namespace prefixes to URIs. - """ - for key, value in list(mapping.items()): - if key and key not in self.soup._namespaces: - # Let the BeautifulSoup object know about a new namespace. - # If there are multiple namespaces defined with the same - # prefix, the first one in the document takes precedence. - self.soup._namespaces[key] = value - - def default_parser(self, encoding): - """Find the default parser for the given encoding. - - :param encoding: A string. - :return: Either a parser object or a class, which - will be instantiated with default arguments. - """ - if self._default_parser is not None: - return self._default_parser - return etree.XMLParser( - target=self, strip_cdata=False, recover=True, encoding=encoding) - - def parser_for(self, encoding): - """Instantiate an appropriate parser for the given encoding. - - :param encoding: A string. - :return: A parser object such as an `etree.XMLParser`. - """ - # Use the default parser. - parser = self.default_parser(encoding) - - if isinstance(parser, Callable): - # Instantiate the parser with default arguments - parser = parser( - target=self, strip_cdata=False, recover=True, encoding=encoding - ) - return parser - - def __init__(self, parser=None, empty_element_tags=None, **kwargs): - # TODO: Issue a warning if parser is present but not a - # callable, since that means there's no way to create new - # parsers for different encodings. - self._default_parser = parser - if empty_element_tags is not None: - self.empty_element_tags = set(empty_element_tags) - self.soup = None - self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED] - super(LXMLTreeBuilderForXML, self).__init__(**kwargs) - - def _getNsTag(self, tag): - # Split the namespace URL out of a fully-qualified lxml tag - # name. Copied from lxml's src/lxml/sax.py. - if tag[0] == '{': - return tuple(tag[1:].split('}', 1)) - else: - return (None, tag) - - def prepare_markup(self, markup, user_specified_encoding=None, - exclude_encodings=None, - document_declared_encoding=None): - """Run any preliminary steps necessary to make incoming markup - acceptable to the parser. - - lxml really wants to get a bytestring and convert it to - Unicode itself. So instead of using UnicodeDammit to convert - the bytestring to Unicode using different encodings, this - implementation uses EncodingDetector to iterate over the - encodings, and tell lxml to try to parse the document as each - one in turn. - - :param markup: Some markup -- hopefully a bytestring. - :param user_specified_encoding: The user asked to try this encoding. - :param document_declared_encoding: The markup itself claims to be - in this encoding. - :param exclude_encodings: The user asked _not_ to try any of - these encodings. - - :yield: A series of 4-tuples: - (markup, encoding, declared encoding, - has undergone character replacement) - - Each 4-tuple represents a strategy for converting the - document to Unicode and parsing it. Each strategy will be tried - in turn. - """ - is_html = not self.is_xml - if is_html: - self.processing_instruction_class = ProcessingInstruction - else: - self.processing_instruction_class = XMLProcessingInstruction - - if isinstance(markup, str): - # We were given Unicode. Maybe lxml can parse Unicode on - # this system? - yield markup, None, document_declared_encoding, False - - if isinstance(markup, str): - # No, apparently not. Convert the Unicode to UTF-8 and - # tell lxml to parse it as UTF-8. - yield (markup.encode("utf8"), "utf8", - document_declared_encoding, False) - - try_encodings = [user_specified_encoding, document_declared_encoding] - detector = EncodingDetector( - markup, try_encodings, is_html, exclude_encodings) - for encoding in detector.encodings: - yield (detector.markup, encoding, document_declared_encoding, False) - - def feed(self, markup): - if isinstance(markup, bytes): - markup = BytesIO(markup) - elif isinstance(markup, str): - markup = StringIO(markup) - - # Call feed() at least once, even if the markup is empty, - # or the parser won't be initialized. - data = markup.read(self.CHUNK_SIZE) - try: - self.parser = self.parser_for(self.soup.original_encoding) - self.parser.feed(data) - while len(data) != 0: - # Now call feed() on the rest of the data, chunk by chunk. - data = markup.read(self.CHUNK_SIZE) - if len(data) != 0: - self.parser.feed(data) - self.parser.close() - except (UnicodeDecodeError, LookupError, etree.ParserError) as e: - raise ParserRejectedMarkup(e) - - def close(self): - self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED] - - def start(self, name, attrs, nsmap={}): - # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy. - attrs = dict(attrs) - nsprefix = None - # Invert each namespace map as it comes in. - if len(nsmap) == 0 and len(self.nsmaps) > 1: - # There are no new namespaces for this tag, but - # non-default namespaces are in play, so we need a - # separate tag stack to know when they end. - self.nsmaps.append(None) - elif len(nsmap) > 0: - # A new namespace mapping has come into play. - - # First, Let the BeautifulSoup object know about it. - self._register_namespaces(nsmap) - - # Then, add it to our running list of inverted namespace - # mappings. - self.nsmaps.append(_invert(nsmap)) - - # Also treat the namespace mapping as a set of attributes on the - # tag, so we can recreate it later. - attrs = attrs.copy() - for prefix, namespace in list(nsmap.items()): - attribute = NamespacedAttribute( - "xmlns", prefix, "http://www.w3.org/2000/xmlns/") - attrs[attribute] = namespace - - # Namespaces are in play. Find any attributes that came in - # from lxml with namespaces attached to their names, and - # turn then into NamespacedAttribute objects. - new_attrs = {} - for attr, value in list(attrs.items()): - namespace, attr = self._getNsTag(attr) - if namespace is None: - new_attrs[attr] = value - else: - nsprefix = self._prefix_for_namespace(namespace) - attr = NamespacedAttribute(nsprefix, attr, namespace) - new_attrs[attr] = value - attrs = new_attrs - - namespace, name = self._getNsTag(name) - nsprefix = self._prefix_for_namespace(namespace) - self.soup.handle_starttag(name, namespace, nsprefix, attrs) - - def _prefix_for_namespace(self, namespace): - """Find the currently active prefix for the given namespace.""" - if namespace is None: - return None - for inverted_nsmap in reversed(self.nsmaps): - if inverted_nsmap is not None and namespace in inverted_nsmap: - return inverted_nsmap[namespace] - return None - - def end(self, name): - self.soup.endData() - completed_tag = self.soup.tagStack[-1] - namespace, name = self._getNsTag(name) - nsprefix = None - if namespace is not None: - for inverted_nsmap in reversed(self.nsmaps): - if inverted_nsmap is not None and namespace in inverted_nsmap: - nsprefix = inverted_nsmap[namespace] - break - self.soup.handle_endtag(name, nsprefix) - if len(self.nsmaps) > 1: - # This tag, or one of its parents, introduced a namespace - # mapping, so pop it off the stack. - self.nsmaps.pop() - - def pi(self, target, data): - self.soup.endData() - self.soup.handle_data(target + ' ' + data) - self.soup.endData(self.processing_instruction_class) - - def data(self, content): - self.soup.handle_data(content) - - def doctype(self, name, pubid, system): - self.soup.endData() - doctype = Doctype.for_name_and_ids(name, pubid, system) - self.soup.object_was_parsed(doctype) - - def comment(self, content): - "Handle comments as Comment objects." - self.soup.endData() - self.soup.handle_data(content) - self.soup.endData(Comment) - - def test_fragment_to_document(self, fragment): - """See `TreeBuilder`.""" - return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment - - -class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): - - NAME = LXML - ALTERNATE_NAMES = ["lxml-html"] - - features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE] - is_xml = False - processing_instruction_class = ProcessingInstruction - - def default_parser(self, encoding): - return etree.HTMLParser - - def feed(self, markup): - encoding = self.soup.original_encoding - try: - self.parser = self.parser_for(encoding) - self.parser.feed(markup) - self.parser.close() - except (UnicodeDecodeError, LookupError, etree.ParserError) as e: - raise ParserRejectedMarkup(e) - - - def test_fragment_to_document(self, fragment): - """See `TreeBuilder`.""" - return '<html><body>%s</body></html>' % fragment
