Mercurial > repos > shellac > guppy_basecaller
comparison env/lib/python3.7/site-packages/bs4/builder/_lxml.py @ 0:26e78fe6e8c4 draft
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
| author | shellac |
|---|---|
| date | Sat, 02 May 2020 07:14:21 -0400 |
| parents | |
| children |
comparison legend: equal | deleted | inserted | replaced
| -1:000000000000 | 0:26e78fe6e8c4 |
|---|---|
| 1 # Use of this source code is governed by the MIT license. | |
| 2 __license__ = "MIT" | |
| 3 | |
| 4 __all__ = [ | |
| 5 'LXMLTreeBuilderForXML', | |
| 6 'LXMLTreeBuilder', | |
| 7 ] | |
| 8 | |
| 9 try: | |
| 10 from collections.abc import Callable # Python 3.6 | |
| 11 except ImportError as e: | |
| 12 from collections import Callable | |
| 13 | |
| 14 from io import BytesIO | |
| 15 from io import StringIO | |
| 16 from lxml import etree | |
| 17 from bs4.element import ( | |
| 18 Comment, | |
| 19 Doctype, | |
| 20 NamespacedAttribute, | |
| 21 ProcessingInstruction, | |
| 22 XMLProcessingInstruction, | |
| 23 ) | |
| 24 from bs4.builder import ( | |
| 25 FAST, | |
| 26 HTML, | |
| 27 HTMLTreeBuilder, | |
| 28 PERMISSIVE, | |
| 29 ParserRejectedMarkup, | |
| 30 TreeBuilder, | |
| 31 XML) | |
| 32 from bs4.dammit import EncodingDetector | |
| 33 | |
| 34 LXML = 'lxml' | |
| 35 | |
| 36 def _invert(d): | |
| 37 "Invert a dictionary." | |
| 38 return dict((v,k) for k, v in list(d.items())) | |
| 39 | |
class LXMLTreeBuilderForXML(TreeBuilder):
    """A TreeBuilder that drives lxml's XMLParser in event-target mode.

    This object plays two roles: it is the Beautiful Soup tree builder,
    and it is also the lxml parser *target* -- lxml calls the event
    methods (start/end/data/pi/doctype/comment/close) on this object,
    which forwards them to the BeautifulSoup object being built.
    """

    # Parser class used when no parser (or a parser class) is supplied.
    DEFAULT_PARSER_CLASS = etree.XMLParser

    is_xml = True
    processing_instruction_class = XMLProcessingInstruction

    NAME = "lxml-xml"
    ALTERNATE_NAMES = ["xml"]

    # Well, it's permissive by XML parser standards.
    features = [NAME, LXML, XML, FAST, PERMISSIVE]

    # Markup is fed to lxml in chunks of this many bytes/characters
    # (see feed()).
    CHUNK_SIZE = 512

    # This namespace mapping is specified in the XML Namespace
    # standard.
    DEFAULT_NSMAPS = dict(xml='http://www.w3.org/XML/1998/namespace')

    # Inverted form (URI -> prefix), used as the base of the nsmaps stack.
    DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS)

    # NOTE: If we parsed Element objects and looked at .sourceline,
    # we'd be able to see the line numbers from the original document.
    # But instead we build an XMLParser or HTMLParser object to serve
    # as the target of parse messages, and those messages don't include
    # line numbers.
    # See: https://bugs.launchpad.net/lxml/+bug/1846906

    def initialize_soup(self, soup):
        """Let the BeautifulSoup object know about the standard namespace
        mapping.

        :param soup: A `BeautifulSoup`.
        """
        super(LXMLTreeBuilderForXML, self).initialize_soup(soup)
        self._register_namespaces(self.DEFAULT_NSMAPS)

    def _register_namespaces(self, mapping):
        """Let the BeautifulSoup object know about namespaces encountered
        while parsing the document.

        This might be useful later on when creating CSS selectors.

        :param mapping: A dictionary mapping namespace prefixes to URIs.
        """
        for key, value in list(mapping.items()):
            if key and key not in self.soup._namespaces:
                # Let the BeautifulSoup object know about a new namespace.
                # If there are multiple namespaces defined with the same
                # prefix, the first one in the document takes precedence.
                self.soup._namespaces[key] = value

    def default_parser(self, encoding):
        """Find the default parser for the given encoding.

        :param encoding: A string.
        :return: Either a parser object or a class, which
          will be instantiated with default arguments.
        """
        if self._default_parser is not None:
            return self._default_parser
        return etree.XMLParser(
            target=self, strip_cdata=False, recover=True, encoding=encoding)

    def parser_for(self, encoding):
        """Instantiate an appropriate parser for the given encoding.

        :param encoding: A string.
        :return: A parser object such as an `etree.XMLParser`.
        """
        # Use the default parser.
        parser = self.default_parser(encoding)

        if isinstance(parser, Callable):
            # Instantiate the parser with default arguments
            parser = parser(
                target=self, strip_cdata=False, recover=True, encoding=encoding
            )
        return parser

    def __init__(self, parser=None, empty_element_tags=None, **kwargs):
        """Constructor.

        :param parser: An lxml parser object or class, or None to use
            a default XMLParser (see default_parser()).
        :param empty_element_tags: Optional iterable of tag names to be
            treated as empty-element tags.
        :param kwargs: Passed through to the TreeBuilder constructor.
        """
        # TODO: Issue a warning if parser is present but not a
        # callable, since that means there's no way to create new
        # parsers for different encodings.
        self._default_parser = parser
        if empty_element_tags is not None:
            self.empty_element_tags = set(empty_element_tags)
        self.soup = None
        # Stack of inverted (URI -> prefix) namespace maps; one entry is
        # pushed per tag that is tracked for namespace scoping (see start()).
        self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
        super(LXMLTreeBuilderForXML, self).__init__(**kwargs)

    def _getNsTag(self, tag):
        # Split the namespace URL out of a fully-qualified lxml tag
        # name. Copied from lxml's src/lxml/sax.py.
        # lxml spells qualified names as '{namespace-uri}localname'.
        if tag[0] == '{':
            return tuple(tag[1:].split('}', 1))
        else:
            return (None, tag)

    def prepare_markup(self, markup, user_specified_encoding=None,
                       exclude_encodings=None,
                       document_declared_encoding=None):
        """Run any preliminary steps necessary to make incoming markup
        acceptable to the parser.

        lxml really wants to get a bytestring and convert it to
        Unicode itself. So instead of using UnicodeDammit to convert
        the bytestring to Unicode using different encodings, this
        implementation uses EncodingDetector to iterate over the
        encodings, and tell lxml to try to parse the document as each
        one in turn.

        :param markup: Some markup -- hopefully a bytestring.
        :param user_specified_encoding: The user asked to try this encoding.
        :param document_declared_encoding: The markup itself claims to be
            in this encoding.
        :param exclude_encodings: The user asked _not_ to try any of
            these encodings.

        :yield: A series of 4-tuples:
         (markup, encoding, declared encoding,
          has undergone character replacement)

         Each 4-tuple represents a strategy for converting the
         document to Unicode and parsing it. Each strategy will be tried
         in turn.
        """
        is_html = not self.is_xml
        if is_html:
            self.processing_instruction_class = ProcessingInstruction
        else:
            self.processing_instruction_class = XMLProcessingInstruction

        if isinstance(markup, str):
            # We were given Unicode. Maybe lxml can parse Unicode on
            # this system?
            yield markup, None, document_declared_encoding, False

        if isinstance(markup, str):
            # No, apparently not. Convert the Unicode to UTF-8 and
            # tell lxml to parse it as UTF-8.
            yield (markup.encode("utf8"), "utf8",
                   document_declared_encoding, False)

        try_encodings = [user_specified_encoding, document_declared_encoding]
        detector = EncodingDetector(
            markup, try_encodings, is_html, exclude_encodings)
        for encoding in detector.encodings:
            yield (detector.markup, encoding, document_declared_encoding, False)

    def feed(self, markup):
        """Run incoming markup through the XML parser, chunk by chunk.

        :param markup: A bytestring, string, or file-like object.
        :raise ParserRejectedMarkup: If lxml cannot parse the markup.
        """
        if isinstance(markup, bytes):
            markup = BytesIO(markup)
        elif isinstance(markup, str):
            markup = StringIO(markup)

        # Call feed() at least once, even if the markup is empty,
        # or the parser won't be initialized.
        data = markup.read(self.CHUNK_SIZE)
        try:
            self.parser = self.parser_for(self.soup.original_encoding)
            self.parser.feed(data)
            while len(data) != 0:
                # Now call feed() on the rest of the data, chunk by chunk.
                data = markup.read(self.CHUNK_SIZE)
                if len(data) != 0:
                    self.parser.feed(data)
            self.parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
            # Wrap parser-level failures so BeautifulSoup can try the
            # next (markup, encoding) strategy from prepare_markup().
            raise ParserRejectedMarkup(e)

    def close(self):
        """lxml event: end of document. Reset the namespace stack."""
        self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]

    def start(self, name, attrs, nsmap={}):
        """lxml event: an opening tag was encountered.

        :param name: Tag name, possibly in '{uri}localname' form.
        :param attrs: The tag's attributes (may be an immutable proxy).
        :param nsmap: Prefix -> URI mapping for namespaces newly declared
            on this tag. Only read here, never mutated, so the mutable
            default is safe.
        """
        # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
        attrs = dict(attrs)
        nsprefix = None
        # Invert each namespace map as it comes in.
        if len(nsmap) == 0 and len(self.nsmaps) > 1:
            # There are no new namespaces for this tag, but
            # non-default namespaces are in play, so we need a
            # separate tag stack to know when they end.
            self.nsmaps.append(None)
        elif len(nsmap) > 0:
            # A new namespace mapping has come into play.

            # First, Let the BeautifulSoup object know about it.
            self._register_namespaces(nsmap)

            # Then, add it to our running list of inverted namespace
            # mappings.
            self.nsmaps.append(_invert(nsmap))

            # Also treat the namespace mapping as a set of attributes on the
            # tag, so we can recreate it later.
            attrs = attrs.copy()
            for prefix, namespace in list(nsmap.items()):
                attribute = NamespacedAttribute(
                    "xmlns", prefix, "http://www.w3.org/2000/xmlns/")
                attrs[attribute] = namespace

        # Namespaces are in play. Find any attributes that came in
        # from lxml with namespaces attached to their names, and
        # turn then into NamespacedAttribute objects.
        new_attrs = {}
        for attr, value in list(attrs.items()):
            namespace, attr = self._getNsTag(attr)
            if namespace is None:
                new_attrs[attr] = value
            else:
                nsprefix = self._prefix_for_namespace(namespace)
                attr = NamespacedAttribute(nsprefix, attr, namespace)
                new_attrs[attr] = value
        attrs = new_attrs

        namespace, name = self._getNsTag(name)
        nsprefix = self._prefix_for_namespace(namespace)
        self.soup.handle_starttag(name, namespace, nsprefix, attrs)

    def _prefix_for_namespace(self, namespace):
        """Find the currently active prefix for the given namespace."""
        if namespace is None:
            return None
        # Walk the stack innermost-first so the nearest enclosing
        # declaration of the namespace wins.
        for inverted_nsmap in reversed(self.nsmaps):
            if inverted_nsmap is not None and namespace in inverted_nsmap:
                return inverted_nsmap[namespace]
        return None

    def end(self, name):
        """lxml event: a closing tag was encountered.

        :param name: Tag name, possibly in '{uri}localname' form.
        """
        self.soup.endData()
        completed_tag = self.soup.tagStack[-1]
        namespace, name = self._getNsTag(name)
        nsprefix = None
        if namespace is not None:
            # Find the prefix for this namespace, innermost scope first.
            for inverted_nsmap in reversed(self.nsmaps):
                if inverted_nsmap is not None and namespace in inverted_nsmap:
                    nsprefix = inverted_nsmap[namespace]
                    break
        self.soup.handle_endtag(name, nsprefix)
        if len(self.nsmaps) > 1:
            # This tag, or one of its parents, introduced a namespace
            # mapping, so pop it off the stack.
            self.nsmaps.pop()

    def pi(self, target, data):
        """lxml event: a processing instruction was encountered."""
        self.soup.endData()
        self.soup.handle_data(target + ' ' + data)
        self.soup.endData(self.processing_instruction_class)

    def data(self, content):
        """lxml event: textual content inside a tag."""
        self.soup.handle_data(content)

    def doctype(self, name, pubid, system):
        """lxml event: a DOCTYPE declaration was encountered."""
        self.soup.endData()
        doctype = Doctype.for_name_and_ids(name, pubid, system)
        self.soup.object_was_parsed(doctype)

    def comment(self, content):
        "Handle comments as Comment objects."
        self.soup.endData()
        self.soup.handle_data(content)
        self.soup.endData(Comment)

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
| 306 | |
| 307 | |
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
    """The lxml-based tree builder for HTML documents.

    Inherits the event-target machinery (start/end/data/etc.) from
    LXMLTreeBuilderForXML, but parses with lxml's HTMLParser instead.
    """

    NAME = LXML
    ALTERNATE_NAMES = ["lxml-html"]

    features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]
    is_xml = False
    processing_instruction_class = ProcessingInstruction

    def default_parser(self, encoding):
        """Return the parser class to use for HTML documents.

        :param encoding: Ignored; parser_for() instantiates the class
            with the appropriate encoding.
        :return: The etree.HTMLParser class.
        """
        return etree.HTMLParser

    def feed(self, markup):
        """Run incoming markup through the lxml HTML parser in one pass.

        :param markup: The markup to parse.
        :raise ParserRejectedMarkup: If lxml cannot parse the markup.
        """
        encoding = self.soup.original_encoding
        try:
            html_parser = self.parser_for(encoding)
            self.parser = html_parser
            html_parser.feed(markup)
            html_parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
            # Signal BeautifulSoup to try another parsing strategy.
            raise ParserRejectedMarkup(e)

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return '<html><body>%s</body></html>' % fragment
