comparison env/lib/python3.9/site-packages/bs4/formatter.py @ 0:4f3585e2f14b draft default tip

"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author shellac
date Mon, 22 Mar 2021 18:12:50 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4f3585e2f14b
1 from bs4.dammit import EntitySubstitution
2
class Formatter(EntitySubstitution):
    """A strategy for turning a parse tree back into a string.

    Part of the strategy follows from the differences between HTML4,
    HTML5 and XML; the rest can be configured by the caller.

    A formatter is supplied through the `formatter` argument of
    methods such as `PageElement.encode`. Most callers never need to
    build a Formatter themselves: passing one of the predefined names
    below as `formatter` is usually enough.

    For HTML documents:
     * 'html' - HTML entity substitution for generic HTML documents. (default)
     * 'html5' - HTML entity substitution for HTML5 documents.
     * 'minimal' - Only make the substitutions necessary to guarantee
                   valid HTML.
     * None - Do not perform any substitution. This will be faster
              but may result in invalid markup.

    For XML documents:
     * 'html' - Entity substitution for XHTML documents.
     * 'minimal' - Only make the substitutions necessary to guarantee
                   valid XML. (default)
     * None - Do not perform any substitution. This will be faster
              but may result in invalid markup.
    """
    # Registries of XML and HTML formatters.
    XML_FORMATTERS = {}
    HTML_FORMATTERS = {}

    # Values accepted for the `language` constructor argument.
    HTML = 'html'
    XML = 'xml'

    # Fallback settings used for any language other than XML.
    # NOTE: the set stored here is handed out as-is by _default(), so
    # every formatter relying on the default shares this one object.
    HTML_DEFAULTS = {
        'cdata_containing_tags': {"script", "style"},
    }

    def _default(self, language, value, kwarg):
        """Resolve a constructor setting that may have been omitted.

        :param language: The language the formatter is being built for.
        :param value: The value the caller supplied, or None.
        :param kwarg: The name of the setting, used as the key into
            HTML_DEFAULTS.
        :return: `value` when it is not None; otherwise an empty set
            for XML, or the HTML_DEFAULTS entry for anything else.
        """
        if value is None:
            if language == self.XML:
                value = set()
            else:
                value = self.HTML_DEFAULTS[kwarg]
        return value

    def __init__(
            self, language=None, entity_substitution=None,
            void_element_close_prefix='/', cdata_containing_tags=None,
    ):
        """Constructor.

        :param language: Formatter.XML when formatting XML markup,
            Formatter.HTML when formatting HTML markup.

        :param entity_substitution: A callable that replaces special
            characters with XML/HTML entities. For examples, see
            bs4.dammit.EntitySubstitution.substitute_html and
            substitute_xml.
        :param void_element_close_prefix: By default, void elements
            are represented as <tag/> (XML rules) rather than <tag>
            (HTML rules). To get <tag>, pass in the empty string.
        :param cdata_containing_tags: The tags defined as containing
            CDATA in this dialect. In HTML, for example, <script> and
            <style> contain CDATA and their contents should not be
            formatted. When omitted, a per-language default is used.
        """
        resolved_cdata_tags = self._default(
            language, cdata_containing_tags, 'cdata_containing_tags'
        )
        self.language = language
        self.entity_substitution = entity_substitution
        self.void_element_close_prefix = void_element_close_prefix
        self.cdata_containing_tags = resolved_cdata_tags

    def substitute(self, ns):
        """Run entity substitution over a string.

        The string may come from an attribute value or from text.

        :param ns: A string.
        :return: `ns` unchanged when no substitution function is
            configured, or when `ns` is a NavigableString whose parent
            tag is listed in `cdata_containing_tags`; otherwise the
            result of calling the substitution function on `ns`.
        """
        substitutor = self.entity_substitution
        if not substitutor:
            return ns
        # Imported here rather than at module level, as in the
        # original code.
        from .element import NavigableString
        if isinstance(ns, NavigableString):
            parent = ns.parent
            if (parent is not None
                    and parent.name in self.cdata_containing_tags):
                # Text inside a CDATA-containing tag is left alone.
                return ns
        return substitutor(ns)

    def attribute_value(self, value):
        """Process the value of an attribute.

        :param value: A string.
        :return: The result of passing `value` through `substitute`.
        """
        return self.substitute(value)

    def attributes(self, tag):
        """Return a tag's attributes in the order they should be output.

        Override this to reorder attributes however you want. By
        default, attributes are sorted alphabetically. This makes
        behavior consistent between Python 2 and Python 3, and
        preserves backwards compatibility with older versions of
        Beautiful Soup.

        :param tag: An object with an `attrs` mapping (or None).
        :return: A sorted list of (name, value) pairs; an empty list
            when `tag.attrs` is None.
        """
        attrs = tag.attrs
        if attrs is None:
            return []
        return sorted(attrs.items())
114
115
class HTMLFormatter(Formatter):
    """A generic Formatter for HTML."""

    # Named HTML formatters, keyed by the name callers pass as `formatter`.
    REGISTRY = {}

    def __init__(self, *args, **kwargs):
        """Build a Formatter whose language is fixed to Formatter.HTML.

        All positional and keyword arguments are forwarded to
        Formatter.__init__ after the language.
        """
        super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs)
121
122
class XMLFormatter(Formatter):
    """A generic Formatter for XML."""

    # Named XML formatters, keyed by the name callers pass as `formatter`.
    REGISTRY = {}

    def __init__(self, *args, **kwargs):
        """Build a Formatter whose language is fixed to Formatter.XML.

        All positional and keyword arguments are forwarded to
        Formatter.__init__ after the language.
        """
        super(XMLFormatter, self).__init__(self.XML, *args, **kwargs)
128
129
# Set up aliases for the default formatters. Each registry maps the
# name a caller passes as `formatter` (or None) to a ready-made
# formatter instance for that language.
HTMLFormatter.REGISTRY['html'] = HTMLFormatter(
    entity_substitution=EntitySubstitution.substitute_html
)
HTMLFormatter.REGISTRY["html5"] = HTMLFormatter(
    entity_substitution=EntitySubstitution.substitute_html,
    # NOTE(review): None rather than '' is passed here; presumably the
    # serializer treats both as "no prefix" -- confirm against the
    # code that reads void_element_close_prefix.
    void_element_close_prefix=None
)
HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter(
    entity_substitution=EntitySubstitution.substitute_xml
)
HTMLFormatter.REGISTRY[None] = HTMLFormatter(
    entity_substitution=None
)
XMLFormatter.REGISTRY["html"] = XMLFormatter(
    entity_substitution=EntitySubstitution.substitute_html
)
XMLFormatter.REGISTRY["minimal"] = XMLFormatter(
    entity_substitution=EntitySubstitution.substitute_xml
)
# Register a real XMLFormatter for the "no substitution" case so that
# its `language` is Formatter.XML and its CDATA tag set is the empty
# XML default, like every other entry in this registry. Passing a
# Formatter instance as the `language` argument would leave `language`
# holding an object and select the HTML defaults instead.
XMLFormatter.REGISTRY[None] = XMLFormatter(
    entity_substitution=None
)