comparison env/lib/python3.9/site-packages/soupsieve/css_parser.py @ 0:4f3585e2f14b draft default tip

"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author shellac
date Mon, 22 Mar 2021 18:12:50 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4f3585e2f14b
1 """CSS selector parser."""
2 import re
3 from functools import lru_cache
4 from . import util
5 from . import css_match as cm
6 from . import css_types as ct
7 from .util import SelectorSyntaxError
8 import warnings
9
# Code point used in place of a null character
UNICODE_REPLACEMENT_CHAR = 0xFFFD

# Parameterless pseudo-classes
PSEUDO_SIMPLE = {
    ':any-link',
    ':checked',
    ':default',
    ':defined',
    ':disabled',
    ':empty',
    ':enabled',
    ':first-child',
    ':first-of-type',
    ':in-range',
    ':indeterminate',
    ':last-child',
    ':last-of-type',
    ':link',
    ':only-child',
    ':only-of-type',
    ':optional',
    ':out-of-range',
    ':placeholder-shown',
    ':read-only',
    ':read-write',
    ':required',
    ':root',
    ':scope'
}

# Parameterless pseudo-classes that are accepted but can never match in the Soup Sieve environment
PSEUDO_SIMPLE_NO_MATCH = {
    ':active',
    ':current',
    ':focus',
    ':focus-visible',
    ':focus-within',
    ':future',
    ':host',
    ':hover',
    ':local-link',
    ':past',
    ':paused',
    ':playing',
    ':target',
    ':target-within',
    ':user-invalid',
    ':visited'
}

# Pseudo-classes written with parentheses whose argument is a selector list
PSEUDO_COMPLEX = {
    ':-soup-contains',
    ':-soup-contains-own',
    ':contains',
    ':has',
    ':is',
    ':matches',
    ':not',
    ':where'
}

# Parenthesized pseudo-classes that are accepted but can never match
PSEUDO_COMPLEX_NO_MATCH = {
    ':current',
    ':host',
    ':host-context'
}

# Parenthesized pseudo-classes with their own argument grammar; each has a dedicated pattern
PSEUDO_SPECIAL = {
    ':dir',
    ':lang',
    ':nth-child',
    ':nth-last-child',
    ':nth-last-of-type',
    ':nth-of-type'
}

# Every pseudo-class name the parser recognizes, in any form
PSEUDO_SUPPORTED = (
    PSEUDO_SIMPLE |
    PSEUDO_SIMPLE_NO_MATCH |
    PSEUDO_COMPLEX |
    PSEUDO_COMPLEX_NO_MATCH |
    PSEUDO_SPECIAL
)
89
# Sub-patterns parts
#
# The triple-quoted fragments below contain layout newlines, so every pattern built
# from them is compiled in verbose mode (`re.X`).

# Whitespace
NEWLINE = r'(?:\r\n|(?!\r\n)[\n\f\r])'
WS = r'(?:[ \t]|{})'.format(NEWLINE)
# Comments
COMMENTS = r'(?:/\*[^*]*\*+(?:[^/*][^*]*\*+)*/)'
# Whitespace with comments included
WSC = r'(?:{ws}|{comments})'.format(ws=WS, comments=COMMENTS)
# CSS escapes: a backslash followed by 1-6 hex digits (plus one optional whitespace),
# by any non-newline character, or by the end of the input.
CSS_ESCAPES = r'(?:\\(?:[a-f0-9]{{1,6}}{ws}?|[^\r\n\f]|$))'.format(ws=WS)
# Inside strings a backslash may additionally precede a newline (line continuation).
CSS_STRING_ESCAPES = r'(?:\\(?:[a-f0-9]{{1,6}}{ws}?|[^\r\n\f]|$|{nl}))'.format(ws=WS, nl=NEWLINE)
# CSS Identifier
IDENTIFIER = r'''
(?:(?:-?(?:[^\x00-\x2f\x30-\x40\x5B-\x5E\x60\x7B-\x9f]|{esc})+|--)
(?:[^\x00-\x2c\x2e\x2f\x3A-\x40\x5B-\x5E\x60\x7B-\x9f]|{esc})*)
'''.format(esc=CSS_ESCAPES)
# `nth` content
NTH = r'(?:[-+])?(?:[0-9]+n?|n)(?:(?<=n){ws}*(?:[-+]){ws}*(?:[0-9]+))?'.format(ws=WSC)
# Value: quoted string or identifier
VALUE = r'''
(?:"(?:\\(?:.|{nl})|[^\\"\r\n\f]+)*?"|'(?:\\(?:.|{nl})|[^\\'\r\n\f]+)*?'|{ident}+)
'''.format(nl=NEWLINE, ident=IDENTIFIER)
# Attribute value comparison. `!=` is handled special as it is non-standard.
ATTR = r'''
(?:{ws}*(?P<cmp>[!~^|*$]?=){ws}*(?P<value>{value})(?:{ws}+(?P<case>[is]))?)?{ws}*\]
'''.format(ws=WSC, value=VALUE)

# Selector patterns
# IDs (`#id`)
PAT_ID = r'\#{ident}'.format(ident=IDENTIFIER)
# Classes (`.class`)
PAT_CLASS = r'\.{ident}'.format(ident=IDENTIFIER)
# Prefix:Tag (`prefix|tag`)
PAT_TAG = r'(?P<tag_ns>(?:{ident}|\*)?\|)?(?P<tag_name>{ident}|\*)'.format(ident=IDENTIFIER)
# Attributes (`[attr]`, `[attr=value]`, etc.)
PAT_ATTR = r'''
\[{ws}*(?P<attr_ns>(?:{ident}|\*)?\|)?(?P<attr_name>{ident}){attr}
'''.format(ws=WSC, ident=IDENTIFIER, attr=ATTR)
# Pseudo class (`:pseudo-class`, `:pseudo-class(`)
PAT_PSEUDO_CLASS = r'(?P<name>:{ident})(?P<open>\({ws}*)?'.format(ws=WSC, ident=IDENTIFIER)
# Pseudo class special patterns. Matches `:pseudo-class(` for special case pseudo classes.
PAT_PSEUDO_CLASS_SPECIAL = r'(?P<name>:{ident})(?P<open>\({ws}*)'.format(ws=WSC, ident=IDENTIFIER)
# Custom pseudo class (`:--custom-pseudo`)
PAT_PSEUDO_CLASS_CUSTOM = r'(?P<name>:(?=--){ident})'.format(ident=IDENTIFIER)
# Closing pseudo group (`)`)
PAT_PSEUDO_CLOSE = r'{ws}*\)'.format(ws=WSC)
# Pseudo element (`::pseudo-element`)
PAT_PSEUDO_ELEMENT = r':{}'.format(PAT_PSEUDO_CLASS)
# At rule (`@page`, etc.) (not supported)
# The pattern is `@` followed by an identifier. A stray literal `P` used to follow the
# `@`, which restricted the match to at-rules whose name starts with `p`.
PAT_AT_RULE = r'@{ident}'.format(ident=IDENTIFIER)
# Pseudo class `nth-child` (`:nth-child(an+b [of S]?)`, `:first-child`, etc.)
PAT_PSEUDO_NTH_CHILD = r'''
(?P<pseudo_nth_child>{name}
(?P<nth_child>{nth}|even|odd))(?:{wsc}*\)|(?P<of>{comments}*{ws}{wsc}*of{comments}*{ws}{wsc}*))
'''.format(name=PAT_PSEUDO_CLASS_SPECIAL, wsc=WSC, comments=COMMENTS, ws=WS, nth=NTH)
# Pseudo class `nth-of-type` (`:nth-of-type(an+b)`, `:first-of-type`, etc.)
PAT_PSEUDO_NTH_TYPE = r'''
(?P<pseudo_nth_type>{name}
(?P<nth_type>{nth}|even|odd)){ws}*\)
'''.format(name=PAT_PSEUDO_CLASS_SPECIAL, ws=WSC, nth=NTH)
# Pseudo class language (`:lang("*-de", en)`)
PAT_PSEUDO_LANG = r'{name}(?P<values>{value}(?:{ws}*,{ws}*{value})*){ws}*\)'.format(
    name=PAT_PSEUDO_CLASS_SPECIAL, ws=WSC, value=VALUE
)
# Pseudo class direction (`:dir(ltr)`)
PAT_PSEUDO_DIR = r'{name}(?P<dir>ltr|rtl){ws}*\)'.format(name=PAT_PSEUDO_CLASS_SPECIAL, ws=WSC)
# Combining characters (`>`, `~`, ` `, `+`, `,`)
PAT_COMBINE = r'{wsc}*?(?P<relation>[,+>~]|{ws}(?![,+>~])){wsc}*'.format(ws=WS, wsc=WSC)
# Extra: Contains (`:contains(text)`)
PAT_PSEUDO_CONTAINS = r'{name}(?P<values>{value}(?:{ws}*,{ws}*{value})*){ws}*\)'.format(
    name=PAT_PSEUDO_CLASS_SPECIAL, ws=WSC, value=VALUE
)

# Regular expressions
# CSS escape pattern
# Group 1: hex escape (with its optional trailing whitespace), group 2: escaped character,
# group 3: backslash at end of input.
# Only plain whitespace (`WS`) may be absorbed after the hex digits, matching `CSS_ESCAPES`
# and `RE_CSS_STR_ESC`. Absorbing a comment here would put non-hex text into group 1,
# which is later handed to `int(..., 16)`.
RE_CSS_ESC = re.compile(r'(?:(\\[a-f0-9]{{1,6}}{ws}?)|(\\[^\r\n\f])|(\\$))'.format(ws=WS), re.I)
# Same as above with an extra group 4 for an escaped newline inside a string.
RE_CSS_STR_ESC = re.compile(
    r'(?:(\\[a-f0-9]{{1,6}}{ws}?)|(\\[^\r\n\f])|(\\$)|(\\{nl}))'.format(ws=WS, nl=NEWLINE), re.I
)
# Pattern to break up `nth` specifiers
RE_NTH = re.compile(
    r'(?P<s1>[-+])?(?P<a>[0-9]+n?|n)(?:(?<=n){ws}*(?P<s2>[-+]){ws}*(?P<b>[0-9]+))?'.format(ws=WSC),
    re.I
)
# Pattern to iterate multiple values.
RE_VALUES = re.compile(r'(?:(?P<value>{value})|(?P<split>{ws}*,{ws}*))'.format(ws=WSC, value=VALUE), re.X)
# Whitespace checks
RE_WS = re.compile(WS)
RE_WS_BEGIN = re.compile('^{}*'.format(WSC))
RE_WS_END = re.compile('{}*$'.format(WSC))
RE_CUSTOM = re.compile(r'^{}$'.format(PAT_PSEUDO_CLASS_CUSTOM), re.X)
181
# Combinator tokens
# Separates the selectors of a selector list
COMMA_COMBINATOR = ','
# Descendant relation
WS_COMBINATOR = " "

# Parse flags (one bit each, may be OR-ed together)
FLG_PSEUDO = 1 << 0
FLG_NOT = 1 << 1
FLG_RELATIVE = 1 << 2
FLG_DEFAULT = 1 << 3
FLG_HTML = 1 << 4
FLG_INDETERMINATE = 1 << 5
FLG_OPEN = 1 << 6
FLG_IN_RANGE = 1 << 7
FLG_OUT_OF_RANGE = 1 << 8
FLG_PLACEHOLDER_SHOWN = 1 << 9

# Upper bound on the number of compiled patterns kept in the cache
_MAXCACHE = 500
202
203
@lru_cache(maxsize=_MAXCACHE)
def _cached_css_compile(pattern, namespaces, custom, flags):
    """Parse `pattern` and wrap the result in a `cm.SoupSieve`; results are LRU cached per argument set."""

    parser = CSSParser(pattern, custom=process_custom(custom), flags=flags)
    parsed_selectors = parser.process_selectors()
    return cm.SoupSieve(pattern, parsed_selectors, namespaces, custom, flags)
216
217
def _purge_cache():
    """Empty the LRU cache that backs `_cached_css_compile`."""

    clear = _cached_css_compile.cache_clear
    clear()
222
223
def process_custom(custom):
    """
    Validate and normalize a mapping of custom pseudo-class names.

    `custom` is either `None` or a mapping of custom pseudo-class name to its value.
    Each name is lower cased, checked against `RE_CUSTOM`, and stored under its
    CSS-unescaped form. The values are stored untouched.

    Returns a new dictionary (empty when `custom` is `None`).

    Raises `SelectorSyntaxError` for a name that is not a valid custom pseudo-class name
    and `KeyError` when two names normalize to the same key.
    """

    custom_selectors = {}
    if custom is None:
        return custom_selectors

    for key, value in custom.items():
        name = util.lower(key)
        if RE_CUSTOM.match(name) is None:
            raise SelectorSyntaxError("The name '{}' is not a valid custom pseudo-class name".format(name))
        # Entries are keyed by the unescaped name, so the duplicate check must use the
        # unescaped name as well; otherwise names that differ only in how they are escaped
        # would silently overwrite one another.
        unescaped = css_unescape(name)
        if unescaped in custom_selectors:
            raise KeyError("The custom selector '{}' has already been registered".format(name))
        custom_selectors[unescaped] = value
    return custom_selectors
237
238
def css_unescape(content, string=False):
    """
    Unescape CSS value.

    `content` is the text to unescape. When `string` is true the string escape pattern
    (`RE_CSS_STR_ESC`) is used, which additionally removes an escaped new line so a string
    value can span multiple lines; otherwise `RE_CSS_ESC` is used.

    Returns the unescaped text.
    """

    def replace(m):
        """Replace with the appropriate substitute."""

        if m.group(1):
            # Hex escape: drop the backslash; `int` ignores the optional trailing whitespace.
            codepoint = int(m.group(1)[1:], 16)
            # Null, surrogate, and out-of-range code points become U+FFFD. Besides following
            # the CSS escape rules, this keeps `chr` from raising `ValueError` for values
            # above 0x10FFFF, which six hex digits can express.
            if codepoint == 0 or codepoint > 0x10FFFF or 0xD800 <= codepoint <= 0xDFFF:
                codepoint = UNICODE_REPLACEMENT_CHAR
            value = chr(codepoint)
        elif m.group(2):
            # Escaped literal character: keep the character, drop the backslash.
            value = m.group(2)[1:]
        elif m.group(3):
            # Backslash at the very end of the input.
            value = '\ufffd'
        else:
            # Escaped new line (string pattern only): removed entirely.
            value = ''

        return value

    return (RE_CSS_ESC if not string else RE_CSS_STR_ESC).sub(replace, content)
264
265
def escape(ident):
    """
    Escape identifier.

    Returns `ident` rewritten so it can be used as a CSS identifier:

    - a lone `-` is backslash escaped,
    - a null character becomes U+FFFD,
    - control characters (U+0001-U+001F, U+007F) become hex escapes,
    - a digit in first position, or in second position after a leading `-`,
      becomes a hex escape,
    - `-`, `_`, ASCII letters and digits, and anything at or above U+0080 are kept,
    - every other character is backslash escaped.
    """

    if ident == '-':
        # Need to escape identifier that is a single `-` with no other characters
        return '\\-'

    leading_dash = ident.startswith('-')
    pieces = []
    for index, char in enumerate(ident):
        codepoint = ord(char)
        is_digit = 0x30 <= codepoint <= 0x39
        if codepoint == 0x00:
            pieces.append('\ufffd')
        elif (0x01 <= codepoint <= 0x1F) or codepoint == 0x7F:
            pieces.append('\\{:x} '.format(codepoint))
        elif is_digit and (index == 0 or (leading_dash and index == 1)):
            # An identifier cannot start with a digit, nor with `-` followed by a digit.
            pieces.append('\\{:x} '.format(codepoint))
        elif (
            codepoint in (0x2D, 0x5F) or codepoint >= 0x80 or is_digit or
            (0x41 <= codepoint <= 0x5A) or (0x61 <= codepoint <= 0x7A)
        ):
            pieces.append(char)
        else:
            pieces.append('\\{}'.format(char))
    return ''.join(pieces)
292
293
class SelectorPattern(object):
    """A named token matcher: a token name paired with its compiled regular expression."""

    def __init__(self, name, pattern):
        """Store the token name and compile `pattern` (case insensitive, verbose, Unicode)."""

        self.name = name
        compile_flags = re.U | re.X | re.I
        self.re_pattern = re.compile(pattern, compile_flags)

    def get_name(self):
        """Return the token name."""

        return self.name

    def match(self, selector, index, flags):
        """Try to match the token in `selector` starting at `index`; `flags` is not used here."""

        matcher = self.re_pattern.match
        return matcher(selector, index)
312
313
class SpecialPseudoPattern(SelectorPattern):
    """Dispatch a `:name(` token to the pattern registered for that pseudo-class name."""

    def __init__(self, patterns):
        """
        Build the name to pattern lookup.

        Each entry of `patterns` is indexed as `(token name, pseudo-class names, regex, pattern class)`;
        one pattern object is created per entry and shared by all of its pseudo-class names.
        """

        self.patterns = {}
        for entry in patterns:
            token_name = entry[0]
            compiled = entry[3](token_name, entry[2])
            self.patterns.update(dict.fromkeys(entry[1], compiled))

        self.matched_name = None
        self.re_pseudo_name = re.compile(PAT_PSEUDO_CLASS_SPECIAL, re.I | re.X | re.U)

    def get_name(self):
        """Return the token name of the pattern that matched most recently."""

        return self.matched_name.get_name()

    def match(self, selector, index, flags):
        """Match the pattern registered for the pseudo-class found at `index`, if there is one."""

        name_match = self.re_pseudo_name.match(selector, index)
        if not name_match:
            return None

        pseudo_name = util.lower(css_unescape(name_match.group('name')))
        candidate = self.patterns.get(pseudo_name)
        if not candidate:
            return None

        result = candidate.match(selector, index, flags)
        if result:
            # Remember which pattern succeeded so `get_name` can report it.
            self.matched_name = candidate
        return result
349
350
class _Selector(object):
    """
    Mutable scratch object for one compound selector.

    The parser fills this in piece by piece. When the compound selector is complete,
    `freeze` converts the collected data into the corresponding `ct` object.
    """

    def __init__(self, **kwargs):
        """Set every field from `kwargs`, using an empty/neutral default when a field is absent."""

        option = kwargs.get
        self.tag = option('tag', None)
        self.ids = option('ids', [])
        self.classes = option('classes', [])
        self.attributes = option('attributes', [])
        self.nth = option('nth', [])
        self.selectors = option('selectors', [])
        self.relations = option('relations', [])
        self.rel_type = option('rel_type', None)
        self.contains = option('contains', [])
        self.lang = option('lang', [])
        self.flags = option('flags', 0)
        self.no_match = option('no_match', False)

    def _freeze_relations(self, relations):
        """Chain `relations` onto the first entry and freeze it into a selector list."""

        if not relations:
            return ct.SelectorList()

        head = relations[0]
        # The remaining relations hang off the first one.
        head.relations.extend(relations[1:])
        return ct.SelectorList([head.freeze()])

    def freeze(self):
        """Return the frozen form of this selector (`ct.SelectorNull` when it can never match)."""

        if self.no_match:
            return ct.SelectorNull()

        return ct.Selector(
            self.tag,
            tuple(self.ids),
            tuple(self.classes),
            tuple(self.attributes),
            tuple(self.nth),
            tuple(self.selectors),
            self._freeze_relations(self.relations),
            self.rel_type,
            tuple(self.contains),
            tuple(self.lang),
            self.flags
        )

    def __str__(self):  # pragma: no cover
        """Return a debugging representation listing every field."""

        fields = (
            ('tag', self.tag),
            ('ids', self.ids),
            ('classes', self.classes),
            ('attributes', self.attributes),
            ('nth', self.nth),
            ('selectors', self.selectors),
            ('relations', self.relations),
            ('rel_type', self.rel_type),
            ('contains', self.contains),
            ('lang', self.lang),
            ('flags', self.flags),
            ('no_match', self.no_match)
        )
        rendered = ', '.join('{}={!r}'.format(label, value) for label, value in fields)
        return '_Selector({})'.format(rendered)

    __repr__ = __str__
418
419
class CSSParser(object):
    """Parse CSS selectors."""

    # Token matchers. `selector_iter` tries them in this order at the current index and
    # yields the first one that matches, so the order of the entries matters.
    css_tokens = (
        SelectorPattern("pseudo_close", PAT_PSEUDO_CLOSE),
        # Each entry is (token name, pseudo-class names, regex, pattern class).
        SpecialPseudoPattern(
            (
                (
                    "pseudo_contains",
                    (':contains', ':-soup-contains', ':-soup-contains-own'),
                    PAT_PSEUDO_CONTAINS,
                    SelectorPattern
                ),
                ("pseudo_nth_child", (':nth-child', ':nth-last-child'), PAT_PSEUDO_NTH_CHILD, SelectorPattern),
                ("pseudo_nth_type", (':nth-of-type', ':nth-last-of-type'), PAT_PSEUDO_NTH_TYPE, SelectorPattern),
                ("pseudo_lang", (':lang',), PAT_PSEUDO_LANG, SelectorPattern),
                ("pseudo_dir", (':dir',), PAT_PSEUDO_DIR, SelectorPattern)
            )
        ),
        SelectorPattern("pseudo_class_custom", PAT_PSEUDO_CLASS_CUSTOM),
        SelectorPattern("pseudo_class", PAT_PSEUDO_CLASS),
        SelectorPattern("pseudo_element", PAT_PSEUDO_ELEMENT),
        SelectorPattern("at_rule", PAT_AT_RULE),
        SelectorPattern("id", PAT_ID),
        SelectorPattern("class", PAT_CLASS),
        SelectorPattern("tag", PAT_TAG),
        SelectorPattern("attribute", PAT_ATTR),
        SelectorPattern("combine", PAT_COMBINE)
    )
449
def __init__(self, selector, custom=None, flags=0):
    """Keep the selector text (null characters replaced by U+FFFD), the flags, and the custom selectors."""

    self.pattern = '\ufffd'.join(selector.split('\x00'))
    self.flags = flags
    self.debug = flags & util.DEBUG
    if custom is None:
        custom = {}
    self.custom = custom
457
def parse_attribute_selector(self, sel, m, has_selector):
    """
    Turn an attribute token match into a `ct.SelectorAttribute` stored on `sel`.

    The non-standard `!=` comparison is stored as a nested negated selector list;
    every other form is appended to `sel.attributes`. Always returns `True`.
    """

    op = m.group('cmp')
    raw_case = m.group('case')
    case = util.lower(raw_case) if raw_case else None
    raw_ns = m.group('attr_ns')
    # The captured namespace ends with `|`, which is dropped.
    ns = css_unescape(raw_ns[:-1]) if raw_ns else ''
    attr = css_unescape(m.group('attr_name'))
    inverse = False
    is_type = False
    pattern2 = None

    # An explicit case flag wins; without one, `type` is compared case insensitively.
    if case:
        flags = re.I if case == 'i' else 0
    elif util.lower(attr) == 'type':
        flags, is_type = re.I, True
    else:
        flags = 0

    value = None
    if op:
        raw_value = m.group('value')
        if raw_value.startswith(('"', "'")):
            # Quoted: strip the quotes and apply string unescaping.
            value = css_unescape(raw_value[1:-1], True)
        else:
            value = css_unescape(raw_value)

    # `op` is either empty or ends in `=`, so its first character identifies the comparison.
    kind = op[0] if op else None
    if kind is None:
        # Attribute name
        pattern = None
    elif kind == '^':
        # Value start with
        pattern = re.compile(r'^%s.*' % re.escape(value), flags)
    elif kind == '$':
        # Value ends with
        pattern = re.compile(r'.*?%s$' % re.escape(value), flags)
    elif kind == '*':
        # Value contains
        pattern = re.compile(r'.*?%s.*' % re.escape(value), flags)
    elif kind == '~':
        # Value contains word within space separated list
        # `~=` should match nothing if it is empty or contains whitespace,
        # so if either of these cases is present, use `[^\s\S]` which cannot be matched.
        value = r'[^\s\S]' if not value or RE_WS.search(value) else re.escape(value)
        pattern = re.compile(r'.*?(?:(?<=^)|(?<=[ \t\r\n\f]))%s(?=(?:[ \t\r\n\f]|$)).*' % value, flags)
    elif kind == '|':
        # Value starts with word in dash separated list
        pattern = re.compile(r'^%s(?:-.*)?$' % re.escape(value), flags)
    else:
        # Value matches
        pattern = re.compile(r'^%s$' % re.escape(value), flags)
        # `!=` is equivalent to `:not([attr=value])`
        inverse = kind == '!'

    if is_type and pattern:
        # Second copy of the pattern compiled without any flags.
        pattern2 = re.compile(pattern.pattern)

    sel_attr = ct.SelectorAttribute(attr, ns, pattern, pattern2)
    if inverse:
        # If we are using `!=`, we need to nest the pattern under a `:not()`.
        sub_sel = _Selector()
        sub_sel.attributes.append(sel_attr)
        sel.selectors.append(ct.SelectorList([sub_sel.freeze()], True, False))
    else:
        sel.attributes.append(sel_attr)

    return True
527
def parse_tag_pattern(self, sel, m, has_selector):
    """Set `sel.tag` from a tag token match (name plus optional namespace prefix); returns `True`."""

    raw_prefix = m.group('tag_ns')
    if raw_prefix:
        # The captured prefix ends with `|`, which is dropped.
        prefix = css_unescape(raw_prefix[:-1])
    else:
        prefix = None
    tag_name = css_unescape(m.group('tag_name'))
    sel.tag = ct.SelectorTag(tag_name, prefix)
    return True
536
def parse_pseudo_class_custom(self, sel, m, has_selector):
    """
    Parse custom pseudo class alias.

    Compile custom selectors as we need them. When compiling a custom selector,
    set it to `None` in the dictionary so we can avoid an infinite loop.

    Appends the compiled selector list to `sel.selectors` and returns `True`.
    Raises `SelectorSyntaxError` when the name is not registered (or is currently
    being compiled, which is how a self-referencing custom selector surfaces).
    """

    pseudo = util.lower(css_unescape(m.group('name')))
    selector = self.custom.get(pseudo)
    if selector is None:
        raise SelectorSyntaxError(
            "Undefined custom selector '{}' found at position {}".format(pseudo, m.end(0)),
            self.pattern,
            m.end(0)
        )

    if not isinstance(selector, ct.SelectorList):
        # Not compiled yet: mark as in progress, compile, then store the compiled form
        # so later references reuse it.
        self.custom[pseudo] = None
        selector = CSSParser(
            selector, custom=self.custom, flags=self.flags
        ).process_selectors(flags=FLG_PSEUDO)
        self.custom[pseudo] = selector

    sel.selectors.append(selector)
    has_selector = True
    return has_selector
564
def parse_pseudo_class(self, sel, m, has_selector, iselector, is_html):
    """
    Handle a generic pseudo-class token and record it on `sel`.

    Returns the tuple `(has_selector, is_html)`. Raises `SelectorSyntaxError` when a
    known pseudo-class is used in the wrong form (with or without parentheses) and
    `NotImplementedError` for an unknown pseudo-class.
    """

    pseudo = util.lower(css_unescape(m.group('name')))
    complex_pseudo = bool(m.group('open'))

    if complex_pseudo:
        if pseudo in PSEUDO_COMPLEX:
            has_selector = self.parse_pseudo_open(sel, pseudo, has_selector, iselector, m.end(0))
            return has_selector, is_html
        if pseudo in PSEUDO_COMPLEX_NO_MATCH:
            # The argument is still parsed (and validated), but its result is discarded.
            self.parse_selectors(iselector, m.end(0), FLG_PSEUDO | FLG_OPEN)
            sel.no_match = True
            return True, is_html
    else:
        if pseudo in PSEUDO_SIMPLE:
            # (of-type, last) argument pairs for the positional pseudo-classes.
            nth_forms = {
                ':first-child': ((False, False),),
                ':last-child': ((False, True),),
                ':first-of-type': ((True, False),),
                ':last-of-type': ((True, True),),
                ':only-child': ((False, False), (False, True)),
                ':only-of-type': ((True, False), (True, True))
            }
            if pseudo in nth_forms:
                for of_type, last in nth_forms[pseudo]:
                    sel.nth.append(ct.SelectorNth(1, False, 0, of_type, last, ct.SelectorList()))
            elif pseudo == ':root':
                sel.flags |= ct.SEL_ROOT
            elif pseudo == ':defined':
                sel.flags |= ct.SEL_DEFINED
                is_html = True
            elif pseudo == ':scope':
                sel.flags |= ct.SEL_SCOPE
            elif pseudo == ':empty':
                sel.flags |= ct.SEL_EMPTY
            elif pseudo in (':link', ':any-link'):
                sel.selectors.append(CSS_LINK)
            elif pseudo == ':checked':
                sel.selectors.append(CSS_CHECKED)
            elif pseudo == ':default':
                sel.selectors.append(CSS_DEFAULT)
            elif pseudo == ':indeterminate':
                sel.selectors.append(CSS_INDETERMINATE)
            elif pseudo == ":disabled":
                sel.selectors.append(CSS_DISABLED)
            elif pseudo == ":enabled":
                sel.selectors.append(CSS_ENABLED)
            elif pseudo == ":required":
                sel.selectors.append(CSS_REQUIRED)
            elif pseudo == ":optional":
                sel.selectors.append(CSS_OPTIONAL)
            elif pseudo == ":read-only":
                sel.selectors.append(CSS_READ_ONLY)
            elif pseudo == ":read-write":
                sel.selectors.append(CSS_READ_WRITE)
            elif pseudo == ":in-range":
                sel.selectors.append(CSS_IN_RANGE)
            elif pseudo == ":out-of-range":
                sel.selectors.append(CSS_OUT_OF_RANGE)
            elif pseudo == ":placeholder-shown":
                sel.selectors.append(CSS_PLACEHOLDER_SHOWN)
            return True, is_html
        if pseudo in PSEUDO_SIMPLE_NO_MATCH:
            sel.no_match = True
            return True, is_html

    if pseudo in PSEUDO_SUPPORTED:
        # Known name, but written in the wrong form.
        raise SelectorSyntaxError(
            "Invalid syntax for pseudo class '{}'".format(pseudo),
            self.pattern,
            m.start(0)
        )

    raise NotImplementedError(
        "'{}' pseudo-class is not implemented at this time".format(pseudo)
    )
652
def parse_pseudo_nth(self, sel, m, has_selector, iselector):
    """
    Convert an `:nth-*()` token match into a `ct.SelectorNth` appended to `sel.nth`.

    The argument (`even`, `odd`, or an `an+b` style expression) is reduced to two integers
    and a flag telling whether `n` was present. For the child variants an `of S` clause,
    when present, is parsed from `iselector`. Always returns `True`.
    """

    groups = m.groupdict()
    child_form = bool(groups.get('pseudo_nth_child'))
    pseudo_sel = util.lower(css_unescape(groups['name']))
    content = util.lower(groups.get('nth_child' if child_form else 'nth_type'))

    if content in ('even', 'odd'):
        # `even` is 2n, `odd` is 2n+1
        var = True
        s1 = 2
        s2 = 1 if content == 'odd' else 0
    else:
        parts = RE_NTH.match(content)
        a = parts.group('a')
        var = a.endswith('n')
        if a.startswith('n'):
            # A bare `n` has an implied coefficient of 1.
            magnitude = '1'
        elif var:
            magnitude = a[:-1]
        else:
            magnitude = a
        sign_a = '-' if parts.group('s1') == '-' else ''
        b = parts.group('b')
        if b:
            sign_b = '-' if parts.group('s2') == '-' else ''
            offset = sign_b + b
        else:
            offset = '0'
        s1 = int(sign_a + magnitude, 10)
        s2 = int(offset, 10)

    if child_form:
        if m.group('of'):
            # Parse the rest of `of S`.
            nth_sel = self.parse_selectors(iselector, m.end(0), FLG_PSEUDO | FLG_OPEN)
        else:
            # Use default `*|*` for `of S`.
            nth_sel = CSS_NTH_OF_S_DEFAULT
        if pseudo_sel in (':nth-child', ':nth-last-child'):
            from_end = pseudo_sel == ':nth-last-child'
            sel.nth.append(ct.SelectorNth(s1, var, s2, False, from_end, nth_sel))
    elif pseudo_sel in (':nth-of-type', ':nth-last-of-type'):
        from_end = pseudo_sel == ':nth-last-of-type'
        sel.nth.append(ct.SelectorNth(s1, var, s2, True, from_end, ct.SelectorList()))

    return True
711
def parse_pseudo_open(self, sel, name, has_selector, iselector, index):
    """Parse the selector list inside `name(` and append it to `sel.selectors`; returns `True`."""

    extra = 0
    if name == ':not':
        extra = FLG_NOT
    elif name == ':has':
        extra = FLG_RELATIVE

    nested = self.parse_selectors(iselector, index, FLG_PSEUDO | FLG_OPEN | extra)
    sel.selectors.append(nested)
    return True
724
def parse_has_combinator(self, sel, m, has_selector, selectors, rel_type, index):
    """
    Parse combinator tokens inside a relative selector list (`:has()`).

    In this form the combinator leads the selector it applies to, so `rel_type` carries
    the pending combinator (prefixed with `:`) for the selector being collected.

    Returns `(has_selector, sel, rel_type)`: `has_selector` is reset to `False`, `sel` is a
    fresh `_Selector`, and `rel_type` is the combinator for the next selector.
    Raises `SelectorSyntaxError` for a comma without a preceding selector or for
    consecutive non-whitespace combinators.
    """

    combinator = m.group('relation').strip()
    if not combinator:
        # Only whitespace was matched: descendant combinator.
        combinator = WS_COMBINATOR
    if combinator == COMMA_COMBINATOR:
        if not has_selector:
            # If we've not captured any selector parts, the comma is either at the beginning of the pattern
            # or following another comma, both of which are unexpected. Commas must split selectors.
            raise SelectorSyntaxError(
                "The combinator '{}' at position {}, must have a selector before it".format(combinator, index),
                self.pattern,
                index
            )
        # Close the current selector and start a new entry in the list.
        sel.rel_type = rel_type
        selectors[-1].relations.append(sel)
        rel_type = ":" + WS_COMBINATOR
        selectors.append(_Selector())
    else:
        if has_selector:
            # End the current selector and associate the leading combinator with this selector.
            sel.rel_type = rel_type
            selectors[-1].relations.append(sel)
        elif rel_type[1:] != WS_COMBINATOR:
            # It's impossible to have two whitespace combinators after each other as the patterns
            # will gobble up trailing whitespace. It is also impossible to have a whitespace
            # combinator after any other kind for the same reason. But we could have
            # multiple non-whitespace combinators. So if the current combinator is not a whitespace,
            # then we've hit the multiple combinator case, so we should fail.
            raise SelectorSyntaxError(
                'The multiple combinators at position {}'.format(index),
                self.pattern,
                index
            )
        # Set the leading combinator for the next selector.
        rel_type = ':' + combinator
    sel = _Selector()

    has_selector = False
    return has_selector, sel, rel_type
766
def parse_combinator(self, sel, m, has_selector, selectors, relations, is_pseudo, index):
    """
    Parse combinator tokens in a regular (non-relative) selector list.

    A comma finishes the current complex selector and appends it to `selectors`; any
    other combinator records `sel` in `relations` as the left-hand side of what follows.

    Returns `(has_selector, sel)` with `has_selector` reset to `False` and `sel` replaced
    by a fresh `_Selector`. Raises `SelectorSyntaxError` when no selector precedes the
    combinator.
    """

    combinator = m.group('relation').strip()
    if not combinator:
        # Only whitespace was matched: descendant combinator.
        combinator = WS_COMBINATOR
    if not has_selector:
        raise SelectorSyntaxError(
            "The combinator '{}' at position {}, must have a selector before it".format(combinator, index),
            self.pattern,
            index
        )

    if combinator == COMMA_COMBINATOR:
        if not sel.tag and not is_pseudo:
            # Implied `*`
            sel.tag = ct.SelectorTag('*', None)
        sel.relations.extend(relations)
        selectors.append(sel)
        del relations[:]
    else:
        sel.relations.extend(relations)
        sel.rel_type = combinator
        # `relations` is emptied in place (the caller holds the same list) and then
        # holds only the selector that was just completed.
        del relations[:]
        relations.append(sel)
    sel = _Selector()

    has_selector = False
    return has_selector, sel
796
def parse_class_id(self, sel, m, has_selector):
    """Record a `.class` or `#id` token on `sel` (leading character stripped, value unescaped); returns `True`."""

    token = m.group(0)
    name = css_unescape(token[1:])
    target = sel.classes if token.startswith('.') else sel.ids
    target.append(name)
    return True
807
def parse_pseudo_contains(self, sel, m, has_selector):
    """
    Parse contains.

    Collects the comma separated values of `:-soup-contains()`, `:-soup-contains-own()`
    or the deprecated `:contains()` into a `ct.SelectorContains` appended to
    `sel.contains`. Using `:contains` emits a `FutureWarning`. Always returns `True`.
    """

    pseudo = util.lower(css_unescape(m.group('name')))
    if pseudo == ":contains":
        warnings.warn(
            "The pseudo class ':contains' is deprecated, ':-soup-contains' should be used moving forward.",
            FutureWarning
        )
    contains_own = pseudo == ":-soup-contains-own"
    # Tokenize the raw text and unescape each value exactly once below (as is done for
    # `:lang()`). Unescaping the whole list first would unescape every value twice and
    # would let escaped quotes, whitespace, or commas be misread as value delimiters.
    values = m.group('values')
    patterns = []
    for token in RE_VALUES.finditer(values):
        if token.group('split'):
            continue
        value = token.group('value')
        if value.startswith(("'", '"')):
            # Quoted: strip the quotes and apply string unescaping.
            value = css_unescape(value[1:-1], True)
        else:
            value = css_unescape(value)
        patterns.append(value)
    sel.contains.append(ct.SelectorContains(tuple(patterns), contains_own))
    has_selector = True
    return has_selector
832
def parse_pseudo_lang(self, sel, m, has_selector):
    """Collect the values of `:lang()` into a `ct.SelectorLang` appended to `sel.lang`; returns `True`."""

    def unescape_value(raw):
        """Unescape one value, stripping the surrounding quotes when it is quoted."""

        if raw.startswith(('"', "'")):
            return css_unescape(raw[1:-1], True)
        return css_unescape(raw)

    # Separator tokens (the commas) are skipped; only value tokens are kept.
    patterns = [
        unescape_value(token.group('value'))
        for token in RE_VALUES.finditer(m.group('values'))
        if not token.group('split')
    ]

    sel.lang.append(ct.SelectorLang(patterns))
    return True
853
def parse_pseudo_dir(self, sel, m, has_selector):
    """Add the direction flag for `:dir(ltr)` / `:dir(rtl)` to `sel.flags`; returns `True`."""

    if util.lower(m.group('dir')) == 'ltr':
        direction = ct.SEL_DIR_LTR
    else:
        direction = ct.SEL_DIR_RTL
    sel.flags |= direction
    return True
861
def parse_selectors(self, iselector, index=0, flags=0):
    """
    Parse selectors.

    Consumes `(token name, match)` pairs from `iselector` and assembles them into a
    `ct.SelectorList`. `index` is the position used in error messages before the first
    token is read. `flags` is a combination of the `FLG_*` parse flags; with `FLG_OPEN`
    the call parses the inside of a parenthesized pseudo-class and stops at the closing
    parenthesis, and with `FLG_RELATIVE` combinators are handled in their leading form.

    Raises `SelectorSyntaxError` for malformed input and `NotImplementedError` for
    at-rules and pseudo-elements.
    """

    sel = _Selector()
    selectors = []
    has_selector = False
    closed = False
    relations = []
    rel_type = ":" + WS_COMBINATOR
    is_open = bool(flags & FLG_OPEN)
    is_pseudo = bool(flags & FLG_PSEUDO)
    is_relative = bool(flags & FLG_RELATIVE)
    is_not = bool(flags & FLG_NOT)
    is_html = bool(flags & FLG_HTML)
    is_default = bool(flags & FLG_DEFAULT)
    is_indeterminate = bool(flags & FLG_INDETERMINATE)
    is_in_range = bool(flags & FLG_IN_RANGE)
    is_out_of_range = bool(flags & FLG_OUT_OF_RANGE)
    is_placeholder_shown = bool(flags & FLG_PLACEHOLDER_SHOWN)

    if self.debug:  # pragma: no cover
        if is_pseudo:
            print(' is_pseudo: True')
        if is_open:
            print(' is_open: True')
        if is_relative:
            print(' is_relative: True')
        if is_not:
            print(' is_not: True')
        if is_html:
            print(' is_html: True')
        if is_default:
            print(' is_default: True')
        if is_indeterminate:
            print(' is_indeterminate: True')
        if is_in_range:
            print(' is_in_range: True')
        if is_out_of_range:
            print(' is_out_of_range: True')
        if is_placeholder_shown:
            print(' is_placeholder_shown: True')

    if is_relative:
        # Relative selectors hang their parts off a leading placeholder selector.
        selectors.append(_Selector())

    try:
        while True:
            key, m = next(iselector)

            # Handle parts
            if key == "at_rule":
                raise NotImplementedError("At-rules found at position {}".format(m.start(0)))
            elif key == 'pseudo_class_custom':
                has_selector = self.parse_pseudo_class_custom(sel, m, has_selector)
            elif key == 'pseudo_class':
                has_selector, is_html = self.parse_pseudo_class(sel, m, has_selector, iselector, is_html)
            elif key == 'pseudo_element':
                raise NotImplementedError("Pseudo-element found at position {}".format(m.start(0)))
            elif key == 'pseudo_contains':
                has_selector = self.parse_pseudo_contains(sel, m, has_selector)
            elif key in ('pseudo_nth_type', 'pseudo_nth_child'):
                has_selector = self.parse_pseudo_nth(sel, m, has_selector, iselector)
            elif key == 'pseudo_lang':
                has_selector = self.parse_pseudo_lang(sel, m, has_selector)
            elif key == 'pseudo_dir':
                has_selector = self.parse_pseudo_dir(sel, m, has_selector)
                # Currently only supports HTML
                is_html = True
            elif key == 'pseudo_close':
                if not has_selector:
                    raise SelectorSyntaxError(
                        "Expected a selector at position {}".format(m.start(0)),
                        self.pattern,
                        m.start(0)
                    )
                if is_open:
                    closed = True
                    break
                else:
                    raise SelectorSyntaxError(
                        "Unmatched pseudo-class close at position {}".format(m.start(0)),
                        self.pattern,
                        m.start(0)
                    )
            elif key == 'combine':
                if is_relative:
                    has_selector, sel, rel_type = self.parse_has_combinator(
                        sel, m, has_selector, selectors, rel_type, index
                    )
                else:
                    has_selector, sel = self.parse_combinator(
                        sel, m, has_selector, selectors, relations, is_pseudo, index
                    )
            elif key == 'attribute':
                has_selector = self.parse_attribute_selector(sel, m, has_selector)
            elif key == 'tag':
                if has_selector:
                    raise SelectorSyntaxError(
                        "Tag name found at position {} instead of at the start".format(m.start(0)),
                        self.pattern,
                        m.start(0)
                    )
                has_selector = self.parse_tag_pattern(sel, m, has_selector)
            elif key in ('class', 'id'):
                has_selector = self.parse_class_id(sel, m, has_selector)

            # Remember where the last handled token ended for later error messages.
            index = m.end(0)
    except StopIteration:
        # The token stream is exhausted.
        pass

    if is_open and not closed:
        raise SelectorSyntaxError(
            "Unclosed pseudo-class at position {}".format(index),
            self.pattern,
            index
        )

    if has_selector:
        if not sel.tag and not is_pseudo:
            # Implied `*`
            sel.tag = ct.SelectorTag('*', None)
        if is_relative:
            sel.rel_type = rel_type
            selectors[-1].relations.append(sel)
        else:
            sel.relations.extend(relations)
            del relations[:]
            selectors.append(sel)
    else:
        # We will always need to finish a selector when `:has()` is used as it leads with combining.
        raise SelectorSyntaxError(
            'Expected a selector at position {}'.format(index),
            self.pattern,
            index
        )

    # Some patterns require additional logic, such as default. We try to make these the
    # last pattern, and append the appropriate flag to that selector which communicates
    # to the matcher what additional logic is required.
    # NOTE(review): these assignments replace, rather than OR into, the flags already
    # collected on the last selector -- confirm that is intended.
    if is_default:
        selectors[-1].flags = ct.SEL_DEFAULT
    if is_indeterminate:
        selectors[-1].flags = ct.SEL_INDETERMINATE
    if is_in_range:
        selectors[-1].flags = ct.SEL_IN_RANGE
    if is_out_of_range:
        selectors[-1].flags = ct.SEL_OUT_OF_RANGE
    if is_placeholder_shown:
        selectors[-1].flags = ct.SEL_PLACEHOLDER_SHOWN

    return ct.SelectorList([s.freeze() for s in selectors], is_not, is_html)
1013
def selector_iter(self, pattern):
    """
    Tokenize `pattern`, yielding one `(token name, match)` pair per token.

    Whitespace and comments at the start and end of the pattern are excluded
    from the scanned range. At every position the entries of `self.css_tokens`
    are tried in order; the first one that matches is yielded and scanning
    resumes where that match ended.

    Raises `SelectorSyntaxError` when nothing matches at the current position.
    """

    # Establish the inclusive `index`..`end` span, trimming surrounding whitespace and comments.
    leading = RE_WS_BEGIN.search(pattern)
    trailing = RE_WS_END.search(pattern)
    index = leading.end(0) if leading else 0
    end = (trailing.start(0) - 1) if trailing else (len(pattern) - 1)

    if self.debug:  # pragma: no cover
        print('## PARSING: {!r}'.format(pattern))

    while index <= end:
        match = None
        for token in self.css_tokens:
            match = token.match(pattern, index, self.flags)
            if not match:
                continue
            name = token.get_name()
            if self.debug:  # pragma: no cover
                print("TOKEN: '{}' --> {!r} at position {}".format(name, match.group(0), match.start(0)))
            # Advance before yielding so the scan resumes after this token.
            index = match.end(0)
            yield name, match
            break

        if match is None:
            # Nothing matched here. If the character opens a known selector type, blame
            # that selector type; otherwise report the character itself.
            char = pattern[index]
            malformed = {
                '[': "Malformed attribute selector at position {}",
                '.': "Malformed class selector at position {}",
                '#': "Malformed id selector at position {}",
                ':': "Malformed pseudo-class selector at position {}"
            }
            template = malformed.get(char)
            if template is None:
                msg = "Invalid character {!r} position {}".format(char, index)
            else:
                msg = template.format(index)
            # The error carries `self.pattern`, not the `pattern` argument.
            raise SelectorSyntaxError(msg, self.pattern, index)

    if self.debug:  # pragma: no cover
        print('## END PARSING')
1054
def process_selectors(self, index=0, flags=0):
    """
    Tokenize the stored pattern and parse the tokens.

    `index` and `flags` are passed through unchanged to `parse_selectors`,
    whose result is returned.
    """

    tokens = self.selector_iter(self.pattern)
    return self.parse_selectors(tokens, index, flags)
1059
1060
1061 # Precompile CSS selector lists for pseudo-classes (additional logic may be required beyond the pattern)
# A few patterns are order dependent as they use patterns previously compiled.
1063
# CSS pattern for `:link` and `:any-link`:
# an `a` or `area` element in the `html` namespace that carries an `href` attribute.
CSS_LINK = CSSParser(
    'html|*:is(a, area)[href]'
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:checked`:
# a checkbox or radio `input` with the `checked` attribute, or an `option` with `selected`.
CSS_CHECKED = CSSParser(
    '''
    html|*:is(input[type=checkbox], input[type=radio])[checked], html|option[selected]
    '''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:default` (must compile CSS_CHECKED first)
# The list is anything `:checked` matches, followed by `button`/`input` elements with
# `type="submit"` that descend from a `form`. The submit selector has to stay last: the
# in-pattern comment notes that special logic is applied to the last selector.
CSS_DEFAULT = CSSParser(
    '''
    :checked,

    /*
    This pattern must be at the end.
    Special logic is applied to the last selector.
    */
    html|form html|*:is(button, input)[type="submit"]
    '''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML | FLG_DEFAULT)
# CSS pattern for `:indeterminate`
# Selectors, in order:
#   1. A checkbox `input` with the `indeterminate` attribute.
#   2. An unchecked radio `input` whose `name` is missing or empty.
#   3. A `progress` element without a `value` attribute.
#   4. An unchecked radio `input` with a non-empty `name`. This one has to stay last: the
#      in-pattern comment notes that special logic is applied to the last selector.
CSS_INDETERMINATE = CSSParser(
    '''
    html|input[type="checkbox"][indeterminate],
    html|input[type="radio"]:is(:not([name]), [name=""]):not([checked]),
    html|progress:not([value]),

    /*
    This pattern must be at the end.
    Special logic is applied to the last selector.
    */
    html|input[type="radio"][name]:not([name='']):not([checked])
    '''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML | FLG_INDETERMINATE)
# CSS pattern for `:disabled`
# One selector per case:
#   1. A form control, `fieldset`, `optgroup`, or `option` that carries `disabled` itself.
#   2. An `option` that is a child of a disabled `optgroup`.
#   3. A form control that is a direct child of a disabled `fieldset`.
#   4. A form control nested under a child of a disabled `fieldset`, unless that child is
#      the first `legend` among its siblings.
# `fieldset` was previously listed twice in the first `:is(...)`; the redundant entry is
# dropped, which does not change what the selector matches.
CSS_DISABLED = CSSParser(
    '''
    html|*:is(input:not([type=hidden]), button, select, textarea, fieldset, optgroup, option)[disabled],
    html|optgroup[disabled] > html|option,
    html|fieldset[disabled] > html|*:is(input:not([type=hidden]), button, select, textarea, fieldset),
    html|fieldset[disabled] >
        html|*:not(legend:nth-of-type(1)) html|*:is(input:not([type=hidden]), button, select, textarea, fieldset)
    '''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:enabled`:
# a form control, `fieldset`, `optgroup`, or `option` that does not match `:disabled`.
# NOTE(review): uses `:disabled`, so presumably CSS_DISABLED must be compiled first -- confirm.
# `fieldset` was previously listed twice in the `:is(...)`; the redundant entry is dropped,
# which does not change what the selector matches.
CSS_ENABLED = CSSParser(
    '''
    html|*:is(input:not([type=hidden]), button, select, textarea, fieldset, optgroup, option):not(:disabled)
    '''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:required`:
# an `input`, `textarea`, or `select` that has the `required` attribute.
CSS_REQUIRED = CSSParser(
    'html|*:is(input, textarea, select)[required]'
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:optional`:
# an `input`, `textarea`, or `select` that does not have the `required` attribute.
CSS_OPTIONAL = CSSParser(
    'html|*:is(input, textarea, select):not([required])'
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:placeholder-shown`
# Matches either:
#   - an `input` whose `type` is absent, empty, or one of the listed text-like types, with a
#     non-empty `placeholder` and a `value` that is absent or empty; or
#   - a `textarea` with a non-empty `placeholder`.
# Compiled with `FLG_PLACEHOLDER_SHOWN` in addition to the usual flags.
CSS_PLACEHOLDER_SHOWN = CSSParser(
    '''
    html|input:is(
        :not([type]),
        [type=""],
        [type=text],
        [type=search],
        [type=url],
        [type=tel],
        [type=email],
        [type=password],
        [type=number]
    )[placeholder]:not([placeholder='']):is(:not([value]), [value=""]),
    html|textarea[placeholder]:not([placeholder=''])
    '''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML | FLG_PLACEHOLDER_SHOWN)
# CSS pattern default for `:nth-child` "of S" feature:
# `*|*` is any element name in any namespace. Unlike the neighboring patterns, this one is
# compiled with `FLG_PSEUDO` only (no `FLG_HTML`).
CSS_NTH_OF_S_DEFAULT = CSSParser(
    '*|*'
).process_selectors(flags=FLG_PSEUDO)
# CSS pattern for `:read-write` (CSS_DISABLED must be compiled first)
# Matches either:
#   - a `textarea`, or an `input` whose `type` is absent, empty, or one of the listed
#     editable types, provided it has no `readonly` attribute and does not match `:disabled`; or
#   - any element whose `contenteditable` is empty or `true` (the `i` makes `true`
#     case-insensitive).
CSS_READ_WRITE = CSSParser(
    '''
    html|*:is(
        textarea,
        input:is(
            :not([type]),
            [type=""],
            [type=text],
            [type=search],
            [type=url],
            [type=tel],
            [type=email],
            [type=number],
            [type=password],
            [type=date],
            [type=datetime-local],
            [type=month],
            [type=time],
            [type=week]
        )
    ):not([readonly], :disabled),
    html|*:is([contenteditable=""], [contenteditable="true" i])
    '''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:read-only`:
# any element in the `html` namespace that does not match `:read-write`.
# NOTE(review): uses `:read-write`, so presumably CSS_READ_WRITE must be compiled first -- confirm.
CSS_READ_ONLY = CSSParser(
    '''
    html|*:not(:read-write)
    '''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:in-range`:
# an `input` of one of the listed date/time/numeric types that has a `min` or `max` attribute.
# The selector text is identical to the one used for `:out-of-range`; only the flag
# (`FLG_IN_RANGE`) differs.
CSS_IN_RANGE = CSSParser(
    '''
    html|input:is(
        [type="date"],
        [type="month"],
        [type="week"],
        [type="time"],
        [type="datetime-local"],
        [type="number"],
        [type="range"]
    ):is(
        [min],
        [max]
    )
    '''
).process_selectors(flags=FLG_PSEUDO | FLG_IN_RANGE | FLG_HTML)
# CSS pattern for `:out-of-range`:
# an `input` of one of the listed date/time/numeric types that has a `min` or `max` attribute.
# The selector text is identical to the one used for `:in-range`; only the flag
# (`FLG_OUT_OF_RANGE`) differs.
CSS_OUT_OF_RANGE = CSSParser(
    '''
    html|input:is(
        [type="date"],
        [type="month"],
        [type="week"],
        [type="time"],
        [type="datetime-local"],
        [type="number"],
        [type="range"]
    ):is(
        [min],
        [max]
    )
    '''
).process_selectors(flags=FLG_PSEUDO | FLG_OUT_OF_RANGE | FLG_HTML)