Skip to content

Commit 3d5c7f0

Browse files
committed
fixup! Move token normalization to the tokenizer
1 parent 2af27a1 commit 3d5c7f0

File tree

2 files changed

+47
-83
lines changed

2 files changed

+47
-83
lines changed

html5lib/_tokenizer.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ class HTMLTokenizer(object):
3636
* self.stream
3737
Points to HTMLInputStream object.
3838
"""
39-
39+
4040
def __init__(self, stream, parser=None, **kwargs):
4141

4242
self.stream = HTMLInputStream(stream, **kwargs)

html5lib/html5parser.py

+46-82
Original file line numberDiff line numberDiff line change
@@ -426,10 +426,13 @@ def getMetaclass(use_metaclass, metaclass_func):
426426
class Phase(with_metaclass(getMetaclass(debug, log))):
427427
"""Base class for helper object that implements each phase of processing
428428
"""
429+
__slots__ = ("parser", "tree", "__startTagCache", "__endTagCache")
429430

430431
def __init__(self, parser, tree):
431432
self.parser = parser
432433
self.tree = tree
434+
self.__startTagCache = {}
435+
self.__endTagCache = {}
433436

434437
def processEOF(self):
435438
raise NotImplementedError
@@ -449,7 +452,12 @@ def processSpaceCharacters(self, token):
449452
self.tree.insertText(token["data"])
450453

451454
def processStartTag(self, token):
452-
return self.startTagHandler[token["name"]](token)
455+
name = token["name"]
456+
try:
457+
func = self.__startTagCache[name]
458+
except KeyError:
459+
func = self.__startTagCache[name] = self.startTagHandler[name].__get__(self)
460+
return func(token)
453461

454462
def startTagHtml(self, token):
455463
if not self.parser.firstStartTag and token["name"] == "html":
@@ -462,9 +470,16 @@ def startTagHtml(self, token):
462470
self.parser.firstStartTag = False
463471

464472
def processEndTag(self, token):
465-
return self.endTagHandler[token["name"]](token)
473+
name = token["name"]
474+
try:
475+
func = self.__endTagCache[name]
476+
except KeyError:
477+
func = self.__endTagCache[name] = self.endTagHandler[name].__get__(self)
478+
return func(token)
466479

467480
class InitialPhase(Phase):
481+
__slots__ = tuple()
482+
468483
def processSpaceCharacters(self, token):
469484
pass
470485

@@ -593,6 +608,8 @@ def processEOF(self):
593608
return True
594609

595610
class BeforeHtmlPhase(Phase):
611+
__slots__ = tuple()
612+
596613
# helper methods
597614
def insertHtmlElement(self):
598615
self.tree.insertRoot(impliedTagToken("html", "StartTag"))
@@ -628,10 +645,7 @@ def processEndTag(self, token):
628645
return token
629646

630647
class BeforeHeadPhase(Phase):
631-
def __init__(self, parser, tree):
632-
Phase.__init__(self, parser, tree)
633-
self.startTagHandler = _utils.BoundMethodDispatcher(self, self.startTagHandler)
634-
self.endTagHandler = _utils.BoundMethodDispatcher(self, self.endTagHandler)
648+
__slots__ = tuple()
635649

636650
def processEOF(self):
637651
self.startTagHead(impliedTagToken("head", "StartTag"))
@@ -676,10 +690,7 @@ def endTagOther(self, token):
676690
endTagHandler.default = endTagOther
677691

678692
class InHeadPhase(Phase):
679-
def __init__(self, parser, tree):
680-
Phase.__init__(self, parser, tree)
681-
self.startTagHandler = _utils.BoundMethodDispatcher(self, self.startTagHandler)
682-
self.endTagHandler = _utils.BoundMethodDispatcher(self, self.endTagHandler)
693+
__slots__ = tuple()
683694

684695
# the real thing
685696
def processEOF(self):
@@ -781,10 +792,7 @@ def anythingElse(self):
781792
endTagHandler.default = endTagOther
782793

783794
class InHeadNoscriptPhase(Phase):
784-
def __init__(self, parser, tree):
785-
Phase.__init__(self, parser, tree)
786-
self.startTagHandler = _utils.BoundMethodDispatcher(self, self.startTagHandler)
787-
self.endTagHandler = _utils.BoundMethodDispatcher(self, self.endTagHandler)
795+
__slots__ = tuple()
788796

789797
def processEOF(self):
790798
self.parser.parseError("eof-in-head-noscript")
@@ -847,10 +855,7 @@ def anythingElse(self):
847855
endTagHandler.default = endTagOther
848856

849857
class AfterHeadPhase(Phase):
850-
def __init__(self, parser, tree):
851-
Phase.__init__(self, parser, tree)
852-
self.startTagHandler = _utils.BoundMethodDispatcher(self, self.startTagHandler)
853-
self.endTagHandler = _utils.BoundMethodDispatcher(self, self.endTagHandler)
858+
__slots__ = tuple()
854859

855860
def processEOF(self):
856861
self.anythingElse()
@@ -918,11 +923,10 @@ def anythingElse(self):
918923
class InBodyPhase(Phase):
919924
# http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody
920925
# the really-really-really-very crazy mode
921-
def __init__(self, parser, tree):
922-
Phase.__init__(self, parser, tree)
923-
self.startTagHandler = _utils.BoundMethodDispatcher(self, self.startTagHandler)
924-
self.endTagHandler = _utils.BoundMethodDispatcher(self, self.endTagHandler)
926+
__slots__ = ("processSpaceCharacters",)
925927

928+
def __init__(self, *args, **kwargs):
929+
super(InBodyPhase, self).__init__(*args, **kwargs)
926930
# Set this to the default handler
927931
self.processSpaceCharacters = self.processSpaceCharactersNonPre
928932

@@ -1641,10 +1645,7 @@ def endTagOther(self, token):
16411645
endTagHandler.default = endTagOther
16421646

16431647
class TextPhase(Phase):
1644-
def __init__(self, parser, tree):
1645-
Phase.__init__(self, parser, tree)
1646-
self.startTagHandler = _utils.BoundMethodDispatcher(self, self.startTagHandler)
1647-
self.endTagHandler = _utils.BoundMethodDispatcher(self, self.endTagHandler)
1648+
__slots__ = tuple()
16481649

16491650
def processCharacters(self, token):
16501651
self.tree.insertText(token["data"])
@@ -1678,10 +1679,7 @@ def endTagOther(self, token):
16781679

16791680
class InTablePhase(Phase):
16801681
# http://www.whatwg.org/specs/web-apps/current-work/#in-table
1681-
def __init__(self, parser, tree):
1682-
Phase.__init__(self, parser, tree)
1683-
self.startTagHandler = _utils.BoundMethodDispatcher(self, self.startTagHandler)
1684-
self.endTagHandler = _utils.BoundMethodDispatcher(self, self.endTagHandler)
1682+
__slots__ = tuple()
16851683

16861684
# helper methods
16871685
def clearStackToTableContext(self):
@@ -1825,8 +1823,10 @@ def endTagOther(self, token):
18251823
endTagHandler.default = endTagOther
18261824

18271825
class InTableTextPhase(Phase):
1828-
def __init__(self, parser, tree):
1829-
Phase.__init__(self, parser, tree)
1826+
__slots__ = ("originalPhase", "characterTokens")
1827+
1828+
def __init__(self, *args, **kwargs):
1829+
super(InTableTextPhase, self).__init__(*args, **kwargs)
18301830
self.originalPhase = None
18311831
self.characterTokens = []
18321832

@@ -1871,10 +1871,7 @@ def processEndTag(self, token):
18711871

18721872
class InCaptionPhase(Phase):
18731873
# http://www.whatwg.org/specs/web-apps/current-work/#in-caption
1874-
def __init__(self, parser, tree):
1875-
Phase.__init__(self, parser, tree)
1876-
self.startTagHandler = _utils.BoundMethodDispatcher(self, self.startTagHandler)
1877-
self.endTagHandler = _utils.BoundMethodDispatcher(self, self.endTagHandler)
1874+
__slots__ = tuple()
18781875

18791876
def ignoreEndTagCaption(self):
18801877
return not self.tree.elementInScope("caption", variant="table")
@@ -1944,11 +1941,7 @@ def endTagOther(self, token):
19441941

19451942
class InColumnGroupPhase(Phase):
19461943
# http://www.whatwg.org/specs/web-apps/current-work/#in-column
1947-
1948-
def __init__(self, parser, tree):
1949-
Phase.__init__(self, parser, tree)
1950-
self.startTagHandler = _utils.BoundMethodDispatcher(self, self.startTagHandler)
1951-
self.endTagHandler = _utils.BoundMethodDispatcher(self, self.endTagHandler)
1944+
__slots__ = tuple()
19521945

19531946
def ignoreEndTagColgroup(self):
19541947
return self.tree.openElements[-1].name == "html"
@@ -2012,10 +2005,7 @@ def endTagOther(self, token):
20122005

20132006
class InTableBodyPhase(Phase):
20142007
# http://www.whatwg.org/specs/web-apps/current-work/#in-table0
2015-
def __init__(self, parser, tree):
2016-
Phase.__init__(self, parser, tree)
2017-
self.startTagHandler = _utils.BoundMethodDispatcher(self, self.startTagHandler)
2018-
self.endTagHandler = _utils.BoundMethodDispatcher(self, self.endTagHandler)
2008+
__slots__ = tuple()
20192009

20202010
# helper methods
20212011
def clearStackToTableBodyContext(self):
@@ -2113,10 +2103,7 @@ def endTagOther(self, token):
21132103

21142104
class InRowPhase(Phase):
21152105
# http://www.whatwg.org/specs/web-apps/current-work/#in-row
2116-
def __init__(self, parser, tree):
2117-
Phase.__init__(self, parser, tree)
2118-
self.startTagHandler = _utils.BoundMethodDispatcher(self, self.startTagHandler)
2119-
self.endTagHandler = _utils.BoundMethodDispatcher(self, self.endTagHandler)
2106+
__slots__ = tuple()
21202107

21212108
# helper methods (XXX unify this with other table helper methods)
21222109
def clearStackToTableRowContext(self):
@@ -2205,10 +2192,7 @@ def endTagOther(self, token):
22052192

22062193
class InCellPhase(Phase):
22072194
# http://www.whatwg.org/specs/web-apps/current-work/#in-cell
2208-
def __init__(self, parser, tree):
2209-
Phase.__init__(self, parser, tree)
2210-
self.startTagHandler = _utils.BoundMethodDispatcher(self, self.startTagHandler)
2211-
self.endTagHandler = _utils.BoundMethodDispatcher(self, self.endTagHandler)
2195+
__slots__ = tuple()
22122196

22132197
# helper
22142198
def closeCell(self):
@@ -2283,10 +2267,7 @@ def endTagOther(self, token):
22832267
endTagHandler.default = endTagOther
22842268

22852269
class InSelectPhase(Phase):
2286-
def __init__(self, parser, tree):
2287-
Phase.__init__(self, parser, tree)
2288-
self.startTagHandler = _utils.BoundMethodDispatcher(self, self.startTagHandler)
2289-
self.endTagHandler = _utils.BoundMethodDispatcher(self, self.endTagHandler)
2270+
__slots__ = tuple()
22902271

22912272
# http://www.whatwg.org/specs/web-apps/current-work/#in-select
22922273
def processEOF(self):
@@ -2385,10 +2366,7 @@ def endTagOther(self, token):
23852366
endTagHandler.default = endTagOther
23862367

23872368
class InSelectInTablePhase(Phase):
2388-
def __init__(self, parser, tree):
2389-
Phase.__init__(self, parser, tree)
2390-
self.startTagHandler = _utils.BoundMethodDispatcher(self, self.startTagHandler)
2391-
self.endTagHandler = _utils.BoundMethodDispatcher(self, self.endTagHandler)
2369+
__slots__ = tuple()
23922370

23932371
def processEOF(self):
23942372
self.parser.phases["inSelect"].processEOF()
@@ -2426,6 +2404,8 @@ def endTagOther(self, token):
24262404
endTagHandler.default = endTagOther
24272405

24282406
class InForeignContentPhase(Phase):
2407+
__slots__ = tuple()
2408+
24292409
breakoutElements = frozenset(["b", "big", "blockquote", "body", "br",
24302410
"center", "code", "dd", "div", "dl", "dt",
24312411
"em", "embed", "h1", "h2", "h3",
@@ -2435,9 +2415,6 @@ class InForeignContentPhase(Phase):
24352415
"span", "strong", "strike", "sub", "sup",
24362416
"table", "tt", "u", "ul", "var"])
24372417

2438-
def __init__(self, parser, tree):
2439-
Phase.__init__(self, parser, tree)
2440-
24412418
def adjustSVGTagNames(self, token):
24422419
replacements = {"altglyph": "altGlyph",
24432420
"altglyphdef": "altGlyphDef",
@@ -2541,10 +2518,7 @@ def processEndTag(self, token):
25412518
return new_token
25422519

25432520
class AfterBodyPhase(Phase):
2544-
def __init__(self, parser, tree):
2545-
Phase.__init__(self, parser, tree)
2546-
self.startTagHandler = _utils.BoundMethodDispatcher(self, self.startTagHandler)
2547-
self.endTagHandler = _utils.BoundMethodDispatcher(self, self.endTagHandler)
2521+
__slots__ = tuple()
25482522

25492523
def processEOF(self):
25502524
# Stop parsing
@@ -2591,10 +2565,7 @@ def endTagOther(self, token):
25912565

25922566
class InFramesetPhase(Phase):
25932567
# http://www.whatwg.org/specs/web-apps/current-work/#in-frameset
2594-
def __init__(self, parser, tree):
2595-
Phase.__init__(self, parser, tree)
2596-
self.startTagHandler = _utils.BoundMethodDispatcher(self, self.startTagHandler)
2597-
self.endTagHandler = _utils.BoundMethodDispatcher(self, self.endTagHandler)
2568+
__slots__ = tuple()
25982569

25992570
def processEOF(self):
26002571
if self.tree.openElements[-1].name != "html":
@@ -2650,10 +2621,7 @@ def endTagOther(self, token):
26502621

26512622
class AfterFramesetPhase(Phase):
26522623
# http://www.whatwg.org/specs/web-apps/current-work/#after3
2653-
def __init__(self, parser, tree):
2654-
Phase.__init__(self, parser, tree)
2655-
self.startTagHandler = _utils.BoundMethodDispatcher(self, self.startTagHandler)
2656-
self.endTagHandler = _utils.BoundMethodDispatcher(self, self.endTagHandler)
2624+
__slots__ = tuple()
26572625

26582626
def processEOF(self):
26592627
# Stop parsing
@@ -2688,9 +2656,7 @@ def endTagOther(self, token):
26882656
endTagHandler.default = endTagOther
26892657

26902658
class AfterAfterBodyPhase(Phase):
2691-
def __init__(self, parser, tree):
2692-
Phase.__init__(self, parser, tree)
2693-
self.startTagHandler = _utils.BoundMethodDispatcher(self, self.startTagHandler)
2659+
__slots__ = tuple()
26942660

26952661
def processEOF(self):
26962662
pass
@@ -2727,9 +2693,7 @@ def processEndTag(self, token):
27272693
startTagHandler.default = startTagOther
27282694

27292695
class AfterAfterFramesetPhase(Phase):
2730-
def __init__(self, parser, tree):
2731-
Phase.__init__(self, parser, tree)
2732-
self.startTagHandler = _utils.BoundMethodDispatcher(self, self.startTagHandler)
2696+
__slots__ = tuple()
27332697

27342698
def processEOF(self):
27352699
pass

0 commit comments

Comments
 (0)