
Add benchmarks and optimize html5lib a bit #493

Merged
merged 8 commits on Jun 17, 2020
57 changes: 57 additions & 0 deletions benchmarks/bench_html.py
@@ -0,0 +1,57 @@
import io
import os
import sys

import pyperf

# Make the in-tree html5lib importable when the script is run from a checkout.
sys.path[0:0] = [os.path.join(os.path.dirname(__file__), "..")]
import html5lib  # noqa: E402


def bench_parse(fh, treebuilder):
    fh.seek(0)
    html5lib.parse(fh, treebuilder=treebuilder, useChardet=False)


def bench_serialize(loops, fh, treebuilder):
    # Parse once outside the timed region; only serialization is measured.
    fh.seek(0)
    doc = html5lib.parse(fh, treebuilder=treebuilder, useChardet=False)

    range_it = range(loops)
    t0 = pyperf.perf_counter()

    for _ in range_it:
        html5lib.serialize(doc, tree=treebuilder, encoding="ascii", inject_meta_charset=False)

    return pyperf.perf_counter() - t0


BENCHMARKS = ["parse", "serialize"]


def add_cmdline_args(cmd, args):
    if args.benchmark:
        cmd.append(args.benchmark)


if __name__ == "__main__":
    runner = pyperf.Runner(add_cmdline_args=add_cmdline_args)
    runner.metadata["description"] = "Run benchmarks based on Anolis"
    runner.argparser.add_argument("benchmark", nargs="?", choices=BENCHMARKS)

    args = runner.parse_args()
    if args.benchmark:
        benchmarks = (args.benchmark,)
    else:
        benchmarks = BENCHMARKS

    # Read the input once and share a seekable in-memory copy across benchmarks.
    with open(os.path.join(os.path.dirname(__file__), "data", "html.html"), "rb") as fh:
        source = io.BytesIO(fh.read())

    if "parse" in benchmarks:
        for tb in ("etree", "dom", "lxml"):
            runner.bench_func("html_parse_%s" % tb, bench_parse, source, tb)

    if "serialize" in benchmarks:
        for tb in ("etree", "dom", "lxml"):
            runner.bench_time_func("html_serialize_%s" % tb, bench_serialize, source, tb)
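
A note on the two registration calls above: `bench_func` times the callable itself, which fits `bench_parse`, where everything inside the function should be measured. `bench_time_func` instead passes a loop count and expects the callable to return the elapsed time, which is what lets `bench_serialize` keep the one-time parse out of the measured region. A minimal standalone sketch of that second protocol (the function name and workload here are illustrative, not part of the PR):

import pyperf


def timed_sum(loops):
    # Any one-time setup placed here is excluded from the measurement.
    t0 = pyperf.perf_counter()
    for _ in range(loops):
        sum(range(100))  # the operation under measurement
    # Report only the time spent in the loop above.
    return pyperf.perf_counter() - t0


if __name__ == "__main__":
    runner = pyperf.Runner()
    runner.bench_time_func("sum_100", timed_sum)
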
45 changes: 45 additions & 0 deletions benchmarks/bench_wpt.py
@@ -0,0 +1,45 @@
import io
import os
import sys

import pyperf

# Make the in-tree html5lib importable when the script is run from a checkout.
sys.path[0:0] = [os.path.join(os.path.dirname(__file__), "..")]
import html5lib  # noqa: E402


def bench_html5lib(fh):
    fh.seek(0)
    html5lib.parse(fh, treebuilder="etree", useChardet=False)


def add_cmdline_args(cmd, args):
    if args.benchmark:
        cmd.append(args.benchmark)


# Discover every .html file under data/wpt and key it by its basename.
BENCHMARKS = {}
for root, dirs, files in os.walk(os.path.join(os.path.dirname(os.path.abspath(__file__)), "data", "wpt")):
    for f in files:
        if f.endswith(".html"):
            BENCHMARKS[f[: -len(".html")]] = os.path.join(root, f)


if __name__ == "__main__":
    runner = pyperf.Runner(add_cmdline_args=add_cmdline_args)
    runner.metadata["description"] = "Run parser benchmarks from WPT"
    runner.argparser.add_argument("benchmark", nargs="?", choices=sorted(BENCHMARKS))

    args = runner.parse_args()
    if args.benchmark:
        benchmarks = (args.benchmark,)
    else:
        benchmarks = sorted(BENCHMARKS)

    for bench in benchmarks:
        name = "wpt_%s" % bench
        path = BENCHMARKS[bench]
        # Load each file into memory once so disk I/O is not measured.
        with open(path, "rb") as fh:
            fh2 = io.BytesIO(fh.read())

        runner.bench_func(name, bench_html5lib, fh2)
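
Both scripts pass `add_cmdline_args` because pyperf re-executes the script in spawned worker processes; forwarding the positional `benchmark` argument ensures each worker runs the same subset selected on the parent's command line instead of falling back to all benchmarks. A stripped-down sketch of the same pattern (the benchmark names and workload are illustrative):

import pyperf


def add_cmdline_args(cmd, args):
    # cmd is the command line pyperf will use for each worker process;
    # re-append our custom positional argument so workers see it too.
    if args.benchmark:
        cmd.append(args.benchmark)


if __name__ == "__main__":
    runner = pyperf.Runner(add_cmdline_args=add_cmdline_args)
    runner.argparser.add_argument("benchmark", nargs="?", choices=["a", "b"])
    args = runner.parse_args()
    # Without the forwarding above, a worker would see no positional argument
    # and would register (and run) both benchmarks instead of the selected one.
    for bench in ([args.benchmark] if args.benchmark else ["a", "b"]):
        runner.bench_func("noop_%s" % bench, lambda: None)
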
8 changes: 8 additions & 0 deletions benchmarks/data/README.md
@@ -0,0 +1,8 @@
The files in this directory are derived from:

* `html.html`: the first 5000 lines of `source` from [html](http://github.com/whatwg/html),
  revision 77db356a293f2b152b648c836b6989d17afe42bb. (This makes it representative of the input to
  [Anolis](https://bitbucket.org/ms2ger/anolis/); the first 5000 lines were chosen so that the file
  parses in a reasonable time.)

* `wpt`: see `wpt/README.md`.
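
For reference, pyperf scripts like these are normally run directly and compared across revisions with pyperf's own tooling; assuming they are invoked from the repository root, something like `python benchmarks/bench_html.py parse -o parse.json` followed by `python -m pyperf compare_to old.json parse.json` would collect results and compare them against a previous run.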