Skip to content

Commit

Permalink
Add support for lazy matchers
Browse files Browse the repository at this point in the history
Support is added for lazy builtin matchers (with a separately compiled
file), as well as loading json or yaml files using lazy matchers.

Lazy matchers are very much a tradeoff: they improve import speed, but
slow down run speed, possibly dramatically.

Use them by default for the re2 parser, but not the basic parser:
experimentally, on Python 3.11

- importing the package itself takes ~36ms
- importing the lazy matchers takes ~36ms (including the package, so ~0)
- importing the eager matchers takes ~97ms

the eager matchers have a significant import-time overhead; *however*,
when running the bench on the sample file, the lazy matchers cause a
runtime increase of 700~800ms on the basic parser bench, as that ends
up instantiating *every* regex (likely due to match failures).
Relatively this is not huge (~2.5%), but the tradeoff doesn't seem
great, especially since the parser itself is initialized lazily.

The re2 parser does much better, only losing 20~30ms (~1%); this is
likely because it only needs to compile a fraction of the regexes (156
out of 1162 as of regexes.yaml version 0.18), and possibly because it
gets to avoid some of the most expensive-to-compile ones.

Fixes ua-parser#171, fixes ua-parser#173
  • Loading branch information
masklinn committed Feb 17, 2024
1 parent 04d0b7d commit bdc33fd
Show file tree
Hide file tree
Showing 9 changed files with 450 additions and 138 deletions.
239 changes: 139 additions & 100 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env python
# flake8: noqa
import io
from contextlib import suppress
from contextlib import suppress, contextmanager
from os import fspath
from pathlib import Path
from typing import Optional, List, Dict
Expand Down Expand Up @@ -52,21 +52,6 @@ def run(self) -> None:
f"Unable to find regexes.yaml, should be at {yaml_src!r}"
)

def write_matcher(f, typ: str, fields: List[Optional[object]]):
f.write(f" {typ}(".encode())
while len(fields) > 1 and fields[-1] is None:
fields = fields[:-1]
f.write(", ".join(map(repr, fields)).encode())
f.write(b"),\n")

def write_params(fields):
# strip trailing None values
while len(fields) > 1 and fields[-1] is None:
fields.pop()

for field in fields:
fp.write((f" {field!r},\n").encode())

with yaml_src.open("rb") as f:
regexes = yaml.safe_load(f)

Expand All @@ -79,96 +64,150 @@ def write_params(fields):
outdir.mkdir(parents=True, exist_ok=True)

dest = outdir / "_matchers.py"
dest_lazy = outdir / "_lazy.py"
dest_legacy = outdir / "_regexes.py"

with dest.open("wb") as f, dest_legacy.open("wb") as fp:
# fmt: off
f.write(b"""\
with dest.open("wb") as eager, dest_lazy.open("wb") as lazy, dest_legacy.open(
"wb"
) as legacy:
eager = EagerWriter(eager)
lazy = LazyWriter(lazy)
legacy = LegacyWriter(legacy)

for section in ["user_agent_parsers", "os_parsers", "device_parsers"]:
with eager.section(section), lazy.section(section), legacy.section(
section
):
extract = EXTRACTORS[section]
for p in regexes[section]:
el = trim(extract(p))
eager.item(el)
lazy.item(el)
legacy.item(el)
eager.end()
lazy.end()
legacy.end()


def trim(l):
    """Strip trailing ``None`` entries from ``l`` in place, always
    keeping at least one element, and return the same (mutated) list.
    """
    while len(l) > 1 and l[-1] is None:
        del l[-1]
    return l


def _field_extractor(*optional_keys):
    """Build a function mapping a regexes.yaml entry (a dict) to a
    positional-argument list: the mandatory ``regex`` value first,
    followed by the values of ``optional_keys`` in order (``None``
    where a key is absent).
    """

    def extract(entry):
        return [entry["regex"], *(entry.get(key) for key in optional_keys)]

    return extract


# Per-section conversion from a parsed regexes.yaml entry to the
# positional arguments of the corresponding matcher/parser class.
EXTRACTORS = {
    "user_agent_parsers": _field_extractor(
        "family_replacement",
        "v1_replacement",
        "v2_replacement",
    ),
    "os_parsers": _field_extractor(
        "os_replacement",
        "os_v1_replacement",
        "os_v2_replacement",
        "os_v3_replacement",
        "os_v4_replacement",
    ),
    "device_parsers": _field_extractor(
        "regex_flag",
        "device_replacement",
        "brand_replacement",
        "model_replacement",
    ),
}


class Writer:
    """Base class for the generated-module writers.

    Writes the autogeneration notice and a subclass-specific prefix on
    construction, then emits one list-like section per parser category
    via :meth:`section` / :meth:`item`, and a closing suffix via
    :meth:`end`.

    Subclasses must provide the class attributes:

    - ``prefix``: bytes written once, right after the notice
    - ``sections``: mapping of section id to its opening bytes
    - ``items``: mapping of section id to the per-item opening bytes
    - ``suffix``: bytes written by :meth:`end`

    and may override ``section_end`` (bytes written when a section is
    closed).
    """

    # written after each section's items; only LegacyWriter needs one
    section_end = b""

    def __init__(self, fp):
        """Write the notice and subclass prefix to binary file ``fp``."""
        self.fp = fp
        self.fp.write(
            b"""\
########################################################
# NOTICE: this file is autogenerated from regexes.yaml #
########################################################
"""
        )
        self.fp.write(self.prefix)
        self._section = None

    @contextmanager
    def section(self, id):
        """Open section ``id`` for the duration of the ``with`` block."""
        self._section = id
        self.fp.write(self.sections[id])
        try:
            yield
        finally:
            # emit the terminator even if an item write raised, so the
            # output stays structurally consistent (without try/finally
            # a @contextmanager skips all post-yield code on error)
            self.fp.write(self.section_end)

    def item(self, elements):
        """Write one entry, e.g. ``    DeviceMatcher('re', 'flag', 'repl1'),``."""
        self.fp.write(self.items[self._section])
        self.fp.write(", ".join(map(repr, elements)).encode())
        self.fp.write(b"),\n")

    def end(self):
        """Write the subclass suffix, closing the generated module."""
        self.fp.write(self.suffix)


class LegacyWriter(Writer):
    """Writer for the legacy ``_regexes.py`` module: three independent
    top-level lists of ``*Parser`` objects, consumed by the historical
    ``user_agent_parser`` API.
    """

    # module header of the generated file: public names + parser imports
    prefix = b"""\
__all__ = [
    "USER_AGENT_PARSERS",
    "DEVICE_PARSERS",
    "OS_PARSERS",
]
from .user_agent_parser import UserAgentParser, DeviceParser, OSParser
"""
    # each section opens its own module-level list literal
    sections = {
        "user_agent_parsers": b"USER_AGENT_PARSERS = [\n",
        "os_parsers": b"\n\nOS_PARSERS = [\n",
        "device_parsers": b"\n\nDEVICE_PARSERS = [\n",
    }
    # unlike the MATCHERS writers, each list must be closed individually
    section_end = b"]"
    items = {
        "user_agent_parsers": b"    UserAgentParser(",
        "os_parsers": b"    OSParser(",
        "device_parsers": b"    DeviceParser(",
    }
    suffix = b"\n"


class EagerWriter(Writer):
    """Writer for ``_matchers.py``: a single ``MATCHERS`` tuple of three
    lists of eagerly-compiled matcher objects from ``.core``.
    """

    # module header: opens the MATCHERS tuple (and its first list)
    prefix = b"""\
__all__ = ["MATCHERS"]
from typing import Tuple, List
from .core import UserAgentMatcher, OSMatcher, DeviceMatcher
MATCHERS: Tuple[List[UserAgentMatcher], List[OSMatcher], List[DeviceMatcher]] = ([
"""
    # sections after the first close the previous list and open the next;
    # no per-section terminator is needed (section_end stays b"")
    sections = {
        "user_agent_parsers": b"",
        "os_parsers": b"], [\n",
        "device_parsers": b"], [\n",
    }
    items = {
        "user_agent_parsers": b"    UserAgentMatcher(",
        "os_parsers": b"    OSMatcher(",
        "device_parsers": b"    DeviceMatcher(",
    }
    # closes both the last list and the surrounding tuple
    suffix = b"])\n"


class LazyWriter(EagerWriter):
prefix = b"""\
__all__ = ["MATCHERS"]
from typing import Tuple, List
from .lazy import UserAgentMatcher, OSMatcher, DeviceMatcher
from .core import Matchers, UserAgentMatcher, OSMatcher, DeviceMatcher
MATCHERS: Matchers = ([
""")
fp.write(b"# -*- coding: utf-8 -*-\n")
fp.write(b"########################################################\n")
fp.write(b"# NOTICE: This file is autogenerated from regexes.yaml #\n")
fp.write(b"########################################################\n")
fp.write(b"\n")
fp.write(b"from .user_agent_parser import (\n")
fp.write(b" UserAgentParser, DeviceParser, OSParser,\n")
fp.write(b")\n")
fp.write(b"\n")
fp.write(b"__all__ = ('USER_AGENT_PARSERS', 'DEVICE_PARSERS', 'OS_PARSERS')\n")
fp.write(b"\n")
fp.write(b"USER_AGENT_PARSERS = [\n")
for device_parser in regexes["user_agent_parsers"]:
write_matcher(f, "UserAgentMatcher", [
device_parser["regex"],
device_parser.get("family_replacement"),
device_parser.get("v1_replacement"),
device_parser.get("v2_replacement"),
])

fp.write(b" UserAgentParser(\n")
write_params([
device_parser["regex"],
device_parser.get("family_replacement"),
device_parser.get("v1_replacement"),
device_parser.get("v2_replacement"),
])
fp.write(b" ),\n")
f.write(b" ], [\n")
fp.write(b"]\n\n")

fp.write(b"OS_PARSERS = [\n")
for device_parser in regexes["os_parsers"]:
write_matcher(f, "OSMatcher", [
device_parser["regex"],
device_parser.get("os_replacement"),
device_parser.get("os_v1_replacement"),
device_parser.get("os_v2_replacement"),
device_parser.get("os_v3_replacement"),
device_parser.get("os_v4_replacement"),
])

fp.write(b" OSParser(\n")
write_params([
device_parser["regex"],
device_parser.get("os_replacement"),
device_parser.get("os_v1_replacement"),
device_parser.get("os_v2_replacement"),
device_parser.get("os_v3_replacement"),
device_parser.get("os_v4_replacement"),
])
fp.write(b" ),\n")
f.write(b" ], [\n")
fp.write(b"]\n\n")

fp.write(b"DEVICE_PARSERS = [\n")
for device_parser in regexes["device_parsers"]:
write_matcher(f, "DeviceMatcher", [
device_parser["regex"],
device_parser.get("regex_flag"),
device_parser.get("device_replacement"),
device_parser.get("brand_replacement"),
device_parser.get("model_replacement"),
])

fp.write(b" DeviceParser(\n")
write_params([
device_parser["regex"],
device_parser.get("regex_flag"),
device_parser.get("device_replacement"),
device_parser.get("brand_replacement"),
device_parser.get("model_replacement"),
])
fp.write(b" ),\n")
f.write(b"])\n")
fp.write(b"]\n")
# fmt: on
MATCHERS: Tuple[List[UserAgentMatcher], List[OSMatcher], List[DeviceMatcher]] = ([
"""


setup(
Expand Down
5 changes: 3 additions & 2 deletions src/ua_parser/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
"UserAgent",
"UserAgentMatcher",
"load_builtins",
"load_lazy_builtins",
"load_data",
"load_yaml",
"parse",
Expand Down Expand Up @@ -65,7 +66,7 @@
)
from .basic import Parser as BasicParser
from .caching import CachingParser, Clearing, LRU, Locking
from .loaders import load_builtins, load_data, load_yaml
from .loaders import load_builtins, load_lazy_builtins, load_data, load_yaml

Re2Parser: Optional[Callable[[Matchers], Parser]] = None
with contextlib.suppress(ImportError):
Expand All @@ -79,7 +80,7 @@ def __getattr__(name: str) -> Parser:
global parser
if name == "parser":
if Re2Parser is not None:
parser = Re2Parser(load_builtins())
parser = Re2Parser(load_lazy_builtins())
else:
parser = CachingParser(
BasicParser(load_builtins()),
Expand Down
10 changes: 10 additions & 0 deletions src/ua_parser/_lazy.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
__all__ = ["MATCHERS"]

from typing import Tuple, List
from .lazy import UserAgentMatcher, OSMatcher, DeviceMatcher

MATCHERS: Tuple[
List[UserAgentMatcher],
List[OSMatcher],
List[DeviceMatcher],
]
11 changes: 9 additions & 2 deletions src/ua_parser/_matchers.pyi
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
from .core import Matchers
__all__ = ["MATCHERS"]

MATCHERS: Matchers
from typing import Tuple, List
from .core import UserAgentMatcher, OSMatcher, DeviceMatcher

MATCHERS: Tuple[
List[UserAgentMatcher],
List[OSMatcher],
List[DeviceMatcher],
]
7 changes: 4 additions & 3 deletions src/ua_parser/basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
Device,
DeviceMatcher,
Domain,
Matcher,
Matchers,
OS,
OSMatcher,
Expand All @@ -23,9 +24,9 @@ class Parser(AbstractParser):
when one matches.
"""

user_agent_matchers: List[UserAgentMatcher]
os_matchers: List[OSMatcher]
device_matchers: List[DeviceMatcher]
user_agent_matchers: List[Matcher[UserAgent]]
os_matchers: List[Matcher[OS]]
device_matchers: List[Matcher[Device]]

def __init__(
self,
Expand Down
Loading

0 comments on commit bdc33fd

Please sign in to comment.