Skip to content

Commit

Permalink
Add support for lazy matchers
Browse files Browse the repository at this point in the history
Support is added for lazy builtin matchers (with a separately compiled
file), as well as loading json or yaml files using lazy matchers.

Lazy matchers are very much a tradeoff: they improve import speed, but
slow down run speed, possibly dramatically.

Use them by default for the re2 parser, but not the basic parser:
experimentally, on Python 3.11

- importing the package itself takes ~36ms
- importing the lazy matchers takes ~36ms (including the package, so ~0)
- importing the eager matchers takes ~97ms

the eager matchers have a significant import-time overhead; *however*,
when running the bench on the sample file, the lazy matchers cause a
runtime increase of 700~800ms on the basic parser bench, as that ends
up instantiating *every* regex (likely due to match failures).
Relatively this is not huge (~2.5%), but the tradeoff doesn't seem
great, especially since the parser itself is initialized lazily.

The re2 parser does much better, only losing 20~30ms (~1%); this is
likely because it only needs to compile a fraction of the regexes (156
out of 1162 as of regexes.yaml version 0.18), and possibly because it
gets to avoid some of the most expensive-to-compile ones.

Fixes ua-parser#171, fixes ua-parser#173
  • Loading branch information
masklinn committed Feb 17, 2024
1 parent 04d0b7d commit bdc33fd
Show file tree
Hide file tree
Showing 9 changed files with 450 additions and 138 deletions.
239 changes: 139 additions & 100 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env python
# flake8: noqa
import io
from contextlib import suppress
from contextlib import suppress, contextmanager
from os import fspath
from pathlib import Path
from typing import Optional, List, Dict
Expand Down Expand Up @@ -52,21 +52,6 @@ def run(self) -> None:
f"Unable to find regexes.yaml, should be at {yaml_src!r}"
)

def write_matcher(f, typ: str, fields: List[Optional[object]]):
f.write(f" {typ}(".encode())
while len(fields) > 1 and fields[-1] is None:
fields = fields[:-1]
f.write(", ".join(map(repr, fields)).encode())
f.write(b"),\n")

def write_params(fields):
# strip trailing None values
while len(fields) > 1 and fields[-1] is None:
fields.pop()

for field in fields:
fp.write((f" {field!r},\n").encode())

with yaml_src.open("rb") as f:
regexes = yaml.safe_load(f)

Expand All @@ -79,96 +64,150 @@ def write_params(fields):
outdir.mkdir(parents=True, exist_ok=True)

dest = outdir / "_matchers.py"
dest_lazy = outdir / "_lazy.py"
dest_legacy = outdir / "_regexes.py"

with dest.open("wb") as f, dest_legacy.open("wb") as fp:
# fmt: off
f.write(b"""\
with dest.open("wb") as eager, dest_lazy.open("wb") as lazy, dest_legacy.open(
"wb"
) as legacy:
eager = EagerWriter(eager)
lazy = LazyWriter(lazy)
legacy = LegacyWriter(legacy)

for section in ["user_agent_parsers", "os_parsers", "device_parsers"]:
with eager.section(section), lazy.section(section), legacy.section(
section
):
extract = EXTRACTORS[section]
for p in regexes[section]:
el = trim(extract(p))
eager.item(el)
lazy.item(el)
legacy.item(el)
eager.end()
lazy.end()
legacy.end()


def trim(l):
    """Strip trailing ``None`` entries from ``l`` in place, always
    keeping at least one element, and return the same (mutated) list.
    """
    while len(l) > 1 and l[-1] is None:
        del l[-1]
    return l


def _field_extractor(*optional_keys):
    """Build a function mapping a regexes.yaml entry (a dict) to a
    positional-argument list: the mandatory ``regex`` value first,
    followed by the values of ``optional_keys`` in order (``None``
    where a key is absent).
    """

    def extract(entry):
        return [entry["regex"], *(entry.get(key) for key in optional_keys)]

    return extract


# Per-section conversion from a parsed regexes.yaml entry to the
# positional arguments of the corresponding matcher/parser class.
EXTRACTORS = {
    "user_agent_parsers": _field_extractor(
        "family_replacement",
        "v1_replacement",
        "v2_replacement",
    ),
    "os_parsers": _field_extractor(
        "os_replacement",
        "os_v1_replacement",
        "os_v2_replacement",
        "os_v3_replacement",
        "os_v4_replacement",
    ),
    "device_parsers": _field_extractor(
        "regex_flag",
        "device_replacement",
        "brand_replacement",
        "model_replacement",
    ),
}


class Writer:
    """Base class for the generated-module writers.

    Writes the autogeneration notice and a subclass-specific prefix on
    construction, then emits one list-like section per parser category
    via :meth:`section` / :meth:`item`, and a closing suffix via
    :meth:`end`.

    Subclasses must provide the class attributes:

    - ``prefix``: bytes written once, right after the notice
    - ``sections``: mapping of section id to its opening bytes
    - ``items``: mapping of section id to the per-item opening bytes
    - ``suffix``: bytes written by :meth:`end`

    and may override ``section_end`` (bytes written when a section is
    closed).
    """

    # written after each section's items; only LegacyWriter needs one
    section_end = b""

    def __init__(self, fp):
        """Write the notice and subclass prefix to binary file ``fp``."""
        self.fp = fp
        self.fp.write(
            b"""\
########################################################
# NOTICE: this file is autogenerated from regexes.yaml #
########################################################
"""
        )
        self.fp.write(self.prefix)
        self._section = None

    @contextmanager
    def section(self, id):
        """Open section ``id`` for the duration of the ``with`` block."""
        self._section = id
        self.fp.write(self.sections[id])
        try:
            yield
        finally:
            # emit the terminator even if an item write raised, so the
            # output stays structurally consistent (without try/finally
            # a @contextmanager skips all post-yield code on error)
            self.fp.write(self.section_end)

    def item(self, elements):
        """Write one entry, e.g. ``    DeviceMatcher('re', 'flag', 'repl1'),``."""
        self.fp.write(self.items[self._section])
        self.fp.write(", ".join(map(repr, elements)).encode())
        self.fp.write(b"),\n")

    def end(self):
        """Write the subclass suffix, closing the generated module."""
        self.fp.write(self.suffix)


class LegacyWriter(Writer):
    """Writer for the legacy ``_regexes.py`` module: three independent
    top-level lists of ``*Parser`` objects, consumed by the historical
    ``user_agent_parser`` API.
    """

    # module header of the generated file: public names + parser imports
    prefix = b"""\
__all__ = [
    "USER_AGENT_PARSERS",
    "DEVICE_PARSERS",
    "OS_PARSERS",
]
from .user_agent_parser import UserAgentParser, DeviceParser, OSParser
"""
    # each section opens its own module-level list literal
    sections = {
        "user_agent_parsers": b"USER_AGENT_PARSERS = [\n",
        "os_parsers": b"\n\nOS_PARSERS = [\n",
        "device_parsers": b"\n\nDEVICE_PARSERS = [\n",
    }
    # unlike the MATCHERS writers, each list must be closed individually
    section_end = b"]"
    items = {
        "user_agent_parsers": b"    UserAgentParser(",
        "os_parsers": b"    OSParser(",
        "device_parsers": b"    DeviceParser(",
    }
    suffix = b"\n"


class EagerWriter(Writer):
    """Writer for ``_matchers.py``: a single ``MATCHERS`` tuple of three
    lists of eagerly-compiled matcher objects from ``.core``.
    """

    # module header: opens the MATCHERS tuple (and its first list)
    prefix = b"""\
__all__ = ["MATCHERS"]
from typing import Tuple, List
from .core import UserAgentMatcher, OSMatcher, DeviceMatcher
MATCHERS: Tuple[List[UserAgentMatcher], List[OSMatcher], List[DeviceMatcher]] = ([
"""
    # sections after the first close the previous list and open the next;
    # no per-section terminator is needed (section_end stays b"")
    sections = {
        "user_agent_parsers": b"",
        "os_parsers": b"], [\n",
        "device_parsers": b"], [\n",
    }
    items = {
        "user_agent_parsers": b"    UserAgentMatcher(",
        "os_parsers": b"    OSMatcher(",
        "device_parsers": b"    DeviceMatcher(",
    }
    # closes both the last list and the surrounding tuple
    suffix = b"])\n"


class LazyWriter(EagerWriter):
prefix = b"""\
__all__ = ["MATCHERS"]
from typing import Tuple, List
from .lazy import UserAgentMatcher, OSMatcher, DeviceMatcher
from .core import Matchers, UserAgentMatcher, OSMatcher, DeviceMatcher
MATCHERS: Matchers = ([
""")
fp.write(b"# -*- coding: utf-8 -*-\n")
fp.write(b"########################################################\n")
fp.write(b"# NOTICE: This file is autogenerated from regexes.yaml #\n")
fp.write(b"########################################################\n")
fp.write(b"\n")
fp.write(b"from .user_agent_parser import (\n")
fp.write(b" UserAgentParser, DeviceParser, OSParser,\n")
fp.write(b")\n")
fp.write(b"\n")
fp.write(b"__all__ = ('USER_AGENT_PARSERS', 'DEVICE_PARSERS', 'OS_PARSERS')\n")
fp.write(b"\n")
fp.write(b"USER_AGENT_PARSERS = [\n")
for device_parser in regexes["user_agent_parsers"]:
write_matcher(f, "UserAgentMatcher", [
device_parser["regex"],
device_parser.get("family_replacement"),
device_parser.get("v1_replacement"),
device_parser.get("v2_replacement"),
])

fp.write(b" UserAgentParser(\n")
write_params([
device_parser["regex"],
device_parser.get("family_replacement"),
device_parser.get("v1_replacement"),
device_parser.get("v2_replacement"),
])
fp.write(b" ),\n")
f.write(b" ], [\n")
fp.write(b"]\n\n")

fp.write(b"OS_PARSERS = [\n")
for device_parser in regexes["os_parsers"]:
write_matcher(f, "OSMatcher", [
device_parser["regex"],
device_parser.get("os_replacement"),
device_parser.get("os_v1_replacement"),
device_parser.get("os_v2_replacement"),
device_parser.get("os_v3_replacement"),
device_parser.get("os_v4_replacement"),
])

fp.write(b" OSParser(\n")
write_params([
device_parser["regex"],
device_parser.get("os_replacement"),
device_parser.get("os_v1_replacement"),
device_parser.get("os_v2_replacement"),
device_parser.get("os_v3_replacement"),
device_parser.get("os_v4_replacement"),
])
fp.write(b" ),\n")
f.write(b" ], [\n")
fp.write(b"]\n\n")

fp.write(b"DEVICE_PARSERS = [\n")
for device_parser in regexes["device_parsers"]:
write_matcher(f, "DeviceMatcher", [
device_parser["regex"],
device_parser.get("regex_flag"),
device_parser.get("device_replacement"),
device_parser.get("brand_replacement"),
device_parser.get("model_replacement"),
])

fp.write(b" DeviceParser(\n")
write_params([
device_parser["regex"],
device_parser.get("regex_flag"),
device_parser.get("device_replacement"),
device_parser.get("brand_replacement"),
device_parser.get("model_replacement"),
])
fp.write(b" ),\n")
f.write(b"])\n")
fp.write(b"]\n")
# fmt: on
MATCHERS: Tuple[List[UserAgentMatcher], List[OSMatcher], List[DeviceMatcher]] = ([
"""


setup(
Expand Down
5 changes: 3 additions & 2 deletions src/ua_parser/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
"UserAgent",
"UserAgentMatcher",
"load_builtins",
"load_lazy_builtins",
"load_data",
"load_yaml",
"parse",
Expand Down Expand Up @@ -65,7 +66,7 @@
)
from .basic import Parser as BasicParser
from .caching import CachingParser, Clearing, LRU, Locking
from .loaders import load_builtins, load_data, load_yaml
from .loaders import load_builtins, load_lazy_builtins, load_data, load_yaml

Re2Parser: Optional[Callable[[Matchers], Parser]] = None
with contextlib.suppress(ImportError):
Expand All @@ -79,7 +80,7 @@ def __getattr__(name: str) -> Parser:
global parser
if name == "parser":
if Re2Parser is not None:
parser = Re2Parser(load_builtins())
parser = Re2Parser(load_lazy_builtins())
else:
parser = CachingParser(
BasicParser(load_builtins()),
Expand Down
10 changes: 10 additions & 0 deletions src/ua_parser/_lazy.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
__all__ = ["MATCHERS"]

from typing import Tuple, List
from .lazy import UserAgentMatcher, OSMatcher, DeviceMatcher

MATCHERS: Tuple[
List[UserAgentMatcher],
List[OSMatcher],
List[DeviceMatcher],
]
11 changes: 9 additions & 2 deletions src/ua_parser/_matchers.pyi
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
from .core import Matchers
__all__ = ["MATCHERS"]

MATCHERS: Matchers
from typing import Tuple, List
from .core import UserAgentMatcher, OSMatcher, DeviceMatcher

MATCHERS: Tuple[
List[UserAgentMatcher],
List[OSMatcher],
List[DeviceMatcher],
]
7 changes: 4 additions & 3 deletions src/ua_parser/basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
Device,
DeviceMatcher,
Domain,
Matcher,
Matchers,
OS,
OSMatcher,
Expand All @@ -23,9 +24,9 @@ class Parser(AbstractParser):
when one matches.
"""

user_agent_matchers: List[UserAgentMatcher]
os_matchers: List[OSMatcher]
device_matchers: List[DeviceMatcher]
user_agent_matchers: List[Matcher[UserAgent]]
os_matchers: List[Matcher[OS]]
device_matchers: List[Matcher[Device]]

def __init__(
self,
Expand Down
Loading

0 comments on commit bdc33fd

Please sign in to comment.