Skip to content

Commit

Permalink
Merge pull request #4202 from tybug/domains
Browse files Browse the repository at this point in the history
Improve performance of `st.domains()`
  • Loading branch information
Zac-HD authored Dec 19, 2024
2 parents 904bdd9 + 3ed3d47 commit a6a166a
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 12 deletions.
3 changes: 3 additions & 0 deletions hypothesis-python/RELEASE.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
RELEASE_TYPE: patch

This patch improves generation performance for the provisional :func:`~hypothesis.provisional.domains` strategy, including its derivative strategies :func:`~hypothesis.provisional.urls` and :func:`~hypothesis.strategies.emails`.
37 changes: 25 additions & 12 deletions hypothesis-python/src/hypothesis/provisional.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,16 @@ def get_top_level_domains() -> tuple[str, ...]:
return ("COM", *sorted((d for d in _tlds if d != "ARPA"), key=len))


@st.composite
def _recase_randomly(draw, tld):
tld = list(tld)
changes = draw(st.tuples(*(st.booleans() for _ in range(len(tld)))))
for i, change_case in enumerate(changes):
if change_case:
tld[i] = tld[i].lower() if tld[i].isupper() else tld[i].upper()
return "".join(tld)


class DomainNameStrategy(st.SearchStrategy):
@staticmethod
def clean_inputs(
Expand Down Expand Up @@ -89,34 +99,37 @@ def __init__(
# information in https://tools.ietf.org/html/rfc1035#section-2.3.1
# which defines the allowed syntax of a subdomain string.
if self.max_element_length == 1:
self.label_regex = r"[a-zA-Z]"
label_regex = r"[a-zA-Z]"
elif self.max_element_length == 2:
self.label_regex = r"[a-zA-Z][a-zA-Z0-9]?"
label_regex = r"[a-zA-Z][a-zA-Z0-9]?"
else:
maximum_center_character_pattern_repetitions = self.max_element_length - 2
self.label_regex = r"[a-zA-Z]([a-zA-Z0-9\-]{0,%d}[a-zA-Z0-9])?" % (
label_regex = r"[a-zA-Z]([a-zA-Z0-9\-]{0,%d}[a-zA-Z0-9])?" % (
maximum_center_character_pattern_repetitions,
)

def do_draw(self, data):
# Construct reusable strategies here to avoid a performance hit by doing
# so repeatedly in do_draw.

# 1 - Select a valid top-level domain (TLD) name
# 2 - Check that the number of characters in our selected TLD won't
# prevent us from generating at least a 1 character subdomain.
# 3 - Randomize the TLD between upper and lower case characters.
domain = data.draw(

self.domain_strategy = (
st.sampled_from(get_top_level_domains())
.filter(lambda tld: len(tld) + 2 <= self.max_length)
.flatmap(
lambda tld: st.tuples(
*(st.sampled_from([c.lower(), c.upper()]) for c in tld)
).map("".join)
)
.flatmap(_recase_randomly)
)

# RFC-5890 s2.3.1 says such labels are reserved, and since we don't
# want to bother with xn-- punycode labels we'll exclude them all.
elem_st = st.from_regex(self.label_regex, fullmatch=True).filter(
self.elem_strategy = st.from_regex(label_regex, fullmatch=True).filter(
lambda label: len(label) < 4 or label[2:4] != "--"
)

def do_draw(self, data):
domain = data.draw(self.domain_strategy)
# The maximum possible number of subdomains is 126,
# 1 character subdomain + 1 '.' character, * 126 = 252,
# with a max of 255, that leaves 3 characters for a TLD.
Expand All @@ -125,7 +138,7 @@ def do_draw(self, data):
elements = cu.many(data, min_size=1, average_size=3, max_size=126)
while elements.more():
# Generate a new valid subdomain using the regex strategy.
sub_domain = data.draw(elem_st)
sub_domain = data.draw(self.elem_strategy)
if len(domain) + len(sub_domain) >= self.max_length:
data.stop_example(discard=True)
break
Expand Down

0 comments on commit a6a166a

Please sign in to comment.