Skip to content

Commit

Permalink
Optimize out unnecessary eclass bitmaps
Browse files Browse the repository at this point in the history
  • Loading branch information
Zoltan Herczeg committed Dec 5, 2024
1 parent e8a5cd7 commit a9a8a4b
Show file tree
Hide file tree
Showing 7 changed files with 344 additions and 312 deletions.
5 changes: 2 additions & 3 deletions HACKING
Original file line number Diff line number Diff line change
Expand Up @@ -666,9 +666,8 @@ inside an OP_ECLASS at match time. They are:
ECL_ANY match all characters; no additional data
ECL_NONE match no characters; no additional data

The meaning of the bitmap indicated by ECL_MAP is different to that of XCL_MAP
for OP_XCLASS, in one way. The ECL_MAP bitmap is present whenever any code
points < 256 match the class.
The meaning of the bitmap indicated by ECL_MAP is the same as that of XCL_MAP
for OP_XCLASS. If the bitmap is present, all code points < 256 are checked against the bitmap.


Back references
Expand Down
195 changes: 3 additions & 192 deletions src/pcre2_compile.c
Original file line number Diff line number Diff line change
Expand Up @@ -6173,198 +6173,9 @@ for (;; pptr++)

if ((*pptr & CLASS_IS_ECLASS) != 0)
{
eclass_op_info op_info;
PCRE2_SIZE previous_length = (lengthptr != NULL)? *lengthptr : 0;
BOOL allbitsone = TRUE;
BOOL allbitszero = TRUE;

previous = code;
*code++ = OP_ECLASS;
code += LINK_SIZE;
*code++ = 0; /* Flags, currently zero. */
if (!PRIV(compile_class_nested)(options, xoptions, FALSE, &pptr, &code,
&op_info, errorcodeptr, cb, lengthptr))
if (!PRIV(compile_class_nested)(options, xoptions, &pptr, &code,
errorcodeptr, cb, lengthptr))
return 0;

if (lengthptr != NULL)
{
*lengthptr += code - previous;
code = previous;
/* (*lengthptr - previous_length) now holds the amount of buffer that
we require to make the call to compile_class_nested() with
lengthptr = NULL, and including the (1+LINK_SIZE+1) that we write out
before that call. */
}

/* Do some useful counting of what's in the bitmap. */
for (int i = 0; i < 8; i++)
if (op_info.bits.classwords[i] != 0xffffffff)
{
allbitsone = FALSE;
break;
}
for (int i = 0; i < 8; i++)
if (op_info.bits.classwords[i] != 0)
{
allbitszero = FALSE;
break;
}

/* After constant-folding the extended class syntax, it may turn out to be
a simple class after all. In that case, we can unwrap it from the
OP_ECLASS container - and in fact, we must do so, because in 8-bit
no-Unicode mode the matcher is compiled without support for OP_ECLASS. */

#ifndef SUPPORT_WIDE_CHARS
PCRE2_ASSERT(op_info.op_single_type != 0);
#else
if (op_info.op_single_type != 0)
#endif
{
/* Rewind back over the OP_ECLASS. */
code = previous;

/* If the bits are all ones, and the "high characters" are all matched
too, we use a special-cased encoding of OP_ALLANY. */

if (op_info.op_single_type == ECL_ANY && allbitsone)
{
/* Advancing code means rewinding lengthptr, at this point. */
if (lengthptr != NULL) *lengthptr -= 1;
*code++ = OP_ALLANY;
}

/* If the high bits are all matched / all not-matched, then we emit an
OP_NCLASS/OP_CLASS respectively. */

else if (op_info.op_single_type == ECL_ANY ||
op_info.op_single_type == ECL_NONE)
{
PCRE2_SIZE required_len = 1 + (32 / sizeof(PCRE2_UCHAR));

if (lengthptr != NULL)
{
if (required_len > (*lengthptr - previous_length))
*lengthptr = previous_length + required_len;
}

/* Advancing code means rewinding lengthptr, at this point. */
if (lengthptr != NULL) *lengthptr -= required_len;
*code++ = (op_info.op_single_type == ECL_ANY)? OP_NCLASS : OP_CLASS;
memcpy(code, op_info.bits.classbits, 32);
code += 32 / sizeof(PCRE2_UCHAR);
}

/* Otherwise, we have an ECL_XCLASS, so we have the OP_XCLASS data
there, but, we pulled out its bitmap into op_info, so now we have to
put that back into the OP_XCLASS. */

else
{
#ifndef SUPPORT_WIDE_CHARS
PCRE2_DEBUG_UNREACHABLE();
#else
BOOL need_map;
PCRE2_SIZE required_len;

PCRE2_ASSERT(op_info.op_single_type == ECL_XCLASS);
need_map = !allbitszero;
required_len =
op_info.length + (need_map? 32/sizeof(PCRE2_UCHAR) : 0);

if (lengthptr != NULL)
{
/* Don't unconditionally request all the space we need - we may
already have asked for more during processing of the ECLASS. */
if (required_len > (*lengthptr - previous_length))
*lengthptr = previous_length + required_len;

/* The code we write out here won't be ignored, even during the
(lengthptr != NULL) phase, because if there's a following quantifier
it will peek backwards. So we do have to write out a (truncated)
OP_XCLASS, even on this branch. */
*lengthptr -= 1 + LINK_SIZE + 1;
*code++ = OP_XCLASS;
PUT(code, 0, 1 + LINK_SIZE + 1);
code += LINK_SIZE;
*code++ = 0;
}
else
{
PCRE2_UCHAR *rest;
PCRE2_SIZE rest_len;
PCRE2_UCHAR flags;

/* 1 unit: OP_XCLASS | LINK_SIZE units | 1 unit: flags | ...rest */
PCRE2_ASSERT(op_info.length >= 1 + LINK_SIZE + 1);
rest = op_info.code_start + 1 + LINK_SIZE + 1;
rest_len = (op_info.code_start + op_info.length) - rest;

/* First read any data we use, before memmove splats it. */
flags = op_info.code_start[1 + LINK_SIZE];
PCRE2_ASSERT((flags & XCL_MAP) == 0);

/* Next do the memmove before any writes. */
memmove(
code + 1 + LINK_SIZE + 1 + (need_map? 32/sizeof(PCRE2_UCHAR) : 0),
rest, CU2BYTES(rest_len));

/* Finally write the header data. */
*code++ = OP_XCLASS;
PUT(code, 0, (int)required_len);
code += LINK_SIZE;
*code++ = flags | (need_map? XCL_MAP : 0);
if (need_map)
{
memcpy(code, op_info.bits.classbits, 32);
code += 32 / sizeof(PCRE2_UCHAR);
}
code += rest_len;
}
#endif /* SUPPORT_WIDE_CHARS */
}
}

/* Otherwise, we're going to keep the OP_ECLASS. However, again we need
to do some adjustment to insert the bitmap if we have one. */

#ifdef SUPPORT_WIDE_CHARS
else
{
BOOL need_map = !allbitszero;
PCRE2_SIZE required_len = 1 + LINK_SIZE + 1 +
(need_map? 32/sizeof(PCRE2_UCHAR) : 0) + op_info.length;

if (lengthptr != NULL)
{
if (required_len > (*lengthptr - previous_length))
*lengthptr = previous_length + required_len;

/* As for the XCLASS branch above, we do have to write out a dummy
OP_ECLASS, because of the backwards peek by the quantifier code. Write
out a (truncated) OP_ECLASS, even on this branch. */
*lengthptr -= 1 + LINK_SIZE + 1;
*code++ = OP_ECLASS;
PUT(code, 0, 1 + LINK_SIZE + 1);
code += LINK_SIZE;
*code++ = 0;
}
else
{
if (need_map)
{
PCRE2_UCHAR *map_start = previous + 1 + LINK_SIZE + 1;
previous[1 + LINK_SIZE] |= ECL_MAP;
memmove(map_start + 32/sizeof(PCRE2_UCHAR), map_start,
CU2BYTES(code - map_start));
memcpy(map_start, op_info.bits.classbits, 32);
code += 32 / sizeof(PCRE2_UCHAR);
}
PUT(previous, 1, (int)(code - previous));
}
}
#endif /* SUPPORT_WIDE_CHARS */

goto CLASS_END_PROCESSING;
}

Expand Down Expand Up @@ -6489,7 +6300,7 @@ for (;; pptr++)
/* Now emit the OP_CLASS/OP_NCLASS/OP_XCLASS/OP_ALLANY opcode. */

pptr = PRIV(compile_class_not_nested)(options, xoptions, pptr + 1,
&code, meta == META_CLASS_NOT, FALSE,
&code, meta == META_CLASS_NOT, NULL,
errorcodeptr, cb, lengthptr);
if (pptr == NULL) return 0;
PCRE2_ASSERT(*pptr == META_CLASS_END);
Expand Down
6 changes: 3 additions & 3 deletions src/pcre2_compile.h
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,7 @@ void PRIV(update_classbits)(uint32_t ptype, uint32_t pdata, BOOL negated,
OP_CLASS, OP_NCLASS, OP_XCLASS, or OP_ALLANY into pcode. */

uint32_t *PRIV(compile_class_not_nested)(uint32_t options, uint32_t xoptions,
uint32_t *start_ptr, PCRE2_UCHAR **pcode, BOOL negate_class, BOOL always_map,
uint32_t *start_ptr, PCRE2_UCHAR **pcode, BOOL negate_class, BOOL* has_bitmap,
int *errorcodeptr, compile_block *cb, PCRE2_SIZE *lengthptr);

/* Compile the META codes in pptr into opcodes written to pcode. The pptr must
Expand All @@ -272,8 +272,8 @@ start at a META_CLASS or META_CLASS_NOT.
The pptr will be left pointing at the matching META_CLASS_END. */

BOOL PRIV(compile_class_nested)(uint32_t options, uint32_t xoptions,
BOOL negated, uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info,
int *errorcodeptr, compile_block *cb, PCRE2_SIZE *lengthptr);
uint32_t **pptr, PCRE2_UCHAR **pcode, int *errorcodeptr,
compile_block *cb, PCRE2_SIZE *lengthptr);

#endif /* PCRE2_COMPILE_H_IDEMPOTENT_GUARD */

Expand Down
Loading

0 comments on commit a9a8a4b

Please sign in to comment.