Skip to content

Commit

Permalink
Improve possessive repeat support
Browse files (browse the repository at this point in the history)
  • Loading branch information
Zoltan Herczeg committed Dec 19, 2024
1 parent 413bd8a commit 0747eb9
Show file tree
Hide file tree
Showing 5 changed files with 191 additions and 93 deletions.
228 changes: 135 additions & 93 deletions src/pcre2_jit_compile.c
Original file line number | Diff line number | Diff line change
Expand Up @@ -1184,13 +1184,8 @@ while (cc < ccend)

case OP_TYPEPOSUPTO:
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
if (common->utf)
{
if (cc[1 + IMM2_SIZE] == OP_EXTUNI && locals_size <= 4 * SSIZE_OF(sw))
locals_size = 4 * SSIZE_OF(sw);
else if (locals_size <= 3 * SSIZE_OF(sw))
locals_size = 3 * SSIZE_OF(sw);
}
if (common->utf && locals_size <= 3 * SSIZE_OF(sw))
locals_size = 3 * SSIZE_OF(sw);
#endif
if (cc[1 + IMM2_SIZE] == OP_EXTUNI && locals_size <= 3 * SSIZE_OF(sw))
locals_size = 3 * SSIZE_OF(sw);
Expand Down Expand Up @@ -1307,7 +1302,7 @@ while (cc < ccend)

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
case OP_CRPOSRANGE:
if (GET2(cc, 1) < GET2(cc, 1 + IMM2_SIZE) && locals_size <= 3 * SSIZE_OF(sw))
if (common->utf && GET2(cc, 1) != GET2(cc, 1 + IMM2_SIZE) && locals_size <= 3 * SSIZE_OF(sw))
locals_size = 3 * SSIZE_OF(sw);
cc += 1 + 2 * IMM2_SIZE;
break;
Expand Down Expand Up @@ -10447,8 +10442,10 @@ else
*exact = 1;
*opcode -= OP_PLUS - OP_STAR;
}
return cc;
}
else if (*opcode >= OP_CRPOSSTAR && *opcode <= OP_CRPOSQUERY)

if (*opcode >= OP_CRPOSSTAR && *opcode <= OP_CRPOSQUERY)
{
*opcode -= OP_CRPOSSTAR - OP_POSSTAR;
*end = cc + class_len;
Expand All @@ -10458,41 +10455,36 @@ else
*exact = 1;
*opcode = OP_POSSTAR;
}
return cc;
}
else

SLJIT_ASSERT(*opcode == OP_CRRANGE || *opcode == OP_CRMINRANGE || *opcode == OP_CRPOSRANGE);
*max = GET2(cc, (class_len + IMM2_SIZE));
*exact = GET2(cc, class_len);
*end = cc + class_len + 2 * IMM2_SIZE;

if (*max == 0)
{
SLJIT_ASSERT(*opcode == OP_CRRANGE || *opcode == OP_CRMINRANGE || *opcode == OP_CRPOSRANGE);
*max = GET2(cc, (class_len + IMM2_SIZE));
*exact = GET2(cc, class_len);
SLJIT_ASSERT(*exact > 1);
if (*opcode == OP_CRPOSRANGE)
*opcode = OP_POSUPTO;
else
*opcode -= OP_CRRANGE - OP_STAR;
return cc;
}

if (*max == 0)
{
if (*opcode == OP_CRPOSRANGE)
*opcode = OP_POSSTAR;
else
*opcode -= OP_CRRANGE - OP_STAR;
}
*max -= *exact;
if (*max == 0)
*opcode = OP_EXACT;
else
{
SLJIT_ASSERT(*exact > 0 || *max > 1);
if (*opcode == OP_CRPOSRANGE)
*opcode = OP_POSUPTO;
else if (*max == 1)
*opcode -= OP_CRRANGE - OP_QUERY;
else
{
*max -= *exact;
if (*max == 0)
*opcode = OP_EXACT;
else if (*max == 1)
{
if (*opcode == OP_CRPOSRANGE)
*opcode = OP_POSQUERY;
else
*opcode -= OP_CRRANGE - OP_QUERY;
}
else
{
if (*opcode == OP_CRPOSRANGE)
*opcode = OP_POSUPTO;
else
*opcode -= OP_CRRANGE - OP_UPTO;
}
}
*end = cc + class_len + 2 * IMM2_SIZE;
*opcode -= OP_CRRANGE - OP_UPTO;
}
return cc;
}
Expand Down Expand Up @@ -10593,36 +10585,49 @@ else
}

/* Handle fixed part first. */
if (exact > 1)
if (opcode != OP_POSUPTO)
{
SLJIT_ASSERT(early_fail_ptr == 0);
if (exact > 1)
{
SLJIT_ASSERT(early_fail_ptr == 0);

if (common->mode == PCRE2_JIT_COMPLETE
#ifdef SUPPORT_UNICODE
&& !common->utf
if (common->mode == PCRE2_JIT_COMPLETE
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
&& !common->utf
#endif
&& type != OP_ANYNL && type != OP_EXTUNI)
{
OP2(SLJIT_ADD, TMP1, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(exact));
add_jump(compiler, &backtrack->own_backtracks, CMP(SLJIT_GREATER, TMP1, 0, STR_END, 0));
OP1(SLJIT_MOV, tmp_base, tmp_offset, SLJIT_IMM, exact);
label = LABEL();
compile_char1_matchingpath(common, type, cc, &backtrack->own_backtracks, FALSE);
OP2(SLJIT_SUB | SLJIT_SET_Z, tmp_base, tmp_offset, tmp_base, tmp_offset, SLJIT_IMM, 1);
JUMPTO(SLJIT_NOT_ZERO, label);
&& type != OP_ANYNL && type != OP_EXTUNI)
{
OP2(SLJIT_SUB, TMP1, 0, STR_END, 0, STR_PTR, 0);
add_jump(compiler, &backtrack->own_backtracks, CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, IN_UCHARS(exact)));

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 32
if (type == OP_ALLANY && !common->invalid_utf)
#else
if (type == OP_ALLANY)
#endif
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(exact));
else
{
OP1(SLJIT_MOV, tmp_base, tmp_offset, SLJIT_IMM, exact);
label = LABEL();
compile_char1_matchingpath(common, type, cc, &backtrack->own_backtracks, FALSE);
OP2(SLJIT_SUB | SLJIT_SET_Z, tmp_base, tmp_offset, tmp_base, tmp_offset, SLJIT_IMM, 1);
JUMPTO(SLJIT_NOT_ZERO, label);
}
}
else
{
SLJIT_ASSERT(tmp_base == TMP3 || common->locals_size >= 3 * SSIZE_OF(sw));
OP1(SLJIT_MOV, tmp_base, tmp_offset, SLJIT_IMM, exact);
label = LABEL();
compile_char1_matchingpath(common, type, cc, &backtrack->own_backtracks, TRUE);
OP2(SLJIT_SUB | SLJIT_SET_Z, tmp_base, tmp_offset, tmp_base, tmp_offset, SLJIT_IMM, 1);
JUMPTO(SLJIT_NOT_ZERO, label);
}
}
else
{
SLJIT_ASSERT(tmp_base == TMP3 || common->locals_size >= 3 * SSIZE_OF(sw));
OP1(SLJIT_MOV, tmp_base, tmp_offset, SLJIT_IMM, exact);
label = LABEL();
else if (exact == 1 && opcode != OP_STAR && opcode != OP_MINSTAR && opcode != OP_POSSTAR)
compile_char1_matchingpath(common, type, cc, &backtrack->own_backtracks, TRUE);
OP2(SLJIT_SUB | SLJIT_SET_Z, tmp_base, tmp_offset, tmp_base, tmp_offset, SLJIT_IMM, 1);
JUMPTO(SLJIT_NOT_ZERO, label);
}
}
else if (exact == 1 && opcode != OP_STAR && opcode != OP_MINSTAR && opcode != OP_POSSTAR)
compile_char1_matchingpath(common, type, cc, &backtrack->own_backtracks, TRUE);

if (early_fail_type == type_fail_range)
{
Expand Down Expand Up @@ -11037,64 +11042,101 @@ switch(opcode)
case OP_POSUPTO:
SLJIT_ASSERT(early_fail_ptr == 0);
SLJIT_ASSERT(tmp_base == TMP3 || common->locals_size >= 3 * SSIZE_OF(sw));

max += exact;
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
if (common->utf)
if (type == OP_EXTUNI || common->utf)
#else
if (type == OP_EXTUNI)
#endif
{
SLJIT_ASSERT(common->locals_size >= 3 * SSIZE_OF(sw));
if (tmp_base != TMP3)

if (exact == max)
{
SLJIT_ASSERT(type == OP_EXTUNI && common->locals_size >= 4 * SSIZE_OF(sw));
tmp_offset = LOCAL3;
/* Extuni has a separate OP_TYPEEXACT opcode for fixed iterations. */
SLJIT_ASSERT(tmp_base == TMP3);
/* Count match is not modified by compile_char1_matchingpath. */
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCAL2, COUNT_MATCH, 0);
OP1(SLJIT_MOV, TMP3, 0, SLJIT_IMM, 0);

label = LABEL();
OP1(SLJIT_MOV, COUNT_MATCH, 0, STR_PTR, 0);
compile_char1_matchingpath(common, type, cc, &no_match, TRUE);
OP2(SLJIT_ADD, TMP3, 0, TMP3, 0, SLJIT_IMM, 1);
JUMPTO(SLJIT_JUMP, label);

set_jumps(no_match, LABEL());

OP1(SLJIT_MOV, STR_PTR, 0, COUNT_MATCH, 0);
OP1(SLJIT_MOV, COUNT_MATCH, 0, SLJIT_MEM1(SLJIT_SP), LOCAL2);
add_jump(compiler, &backtrack->own_backtracks, CMP(SLJIT_LESS, TMP3, 0, SLJIT_IMM, exact));
break;
}

OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCAL2, STR_PTR, 0);
OP1(SLJIT_MOV, tmp_base, tmp_offset, SLJIT_IMM, max);
/* Count match is not modified by compile_char1_matchingpath. */
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCAL2, COUNT_MATCH, 0);
OP1(SLJIT_MOV, COUNT_MATCH, 0, SLJIT_IMM, max);

detect_partial_match(common, &no_match);
label = LABEL();
compile_char1_matchingpath(common, type, cc, &no_match, FALSE);
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCAL2, STR_PTR, 0);
OP2(SLJIT_SUB | SLJIT_SET_Z, tmp_base, tmp_offset, tmp_base, tmp_offset, SLJIT_IMM, 1);
add_jump(compiler, &no_match, JUMP(SLJIT_ZERO));
detect_partial_match_to(common, label);
/* Extuni only modifies TMP3 on successful match. */
OP1(SLJIT_MOV, TMP3, 0, STR_PTR, 0);
compile_char1_matchingpath(common, type, cc, &no_match, TRUE);
OP2(SLJIT_SUB | SLJIT_SET_Z, COUNT_MATCH, 0, COUNT_MATCH, 0, SLJIT_IMM, 1);
JUMPTO(SLJIT_NOT_ZERO, label);

OP1(SLJIT_MOV, TMP3, 0, STR_PTR, 0);
set_jumps(no_match, LABEL());
OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(SLJIT_SP), LOCAL2);

if (exact > 0)
OP2U(SLJIT_SUB | SLJIT_SET_GREATER, COUNT_MATCH, 0, SLJIT_IMM, max - exact);

OP1(SLJIT_MOV, COUNT_MATCH, 0, SLJIT_MEM1(SLJIT_SP), LOCAL2);

if (exact > 0)
add_jump(compiler, &backtrack->own_backtracks, JUMP(SLJIT_GREATER));
OP1(SLJIT_MOV, STR_PTR, 0, TMP3, 0);
break;
}
#endif

if (type == OP_ALLANY)
SLJIT_ASSERT(tmp_base == TMP3);

if (exact == max)
{
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(max));
OP1(SLJIT_MOV, TMP3, 0, SLJIT_IMM, 0);

if (common->mode == PCRE2_JIT_COMPLETE)
{
OP2U(SLJIT_SUB | SLJIT_SET_GREATER, STR_PTR, 0, STR_END, 0);
SELECT(SLJIT_GREATER, STR_PTR, STR_END, 0, STR_PTR);
}
else
{
jump = CMP(SLJIT_LESS_EQUAL, STR_PTR, 0, STR_END, 0);
process_partial_match(common);
JUMPHERE(jump);
}
detect_partial_match(common, &no_match);
label = LABEL();
compile_char1_matchingpath(common, type, cc, &no_char1_match, FALSE);
OP2(SLJIT_ADD, TMP3, 0, TMP3, 0, SLJIT_IMM, 1);
detect_partial_match_to(common, label);
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));

set_jumps(no_char1_match, LABEL());
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
set_jumps(no_match, LABEL());

add_jump(compiler, &backtrack->own_backtracks, CMP(SLJIT_LESS, TMP3, 0, SLJIT_IMM, exact));
break;
}

OP1(SLJIT_MOV, tmp_base, tmp_offset, SLJIT_IMM, max);
OP1(SLJIT_MOV, TMP3, 0, SLJIT_IMM, max);

detect_partial_match(common, &no_match);
label = LABEL();
/* Extuni may read more characters, but it never jumps since it is always successful. */
compile_char1_matchingpath(common, type, cc, &no_char1_match, FALSE);
OP2(SLJIT_SUB | SLJIT_SET_Z, tmp_base, tmp_offset, tmp_base, tmp_offset, SLJIT_IMM, 1);
OP2(SLJIT_SUB | SLJIT_SET_Z, TMP3, 0, TMP3, 0, SLJIT_IMM, 1);
add_jump(compiler, &no_match, JUMP(SLJIT_ZERO));
detect_partial_match_to(common, label);
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));

set_jumps(no_char1_match, LABEL());
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
set_jumps(no_match, LABEL());

if (exact > 0)
add_jump(compiler, &backtrack->own_backtracks, CMP(SLJIT_GREATER, TMP3, 0, SLJIT_IMM, max - exact));
break;

case OP_POSQUERY:
Expand Down
9 changes: 9 additions & 0 deletions testdata/testinput1
Original file line number | Diff line number | Diff line change
Expand Up @@ -7034,6 +7034,15 @@ $/x
\= Expect no match
z

/^.{4}+/s
abcdef
abcde
abcd
\= Expect no match
abc
ab
a

# --------------

# End of testinput1
12 changes: 12 additions & 0 deletions testdata/testinput4
Original file line number | Diff line number | Diff line change
Expand Up @@ -998,6 +998,13 @@
\= Expect no match
\x{660}\x{661}\x{662}ABC

/^\pN{3,}+(.)/utf
\x{7c0}8\x{662}\x{966}\x{95c}
\x{7c0}8\x{662}\x{95c}
\= Expect no match
\x{7c0}8\x{662}\x{966}
\x{7c0}8\x{95c}

/(?<=A\p{Nd})XYZ/utf
A2XYZ
123A5XYZPQR
Expand Down Expand Up @@ -1127,6 +1134,11 @@
A\x{300}\x{301}B\x{300}C\x{300}\x{301}X
A\x{300}\x{301}B\x{300}C\x{300}\x{301}DA\x{300}X

/^\X{3}+/utf
A\x{300}B\x{301}C
\= Expect no match
A\x{300}

/^\X/utf
A
A\x{300}BC
Expand Down
15 changes: 15 additions & 0 deletions testdata/testoutput1
Original file line number | Diff line number | Diff line change
Expand Up @@ -11054,6 +11054,21 @@ No match
z
No match

/^.{4}+/s
abcdef
0: abcd
abcde
0: abcd
abcd
0: abcd
\= Expect no match
abc
No match
ab
No match
a
No match

# --------------

# End of testinput1
Loading

0 comments on commit 0747eb9

Please sign in to comment.