diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index ed4572cdd..ec23c8c66 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -68,7 +68,7 @@ jobs: - name: Test (main test script) run: | - ulimit -S -s 32768 # Raise stack limit; ASAN with -O0 is very stack-hungry + ulimit -S -s 49152 # Raise stack limit; ASAN with -O0 is very stack-hungry ./RunTest - name: Test (JIT test program) diff --git a/HACKING b/HACKING index 68e01bf45..4e3a024c8 100644 --- a/HACKING +++ b/HACKING @@ -633,6 +633,41 @@ When XCL_NOT is set, the bit map, if present, contains bits for characters that are allowed (exactly as for OP_NCLASS), but the list of items that follow it specifies characters and properties that are not allowed. +The meaning of the bitmap indicated by XCL_MAP is that, if one is present, then +it fully describes which code points < 256 match the class (without needing to +invert the check according to XCL_NOT); the other items in the OP_XCLASS need +not be consulted. However, if a bitmap is not present, then code points < 256 +may still match, so the other items in the OP_XCLASS must be consulted. + +For classes containing logical expressions, such as "[\p{Greek} && \p{Lu}]" for +"uppercase Greek letters", OP_ECLASS is used. The expression is encoded as a +stack-based series of operands and operators, in Reverse Polish Notation. +Like an OP_XCLASS, the OP_ECLASS is first followed by a LINK_SIZE value +containing the total length of the opcode and its data. That is followed by a +code containing flags: currently just ECL_MAP indicating that a bit map is +present. There follows the bit map, if ECL_MAP is set. Finally a sequence of +items that are either an operand or operator. 
Each item starts with a single +code unit containing its type: + + ECL_AND AND; no additional data + ECL_OR OR; no additional data + ECL_XOR XOR; no additional data + ECL_NOT NOT; no additional data + ECL_XCLASS The additional data which follows ECL_XCLASS is the same as for + an OP_XCLASS, except that this data is preceded by ECL_XCLASS + rather than OP_XCLASS. + +Additionally, there are two intermediate values used during compilation, but +these are folded away during generation of the opcode, and so never appear +inside an OP_ECLASS at match time. They are: + + ECL_ANY match all characters; no additional data + ECL_NONE match no characters; no additional data + +The meaning of the bitmap indicated by ECL_MAP is different to that of XCL_MAP +for OP_XCLASS, in one way. The ECL_MAP bitmap is present whenever any code +points < 256 match the class. + Back references --------------- diff --git a/src/pcre2_auto_possess.c b/src/pcre2_auto_possess.c index 19372fa81..69f3200f9 100644 --- a/src/pcre2_auto_possess.c +++ b/src/pcre2_auto_possess.c @@ -480,13 +480,13 @@ switch(c) case OP_NCLASS: case OP_CLASS: +#ifdef SUPPORT_WIDE_CHARS case OP_XCLASS: case OP_ECLASS: - /* TODO: [EC] https://github.com/PCRE2Project/pcre2/issues/537 - Add back the "ifdef SUPPORT_WIDE_CHARS" once we stop emitting ECLASS for this case. */ if (c == OP_XCLASS || c == OP_ECLASS) end = code + GET(code, 0) - 1; else +#endif end = code + 32 / sizeof(PCRE2_UCHAR); class_end = end; @@ -1118,10 +1118,7 @@ for(;;) list_ptr[2] + LINK_SIZE, (const uint8_t*)cb->start_code, utf)) return FALSE; break; -#endif - /* TODO: [EC] https://github.com/PCRE2Project/pcre2/issues/537 - Enclose in "ifdef SUPPORT_WIDE_CHARS" once we stop emitting ECLASS for this case. */ case OP_ECLASS: if (PRIV(eclass)(chr, (list_ptr == list ? 
code : base_end) - list_ptr[2] + LINK_SIZE, @@ -1129,6 +1126,7 @@ for(;;) (const uint8_t*)cb->start_code, utf)) return FALSE; break; +#endif /* SUPPORT_WIDE_CHARS */ default: return FALSE; @@ -1236,13 +1234,17 @@ for (;;) } c = *code; } - else if (c == OP_CLASS || c == OP_NCLASS || c == OP_XCLASS || c == OP_ECLASS) + else if (c == OP_CLASS || c == OP_NCLASS +#ifdef SUPPORT_WIDE_CHARS + || c == OP_XCLASS || c == OP_ECLASS +#endif + ) { - /* TODO: [EC] https://github.com/PCRE2Project/pcre2/issues/537 - Add back the "ifdef SUPPORT_WIDE_CHARS" once we stop emitting ECLASS for this case. */ +#ifdef SUPPORT_WIDE_CHARS if (c == OP_XCLASS || c == OP_ECLASS) repeat_opcode = code + GET(code, 1); else +#endif repeat_opcode = code + 1 + (32 / sizeof(PCRE2_UCHAR)); c = *repeat_opcode; @@ -1315,12 +1317,12 @@ for (;;) code += GET(code, 1 + 2*LINK_SIZE); break; - /* TODO: [EC] https://github.com/PCRE2Project/pcre2/issues/537 - Add back the "ifdef SUPPORT_WIDE_CHARS" once we stop emitting ECLASS for this case. */ - case OP_ECLASS: +#ifdef SUPPORT_WIDE_CHARS case OP_XCLASS: + case OP_ECLASS: code += GET(code, 1); break; +#endif case OP_MARK: case OP_COMMIT_ARG: diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index 564c8d845..3696522d6 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -782,12 +782,15 @@ static const uint8_t opcode_possessify[] = { OP_CRPOSRANGE, 0, /* CRRANGE, CRMINRANGE */ 0, 0, 0, 0, /* CRPOS{STAR,PLUS,QUERY,RANGE} */ - 0, 0, 0, /* CLASS, NCLASS, XCLASS */ + 0, 0, 0, 0, /* CLASS, NCLASS, XCLASS, ECLASS */ 0, 0, /* REF, REFI */ 0, 0, /* DNREF, DNREFI */ - 0, 0 /* RECURSE, CALLOUT */ + 0, 0, /* RECURSE, CALLOUT */ }; +/* Compile-time check that the table has the correct size. 
*/ +STATIC_ASSERT(sizeof(opcode_possessify) == OP_CALLOUT+1, opcode_possessify); + #ifdef DEBUG_SHOW_PARSED /************************************************* @@ -6139,13 +6142,16 @@ for (;; pptr++) else { *code++ = OP_CLASS; - memset(code, 0, 32 * sizeof(uint8_t)); + memset(code, 0, 32); code += 32 / sizeof(PCRE2_UCHAR); } if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; zerofirstcu = firstcu; zerofirstcuflags = firstcuflags; + // XXX revisit why these flags here are different to when we process a + // non-empty class - is this a problem? You can have '[^\d\D]' is basically + // the same as '[]' so they should surely set the same flags. break; @@ -6170,21 +6176,190 @@ for (;; pptr++) if ((*pptr & CLASS_IS_ECLASS) != 0) { + eclass_op_info op_info; + PCRE2_SIZE previous_length = (lengthptr != NULL)? *lengthptr : 0; + BOOL allbitsone = TRUE; + BOOL allbitszero = TRUE; + previous = code; *code++ = OP_ECLASS; code += LINK_SIZE; - if (!PRIV(compile_class_nested)(options, xoptions, &pptr, &code, - errorcodeptr, cb, lengthptr)) + *code++ = 0; /* Flags, currently zero. */ + if (!PRIV(compile_class_nested)(options, xoptions, FALSE, &pptr, &code, + &op_info, errorcodeptr, cb, lengthptr)) return 0; - PUT(previous, 1, (int)(code - previous)); - zeroreqcu = reqcu; - zeroreqcuflags = reqcuflags; - if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; - zerofirstcu = firstcu; - zerofirstcuflags = firstcuflags; + if (lengthptr != NULL) + { + *lengthptr += code - previous; + code = previous; + /* (*lengthptr - previous_length) now holds the amount of buffer that + we require to make the call to compile_class_nested() with + lengthptr = NULL, and including the (1+LINK_SIZE+1) that we write out + before that call. */ + } + + /* Do some useful counting of what's in the bitmap. 
*/ + for (int i = 0; i < 8; i++) + if (op_info.bits.classwords[i] != 0xffffffff) + { allbitsone = FALSE; break; } + for (int i = 0; i < 8; i++) + if (op_info.bits.classwords[i] != 0) { allbitszero = FALSE; break; } + + /* After constant-folding the extended class syntax, it may turn out to be + a simple class after all. In that case, we can unwrap it from the + OP_ECLASS container - and in fact, we must do so, because in 8-bit + no-Unicode mode the matcher is compiled without support for OP_ECLASS. */ + +#ifndef SUPPORT_WIDE_CHARS + PCRE2_ASSERT(op_info.op_single_type != 0); +#else + if (op_info.op_single_type != 0) +#endif + { + /* Rewind back over the OP_ECLASS. */ + code = previous; - break; /* We are finished with this class */ + /* If the bits are all ones, and the "high characters" are all matched + too, we use a special-cased encoding of OP_ALLANY. */ + + if (op_info.op_single_type == ECL_ANY && allbitsone) + { + /* Advancing code means rewinding lengthptr, at this point. */ + if (lengthptr != NULL) *lengthptr -= 1; + *code++ = OP_ALLANY; + } + + /* If the high bits are all matched / all not-matched, then we emit an + OP_NCLASS/OP_CLASS respectively. */ + + else if (op_info.op_single_type == ECL_ANY || + op_info.op_single_type == ECL_NONE) + { + PCRE2_SIZE required_len = 1 + (32 / sizeof(PCRE2_UCHAR)); + + if (lengthptr != NULL) + { + if (required_len > (*lengthptr - previous_length)) + *lengthptr = previous_length + required_len; + } + + /* Advancing code means rewinding lengthptr, at this point. */ + if (lengthptr != NULL) *lengthptr -= required_len; + *code++ = (op_info.op_single_type == ECL_ANY)? OP_NCLASS : OP_CLASS; + memcpy(code, op_info.bits.classbits, 32); + code += 32 / sizeof(PCRE2_UCHAR); + } + + /* Otherwise, we have an ECL_XCLASS, so we have the OP_XCLASS data + there, but, we pulled out its bitmap into op_info, so now we have to + put that back into the OP_XCLASS. 
*/ + +#ifndef SUPPORT_WIDE_CHARS + else + { + PCRE2_DEBUG_UNREACHABLE(); + } +#else + else + { + BOOL need_map; + PCRE2_SIZE required_len; + + PCRE2_ASSERT(op_info.op_single_type == ECL_XCLASS); + need_map = !allbitszero; + required_len = + op_info.length + (need_map? 32/sizeof(PCRE2_UCHAR) : 0); + + if (lengthptr != NULL) + { + /* Don't unconditionally request all the space we need - we may + already have asked for more during processing of the ECLASS. */ + if (required_len > (*lengthptr - previous_length)) + { + *lengthptr = previous_length + required_len; + } + + /* The code we write out here won't be ignored, even during the + (lengthptr != NULL) phase, because if there's a following quantifier + it will peek backwards. So we do have to write out a (truncated) + OP_XCLASS, even on this branch. */ + *lengthptr -= 1 + LINK_SIZE + 1; + *code++ = OP_XCLASS; + PUT(code, 0, 1 + LINK_SIZE + 1); + code += LINK_SIZE; + *code++ = 0; + } + else + { + PCRE2_UCHAR *rest; + PCRE2_SIZE rest_len; + PCRE2_UCHAR flags; + + /* 1 unit: OP_XCLASS | LINK_SIZE units | 1 unit: flags | ...rest */ + PCRE2_ASSERT(op_info.length >= 1 + LINK_SIZE + 1); + rest = op_info.code_start + 1 + LINK_SIZE + 1; + rest_len = (op_info.code_start + op_info.length) - rest; + + /* First read any data we use, before memmove splats it. */ + flags = op_info.code_start[1 + LINK_SIZE]; + PCRE2_ASSERT((flags & XCL_MAP) == 0); + + /* Next do the memmove before any writes. */ + memmove( + code + 1 + LINK_SIZE + 1 + (need_map? 32/sizeof(PCRE2_UCHAR) : 0), + rest, CU2BYTES(rest_len)); + + /* Finally write the header data. */ + *code++ = OP_XCLASS; + PUT(code, 0, (int)required_len); + code += LINK_SIZE; + *code++ = flags | (need_map? XCL_MAP : 0); + if (need_map) + { + memcpy(code, op_info.bits.classbits, 32); + code += 32 / sizeof(PCRE2_UCHAR); + } + code += rest_len; + } + } +#endif /* SUPPORT_WIDE_CHARS */ + } + + /* Otherwise, we're going to keep the OP_ECLASS. 
However, again we need + to do some adjustment to insert the bitmap if we have one. */ + +#ifdef SUPPORT_WIDE_CHARS + else + { + BOOL need_map = !allbitszero; + PCRE2_SIZE required_len = 1 + LINK_SIZE + 1 + + (need_map? 32/sizeof(PCRE2_UCHAR) : 0) + op_info.length; + + if (lengthptr != NULL) + { + if (required_len > (*lengthptr - previous_length)) + { + *lengthptr = previous_length + required_len; + } + } + else + { + if (need_map) + { + PCRE2_UCHAR *map_start = previous + 1 + LINK_SIZE + 1; + previous[1 + LINK_SIZE] |= ECL_MAP; + memmove(map_start + 32/sizeof(PCRE2_UCHAR), map_start, + CU2BYTES(code - map_start)); + memcpy(map_start, op_info.bits.classbits, 32); + code += 32 / sizeof(PCRE2_UCHAR); + } + PUT(previous, 1, (int)(code - previous)); + } + } +#endif /* SUPPORT_WIDE_CHARS */ + + goto CLASS_END_PROCESSING; } /* We can optimize the case of a single character in a class by generating @@ -6308,11 +6483,13 @@ for (;; pptr++) /* Now emit the OP_CLASS/OP_NCLASS/OP_XCLASS/OP_ALLANY opcode. */ pptr = PRIV(compile_class_not_nested)(options, xoptions, pptr + 1, - &code, meta == META_CLASS_NOT, + &code, meta == META_CLASS_NOT, FALSE, errorcodeptr, cb, lengthptr); if (pptr == NULL) return 0; PCRE2_ASSERT(*pptr == META_CLASS_END); + CLASS_END_PROCESSING: + /* If this class is the first thing in the branch, there can be no first char setting, whatever the repeat count. Any reqcu setting must remain unchanged after any kind of repeat. */ @@ -7203,10 +7380,8 @@ for (;; pptr++) #ifdef SUPPORT_WIDE_CHARS case OP_XCLASS: -#endif - /* TODO: [EC] https://github.com/PCRE2Project/pcre2/issues/537 - Enclose in the "ifdef SUPPORT_WIDE_CHARS" once we stop emitting ECLASS for this case. 
*/ case OP_ECLASS: +#endif case OP_CLASS: case OP_NCLASS: case OP_REF: @@ -7848,12 +8023,12 @@ for (;; pptr++) tempcode += 1 + 32/sizeof(PCRE2_UCHAR); break; - /* TODO: [EC] https://github.com/PCRE2Project/pcre2/issues/537 - Add back the "ifdef SUPPORT_WIDE_CHARS" once we stop emitting ECLASS for this case. */ +#ifdef SUPPORT_WIDE_CHARS case OP_XCLASS: case OP_ECLASS: tempcode += GET(tempcode, 1); break; +#endif } /* If tempcode is equal to code (which points to the end of the repeated diff --git a/src/pcre2_compile.h b/src/pcre2_compile.h index 516a0b9ec..52f66cd71 100644 --- a/src/pcre2_compile.h +++ b/src/pcre2_compile.h @@ -160,7 +160,7 @@ versions. */ #define META_MINMAX_QUERY 0x80430000u /* {n,m}? repeat */ /* These meta codes must be kept in a group, with the OR/SUB/XOR in -this order. */ +this order, and AND/NOT at the start/end. */ #define META_ECLASS_AND 0x80440000u /* && (or &) in a class */ #define META_ECLASS_OR 0x80450000u /* || (or |, +) in a class */ @@ -218,11 +218,26 @@ therefore no need for it to have a length entry, so use a high value. */ #define CLIST_ALIGN_TO(base, align) \ ((base + ((size_t)(align) - 1)) & ~((size_t)(align) - 1)) +/* Structure for holding information about an OP_ECLASS internal operand. +An "operand" here could be just a single OP_[X]CLASS, or it could be some +complex expression; but it's some sequence of ECL_* codes which pushes one +value to the stack. */ +typedef struct { + /* The position of the operand - or NULL if (lengthptr != NULL). */ + PCRE2_UCHAR *code_start; + PCRE2_SIZE length; + /* The operand's type if it is a single code (ECL_XCLASS, ECL_ANY, ECL_NONE); + otherwise zero if the operand is not atomic. */ + uint8_t op_single_type; + /* Regardless of whether it's a single code or not, we fully constant-fold + the bitmap for code points < 256. */ + class_bits_storage bits; +} eclass_op_info; + /* Macros for the definitions below, to prevent name collisions. 
*/ #define _pcre2_posix_class_maps PCRE2_SUFFIX(_pcre2_posix_class_maps) #define _pcre2_update_classbits PCRE2_SUFFIX(_pcre2_update_classbits_) -#define _pcre2_check_class_not_nested PCRE2_SUFFIX(_pcre2_check_class_not_nested_) #define _pcre2_compile_class_nested PCRE2_SUFFIX(_pcre2_compile_class_nested_) #define _pcre2_compile_class_not_nested PCRE2_SUFFIX(_pcre2_compile_class_not_nested_) @@ -244,11 +259,12 @@ extern const int PRIV(posix_class_maps)[]; void PRIV(update_classbits)(uint32_t ptype, uint32_t pdata, BOOL negated, uint8_t *classbits); -/* Returns TRUE when a series of META codes can be compiled to a simple class -(OP_CLASS, OP_NCLASS, OP_XCLASS, OP_ALLANY); otherwise FALSE if it requires an -extended class (OP_ECLASS).*/ +/* Compile the META codes from start_ptr...end_ptr, writing a single +OP_CLASS, OP_NCLASS, OP_XCLASS, or OP_ALLANY into pcode. */ -BOOL PRIV(check_class_not_nested)(uint32_t *ptr, uint32_t **pendptr); +uint32_t *PRIV(compile_class_not_nested)(uint32_t options, uint32_t xoptions, + uint32_t *start_ptr, PCRE2_UCHAR **pcode, BOOL negate_class, BOOL always_map, + int *errorcodeptr, compile_block *cb, PCRE2_SIZE *lengthptr); /* Compile the META codes in pptr into opcodes written to pcode. The pptr must start at a META_CLASS or META_CLASS_NOT. @@ -256,14 +272,7 @@ start at a META_CLASS or META_CLASS_NOT. The pptr will be left pointing at the matching META_CLASS_END. */ BOOL PRIV(compile_class_nested)(uint32_t options, uint32_t xoptions, - uint32_t **pptr, PCRE2_UCHAR **pcode, int *errorcodeptr, - compile_block *cb, PCRE2_SIZE *lengthptr); - -/* Compile the META codes from start_ptr...end_ptr, writing a single OP_CLASS -OP_CLASS, OP_NCLASS, OP_XCLASS, or OP_ALLANY into pcode. 
*/ - -uint32_t *PRIV(compile_class_not_nested)(uint32_t options, uint32_t xoptions, - uint32_t *start_ptr, PCRE2_UCHAR **pcode, BOOL negate_class, + BOOL negated, uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info, int *errorcodeptr, compile_block *cb, PCRE2_SIZE *lengthptr); #endif /* PCRE2_COMPILE_H_IDEMPOTENT_GUARD */ diff --git a/src/pcre2_compile_class.c b/src/pcre2_compile_class.c index 5e2e5355d..a9c0bfe05 100644 --- a/src/pcre2_compile_class.c +++ b/src/pcre2_compile_class.c @@ -737,7 +737,7 @@ BOOL set_bit; if (ptype == PT_ANY) { - if (!negated) memset(classbits, 0xff, 32 * sizeof(uint8_t)); + if (!negated) memset(classbits, 0xff, 32); return; } @@ -867,7 +867,7 @@ for (c = 0; c < 256; c++) /************************************************* -* External entry point for add range to class * +* Internal entry point for add range to class * *************************************************/ /* This function sets the overall range for characters < 256. @@ -933,7 +933,7 @@ for (c = byte_end; c <= classbits_end; c++) #if PCRE2_CODE_UNIT_WIDTH == 8 /************************************************* -* External entry point for add list to class * +* Internal entry point for add list to class * *************************************************/ /* This function is used for adding a list of horizontal or vertical whitespace @@ -998,12 +998,16 @@ while (p[0] < 256) +/************************************************* +* Main entry-point to compile a character class * +*************************************************/ + /* This function consumes a "leaf", which is a set of characters that will -become a single OP_CLASS (or OP_NCLASS, OP_XCLASS, or OP_ALLANY). */ +become a single OP_CLASS, OP_NCLASS, OP_XCLASS, or OP_ALLANY. 
*/ uint32_t * PRIV(compile_class_not_nested)(uint32_t options, uint32_t xoptions, - uint32_t *start_ptr, PCRE2_UCHAR **pcode, BOOL negate_class, + uint32_t *start_ptr, PCRE2_UCHAR **pcode, BOOL negate_class, BOOL always_map, int *errorcodeptr, compile_block *cb, PCRE2_SIZE *lengthptr) { uint32_t *pptr = start_ptr; @@ -1085,14 +1089,14 @@ if (utf) } class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */ -#endif +#endif /* SUPPORT_WIDE_CHARS */ /* Initialize the 256-bit (32-byte) bit map to all zeros. We build the map in a temporary bit of memory, in case the class contains fewer than two 8-bit characters because in that case the compiled code doesn't use the bit map. */ -memset(classbits, 0, 32 * sizeof(uint8_t)); +memset(classbits, 0, 32); /* Process items until end_ptr is reached. */ @@ -1102,7 +1106,7 @@ while (TRUE) BOOL local_negate; int posix_class; int taboffset, tabopt; - uint8_t pbits[32]; + class_bits_storage pbits; uint32_t escape, c; /* Handle POSIX classes such as [:alpha:] etc. */ @@ -1114,7 +1118,7 @@ while (TRUE) local_negate = (meta == META_POSIX_NEG); posix_class = *(pptr++); - should_flip_negation = local_negate; /* Note negative special */ + if (local_negate) should_flip_negation = TRUE; /* Note negative special */ /* If matching is caseless, upper and lower are converted to alpha. This relies on the fact that the class table starts with alpha, @@ -1129,37 +1133,35 @@ while (TRUE) XCL_PROP/XCL_NOTPROP directly, which is done here. */ #ifdef SUPPORT_UNICODE + /* TODO This entire block of code here appears to be unreachable!? I simply + can't see how it can be hit, given that the frontend parser doesn't emit + META_POSIX for GRAPH/PRINT/PUNCT when UCP is set. */ if ((options & PCRE2_UCP) != 0 && (xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0) { + uint32_t ptype; + switch(posix_class) { case PC_GRAPH: case PC_PRINT: case PC_PUNCT: + ptype = (posix_class == PC_GRAPH)? PT_PXGRAPH : + (posix_class == PC_PRINT)? 
PT_PXPRINT : PT_PXPUNCT; - if (lengthptr != NULL) + PRIV(update_classbits)(ptype, 0, local_negate, classbits); + + if ((xclass_props & XCLASS_HIGH_ANY) == 0) { - if ((xclass_props & XCLASS_HIGH_ANY) == 0) - { + if (lengthptr != NULL) *lengthptr += 3; - xclass_props |= XCLASS_REQUIRED | XCLASS_HAS_PROPS; - } - } - else - { - uint32_t ptype = ((posix_class == PC_GRAPH)? PT_PXGRAPH : - (posix_class == PC_PRINT)? PT_PXPRINT : PT_PXPUNCT); - - PRIV(update_classbits)(ptype, 0, !local_negate, classbits); - - if ((xclass_props & XCLASS_HIGH_ANY) == 0) + else { *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP; *class_uchardata++ = (PCRE2_UCHAR)ptype; *class_uchardata++ = 0; - xclass_props |= XCLASS_REQUIRED | XCLASS_HAS_PROPS; } + xclass_props |= XCLASS_REQUIRED | XCLASS_HAS_PROPS; } continue; @@ -1195,8 +1197,7 @@ while (TRUE) /* Copy in the first table (always present) */ - memcpy(pbits, cbits + PRIV(posix_class_maps)[posix_class], - 32 * sizeof(uint8_t)); + memcpy(pbits.classbits, cbits + PRIV(posix_class_maps)[posix_class], 32); /* If there is a second table, add or remove it as required. */ @@ -1206,27 +1207,35 @@ while (TRUE) if (taboffset >= 0) { if (tabopt >= 0) - for (int i = 0; i < 32; i++) pbits[i] |= cbits[(int)i + taboffset]; + for (int i = 0; i < 32; i++) + pbits.classbits[i] |= cbits[(int)i + taboffset]; else - for (int i = 0; i < 32; i++) pbits[i] &= ~cbits[(int)i + taboffset]; + for (int i = 0; i < 32; i++) + pbits.classbits[i] &= ~cbits[(int)i + taboffset]; } /* Now see if we need to remove any special characters. An option value of 1 removes vertical space and 2 removes underscore. */ if (tabopt < 0) tabopt = -tabopt; - if (tabopt == 1) pbits[1] &= ~0x3c; - else if (tabopt == 2) pbits[11] &= 0x7f; + if (tabopt == 1) pbits.classbits[1] &= ~0x3c; + else if (tabopt == 2) pbits.classbits[11] &= 0x7f; /* Add the POSIX table or its complement into the main table that is being built and we are done. 
*/ - if (local_negate) - for (int i = 0; i < 32; i++) classbits[i] |= (uint8_t)(~pbits[i]); - else - for (int i = 0; i < 32; i++) classbits[i] |= pbits[i]; + { + uint32_t *classwords = cb->classbits.classwords; -#ifdef SUPPORT_UNICODE + if (local_negate) + for (int i = 0; i < 8; i++) + classwords[i] |= (uint32_t)(~pbits.classwords[i]); + else + for (int i = 0; i < 8; i++) + classwords[i] |= pbits.classwords[i]; + } + +#ifdef SUPPORT_WIDE_CHARS /* Every class contains at least one < 256 character. */ xclass_props |= XCLASS_HAS_8BIT_CHARS; #endif @@ -1235,8 +1244,8 @@ while (TRUE) /* Other than POSIX classes, the only items we should encounter are \d-type escapes and literal characters (possibly as ranges). */ case META_BIGVALUE: - meta = *(pptr++); - break; + meta = *(pptr++); + break; case META_ESCAPE: escape = META_DATA(meta); @@ -1348,7 +1357,7 @@ while (TRUE) if (ptype == PT_ANY) { #if PCRE2_CODE_UNIT_WIDTH == 8 - if (!utf && escape == ESC_p) memset(classbits, 0xff, 32 * sizeof(uint8_t)); + if (!utf && escape == ESC_p) memset(classbits, 0xff, 32); #endif continue; } @@ -1364,26 +1373,20 @@ while (TRUE) pdata = 0; } - if (lengthptr != NULL) + PRIV(update_classbits)(ptype, pdata, + (escape == ESC_P), classbits); + + if ((xclass_props & XCLASS_HIGH_ANY) == 0) { - if ((xclass_props & XCLASS_HIGH_ANY) == 0) - { + if (lengthptr != NULL) *lengthptr += 3; - xclass_props |= XCLASS_REQUIRED | XCLASS_HAS_PROPS; - } - } - else - { - PRIV(update_classbits)(ptype, pdata, - (escape == ESC_P), classbits); - - if ((xclass_props & XCLASS_HIGH_ANY) == 0) + else { *class_uchardata++ = (escape == ESC_p)? XCL_PROP : XCL_NOTPROP; *class_uchardata++ = ptype; *class_uchardata++ = pdata; - xclass_props |= XCLASS_REQUIRED | XCLASS_HAS_PROPS; } + xclass_props |= XCLASS_REQUIRED | XCLASS_HAS_PROPS; } } continue; @@ -1400,6 +1403,7 @@ while (TRUE) CLASS_END_CASES(meta) /* Literals. */ if (meta < META_END) break; + /* Non-literals: end of class contents. 
*/ goto END_PROCESSING; } @@ -1596,7 +1600,7 @@ if (cranges != NULL) cb->cx->memctl.free(cranges, cb->cx->memctl.memory_data); } } -#endif +#endif /* SUPPORT_WIDE_CHARS */ /* If there are characters with values > 255, or Unicode property settings (\p or \P), we have to compile an extended class, with its own opcode, @@ -1636,7 +1640,8 @@ if ((xclass_props & XCLASS_REQUIRED) != 0) /* If the map is required, move up the extra data to make room for it; otherwise just move the code pointer to the end of the extra data. */ - if ((xclass_props & XCLASS_HAS_8BIT_CHARS) != 0) + if ((xclass_props & XCLASS_HAS_8BIT_CHARS) != 0 || + always_map) { *code++ |= XCL_MAP; (void)memmove(code + (32 / sizeof(PCRE2_UCHAR)), code, @@ -1772,8 +1777,7 @@ if ((SELECT_VALUE8(!utf, 0) || negate_class != should_flip_negation) && } *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS; -if (lengthptr == NULL) /* Save time in the pre-compile phase */ - memcpy(code, classbits, 32); +memcpy(code, classbits, 32); code += 32 / sizeof(PCRE2_UCHAR); DONE: @@ -1783,87 +1787,475 @@ return pptr - 1; +/* ===================================================================*/ +/* Here follows a block of ECLASS-compiling functions. You may well want to +read them from top to bottom; they are ordered from leafmost (at the top) to +outermost parser (at the bottom of the file). */ + +/* This function folds one operand using the negation operator. +The new, combined chunk of stack code is written out to *pop_info. */ + +static void +fold_negation(eclass_op_info *pop_info, PCRE2_SIZE *lengthptr) +{ +/* If the chunk of stack code is already composed of multiple ops, we won't +descend in and try and propagate the negation down the tree. (That would lead +to O(n^2) compile-time, which could be exploitable with a malicious regex - +although maybe that's not really too much of a worry in a library that offers +an exponential-time matching function!) 
*/ + +if (pop_info->op_single_type == 0) + { + if (lengthptr != NULL) + *lengthptr += 1; + else + pop_info->code_start[pop_info->length] = ECL_NOT; + pop_info->length += 1; + } + +/* Otherwise, it's a nice single-op item, so we can easily fold in the negation +without needing to produce an ECL_NOT. */ + +else if (pop_info->op_single_type == ECL_ANY || + pop_info->op_single_type == ECL_NONE) + { + pop_info->op_single_type = (pop_info->op_single_type == ECL_NONE)? + ECL_ANY : ECL_NONE; + if (lengthptr == NULL) + *(pop_info->code_start) = pop_info->op_single_type; + } +else + { + PCRE2_ASSERT(pop_info->op_single_type == ECL_XCLASS && + pop_info->length >= 1 + LINK_SIZE + 1); + if (lengthptr == NULL) + pop_info->code_start[1 + LINK_SIZE] ^= XCL_NOT; + } + +for (int i = 0; i < 8; i++) + pop_info->bits.classwords[i] = ~pop_info->bits.classwords[i]; +} + + + +/* This function folds together two operands using a binary operator. +The new, combined chunk of stack code is written out to *lhs_op_info. 
*/ + +static void +fold_binary(int op, eclass_op_info *lhs_op_info, eclass_op_info *rhs_op_info, + PCRE2_SIZE *lengthptr) +{ +switch (op) { + +/* ECL_AND truth table: + + LHS RHS RESULT + ---------------- + ANY * RHS + * ANY LHS + NONE * NONE + * NONE NONE + X Y X & Y +*/ + +case ECL_AND: +if (rhs_op_info->op_single_type == ECL_ANY) + { + /* no-op: drop the RHS */ + } +else if (lhs_op_info->op_single_type == ECL_ANY) + { + /* no-op: drop the LHS, and memmove the RHS into its place */ + if (lengthptr == NULL) + memmove(lhs_op_info->code_start, rhs_op_info->code_start, + CU2BYTES(rhs_op_info->length)); + lhs_op_info->length = rhs_op_info->length; + lhs_op_info->op_single_type = rhs_op_info->op_single_type; + } +else if (rhs_op_info->op_single_type == ECL_NONE) + { + /* the result is ECL_NONE: write into the LHS */ + if (lengthptr == NULL) + lhs_op_info->code_start[0] = ECL_NONE; + lhs_op_info->length = 1; + lhs_op_info->op_single_type = ECL_NONE; + } +else if (lhs_op_info->op_single_type == ECL_NONE) + { + /* the result is ECL_NONE: drop the RHS */ + } +else + { + /* Both of LHS & RHS are either ECL_XCLASS, or compound operations. 
*/ + if (lengthptr != NULL) + *lengthptr += 1; + else + { + PCRE2_ASSERT(rhs_op_info->code_start == + lhs_op_info->code_start + lhs_op_info->length); + rhs_op_info->code_start[rhs_op_info->length] = ECL_AND; + } + lhs_op_info->length += rhs_op_info->length + 1; + lhs_op_info->op_single_type = 0; + } + +for (int i = 0; i < 8; i++) + lhs_op_info->bits.classwords[i] &= rhs_op_info->bits.classwords[i]; +break; + +/* ECL_OR truth table: + + LHS RHS RESULT + ---------------- + ANY * ANY + * ANY ANY + NONE * RHS + * NONE LHS + X Y X | Y +*/ + +case ECL_OR: +if (rhs_op_info->op_single_type == ECL_NONE) + { + /* no-op: drop the RHS */ + } +else if (lhs_op_info->op_single_type == ECL_NONE) + { + /* no-op: drop the LHS, and memmove the RHS into its place */ + if (lengthptr == NULL) + memmove(lhs_op_info->code_start, rhs_op_info->code_start, + CU2BYTES(rhs_op_info->length)); + lhs_op_info->length = rhs_op_info->length; + lhs_op_info->op_single_type = rhs_op_info->op_single_type; + } +else if (rhs_op_info->op_single_type == ECL_ANY) + { + /* the result is ECL_ANY: write into the LHS */ + if (lengthptr == NULL) + lhs_op_info->code_start[0] = ECL_ANY; + lhs_op_info->length = 1; + lhs_op_info->op_single_type = ECL_ANY; + } +else if (lhs_op_info->op_single_type == ECL_ANY) + { + /* the result is ECL_ANY: drop the RHS */ + } +else + { + /* Both of LHS & RHS are either ECL_XCLASS, or compound operations. 
*/ + if (lengthptr != NULL) + *lengthptr += 1; + else + { + PCRE2_ASSERT(rhs_op_info->code_start == + lhs_op_info->code_start + lhs_op_info->length); + rhs_op_info->code_start[rhs_op_info->length] = ECL_OR; + } + lhs_op_info->length += rhs_op_info->length + 1; + lhs_op_info->op_single_type = 0; + } + +for (int i = 0; i < 8; i++) + lhs_op_info->bits.classwords[i] |= rhs_op_info->bits.classwords[i]; +break; + +/* ECL_XOR truth table: + + LHS RHS RESULT + ---------------- + ANY * !RHS + * ANY !LHS + NONE * RHS + * NONE LHS + X Y X ^ Y +*/ + +case ECL_XOR: +if (rhs_op_info->op_single_type == ECL_NONE) + { + /* no-op: drop the RHS */ + } +else if (lhs_op_info->op_single_type == ECL_NONE) + { + /* no-op: drop the LHS, and memmove the RHS into its place */ + if (lengthptr == NULL) + memmove(lhs_op_info->code_start, rhs_op_info->code_start, + CU2BYTES(rhs_op_info->length)); + lhs_op_info->length = rhs_op_info->length; + lhs_op_info->op_single_type = rhs_op_info->op_single_type; + } +else if (rhs_op_info->op_single_type == ECL_ANY) + { + /* the result is !LHS: fold in the negation, and drop the RHS. The call to + fold_negation() also inverts the LHS bitmap, but the bitmaps are combined + by the XOR loop after this if-chain, so keep the LHS bitmap unchanged. */ + class_bits_storage saved_bits; + memcpy(saved_bits.classbits, lhs_op_info->bits.classbits, 32); + fold_negation(lhs_op_info, lengthptr); + memcpy(lhs_op_info->bits.classbits, saved_bits.classbits, 32); + } +else if (lhs_op_info->op_single_type == ECL_ANY) + { + /* the result is !RHS: drop the LHS, memmove the RHS into its place, and + fold in the negation. As above, fold_negation() must not be allowed to + alter the LHS bitmap, which is combined with the RHS bitmap below. */ + class_bits_storage saved_bits; + memcpy(saved_bits.classbits, lhs_op_info->bits.classbits, 32); + if (lengthptr == NULL) + memmove(lhs_op_info->code_start, rhs_op_info->code_start, + CU2BYTES(rhs_op_info->length)); + lhs_op_info->length = rhs_op_info->length; + lhs_op_info->op_single_type = rhs_op_info->op_single_type; + + fold_negation(lhs_op_info, lengthptr); + memcpy(lhs_op_info->bits.classbits, saved_bits.classbits, 32); + } +else + { + /* Both of LHS & RHS are either ECL_XCLASS, or compound operations. 
*/ + if (lengthptr != NULL) + *lengthptr += 1; + else + { + PCRE2_ASSERT(rhs_op_info->code_start == + lhs_op_info->code_start + lhs_op_info->length); + rhs_op_info->code_start[rhs_op_info->length] = ECL_XOR; + } + lhs_op_info->length += rhs_op_info->length + 1; + lhs_op_info->op_single_type = 0; + } + +for (int i = 0; i < 8; i++) + lhs_op_info->bits.classwords[i] ^= rhs_op_info->bits.classwords[i]; +break; +} +} + + + /* This function consumes a group of implicitly-unioned class elements. These can be characters, ranges, properties, or nested classes, as long as they are all joined by being placed adjacently. */ static BOOL -compile_class_operand(uint32_t options, uint32_t xoptions, uint32_t **pptr, - PCRE2_UCHAR **pcode, int *errorcodeptr, compile_block *cb, - PCRE2_SIZE *lengthptr) +compile_class_operand(uint32_t options, uint32_t xoptions, BOOL negated, + uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info, + int *errorcodeptr, compile_block *cb, PCRE2_SIZE *lengthptr) { uint32_t *ptr = *pptr; +uint32_t *prev_ptr; PCRE2_UCHAR *code = *pcode; PCRE2_UCHAR *code_start = code; -BOOL first = TRUE; +PCRE2_SIZE prev_length = (lengthptr != NULL)? 
*lengthptr : 0; +PCRE2_SIZE extra_length; +uint32_t meta = META_CODE(*ptr); -while (TRUE) +switch (meta) { - switch (META_CODE(*ptr)) + case META_CLASS_EMPTY_NOT: + case META_CLASS_EMPTY: + ++ptr; + pop_info->length = 1; + if ((meta == META_CLASS_EMPTY) == negated) { - case META_CLASS_END: - case META_ECLASS_AND: - case META_ECLASS_OR: - case META_ECLASS_SUB: - case META_ECLASS_XOR: - case META_ECLASS_NOT: - goto DONE; + *code++ = pop_info->op_single_type = ECL_ANY; + memset(pop_info->bits.classbits, 0xff, 32); + } + else + { + *code++ = pop_info->op_single_type = ECL_NONE; + memset(pop_info->bits.classbits, 0, 32); + } + break; - case META_CLASS_EMPTY_NOT: - *code++ = OP_ALLANY; - ++ptr; - break; + case META_CLASS: + case META_CLASS_NOT: + if ((*ptr & CLASS_IS_ECLASS) != 0) + { + if (!PRIV(compile_class_nested)(options, xoptions, negated, &ptr, &code, + pop_info, errorcodeptr, cb, lengthptr)) + return FALSE; - case META_CLASS_EMPTY: - *code++ = OP_CLASS; - memset(code, 0, 32 * sizeof(uint8_t)); - code += 32 / sizeof(PCRE2_UCHAR); - ++ptr; - break; + PCRE2_ASSERT(*ptr == META_CLASS_END); + ptr++; + goto DONE; + } - case META_CLASS: - case META_CLASS_NOT: - if ((*ptr & CLASS_IS_ECLASS) == 0) - { - ptr = PRIV(compile_class_not_nested)(options, xoptions, ptr + 1, - &code, *ptr == META_CLASS_NOT, - errorcodeptr, cb, lengthptr); - if (ptr == NULL) return FALSE; - } - else if (!PRIV(compile_class_nested)(options, xoptions, &ptr, &code, - errorcodeptr, cb, lengthptr)) - return FALSE; + ptr++; + /* Fall through */ + default: + /* Scan forward characters, ranges, and properties. + For example: inside [a-z_ -- m] we don't have brackets around "a-z_" but + we still need to collect that fragment up into a "leaf" OP_CLASS. 
*/ + + prev_ptr = ptr; + ptr = PRIV(compile_class_not_nested)(options, xoptions, ptr, &code, + (meta != META_CLASS_NOT) == negated, + TRUE, errorcodeptr, cb, lengthptr); + if (ptr == NULL) return FALSE; + + /* We must have a 100% guarantee that ptr increases when + compile_class_operand() returns, even on Release builds. */ + PCRE2_ASSERT(ptr > prev_ptr); + if (ptr <= prev_ptr) return FALSE; + + /* If we fell through above, consume the closing ']'. */ + if (meta == META_CLASS || meta == META_CLASS_NOT) + { PCRE2_ASSERT(*ptr == META_CLASS_END); ptr++; - break; + } - default: - /* Scan forward characters, ranges, and properties. - For example: inside [a-z_ -- m] we don't have brackets around "a-z_" but - we still need to collect that fragment up into a "leaf" OP_CLASS. */ - ptr = PRIV(compile_class_not_nested)(options, xoptions, ptr, &code, - FALSE, errorcodeptr, cb, lengthptr); - if (ptr == NULL) return FALSE; - break; + /* Regardless of whether (lengthptr == NULL), some data will still be written + out to *pcode, which we need: we have to peek at it, to transform the opcode + into the ECLASS version (since we need to hoist up the bitmaps). */ + PCRE2_ASSERT(code > code_start); + extra_length = (lengthptr != NULL)? *lengthptr - prev_length : 0; + + /* Easiest case: convert OP_ALLANY to ECL_ANY */ + + if (*code_start == OP_ALLANY) + { + PCRE2_ASSERT(code - code_start == 1 && extra_length == 0); + pop_info->length = 1; + *code_start = pop_info->op_single_type = ECL_ANY; + memset(pop_info->bits.classbits, 0xff, 32); } - /* Join second and subsequent leaves with an OR. */ - if (!first) *code++ = OP_ECLASS_OR; + /* For OP_CLASS and OP_NCLASS, we hoist out the bitmap and convert to + ECL_NONE / ECL_ANY respectively. 
*/ - if (lengthptr != NULL) + else if (*code_start == OP_CLASS || *code_start == OP_NCLASS) { - *lengthptr += code - code_start; - code = code_start; + PCRE2_ASSERT(code - code_start == 1 + 32 / sizeof(PCRE2_UCHAR) && + extra_length == 0); + pop_info->length = 1; + *code_start = pop_info->op_single_type = + (*code_start == OP_CLASS)? ECL_NONE : ECL_ANY; + memcpy(pop_info->bits.classbits, code_start + 1, 32); + /* Rewind the code pointer, but make sure we adjust *lengthptr, because we + do need to reserve that space (even though we only use it temporarily). */ + if (lengthptr != NULL) + *lengthptr += code - (code_start + 1); + code = code_start + 1; } - first = FALSE; + /* Finally, for OP_XCLASS we hoist out the bitmap (if any), and convert to + ECL_XCLASS. */ + + else + { + PCRE2_UCHAR flags; + PCRE2_UCHAR *map_start; + PCRE2_UCHAR *data_start; + PCRE2_SIZE put_length; + + PCRE2_ASSERT(*code_start == OP_XCLASS); + *code_start = pop_info->op_single_type = ECL_XCLASS; + + PCRE2_ASSERT(code - code_start >= 1 + LINK_SIZE + 1); + flags = code_start[1 + LINK_SIZE]; + + if ((flags & XCL_MAP) != 0) + { + PCRE2_ASSERT(code - code_start >= + 1 + LINK_SIZE + 1 + 32 / (int)sizeof(PCRE2_UCHAR)); + + put_length = GET(code_start, 1) - 32 / sizeof(PCRE2_UCHAR); + PUT(code_start, 1, (int)put_length); + code_start[1 + LINK_SIZE] &= ~(PCRE2_UCHAR)XCL_MAP; + + map_start = code_start + 1 + LINK_SIZE + 1; + data_start = map_start + 32 / sizeof(PCRE2_UCHAR); + memcpy(pop_info->bits.classbits, map_start, 32); + memmove(map_start, data_start, CU2BYTES(code - data_start)); + + /* Rewind the code pointer, but make sure we adjust *lengthptr, because we + do need to reserve that space (even though we only use it temporarily). 
*/ + if (lengthptr != NULL) + *lengthptr += 32 / sizeof(PCRE2_UCHAR); + code -= 32 / sizeof(PCRE2_UCHAR); + } + else + { + memset(pop_info->bits.classbits, 0, 32); + } + + pop_info->length = (code - code_start) + extra_length; + } + + break; + } /* End of switch(meta) */ + +pop_info->code_start = (lengthptr == NULL)? code_start : NULL; + +if (lengthptr != NULL) + { + *lengthptr += code - code_start; + code = code_start; } DONE: -PCRE2_ASSERT(!first); /* Confirm that we found something. */ PCRE2_ASSERT(lengthptr == NULL || (code == code_start)); +*pptr = ptr; +*pcode = code; +return TRUE; + +// XXX produce some manual tests to verify that it's OK to leave some unconsumed +// tokens? should crash/fail even without debug assertions +} + + + +/* This function consumes a group of implicitly-unioned class elements. +These can be characters, ranges, properties, or nested classes, as long +as they are all joined by being placed adjacently. */ + +static BOOL +compile_class_juxtaposition(uint32_t options, uint32_t xoptions, BOOL negated, + uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info, + int *errorcodeptr, compile_block *cb, PCRE2_SIZE *lengthptr) +{ +uint32_t *ptr = *pptr; +PCRE2_UCHAR *code = *pcode; +#ifdef PCRE2_DEBUG +PCRE2_UCHAR *start_code = *pcode; +#endif + +/* See compile_class_binary_loose() for comments on compile-time folding of +the "negated" flag. */ + +/* Because it's a non-empty class, there must be an operand at the start. */ +if (!compile_class_operand(options, xoptions, negated, &ptr, &code, pop_info, + errorcodeptr, cb, lengthptr)) + return FALSE; + +while (*ptr != META_CLASS_END && + !(*ptr >= META_ECLASS_AND && *ptr <= META_ECLASS_NOT)) + { + uint32_t op; + BOOL rhs_negated; + eclass_op_info rhs_op_info; + + if (negated) + { + /* !(A juxtapose B) -> !A && !B */ + op = ECL_AND; + rhs_negated = TRUE; + } + else + { + /* A juxtapose B -> A || B */ + op = ECL_OR; + rhs_negated = FALSE; + } + + /* An operand must follow the operator. 
*/ + if (!compile_class_operand(options, xoptions, rhs_negated, &ptr, &code, + &rhs_op_info, errorcodeptr, cb, lengthptr)) + return FALSE; + + /* Convert infix to postfix (RPN). */ + fold_binary(op, pop_info, &rhs_op_info, lengthptr); + if (lengthptr == NULL) + code = pop_info->code_start + pop_info->length; + } + +PCRE2_ASSERT(lengthptr == NULL || code == start_code); + *pptr = ptr; *pcode = code; return TRUE; @@ -1874,13 +2266,12 @@ return TRUE; /* This function consumes unary prefix operators. */ static BOOL -compile_class_unary(uint32_t options, uint32_t xoptions, uint32_t **pptr, - PCRE2_UCHAR **pcode, int *errorcodeptr, compile_block *cb, - PCRE2_SIZE *lengthptr) +compile_class_unary(uint32_t options, uint32_t xoptions, BOOL negated, + uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info, + int *errorcodeptr, compile_block *cb, PCRE2_SIZE *lengthptr) { uint32_t *ptr = *pptr; PCRE2_UCHAR *code = *pcode; -BOOL negated = FALSE; #ifdef PCRE2_DEBUG PCRE2_UCHAR *start_code = *pcode; #endif @@ -1892,19 +2283,10 @@ while (*ptr == META_ECLASS_NOT) } /* Because it's a non-empty class, there must be an operand. */ -if (!compile_class_operand(options, xoptions, &ptr, &code, errorcodeptr, cb, - lengthptr)) +if (!compile_class_juxtaposition(options, xoptions, negated, &ptr, &code, + pop_info, errorcodeptr, cb, lengthptr)) return FALSE; -/* Convert prefix to postfix (RPN). */ -if (negated) - { - if (lengthptr != NULL) - (*lengthptr)++; - else - *code++ = OP_ECLASS_NOT; - } - PCRE2_ASSERT(lengthptr == NULL || code == start_code); *pptr = ptr; @@ -1917,9 +2299,9 @@ return TRUE; /* This function consumes tightly-binding binary operators. 
*/ static BOOL -compile_class_binary_tight(uint32_t options, uint32_t xoptions, uint32_t **pptr, - PCRE2_UCHAR **pcode, int *errorcodeptr, compile_block *cb, - PCRE2_SIZE *lengthptr) +compile_class_binary_tight(uint32_t options, uint32_t xoptions, BOOL negated, + uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info, + int *errorcodeptr, compile_block *cb, PCRE2_SIZE *lengthptr) { uint32_t *ptr = *pptr; PCRE2_UCHAR *code = *pcode; @@ -1927,26 +2309,44 @@ PCRE2_UCHAR *code = *pcode; PCRE2_UCHAR *start_code = *pcode; #endif +/* See compile_class_binary_loose() for comments on compile-time folding of +the "negated" flag. */ + /* Because it's a non-empty class, there must be an operand at the start. */ -if (!compile_class_unary(options, xoptions, &ptr, &code, errorcodeptr, cb, - lengthptr)) +if (!compile_class_unary(options, xoptions, negated, &ptr, &code, pop_info, + errorcodeptr, cb, lengthptr)) return FALSE; while (*ptr == META_ECLASS_AND) { - uint32_t op = OP_ECLASS_AND; + uint32_t op; + BOOL rhs_negated; + eclass_op_info rhs_op_info; + + if (negated) + { + /* !(A && B) -> !A || !B */ + op = ECL_OR; + rhs_negated = TRUE; + } + else + { + /* A && B -> A && B */ + op = ECL_AND; + rhs_negated = FALSE; + } + ++ptr; /* An operand must follow the operator. */ - if (!compile_class_unary(options, xoptions, &ptr, &code, errorcodeptr, cb, - lengthptr)) + if (!compile_class_unary(options, xoptions, rhs_negated, &ptr, &code, + &rhs_op_info, errorcodeptr, cb, lengthptr)) return FALSE; /* Convert infix to postfix (RPN). */ - if (lengthptr != NULL) - (*lengthptr)++; - else - *code++ = op; + fold_binary(op, pop_info, &rhs_op_info, lengthptr); + if (lengthptr == NULL) + code = pop_info->code_start + pop_info->length; } PCRE2_ASSERT(lengthptr == NULL || code == start_code); @@ -1961,9 +2361,9 @@ return TRUE; /* This function consumes loosely-binding binary operators. 
*/ static BOOL -compile_class_binary_loose(uint32_t options, uint32_t xoptions, uint32_t **pptr, - PCRE2_UCHAR **pcode, int *errorcodeptr, compile_block *cb, - PCRE2_SIZE *lengthptr) +compile_class_binary_loose(uint32_t options, uint32_t xoptions, BOOL negated, + uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info, + int *errorcodeptr, compile_block *cb, PCRE2_SIZE *lengthptr) { uint32_t *ptr = *pptr; PCRE2_UCHAR *code = *pcode; @@ -1971,28 +2371,73 @@ PCRE2_UCHAR *code = *pcode; PCRE2_UCHAR *start_code = *pcode; #endif +/* We really want to fold the negation operator, if at all possible, so that +simple cases can be reduced down. In particular, in 8-bit no-UTF mode, we want +to produce a fully-folded expression, so that we can guarantee not to emit any +OP_ECLASS codes (in the same way that we never emit OP_XCLASS in this mode). + +This has the consequence that with a little ingenuity, we can in fact avoid +emitting (nearly...) all cases of the "NOT" operator. Imagine that we have: + !(A ... +We have parsed the preceding "!", and we are about to parse the "A" operand. We +don't know yet whether there will even be a following binary operator! Both of +these are possibilities for what follows: + !(A && B) + !(A) +However, we can still fold the "!" into the "A" operand, because no matter what +the following binary operator will be, we can produce an expression which is +equivalent. */ + /* Because it's a non-empty class, there must be an operand at the start. */ -if (!compile_class_binary_tight(options, xoptions, &ptr, &code, errorcodeptr, - cb, lengthptr)) +if (!compile_class_binary_tight(options, xoptions, negated, &ptr, &code, + pop_info, errorcodeptr, cb, lengthptr)) return FALSE; while (*ptr >= META_ECLASS_OR && *ptr <= META_ECLASS_XOR) { - uint32_t op = *ptr == META_ECLASS_OR ? OP_ECLASS_OR : - *ptr == META_ECLASS_SUB ? 
OP_ECLASS_SUB : - OP_ECLASS_XOR; + uint32_t op; + BOOL op_neg; + BOOL rhs_negated; + eclass_op_info rhs_op_info; + + if (negated) + { + /* The whole expression is being negated; we respond by unconditionally + negating the LHS A, before seeing what follows. And hooray! We can recover, + no matter what follows. */ + /* !(A || B) -> !A && !B */ + /* !(A -- B) -> !(A && !B) -> !A || B */ + /* !(A XOR B) -> !(!A XOR !B) -> !A XNOR !B */ + op = (*ptr == META_ECLASS_OR )? ECL_AND : + (*ptr == META_ECLASS_SUB)? ECL_OR : + /*ptr == META_ECLASS_XOR*/ ECL_XOR; + op_neg = (*ptr == META_ECLASS_XOR); + rhs_negated = *ptr != META_ECLASS_SUB; + } + else + { + /* A || B -> A || B */ + /* A -- B -> A && !B */ + /* A XOR B -> A XOR B */ + op = (*ptr == META_ECLASS_OR )? ECL_OR : + (*ptr == META_ECLASS_SUB)? ECL_AND : + /*ptr == META_ECLASS_XOR*/ ECL_XOR; + op_neg = FALSE; + rhs_negated = *ptr == META_ECLASS_SUB; + } + ++ptr; /* An operand must follow the operator. */ - if (!compile_class_binary_tight(options, xoptions, &ptr, &code, errorcodeptr, - cb, lengthptr)) + if (!compile_class_binary_tight(options, xoptions, rhs_negated, &ptr, &code, + &rhs_op_info, errorcodeptr, cb, lengthptr)) return FALSE; /* Convert infix to postfix (RPN). */ - if (lengthptr != NULL) - (*lengthptr)++; - else - *code++ = op; + fold_binary(op, pop_info, &rhs_op_info, lengthptr); + if (op_neg) fold_negation(pop_info, lengthptr); + if (lengthptr == NULL) + code = pop_info->code_start + pop_info->length; } PCRE2_ASSERT(lengthptr == NULL || code == start_code); @@ -2013,19 +2458,12 @@ applications. The pptr will be left pointing at the matching META_CLASS_END. 
*/ BOOL -PRIV(compile_class_nested)(uint32_t options, uint32_t xoptions, - uint32_t **pptr, PCRE2_UCHAR **pcode, int *errorcodeptr, - compile_block *cb, PCRE2_SIZE *lengthptr) +PRIV(compile_class_nested)(uint32_t options, uint32_t xoptions, BOOL negated, + uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info, + int *errorcodeptr, compile_block *cb, PCRE2_SIZE *lengthptr) { -/* TODO: [EC] https://github.com/PCRE2Project/pcre2/issues/537 -We shall convert this recursive descent into a stack-based precedence parser. -We shall optimise it, so that OP_CLASS/NCLASS are constant-folded. -We shall potentially fold all the bitsets, so that there's only one bitset -held by the OP_ECLASS. */ - uint32_t *ptr = *pptr; PCRE2_UCHAR *code = *pcode; -BOOL negated; #ifdef PCRE2_DEBUG PCRE2_UCHAR *start_code = *pcode; #endif @@ -2034,21 +2472,14 @@ PCRE2_UCHAR *start_code = *pcode; PCRE2_ASSERT(*ptr == (META_CLASS | CLASS_IS_ECLASS) || *ptr == (META_CLASS_NOT | CLASS_IS_ECLASS)); -negated = *ptr++ == (META_CLASS_NOT | CLASS_IS_ECLASS); +if (*ptr++ == (META_CLASS_NOT | CLASS_IS_ECLASS)) + negated = !negated; /* Because it's a non-empty class, there must be an operand at the start. 
*/ -if (!compile_class_binary_loose(options, xoptions, &ptr, &code, errorcodeptr, - cb, lengthptr)) +if (!compile_class_binary_loose(options, xoptions, negated, &ptr, &code, + pop_info, errorcodeptr, cb, lengthptr)) return FALSE; -if (negated) - { - if (lengthptr != NULL) - (*lengthptr)++; - else - *code++ = OP_ECLASS_NOT; - } - PCRE2_ASSERT(*ptr == META_CLASS_END); PCRE2_ASSERT(lengthptr == NULL || code == start_code); diff --git a/src/pcre2_dfa_match.c b/src/pcre2_dfa_match.c index aa1f58cc6..503502b76 100644 --- a/src/pcre2_dfa_match.c +++ b/src/pcre2_dfa_match.c @@ -156,6 +156,7 @@ static const uint8_t coptable[] = { 0, /* CLASS */ 0, /* NCLASS */ 0, /* XCLASS - variable length */ + 0, /* ECLASS - variable length */ 0, /* REF */ 0, /* REFI */ 0, /* DNREF */ @@ -190,8 +191,6 @@ static const uint8_t coptable[] = { 0, 0, 0, /* FAIL, ACCEPT, ASSERT_ACCEPT */ 0, 0, 0, /* CLOSE, SKIPZERO, DEFINE */ 0, 0, /* \B and \b in UCP mode */ - 0, /* ECLASS */ - 0, 0, 0, 0, 0 /* ECLASS ops, nested inside ECLASS */ }; /* This table identifies those opcodes that inspect a character. It is used to @@ -237,6 +236,7 @@ static const uint8_t poptable[] = { 1, /* CLASS */ 1, /* NCLASS */ 1, /* XCLASS - variable length */ + 1, /* ECLASS - variable length */ 0, /* REF */ 0, /* REFI */ 0, /* DNREF */ @@ -271,10 +271,12 @@ static const uint8_t poptable[] = { 0, 0, 0, /* FAIL, ACCEPT, ASSERT_ACCEPT */ 0, 0, 0, /* CLOSE, SKIPZERO, DEFINE */ 1, 1, /* \B and \b in UCP mode */ - 1, /* ECLASS */ - 0, 0, 0, 0, 0 /* ECLASS ops, nested inside ECLASS */ }; +/* Compile-time check that these tables have the correct size. */ +STATIC_ASSERT(sizeof(coptable) == OP_TABLE_LENGTH, coptable); +STATIC_ASSERT(sizeof(poptable) == OP_TABLE_LENGTH, poptable); + /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W, and \w */ @@ -846,19 +848,6 @@ for (;;) switch (codevalue) { -/* ========================================================================== */ - /* These cases are never obeyed. 
This is a fudge that causes a compile- - time error if the vectors coptable or poptable, which are indexed by - opcode, are not the correct length. It seems to be the only way to do - such a check at compile time, as the sizeof() operator does not work - in the C preprocessor. */ - - case OP_TABLE_LENGTH: - case OP_TABLE_LENGTH + - ((sizeof(coptable) == OP_TABLE_LENGTH) && - (sizeof(poptable) == OP_TABLE_LENGTH)): - return 0; - /* ========================================================================== */ /* Reached a closing bracket. If not at the end of the pattern, carry on with the next opcode. For repeating opcodes, also add the repeat @@ -2668,13 +2657,16 @@ for (;;) case OP_CLASS: case OP_NCLASS: +#ifdef SUPPORT_WIDE_CHARS case OP_XCLASS: case OP_ECLASS: +#endif { BOOL isinclass = FALSE; int next_state_offset; PCRE2_SPTR ecode; +#ifdef SUPPORT_WIDE_CHARS /* An extended class may have a table or a list of single characters, ranges, or both, and it may be positive or negative. There's a function that sorts all this out. */ @@ -2698,10 +2690,12 @@ for (;;) (const uint8_t*)mb->start_code, utf); } + else +#endif /* SUPPORT_WIDE_CHARS */ + /* For a simple class, there is always just a 32-byte table, and we can set isinclass from it. */ - else { ecode = code + 1 + (32 / sizeof(PCRE2_UCHAR)); if (clen > 0) diff --git a/src/pcre2_internal.h b/src/pcre2_internal.h index 379d23c95..4be1e7306 100644 --- a/src/pcre2_internal.h +++ b/src/pcre2_internal.h @@ -88,6 +88,12 @@ typedef int BOOL; #define TRUE 1 #endif +/* Helper macro for static (compile-time) assertions. Can be used inside +functions, or at the top-level of a file. */ +#define STATIC_ASSERT_JOIN(a,b) a ## b +#define STATIC_ASSERT(cond, msg) \ + typedef int STATIC_ASSERT_JOIN(static_assertion_,msg)[(cond)?1:-1] + /* Valgrind (memcheck) support */ #ifdef SUPPORT_VALGRIND @@ -1431,6 +1437,23 @@ can be used to encode / decode the character value stored in an item. 
*/ #define XCL_CHAR_END 0x1 #define XCL_CHAR_SHIFT 1 +/* Flag bits for an extended class (OP_ECLASS), which is used for complex +character matches such as [\p{Greek} && \p{Ll}]. */ + +#define ECL_MAP 0x01 /* Flag: a 32-byte map is present */ + +/* Type tags for the items stored in an extended class (OP_ECLASS). These items +follow the OP_ECLASS's flag char and bitmap, and represent a Reverse Polish +Notation list of operands and operators manipulating a stack of bits. */ + +#define ECL_AND 1 /* Pop two from the stack, AND, and push result. */ +#define ECL_OR 2 /* Pop two from the stack, OR, and push result. */ +#define ECL_XOR 3 /* Pop two from the stack, XOR, and push result. */ +#define ECL_NOT 4 /* Pop one from the stack, NOT, and push result. */ +#define ECL_XCLASS 5 /* XCLASS nested within ECLASS; match and push result. */ +#define ECL_ANY 6 /* Temporary, only used during compilation. */ +#define ECL_NONE 7 /* Temporary, only used during compilation. */ + /* These are escaped items that aren't just an encoding of a particular data value such as \n. They must have non-zero values, as check_escape() returns 0 for a data character. In the escapes[] table in pcre2_compile.c their values @@ -1651,112 +1674,105 @@ enum { character > 255 is encountered. */ OP_XCLASS, /* 112 Extended class for handling > 255 chars within the class. This does both positive and negative. 
*/ - OP_REF, /* 113 Match a back reference, casefully */ - OP_REFI, /* 114 Match a back reference, caselessly */ - OP_DNREF, /* 115 Match a duplicate name backref, casefully */ - OP_DNREFI, /* 116 Match a duplicate name backref, caselessly */ - OP_RECURSE, /* 117 Match a numbered subpattern (possibly recursive) */ - OP_CALLOUT, /* 118 Call out to external function if provided */ - OP_CALLOUT_STR, /* 119 Call out with string argument */ - - OP_ALT, /* 120 Start of alternation */ - OP_KET, /* 121 End of group that doesn't have an unbounded repeat */ - OP_KETRMAX, /* 122 These two must remain together and in this */ - OP_KETRMIN, /* 123 order. They are for groups the repeat for ever. */ - OP_KETRPOS, /* 124 Possessive unlimited repeat. */ + OP_ECLASS, /* 113 Really-extended class, for handling logical + expressions computed over characters. */ + OP_REF, /* 114 Match a back reference, casefully */ + OP_REFI, /* 115 Match a back reference, caselessly */ + OP_DNREF, /* 116 Match a duplicate name backref, casefully */ + OP_DNREFI, /* 117 Match a duplicate name backref, caselessly */ + OP_RECURSE, /* 118 Match a numbered subpattern (possibly recursive) */ + OP_CALLOUT, /* 119 Call out to external function if provided */ + OP_CALLOUT_STR, /* 120 Call out with string argument */ + + OP_ALT, /* 121 Start of alternation */ + OP_KET, /* 122 End of group that doesn't have an unbounded repeat */ + OP_KETRMAX, /* 123 These two must remain together and in this */ + OP_KETRMIN, /* 124 order. They are for groups that repeat for ever. */ + OP_KETRPOS, /* 125 Possessive unlimited repeat. */ /* The assertions must come before BRA, CBRA, ONCE, and COND. 
*/ - OP_REVERSE, /* 125 Move pointer back - used in lookbehind assertions */ - OP_VREVERSE, /* 126 Move pointer back - variable */ - OP_ASSERT, /* 127 Positive lookahead */ - OP_ASSERT_NOT, /* 128 Negative lookahead */ - OP_ASSERTBACK, /* 129 Positive lookbehind */ - OP_ASSERTBACK_NOT, /* 130 Negative lookbehind */ - OP_ASSERT_NA, /* 131 Positive non-atomic lookahead */ - OP_ASSERTBACK_NA, /* 132 Positive non-atomic lookbehind */ - OP_ASSERT_SCS, /* 133 Scan substring */ + OP_REVERSE, /* 126 Move pointer back - used in lookbehind assertions */ + OP_VREVERSE, /* 127 Move pointer back - variable */ + OP_ASSERT, /* 128 Positive lookahead */ + OP_ASSERT_NOT, /* 129 Negative lookahead */ + OP_ASSERTBACK, /* 130 Positive lookbehind */ + OP_ASSERTBACK_NOT, /* 131 Negative lookbehind */ + OP_ASSERT_NA, /* 132 Positive non-atomic lookahead */ + OP_ASSERTBACK_NA, /* 133 Positive non-atomic lookbehind */ + OP_ASSERT_SCS, /* 134 Scan substring */ /* ONCE, SCRIPT_RUN, BRA, BRAPOS, CBRA, CBRAPOS, and COND must come immediately after the assertions, with ONCE first, as there's a test for >= ONCE for a subpattern that isn't an assertion. The POS versions must immediately follow the non-POS versions in each case. 
*/ - OP_ONCE, /* 134 Atomic group, contains captures */ - OP_SCRIPT_RUN, /* 135 Non-capture, but check characters' scripts */ - OP_BRA, /* 136 Start of non-capturing bracket */ - OP_BRAPOS, /* 137 Ditto, with unlimited, possessive repeat */ - OP_CBRA, /* 138 Start of capturing bracket */ - OP_CBRAPOS, /* 139 Ditto, with unlimited, possessive repeat */ - OP_COND, /* 140 Conditional group */ + OP_ONCE, /* 135 Atomic group, contains captures */ + OP_SCRIPT_RUN, /* 136 Non-capture, but check characters' scripts */ + OP_BRA, /* 137 Start of non-capturing bracket */ + OP_BRAPOS, /* 138 Ditto, with unlimited, possessive repeat */ + OP_CBRA, /* 139 Start of capturing bracket */ + OP_CBRAPOS, /* 140 Ditto, with unlimited, possessive repeat */ + OP_COND, /* 141 Conditional group */ /* These five must follow the previous five, in the same order. There's a check for >= SBRA to distinguish the two sets. */ - OP_SBRA, /* 141 Start of non-capturing bracket, check empty */ - OP_SBRAPOS, /* 142 Ditto, with unlimited, possessive repeat */ - OP_SCBRA, /* 143 Start of capturing bracket, check empty */ - OP_SCBRAPOS, /* 144 Ditto, with unlimited, possessive repeat */ - OP_SCOND, /* 145 Conditional group, check empty */ + OP_SBRA, /* 142 Start of non-capturing bracket, check empty */ + OP_SBRAPOS, /* 143 Ditto, with unlimited, possessive repeat */ + OP_SCBRA, /* 144 Start of capturing bracket, check empty */ + OP_SCBRAPOS, /* 145 Ditto, with unlimited, possessive repeat */ + OP_SCOND, /* 146 Conditional group, check empty */ /* The next two pairs must (respectively) be kept together. 
*/ - OP_CREF, /* 146 Used to hold a capture number as condition */ - OP_DNCREF, /* 147 Used to point to duplicate names as a condition */ - OP_RREF, /* 148 Used to hold a recursion number as condition */ - OP_DNRREF, /* 149 Used to point to duplicate names as a condition */ - OP_FALSE, /* 150 Always false (used by DEFINE and VERSION) */ - OP_TRUE, /* 151 Always true (used by VERSION) */ + OP_CREF, /* 147 Used to hold a capture number as condition */ + OP_DNCREF, /* 148 Used to point to duplicate names as a condition */ + OP_RREF, /* 149 Used to hold a recursion number as condition */ + OP_DNRREF, /* 150 Used to point to duplicate names as a condition */ + OP_FALSE, /* 151 Always false (used by DEFINE and VERSION) */ + OP_TRUE, /* 152 Always true (used by VERSION) */ - OP_BRAZERO, /* 152 These two must remain together and in this */ - OP_BRAMINZERO, /* 153 order. */ - OP_BRAPOSZERO, /* 154 */ + OP_BRAZERO, /* 153 These two must remain together and in this */ + OP_BRAMINZERO, /* 154 order. */ + OP_BRAPOSZERO, /* 155 */ /* These are backtracking control verbs */ - OP_MARK, /* 155 always has an argument */ - OP_PRUNE, /* 156 */ - OP_PRUNE_ARG, /* 157 same, but with argument */ - OP_SKIP, /* 158 */ - OP_SKIP_ARG, /* 159 same, but with argument */ - OP_THEN, /* 160 */ - OP_THEN_ARG, /* 161 same, but with argument */ - OP_COMMIT, /* 162 */ - OP_COMMIT_ARG, /* 163 same, but with argument */ + OP_MARK, /* 156 always has an argument */ + OP_PRUNE, /* 157 */ + OP_PRUNE_ARG, /* 158 same, but with argument */ + OP_SKIP, /* 159 */ + OP_SKIP_ARG, /* 160 same, but with argument */ + OP_THEN, /* 161 */ + OP_THEN_ARG, /* 162 same, but with argument */ + OP_COMMIT, /* 163 */ + OP_COMMIT_ARG, /* 164 same, but with argument */ /* These are forced failure and success verbs. FAIL and ACCEPT do accept an argument, but these cases can be compiled as, for example, (*MARK:X)(*FAIL) without the need for a special opcode. 
*/ - OP_FAIL, /* 164 */ - OP_ACCEPT, /* 165 */ - OP_ASSERT_ACCEPT, /* 166 Used inside assertions */ - OP_CLOSE, /* 167 Used before OP_ACCEPT to close open captures */ + OP_FAIL, /* 165 */ + OP_ACCEPT, /* 166 */ + OP_ASSERT_ACCEPT, /* 167 Used inside assertions */ + OP_CLOSE, /* 168 Used before OP_ACCEPT to close open captures */ /* This is used to skip a subpattern with a {0} quantifier */ - OP_SKIPZERO, /* 168 */ + OP_SKIPZERO, /* 169 */ /* This is used to identify a DEFINE group during compilation so that it can be checked for having only one branch. It is changed to OP_FALSE before compilation finishes. */ - OP_DEFINE, /* 169 */ + OP_DEFINE, /* 170 */ /* These opcodes replace their normal counterparts in UCP mode when PCRE2_EXTRA_ASCII_BSW is not set. */ - OP_NOT_UCP_WORD_BOUNDARY, /* 170 */ - OP_UCP_WORD_BOUNDARY, /* 171 */ - - /* These are used for "extended classes" such as [a-z -- aeiou]. */ - - OP_ECLASS, /* 172 */ - OP_ECLASS_AND, /* 173 */ - OP_ECLASS_OR, /* 174 */ - OP_ECLASS_SUB, /* 175 */ - OP_ECLASS_XOR, /* 176 */ - OP_ECLASS_NOT, /* 177 */ + OP_NOT_UCP_WORD_BOUNDARY, /* 171 */ + OP_UCP_WORD_BOUNDARY, /* 172 */ /* This is not an opcode, but is used to check that tables indexed by opcode are the correct length, in order to catch updating errors - there have been @@ -1799,7 +1815,8 @@ some cases doesn't actually use these names at all). */ "*+","++", "?+", "{", \ "*", "*?", "+", "+?", "?", "??", "{", "{", \ "*+","++", "?+", "{", \ - "class", "nclass", "xclass", "Ref", "Refi", "DnRef", "DnRefi", \ + "class", "nclass", "xclass", "eclass", \ + "Ref", "Refi", "DnRef", "DnRefi", \ "Recurse", "Callout", "CalloutStr", \ "Alt", "Ket", "KetRmax", "KetRmin", "KetRpos", \ "Reverse", "VReverse", "Assert", "Assert not", \ @@ -1818,8 +1835,7 @@ some cases doesn't actually use these names at all). 
*/ "*MARK", "*PRUNE", "*PRUNE", "*SKIP", "*SKIP", \ "*THEN", "*THEN", "*COMMIT", "*COMMIT", "*FAIL", \ "*ACCEPT", "*ASSERT_ACCEPT", \ - "Close", "Skip zero", "Define", "\\B (ucp)", "\\b (ucp)", \ - "eclass", "&&", "||", "--", "~~", "!!" + "Close", "Skip zero", "Define", "\\B (ucp)", "\\b (ucp)" /* This macro defines the length of fixed length operations in the compiled @@ -1874,6 +1890,7 @@ in UTF-8 mode. The code that uses this table must know about such things. */ 1+(32/sizeof(PCRE2_UCHAR)), /* CLASS */ \ 1+(32/sizeof(PCRE2_UCHAR)), /* NCLASS */ \ 0, /* XCLASS - variable length */ \ + 0, /* ECLASS - variable length */ \ 1+IMM2_SIZE, /* REF */ \ 1+IMM2_SIZE+1, /* REFI */ \ 1+2*IMM2_SIZE, /* DNREF */ \ @@ -1918,9 +1935,7 @@ in UTF-8 mode. The code that uses this table must know about such things. */ 1, 1, 1, /* FAIL, ACCEPT, ASSERT_ACCEPT */ \ 1+IMM2_SIZE, 1, /* CLOSE, SKIPZERO */ \ 1, /* DEFINE */ \ - 1, 1, /* \B and \b in UCP mode */ \ - 0, /* ECLASS - variable length */ \ - 1, 1, 1, 1, 1 /* ECLASS ops, nested inside ECLASS */ + 1, 1 /* \B and \b in UCP mode */ /* A magic value for OP_RREF to indicate the "any recursion" condition. */ diff --git a/src/pcre2_intmodedep.h b/src/pcre2_intmodedep.h index 85adb0b5e..df9399389 100644 --- a/src/pcre2_intmodedep.h +++ b/src/pcre2_intmodedep.h @@ -874,11 +874,10 @@ typedef struct heapframe { PCRE2_SIZE ovector[131072]; /* Must be last in the structure */ } heapframe; -/* This typedef is a check that the size of the heapframe structure is a -multiple of PCRE2_SIZE. See various comments above. */ +/* Assert that the size of the heapframe structure is a multiple of PCRE2_SIZE. +See various comments above. */ -typedef char check_heapframe_size[ - ((sizeof(heapframe) % sizeof(PCRE2_SIZE)) == 0)? (+1):(-1)]; +STATIC_ASSERT((sizeof(heapframe) % sizeof(PCRE2_SIZE)) == 0, heapframe_size); /* Structure for computing the alignment of heapframe. 
*/ diff --git a/src/pcre2_jit_compile.c b/src/pcre2_jit_compile.c index 98ed65db1..1c77873ba 100644 --- a/src/pcre2_jit_compile.c +++ b/src/pcre2_jit_compile.c @@ -1105,11 +1105,11 @@ switch(*cc) case OP_CALLOUT_STR: return cc + GET(cc, 1 + 2*LINK_SIZE); - /* TODO: [EC] https://github.com/PCRE2Project/pcre2/issues/537 - Add back the "if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8" once we stop emitting ECLASS for this case. */ +#if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8 case OP_ECLASS: case OP_XCLASS: return cc + GET(cc, 1); +#endif case OP_MARK: case OP_COMMIT_ARG: diff --git a/src/pcre2_match.c b/src/pcre2_match.c index 03ff138fb..7e6a5a90f 100644 --- a/src/pcre2_match.c +++ b/src/pcre2_match.c @@ -158,11 +158,9 @@ enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10, RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39 }; #ifdef SUPPORT_WIDE_CHARS -enum { RM100=100, RM101 }; +enum { RM100=100, RM101, RM102, RM103 }; #endif -enum { RM150=150, RM151 }; - #ifdef SUPPORT_UNICODE enum { RM200=200, RM201, RM202, RM203, RM204, RM205, RM206, RM207, RM208, RM209, RM210, RM211, RM212, RM213, RM214, RM215, @@ -2335,8 +2333,7 @@ fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode, #define Lmin F->temp_32[0] #define Lmax F->temp_32[1] - /* TODO: [EC] https://github.com/PCRE2Project/pcre2/issues/537 - Enclose in "ifdef SUPPORT_WIDE_CHARS" once we stop emitting ECLASS for this case. 
*/ +#ifdef SUPPORT_WIDE_CHARS case OP_ECLASS: { Leclass_data = Fecode + 1 + LINK_SIZE; /* Save for matching */ @@ -2401,7 +2398,7 @@ fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode, { for (;;) { - RMATCH(Fecode, RM150); + RMATCH(Fecode, RM102); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); if (Feptr >= mb->end_subject) @@ -2449,7 +2446,7 @@ fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode, for(;;) { - RMATCH(Fecode, RM151); + RMATCH(Fecode, RM103); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (Feptr-- <= Lstart_eptr) break; /* Tried at original position */ #ifdef SUPPORT_UNICODE @@ -2461,6 +2458,7 @@ fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode, PCRE2_UNREACHABLE(); /* Control never reaches here */ } +#endif /* SUPPORT_WIDE_CHARS: end of ECLASS */ #undef Lstart_eptr #undef Leclass_data @@ -6830,11 +6828,9 @@ switch (Freturn_id) LBL(33) LBL(34) LBL(35) LBL(36) LBL(37) LBL(38) LBL(39) #ifdef SUPPORT_WIDE_CHARS - LBL(100) LBL(101) + LBL(100) LBL(101) LBL(102) LBL(103) #endif - LBL(150) LBL(151) - #ifdef SUPPORT_UNICODE LBL(200) LBL(201) LBL(202) LBL(203) LBL(204) LBL(205) LBL(206) LBL(207) LBL(208) LBL(209) LBL(210) LBL(211) LBL(212) LBL(213) diff --git a/src/pcre2_pattern_info.c b/src/pcre2_pattern_info.c index 32f5b5105..fe4d3c661 100644 --- a/src/pcre2_pattern_info.c +++ b/src/pcre2_pattern_info.c @@ -384,10 +384,12 @@ while (TRUE) #endif break; +#ifdef SUPPORT_WIDE_CHARS case OP_XCLASS: case OP_ECLASS: cc += GET(cc, 1); break; +#endif case OP_MARK: case OP_COMMIT_ARG: diff --git a/src/pcre2_printint.c b/src/pcre2_printint.c index 4e5f5aeb5..d3c726e24 100644 --- a/src/pcre2_printint.c +++ b/src/pcre2_printint.c @@ -53,6 +53,7 @@ pcre2_internal.h, which is #included by pcre2test before this file. 
*/ #ifndef OP_LISTS_DEFINED static const char *OP_names[] = { OP_NAME_LIST }; +STATIC_ASSERT(sizeof(OP_names)/sizeof(*OP_names) == OP_TABLE_LENGTH, OP_names); #define OP_LISTS_DEFINED #endif @@ -66,6 +67,7 @@ static const char *OP_names[] = { OP_NAME_LIST }; #define print_custring_bylen PCRE2_SUFFIX(print_custring_bylen_) #define print_prop PCRE2_SUFFIX(print_prop_) #define print_char_list PCRE2_SUFFIX(print_char_list_) +#define print_map PCRE2_SUFFIX(print_map_) #define print_class PCRE2_SUFFIX(print_class_) /* Table of sizes for the fixed-length opcodes. It's defined in a macro so that @@ -73,7 +75,8 @@ the definition is next to the definition of the opcodes in pcre2_internal.h. The contents of the table are, however, mode-dependent. */ static const uint8_t OP_lengths[] = { OP_LENGTHS }; - +STATIC_ASSERT(sizeof(OP_lengths)/sizeof(*OP_lengths) == OP_TABLE_LENGTH, + PCRE2_SUFFIX(OP_lengths_)); /************************************************* @@ -247,7 +250,7 @@ const char *yield = "??"; size_t len = 0; unsigned int ptypex = (ptype == PT_SC)? PT_SCX : ptype; -for (int i = PRIV(utt_size) - 1; i >= 0; i--) +for (ptrdiff_t i = PRIV(utt_size) - 1; i >= 0; i--) { const ucp_type_table *u = PRIV(utt) + i; @@ -440,6 +443,60 @@ return code + LINK_SIZE; +/************************************************* +* Print a character bitmap * +*************************************************/ + +/* Prints a 32-byte bitmap, which occurs within a character class opcode. + +Arguments: + f file to write to + map pointer to the bitmap + negated TRUE if the bitmap will be printed as negated + +Returns: nothing +*/ + +static void +print_map(FILE *f, const uint8_t *map, BOOL negated) +{ +BOOL first = TRUE; +uint8_t inverted_map[32]; +int i; + +if (negated) + { + /* Using 255 ^ instead of ~ avoids clang sanitize warning. 
*/ + for (i = 0; i < 32; i++) inverted_map[i] = 255 ^ map[i]; + map = inverted_map; + } + +for (i = 0; i < 256; i++) + { + if ((map[i/8] & (1u << (i&7))) != 0) + { + int j; + for (j = i+1; j < 256; j++) + if ((map[j/8] & (1u << (j&7))) == 0) break; + if (i == '-' || i == '\\' || i == ']' || (first && i == '^')) + fprintf(f, "\\"); + if (PRINTABLE(i)) fprintf(f, "%c", i); + else fprintf(f, "\\x%02x", i); + first = FALSE; + if (--j > i) + { + if (j != i + 1) fprintf(f, "-"); + if (j == '-' || j == '\\' || j == ']') fprintf(f, "\\"); + if (PRINTABLE(j)) fprintf(f, "%c", j); + else fprintf(f, "\\x%02x", j); + } + i = j; + } + } +} + + + /************************************************* * Print character class * *************************************************/ @@ -449,7 +506,8 @@ OP_XCLASS. Arguments: f file to write to - code pointer in the compiled code + type OP_CLASS, OP_NCLASS, or OP_XCLASS + code pointer in the compiled code (after the OP tag) utf TRUE if re is UTF (will be FALSE if UTF is not supported) before text to print before after text to print after @@ -458,18 +516,17 @@ Returns: nothing */ static void -print_class(FILE *f, PCRE2_SPTR code, const uint8_t *char_lists_end, BOOL utf, - const char *before, const char *after) +print_class(FILE *f, int type, PCRE2_SPTR code, const uint8_t *char_lists_end, + BOOL utf, const char *before, const char *after) { BOOL printmap, negated; PCRE2_SPTR ccode; -int i; /* Negative XCLASS and NCLASS both have a bitmap indicating which characters are accepted. For clarity we print this inverted and prefixed by "^". 
*/ -if (*code == OP_XCLASS) +if (type == OP_XCLASS) { - ccode = code + LINK_SIZE + 1; + ccode = code + LINK_SIZE; printmap = (*ccode & XCL_MAP) != 0; negated = (*ccode & XCL_NOT) != 0; ccode++; @@ -477,8 +534,8 @@ if (*code == OP_XCLASS) else /* CLASS or NCLASS */ { printmap = TRUE; - negated = *code == OP_NCLASS; - ccode = code + 1; + negated = type == OP_NCLASS; + ccode = code; } fprintf(f, "%s[%s", before, negated? "^" : ""); @@ -486,42 +543,12 @@ fprintf(f, "%s[%s", before, negated? "^" : ""); /* Print a bit map */ if (printmap) { - BOOL first = TRUE; - uint8_t inverted_map[32]; - const uint8_t *map = (const uint8_t *)ccode; - if (negated) - { - /* Using 255 ^ instead of ~ avoids clang sanitize warning. */ - for (i = 0; i < 32; i++) inverted_map[i] = 255 ^ map[i]; - map = inverted_map; - } - for (i = 0; i < 256; i++) - { - if ((map[i/8] & (1u << (i&7))) != 0) - { - int j; - for (j = i+1; j < 256; j++) - if ((map[j/8] & (1u << (j&7))) == 0) break; - if (i == '-' || i == '\\' || i == ']' || (first && i == '^')) - fprintf(f, "\\"); - if (PRINTABLE(i)) fprintf(f, "%c", i); - else fprintf(f, "\\x%02x", i); - first = FALSE; - if (--j > i) - { - if (j != i + 1) fprintf(f, "-"); - if (j == '-' || j == '\\' || j == ']') fprintf(f, "\\"); - if (PRINTABLE(j)) fprintf(f, "%c", j); - else fprintf(f, "\\x%02x", j); - } - i = j; - } - } + print_map(f, (const uint8_t *)ccode, negated); ccode += 32 / sizeof(PCRE2_UCHAR); } /* For an XCLASS there is always some additional data */ -if (*code == OP_XCLASS) +if (type == OP_XCLASS) { PCRE2_UCHAR ch; @@ -579,7 +606,7 @@ if (*code == OP_XCLASS) } } - PCRE2_ASSERT(ccode == code + GET(code, 1)); + PCRE2_ASSERT(ccode == code + (GET(code, 0) - 1)); } /* Indicate a non-UTF class which was created by negation */ @@ -952,11 +979,24 @@ for(;;) print_prop(f, code, " ", ""); break; +#ifdef SUPPORT_WIDE_CHARS case OP_ECLASS: extra = GET(code, 1); - fprintf(f, " %s eclass[\n", flag); + fprintf(f, " eclass[\n"); /* We print the opcodes contained 
inside as well. */ - ccode = code + 1 + LINK_SIZE; + ccode = code + 1 + LINK_SIZE + 1; + if ((ccode[-1] & ECL_MAP) != 0) + { + const uint8_t *map = (const uint8_t *)ccode; + /* The first 6 ASCII characters (SOH...ACK) are totally, utterly useless. + If they're set in the bitmap, then it's clearly been formed by negation.*/ + BOOL print_negated = (map[0] & 0x7e) == 0x7e; + + fprintf(f, " bitmap: [%s", print_negated? "^" : ""); + print_map(f, map, print_negated); + fprintf(f, "]\n"); + ccode += 32 / sizeof(PCRE2_UCHAR); + } while (ccode < code + extra) { if (print_lengths) @@ -966,63 +1006,47 @@ for(;;) switch (*ccode) { - case OP_ECLASS_AND: - fprintf(f, " op: &&\n"); - ccode += OP_lengths[*ccode]; + case ECL_AND: + fprintf(f, " AND\n"); + ccode += 1; break; - case OP_ECLASS_OR: - fprintf(f, " op: ||\n"); - ccode += OP_lengths[*ccode]; + case ECL_OR: + fprintf(f, " OR\n"); + ccode += 1; break; - case OP_ECLASS_SUB: - fprintf(f, " op: --\n"); - ccode += OP_lengths[*ccode]; + case ECL_XOR: + fprintf(f, " XOR\n"); + ccode += 1; break; - case OP_ECLASS_XOR: - fprintf(f, " op: ~~\n"); - ccode += OP_lengths[*ccode]; - break; - case OP_ECLASS_NOT: - fprintf(f, " op: ^\n"); - ccode += OP_lengths[*ccode]; - break; - - /* TODO: [EC] https://github.com/PCRE2Project/pcre2/issues/537 - Add back the "ifdef SUPPORT_WIDE_CHARS" once we stop emitting ECLASS for this case. 
*/ - case OP_CLASS: - case OP_NCLASS: - case OP_XCLASS: - print_class(f, ccode, (uint8_t*)codestart, utf, " cls:", "\n"); - if (*ccode == OP_XCLASS) - ccode += GET(ccode, 1); - else - ccode += OP_lengths[*ccode]; + case ECL_NOT: + fprintf(f, " NOT\n"); + ccode += 1; break; - case OP_ALLANY: - fprintf(f, " %s\n", OP_names[*ccode]); - ccode += OP_lengths[*ccode]; + case ECL_XCLASS: + print_class(f, OP_XCLASS, ccode+1, (uint8_t*)codestart, utf, + " xclass: ", "\n"); + ccode += GET(ccode, 1); break; default: fprintf(f, " UNEXPECTED\n"); - ccode += OP_lengths[*ccode]; + ccode += 1; break; } } fprintf(f, " ]"); goto CLASS_REF_REPEAT; - - /* OP_XCLASS cannot occur in 8-bit, non-UTF mode. However, there's no harm - in having this code always here, and it makes it less messy without all - those #ifdefs. */ +#endif /* SUPPORT_WIDE_CHARS */ case OP_CLASS: case OP_NCLASS: +#ifdef SUPPORT_WIDE_CHARS case OP_XCLASS: - print_class(f, code, (uint8_t*)codestart, utf, " ", ""); if (*code == OP_XCLASS) extra = GET(code, 1); +#endif + print_class(f, *code, code+1, (uint8_t*)codestart, utf, " ", ""); ccode = code + OP_lengths[*code] + extra; /* Handle repeats after a class or a back reference */ diff --git a/src/pcre2_study.c b/src/pcre2_study.c index a79e4135e..85764cea5 100644 --- a/src/pcre2_study.c +++ b/src/pcre2_study.c @@ -415,11 +415,9 @@ for (;;) /* Check a class for variable quantification */ - /* TODO: [EC] https://github.com/PCRE2Project/pcre2/issues/537 - Add back the "ifdef SUPPORT_WIDE_CHARS" once we stop emitting ECLASS for this case. */ - case OP_CLASS: case OP_NCLASS: +#ifdef SUPPORT_WIDE_CHARS case OP_XCLASS: case OP_ECLASS: /* The original code caused an unsigned overflow in 64 bit systems, @@ -427,6 +425,7 @@ for (;;) if (op == OP_XCLASS || op == OP_ECLASS) cc += GET(cc, 1); else +#endif cc += PRIV(OP_lengths)[OP_CLASS]; switch (*cc) @@ -1718,11 +1717,10 @@ do /* Set-based ECLASS: treat it the same as a "complex" XCLASS; give up. 
*/ - /* TODO: [EC] https://github.com/PCRE2Project/pcre2/issues/537 - Enclose in "ifdef SUPPORT_WIDE_CHARS" once we stop emitting ECLASS for this case. */ - +#ifdef SUPPORT_WIDE_CHARS case OP_ECLASS: return SSB_FAIL; +#endif /* Extended class: if there are any property checks, or if this is a negative XCLASS without a map, give up. If there are no property checks, diff --git a/src/pcre2_xclass.c b/src/pcre2_xclass.c index 6b2b110b2..1300e0511 100644 --- a/src/pcre2_xclass.c +++ b/src/pcre2_xclass.c @@ -467,67 +467,71 @@ PRIV(eclass)(uint32_t c, PCRE2_SPTR data_start, PCRE2_SPTR data_end, const uint8_t *char_lists_end, BOOL utf) { PCRE2_SPTR ptr = data_start; +PCRE2_UCHAR flags; uint32_t stack = 0; +int stack_depth = 0; + +PCRE2_ASSERT(data_start < data_end); +flags = *ptr++; +PCRE2_ASSERT((flags & ECL_MAP) == 0 || + (data_end - ptr) >= 32 / (int)sizeof(PCRE2_UCHAR)); + +/* Code points < 256 are matched against a bitmap, if one is present. If no +bitmap is present, then the ECLASS does not match any code points < 256. */ + +if (c < 256) + { + if ((flags & ECL_MAP) != 0) + return (((const uint8_t *)ptr)[c/8] & (1u << (c&7))) != 0; + return FALSE; + } + +/* Skip the bitmap. */ + +if ((flags & ECL_MAP) != 0) + ptr += 32 / sizeof(PCRE2_UCHAR); /* Do a little loop, until we reach the end of the ECLASS. 
*/ while (ptr < data_end) { switch (*ptr) { - case OP_ECLASS_AND: + case ECL_AND: ++ptr; stack = (stack >> 1) & (stack | ~(uint32_t)1u); + PCRE2_ASSERT(stack_depth >= 2); + --stack_depth; break; - case OP_ECLASS_OR: + case ECL_OR: ++ptr; stack = (stack >> 1) | (stack & (uint32_t)1u); + PCRE2_ASSERT(stack_depth >= 2); + --stack_depth; break; - case OP_ECLASS_SUB: - ++ptr; - stack = (stack >> 1) & (~stack | ~(uint32_t)1u); - break; - - case OP_ECLASS_XOR: + case ECL_XOR: ++ptr; stack = (stack >> 1) ^ (stack & (uint32_t)1u); + PCRE2_ASSERT(stack_depth >= 2); + --stack_depth; break; - case OP_ECLASS_NOT: + case ECL_NOT: ++ptr; stack ^= (uint32_t)1u; + PCRE2_ASSERT(stack_depth >= 1); break; - case OP_CLASS: - case OP_NCLASS: - { - uint32_t matched; - if (c > 255) - matched = *ptr == OP_NCLASS; - else - matched = (((const unsigned char *)(ptr+1))[c/8] & (1u << (c&7))) != 0; - - ptr += 1 + (32 / sizeof(PCRE2_UCHAR)); - stack = (stack << 1) | matched; - break; - } - -#ifdef SUPPORT_WIDE_CHARS - case OP_XCLASS: + case ECL_XCLASS: { uint32_t matched = PRIV(xclass)(c, ptr + 1 + LINK_SIZE, char_lists_end, utf); ptr += GET(ptr, 1); stack = (stack << 1) | matched; + ++stack_depth; break; } -#endif - - case OP_ALLANY: - ++ptr; - stack = (stack << 1) | 1u; - break; /* This should never occur, but compilers may mutter if there is no default. */ @@ -538,6 +542,9 @@ while (ptr < data_end) } } +PCRE2_ASSERT(stack_depth == 1); +(void)stack_depth; /* Ignore unused variable, if assertions are disabled. */ + /* The final bit left on the stack now holds the match result. 
*/ return (stack & 1u) != 0; } diff --git a/testdata/testoutput11-32 b/testdata/testoutput11-32 index d24dcdc97..b7cda3d07 100644 --- a/testdata/testoutput11-32 +++ b/testdata/testoutput11-32 @@ -903,9 +903,9 @@ No match ------------------------------------------------------------------ Bra eclass[ - cls:[\x{80000000}-\x{8000000f}] - cls:[\x{80000002}] - op: -- + xclass: [\x{80000000}-\x{8000000f}] + xclass: [^\x{80000002}] + AND ] Ket End @@ -922,9 +922,9 @@ No match ------------------------------------------------------------------ Bra eclass[ - cls:[\x{80000000}-\x{8000000f}] - cls:[\x{80000002}] - op: -- + xclass: [\x{80000000}-\x{8000000f}] + xclass: [^\x{80000002}] + AND ] Ket End @@ -947,9 +947,9 @@ No match ------------------------------------------------------------------ Bra eclass[ - cls:[\x{80000000}-\x{8000000f}] - cls:[\x{8fffffff}] - op: || + xclass: [\x{80000000}-\x{8000000f}] + xclass: [\x{8fffffff}] + OR ] Ket End @@ -968,9 +968,9 @@ No match ------------------------------------------------------------------ Bra eclass[ - cls:[\x{80000000}-\x{8000000f}] - cls:[\x{80000002}] - op: -- + xclass: [\x{80000000}-\x{8000000f}] + xclass: [^\x{80000002}] + AND ] Ket End @@ -987,9 +987,9 @@ No match ------------------------------------------------------------------ Bra eclass[ - cls:[\x{80000000}-\x{8000000f}] - cls:[\x{80000002}] - op: -- + xclass: [\x{80000000}-\x{8000000f}] + xclass: [^\x{80000002}] + AND ] Ket End diff --git a/testdata/testoutput2 b/testdata/testoutput2 index 629fe438d..78bab1be5 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -19677,11 +19677,7 @@ Failed: error 212 at offset 5: missing terminating ] for extended character clas /[a[B]]C/B,alt_extended_class ------------------------------------------------------------------ Bra - eclass[ - cls:[a] - cls:[B] - op: || - ] + [Ba] C Ket End @@ -19697,11 +19693,7 @@ No match /[[A][B]]/B,alt_extended_class ------------------------------------------------------------------ Bra - 
eclass[ - cls:[A] - cls:[B] - op: || - ] + [AB] Ket End ------------------------------------------------------------------ @@ -19718,11 +19710,7 @@ No match /[[A]||[B]]/B,alt_extended_class ------------------------------------------------------------------ Bra - eclass[ - cls:[A] - cls:[B] - op: || - ] + [AB] Ket End ------------------------------------------------------------------ @@ -19737,11 +19725,7 @@ No match /[[^A][B]]/B,alt_extended_class ------------------------------------------------------------------ Bra - eclass[ - cls:[^A] - cls:[B] - op: || - ] + [^A] Ket End ------------------------------------------------------------------ @@ -19756,12 +19740,7 @@ No match /[^[A][B]]/B,alt_extended_class ------------------------------------------------------------------ Bra - eclass[ - cls:[A] - cls:[B] - op: || - op: ^ - ] + [^AB] Ket End ------------------------------------------------------------------ @@ -19776,12 +19755,7 @@ No match /[^[A]&&[B]]/B,alt_extended_class ------------------------------------------------------------------ Bra - eclass[ - cls:[A] - cls:[B] - op: && - op: ^ - ] + AllAny Ket End ------------------------------------------------------------------ @@ -19795,11 +19769,7 @@ No match /[[AC]||[BC]]/B,alt_extended_class ------------------------------------------------------------------ Bra - eclass[ - cls:[AC] - cls:[BC] - op: || - ] + [A-C] Ket End ------------------------------------------------------------------ @@ -19816,11 +19786,7 @@ No match /[[AC]&&[BC]]/B,alt_extended_class ------------------------------------------------------------------ Bra - eclass[ - cls:[AC] - cls:[BC] - op: && - ] + [C] Ket End ------------------------------------------------------------------ @@ -19837,11 +19803,7 @@ No match /[[AC]--[BC]]/B,alt_extended_class ------------------------------------------------------------------ Bra - eclass[ - cls:[AC] - cls:[BC] - op: -- - ] + [A] Ket End ------------------------------------------------------------------ @@ 
-19858,11 +19820,7 @@ No match /[[AC]~~[BC]]/B,alt_extended_class ------------------------------------------------------------------ Bra - eclass[ - cls:[AC] - cls:[BC] - op: ~~ - ] + [AB] Ket End ------------------------------------------------------------------ @@ -19879,11 +19837,7 @@ No match /[A[]]]/B,alt_extended_class ------------------------------------------------------------------ Bra - eclass[ - cls:[A] - cls:[\]] - op: || - ] + [A\]] Ket End ------------------------------------------------------------------ @@ -19898,11 +19852,7 @@ No match /[A[^]]]/B,alt_extended_class ------------------------------------------------------------------ Bra - eclass[ - cls:[A] - cls:[^\]] - op: || - ] + [^\]] Ket End ------------------------------------------------------------------ @@ -19919,11 +19869,7 @@ No match /[A[]]/B,alt_extended_class,allow_empty_class ------------------------------------------------------------------ Bra - eclass[ - cls:[A] - cls:[] - op: || - ] + [A] Ket End ------------------------------------------------------------------ @@ -19938,11 +19884,7 @@ No match /[A[^]]/B,alt_extended_class,allow_empty_class ------------------------------------------------------------------ Bra - eclass[ - cls:[A] - AllAny - op: || - ] + AllAny Ket End ------------------------------------------------------------------ @@ -19958,11 +19900,7 @@ No match /[A-C--B]/B,alt_extended_class ------------------------------------------------------------------ Bra - eclass[ - cls:[A-C] - cls:[B] - op: -- - ] + [AC] Ket End ------------------------------------------------------------------ @@ -19977,12 +19915,7 @@ No match /[^A-C--B]/B,alt_extended_class ------------------------------------------------------------------ Bra - eclass[ - cls:[A-C] - cls:[B] - op: -- - op: ^ - ] + [^AC] Ket End ------------------------------------------------------------------ @@ -19997,11 +19930,7 @@ No match /[[\d\D]--b]/B,alt_extended_class 
------------------------------------------------------------------ Bra - eclass[ - AllAny - cls:[b] - op: -- - ] + [^b] Ket End ------------------------------------------------------------------ @@ -20016,11 +19945,7 @@ No match /[\dAC-E[:space:]&&[^z]]/B,alt_extended_class ------------------------------------------------------------------ Bra - eclass[ - cls:[\x09-\x0d 0-9AC-E] - cls:[^z] - op: && - ] + [\x09-\x0d 0-9AC-E] Ket End ------------------------------------------------------------------ @@ -20047,11 +19972,7 @@ No match /[z||[^\dAC-E[:space:]]]/B,alt_extended_class ------------------------------------------------------------------ Bra - eclass[ - cls:[z] - cls:[^\x09-\x0d 0-9AC-E] - op: || - ] + [^\x09-\x0d 0-9AC-E] Ket End ------------------------------------------------------------------ @@ -20080,11 +20001,7 @@ No match /[ab||cd]/B,alt_extended_class ------------------------------------------------------------------ Bra - eclass[ - cls:[ab] - cls:[cd] - op: || - ] + [a-d] Ket End ------------------------------------------------------------------ @@ -20099,15 +20016,7 @@ No match /[[a]b||[c]d]/B,alt_extended_class ------------------------------------------------------------------ Bra - eclass[ - cls:[a] - cls:[b] - op: || - cls:[c] - cls:[d] - op: || - op: || - ] + [a-d] Ket End ------------------------------------------------------------------ @@ -20122,15 +20031,7 @@ No match /[a[b]||c[d]]/B,alt_extended_class ------------------------------------------------------------------ Bra - eclass[ - cls:[a] - cls:[b] - op: || - cls:[c] - cls:[d] - op: || - op: || - ] + [a-d] Ket End ------------------------------------------------------------------ @@ -20145,11 +20046,7 @@ No match /[-&&-]/B,alt_extended_class ------------------------------------------------------------------ Bra - eclass[ - cls:[\-] - cls:[\-] - op: && - ] + [\-] Ket End ------------------------------------------------------------------ @@ -20162,11 +20059,7 @@ No match 
/[a-&&-a]/B,alt_extended_class ------------------------------------------------------------------ Bra - eclass[ - cls:[\-a] - cls:[\-a] - op: && - ] + [\-a] Ket End ------------------------------------------------------------------ @@ -20181,11 +20074,7 @@ No match /[-a&&a-]/B,alt_extended_class ------------------------------------------------------------------ Bra - eclass[ - cls:[\-a] - cls:[\-a] - op: && - ] + [\-a] Ket End ------------------------------------------------------------------ @@ -20200,15 +20089,7 @@ No match /[[a]-&&-[a]]/B,alt_extended_class ------------------------------------------------------------------ Bra - eclass[ - cls:[a] - cls:[\-] - op: || - cls:[\-] - cls:[a] - op: || - op: && - ] + [\-a] Ket End ------------------------------------------------------------------ @@ -20223,15 +20104,7 @@ No match /[-[a]&&[a]-]/B,alt_extended_class ------------------------------------------------------------------ Bra - eclass[ - cls:[\-] - cls:[a] - op: || - cls:[a] - cls:[\-] - op: || - op: && - ] + [\-a] Ket End ------------------------------------------------------------------ @@ -20247,12 +20120,7 @@ No match ------------------------------------------------------------------ Bra Bra - eclass[ - cls:[a] - cls:[^b] - op: || - op: ^ - ] + [b] Ket Ket End @@ -20270,13 +20138,7 @@ No match /[ ^ a[ ^ b] ]/B,alt_extended_class ------------------------------------------------------------------ Bra - eclass[ - cls:[ ^a] - cls:[ ^b] - op: || - cls:[ ] - op: || - ] + [ ^ab] Ket End ------------------------------------------------------------------ @@ -20295,11 +20157,7 @@ No match /[a-c--b]+/B,alt_extended_class ------------------------------------------------------------------ Bra - eclass[ - cls:[a-c] - cls:[b] - op: -- - ]++ + [ac]++ Ket End ------------------------------------------------------------------ @@ -20314,11 +20172,7 @@ No match /[a-c--b]{2,3}/B,alt_extended_class ------------------------------------------------------------------ Bra - eclass[ 
- cls:[a-c] - cls:[b] - op: -- - ]{2,3}+ + [ac]{2,3}+ Ket End ------------------------------------------------------------------ @@ -20336,11 +20190,7 @@ No match ------------------------------------------------------------------ Bra x - eclass[ - cls:[a-c] - cls:[b] - op: -- - ]++ + [ac]++ y Ket End @@ -20364,15 +20214,7 @@ No match /[A--B--C--D]/B,alt_extended_class ------------------------------------------------------------------ Bra - eclass[ - cls:[A] - cls:[B] - op: -- - cls:[C] - op: -- - cls:[D] - op: -- - ] + [A] Ket End ------------------------------------------------------------------ @@ -20385,13 +20227,7 @@ No match /[A--A--A]/B,alt_extended_class ------------------------------------------------------------------ Bra - eclass[ - cls:[A] - cls:[A] - op: -- - cls:[A] - op: -- - ] + [] Ket End ------------------------------------------------------------------ @@ -20404,13 +20240,7 @@ No match /[[A--A]--A]/B,alt_extended_class ------------------------------------------------------------------ Bra - eclass[ - cls:[A] - cls:[A] - op: -- - cls:[A] - op: -- - ] + [] Ket End ------------------------------------------------------------------ @@ -20423,13 +20253,7 @@ No match /[A--[A--A]]/B,alt_extended_class ------------------------------------------------------------------ Bra - eclass[ - cls:[A] - cls:[A] - cls:[A] - op: -- - op: -- - ] + [A] Ket End ------------------------------------------------------------------ @@ -20442,11 +20266,7 @@ No match /[A--^B]/B,alt_extended_class ------------------------------------------------------------------ Bra - eclass[ - cls:[A] - cls:[B^] - op: -- - ] + [A] Ket End ------------------------------------------------------------------ @@ -20464,11 +20284,7 @@ No match ------------------------------------------------------------------ Bra CBra 1 - eclass[ - cls:[a-z] - cls:[n] - op: -- - ] + [a-mo-z] Ket \1 Ket @@ -20491,11 +20307,7 @@ No match Bra CBra 1 x - eclass[ - cls:[a-z] - cls:[n] - op: -- - ] + [a-mo-z] y Ket \1 @@ 
-20520,11 +20332,7 @@ No match \1 Alt CBra 1 - eclass[ - cls:[a-z] - cls:[n] - op: -- - ] + [a-mo-z] Ket Ket Bra @@ -20532,11 +20340,7 @@ No match \1 Alt CBra 1 - eclass[ - cls:[a-z] - cls:[n] - op: -- - ] + [a-mo-z] Ket Ket Ket @@ -20562,11 +20366,7 @@ No match \1 Alt CBra 1 - eclass[ - cls:[a-z] - cls:[n] - op: -- - ] + [a-mo-z] Ket KetRmax Ket @@ -20588,11 +20388,7 @@ No match /[\d-[z]]/B,alt_extended_class ------------------------------------------------------------------ Bra - eclass[ - cls:[\-0-9] - cls:[z] - op: || - ] + [\-0-9z] Ket End ------------------------------------------------------------------ @@ -20606,11 +20402,7 @@ No match /[\d-||z]/B,alt_extended_class ------------------------------------------------------------------ Bra - eclass[ - cls:[\-0-9] - cls:[z] - op: || - ] + [\-0-9z] Ket End ------------------------------------------------------------------ @@ -20624,11 +20416,7 @@ No match /[z[\d-]]/B,alt_extended_class ------------------------------------------------------------------ Bra - eclass[ - cls:[z] - cls:[\-0-9] - op: || - ] + [\-0-9z] Ket End ------------------------------------------------------------------ @@ -20642,11 +20430,7 @@ No match /[1-[z]]/B,alt_extended_class ------------------------------------------------------------------ Bra - eclass[ - cls:[\-1] - cls:[z] - op: || - ] + [\-1z] Ket End ------------------------------------------------------------------ @@ -20660,11 +20444,7 @@ No match /[1-||z]/B,alt_extended_class ------------------------------------------------------------------ Bra - eclass[ - cls:[\-1] - cls:[z] - op: || - ] + [\-1z] Ket End ------------------------------------------------------------------ @@ -20678,11 +20458,7 @@ No match /[z[1-]]/B,alt_extended_class ------------------------------------------------------------------ Bra - eclass[ - cls:[z] - cls:[\-1] - op: || - ] + [\-1z] Ket End ------------------------------------------------------------------ @@ -20783,11 +20559,7 @@ Failed: error 150 at 
offset 5: invalid range in character class /[abc -- b]+/B,alt_extended_class ------------------------------------------------------------------ Bra - eclass[ - cls:[ a-c] - cls:[ b] - op: -- - ]++ + [ac]++ Ket End ------------------------------------------------------------------ @@ -20818,9 +20590,7 @@ Failed: error 106 at offset 3: missing terminating ] for character class /(?[ []] ])/B,allow_empty_class ------------------------------------------------------------------ Bra - eclass[ - cls:[\]] - ] + [\]] Ket End ------------------------------------------------------------------ diff --git a/testdata/testoutput5 b/testdata/testoutput5 index 80d1fd531..3c25bafeb 100644 --- a/testdata/testoutput5 +++ b/testdata/testoutput5 @@ -6238,9 +6238,10 @@ Failed: error 115 at offset 52: reference to non-existent subpattern ------------------------------------------------------------------ Bra eclass[ - cls:[\p{Lu}] - cls:[\p{Nd}] - op: || + bitmap: [0-9A-Z\xc0-\xd6\xd8-\xde] + xclass: [\p{Lu}] + xclass: [\p{Nd}] + OR ] Ket End @@ -6259,9 +6260,10 @@ No match ------------------------------------------------------------------ Bra eclass[ - cls:[\p{L}] - cls:[\p{Nd}] - op: || + bitmap: [0-9A-Za-z\xaa\xb5\xba\xc0-\xd6\xd8-\xf6\xf8-\xff] + xclass: [\p{L}] + xclass: [\p{Nd}] + OR ] Ket End @@ -6280,9 +6282,10 @@ No match ------------------------------------------------------------------ Bra eclass[ - cls:[\p{Lu}] - cls:[\p{Nd}] - op: || + bitmap: [0-9A-Z\xc0-\xd6\xd8-\xde] + xclass: [\p{Lu}] + xclass: [\p{Nd}] + OR ] Ket End @@ -6299,9 +6302,10 @@ No match ------------------------------------------------------------------ Bra eclass[ - cls:[^\p{L}] - cls:[\p{Nd}] - op: || + bitmap: [^A-Za-z\xaa\xb5\xba\xc0-\xd6\xd8-\xf6\xf8-\xff] + xclass: [^\p{L}] + xclass: [\p{Nd}] + OR ] Ket End @@ -6318,10 +6322,10 @@ No match ------------------------------------------------------------------ Bra eclass[ - cls:[\p{L}] - cls:[\p{Nd}] - op: || - op: ^ + bitmap: 
[^0-9A-Za-z\xaa\xb5\xba\xc0-\xd6\xd8-\xf6\xf8-\xff] + xclass: [^\p{L}] + xclass: [^\p{Nd}] + AND ] Ket End @@ -6338,10 +6342,10 @@ No match ------------------------------------------------------------------ Bra eclass[ - cls:[\p{L}] - cls:[\p{Nd}] - op: && - op: ^ + bitmap: [^] + xclass: [^\p{L}] + xclass: [^\p{Nd}] + OR ] Ket End @@ -6355,9 +6359,10 @@ No match ------------------------------------------------------------------ Bra eclass[ - cls:[\p{Lu}\p{Ll}] - cls:[\p{Nd}\p{Ll}] - op: || + bitmap: [0-9A-Za-z\xb5\xc0-\xd6\xd8-\xf6\xf8-\xff] + xclass: [\p{Lu}\p{Ll}] + xclass: [\p{Nd}\p{Ll}] + OR ] Ket End @@ -6376,9 +6381,10 @@ No match ------------------------------------------------------------------ Bra eclass[ - cls:[\p{Lu}\p{Ll}] - cls:[\p{Nd}\p{Ll}] - op: && + bitmap: [a-z\xb5\xdf-\xf6\xf8-\xff] + xclass: [\p{Lu}\p{Ll}] + xclass: [\p{Nd}\p{Ll}] + AND ] Ket End @@ -6397,9 +6403,10 @@ No match ------------------------------------------------------------------ Bra eclass[ - cls:[\p{Lu}\p{Ll}] - cls:[\p{Nd}\p{Ll}] - op: -- + bitmap: [A-Z\xc0-\xd6\xd8-\xde] + xclass: [\p{Lu}\p{Ll}] + xclass: [^\p{Nd}\p{Ll}] + AND ] Ket End @@ -6418,9 +6425,10 @@ No match ------------------------------------------------------------------ Bra eclass[ - cls:[\p{Lu}\p{Ll}] - cls:[\p{Nd}\p{Ll}] - op: ~~ + bitmap: [0-9A-Z\xc0-\xd6\xd8-\xde] + xclass: [\p{Lu}\p{Ll}] + xclass: [\p{Nd}\p{Ll}] + XOR ] Ket End @@ -6438,11 +6446,7 @@ No match /[\pL[]]]/B,alt_extended_class ------------------------------------------------------------------ Bra - eclass[ - cls:[\p{L}] - cls:[\]] - op: || - ] + [A-Z\]a-z\xaa\xb5\xba\xc0-\xd6\xd8-\xf6\xf8-\xff\p{L}] Ket End ------------------------------------------------------------------ @@ -6457,11 +6461,7 @@ No match /[\pL[^]]]/B,alt_extended_class ------------------------------------------------------------------ Bra - eclass[ - cls:[\p{L}] - cls:[^\]] - op: || - ] + [^\]] Ket End ------------------------------------------------------------------ @@ -6478,11 
+6478,7 @@ No match /[\pL[]]/B,alt_extended_class,allow_empty_class ------------------------------------------------------------------ Bra - eclass[ - cls:[\p{L}] - cls:[] - op: || - ] + [A-Za-z\xaa\xb5\xba\xc0-\xd6\xd8-\xf6\xf8-\xff\p{L}] Ket End ------------------------------------------------------------------ @@ -6497,11 +6493,7 @@ No match /[\pL[^]]/B,alt_extended_class,allow_empty_class ------------------------------------------------------------------ Bra - eclass[ - cls:[\p{L}] - AllAny - op: || - ] + AllAny Ket End ------------------------------------------------------------------ @@ -6517,11 +6509,7 @@ No match /[\dAC-E[:space:]\p{Lu}&&[^z]]/B,alt_extended_class ------------------------------------------------------------------ Bra - eclass[ - cls:[\x09-\x0d 0-9A-Z\xc0-\xd6\xd8-\xde\p{Lu}] - cls:[^z] - op: && - ] + [\x09-\x0d 0-9A-Z\xc0-\xd6\xd8-\xde\p{Lu}] Ket End ------------------------------------------------------------------ @@ -6546,11 +6534,7 @@ No match /[z||[^\dAC-E[:space:]\p{Lu}]]/B,alt_extended_class ------------------------------------------------------------------ Bra - eclass[ - cls:[z] - cls:[^\x09-\x0d 0-9A-Z\xc0-\xd6\xd8-\xde\p{Lu}] - op: || - ] + [^\x09-\x0d 0-9A-Z\xc0-\xd6\xd8-\xde\p{Lu}] Ket End ------------------------------------------------------------------ @@ -6579,11 +6563,7 @@ No match /[\p{Lu}\p{Nd}||cd]/B,alt_extended_class ------------------------------------------------------------------ Bra - eclass[ - cls:[\p{Lu}\p{Nd}] - cls:[cd] - op: || - ] + [0-9A-Zcd\xc0-\xd6\xd8-\xde\p{Lu}\p{Nd}] Ket End ------------------------------------------------------------------ @@ -6601,13 +6581,10 @@ No match ------------------------------------------------------------------ Bra eclass[ - cls:[\p{Lu}] - cls:[\p{Nd}] - op: || - cls:[c] - cls:[d] - op: || - op: || + bitmap: [0-9A-Zcd\xc0-\xd6\xd8-\xde] + xclass: [\p{Lu}] + xclass: [\p{Nd}] + OR ] Ket End @@ -6626,13 +6603,10 @@ No match 
------------------------------------------------------------------ Bra eclass[ - cls:[\p{Lu}] - cls:[\p{Nd}] - op: || - cls:[c] - cls:[d] - op: || - op: || + bitmap: [0-9A-Zcd\xc0-\xd6\xd8-\xde] + xclass: [\p{Lu}] + xclass: [\p{Nd}] + OR ] Ket End @@ -6783,9 +6757,10 @@ Failed: error 150 at offset 6: invalid range in character class ------------------------------------------------------------------ Bra eclass[ - cls:[\-A-Z\xc0-\xd6\xd8-\xde\p{Lu}] - cls:[\-A-Za-z\xaa\xb5\xba\xc0-\xd6\xd8-\xf6\xf8-\xff\p{L}] - op: && + bitmap: [\-A-Z\xc0-\xd6\xd8-\xde] + xclass: [\p{Lu}] + xclass: [\p{L}] + AND ] Ket End @@ -6802,9 +6777,10 @@ No match ------------------------------------------------------------------ Bra eclass[ - cls:[\-A-Z\xc0-\xd6\xd8-\xde\p{Lu}] - cls:[\-A-Za-z\xaa\xb5\xba\xc0-\xd6\xd8-\xf6\xf8-\xff\p{L}] - op: && + bitmap: [\-A-Z\xc0-\xd6\xd8-\xde] + xclass: [\p{Lu}] + xclass: [\p{L}] + AND ] Ket End @@ -6821,13 +6797,10 @@ No match ------------------------------------------------------------------ Bra eclass[ - cls:[\p{Lu}] - cls:[\-] - op: || - cls:[\-] - cls:[\p{L}] - op: || - op: && + bitmap: [\-A-Z\xc0-\xd6\xd8-\xde] + xclass: [\p{Lu}] + xclass: [\p{L}] + AND ] Ket End @@ -6844,13 +6817,10 @@ No match ------------------------------------------------------------------ Bra eclass[ - cls:[\-] - cls:[\p{Lu}] - op: || - cls:[\p{L}] - cls:[\-] - op: || - op: && + bitmap: [\-A-Z\xc0-\xd6\xd8-\xde] + xclass: [\p{Lu}] + xclass: [\p{L}] + AND ] Ket End @@ -6867,12 +6837,7 @@ No match ------------------------------------------------------------------ Bra Bra - eclass[ - cls:[5] - cls:[^\p{Nd}] - op: || - op: ^ - ] + [0-46-9\p{Nd}] Ket Ket End @@ -6891,12 +6856,7 @@ No match ------------------------------------------------------------------ Bra Bra - eclass[ - cls:[\p{Nd}] - cls:[^5] - op: || - op: ^ - ] + [] Ket Ket End @@ -6916,10 +6876,9 @@ No match Bra Bra eclass[ - cls:[\p{Nd}] - cls:[^\p{Nd}] - op: || - op: ^ + xclass: [^\p{Nd}] + xclass: [\p{Nd}] + AND ] Ket 
Ket @@ -6939,11 +6898,10 @@ No match ------------------------------------------------------------------ Bra eclass[ - cls:[ ^a-z\xb5\xdf-\xf6\xf8-\xff\p{Ll}] - cls:[ 0-9^\p{Nd}] - op: || - cls:[ ] - op: || + bitmap: [ 0-9^a-z\xb5\xdf-\xf6\xf8-\xff] + xclass: [\p{Ll}] + xclass: [\p{Nd}] + OR ] Ket End @@ -6965,11 +6923,7 @@ No match /[a-c--\p{Nd}]+/B,alt_extended_class ------------------------------------------------------------------ Bra - eclass[ - cls:[a-c] - cls:[\p{Nd}] - op: -- - ]++ + [a-c]++ Ket End ------------------------------------------------------------------ @@ -6984,11 +6938,7 @@ No match /[a-c--\p{Nd}]{2,3}/B,alt_extended_class ------------------------------------------------------------------ Bra - eclass[ - cls:[a-c] - cls:[\p{Nd}] - op: -- - ]{2,3}+ + [a-c]{2,3}+ Ket End ------------------------------------------------------------------ @@ -7006,11 +6956,7 @@ No match ------------------------------------------------------------------ Bra x - eclass[ - cls:[a-c] - cls:[\p{Nd}] - op: -- - ]++ + [a-c]++ y Ket End @@ -7035,11 +6981,11 @@ No match ------------------------------------------------------------------ Bra eclass[ - cls:[\p{L}] - cls:[\p{L}] - op: -- - cls:[\p{L}] - op: -- + xclass: [\p{L}] + xclass: [^\p{L}] + AND + xclass: [^\p{L}] + AND ] Ket End @@ -7054,11 +7000,11 @@ No match ------------------------------------------------------------------ Bra eclass[ - cls:[\p{L}] - cls:[\p{L}] - op: -- - cls:[\p{L}] - op: -- + xclass: [\p{L}] + xclass: [^\p{L}] + AND + xclass: [^\p{L}] + AND ] Ket End @@ -7073,11 +7019,12 @@ No match ------------------------------------------------------------------ Bra eclass[ - cls:[\p{L}] - cls:[\p{L}] - cls:[\p{L}] - op: -- - op: -- + bitmap: [A-Za-z\xaa\xb5\xba\xc0-\xd6\xd8-\xf6\xf8-\xff] + xclass: [\p{L}] + xclass: [^\p{L}] + xclass: [\p{L}] + OR + AND ] Ket End @@ -7092,9 +7039,10 @@ No match ------------------------------------------------------------------ Bra eclass[ - cls:[\p{L}] - cls:[0-9^\p{Nd}] - 
op: -- + bitmap: [A-Za-z\xaa\xb5\xba\xc0-\xd6\xd8-\xf6\xf8-\xff] + xclass: [\p{L}] + xclass: [^\p{Nd}] + AND ] Ket End @@ -7111,13 +7059,7 @@ No match ------------------------------------------------------------------ Bra CBra 1 - eclass[ - cls:[a-z] - cls:[\p{L}] - cls:[n] - op: && - op: -- - ] + [a-mo-z] Ket \1 Ket @@ -7140,13 +7082,7 @@ No match Bra CBra 1 x - eclass[ - cls:[a-z] - cls:[\p{L}] - cls:[n] - op: && - op: -- - ] + [a-mo-z] y Ket \1 @@ -7171,13 +7107,7 @@ No match \1 Alt CBra 1 - eclass[ - cls:[a-z] - cls:[\p{L}] - cls:[n] - op: && - op: -- - ] + [a-mo-z] Ket Ket Bra @@ -7185,13 +7115,7 @@ No match \1 Alt CBra 1 - eclass[ - cls:[a-z] - cls:[\p{L}] - cls:[n] - op: && - op: -- - ] + [a-mo-z] Ket Ket Ket @@ -7217,13 +7141,7 @@ No match \1 Alt CBra 1 - eclass[ - cls:[a-z] - cls:[\p{L}] - cls:[n] - op: && - op: -- - ] + [a-mo-z] Ket KetRmax Ket @@ -7246,11 +7164,12 @@ No match ------------------------------------------------------------------ Bra eclass[ - cls:[\p{Nd}] - cls:[\p{L}] - cls:[\p{Lu}] - op: -- - op: || + bitmap: [0-9a-z\xaa\xb5\xba\xdf-\xf6\xf8-\xff] + xclass: [\p{Nd}] + xclass: [\p{L}] + xclass: [^\p{Lu}] + AND + OR ] Ket End @@ -7266,11 +7185,7 @@ No match /[\P{Nd}||2]/B,alt_extended_class ------------------------------------------------------------------ Bra - eclass[ - cls:[\P{Nd}] - cls:[2] - op: || - ] + [\x00-/2:-\xff\P{Nd}] Ket End ------------------------------------------------------------------ @@ -7289,10 +7204,7 @@ No match /[^[\P{Nd}]]/B,alt_extended_class ------------------------------------------------------------------ Bra - eclass[ - cls:[\P{Nd}] - op: ^ - ] + [^\x00-/:-\xff\P{Nd}] Ket End ------------------------------------------------------------------ @@ -7312,9 +7224,10 @@ No match ------------------------------------------------------------------ Bra eclass[ - cls:[\p{Lu}] - cls:[\p{Ll}] - op: ~~ + bitmap: [A-Za-z\xb5\xc0-\xd6\xd8-\xf6\xf8-\xff] + xclass: [\p{Lu}] + xclass: [\p{Ll}] + XOR ] Ket End @@ -7333,9 +7246,10 
@@ No match ------------------------------------------------------------------ Bra eclass[ - cls:[1A-Za-z\xb5\xc0-\xd6\xd8-\xf6\xf8-\xff\p{Lc}] - cls:[\p{Lc}] - op: ~~ + bitmap: [1] + xclass: [\p{Lc}] + xclass: [\p{Lc}] + XOR ] Ket End @@ -7354,9 +7268,10 @@ No match ------------------------------------------------------------------ Bra eclass[ - cls:[1A-Z\xc0-\xd6\xd8-\xde\p{Lu}] - cls:[1a-z\xb5\xdf-\xf6\xf8-\xff\p{Ll}] - op: && + bitmap: [1] + xclass: [\p{Lu}] + xclass: [\p{Ll}] + AND ] Ket End @@ -7377,9 +7292,10 @@ No match ------------------------------------------------------------------ Bra eclass[ - cls:[1A-Za-z\xb5\xc0-\xd6\xd8-\xf6\xf8-\xff\p{Lc}] - cls:[1A-Za-z\xb5\xc0-\xd6\xd8-\xf6\xf8-\xff\p{Lc}] - op: && + bitmap: [1A-Za-z\xb5\xc0-\xd6\xd8-\xf6\xf8-\xff] + xclass: [\p{Lc}] + xclass: [\p{Lc}] + AND ] Ket End