Skip to content

Commit

Permalink
Update EBCDIC support to support testing on normal ASCII systems (#656)
Browse files Browse the repository at this point in the history
The pcre2test utility needs quite a few changes to accommodate this.
It is simpler to add a new mode to it, than to make it fully
EBCDIC-native. On an ASCII system, pcre2test performs ASCII I/O, but
translates the input when passing it to the fully-EBCDIC-supporting
library.
  • Loading branch information
NWilson authored Feb 12, 2025
1 parent ce6e960 commit 0d0ac3a
Show file tree
Hide file tree
Showing 63 changed files with 3,478 additions and 1,577 deletions.
64 changes: 55 additions & 9 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,10 @@ set(

set(PCRE2_EBCDIC_NL25 OFF CACHE BOOL "Use 0x25 as EBCDIC NL character instead of 0x15; implies EBCDIC.")

set(PCRE2_EBCDIC_IGNORING_COMPILER OFF CACHE BOOL "Force EBCDIC 1047 using numeric literals rather than C character literals; implies EBCDIC.")

option(PCRE2_REBUILD_CHARTABLES "Rebuild char tables" OFF)

set(
PCRE2_LINK_SIZE
"2"
Expand Down Expand Up @@ -579,13 +583,42 @@ if(NEWLINE_DEFAULT STREQUAL "")
)
endif()

# REBUILD_CHARTABLES is the internal switch that decides whether the character
# tables are regenerated. At this point it simply mirrors the user-facing
# PCRE2_REBUILD_CHARTABLES option.
if(PCRE2_REBUILD_CHARTABLES)
  set(REBUILD_CHARTABLES ON)
else()
  set(REBUILD_CHARTABLES OFF)
endif()

# Derive the internal EBCDIC settings from the user-facing PCRE2_EBCDIC*
# cache options. EBCDIC stays OFF unless one of the options below enables it.
# NOTE(review): EBCDIC and EBCDIC_NL25 are assigned both 1 and ON in the
# branches below; this looks like removed and added diff lines shown side by
# side -- confirm which assignment the real file keeps.
set(EBCDIC OFF)
if(PCRE2_EBCDIC)
set(EBCDIC 1)
set(EBCDIC ON)
endif()

# Selecting the NL=0x25 variant also switches EBCDIC on, even when
# PCRE2_EBCDIC itself was not set.
if(PCRE2_EBCDIC_NL25)
set(EBCDIC 1)
set(EBCDIC_NL25 1)
set(EBCDIC ON)
set(EBCDIC_NL25 ON)
endif()

# Likewise, the "ignoring compiler" variant switches EBCDIC on by itself.
if(PCRE2_EBCDIC_IGNORING_COMPILER)
set(EBCDIC ON)
set(EBCDIC_IGNORING_COMPILER ON)
endif()

# Consistency rules for EBCDIC builds:
#  - Unless EBCDIC_IGNORING_COMPILER is set, an EBCDIC build always
#    regenerates the character tables (REBUILD_CHARTABLES is forced ON).
#  - EBCDIC and Unicode (UTF) support cannot be combined in one build, because
#    PCRE2 would need different character constants depending on the mode.
#  - EBCDIC is only available for the 8-bit library, not the 16-bit or 32-bit
#    ones.
if(EBCDIC AND NOT EBCDIC_IGNORING_COMPILER)
  set(REBUILD_CHARTABLES ON)
endif()

if(EBCDIC AND PCRE2_SUPPORT_UNICODE)
  message(FATAL_ERROR "Support for EBCDIC and Unicode cannot be enabled at the same time")
endif()

if(EBCDIC AND (PCRE2_BUILD_PCRE2_16 OR PCRE2_BUILD_PCRE2_32))
  message(FATAL_ERROR "EBCDIC support is available only for the 8-bit library")
endif()

# Output files
Expand Down Expand Up @@ -659,8 +692,7 @@ endif()

# Character table generation

option(PCRE2_REBUILD_CHARTABLES "Rebuild char tables" OFF)
if(PCRE2_REBUILD_CHARTABLES)
if(REBUILD_CHARTABLES)
add_executable(pcre2_dftables src/pcre2_dftables.c)
add_custom_command(
OUTPUT ${PROJECT_BINARY_DIR}/pcre2_chartables.c
Expand All @@ -670,8 +702,12 @@ if(PCRE2_REBUILD_CHARTABLES)
COMMENT "Generating character tables (pcre2_chartables.c) for current locale"
VERBATIM
)
else()
elseif(NOT EBCDIC)
  # Not an EBCDIC build: use the distributed default (ASCII) tables.
  # The derived EBCDIC variable is tested rather than PCRE2_EBCDIC, because
  # PCRE2_EBCDIC_NL25 and PCRE2_EBCDIC_IGNORING_COMPILER each imply EBCDIC
  # without PCRE2_EBCDIC itself being set; testing PCRE2_EBCDIC would copy the
  # ASCII tables into such an EBCDIC build.
  configure_file(${PROJECT_SOURCE_DIR}/src/pcre2_chartables.c.dist ${PROJECT_BINARY_DIR}/pcre2_chartables.c COPYONLY)
elseif(EBCDIC_NL25)
  # EBCDIC build with NL=0x25 and no table rebuild: use the pre-built
  # EBCDIC 1047 tables for that NL value.
  configure_file(${PROJECT_SOURCE_DIR}/src/pcre2_chartables.c.ebcdic-1047-nl25 ${PROJECT_BINARY_DIR}/pcre2_chartables.c COPYONLY)
else()
  # EBCDIC build with the default NL=0x15 and no table rebuild.
  configure_file(${PROJECT_SOURCE_DIR}/src/pcre2_chartables.c.ebcdic-1047-nl15 ${PROJECT_BINARY_DIR}/pcre2_chartables.c COPYONLY)
endif()

# Source code
Expand Down Expand Up @@ -1345,9 +1381,19 @@ if(PCRE2_SHOW_REPORT)
message(STATUS " Newline char/sequence ............. : ${PCRE2_NEWLINE}")
message(STATUS " \\R matches only ANYCRLF ........... : ${PCRE2_SUPPORT_BSR_ANYCRLF}")
message(STATUS " \\C is disabled .................... : ${PCRE2_NEVER_BACKSLASH_C}")
message(STATUS " EBCDIC coding ..................... : ${PCRE2_EBCDIC}")
message(STATUS " EBCDIC coding with NL=0x25 ........ : ${PCRE2_EBCDIC_NL25}")
message(STATUS " Rebuild char tables ............... : ${PCRE2_REBUILD_CHARTABLES}")

# Choose the NL code point shown in the configuration report: "n/a" for a
# non-EBCDIC build, otherwise 0x25 when EBCDIC_NL25 is set and 0x15 when not.
set(EBCDIC_NL_CODE "n/a")
if(EBCDIC)
  if(EBCDIC_NL25)
    set(EBCDIC_NL_CODE "0x25")
  else()
    set(EBCDIC_NL_CODE "0x15")
  endif()
endif()
message(STATUS " EBCDIC coding ..................... : ${EBCDIC}")
message(STATUS " EBCDIC code for NL ................ : ${EBCDIC_NL_CODE}")
message(STATUS " EBCDIC coding ignoring compiler ... : ${PCRE2_EBCDIC_IGNORING_COMPILER}")
message(STATUS " Rebuild char tables ............... : ${REBUILD_CHARTABLES}")

message(STATUS " Internal link size ................ : ${PCRE2_LINK_SIZE}")
message(STATUS " Maximum variable lookbehind ....... : ${PCRE2_MAX_VARLOOKBEHIND}")
message(STATUS " Parentheses nest limit ............ : ${PCRE2_PARENS_NEST_LIMIT}")
Expand Down
23 changes: 20 additions & 3 deletions Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -362,9 +362,21 @@ src/pcre2_chartables.c: pcre2_dftables$(EXEEXT)
rm -f $@
./pcre2_dftables$(EXEEXT) $@
else
if WITH_EBCDIC
if WITH_EBCDIC_NL25
src/pcre2_chartables.c: $(srcdir)/src/pcre2_chartables.c.ebcdic-1047-nl25
rm -f $@
$(LN_S) $(abs_srcdir)/src/pcre2_chartables.c.ebcdic-1047-nl25 $(abs_builddir)/src/pcre2_chartables.c
else # WITH_EBCDIC_NL25
src/pcre2_chartables.c: $(srcdir)/src/pcre2_chartables.c.ebcdic-1047-nl15
rm -f $@
$(LN_S) $(abs_srcdir)/src/pcre2_chartables.c.ebcdic-1047-nl15 $(abs_builddir)/src/pcre2_chartables.c
endif # WITH_EBCDIC_NL25
else # WITH_EBCDIC
src/pcre2_chartables.c: $(srcdir)/src/pcre2_chartables.c.dist
rm -f $@
$(LN_S) $(abs_srcdir)/src/pcre2_chartables.c.dist $(abs_builddir)/src/pcre2_chartables.c
endif # WITH_EBCDIC
endif # WITH_REBUILD_CHARTABLES

BUILT_SOURCES = src/pcre2_chartables.c
Expand Down Expand Up @@ -460,7 +472,10 @@ endif # WITH_PCRE2_32
# The pcre2_chartables.c.dist file is the default version of
# pcre2_chartables.c, used unless --enable-rebuild-chartables is specified.

EXTRA_DIST += src/pcre2_chartables.c.dist
EXTRA_DIST += \
src/pcre2_chartables.c.dist \
src/pcre2_chartables.c.ebcdic-1047-nl15 \
src/pcre2_chartables.c.ebcdic-1047-nl25
CLEANFILES += src/pcre2_chartables.c

# The JIT compiler lives in a separate directory, but its files are #included
Expand Down Expand Up @@ -768,7 +783,8 @@ EXTRA_DIST += \
testdata/testinput25 \
testdata/testinput26 \
testdata/testinput27 \
testdata/testinputEBC \
testdata/testinput28 \
testdata/testinput29 \
testdata/testinputheap \
testdata/testoutput1 \
testdata/testoutput2 \
Expand Down Expand Up @@ -810,7 +826,8 @@ EXTRA_DIST += \
testdata/testoutput25 \
testdata/testoutput26 \
testdata/testoutput27 \
testdata/testoutputEBC \
testdata/testoutput28 \
testdata/testoutput29 \
testdata/testoutputheap-16 \
testdata/testoutputheap-32 \
testdata/testoutputheap-8 \
Expand Down
37 changes: 30 additions & 7 deletions README
Original file line number Diff line number Diff line change
Expand Up @@ -309,11 +309,22 @@ library. They are also documented in the pcre2build man page.

--enable-ebcdic --disable-unicode

This automatically implies --enable-rebuild-chartables (see above). However,
when PCRE2 is built this way, it always operates in EBCDIC. It cannot support
both EBCDIC and UTF-8/16/32. There is a second option, --enable-ebcdic-nl25,
which specifies that the code value for the EBCDIC NL character is 0x25
instead of the default 0x15.
This automatically implies --enable-rebuild-chartables (see above), in order
to ensure that you have the correct default character tables for your system's
codepage. There is an exception when you set --enable-ebcdic-ignoring-compiler
(see below), which allows using a default set of EBCDIC 1047 character tables
rather than forcing use of --enable-rebuild-chartables.

When PCRE2 is built with EBCDIC support, it always operates in EBCDIC. It
cannot support both EBCDIC and ASCII or UTF-8/16/32.

There is a second option, --enable-ebcdic-nl25, which specifies that the code
value for the EBCDIC NL character is 0x25 instead of the default 0x15.

There is a third option, --enable-ebcdic-ignoring-compiler, which disregards
the compiler's codepage for determining the numeric value of C character
constants such as 'z', and instead forces PCRE2 to use numeric constants for
the EBCDIC 1047 codepage.

. If you specify --enable-debug, additional debugging code is included in the
build. This option is intended for use by the PCRE2 maintainers.
Expand Down Expand Up @@ -744,8 +755,16 @@ and with UTF support, respectively. Test 23 tests \C when it is locked out.
Tests 24 and 25 test the experimental pattern conversion functions, without and
with UTF support, respectively.

Test 26 checks Unicode property support using tests that are generated
automatically from the Unicode data tables.
Test 26 checks Unicode property support using tests that were generated
automatically from the Unicode data tables. This is the archived version of
the tests from Unicode 15.

Test 27 checks Unicode property support using tests that are generated
automatically from the currently-used Unicode data tables.

Test 28 tests EBCDIC support, and is only run when PCRE2 is specifically
compiled for EBCDIC. Test 29 tests EBCDIC when NL has been configured to be
0x25.


Character tables
Expand Down Expand Up @@ -822,6 +841,10 @@ The distribution should contain the files listed below.
src/pcre2_chartables.c.dist a default set of character tables that assume
ASCII coding; unless --enable-rebuild-chartables is
specified, used by copying to pcre2_chartables.c
src/pcre2_chartables.c.ebcdic-1047-{nl15,nl25} a default set of character
tables for EBCDIC 1047; used if
--enable-ebcdic-ignoring-compiler is specified
without --enable-rebuild-chartables

src/pcre2posix.c )
src/pcre2_auto_possess.c )
Expand Down
Loading

0 comments on commit 0d0ac3a

Please sign in to comment.