Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

(#220) Switch to lookup tables in hotspots isNameChar()/isNameStartChar() #221

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions release-notes/VERSION
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ Project: woodstox
#213: SAX: `Locator#getSystemId` and `Locator#getPublicId` are not
available during `startDocument` event
(fix contributed by Philipp N)
#221: Switch to lookup tables in hotspots `isNameChar()`/`isNameStartChar()`
(contributed by @winfriedgerlach)

7.1.0 (22-Oct-2024)

Expand Down
45 changes: 25 additions & 20 deletions src/main/java/com/ctc/wstx/io/WstxInputData.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@

import com.ctc.wstx.util.XmlChars;

import java.util.stream.IntStream;

/**
* Base class used by readers (specifically, by
* {@link com.ctc.wstx.sr.StreamScanner}, and its sub-classes)
Expand Down Expand Up @@ -50,6 +52,23 @@ public class WstxInputData
*/
public final static int MAX_UNICODE_CHAR = 0x10FFFF;

/*
 * Lookup table indexed by 7-bit ASCII code (0-127). An entry is true only
 * for the letters 'a'-'z' and 'A'-'Z' and for the underscore; every other
 * slot keeps the array default of false.
 */
private static final boolean[] asciiNameStartChars = new boolean[128];
static {
    for (int code = 'a'; code <= 'z'; ++code) {
        asciiNameStartChars[code] = true;
    }
    for (int code = 'A'; code <= 'Z'; ++code) {
        asciiNameStartChars[code] = true;
    }
    asciiNameStartChars['_'] = true;
}

/*
 * Lookup table indexed by 7-bit ASCII code (0-127). An entry is true only
 * for the letters 'a'-'z' and 'A'-'Z', the digits '0'-'9', and the three
 * punctuation characters '.', '-' and '_'; every other slot keeps the
 * array default of false.
 */
private static final boolean[] asciiNameChars = new boolean[128];
static {
    for (int code = 'a'; code <= 'z'; ++code) {
        asciiNameChars[code] = true;
    }
    for (int code = 'A'; code <= 'Z'; ++code) {
        asciiNameChars[code] = true;
    }
    for (int code = '0'; code <= '9'; ++code) {
        asciiNameChars[code] = true;
    }
    asciiNameChars['.'] = true;
    asciiNameChars['-'] = true;
    asciiNameChars['_'] = true;
}

/*
////////////////////////////////////////////////////
// Configuration
Expand Down Expand Up @@ -153,14 +172,9 @@ protected final boolean isNameStartChar(char c)
/* First, let's handle 7-bit ascii range (identical between xml
* 1.0 and 1.1)
*/
if (c <= 0x7A) { // 'z' or earlier
if (c >= 0x61) { // 'a' - 'z' are ok
return true;
}
if (c < 0x41) { // before 'A' just white space
return false;
}
return (c <= 0x5A) || (c == '_'); // 'A' - 'Z' and '_' are ok
if (c < 128) {
// this is performance critical, so we use a lookup table instead of if-branches
return asciiNameStartChars[c];
}
/* Ok, otherwise need to use a big honking bit sets... which
* differ between 1.0 and 1.1
Expand All @@ -178,18 +192,9 @@ protected final boolean isNameStartChar(char c)
protected final boolean isNameChar(char c)
{
// First, let's handle 7-bit ascii range
if (c <= 0x7A) { // 'z' or earlier
if (c >= 0x61) { // 'a' - 'z' are ok
return true;
}
if (c <= 0x5A) {
if (c >= 0x41) { // 'A' - 'Z' ok too
return true;
}
// As are 0-9, '.' and '-'
return (c >= 0x30 && c <= 0x39) || (c == '.') || (c == '-');
}
return (c == 0x5F); // '_' is ok too
if (c < 128) {
// this is performance critical, so we use a lookup table instead of if-branches
return asciiNameChars[c];
}
return mXml11 ? XmlChars.is11NameChar(c) : XmlChars.is10NameChar(c);
}
Expand Down
77 changes: 77 additions & 0 deletions src/test/java/com/ctc/wstx/io/WstxInputDataTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
package com.ctc.wstx.io;

import com.ctc.wstx.util.XmlChars;
import junit.framework.TestCase;
import org.junit.Test;

import java.util.stream.IntStream;

/**
 * Checks that {@code WstxInputData.isNameStartChar(char)} and
 * {@code WstxInputData.isNameChar(char)} return the same answers as the
 * branch-based implementations they replaced, for both XML 1.0 and XML 1.1
 * mode.
 * <p>
 * The class extends the JUnit 3 {@link TestCase} and its methods follow the
 * {@code test*} naming convention; the {@code @Test} annotations are kept
 * from the original so either discovery style finds the tests.
 */
public class WstxInputDataTest extends TestCase {

    /*
     * Range of int values (start inclusive, end exclusive) that are cast to
     * char and fed to the methods under test. The non-negative part covers
     * the whole 7-bit ASCII range 0..127 plus the ten following characters
     * 128..137. The negative part does NOT lie "left" of zero once cast:
     * narrowing -10..-1 to char yields U+FFF6..U+FFFF, so those values
     * exercise the non-ASCII path at the top of the char range.
     */
    private static final int RANGE_START = -10;
    private static final int RANGE_END = 138;

    @Test
    public void testIsNameStartCharBehavesSameAsBranchyVersion() {
        WstxInputData xml10Input = new WstxInputData();
        WstxInputData xml11Input = new WstxInputData();
        xml11Input.mXml11 = true;

        for (int i = RANGE_START; i < RANGE_END; ++i) {
            char c = (char) i;
            assertEquals(describe("isNameStartChar", c, false),
                    isNameStartCharBranchy(c, false), xml10Input.isNameStartChar(c));
            assertEquals(describe("isNameStartChar", c, true),
                    isNameStartCharBranchy(c, true), xml11Input.isNameStartChar(c));
        }
    }

    @Test
    public void testIsNameCharBehavesSameAsBranchyVersion() {
        WstxInputData xml10Input = new WstxInputData();
        WstxInputData xml11Input = new WstxInputData();
        xml11Input.mXml11 = true;

        for (int i = RANGE_START; i < RANGE_END; ++i) {
            char c = (char) i;
            assertEquals(describe("isNameChar", c, false),
                    isNameCharBranchy(c, false), xml10Input.isNameChar(c));
            assertEquals(describe("isNameChar", c, true),
                    isNameCharBranchy(c, true), xml11Input.isNameChar(c));
        }
    }

    // Builds the assertion message so a failure names the method, the
    // offending code unit and the XML version that disagreed.
    private static String describe(String method, char c, boolean xml11) {
        return String.format("%s(U+%04X) in XML %s mode", method, (int) c, xml11 ? "1.1" : "1.0");
    }

    // Previous implementation with branches; serves as the reference result.
    private static boolean isNameStartCharBranchy(char c, boolean xml11) {
        /* First, let's handle 7-bit ascii range (identical between xml
         * 1.0 and 1.1)
         */
        if (c <= 0x7A) { // 'z' or earlier
            if (c >= 0x61) { // 'a' - 'z' are ok
                return true;
            }
            if (c < 0x41) { // nothing below 'A' is accepted
                return false;
            }
            return (c <= 0x5A) || (c == '_'); // 'A' - 'Z' and '_' are ok
        }
        /* Everything above 'z' is delegated to the XmlChars checks, which
         * differ between 1.0 and 1.1
         */
        return xml11 ? XmlChars.is11NameStartChar(c) : XmlChars.is10NameStartChar(c);
    }

    // Previous implementation with branches; serves as the reference result.
    private static boolean isNameCharBranchy(char c, boolean xml11) {
        // First, let's handle 7-bit ascii range
        if (c <= 0x7A) { // 'z' or earlier
            if (c >= 0x61) { // 'a' - 'z' are ok
                return true;
            }
            if (c <= 0x5A) {
                if (c >= 0x41) { // 'A' - 'Z' ok too
                    return true;
                }
                // As are 0-9, '.' and '-'
                return (c >= 0x30 && c <= 0x39) || (c == '.') || (c == '-');
            }
            return (c == 0x5F); // '_' is ok too
        }
        // Everything above 'z' is delegated to the XmlChars checks
        return xml11 ? XmlChars.is11NameChar(c) : XmlChars.is10NameChar(c);
    }
}
Loading