Skip to content

Commit

Permalink
Merge pull request apache#113 from sun-rui/stringHashCodeInC
Browse files Browse the repository at this point in the history
Implement string hash code in C.
  • Loading branch information
shivaram committed Dec 12, 2014
2 parents 7d81b05 + a7d9cdb commit d83c017
Show file tree
Hide file tree
Showing 5 changed files with 45 additions and 38 deletions.
1 change: 1 addition & 0 deletions pkg/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -61,3 +61,4 @@ export(
"setCheckpointDir"
)
export("sparkR.init")
useDynLib(SparkR, stringHashCode)
33 changes: 1 addition & 32 deletions pkg/R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -146,27 +146,6 @@ getDependencies <- function(name) {
binData
}

# Fold a 'numeric' value that has stepped outside R's integer range back
# into that range, mimicking C-style wrap-around integer arithmetic.
# Values above .Machine$integer.max are shifted down by
# 2 * .Machine$integer.max + 2; values below -.Machine$integer.max are
# shifted up by the same amount. Anything else is returned untouched.
wrapInt <- function(value) {
  intMax <- .Machine$integer.max
  if (value > intMax) {
    return(value - 2 * intMax - 2)
  }
  if (value < -1 * intMax) {
    return(2 * intMax + value + 2)
  }
  value
}

# Compute `val` * 31 + `addVal` with wrap-around after every partial sum.
# The product is built from the left shifts of `val` by 4, 3, 2, 1 and 0 bits
# (16 + 8 + 4 + 2 + 1 = 31); the shifted terms and `addVal` are then summed
# one at a time, passing each running total through wrapInt().
mult31AndAdd <- function(val, addVal) {
  addWrapped <- function(acc, term) {
    wrapInt(as.numeric(acc) + as.numeric(term))
  }
  shifted <- bitwShiftL(val, c(4,3,2,1,0))
  Reduce(addWrapped, c(shifted, addVal))
}

#' Compute the hashCode of an object
#'
#' Java-style function to compute the hashCode for the given object. Returns
Expand All @@ -191,17 +170,7 @@ hashCode <- function(key) {
intBits <- packBits(rawToBits(rawVec), "integer")
as.integer(bitwXor(intBits[2], intBits[1]))
} else if (class(key) == "character") {
n <- nchar(key)
if (n == 0) {
0L
} else {
asciiVals <- sapply(charToRaw(key), function(x) { strtoi(x, 16L) })
hashC <- 0
for (k in 1:length(asciiVals)) {
hashC <- mult31AndAdd(hashC, asciiVals[k])
}
as.integer(hashC)
}
.Call("stringHashCode", key)
} else {
warning(paste("Could not hash object, returning 0", sep=""))
as.integer(0)
Expand Down
11 changes: 8 additions & 3 deletions pkg/src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ else
MAVEN_YARN_FLAG :=
endif

# Default goal: build the SparkR jar and the native shared library.
all: $(TARGET_NAME) sharelib

$(SBT_TARGET_NAME): build.sbt $(SCALA_FILES) $(RESOURCE_FILES)
./sbt/sbt assembly
Expand All @@ -41,12 +41,17 @@ $(MAVEN_TARGET_NAME): pom.xml $(SCALA_FILES) $(RESOURCE_FILES)
mvn -Dhadoop.version=$(SPARK_HADOOP_VERSION) -Dspark.version=$(SPARK_VERSION) -DskipTests $(MAVEN_YARN_FLAG) -Dyarn.version=$(SPARK_YARN_VERSION) clean package shade:shade
cp -f $(MAVEN_TARGET_NAME) ../inst/$(JAR_NAME)

# `sharelib` is a convenience alias, not a file: declare it phony so a stray
# file of that name cannot mask the build, and hang the real work off the
# library file itself so it is only relinked when the C source changes.
.PHONY: sharelib
sharelib: SparkR.so

# Native library loaded by the package (see useDynLib in NAMESPACE).
SparkR.so: string_hash_code.c
	R CMD SHLIB -o $@ $<

# Remove build products: the build tool's own outputs, the sbt/Maven working
# directories, the downloaded sbt launcher, the jar copied into ../inst, and
# the objects/shared library produced by R CMD SHLIB.
clean:
	$(BUILD_TOOL) clean
	rm -rf target
	rm -rf project/target
	rm -rf project/project
	rm -f sbt/sbt-launch-*.jar
	rm -f ../inst/$(JAR_NAME)
	rm -f *.o
	rm -f *.so

.PHONY: all clean
11 changes: 8 additions & 3 deletions pkg/src/Makefile.win
Original file line number Diff line number Diff line change
Expand Up @@ -18,18 +18,23 @@ SPARK_YARN_VERSION ?= 2.4.0

TARGET_NAME := $(MAVEN_TARGET_NAME)

# Default goal: build the SparkR jar and the native shared library.
all: $(TARGET_NAME) sharelib

# Package the shaded jar with Maven whenever a Scala source or resource file
# changes, then copy the result into ../inst under the name $(JAR_NAME).
$(MAVEN_TARGET_NAME): $(SCALA_FILES) $(RESOURCE_FILES)
	mvn.bat -Dhadoop.version=$(SPARK_HADOOP_VERSION) -Dspark.version=$(SPARK_VERSION) -Dyarn.version=$(SPARK_YARN_VERSION) -DskipTests clean package shade:shade
	cp -f $@ ../inst/$(JAR_NAME)

# `sharelib` is a convenience alias, not a file: declare it phony so a stray
# file of that name cannot mask the build, and hang the real work off the
# library file itself so it is only relinked when the C source changes.
.PHONY: sharelib
sharelib: SparkR.dll

# Native library loaded by the package (see useDynLib in NAMESPACE).
SparkR.dll: string_hash_code.c
	R CMD SHLIB -o $@ $<

# Remove build products: Maven's own outputs, the sbt/Maven working
# directories, the downloaded sbt launcher, the jar copied into ../inst, and
# the objects/DLL produced by R CMD SHLIB.
clean:
	mvn.bat clean
	rm -rf target
	rm -rf project/target
	rm -rf project/project
	rm -f sbt/sbt-launch-*.jar
	rm -f ../inst/$(JAR_NAME)
	rm -f *.o
	rm -f *.dll

.PHONY: all clean
27 changes: 27 additions & 0 deletions pkg/src/string_hash_code.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
/*
* A C function for R extension which implements the Java String hash algorithm.
* Refer to http://en.wikipedia.org/wiki/Java_hashCode%28%29#The_java.lang.String_hash_function
*
*/

#include <R.h>
#include <Rinternals.h>

/*
 * Hash a single R string with the Java String.hashCode() recurrence
 * h = 31 * h + b, applied to each byte b of the string, starting from 0.
 *
 * string: an R character vector of length one.
 * Returns a length-one R integer vector holding the hash.
 * Raises an R error ("invalid input") when `string` is not a scalar
 * character vector.
 */
SEXP stringHashCode(SEXP string) {
  SEXP chars;
  const unsigned char* bytes;
  R_xlen_t len, i;
  /* Accumulate in unsigned arithmetic: overflow wraps modulo 2^N instead of
   * being undefined behaviour as it is for a signed int shift/add. */
  unsigned int hash = 0;

  if (!IS_SCALAR(string, STRSXP)) {
    error("invalid input");
  }

  chars = asChar(string);
  /* Read bytes as unsigned so values above 0x7F contribute 128..255 on every
   * platform, regardless of whether plain char is signed. */
  bytes = (const unsigned char*) CHAR(chars);
  len = XLENGTH(chars);

  for (i = 0; i < len; i++) {
    /* (hash << 5) - hash == 31 * hash */
    hash = (hash << 5) - hash + bytes[i];
  }

  /* NOTE(review): a hash whose bit pattern equals INT_MIN is R's NA_integer_;
   * confirm callers tolerate that value. */
  return ScalarInteger((int) hash);
}

0 comments on commit d83c017

Please sign in to comment.