From 83bd7b1fa2910b11b93864febed24b9e113196f6 Mon Sep 17 00:00:00 2001 From: Stefan Karpinski Date: Mon, 2 May 2011 17:05:18 -0400 Subject: [PATCH] Use ASCIIString instead of Latin1String (closes #4). It still remains to implement the optimizations this enables as well as making sure that all strings are output such that when input again via repl they are equal to the original string. Also fixes the UTF-8 test "suite" (fixes #9). --- Makefile | 6 ++++-- ascii.j | 28 ++++++++++++++++++++++++++++ expr.j | 2 +- latin1.j | 10 ++++------ multi.j | 2 +- src/alloc.c | 2 +- src/array.c | 4 ++-- src/boot.j | 11 +++-------- src/dump.c | 4 ++-- src/init.c | 2 +- src/julia.h | 6 +++--- start.j | 2 +- string.j | 6 ++++-- sysimg.j | 2 +- table.j | 6 +----- test_utf8.j | 6 +++--- utf8.j | 6 ++---- 17 files changed, 62 insertions(+), 43 deletions(-) create mode 100644 ascii.j diff --git a/Makefile b/Makefile index 34d32bc09fd25..4b53beac67184 100644 --- a/Makefile +++ b/Makefile @@ -24,9 +24,11 @@ pcre_h.j: test: debug ./julia tests.j -testall: test +test-utf8: ./julia test_utf8.j +testall: test test-utf8 + SLOCCOUNT = sloccount \ --addlang makefile \ --personcost 100000 \ @@ -51,4 +53,4 @@ clean: cleanall: clean $(MAKE) -C src cleanother -.PHONY: default debug release julia-debug julia-release test testall sloccount clean cleanall +.PHONY: default debug release julia-debug julia-release test test-* testall sloccount clean cleanall diff --git a/ascii.j b/ascii.j new file mode 100644 index 0000000000000..3ebd446436c09 --- /dev/null +++ b/ascii.j @@ -0,0 +1,28 @@ +## from src/boot.j +# type ASCIIString <: String; data::Array{Uint8,1}; end + +next(s::ASCIIString, i::Index) = (char(s.data[i]), i+1) + +## overload methods for efficiency ## + +length(s::ASCIIString) = length(s.data) +cmp(a::ASCIIString, b::ASCIIString) = lexcmp(a.data, b.data) +ind2chr(s::ASCIIString, i::Int) = i +chr2ind(s::ASCIIString, i::Int) = i +strchr(s::ASCIIString, c::Char) = c < 0x80 ? 
memchr(s.data, c) : error("char not found") +nextind(s::ASCIIString, i::Int) = i +prevind(s::ASCIIString, i::Int) = i-1 +strcat(s::ASCIIString, t::ASCIIString, x::ASCIIString...) = ASCIIString(strdatacat(s, t, x...)) + +## outputting ASCII strings ## + +print(s::ASCIIString) = print(s.data) +write(io, s::ASCIIString) = write(io, s.data) + +## transcoding to ASCII ## + +ascii(s::ASCIIString) = s +function ascii(s::String) + f = c -> (c < 0x80) ? uint8(c) : error("invalid ASCII code point: U+$(hex(c))") + ASCIIString(map(f, chars(s))) +end diff --git a/expr.j b/expr.j index 661156a46fe13..2821624a4a872 100644 --- a/expr.j +++ b/expr.j @@ -1,6 +1,6 @@ ## symbols ## -symbol(s::Latin1String) = symbol(s.data) +symbol(s::ASCIIString) = symbol(s.data) symbol(s::UTF8String) = symbol(s.data) symbol(a::Array{Uint8,1}) = ccall(:jl_symbol_n, Any, (Ptr{Uint8}, Int32), a, int32(length(a)))::Symbol diff --git a/latin1.j b/latin1.j index d81fe5d6f174f..4256133a60ebd 100644 --- a/latin1.j +++ b/latin1.j @@ -1,7 +1,6 @@ -## from boot.j: -# type Latin1String <: String -# data::Array{Uint8,1} -# end +type Latin1String <: String + data::Array{Uint8,1} +end next(s::Latin1String, i::Index) = (char(s.data[i]), i+1) @@ -27,7 +26,6 @@ write(io, s::Latin1String) = write(io, s.data) latin1(s::Latin1String) = s function latin1(s::String) - f = c -> (c <= 0xff) ? uint8(c) : - error("invalid Latin-1 code point: U+$(hex(c))") + f = c -> (c <= 0xff) ? 
uint8(c) : error("invalid Latin-1 code point: U+$(hex(c))") Latin1String(map(f, chars(s))) end diff --git a/multi.j b/multi.j index c094df6f8124d..12549b852d398 100644 --- a/multi.j +++ b/multi.j @@ -194,7 +194,7 @@ function identify_socket(otherid, fd, sock) @assert i < PGRP.myid PGRP.workers[i] = Worker(locs[i].host, locs[i].port, fd, sock) PGRP.workers[i].id = i - #write(stdout_stream, latin1("$(PGRP.myid) heard from $i\n")) + #write(stdout_stream, "$(PGRP.myid) heard from $i\n") () end diff --git a/src/alloc.c b/src/alloc.c index c2939a0f82c86..5c7031db85b22 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -32,7 +32,7 @@ jl_type_t *jl_array_uint8_type; jl_type_t *jl_array_any_type; jl_struct_type_t *jl_weakref_type; jl_tag_type_t *jl_string_type; -jl_struct_type_t *jl_latin1_string_type; +jl_struct_type_t *jl_ascii_string_type; jl_struct_type_t *jl_utf8_string_type; jl_struct_type_t *jl_expr_type; jl_bits_type_t *jl_intrinsic_type; diff --git a/src/array.c b/src/array.c index d3ce60b8bf47f..22ebeb0a2d745 100644 --- a/src/array.c +++ b/src/array.c @@ -195,8 +195,8 @@ jl_value_t *jl_pchar_to_string(char *str, size_t len) { jl_array_t *a = jl_pchar_to_array(str, len); JL_GC_PUSH(&a); - jl_struct_type_t* string_type = u8_isvalid(a->data, len) < 2 ? - jl_latin1_string_type : jl_utf8_string_type; + jl_struct_type_t* string_type = u8_isvalid(a->data, len) == 1 ? 
// ASCII + jl_ascii_string_type : jl_utf8_string_type; jl_value_t *s = jl_apply((jl_function_t*)string_type, (jl_value_t**)&a, 1); JL_GC_POP(); return s; diff --git a/src/boot.j b/src/boot.j index f395d9beabfc0..a53fa08e4b018 100644 --- a/src/boot.j +++ b/src/boot.j @@ -117,15 +117,10 @@ isequal(w, v::WeakRef) = isequal(w, v.value) abstract String -type Latin1String <: String - data::Array{Uint8,1} -end - -type UTF8String <: String - data::Array{Uint8,1} -end +type ASCIIString <: String; data::Array{Uint8,1}; end +type UTF8String <: String; data::Array{Uint8,1}; end -typealias ByteString Union(Latin1String,UTF8String) +typealias ByteString Union(ASCIIString,UTF8String) abstract Exception diff --git a/src/dump.c b/src/dump.c index bc86d49d04894..5d15b2b6ce726 100644 --- a/src/dump.c +++ b/src/dump.c @@ -785,7 +785,7 @@ void jl_save_system_image(char *fname, char *startscriptname) jl_serialize_value(&f, jl_float64_type); jl_serialize_value(&f, jl_weakref_type); jl_serialize_value(&f, jl_string_type); - jl_serialize_value(&f, jl_latin1_string_type); + jl_serialize_value(&f, jl_ascii_string_type); jl_serialize_value(&f, jl_utf8_string_type); jl_serialize_value(&f, jl_errorexception_type); jl_serialize_value(&f, jl_typeerror_type); @@ -852,7 +852,7 @@ void jl_restore_system_image(char *fname) jl_weakref_type->env = NULL; jl_weakref_type->linfo = NULL; jl_string_type = (jl_tag_type_t*)jl_deserialize_value(&f); - jl_latin1_string_type = (jl_struct_type_t*)jl_deserialize_value(&f); + jl_ascii_string_type = (jl_struct_type_t*)jl_deserialize_value(&f); jl_utf8_string_type = (jl_struct_type_t*)jl_deserialize_value(&f); jl_errorexception_type = (jl_struct_type_t*)jl_deserialize_value(&f); jl_typeerror_type = (jl_struct_type_t*)jl_deserialize_value(&f); diff --git a/src/init.c b/src/init.c index 048e25a877dd8..34466871c28cd 100644 --- a/src/init.c +++ b/src/init.c @@ -251,7 +251,7 @@ void jl_get_builtin_hooks() jl_weakref_type->env = NULL; jl_weakref_type->linfo = NULL; 
jl_string_type = (jl_tag_type_t*)global("String"); - jl_latin1_string_type = (jl_struct_type_t*)global("Latin1String"); + jl_ascii_string_type = (jl_struct_type_t*)global("ASCIIString"); jl_utf8_string_type = (jl_struct_type_t*)global("UTF8String"); jl_errorexception_type = (jl_struct_type_t*)global("ErrorException"); jl_typeerror_type = (jl_struct_type_t*)global("TypeError"); diff --git a/src/julia.h b/src/julia.h index d2b0f8a030e10..dec4415a990a1 100644 --- a/src/julia.h +++ b/src/julia.h @@ -263,7 +263,7 @@ extern jl_struct_type_t *jl_array_type; extern jl_typename_t *jl_array_typename; extern jl_struct_type_t *jl_weakref_type; extern jl_tag_type_t *jl_string_type; -extern jl_struct_type_t *jl_latin1_string_type; +extern jl_struct_type_t *jl_ascii_string_type; extern jl_struct_type_t *jl_utf8_string_type; extern jl_struct_type_t *jl_errorexception_type; extern jl_struct_type_t *jl_typeerror_type; @@ -401,9 +401,9 @@ void *allocb_permanent(size_t sz); #define jl_is_task(v) jl_typeis(v,jl_task_type) #define jl_is_func(v) (jl_is_func_type(jl_typeof(v)) || jl_is_struct_type(v)) #define jl_is_function(v) jl_is_func(v) -#define jl_is_latin1_string(v) jl_typeis(v,jl_latin1_string_type) +#define jl_is_ascii_string(v) jl_typeis(v,jl_ascii_string_type) #define jl_is_utf8_string(v) jl_typeis(v,jl_utf8_string_type) -#define jl_is_byte_string(v) (jl_is_latin1_string(v) || jl_is_utf8_string(v)) +#define jl_is_byte_string(v) (jl_is_ascii_string(v) || jl_is_utf8_string(v)) #define jl_is_string(v) jl_subtype(v,(jl_value_t*)jl_string_type,1) #define jl_is_cpointer(v) jl_is_cpointer_type(jl_typeof(v)) #define jl_is_pointer(v) jl_is_cpointer_type(jl_typeof(v)) diff --git a/start.j b/start.j index c1b76323a9df2..92b414beb146c 100644 --- a/start.j +++ b/start.j @@ -36,7 +36,7 @@ set_current_output_stream(stdout_stream) stdin_stream = fdio(ccall(:jl_stdin, Int32, ())) stderr_stream = fdio(ccall(:jl_stderr, Int32, ())) load("string.j") -load("latin1.j") +load("ascii.j") load("utf8.j") 
load("show.j") load("regex.j") diff --git a/string.j b/string.j index a17581409087d..3857045f926d5 100644 --- a/string.j +++ b/string.j @@ -340,7 +340,9 @@ function print_escaped(s::String, q::Bool, xmax::Char) if q; print('"'); end end -print_escaped(s::Latin1String, q) = print_escaped(s, q, '\xff') +# TODO: make sure ASCII, Latin-1 and UTF-8 strings all get +# printed so that when input back they are equivalent. + print_escaped(s::String, q) = print_escaped(s, q, '\x7f') print_escaped(s::String) = print_escaped(s, false) print_quoted (s::String) = print_escaped(s, true) @@ -708,7 +710,7 @@ function uint2str(n::Int, b::Int) ccall(:uint2str, Ptr{Uint8}, (Ptr{Uint8}, Ulong, Uint64, Uint32), data, ulong(sz), uint64(n), uint32(b)) - Latin1String(data[1:(sz-1)]) # cut out terminating NUL + ASCIIString(data[1:(sz-1)]) # cut out terminating NUL end uint2str(n::Int, b::Int, len::Int) = lpad(uint2str(n,b),len,'0') diff --git a/sysimg.j b/sysimg.j index 31991633cf4a8..5ab4c95866cf2 100644 --- a/sysimg.j +++ b/sysimg.j @@ -33,7 +33,7 @@ load("io.j") ccall(:jl_set_memio_func, Void, ()) set_current_output_stream(make_stdout_stream()) # for error reporting load("string.j") -load("latin1.j") +load("ascii.j") load("utf8.j") load("show.j") diff --git a/table.j b/table.j index 4c8cb17af56c9..c86cd25b90567 100644 --- a/table.j +++ b/table.j @@ -81,11 +81,7 @@ function hash(a::Array) h end -# TODO: should we distinguish a UTF8String and -# a Latin1String containing the same exact data? 
- -hash(s::Union(UTF8String,Latin1String)) = - ccall(:memhash32, Uint32, (Ptr{Void}, Size), s.data, length(s.data)) +hash(s::ByteString) = ccall(:memhash32, Uint32, (Ptr{Void}, Size), s.data, length(s.data)) # hash table diff --git a/test_utf8.j b/test_utf8.j index b4e4e333744f3..a6bb73482d5e6 100644 --- a/test_utf8.j +++ b/test_utf8.j @@ -1,6 +1,6 @@ -utf32 = CharString(read(open("unicode/UTF-32LE.txt"), Char, 1112065)[2:]); -utf8 = UTF8String(read(open("unicode/UTF-8.txt"), Uint8, 4382595)[4:]); -@assert utf32 == utf8 +str1 = CharString(read(open("unicode/UTF-32LE.txt"), Char, 1112065)[2:]); +str2 = UTF8String(read(open("unicode/UTF-8.txt"), Uint8, 4382595)[4:]); +@assert str1 == str2 str1 = "∀ ε > 0, ∃ δ > 0: |x-y| < δ ⇒ |f(x)-f(y)| < ε" str2 = CharString( diff --git a/utf8.j b/utf8.j index 8c86e72fdf195..9f7044993bf49 100644 --- a/utf8.j +++ b/utf8.j @@ -1,7 +1,5 @@ -## from boot.j: -# type UTF8String <: String -# data::Array{Uint8,1} -# end +## from src/boot.j: +# type UTF8String <: String; data::Array{Uint8,1}; end ## basic UTF-8 decoding & iteration ##