From 8a9f33a060eafca7080cea4ae084194877e50ed5 Mon Sep 17 00:00:00 2001
From: Michael Go
Date: Tue, 29 Oct 2024 22:04:37 -0300
Subject: [PATCH] raise syntax error from lexer parser with utf8 character

---
 lib/liquid/lexer.rb          | 11 +++++++++--
 test/unit/lexer_unit_test.rb | 11 +++++++++++
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/lib/liquid/lexer.rb b/lib/liquid/lexer.rb
index 5e8e1023a..6d956484b 100644
--- a/lib/liquid/lexer.rb
+++ b/lib/liquid/lexer.rb
@@ -171,6 +171,7 @@ def tokenize
 
         break if @ss.eos?
 
+        start_pos = @ss.pos
         peeked = @ss.peek_byte
 
         if (special = SPECIAL_TABLE[peeked])
@@ -196,7 +197,7 @@ def tokenize
             @output << found
             @ss.scan_byte
           else
-            raise SyntaxError, "Unexpected character #{peeked.chr}"
+            raise_syntax_error(start_pos)
           end
         elsif (sub_table = COMPARISON_JUMP_TABLE[peeked])
           @ss.scan_byte
@@ -217,7 +218,7 @@ def tokenize
               [type, t]
             end
           else
-            raise SyntaxError, "Unexpected character #{peeked.chr}"
+            raise_syntax_error(start_pos)
           end
         end
       end
@@ -225,6 +226,12 @@ def tokenize
 
       @output << EOS
     end
+
+    def raise_syntax_error(start_pos)
+      @ss.pos = start_pos
+      # the character could be a UTF-8 character, use getch to get all the bytes
+      raise SyntaxError, "Unexpected character #{@ss.getch}"
+    end
   end
 
   Lexer = StringScanner.instance_methods.include?(:scan_byte) ? Lexer2 : Lexer1
diff --git a/test/unit/lexer_unit_test.rb b/test/unit/lexer_unit_test.rb
index 0676033e6..b372c232e 100644
--- a/test/unit/lexer_unit_test.rb
+++ b/test/unit/lexer_unit_test.rb
@@ -84,4 +84,15 @@ def test_greater_than_two_digits
     tokens = Lexer.new("foo > 12").tokenize
     assert_equal([[:id, 'foo'], [:comparison, '>'], [:number, '12'], [:end_of_string]], tokens)
   end
+
+  def test_error_with_utf8_character
+    error = assert_raises(SyntaxError) do
+      Lexer.new("1 < 1Ø").tokenize
+    end
+
+    assert_equal(
+      'Liquid syntax error: Unexpected character Ø',
+      error.message,
+    )
+  end
 end