Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add IDNA support and integrate with DNS lookup #2543

Merged
merged 13 commits into from
Apr 10, 2018
29 changes: 29 additions & 0 deletions spec/std/uri/punycode_spec.cr
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
require "spec"
require "uri/punycode"

# Round-trip specs for `URI::Punycode`: every example pair is checked in both
# directions (encode and decode), followed by one host-name conversion check.
describe URI::Punycode do
  # Each tuple is {decoded (Unicode) form, expected Punycode form}.
  examples = [
    {"3年B組金八先生", "3B-ww4c5e180e575a65lsy2b"},
    {"安室奈美恵-with-SUPER-MONKEYS", "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"},
    {"Hello-Another-Way-それぞれの場所", "Hello-Another-Way--fc4qua05auwb3674vfr0b"},
    {"ひとつ屋根の下2", "2-u9tlzr9756bt3uc0v"},
    {"MajiでKoiする5秒前", "MajiKoi5-783gue6qz075azm5e"},
    {"パフィーdeルンバ", "de-jg4avhby1noc0d"},
    {"そのスピードで", "d9juau41awczczp"},
    {"Hello-Another-Way-それぞれ", "Hello-Another-Way--fc4qua97gba"},
  ]

  examples.each do |(decoded, encoded)|
    it "encodes #{decoded} to #{encoded}" do
      URI::Punycode.encode(decoded).should eq encoded
    end

    it "decodes #{encoded} to #{decoded}" do
      URI::Punycode.decode(encoded).should eq decoded
    end
  end

  it "translate to ascii only host name" do
    ascii_host = URI::Punycode.to_ascii("test.テスト.テスト")
    ascii_host.should eq "test.xn--zckzah.xn--zckzah"
  end
end
8 changes: 8 additions & 0 deletions src/socket/addrinfo.cr
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
require "uri/punycode"

class Socket
# Domain name resolver.
struct Addrinfo
Expand Down Expand Up @@ -76,6 +78,12 @@ class Socket
end

private def self.getaddrinfo(domain, service, family, type, protocol, timeout)
# RFC 3986 says:
# > When a non-ASCII registered name represents an internationalized domain name
# > intended for resolution via the DNS, the name must be transformed to the IDNA
# > encoding [RFC3490] prior to name lookup.
domain = URI::Punycode.to_ascii domain

hints = LibC::Addrinfo.new
hints.ai_family = (family || Family::UNSPEC).to_i32
hints.ai_socktype = type
Expand Down
173 changes: 173 additions & 0 deletions src/uri/punycode.cr
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
# `Punycode` provides an interface for the encoding used by IDNA (RFC 5890);
# the encoding itself is defined in RFC 3492.
#
# Implementation based on Mathias Bynens' `punycode.js` project
# https://github.com/bestiejs/punycode.js/
#
# RFC 3492:
# Method to use non-ascii characters as host name of URI
# https://www.ietf.org/rfc/rfc3492.txt
#
# RFC 5890:
# Internationalized Domain Names for Applications (IDNA)
# https://www.ietf.org/rfc/rfc5890.txt
class URI
class Punycode
# Numeric parameters of the Punycode algorithm (RFC 3492).
#
# Radix of the digits written by `encode` and read by `decode`.
private BASE = 36
# Lower and upper clamp for the per-digit threshold `t`.
private TMIN = 1
private TMAX = 26
# Tuning constants used only by `adapt` when recomputing the bias.
private SKEW = 38
private DAMP = 700
# Starting value of `bias` in both `encode` and `decode`.
private INITIAL_BIAS = 72
# Starting value of `n`: 128 (U+0080), the first code point that `encode`
# does not copy through verbatim.
private INITIAL_N = 128

# Separates the verbatim ASCII prefix from the encoded digits.
private DELIMITER = '-'

# Digit alphabet indexed by digit value: 'a'..'z' are 0..25, '0'..'9' are 26..35.
private BASE36 = "abcdefghijklmnopqrstuvwxyz0123456789"

# Computes the next bias value from the delta that was just encoded or decoded.
#
# *delta* is first divided by `DAMP` when *firsttime* is true, otherwise by 2,
# then increased by its own quotient by *numpoints*. It is then repeatedly
# divided by `BASE - TMIN` while it exceeds `((BASE - TMIN) * TMAX) / 2`,
# adding `BASE` to the result for every division performed.
private def self.adapt(delta, numpoints, firsttime)
  divisor = firsttime ? DAMP : 2
  scaled = delta / divisor
  scaled += scaled / numpoints

  offset = 0
  until scaled <= ((BASE - TMIN) * TMAX) / 2
    scaled /= BASE - TMIN
    offset += BASE
  end

  offset + (((BASE - TMIN + 1) * scaled) / (scaled + SKEW))
end

# Returns the Punycode form of *string* as a new `String`.
#
# Convenience wrapper around the two-argument `encode`, which writes to an IO.
def self.encode(string)
  String.build do |io|
    encode(string, io)
  end
end

# Writes the Punycode form of *string* to *io*.
#
# Characters below U+0080 are copied to *io* unchanged and in their original
# order. If *string* contains no other characters, nothing more is written
# (in particular, no delimiter). Otherwise `DELIMITER` is written when at
# least one character was copied, followed by digits taken from `BASE36`
# that encode the remaining characters.
#
# Raises `Exception` if one of the overflow guards on the running delta trips.
def self.encode(string, io)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just a minor nit bit: methods receiving an optional IO should have it prepended as first argument. This makes it easier if maybe additional arguments are added later to keep both signatures similar. Even if that's unlikely I'd recommend to stick with this strategy.

Copy link
Contributor Author

@makenowjust makenowjust Mar 6, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

URI.encode is also accept String as first argument and IO as second argument. I think no problem.

# Characters at or above U+0080, collected while the rest is streamed out.
others = [] of Char

string.each_char do |c|
if c < '\u0080'
io << c
else
others.push c
end
end

# Pure-ASCII input: the output is the input itself, with no delimiter.
return if others.empty?
# The remaining characters are handled in ascending code point order.
others.sort!

# h starts at (number of characters already written) + 1 and grows by one
# for every character encoded below.
h = string.size - others.size + 1
delta = 0_u32
n = INITIAL_N
bias = INITIAL_BIAS
firsttime = true
prev = nil

# The delimiter is only emitted when at least one character was copied above.
io << DELIMITER if h > 1

others.each do |m|
# `others` is sorted, so duplicates are adjacent; each distinct character is
# visited once and all of its occurrences are emitted by the scan below.
next if m == prev
prev = m

# Advance delta by the code point gap times h, refusing values that would
# not fit into Int32.
raise Exception.new("Overflow: input needs wider integers to process") if m.ord - n > (Int32::MAX - delta) / h
delta += (m.ord - n) * h
n = m.ord + 1

string.each_char do |c|
if c < m
# Every character smaller than m is one more position to step over.
raise Exception.new("Overflow: input needs wider integers to process") if delta > Int32::MAX - 1
delta += 1
elsif c == m
# Emit delta as a run of base-36 digits, least significant first.
q = delta
k = BASE
loop do
# Threshold for this digit position, clamped to TMIN..TMAX.
t = k <= bias ? TMIN : k >= bias + TMAX ? TMAX : k - bias
break if q < t
io << BASE36[t + ((q - t) % (BASE - t))]
q = (q - t) / (BASE - t)
k += BASE
end
# The final digit is below the threshold, which ends this number.
io << BASE36[q]

# Recompute the bias from the delta just written, then reset for the next one.
bias = adapt delta, h, firsttime
delta = 0
h += 1
firsttime = false
end
end
# One extra step after each full pass over the string, before moving on to
# the next (larger) character.
delta += 1
end
end

# Decodes the Punycode *string* and returns the resulting `String`.
#
# Everything before the last `DELIMITER` is taken verbatim as the initial
# output; everything after it is read as base-36 digits (letters of either
# case are 0..25, decimal digits are 26..35) that describe which characters
# to insert and where.
#
# Raises `ArgumentError` if a character after the delimiter is not an ASCII
# letter or digit, or if the input ends in the middle of a number.
def self.decode(string)
  basic, _, digits = string.rpartition(DELIMITER)
  output = basic.chars

  n = INITIAL_N
  bias = INITIAL_BIAS
  i = 0
  w = oldi = k = 0
  # True whenever the next digit starts a new number.
  at_boundary = true

  digits.each_char do |c|
    if at_boundary
      w, oldi, k = 1, i, BASE
      at_boundary = false
    end

    digit = if c.ascii_lowercase?
              c.ord - 0x61
            elsif c.ascii_uppercase?
              c.ord - 0x41
            elsif c.ascii_number?
              c.ord - 0x30 + 26
            else
              raise ArgumentError.new("Invalid input")
            end

    i += digit * w

    # Threshold for this digit position, clamped to TMIN..TMAX.
    t = if k <= bias
          TMIN
        elsif k >= bias + TMAX
          TMAX
        else
          k - bias
        end

    if digit < t
      # A digit below the threshold ends the number: insert the decoded character.
      outsize = output.size + 1
      bias = adapt i - oldi, outsize, oldi == 0
      n += i / outsize
      i %= outsize
      output.insert i, n.chr
      i += 1
      at_boundary = true
    else
      # More digits follow for the current number.
      w *= BASE - t
      k += BASE
    end
  end

  # Input that stops part-way through a number is rejected.
  raise ArgumentError.new("Invalid input") unless at_boundary

  output.join
end

# Converts a host name to its ASCII-only form.
#
# An ASCII-only *string* is returned as is. Otherwise the string is split on
# '.', and every part containing non-ASCII characters is replaced by "xn--"
# followed by its Punycode encoding; ASCII-only parts are kept unchanged. The
# parts are joined with "." again.
def self.to_ascii(string)
  return string if string.ascii_only?

  String.build do |io|
    string.split('.').each_with_index do |label, index|
      io << "." if index > 0

      if label.ascii_only?
        io << label
        next
      end

      io << "xn--"
      encode label, io
    end
  end
end
end
end