diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml new file mode 100644 index 00000000..6ed90a1f --- /dev/null +++ b/.github/workflows/benchmarks.yml @@ -0,0 +1,26 @@ +name: Benchmarks + +on: [push, pull_request] + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + - name: Install Memcached 1.6.23 + working-directory: scripts + env: + MEMCACHED_VERSION: 1.6.23 + run: | + chmod +x ./install_memcached.sh + ./install_memcached.sh + memcached -d + memcached -d -p 11222 + - name: Set up Ruby + uses: ruby/setup-ruby@v1 + with: + ruby-version: 3.2 + bundler-cache: true # 'bundle install' and cache + - name: Run Benchmarks + run: RUBY_YJIT_ENABLE=1 BENCH_TARGET=all bundle exec bin/benchmark diff --git a/.github/workflows/profile.yml b/.github/workflows/profile.yml new file mode 100644 index 00000000..e4e59004 --- /dev/null +++ b/.github/workflows/profile.yml @@ -0,0 +1,38 @@ +name: Profiles + +on: [push, pull_request] + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + - name: Install Memcached 1.6.23 + working-directory: scripts + env: + MEMCACHED_VERSION: 1.6.23 + run: | + chmod +x ./install_memcached.sh + ./install_memcached.sh + memcached -d + - name: Set up Ruby + uses: ruby/setup-ruby@v1 + with: + ruby-version: 3.4 + bundler-cache: true # 'bundle install' and cache + - name: Run Profiles + run: RUBY_YJIT_ENABLE=1 BENCH_TARGET=all bundle exec bin/profile + - name: Upload profile results + uses: actions/upload-artifact@v4 + with: + name: profile-results + path: | + client_get_profile.json + socket_get_profile.json + client_set_profile.json + socket_set_profile.json + client_get_multi_profile.json + socket_get_multi_profile.json + client_set_multi_profile.json + socket_set_multi_profile.json diff --git a/bin/benchmark b/bin/benchmark new file mode 100755 index 00000000..8894fb5a --- /dev/null +++ b/bin/benchmark @@ -0,0 +1,255 @@ +#!/usr/bin/env ruby +# frozen_string_literal: true + +# This helps benchmark current performance of Dalli +# as well as compare performance of optimizated and non-optimized calls like multi-set vs set +# +# run with: +# bundle exec bin/benchmark +# RUBY_YJIT_ENABLE=1 BENCH_TARGET=get bundle exec bin/benchmark +require 'bundler/inline' +require 'json' + +gemfile do + source 'https://rubygems.org' + gem 'benchmark-ips' + gem 'logger' +end + +require_relative '../lib/dalli' +require 'benchmark/ips' +require 'monitor' + +## +# StringSerializer is a serializer that avoids the overhead of Marshal or JSON. +## +class StringSerializer + def self.dump(value) + value + end + + def self.load(value) + value + end +end + +dalli_url = ENV['BENCH_CACHE_URL'] || '127.0.0.1:11211' +bench_target = ENV['BENCH_TARGET'] || 'set' +bench_time = (ENV['BENCH_TIME'] || 10).to_i +bench_warmup = (ENV['BENCH_WARMUP'] || 3).to_i +bench_payload_size = (ENV['BENCH_PAYLOAD_SIZE'] || 700_000).to_i +payload = 'B' * bench_payload_size +TERMINATOR = "\r\n" +puts "yjit: #{RubyVM::YJIT.enabled?}" + +client = Dalli::Client.new(dalli_url, serializer: StringSerializer, compress: false, raw: true) +multi_client = Dalli::Client.new('localhost:11211,localhost:11222', serializer: StringSerializer, compress: false, + raw: true) + +# The raw socket implementation is used to benchmark the performance of dalli & the overhead of the various abstractions +# in the library. +sock = TCPSocket.new('127.0.0.1', '11211', connect_timeout: 1) +sock.setsockopt(Socket::IPPROTO_TCP, Socket::TCP_NODELAY, true) +sock.setsockopt(Socket::SOL_SOCKET, Socket::SO_KEEPALIVE, true) +# Benchmarks didn't see any performance gains from increasing the SO_RCVBUF buffer size +# sock.setsockopt(Socket::SOL_SOCKET, ::Socket::SO_RCVBUF, 1024 * 1024 * 8) +# Benchamrks did see an improvement in performance when increasing the SO_SNDBUF buffer size +# sock.setsockopt(Socket::SOL_SOCKET, Socket::SO_SNDBUF, 1024 * 1024 * 8) + +# ensure the clients are all connected and working +client.set('key', payload) +multi_client.set('multi_key', payload) +sock.write("set sock_key 0 3600 #{payload.bytesize}\r\n") +sock.write(payload) +sock.write(TERMINATOR) +sock.flush +sock.readline # clear the buffer + +raise 'dalli client mismatch' if payload != client.get('key') + +raise 'multi dalli client mismatch' if payload != multi_client.get('multi_key') + +sock.write("mg sock_key v\r\n") +sock.readline +sock_value = sock.read(payload.bytesize) +sock.read(TERMINATOR.bytesize) +raise 'sock mismatch' if payload != sock_value + +# ensure we have basic data for the benchmarks and get calls +payload_smaller = 'B' * (bench_payload_size / 10) +pairs = {} +100.times do |i| + pairs["multi_#{i}"] = payload_smaller +end +client.quiet do + pairs.each do |key, value| + client.set(key, value, 3600, raw: true) + end +end + +### +# GC Suite +# benchmark without GC skewing things +### +class GCSuite + def warming(*) + run_gc + end + + def running(*) + run_gc + end + + def warmup_stats(*); end + + def add_report(*); end + + private + + def run_gc + GC.enable + GC.start + GC.disable + end +end +suite = GCSuite.new + +# rubocop:disable Metrics/MethodLength +# rubocop:disable Metrics/PerceivedComplexity +# rubocop:disable Metrics/AbcSize +# rubocop:disable Metrics/CyclomaticComplexity +def sock_get_multi(sock, pairs) + count = pairs.length + pairs.each_key do |key| + count -= 1 + tail = count.zero? ? '' : 'q' + sock.write("mg #{key} v f k #{tail}\r\n") + end + sock.flush + # read all the memcached responses back and build a hash of key value pairs + results = {} + last_result = false + while (line = sock.readline.chomp!(TERMINATOR)) != '' + last_result = true if line.start_with?('EN ') + next unless line.start_with?('VA ') || last_result + + _, value_length, _flags, key = line.split + results[key[1..]] = sock.read(value_length.to_i) + sock.read(TERMINATOR.length) + break if results.size == pairs.size + break if last_result + end + results +end +# rubocop:enable Metrics/MethodLength +# rubocop:enable Metrics/PerceivedComplexity +# rubocop:enable Metrics/AbcSize +# rubocop:enable Metrics/CyclomaticComplexity + +if %w[all set].include?(bench_target) + Benchmark.ips do |x| + x.config(warmup: bench_warmup, time: bench_time, suite: suite) + x.report('client set') { client.set('key', payload) } + # x.report('multi client set') { multi_client.set('string_key', payload) } + x.report('raw sock set') do + sock.write("ms sock_key #{payload.bytesize} T3600 MS\r\n") + sock.write(payload) + sock.write("\r\n") + sock.flush + sock.readline # clear the buffer + end + x.compare! + end +end + +@lock = Monitor.new +if %w[all get].include?(bench_target) + Benchmark.ips do |x| + x.config(warmup: bench_warmup, time: bench_time, suite: suite) + x.report('get dalli') do + result = client.get('key') + raise 'mismatch' unless result == payload + end + # NOTE: while this is the fastest it is not thread safe and is blocking vs IO sharing friendly + x.report('get sock') do + sock.write("mg sock_key v\r\n") + sock.readline + result = sock.read(payload.bytesize) + sock.read(TERMINATOR.bytesize) + raise 'mismatch' unless result == payload + end + # NOTE: This shows that when adding thread safety & non-blocking IO we are slower for single process/thread use case + x.report('get sock non-blocking') do + @lock.synchronize do + sock.write("mg sock_key v\r\n") + sock.readline + count = payload.bytesize + value = String.new(capacity: count + 1) + loop do + begin + value << sock.read_nonblock(count - value.bytesize) + rescue Errno::EAGAIN + sock.wait_readable + retry + rescue EOFError + puts 'EOFError' + break + end + break if value.bytesize == count + end + sock.read(TERMINATOR.bytesize) + raise 'mismatch' unless value == payload + end + end + x.compare! + end +end + +if %w[all get_multi].include?(bench_target) + Benchmark.ips do |x| + x.config(warmup: bench_warmup, time: bench_time, suite: suite) + x.report('get 100 keys') do + result = client.get_multi(pairs.keys) + raise 'mismatch' unless result == pairs + end + x.report('get 100 keys raw sock') do + result = sock_get_multi(sock, pairs) + raise 'mismatch' unless result == pairs + end + x.compare! + end +end + +if %w[all set_multi].include?(bench_target) + Benchmark.ips do |x| + x.config(warmup: bench_warmup, time: bench_time, suite: suite) + x.report('write 100 keys simple') do + client.quiet do + pairs.each do |key, value| + client.set(key, value, 3600, raw: true) + end + end + end + # TODO: uncomment this once we add PR adding set_multi + # x.report('multi client set_multi 100') do + # multi_client.set_multi(pairs, 3600, raw: true) + # end + x.report('write 100 keys rawsock') do + count = pairs.length + tail = '' + value_bytesize = payload_smaller.bytesize + ttl = 3600 + + pairs.each do |key, value| + count -= 1 + tail = count.zero? ? '' : 'q' + sock.write(String.new("ms #{key} #{value_bytesize} c F0 T#{ttl} MS #{tail}\r\n", + capacity: key.size + value_bytesize + 40) << value << TERMINATOR) + end + sock.flush + sock.gets(TERMINATOR) # clear the buffer + end + # x.report('write_mutli 100 keys') { client.set_multi(pairs, 3600, raw: true) } + x.compare! + end +end diff --git a/bin/profile b/bin/profile new file mode 100755 index 00000000..cd2b32c6 --- /dev/null +++ b/bin/profile @@ -0,0 +1,199 @@ +#!/usr/bin/env ruby +# frozen_string_literal: true + +# This helps profile specific call paths in Dalli +# finding and fixing performance issues in these profiles should result in improvements in the dalli benchmarks +# +# run with: +# RUBY_YJIT_ENABLE=1 bundle exec bin/profile +require 'bundler/inline' +require 'json' + +gemfile do + source 'https://rubygems.org' + gem 'benchmark-ips' + gem 'vernier' + gem 'logger' +end + +require_relative '../lib/dalli' +require 'benchmark/ips' +require 'vernier' + +## +# StringSerializer is a serializer that avoids the overhead of Marshal or JSON. +## +class StringSerializer + def self.dump(value) + value + end + + def self.load(value) + value + end +end + +dalli_url = ENV['BENCH_CACHE_URL'] || '127.0.0.1:11211' +bench_target = ENV['BENCH_TARGET'] || 'get' +bench_time = (ENV['BENCH_TIME'] || 10).to_i +bench_payload_size = (ENV['BENCH_PAYLOAD_SIZE'] || 700_000).to_i +TERMINATOR = "\r\n" +puts "yjit: #{RubyVM::YJIT.enabled?}" + +client = Dalli::Client.new(dalli_url, serializer: StringSerializer, compress: false) + +# The raw socket implementation is used to benchmark the performance of dalli & the overhead of the various abstractions +# in the library. +sock = TCPSocket.new('127.0.0.1', '11211', connect_timeout: 1) +sock.setsockopt(Socket::IPPROTO_TCP, Socket::TCP_NODELAY, true) +sock.setsockopt(Socket::SOL_SOCKET, Socket::SO_KEEPALIVE, true) +# Benchmarks didn't see any performance gains from increasing the SO_RCVBUF buffer size +# sock.setsockopt(Socket::SOL_SOCKET, ::Socket::SO_RCVBUF, 1024 * 1024 * 8) +# Benchamrks did see an improvement in performance when increasing the SO_SNDBUF buffer size +# sock.setsockopt(Socket::SOL_SOCKET, Socket::SO_SNDBUF, 1024 * 1024 * 8) + +payload = 'B' * bench_payload_size +dalli_key = 'dalli_key' +# ensure the clients are all connected and working +client.set(dalli_key, payload) +sock.write("set sock_key 0 3600 #{payload.bytesize}\r\n") +sock.write(payload) +sock.write(TERMINATOR) +sock.flush +sock.readline # clear the buffer + +# ensure we have basic data for the benchmarks and get calls +payload_smaller = 'B' * (bench_payload_size / 10) +pairs = {} +100.times do |i| + pairs["multi_#{i}"] = payload_smaller +end +client.quiet do + pairs.each do |key, value| + client.set(key, value, 3600, raw: true) + end +end + +# rubocop:disable Metrics/MethodLength +# rubocop:disable Metrics/PerceivedComplexity +# rubocop:disable Metrics/AbcSize +# rubocop:disable Metrics/CyclomaticComplexity +def sock_get_multi(sock, pairs) + count = pairs.length + pairs.each_key do |key| + count -= 1 + tail = count.zero? ? '' : 'q' + sock.write("mg #{key} v f k #{tail}\r\n") + end + sock.flush + # read all the memcached responses back and build a hash of key value pairs + results = {} + last_result = false + while (line = sock.readline.chomp!(TERMINATOR)) != '' + last_result = true if line.start_with?('EN ') + next unless line.start_with?('VA ') || last_result + + _, value_length, _flags, key = line.split + results[key[1..]] = sock.read(value_length.to_i) + sock.read(TERMINATOR.length) + break if results.size == pairs.size + break if last_result + end + results +end +# rubocop:enable Metrics/PerceivedComplexity +# rubocop:enable Metrics/AbcSize +# rubocop:enable Metrics/CyclomaticComplexity + +def sock_set_multi(sock, pairs) + count = pairs.length + tail = '' + ttl = 3600 + + pairs.each do |key, value| + count -= 1 + tail = count.zero? ? '' : 'q' + sock.write(String.new("ms #{key} #{value.bytesize} c F0 T#{ttl} MS #{tail}\r\n", + capacity: key.size + value.bytesize + 40)) + sock.write(value) + sock.write(TERMINATOR) + end + sock.flush + sock.gets(TERMINATOR) # clear the buffer +end +# rubocop:enable Metrics/MethodLength + +if %w[all get].include?(bench_target) + Vernier.profile(out: 'client_get_profile.json') do + start_time = Time.now + while Time.now - start_time < bench_time + result = client.get(dalli_key) + raise 'mismatch' unless result == payload + end + end + + Vernier.profile(out: 'socket_get_profile.json') do + start_time = Time.now + while Time.now - start_time < bench_time + sock.write("mg sock_key v\r\n") + sock.readline + result = sock.read(payload.bytesize) + sock.read(TERMINATOR.bytesize) + raise 'mismatch' unless result == payload + end + end +end + +if %w[all set].include?(bench_target) + Vernier.profile(out: 'client_set_profile.json') do + start_time = Time.now + client.set(dalli_key, payload, 3600, raw: true) while Time.now - start_time < bench_time + end + + Vernier.profile(out: 'socket_set_profile.json') do + start_time = Time.now + while Time.now - start_time < bench_time + sock.write("ms sock_key #{payload.bytesize} T3600 MS\r\n") + sock.write(payload) + sock.write("\r\n") + sock.flush + sock.readline # clear the buffer + end + end +end + +if %w[all get_multi].include?(bench_target) + Vernier.profile(out: 'client_get_multi_profile.json') do + start_time = Time.now + while Time.now - start_time < bench_time + result = client.get_multi(pairs.keys) + raise 'mismatch' unless result == pairs + end + end + + Vernier.profile(out: 'socket_get_multi_profile.json') do + start_time = Time.now + while Time.now - start_time < bench_time + result = sock_get_multi(sock, pairs) + raise 'mismatch' unless result == pairs + end + end +end + +if %w[all set_multi].include?(bench_target) + Vernier.profile(out: 'client_set_multi_profile.json') do + start_time = Time.now + # until we port over set_multi, compare the simple loop + # client.set_multi(pairs, 3600, raw: true) while Time.now - start_time < bench_time + while Time.now - start_time < bench_time + pairs.each do |key, value| + client.set(key, value, 3600, raw: true) + end + end + end + + Vernier.profile(out: 'socket_set_multi_profile.json') do + start_time = Time.now + sock_set_multi(sock, pairs) while Time.now - start_time < bench_time + end +end