diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
new file mode 100644
index 00000000..6ed90a1f
--- /dev/null
+++ b/.github/workflows/benchmarks.yml
@@ -0,0 +1,26 @@
+name: Benchmarks
+
+on: [push, pull_request]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v4
+    - name: Install Memcached 1.6.23 # starts two daemons: the default port and 11222
+      working-directory: scripts
+      env:
+        MEMCACHED_VERSION: 1.6.23
+      run: |
+        chmod +x ./install_memcached.sh
+        ./install_memcached.sh
+        memcached -d
+        memcached -d -p 11222
+    - name: Set up Ruby
+      uses: ruby/setup-ruby@v1
+      with:
+        ruby-version: '3.2' # quoted so YAML reads a string, not a float
+        bundler-cache: true # 'bundle install' and cache
+    - name: Run Benchmarks
+      run: RUBY_YJIT_ENABLE=1 BENCH_TARGET=all bundle exec bin/benchmark
diff --git a/.github/workflows/profile.yml b/.github/workflows/profile.yml
new file mode 100644
index 00000000..e4e59004
--- /dev/null
+++ b/.github/workflows/profile.yml
@@ -0,0 +1,38 @@
+name: Profiles
+
+on: [push, pull_request]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v4
+    - name: Install Memcached 1.6.23 # starts a single daemon on the default port
+      working-directory: scripts
+      env:
+        MEMCACHED_VERSION: 1.6.23
+      run: |
+        chmod +x ./install_memcached.sh
+        ./install_memcached.sh
+        memcached -d
+    - name: Set up Ruby
+      uses: ruby/setup-ruby@v1
+      with:
+        ruby-version: '3.4' # quoted so YAML reads a string, not a float
+        bundler-cache: true # 'bundle install' and cache
+    - name: Run Profiles
+      run: RUBY_YJIT_ENABLE=1 BENCH_TARGET=all bundle exec bin/profile
+    - name: Upload profile results # one JSON file per Vernier.profile call in bin/profile
+      uses: actions/upload-artifact@v4
+      with:
+        name: profile-results
+        path: |
+          client_get_profile.json
+          socket_get_profile.json
+          client_set_profile.json
+          socket_set_profile.json
+          client_get_multi_profile.json
+          socket_get_multi_profile.json
+          client_set_multi_profile.json
+          socket_set_multi_profile.json
diff --git a/bin/benchmark b/bin/benchmark
new file mode 100755
index 00000000..8894fb5a
--- /dev/null
+++ b/bin/benchmark
@@ -0,0 +1,255 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+# This helps benchmark current performance of Dalli
+# as well as compare performance of optimized and non-optimized calls like multi-set vs set
+#
+# run with:
+# bundle exec bin/benchmark
+# RUBY_YJIT_ENABLE=1 BENCH_TARGET=get bundle exec bin/benchmark
+require 'bundler/inline'
+require 'json'
+
+gemfile do
+  source 'https://rubygems.org'
+  gem 'benchmark-ips'
+  gem 'logger'
+end
+
+require_relative '../lib/dalli'
+require 'benchmark/ips'
+require 'monitor'
+
+##
+# StringSerializer is a pass-through serializer that avoids the overhead of Marshal or JSON.
+##
+class StringSerializer
+  def self.dump(value)
+    value # handed back untouched: nothing is encoded
+  end
+
+  def self.load(value)
+    value # handed back untouched: nothing is decoded
+  end
+end
+
+dalli_url = ENV['BENCH_CACHE_URL'] || '127.0.0.1:11211' # server string handed to the single-server Dalli client
+bench_target = ENV['BENCH_TARGET'] || 'set' # one of: set, get, get_multi, set_multi, all
+bench_time = (ENV['BENCH_TIME'] || 10).to_i # passed to benchmark-ips as time:
+bench_warmup = (ENV['BENCH_WARMUP'] || 3).to_i # passed to benchmark-ips as warmup:
+bench_payload_size = (ENV['BENCH_PAYLOAD_SIZE'] || 700_000).to_i # size of the single-key payload
+payload = 'B' * bench_payload_size
+TERMINATOR = "\r\n"
+puts "yjit: #{defined?(RubyVM::YJIT) ? RubyVM::YJIT.enabled? : false}" # RubyVM::YJIT is absent on non-YJIT builds
+
+client = Dalli::Client.new(dalli_url, serializer: StringSerializer, compress: false, raw: true)
+multi_client = Dalli::Client.new('localhost:11211,localhost:11222', serializer: StringSerializer, compress: false,
+                                                                    raw: true)
+
+# The raw socket implementation is used to benchmark the performance of dalli & the overhead of the various abstractions
+# in the library.
+sock = TCPSocket.new('127.0.0.1', '11211', connect_timeout: 1) # fixed address: BENCH_CACHE_URL is not consulted here
+sock.setsockopt(Socket::IPPROTO_TCP, Socket::TCP_NODELAY, true)
+sock.setsockopt(Socket::SOL_SOCKET, Socket::SO_KEEPALIVE, true)
+# Benchmarks didn't see any performance gains from increasing the SO_RCVBUF buffer size
+# sock.setsockopt(Socket::SOL_SOCKET, ::Socket::SO_RCVBUF, 1024 * 1024 * 8)
+# Benchmarks did see an improvement in performance when increasing the SO_SNDBUF buffer size
+# sock.setsockopt(Socket::SOL_SOCKET, Socket::SO_SNDBUF, 1024 * 1024 * 8)
+
+# ensure the clients are all connected and working
+client.set('key', payload)
+multi_client.set('multi_key', payload)
+sock.write("set sock_key 0 3600 #{payload.bytesize}\r\n")
+sock.write(payload)
+sock.write(TERMINATOR)
+sock.flush
+sock.readline # clear the buffer
+
+raise 'dalli client mismatch' if payload != client.get('key')
+
+raise 'multi dalli client mismatch' if payload != multi_client.get('multi_key')
+
+sock.write("mg sock_key v\r\n")
+sock.readline # first reply line; the value bytes are read next
+sock_value = sock.read(payload.bytesize)
+sock.read(TERMINATOR.bytesize) # consume the terminator that follows the value
+raise 'sock mismatch' if payload != sock_value
+
+# ensure we have basic data for the benchmarks and get calls
+payload_smaller = 'B' * (bench_payload_size / 10) # one tenth of the single-key payload size
+pairs = {}
+100.times do |i|
+  pairs["multi_#{i}"] = payload_smaller # every key shares the same payload string
+end
+client.quiet do
+  pairs.each do |key, value|
+    client.set(key, value, 3600, raw: true)
+  end
+end
+
+###
+# GC Suite
+# benchmark without GC skewing things: warming/running collect up front, the other two hooks do nothing
+###
+class GCSuite
+  def warming(*)
+    run_gc # arguments are ignored
+  end
+
+  def running(*)
+    run_gc # arguments are ignored
+  end
+
+  def warmup_stats(*); end
+
+  def add_report(*); end
+
+  private
+
+  def run_gc
+    GC.enable
+    GC.start
+    GC.disable # leave GC switched off until the next call re-enables it
+  end
+end
+suite = GCSuite.new # handed to every Benchmark.ips run through x.config(suite:)
+
+# rubocop:disable Metrics/MethodLength
+# rubocop:disable Metrics/PerceivedComplexity
+# rubocop:disable Metrics/AbcSize
+# rubocop:disable Metrics/CyclomaticComplexity
+def sock_get_multi(sock, pairs)
+  count = pairs.length
+  pairs.each_key do |key|
+    count -= 1
+    tail = count.zero? ? '' : 'q'
+    sock.write("mg #{key} v f k #{tail}\r\n")
+  end
+  sock.flush
+  # read all the memcached responses back and build a hash of key value pairs
+  results = {}
+  last_result = false
+  while (line = sock.readline.chomp!(TERMINATOR)) != ''
+    last_result = true if line.start_with?('EN ')
+    next unless line.start_with?('VA ') || last_result
+
+    _, value_length, _flags, key = line.split
+    results[key[1..]] = sock.read(value_length.to_i)
+    sock.read(TERMINATOR.length)
+    break if results.size == pairs.size
+    break if last_result
+  end
+  results
+end
+# rubocop:enable Metrics/MethodLength
+# rubocop:enable Metrics/PerceivedComplexity
+# rubocop:enable Metrics/AbcSize
+# rubocop:enable Metrics/CyclomaticComplexity
+
+if %w[all set].include?(bench_target) # single-key writes: Dalli client vs. the raw socket
+  Benchmark.ips do |x|
+    x.config(warmup: bench_warmup, time: bench_time, suite: suite)
+    x.report('client set') { client.set('key', payload) }
+    # x.report('multi client set') { multi_client.set('string_key', payload) }
+    x.report('raw sock set') do
+      sock.write("ms sock_key #{payload.bytesize} T3600 MS\r\n") # request line; payload and terminator follow
+      sock.write(payload)
+      sock.write("\r\n")
+      sock.flush
+      sock.readline # clear the buffer
+    end
+    x.compare!
+  end
+end
+
+@lock = Monitor.new # wraps the 'get sock non-blocking' report below
+if %w[all get].include?(bench_target)
+  Benchmark.ips do |x|
+    x.config(warmup: bench_warmup, time: bench_time, suite: suite)
+    x.report('get dalli') do
+      result = client.get('key')
+      raise 'mismatch' unless result == payload
+    end
+    # NOTE: while this is the fastest it is not thread safe and is blocking vs IO sharing friendly
+    x.report('get sock') do
+      sock.write("mg sock_key v\r\n")
+      sock.readline # first reply line; the value bytes are read next
+      result = sock.read(payload.bytesize)
+      sock.read(TERMINATOR.bytesize) # consume the terminator that follows the value
+      raise 'mismatch' unless result == payload
+    end
+    # NOTE: This shows that when adding thread safety & non-blocking IO we are slower for single process/thread use case
+    x.report('get sock non-blocking') do
+      @lock.synchronize do
+        sock.write("mg sock_key v\r\n")
+        sock.readline
+        count = payload.bytesize
+        value = String.new(capacity: count + 1)
+        loop do
+          begin
+            value << sock.read_nonblock(count - value.bytesize) # request only the bytes still missing
+          rescue Errno::EAGAIN
+            sock.wait_readable # nothing buffered yet: wait for data, then retry the read
+            retry
+          rescue EOFError
+            puts 'EOFError'
+            break
+          end
+          break if value.bytesize == count # the whole value has arrived
+        end
+        sock.read(TERMINATOR.bytesize)
+        raise 'mismatch' unless value == payload
+      end
+    end
+    x.compare!
+  end
+end
+
+if %w[all get_multi].include?(bench_target) # multi-key reads: Dalli get_multi vs. pipelined mg on the raw socket
+  Benchmark.ips do |x|
+    x.config(warmup: bench_warmup, time: bench_time, suite: suite)
+    x.report('get 100 keys') do
+      result = client.get_multi(pairs.keys)
+      raise 'mismatch' unless result == pairs # every seeded key must come back with its value
+    end
+    x.report('get 100 keys raw sock') do
+      result = sock_get_multi(sock, pairs)
+      raise 'mismatch' unless result == pairs # same check for the raw-socket reader
+    end
+    x.compare!
+  end
+end
+
+if %w[all set_multi].include?(bench_target) # multi-key writes: quiet-mode Dalli sets vs. pipelined ms on the socket
+  Benchmark.ips do |x|
+    x.config(warmup: bench_warmup, time: bench_time, suite: suite)
+    x.report('write 100 keys simple') do
+      client.quiet do
+        pairs.each do |key, value|
+          client.set(key, value, 3600, raw: true)
+        end
+      end
+    end
+    # TODO: uncomment this once we add PR adding set_multi
+    # x.report('multi client set_multi 100') do
+    #   multi_client.set_multi(pairs, 3600, raw: true)
+    # end
+    x.report('write 100 keys rawsock') do
+      count = pairs.length
+      tail = ''
+      value_bytesize = payload_smaller.bytesize # every value in pairs is payload_smaller
+      ttl = 3600
+
+      pairs.each do |key, value|
+        count -= 1
+        tail = count.zero? ? '' : 'q' # only the final request is sent without 'q'
+        sock.write(String.new("ms #{key} #{value_bytesize} c F0 T#{ttl} MS #{tail}\r\n",
+                              capacity: key.size + value_bytesize + 40) << value << TERMINATOR)
+      end
+      sock.flush
+      sock.gets(TERMINATOR) # clear the buffer
+    end
+    # x.report('write_multi 100 keys') { client.set_multi(pairs, 3600, raw: true) }
+    x.compare!
+  end
+end
diff --git a/bin/profile b/bin/profile
new file mode 100755
index 00000000..cd2b32c6
--- /dev/null
+++ b/bin/profile
@@ -0,0 +1,199 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+# This helps profile specific call paths in Dalli
+# finding and fixing performance issues in these profiles should result in improvements in the dalli benchmarks
+#
+# run with:
+# RUBY_YJIT_ENABLE=1 bundle exec bin/profile
+require 'bundler/inline'
+require 'json'
+
+gemfile do
+  source 'https://rubygems.org'
+  gem 'benchmark-ips'
+  gem 'vernier'
+  gem 'logger'
+end
+
+require_relative '../lib/dalli'
+require 'benchmark/ips'
+require 'vernier'
+
+##
+# StringSerializer is a pass-through serializer that avoids the overhead of Marshal or JSON.
+##
+class StringSerializer
+  def self.dump(value)
+    value # handed back untouched: nothing is encoded
+  end
+
+  def self.load(value)
+    value # handed back untouched: nothing is decoded
+  end
+end
+
+dalli_url = ENV['BENCH_CACHE_URL'] || '127.0.0.1:11211' # server string handed to the Dalli client
+bench_target = ENV['BENCH_TARGET'] || 'get' # one of: get, set, get_multi, set_multi, all
+bench_time = (ENV['BENCH_TIME'] || 10).to_i # how long each profiled loop keeps running
+bench_payload_size = (ENV['BENCH_PAYLOAD_SIZE'] || 700_000).to_i # size of the single-key payload
+TERMINATOR = "\r\n"
+puts "yjit: #{defined?(RubyVM::YJIT) ? RubyVM::YJIT.enabled? : false}" # RubyVM::YJIT is absent on non-YJIT builds
+
+client = Dalli::Client.new(dalli_url, serializer: StringSerializer, compress: false)
+
+# The raw socket implementation is used to benchmark the performance of dalli & the overhead of the various abstractions
+# in the library.
+sock = TCPSocket.new('127.0.0.1', '11211', connect_timeout: 1) # fixed address: BENCH_CACHE_URL is not consulted here
+sock.setsockopt(Socket::IPPROTO_TCP, Socket::TCP_NODELAY, true)
+sock.setsockopt(Socket::SOL_SOCKET, Socket::SO_KEEPALIVE, true)
+# Benchmarks didn't see any performance gains from increasing the SO_RCVBUF buffer size
+# sock.setsockopt(Socket::SOL_SOCKET, ::Socket::SO_RCVBUF, 1024 * 1024 * 8)
+# Benchmarks did see an improvement in performance when increasing the SO_SNDBUF buffer size
+# sock.setsockopt(Socket::SOL_SOCKET, Socket::SO_SNDBUF, 1024 * 1024 * 8)
+
+payload = 'B' * bench_payload_size
+dalli_key = 'dalli_key'
+# ensure the clients are all connected and working
+client.set(dalli_key, payload)
+sock.write("set sock_key 0 3600 #{payload.bytesize}\r\n")
+sock.write(payload)
+sock.write(TERMINATOR)
+sock.flush
+sock.readline # clear the buffer
+
+# ensure we have basic data for the benchmarks and get calls
+payload_smaller = 'B' * (bench_payload_size / 10) # one tenth of the single-key payload size
+pairs = {}
+100.times do |i|
+  pairs["multi_#{i}"] = payload_smaller # every key shares the same payload string
+end
+client.quiet do
+  pairs.each do |key, value|
+    client.set(key, value, 3600, raw: true)
+  end
+end
+
+# rubocop:disable Metrics/MethodLength
+# rubocop:disable Metrics/PerceivedComplexity
+# rubocop:disable Metrics/AbcSize
+# rubocop:disable Metrics/CyclomaticComplexity
+def sock_get_multi(sock, pairs)
+  count = pairs.length
+  pairs.each_key do |key|
+    count -= 1
+    tail = count.zero? ? '' : 'q'
+    sock.write("mg #{key} v f k #{tail}\r\n")
+  end
+  sock.flush
+  # read all the memcached responses back and build a hash of key value pairs
+  results = {}
+  last_result = false
+  while (line = sock.readline.chomp!(TERMINATOR)) != ''
+    last_result = true if line.start_with?('EN ')
+    next unless line.start_with?('VA ') || last_result
+
+    _, value_length, _flags, key = line.split
+    results[key[1..]] = sock.read(value_length.to_i)
+    sock.read(TERMINATOR.length)
+    break if results.size == pairs.size
+    break if last_result
+  end
+  results
+end
+# rubocop:enable Metrics/PerceivedComplexity
+# rubocop:enable Metrics/AbcSize
+# rubocop:enable Metrics/CyclomaticComplexity
+
+def sock_set_multi(sock, pairs)
+  count = pairs.length
+  tail = ''
+  ttl = 3600
+
+  pairs.each do |key, value|
+    count -= 1
+    tail = count.zero? ? '' : 'q'
+    sock.write(String.new("ms #{key} #{value.bytesize} c F0 T#{ttl} MS #{tail}\r\n",
+                          capacity: key.size + value.bytesize + 40))
+    sock.write(value)
+    sock.write(TERMINATOR)
+  end
+  sock.flush
+  sock.gets(TERMINATOR) # clear the buffer
+end
+# rubocop:enable Metrics/MethodLength
+
+if %w[all get].include?(bench_target)
+  Vernier.profile(out: 'client_get_profile.json') do
+    start_time = Time.now
+    while Time.now - start_time < bench_time
+      result = client.get(dalli_key)
+      raise 'mismatch' unless result == payload
+    end
+  end
+
+  Vernier.profile(out: 'socket_get_profile.json') do
+    start_time = Time.now
+    while Time.now - start_time < bench_time
+      sock.write("mg sock_key v\r\n")
+      sock.readline
+      result = sock.read(payload.bytesize)
+      sock.read(TERMINATOR.bytesize)
+      raise 'mismatch' unless result == payload
+    end
+  end
+end
+
+if %w[all set].include?(bench_target)
+  Vernier.profile(out: 'client_set_profile.json') do
+    start_time = Time.now
+    client.set(dalli_key, payload, 3600, raw: true) while Time.now - start_time < bench_time
+  end
+
+  Vernier.profile(out: 'socket_set_profile.json') do
+    start_time = Time.now
+    while Time.now - start_time < bench_time
+      sock.write("ms sock_key #{payload.bytesize} T3600 MS\r\n")
+      sock.write(payload)
+      sock.write("\r\n")
+      sock.flush
+      sock.readline # clear the buffer
+    end
+  end
+end
+
+if %w[all get_multi].include?(bench_target)
+  Vernier.profile(out: 'client_get_multi_profile.json') do
+    start_time = Time.now
+    while Time.now - start_time < bench_time
+      result = client.get_multi(pairs.keys)
+      raise 'mismatch' unless result == pairs
+    end
+  end
+
+  Vernier.profile(out: 'socket_get_multi_profile.json') do
+    start_time = Time.now
+    while Time.now - start_time < bench_time
+      result = sock_get_multi(sock, pairs)
+      raise 'mismatch' unless result == pairs
+    end
+  end
+end
+
+if %w[all set_multi].include?(bench_target)
+  Vernier.profile(out: 'client_set_multi_profile.json') do
+    start_time = Time.now
+    # until we port over set_multi, compare the simple loop
+    # client.set_multi(pairs, 3600, raw: true) while Time.now - start_time < bench_time
+    while Time.now - start_time < bench_time
+      pairs.each do |key, value|
+        client.set(key, value, 3600, raw: true)
+      end
+    end
+  end
+
+  Vernier.profile(out: 'socket_set_multi_profile.json') do
+    start_time = Time.now
+    sock_set_multi(sock, pairs) while Time.now - start_time < bench_time
+  end
+end