diff --git a/CMakeLists.txt b/CMakeLists.txt
index ad353d84..80a76e7e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -97,14 +97,18 @@ file(GLOB tilemaker_src_files
 	src/mbtiles.cpp
 	src/mmap_allocator.cpp
 	src/node_stores.cpp
+	src/options_parser.cpp
 	src/osm_lua_processing.cpp
 	src/osm_mem_tiles.cpp
 	src/osm_store.cpp
 	src/output_object.cpp
 	src/pbf_blocks.cpp
 	src/pmtiles.cpp
+	src/pooled_string.cpp
 	src/read_pbf.cpp
 	src/read_shp.cpp
+	src/sharded_node_store.cpp
+	src/sharded_way_store.cpp
 	src/shared_data.cpp
 	src/shp_mem_tiles.cpp
 	src/sorted_node_store.cpp
diff --git a/Makefile b/Makefile
index 45b7c8af..d44245ae 100644
--- a/Makefile
+++ b/Makefile
@@ -106,14 +106,18 @@ tilemaker: \
 	src/mbtiles.o \
 	src/mmap_allocator.o \
 	src/node_stores.o \
+	src/options_parser.o \
 	src/osm_lua_processing.o \
 	src/osm_mem_tiles.o \
 	src/osm_store.o \
 	src/output_object.o \
 	src/pbf_blocks.o \
 	src/pmtiles.o \
+	src/pooled_string.o \
 	src/read_pbf.o \
 	src/read_shp.o \
+	src/sharded_node_store.o \
+	src/sharded_way_store.o \
 	src/shared_data.o \
 	src/shp_mem_tiles.o \
 	src/sorted_node_store.o \
@@ -125,7 +129,51 @@ tilemaker: \
 	src/write_geometry.o
 	$(CXX) $(CXXFLAGS) -o tilemaker $^ $(INC) $(LIB) $(LDFLAGS)
 
-test: test_sorted_way_store
+# Aggregate target: lists every test_* rule below so `make test` runs them all.
+test: \
+	test_append_vector \
+	test_attribute_store \
+	test_deque_map \
+	test_options_parser \
+	test_pooled_string \
+	test_sorted_node_store \
+	test_sorted_way_store
+
+test_append_vector: \
+	src/mmap_allocator.o \
+	test/append_vector.test.o
+	$(CXX) $(CXXFLAGS) -o test.append_vector $^ $(INC) $(LIB) $(LDFLAGS) && ./test.append_vector
+
+test_attribute_store: \
+	src/mmap_allocator.o \
+	src/attribute_store.o \
+	src/pooled_string.o \
+	test/attribute_store.test.o
+	$(CXX) $(CXXFLAGS) -o test.attribute_store $^ $(INC) $(LIB) $(LDFLAGS) && ./test.attribute_store
+
+test_deque_map: \
+	test/deque_map.test.o
+	$(CXX) $(CXXFLAGS) -o test.deque_map $^ $(INC) $(LIB) $(LDFLAGS) && ./test.deque_map
+
+test_options_parser: \
+	src/options_parser.o \
+	test/options_parser.test.o
+	$(CXX) $(CXXFLAGS) -o test.options_parser $^ $(INC) $(LIB) $(LDFLAGS) && ./test.options_parser
+
+test_pooled_string: \
+	src/mmap_allocator.o \
+	src/pooled_string.o \
+	test/pooled_string.test.o
+	$(CXX) $(CXXFLAGS) -o test.pooled_string $^ $(INC) $(LIB) $(LDFLAGS) && ./test.pooled_string
+
+test_sorted_node_store: \
+	src/external/streamvbyte_decode.o \
+	src/external/streamvbyte_encode.o \
+	src/external/streamvbyte_zigzag.o \
+	src/mmap_allocator.o \
+	src/sorted_node_store.o \
+	test/sorted_node_store.test.o
+	$(CXX) $(CXXFLAGS) -o test.sorted_node_store $^ $(INC) $(LIB) $(LDFLAGS) && ./test.sorted_node_store
 
 test_sorted_way_store: \
 	src/external/streamvbyte_decode.o \
@@ -133,7 +179,7 @@ test_sorted_way_store: \
 	src/external/streamvbyte_zigzag.o \
 	src/mmap_allocator.o \
 	src/sorted_way_store.o \
-	src/sorted_way_store.test.o
+	test/sorted_way_store.test.o
 	$(CXX) $(CXXFLAGS) -o test.sorted_way_store $^ $(INC) $(LIB) $(LDFLAGS) && ./test.sorted_way_store
 
 
@@ -153,6 +199,6 @@ install:
 	install docs/man/tilemaker.1 ${DESTDIR}${MANPREFIX}/man1/
 
 clean:
-	rm -f tilemaker src/*.o src/external/*.o include/*.o include/*.pb.h
+	rm -f tilemaker src/*.o src/external/*.o include/*.o include/*.pb.h test/*.o
 
 .PHONY: install
diff --git a/include/append_vector.h b/include/append_vector.h
new file mode 100644
index 00000000..07531217
--- /dev/null
+++ b/include/append_vector.h
@@ -0,0 +1,204 @@
+#ifndef _APPEND_VECTOR_H
+#define _APPEND_VECTOR_H
+
+#include "mmap_allocator.h"
+#include <cstddef>
+#include <cstdint>
+#include <deque>
+#include <iterator>
+#include <vector>
+#include <queue>
+
+// Tilemaker collects OutputObjects in a list that
+// - spills to disk
+// - only gets appended to
+//
+// Vector is great for linear access, but resizes cause expensive disk I/O to
+// copy elements.
+//
+// Deque is great for growing without disk I/O, but it allocates in blocks of 512,
+// which is inefficient for linear access.
+//
+// Instead, we author a limited vector-of-vectors class that allocates in bigger chunks,
+// to get the best of both worlds.
+
+// Number of elements stored in each chunk.
+#define APPEND_VECTOR_SIZE 8192
+namespace AppendVectorNS {
+	template <class T>
+	class AppendVector {
+	public:
+		// Iterator that addresses an element as (chunk index, offset within chunk).
+		struct Iterator {
+			using iterator_category = std::random_access_iterator_tag;
+			using difference_type   = std::ptrdiff_t;
+			using value_type        = T;
+			using pointer           = T*;
+			using reference         = T&;
+
+			// vec/offset are taken as int32_t to match the members they initialise;
+			// a narrower parameter type would silently truncate large chunk indexes.
+			Iterator(AppendVector<T>& appendVector, int32_t vec, int32_t offset):
+				appendVector(&appendVector), vec(vec), offset(offset) {}
+
+			Iterator():
+				appendVector(nullptr), vec(0), offset(0) {}
+
+			bool operator<(const Iterator& other) const {
+				if (vec != other.vec)
+					return vec < other.vec;
+
+				return offset < other.offset;
+			}
+
+			bool operator>=(const Iterator& other) const {
+				return !(*this < other);
+			}
+
+			Iterator operator-(int delta) const {
+				Iterator rv = *this;
+				rv.setAbsolute(absolute() - delta);
+				return rv;
+			}
+
+			Iterator operator+(int delta) const {
+				Iterator rv = *this;
+				rv.setAbsolute(absolute() + delta);
+				return rv;
+			}
+
+			bool operator==(const Iterator& other) const {
+				return appendVector == other.appendVector && vec == other.vec && offset == other.offset;
+			}
+
+			bool operator!=(const Iterator& other) const {
+				return !(*this == other);
+			}
+
+			// Distance in elements between two positions.
+			std::ptrdiff_t operator-(const Iterator& other) const {
+				return absolute() - other.absolute();
+			}
+
+			reference operator*() const {
+				return appendVector->vecs[vec][offset];
+			}
+
+			pointer operator->() const {
+				return &appendVector->vecs[vec][offset];
+			}
+
+			Iterator& operator+= (int delta) {
+				setAbsolute(absolute() + delta);
+				return *this;
+			}
+
+			Iterator& operator-= (int delta) {
+				setAbsolute(absolute() - delta);
+				return *this;
+			}
+
+			// Prefix increment
+			Iterator& operator++() {
+				offset++;
+				if (offset == APPEND_VECTOR_SIZE) {
+					offset = 0;
+					vec++;
+				}
+				return *this;
+			}
+
+			// Postfix increment
+			Iterator operator++(int) { Iterator tmp = *this; ++(*this); return tmp; }
+
+			// Prefix decrement
+			Iterator& operator--() {
+				if (offset > 0) {
+					offset--;
+				} else {
+					vec--;
+					offset = APPEND_VECTOR_SIZE - 1;
+				}
+
+				return *this;
+			}
+
+			// Postfix decrement
+			Iterator operator--(int) { Iterator tmp = *this; --(*this); return tmp; }
+
+		private:
+			// Flat element index of this position. Widened to 64 bits before the
+			// multiplication so that large chunk indexes cannot overflow int.
+			int64_t absolute() const {
+				return static_cast<int64_t>(vec) * APPEND_VECTOR_SIZE + offset;
+			}
+
+			// Re-point this iterator at the given flat element index.
+			void setAbsolute(int64_t position) {
+				vec = static_cast<int32_t>(position / APPEND_VECTOR_SIZE);
+				offset = static_cast<int32_t>(position % APPEND_VECTOR_SIZE);
+			}
+
+			mutable AppendVector<T>* appendVector;
+			int32_t vec, offset;
+		};
+
+		AppendVector():
+			count(0),
+			vecs(1) {
+		}
+
+		// Drop all elements, leaving a single empty, pre-reserved chunk.
+		void clear() {
+			count = 0;
+			vecs.clear();
+			vecs.push_back(std::vector<T, mmap_allocator<T>>());
+			vecs.back().reserve(APPEND_VECTOR_SIZE);
+		}
+
+		size_t size() const {
+			return count;
+		}
+
+		T& operator [](int idx) {
+			return vecs[idx / APPEND_VECTOR_SIZE][idx % APPEND_VECTOR_SIZE];
+		}
+
+		Iterator begin() {
+			return Iterator(*this, 0, 0);
+		}
+
+		Iterator end() {
+			return Iterator(
+				*this,
+				static_cast<int32_t>(vecs.size() - 1),
+				static_cast<int32_t>(count % APPEND_VECTOR_SIZE)
+			);
+		}
+
+		// Append a copy of `el`, opening a new chunk once the current one is full.
+		void push_back(const T& el) {
+			if (vecs.back().capacity() == 0)
+				vecs.back().reserve(APPEND_VECTOR_SIZE);
+
+			vecs.back().push_back(el);
+
+			// Compare against the chunk size rather than capacity(): reserve() may
+			// legally allocate more than requested, and indexing assumes every
+			// full chunk holds exactly APPEND_VECTOR_SIZE elements.
+			if (vecs.back().size() == APPEND_VECTOR_SIZE) {
+				vecs.push_back(std::vector<T, mmap_allocator<T>>());
+				vecs.back().reserve(APPEND_VECTOR_SIZE);
+			}
+
+			count++;
+		}
+
+		size_t count;
+		std::deque<std::vector<T, mmap_allocator<T>>> vecs;
+	};
+}
+
+#undef APPEND_VECTOR_SIZE
+
+#endif
diff --git a/include/attribute_store.h b/include/attribute_store.h
index ad1aa4e1..3aea19cf 100644
--- a/include/attribute_store.h
+++ b/include/attribute_store.h
@@ -10,6 +10,8 @@
 #include <boost/functional/hash.hpp>
 #include <boost/container/flat_map.hpp>
 #include <vector>
+#include "pooled_string.h"
+#include "deque_map.h"
 
 /* AttributeStore - global dictionary for attributes */
 
@@ -39,34 +41,75 @@ class AttributeKeyStore {
 	std::map<const std::string*, uint16_t, string_ptr_less_than> keys2index;
 };
 
-enum class AttributePairType: char { False = 0, True = 1, Float = 2, String = 3 };
+enum class AttributePairType: char { Bool = 0, Float = 1, String = 2 };
 // AttributePair is a key/value pair (with minzoom)
+#pragma pack(push, 1)
 struct AttributePair {
-	std::string stringValue_;
-	float floatValue_;
-	short keyIndex;
-	char minzoom;
-	AttributePairType valueType;
+	unsigned short keyIndex : 9;     // unsigned so all 9 bits hold key indexes (0..511)
+	AttributePairType valueType : 3;
+	unsigned char minzoom : 4;       // unsigned so zooms 8..15 do not wrap negative
+	union {
+		float floatValue_;
+		PooledString stringValue_;
+	};
 
 	AttributePair(uint32_t keyIndex, bool value, char minzoom)
-		: keyIndex(keyIndex), valueType(value ? AttributePairType::True : AttributePairType::False), minzoom(minzoom)
+		: keyIndex(keyIndex), valueType(AttributePairType::Bool), minzoom(minzoom), floatValue_(value ? 1 : 0)
 	{
 	}
-	AttributePair(uint32_t keyIndex, const std::string& value, char minzoom)
+	AttributePair(uint32_t keyIndex, const PooledString& value, char minzoom)
 		: keyIndex(keyIndex), valueType(AttributePairType::String), stringValue_(value), minzoom(minzoom)
 	{
 	}
 	AttributePair(uint32_t keyIndex, float value, char minzoom)
-		: keyIndex(keyIndex), valueType(AttributePairType::Float), floatValue_(value), minzoom(minzoom)
+		: keyIndex(keyIndex), valueType(AttributePairType::Float), minzoom(minzoom), floatValue_(value)
 	{
 	}
 
+	// Copies the header bit-fields, then only the union member valueType selects.
+	AttributePair(const AttributePair& other):
+		keyIndex(other.keyIndex), valueType(other.valueType), minzoom(other.minzoom)
+	{
+		const bool numeric = hasBoolValue() || hasFloatValue();
+		if (numeric)
+			floatValue_ = other.floatValue_;
+		else
+			stringValue_ = other.stringValue_;
+	}
+
+	// Assigns the header bit-fields, then only the union member valueType selects.
+	AttributePair& operator=(const AttributePair& other) {
+		keyIndex = other.keyIndex;
+		valueType = other.valueType;
+		minzoom = other.minzoom;
+
+		if (hasBoolValue() || hasFloatValue())
+			floatValue_ = other.floatValue_;
+		else
+			stringValue_ = other.stringValue_;
+
+		return *this;
+	}
+
+	// Orders by minzoom, then key, then value type, then the value itself.
+	bool operator<(const AttributePair& other) const {
+		if (minzoom != other.minzoom) return minzoom < other.minzoom;
+		if (keyIndex != other.keyIndex) return keyIndex < other.keyIndex;
+		if (valueType != other.valueType) return valueType < other.valueType;
+		switch (valueType) {
+			case AttributePairType::String: return pooledString() < other.pooledString();
+			case AttributePairType::Bool: return boolValue() < other.boolValue();
+			case AttributePairType::Float: return floatValue() < other.floatValue();
+		}
+		throw std::runtime_error("Invalid type in attribute store");
+	}
+
 	bool operator==(const AttributePair &other) const {
 		if (minzoom!=other.minzoom || keyIndex!=other.keyIndex || valueType!=other.valueType) return false;
 		if (valueType == AttributePairType::String)
 			return stringValue_ == other.stringValue_;
 
-		if (valueType == AttributePairType::Float)
+		if (valueType == AttributePairType::Float || valueType == AttributePairType::Bool)
 			return floatValue_ == other.floatValue_;
 
 		return true;
@@ -74,13 +117,16 @@ struct AttributePair {
 
 	bool hasStringValue() const { return valueType == AttributePairType::String; }
 	bool hasFloatValue() const { return valueType == AttributePairType::Float; }
-	bool hasBoolValue() const { return valueType == AttributePairType::True || valueType == AttributePairType::False; };
+	bool hasBoolValue() const { return valueType == AttributePairType::Bool; }
 
-	const std::string& stringValue() const { return stringValue_; }
+	const PooledString& pooledString() const { return stringValue_; }
+	const std::string stringValue() const { return stringValue_.toString(); }
 	float floatValue() const { return floatValue_; }
-	bool boolValue() const { return valueType == AttributePairType::True; }
+	bool boolValue() const { return floatValue_; }
+
+	void ensureStringIsOwned();
 
-	static bool isHot(const AttributePair& pair, const std::string& keyName) {
+	static bool isHot(const std::string& keyName, const std::string& value) {
 		// Is this pair a candidate for the hot pool?
 
 		// Hot pairs are pairs that we think are likely to be re-used, like
@@ -89,25 +135,11 @@ struct AttributePair {
 		// The trick is that we commit to putting them in the hot pool
 		// before we know if we were right.
 
-		// All boolean pairs are eligible.
-		if (pair.hasBoolValue())
-			return true;
-
-		// Small integers are eligible.
-		if (pair.hasFloatValue()) {
-			float v = pair.floatValue();
-
-			if (ceil(v) == v && v >= 0 && v <= 25)
-				return true;
-		}
-
-		// The remaining things should be strings, but just in case...
-		if (!pair.hasStringValue())
-			return false;
+		// The rules for floats/booleans are managed in their addAttribute call.
 
 		// Only strings that are IDish are eligible: only lowercase letters.
 		bool ok = true;
-		for (const auto& c: pair.stringValue()) {
+		for (const auto& c: value) {
 			if (c != '-' && c != '_' && (c < 'a' || c > 'z'))
 				return false;
 		}
@@ -124,9 +156,10 @@ struct AttributePair {
 		boost::hash_combine(rv, keyIndex);
 		boost::hash_combine(rv, valueType);
 
-		if(hasStringValue())
-			boost::hash_combine(rv, stringValue());
-		else if(hasFloatValue())
+		if(hasStringValue()) {
+			const char* data = pooledString().data();
+			boost::hash_range(rv, data, data + pooledString().size());
+		} else if(hasFloatValue())
 			boost::hash_combine(rv, floatValue());
 		else if(hasBoolValue())
 			boost::hash_combine(rv, boolValue());
@@ -137,6 +170,7 @@ struct AttributePair {
 		return rv;
 	}
 };
+#pragma pack(pop)
 
 
 // We shard the cold pools to reduce the odds of lock contention on
@@ -153,40 +187,22 @@ class AttributePairStore {
 public:
 	AttributePairStore():
 		finalized(false),
-		pairs(ATTRIBUTE_SHARDS),
-		pairsMaps(ATTRIBUTE_SHARDS),
-		pairsMutex(ATTRIBUTE_SHARDS),
-		hotShardSize(0)
+		pairsMutex(ATTRIBUTE_SHARDS)
 	{
-		// NB: the hot shard is stored in its own, pre-allocated vector.
-		// pairs[0] is _not_ the hot shard
-		hotShard.reserve(1 << 16);
-		for (size_t i = 0; i < 1 << 16; i++)
-			hotShard.push_back(AttributePair(0, false, 0));
+		// The "hot" shard has a capacity of 64K, the others are unbounded.
+		pairs.push_back(DequeMap<AttributePair>(1 << 16));
+		// Reserve offset 0 as a sentinel
+		pairs[0].add(AttributePair(0, false, 0));
+		for (size_t i = 1; i < ATTRIBUTE_SHARDS; i++)
+			pairs.push_back(DequeMap<AttributePair>());
 	}
 
 	void finalize() { finalized = true; }
 	const AttributePair& getPair(uint32_t i) const;
 	const AttributePair& getPairUnsafe(uint32_t i) const;
-	uint32_t addPair(const AttributePair& pair, bool isHot);
-
-	struct key_value_less_ptr {
-		bool operator()(AttributePair const* lhs, AttributePair const* rhs) const {            
-			if (lhs->minzoom != rhs->minzoom)
-				return lhs->minzoom < rhs->minzoom;
-			if (lhs->keyIndex != rhs->keyIndex)
-				return lhs->keyIndex < rhs->keyIndex;
-			if (lhs->valueType != rhs->valueType) return lhs->valueType < rhs->valueType;
-
-			if (lhs->hasStringValue()) return lhs->stringValue() < rhs->stringValue();
-			if (lhs->hasBoolValue()) return lhs->boolValue() < rhs->boolValue();
-			if (lhs->hasFloatValue()) return lhs->floatValue() < rhs->floatValue();
-			throw std::runtime_error("Invalid type in attribute store");
-		}
-	}; 
+	uint32_t addPair(AttributePair& pair, bool isHot);
 
-	std::vector<std::deque<AttributePair>> pairs;
-	std::vector<boost::container::flat_map<const AttributePair*, uint32_t, AttributePairStore::key_value_less_ptr>> pairsMaps;
+	std::vector<DequeMap<AttributePair>> pairs;
 
 private:
 	bool finalized;
@@ -198,41 +214,37 @@ class AttributePairStore {
 	// we suspect will be popular. It only ever has 64KB items,
 	// so that we can reference it with a short.
 	mutable std::vector<std::mutex> pairsMutex;
-	std::atomic<uint32_t> hotShardSize;
-	std::vector<AttributePair> hotShard;
 };
 
 // AttributeSet is a set of AttributePairs
 // = the complete attributes for one object
 struct AttributeSet {
 
-	struct less_ptr {
-		bool operator()(const AttributeSet* lhs, const AttributeSet* rhs) const {            
-			if (lhs->useVector != rhs->useVector)
-				return lhs->useVector < rhs->useVector;
-
-			if (lhs->useVector) {
-				if (lhs->intValues.size() != rhs->intValues.size())
-					return lhs->intValues.size() < rhs->intValues.size();
-
-				for (int i = 0; i < lhs->intValues.size(); i++) {
-					if (lhs->intValues[i] != rhs->intValues[i]) {
-						return lhs->intValues[i] < rhs->intValues[i];
-					}
-				}
+	bool operator<(const AttributeSet& other) const {
+		if (useVector != other.useVector)
+			return useVector < other.useVector;
 
-				return false;
-			}
+		if (useVector) {
+			if (intValues.size() != other.intValues.size())
+				return intValues.size() < other.intValues.size();
 
-			for (int i = 0; i < sizeof(lhs->shortValues)/sizeof(lhs->shortValues[0]); i++) {
-				if (lhs->shortValues[i] != rhs->shortValues[i]) {
-					return lhs->shortValues[i] < rhs->shortValues[i];
+			for (int i = 0; i < intValues.size(); i++) {
+				if (intValues[i] != other.intValues[i]) {
+					return intValues[i] < other.intValues[i];
 				}
 			}
 
 			return false;
 		}
-	}; 
+
+		for (int i = 0; i < sizeof(shortValues)/sizeof(shortValues[0]); i++) {
+			if (shortValues[i] != other.shortValues[i]) {
+				return shortValues[i] < other.shortValues[i];
+			}
+		}
+
+		return false;
+	}
 
 	size_t hash() const {
 		// Values are in canonical form after finalizeSet is called, so
@@ -253,6 +265,7 @@ struct AttributeSet {
 		return idx;
 	}
 
+	bool operator!=(const AttributeSet& other) const { return !(*this == other); }
 	bool operator==(const AttributeSet &other) const {
 		// Equivalent if, for every value in values, there is a value in other.values
 		// whose pair is the same.
@@ -380,6 +393,8 @@ struct AttributeSet {
 struct AttributeStore {
 	AttributeIndex add(AttributeSet &attributes);
 	std::vector<const AttributePair*> getUnsafe(AttributeIndex index) const;
+	void reset(); // used for testing
+	size_t size() const;
 	void reportSize() const;
 	void finalize();
 
@@ -390,7 +405,6 @@ struct AttributeStore {
 	AttributeStore():
 		finalized(false),
 		sets(ATTRIBUTE_SHARDS),
-		setsMaps(ATTRIBUTE_SHARDS),
 		setsMutex(ATTRIBUTE_SHARDS),
 		lookups(0) {
 	}
@@ -400,8 +414,7 @@ struct AttributeStore {
 
 private:
 	bool finalized;
-	std::vector<std::deque<AttributeSet>> sets;
-	std::vector<boost::container::flat_map<const AttributeSet*, uint32_t, AttributeSet::less_ptr>> setsMaps;
+	std::vector<DequeMap<AttributeSet>> sets;
 	mutable std::vector<std::mutex> setsMutex;
 
 	mutable std::mutex mutex;
diff --git a/include/deque_map.h b/include/deque_map.h
new file mode 100644
index 00000000..bcb4ddbc
--- /dev/null
+++ b/include/deque_map.h
@@ -0,0 +1,123 @@
+#ifndef DEQUE_MAP_H
+#define DEQUE_MAP_H
+
+#include <algorithm>
+#include <boost/range/irange.hpp>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <deque>
+#include <vector>
+
+// A class which looks deep within the soul of some instance of
+// a class T and assigns it a number based on the order in which
+// it joined (or reminds it of its number).
+//
+// Used to translate an 8-byte pointer into a 4-byte ID that can be
+// used repeatedly.
+//
+// T must provide operator< and operator==, which are used to order and
+// de-duplicate entries.
+template <class T>
+class DequeMap {
+public:
+	DequeMap(): maxSize(0) {}
+	DequeMap(uint32_t maxSize): maxSize(maxSize) {}
+
+	// True when a non-zero maxSize was given and that many entries are stored.
+	bool full() const {
+		return maxSize != 0 && size() == maxSize;
+	}
+
+	// If `entry` is already in the map, return its index.
+	// Otherwise, if maxSize is `0`, or greater than the number of entries in the map,
+	// add the item and return its index.
+	// Otherwise, return -1.
+	int32_t add(const T& entry) {
+		// Search to see if we've already got this entry.
+		const auto it = lowerBound(entry);
+
+		// We do, return its index.
+		if (it != keys.cend() && objects[*it] == entry)
+			return *it;
+
+		if (maxSize > 0 && objects.size() >= maxSize)
+			return -1;
+
+		// Remember where the new key belongs before `keys` can reallocate.
+		const auto keysOffset = it - keys.cbegin();
+
+		// We don't, so store it...
+		const uint32_t newIndex = objects.size();
+		objects.push_back(entry);
+
+		// Amortize growth
+		if (keys.capacity() < keys.size() + 1)
+			keys.reserve(keys.capacity() * 1.5);
+
+		// ...and insert its index into our keys vector, shifting later keys down.
+		keys.insert(keys.begin() + keysOffset, newIndex);
+		return newIndex;
+	}
+
+	void clear() {
+		objects.clear();
+		keys.clear();
+	}
+
+	// Returns the index of `entry` if present, -1 otherwise.
+	int32_t find(const T& entry) const {
+		const auto it = lowerBound(entry);
+		if (it != keys.cend() && objects[*it] == entry)
+			return *it;
+
+		return -1;
+	}
+
+	// Bounds-checked access by insertion index.
+	const T& at(uint32_t index) const {
+		return objects.at(index);
+	}
+
+	size_t size() const { return objects.size(); }
+
+	// Minimal forward iterator that visits entries in sorted (operator<) order.
+	struct iterator {
+		const DequeMap<T>& dm;
+		size_t offset;
+		iterator(const DequeMap<T>& dm, size_t offset): dm(dm), offset(offset) {}
+		void operator++() { offset++; }
+		bool operator!=(const iterator& other) const { return offset != other.offset; }
+		const T& operator*() const { return dm.objects[dm.keys[offset]]; }
+	};
+
+	iterator begin() const { return iterator{*this, 0}; }
+	iterator end() const { return iterator{*this, keys.size()}; }
+
+private:
+	// Position in `keys` of the first key whose object is not less than `entry`.
+	std::vector<uint32_t>::const_iterator lowerBound(const T& entry) const {
+		return std::lower_bound(
+			keys.begin(),
+			keys.end(),
+			entry,
+			// Both sides are taken without copying T; a by-value parameter would
+			// copy the searched-for entry on every comparison.
+			[this](uint32_t key, const T& needle) {
+				return objects[key] < needle;
+			}
+		);
+	}
+
+	uint32_t maxSize;
+
+	// Using a deque is necessary, as it provides pointer-stability for previously
+	// added objects when it grows the storage (as opposed to, e.g., vector).
+	std::deque<T> objects;
+
+	// Whereas `objects` is ordered by insertion-time, `keys` is sorted such that
+	// objects[keys[0]] < objects[keys[1]] < ... < objects[keys[N-1]]
+	// according to operator< of T.
+	std::vector<uint32_t> keys;
+};
+#endif
diff --git a/include/helpers.h b/include/helpers.h
index 7cb9c027..029a801d 100644
--- a/include/helpers.h
+++ b/include/helpers.h
@@ -3,7 +3,8 @@
 #define _HELPERS_H
 
 #include <zlib.h>
-#include "geom.h"
+#include <sstream>
+#include <vector>
 
 // General helper routines
 
diff --git a/include/node_store.h b/include/node_store.h
index cc84aba2..76fe18b3 100644
--- a/include/node_store.h
+++ b/include/node_store.h
@@ -23,6 +23,11 @@ class NodeStore
 	// Accessors
 	virtual size_t size() const = 0;
 	virtual LatpLon at(NodeID i) const = 0;
+
+	virtual bool contains(size_t shard, NodeID id) const = 0;
+	virtual NodeStore& shard(size_t shard) = 0;
+	virtual const NodeStore& shard(size_t shard) const = 0;
+	virtual size_t shards() const = 0;
 };
 
 #endif
diff --git a/include/node_stores.h b/include/node_stores.h
index c5151bec..05d00f4e 100644
--- a/include/node_stores.h
+++ b/include/node_stores.h
@@ -5,6 +5,7 @@
 #include <memory>
 #include "node_store.h"
 #include "sorted_node_store.h"
+#include "sharded_node_store.h"
 #include "mmap_allocator.h"
 
 class BinarySearchNodeStore : public NodeStore
@@ -19,10 +20,16 @@ class BinarySearchNodeStore : public NodeStore
 	LatpLon at(NodeID i) const override;
 	size_t size() const override;
 	void insert(const std::vector<element_t>& elements) override;
-	void clear() { 
+	void clear() override {
 		reopen();
 	}
-	void batchStart() {}
+	void batchStart() override {}
+
+	bool contains(size_t shard, NodeID id) const override;
+	NodeStore& shard(size_t shard) override { return *this; }
+	const NodeStore& shard(size_t shard) const override { return *this; }
+	size_t shards() const override { return 1; }
+	
 
 private: 
 	mutable std::mutex mutex;
@@ -49,7 +56,14 @@ class CompactNodeStore : public NodeStore
 	void insert(const std::vector<element_t>& elements) override;
 	void clear() override;
 	void finalize(size_t numThreads) override {}
-	void batchStart() {}
+	void batchStart() override {}
+
+	// CompactNodeStore has no metadata to know whether or not it contains
+	// a node, so it's not suitable for use in sharded scenarios.
+	bool contains(size_t shard, NodeID id) const override { return true; }
+	NodeStore& shard(size_t shard) override { return *this; }
+	const NodeStore& shard(size_t shard) const override { return *this; }
+	size_t shards() const override { return 1; }
 
 private: 
 	// @brief Insert a latp/lon pair.
diff --git a/include/options_parser.h b/include/options_parser.h
new file mode 100644
index 00000000..3ca73785
--- /dev/null
+++ b/include/options_parser.h
@@ -0,0 +1,63 @@
+#ifndef OPTIONS_PARSER_H
+#define OPTIONS_PARSER_H
+
+#include <cstdint>
+#include <exception>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace OptionsParser {
+	// Exception carrying an explanatory message, retrievable through what().
+	struct OptionException : std::exception {
+		OptionException(std::string message): message(std::move(message)) {}
+
+		/// Returns the explanatory string.
+		const char* what() const noexcept override {
+			return message.c_str();
+		}
+
+		private:
+			std::string message;
+	};
+
+	enum class OutputMode: char { File = 0, MBTiles = 1, PMTiles = 2 };
+
+	// Flags grouped under Options::osm; all default to off / empty.
+	struct OsmOptions {
+		std::string storeFile;
+		bool fast = false;
+		bool compact = false;
+		bool skipIntegrity = false;
+		bool uncompressedNodes = false;
+		bool uncompressedWays = false;
+		bool materializeGeometries = false;
+		// lazyGeometries is the inverse of materializeGeometries. It can be passed
+		// to override an implicit materializeGeometries, as in the non-store case.
+		bool lazyGeometries = false;
+		bool shardStores = false;
+	};
+
+	// Top-level option set; this is the type returned by parse().
+	struct Options {
+		std::vector<std::string> inputFiles;
+		std::string luaFile;
+		std::string jsonFile;
+		uint32_t threadNum = 0;
+		std::string outputFile;
+		std::string bbox;
+
+		OsmOptions osm;
+		bool showHelp = false;
+		bool verbose = false;
+		bool mergeSqlite = false;
+		bool mapsplit = false;
+		OutputMode outputMode = OutputMode::File;
+		bool logTileTimings = false;
+	};
+
+	Options parse(const int argc, const char* argv[]);
+	void showHelp();
+}
+
+#endif
diff --git a/include/osm_mem_tiles.h b/include/osm_mem_tiles.h
index a6266ea3..3c920b08 100644
--- a/include/osm_mem_tiles.h
+++ b/include/osm_mem_tiles.h
@@ -6,10 +6,15 @@
 #include "osm_store.h"
 #include "geometry_cache.h"
 
-#define OSM_THRESHOLD (1ull << 35)
-#define USE_WAY_STORE (1ull << 35)
-#define IS_WAY(x) (((x) >> 35) == (USE_WAY_STORE >> 35))
-#define OSM_ID(x) ((x) & 0b111111111111111111111111111111111)
+// NB: Currently, USE_NODE_STORE and USE_WAY_STORE are equivalent.
+// If we permit LayerAsCentroid to be generated from the OSM stores,
+// this will have to change.
+#define OSM_THRESHOLD (1ull << TILE_DATA_ID_SIZE)
+#define USE_NODE_STORE (2ull << TILE_DATA_ID_SIZE)
+#define IS_NODE(x) (((x) >> TILE_DATA_ID_SIZE) == (USE_NODE_STORE >> TILE_DATA_ID_SIZE))
+#define USE_WAY_STORE (1ull << TILE_DATA_ID_SIZE)
+#define IS_WAY(x) (((x) >> TILE_DATA_ID_SIZE) == (USE_WAY_STORE >> TILE_DATA_ID_SIZE))
+#define OSM_ID(x) ((x) & 0b1111111111111111111111111111111111)
 
 class NodeStore;
 class WayStore;
@@ -32,18 +37,21 @@ class OsmMemTiles : public TileDataSource {
 		const WayStore& wayStore
 	);
 
+	std::string name() const override { return "osm"; }
+
 	Geometry buildWayGeometry(
 		const OutputGeometryType geomType, 
 		const NodeID objectID,
 		const TileBbox &bbox
 	) override;
+	LatpLon buildNodeGeometry(NodeID const objectID, const TileBbox &bbox) const override;
 
 
 	void Clear();
 
 private:
-	void populateLinestring(Linestring& ls, NodeID objectID);
-	Linestring& getOrBuildLinestring(NodeID objectID);
+	void populateLinestring(Linestring& ls, NodeID objectID) const;
+	Linestring& getOrBuildLinestring(NodeID objectID) const;
 	void populateMultiPolygon(MultiPolygon& dst, NodeID objectID) override;
 
 	const NodeStore& nodeStore;
diff --git a/include/output_object.h b/include/output_object.h
index 3d2d862e..385fd46d 100644
--- a/include/output_object.h
+++ b/include/output_object.h
@@ -22,9 +22,6 @@ std::ostream& operator<<(std::ostream& os, OutputGeometryType geomType);
 
 /**
  * \brief OutputObject - any object (node, linestring, polygon) to be outputted to tiles
-
- * Possible future improvements to save memory:
- * - use a global dictionary for attribute key/values
 */
 #pragma pack(push, 4)
 class OutputObject {
diff --git a/include/pooled_string.h b/include/pooled_string.h
new file mode 100644
index 00000000..56d44453
--- /dev/null
+++ b/include/pooled_string.h
@@ -0,0 +1,64 @@
+#ifndef _POOLED_STRING_H
+#define _POOLED_STRING_H
+
+// std::string is quite general:
+// - mutable
+// - unlimited length
+// - capacity can differ from size
+// - can deallocate its dynamic memory
+//
+// Our use case, by contrast, is immutable, bounded strings that live for the
+// duration of the process.
+//
+// This gives us some room to have less memory overhead, especially on
+// g++, whose implementation of std::string requires 32 bytes.
+//
+// Thus, we implement `PooledString`. It has a size of 16 bytes, and a small
+// string optimization for strings <= 15 bytes. (We will separately teach
+// AttributePair to encode Latin-character strings more efficiently, so that many
+// strings of size 24 or less fit in 15 bytes.)
+//
+// If it needs to allocate memory, it does so from a shared pool. It is unable
+// to free the memory once allocated.
+
+// PooledString has one of three modes:
+// - [126:127] = 00: small-string, length is in [120:125], lower 15 bytes are string
+// - [126:127] = 10: pooled string, table is in bytes 1..3, offset in bytes 4..5, length in bytes 6..7
+// - [126:127] = 11: pointer to std::string, pointer is in bytes 8..15
+//
+// Note that the pointer mode is not safe to be stored. It exists just to allow
+// lookups in the AttributePair map before deciding to allocate a string.
+
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+#include <string>
+
+namespace PooledStringNS {
+  class PooledString {
+    public:
+      // Create a short string or heap string, long-lived.
+      PooledString(const std::string& str);
+
+      // Create a pointer-mode string referring to `str` - only valid so long as
+      // the string that is pointed to is valid.
+      PooledString(const std::string* str);
+      size_t size() const;
+      bool operator<(const PooledString& other) const;
+      bool operator==(const PooledString& other) const;
+      bool operator!=(const PooledString& other) const;
+      std::string toString() const;
+      const char* data() const;
+      void ensureStringIsOwned();
+
+    private:
+      // 0..3 is index into table, 4..5 is offset, 6..7 is length
+      // NOTE(review): the mode list at the top of this file says the table index
+      // is in bytes 1..3, not 0..3 - confirm against pooled_string.cpp.
+      uint8_t storage[16];
+  };
+}
+
+using PooledString = PooledStringNS::PooledString;
+
+#endif
diff --git a/include/read_pbf.h b/include/read_pbf.h
index b934a563..94adb8e0 100644
--- a/include/read_pbf.h
+++ b/include/read_pbf.h
@@ -53,11 +53,14 @@ class PbfReader
 	using pbfreader_generate_stream = std::function< std::shared_ptr<std::istream> () >;
 
 	int ReadPbfFile(
+		uint shards,
 		bool hasSortTypeThenID,
 		const std::unordered_set<std::string>& nodeKeys,
 		unsigned int threadNum,
 		const pbfreader_generate_stream& generate_stream,
-		const pbfreader_generate_output& generate_output
+		const pbfreader_generate_output& generate_output,
+		const NodeStore& nodeStore,
+		const WayStore& wayStore
 	);
 
 	// Read tags into a map from a way/node/relation
@@ -79,17 +82,28 @@ class PbfReader
 		const BlockMetadata& blockMetadata,
 		const std::unordered_set<std::string>& nodeKeys,
 		bool locationsOnWays,
-		ReadPhase phase
+		ReadPhase phase,
+		uint shard,
+		uint effectiveShard
 	);
 	bool ReadNodes(OsmLuaProcessing &output, PrimitiveGroup &pg, PrimitiveBlock const &pb, const std::unordered_set<int> &nodeKeyPositions);
 
-	bool ReadWays(OsmLuaProcessing &output, PrimitiveGroup &pg, PrimitiveBlock const &pb, bool locationsOnWays);
+	bool ReadWays(
+		OsmLuaProcessing &output,
+		PrimitiveGroup &pg,
+		PrimitiveBlock const &pb,
+		bool locationsOnWays,
+		uint shard,
+		uint effectiveShards
+	);
 	bool ScanRelations(OsmLuaProcessing &output, PrimitiveGroup &pg, PrimitiveBlock const &pb);
 	bool ReadRelations(
 		OsmLuaProcessing& output,
 		PrimitiveGroup& pg,
 		const PrimitiveBlock& pb,
-		const BlockMetadata& blockMetadata
+		const BlockMetadata& blockMetadata,
+		uint shard,
+		uint effectiveShards
 	);
 
 	inline bool RelationIsType(Relation const &rel, int typeKey, int val) {
diff --git a/include/sharded_node_store.h b/include/sharded_node_store.h
new file mode 100644
index 00000000..836c34ef
--- /dev/null
+++ b/include/sharded_node_store.h
@@ -0,0 +1,32 @@
+#ifndef _SHARDED_NODE_STORE
+#define _SHARDED_NODE_STORE
+
+#include <functional>
+#include <memory>
+#include "node_store.h"
+
+// NodeStore that holds a list of NodeStores (`stores`) plus a factory that returns
+// NodeStores (`createNodeStore`); shard(i) returns *stores[i] with no bounds check.
+class ShardedNodeStore : public NodeStore {
+public:
+	ShardedNodeStore(std::function<std::shared_ptr<NodeStore>()> createNodeStore);
+	~ShardedNodeStore();
+	void reopen() override;
+	void finalize(size_t threadNum) override;
+	LatpLon at(NodeID i) const override;
+	size_t size() const override;
+	void batchStart() override;
+	void insert(const std::vector<element_t>& elements) override;
+	void clear() override { reopen(); }
+
+	bool contains(size_t shard, NodeID id) const override;
+	NodeStore& shard(size_t shard) override { return *stores[shard]; }
+	const NodeStore& shard(size_t shard) const override { return *stores[shard]; }
+	size_t shards() const override;
+
+private:
+	std::function<std::shared_ptr<NodeStore>()> createNodeStore;
+	std::vector<std::shared_ptr<NodeStore>> stores;
+};
+
+#endif
diff --git a/include/sharded_way_store.h b/include/sharded_way_store.h
new file mode 100644
index 00000000..40a3d331
--- /dev/null
+++ b/include/sharded_way_store.h
@@ -0,0 +1,35 @@
+#ifndef _SHARDED_WAY_STORE
+#define _SHARDED_WAY_STORE
+
+#include <functional>
+#include <memory>
+#include "way_store.h"
+
+class NodeStore;
+// WayStore that holds a list of WayStores (`stores`), a factory returning WayStores, and a NodeStore reference.
+class ShardedWayStore : public WayStore {
+public:
+	ShardedWayStore(std::function<std::shared_ptr<WayStore>()> createWayStore, const NodeStore& nodeStore);
+	~ShardedWayStore();
+	void reopen() override;
+	void batchStart() override;
+	std::vector<LatpLon> at(WayID wayid) const override;
+	bool requiresNodes() const override;
+	void insertLatpLons(std::vector<WayStore::ll_element_t> &newWays) override;
+	void insertNodes(const std::vector<std::pair<WayID, std::vector<NodeID>>>& newWays) override;
+	void clear() override;
+	std::size_t size() const override;
+	void finalize(unsigned int threadNum) override;
+	// Shard-level access (SortedWayStore and BinarySearchWayStore report shards() == 1).
+	bool contains(size_t shard, WayID id) const override;
+	WayStore& shard(size_t shard) override;
+	const WayStore& shard(size_t shard) const override;
+	size_t shards() const override;
+	
+private:
+	std::function<std::shared_ptr<WayStore>()> createWayStore;
+	const NodeStore& nodeStore; // held by reference: must outlive this object
+	std::vector<std::shared_ptr<WayStore>> stores;
+};
+
+#endif
diff --git a/include/shared_data.h b/include/shared_data.h
index 23ba9a06..45c6e34b 100644
--- a/include/shared_data.h
+++ b/include/shared_data.h
@@ -7,6 +7,7 @@
 
 #include "rapidjson/document.h"
 
+#include "options_parser.h"
 #include "osm_store.h"
 #include "output_object.h"
 #include "mbtiles.h"
@@ -61,10 +62,6 @@ class LayerDefinition {
 	std::string serialiseToJSON() const;
 };
 
-const int OUTPUT_FILE = 0;
-const int OUTPUT_MBTILES = 1;
-const int OUTPUT_PMTILES = 2;
-
 ///\brief Config read from JSON to control behavior of program
 class Config {
 	
@@ -91,7 +88,7 @@ class SharedData {
 
 public:
 	const class LayerDefinition &layers;
-	int outputMode;
+	OptionsParser::OutputMode outputMode;
 	bool mergeSqlite;
 	MBTiles mbtiles;
 	PMTiles pmtiles;
diff --git a/include/shp_mem_tiles.h b/include/shp_mem_tiles.h
index 267a0090..508921ff 100644
--- a/include/shp_mem_tiles.h
+++ b/include/shp_mem_tiles.h
@@ -11,6 +11,8 @@ class ShpMemTiles : public TileDataSource
 public:
 	ShpMemTiles(size_t threadNum, uint baseZoom);
 
+	std::string name() const override { return "shp"; }
+
 	void CreateNamedLayerIndex(const std::string& layerName);
 
 	// Used in shape file loading
diff --git a/include/sorted_node_store.h b/include/sorted_node_store.h
index 5c156ad3..61fdfad3 100644
--- a/include/sorted_node_store.h
+++ b/include/sorted_node_store.h
@@ -3,6 +3,7 @@
 
 #include "node_store.h"
 #include "mmap_allocator.h"
+#include <atomic>
 #include <map>
 #include <memory>
 #include <mutex>
@@ -65,10 +66,15 @@ class SortedNodeStore : public NodeStore
 	size_t size() const override;
 	void batchStart() override;
 	void insert(const std::vector<element_t>& elements) override;
-	void clear() { 
+	void clear() override {
 		reopen();
 	}
 
+	bool contains(size_t shard, NodeID id) const override;
+	NodeStore& shard(size_t shard) override { return *this; }
+	const NodeStore& shard(size_t shard) const override { return *this; }
+	size_t shards() const override { return 1; }
+
 private: 
 	// When true, store chunks compressed. Only store compressed if the
 	// chunk is sufficiently large.
@@ -82,6 +88,15 @@ class SortedNodeStore : public NodeStore
 	// multiple threads. They'll get folded into the index during finalize()
 	std::map<NodeID, std::vector<element_t>> orphanage;
 	std::vector<std::vector<element_t>> workerBuffers;
+
+	std::atomic<uint64_t> totalGroups;
+	std::atomic<uint64_t> totalNodes;
+	std::atomic<uint64_t> totalGroupSpace;
+	std::atomic<uint64_t> totalAllocatedSpace;
+	std::atomic<uint64_t> totalChunks;
+	std::atomic<uint64_t> chunkSizeFreqs[257];
+	std::atomic<uint64_t> groupSizeFreqs[257];
+
 	void collectOrphans(const std::vector<element_t>& orphans);
 	void publishGroup(const std::vector<element_t>& nodes);
 };
diff --git a/include/sorted_way_store.h b/include/sorted_way_store.h
index 145e467b..b99ba7de 100644
--- a/include/sorted_way_store.h
+++ b/include/sorted_way_store.h
@@ -1,6 +1,7 @@
 #ifndef _SORTED_WAY_STORE_H
 #define _SORTED_WAY_STORE_H
 
+#include <atomic>
 #include <map>
 #include <memory>
 #include <mutex>
@@ -89,10 +90,15 @@ class SortedWayStore: public WayStore {
 	std::vector<LatpLon> at(WayID wayid) const override;
 	bool requiresNodes() const override { return true; }
 	void insertLatpLons(std::vector<WayStore::ll_element_t> &newWays) override;
-	const void insertNodes(const std::vector<std::pair<WayID, std::vector<NodeID>>>& newWays) override;
+	void insertNodes(const std::vector<std::pair<WayID, std::vector<NodeID>>>& newWays) override;
 	void clear() override;
 	std::size_t size() const override;
 	void finalize(unsigned int threadNum) override;
+
+	bool contains(size_t shard, WayID id) const override;
+	WayStore& shard(size_t shard) override { return *this; }
+	const WayStore& shard(size_t shard) const override { return *this; }
+	size_t shards() const override { return 1; }
 	
 	static uint16_t encodeWay(
 		const std::vector<NodeID>& way,
@@ -113,6 +119,13 @@ class SortedWayStore: public WayStore {
 	// multiple threads. They'll get folded into the index during finalize()
 	std::map<WayID, std::vector<std::pair<WayID, std::vector<NodeID>>>> orphanage;
 	std::vector<std::vector<std::pair<WayID, std::vector<NodeID>>>> workerBuffers;
+
+	std::atomic<uint64_t> totalWays;
+	std::atomic<uint64_t> totalNodes;
+	std::atomic<uint64_t> totalGroups;
+	std::atomic<uint64_t> totalGroupSpace;
+	std::atomic<uint64_t> totalChunks;
+
 	void collectOrphans(const std::vector<std::pair<WayID, std::vector<NodeID>>>& orphans);
 	void publishGroup(const std::vector<std::pair<WayID, std::vector<NodeID>>>& ways);
 };
diff --git a/include/tile_data.h b/include/tile_data.h
index 814b53ce..6b59ee3f 100644
--- a/include/tile_data.h
+++ b/include/tile_data.h
@@ -8,7 +8,11 @@
 #include <memory>
 #include <boost/sort/sort.hpp>
 #include "output_object.h"
+#include "append_vector.h"
 #include "clip_cache.h"
+#include "mmap_allocator.h"
+
+#define TILE_DATA_ID_SIZE 34
 
 typedef std::vector<class TileDataSource *> SourceList;
 
@@ -45,16 +49,40 @@ struct OutputObjectXYID {
 };
 
 template<typename OO> void finalizeObjects(
+	const std::string& name,
 	const size_t& threadNum,
 	const unsigned int& baseZoom,
-	typename std::vector<std::vector<OO>>::iterator begin,
-	typename std::vector<std::vector<OO>>::iterator end
+	typename std::vector<AppendVectorNS::AppendVector<OO>>::iterator begin,
+	typename std::vector<AppendVectorNS::AppendVector<OO>>::iterator end,
+	typename std::vector<std::vector<OO>>& lowZoom
 	) {
-	for (typename std::vector<std::vector<OO>>::iterator it = begin; it != end; it++) {
+	size_t z6OffsetDivisor = baseZoom >= CLUSTER_ZOOM ? (1 << (baseZoom - CLUSTER_ZOOM)) : 1;
+#ifdef CLOCK_MONOTONIC
+	timespec startTs, endTs;
+	clock_gettime(CLOCK_MONOTONIC, &startTs);
+#endif
+
+	int i = -1;
+	for (auto it = begin; it != end; it++) {
+		i++;
+		if (it->size() > 0 || i % 10 == 0 || i == 4095) {
+			std::cout << "\r" << name << ": finalizing z6 tile " << (i + 1) << "/" << CLUSTER_ZOOM_AREA;
+
+#ifdef CLOCK_MONOTONIC
+			clock_gettime(CLOCK_MONOTONIC, &endTs);
+			uint64_t elapsedNs = 1e9 * (endTs.tv_sec - startTs.tv_sec) + endTs.tv_nsec - startTs.tv_nsec;
+			std::cout << " (" << std::to_string((uint32_t)(elapsedNs / 1e6)) << " ms)";
+#endif
+			std::cout << std::flush;
+		}
 		if (it->size() == 0)
 			continue;
 
-		it->shrink_to_fit();
+		// We track a separate copy of low zoom objects to avoid scanning large
+		// lists of objects that may be on slow disk storage.
+		for (auto objectIt = it->begin(); objectIt != it->end(); objectIt++)
+			if (objectIt->oo.minZoom < CLUSTER_ZOOM)
+				lowZoom[i].push_back(*objectIt);
 
 		// If the user is doing a a small extract, there are few populated
 		// entries in `object`.
@@ -102,17 +130,18 @@ template<typename OO> void finalizeObjects(
 			},
 			threadNum
 		);
-
 	}
+
+	std::cout << std::endl;
 }
 
 template<typename OO> void collectTilesWithObjectsAtZoomTemplate(
 	const unsigned int& baseZoom,
-	const typename std::vector<std::vector<OO>>::iterator objects,
+	const typename std::vector<AppendVectorNS::AppendVector<OO>>::iterator objects,
 	const size_t size,
-	const unsigned int zoom,
-	TileCoordinatesSet& output
+	std::vector<TileCoordinatesSet>& zooms
 ) {
+	size_t maxZoom = zooms.size() - 1;
 	uint16_t z6OffsetDivisor = baseZoom >= CLUSTER_ZOOM ? (1 << (baseZoom - CLUSTER_ZOOM)) : 1;
 	int64_t lastX = -1;
 	int64_t lastY = -1;
@@ -126,13 +155,18 @@ template<typename OO> void collectTilesWithObjectsAtZoomTemplate(
 			TileCoordinate baseY = z6y * z6OffsetDivisor + objects[i][j].y;
 
 			// Translate the x, y at the requested zoom level
-			TileCoordinate x = baseX / (1 << (baseZoom - zoom));
-			TileCoordinate y = baseY / (1 << (baseZoom - zoom));
+			TileCoordinate x = baseX / (1 << (baseZoom - maxZoom));
+			TileCoordinate y = baseY / (1 << (baseZoom - maxZoom));
 
 			if (lastX != x || lastY != y) {
-				output.set(x, y);
 				lastX = x;
 				lastY = y;
+
+				for (int zoom = maxZoom; zoom >= 0; zoom--) {
+					zooms[zoom].set(x, y);
+					x /= 2;
+					y /= 2;
+				}
 			}
 		}
 	}
@@ -148,107 +182,124 @@ inline OutputObjectID outputObjectWithId<OutputObjectXYID>(const OutputObjectXYI
 	return OutputObjectID({ input.oo, input.id });
 }
 
+// Appends to `output` each object from `objects` whose tile at `zoom` is `dstIndex`
+// and whose minZoom <= zoom. Throws std::runtime_error if zoom >= CLUSTER_ZOOM.
+// `objects` is a const reference so the per-z6-tile lists are not copied on each call.
+template<typename OO> void collectLowZoomObjectsForTile(
+	const unsigned int& baseZoom,
+	const std::vector<std::vector<OO>>& objects,
+	unsigned int zoom,
+	const TileCoordinates& dstIndex,
+	std::vector<OutputObjectID>& output
+) {
+	if (zoom >= CLUSTER_ZOOM)
+		throw std::runtime_error("collectLowZoomObjectsForTile should not be called for high zooms");
+
+	uint16_t z6OffsetDivisor = baseZoom >= CLUSTER_ZOOM ? (1 << (baseZoom - CLUSTER_ZOOM)) : 1;
+
+	for (size_t i = 0; i < objects.size(); i++) {
+		const size_t z6x = i / CLUSTER_ZOOM_WIDTH;
+		const size_t z6y = i % CLUSTER_ZOOM_WIDTH;
+
+		for (const OO& object : objects[i]) {
+			// Compute the x, y at the base zoom level
+			TileCoordinate baseX = z6x * z6OffsetDivisor + object.x;
+			TileCoordinate baseY = z6y * z6OffsetDivisor + object.y;
+
+			// Translate the x, y at the requested zoom level
+			TileCoordinate x = baseX / (1 << (baseZoom - zoom));
+			TileCoordinate y = baseY / (1 << (baseZoom - zoom));
+
+			if (dstIndex.x == x && dstIndex.y == y && object.oo.minZoom <= zoom)
+				output.push_back(outputObjectWithId(object));
+		}
+	}
+}
+
 template<typename OO> void collectObjectsForTileTemplate(
 	const unsigned int& baseZoom,
-	typename std::vector<std::vector<OO>>::iterator objects,
+	typename std::vector<AppendVectorNS::AppendVector<OO>>::iterator objects,
 	size_t iStart,
 	size_t iEnd,
 	unsigned int zoom,
 	const TileCoordinates& dstIndex,
 	std::vector<OutputObjectID>& output
 ) {
+	if (zoom < CLUSTER_ZOOM)
+		throw std::runtime_error("collectObjectsForTileTemplate should not be called for low zooms");
+
 	uint16_t z6OffsetDivisor = baseZoom >= CLUSTER_ZOOM ? (1 << (baseZoom - CLUSTER_ZOOM)) : 1;
 
 	for (size_t i = iStart; i < iEnd; i++) {
-		const size_t z6x = i / CLUSTER_ZOOM_WIDTH;
-		const size_t z6y = i % CLUSTER_ZOOM_WIDTH;
+		// If z >= 6, we can compute the exact bounds within the objects array.
+		// Translate to the base zoom, then do a binary search to find
+		// the starting point.
+		TileCoordinate z6x = dstIndex.x / (1 << (zoom - CLUSTER_ZOOM));
+		TileCoordinate z6y = dstIndex.y / (1 << (zoom - CLUSTER_ZOOM));
+
+		TileCoordinate baseX = dstIndex.x * (1 << (baseZoom - zoom));
+		TileCoordinate baseY = dstIndex.y * (1 << (baseZoom - zoom));
+
+		Z6Offset needleX = baseX - z6x * z6OffsetDivisor;
+		Z6Offset needleY = baseY - z6y * z6OffsetDivisor;
+
+		// Kind of gross that we have to do this. Might be better if we split
+		// into two arrays, one of x/y and one of OOs. Would have better locality for
+		// searching, too.
+		OutputObject dummyOo(POINT_, 0, 0, 0, 0);
+		const size_t bz = baseZoom;
+
+		const OO targetXY = {dummyOo, needleX, needleY };
+		auto iter = std::lower_bound(
+			objects[i].begin(),
+			objects[i].end(),
+			targetXY,
+			[bz](const OO& a, const OO& b) {
+				// Cluster by parent zoom, so that a subsequent search
+				// can find a contiguous range of entries for any tile
+				// at zoom 6 or higher.
+				const size_t aX = a.x;
+				const size_t aY = a.y;
+				const size_t bX = b.x;
+				const size_t bY = b.y;
+				for (size_t z = CLUSTER_ZOOM; z <= bz; z++) {
+					const auto aXz = aX / (1 << (bz - z));
+					const auto aYz = aY / (1 << (bz - z));
+					const auto bXz = bX / (1 << (bz - z));
+					const auto bYz = bY / (1 << (bz - z));
 
-		if (zoom >= CLUSTER_ZOOM) {
-			// If z >= 6, we can compute the exact bounds within the objects array.
-			// Translate to the base zoom, then do a binary search to find
-			// the starting point.
-			TileCoordinate z6x = dstIndex.x / (1 << (zoom - CLUSTER_ZOOM));
-			TileCoordinate z6y = dstIndex.y / (1 << (zoom - CLUSTER_ZOOM));
-
-			TileCoordinate baseX = dstIndex.x * (1 << (baseZoom - zoom));
-			TileCoordinate baseY = dstIndex.y * (1 << (baseZoom - zoom));
-
-			Z6Offset needleX = baseX - z6x * z6OffsetDivisor;
-			Z6Offset needleY = baseY - z6y * z6OffsetDivisor;
-
-			// Kind of gross that we have to do this. Might be better if we split
-			// into two arrays, one of x/y and one of OOs. Would have better locality for
-			// searching, too.
-			OutputObject dummyOo(POINT_, 0, 0, 0, 0);
-			const size_t bz = baseZoom;
-
-			const OO targetXY = {dummyOo, needleX, needleY };
-			auto iter = std::lower_bound(
-				objects[i].begin(),
-				objects[i].end(),
-				targetXY,
-				[bz](const OO& a, const OO& b) {
-					// Cluster by parent zoom, so that a subsequent search
-					// can find a contiguous range of entries for any tile
-					// at zoom 6 or higher.
-					const size_t aX = a.x;
-					const size_t aY = a.y;
-					const size_t bX = b.x;
-					const size_t bY = b.y;
-					for (size_t z = CLUSTER_ZOOM; z <= bz; z++) {
-						const auto aXz = aX / (1 << (bz - z));
-						const auto aYz = aY / (1 << (bz - z));
-						const auto bXz = bX / (1 << (bz - z));
-						const auto bYz = bY / (1 << (bz - z));
-
-						if (aXz != bXz)
-							return aXz < bXz;
-
-						if (aYz != bYz)
-							return aYz < bYz;
-					}
-					return false;
-				}
-			);
-			for (; iter != objects[i].end(); iter++) {
-				// Compute the x, y at the base zoom level
-				TileCoordinate baseX = z6x * z6OffsetDivisor + iter->x;
-				TileCoordinate baseY = z6y * z6OffsetDivisor + iter->y;
-
-				// Translate the x, y at the requested zoom level
-				TileCoordinate x = baseX / (1 << (baseZoom - zoom));
-				TileCoordinate y = baseY / (1 << (baseZoom - zoom));
-
-				if (dstIndex.x == x && dstIndex.y == y) {
-					if (iter->oo.minZoom <= zoom) {
-						output.push_back(outputObjectWithId(*iter));
-					}
-				} else {
-					// Short-circuit when we're confident we'd no longer see relevant matches.
-					// We've ordered the entries in `objects` such that all objects that
-					// share the same tile at any zoom are in contiguous runs.
-					//
-					// Thus, as soon as we fail to find a match, we can stop looking.
-					break;
-				}
+					if (aXz != bXz)
+						return aXz < bXz;
 
+					if (aYz != bYz)
+						return aYz < bYz;
+				}
+				return false;
 			}
-		} else {
-			for (size_t j = 0; j < objects[i].size(); j++) {
-				// Compute the x, y at the base zoom level
-				TileCoordinate baseX = z6x * z6OffsetDivisor + objects[i][j].x;
-				TileCoordinate baseY = z6y * z6OffsetDivisor + objects[i][j].y;
-
-				// Translate the x, y at the requested zoom level
-				TileCoordinate x = baseX / (1 << (baseZoom - zoom));
-				TileCoordinate y = baseY / (1 << (baseZoom - zoom));
-
-				if (dstIndex.x == x && dstIndex.y == y) {
-					if (objects[i][j].oo.minZoom <= zoom) {
-						output.push_back(outputObjectWithId(objects[i][j]));
-					}
+		);
+
+		for (; iter != objects[i].end(); iter++) {
+			// Compute the x, y at the base zoom level
+			TileCoordinate baseX = z6x * z6OffsetDivisor + iter->x;
+			TileCoordinate baseY = z6y * z6OffsetDivisor + iter->y;
+
+			// Translate the x, y at the requested zoom level
+			TileCoordinate x = baseX / (1 << (baseZoom - zoom));
+			TileCoordinate y = baseY / (1 << (baseZoom - zoom));
+
+			if (dstIndex.x == x && dstIndex.y == y) {
+				if (iter->oo.minZoom <= zoom) {
+					output.push_back(outputObjectWithId(*iter));
 				}
+			} else {
+				// Short-circuit when we're confident we'd no longer see relevant matches.
+				// We've ordered the entries in `objects` such that all objects that
+				// share the same tile at any zoom are in contiguous runs.
+				//
+				// Thus, as soon as we fail to find a match, we can stop looking.
+				break;
 			}
+
 		}
 	}
 }
@@ -275,6 +326,7 @@ class TileDataSource {
 	std::vector<std::pair<size_t, multi_linestring_store_t*>> availableMultiLinestringStoreLeases;
 	std::vector<std::pair<size_t, multi_polygon_store_t*>> availableMultiPolygonStoreLeases;
 
+	virtual std::string name() const = 0;
 
 protected:	
 	size_t numShards;
@@ -292,8 +344,10 @@ class TileDataSource {
 	//
 	// If config.include_ids is true, objectsWithIds will be populated.
 	// Otherwise, objects.
-	std::vector<std::vector<OutputObjectXY>> objects;
-	std::vector<std::vector<OutputObjectXYID>> objectsWithIds;
+	std::vector<AppendVectorNS::AppendVector<OutputObjectXY>> objects;
+	std::vector<std::vector<OutputObjectXY>> lowZoomObjects;
+	std::vector<AppendVectorNS::AppendVector<OutputObjectXYID>> objectsWithIds;
+	std::vector<std::vector<OutputObjectXYID>> lowZoomObjectsWithIds;
 	
 	// rtree index of large objects
 	using oo_rtree_param_type = boost::geometry::index::quadratic<128>;
@@ -313,9 +367,9 @@ class TileDataSource {
 public:
 	TileDataSource(size_t threadNum, unsigned int baseZoom, bool includeID);
 
-	void collectTilesWithObjectsAtZoom(uint zoom, TileCoordinatesSet& output);
+	void collectTilesWithObjectsAtZoom(std::vector<TileCoordinatesSet>& zooms);
 
-	void collectTilesWithLargeObjectsAtZoom(uint zoom, TileCoordinatesSet& output);
+	void collectTilesWithLargeObjectsAtZoom(std::vector<TileCoordinatesSet>& zooms);
 
 	void collectObjectsForTile(uint zoom, TileCoordinates dstIndex, std::vector<OutputObjectID>& output);
 	void finalize(size_t threadNum);
@@ -355,7 +409,7 @@ class TileDataSource {
 	);
 
 	virtual Geometry buildWayGeometry(OutputGeometryType const geomType, NodeID const objectID, const TileBbox &bbox);
-	LatpLon buildNodeGeometry(OutputGeometryType const geomType, NodeID const objectID, const TileBbox &bbox) const;
+	virtual LatpLon buildNodeGeometry(NodeID const objectID, const TileBbox &bbox) const;
 
 	void open() {
 		// Put something at index 0 of all stores so that 0 can be used
@@ -373,18 +427,18 @@ class TileDataSource {
 	NodeID storePoint(Point const &input);
 
 	inline size_t getShard(NodeID id) const {
-		// Note: we only allocate 35 bits for the IDs. This allows us to
-		// use bit 36 for TileDataSource-specific handling (e.g.,
+		// Note: we only allocate TILE_DATA_ID_SIZE (34) bits for the IDs. This allows us to
+		// use bits 35 and 36 for TileDataSource-specific handling (e.g.,
 		// OsmMemTiles may want to generate points/ways on the fly by
 		// referring to the WayStore).
 
-		return id >> (35 - shardBits);
+		return id >> (TILE_DATA_ID_SIZE - shardBits);
 	}
 
 	virtual void populateMultiPolygon(MultiPolygon& dst, NodeID objectID);
 
 	inline size_t getId(NodeID id) const {
-		return id & (~(~0ull << (35 - shardBits)));
+		return id & (~(~0ull << (TILE_DATA_ID_SIZE - shardBits)));
 	}
 
 	const Point& retrievePoint(NodeID id) const {
@@ -426,9 +480,9 @@ class TileDataSource {
 	}
 };
 
-TileCoordinatesSet getTilesAtZoom(
+void populateTilesAtZoom(
 	const std::vector<class TileDataSource *>& sources,
-	unsigned int zoom
+	std::vector<TileCoordinatesSet>& zooms
 );
 
 #endif //_TILE_DATA_H
diff --git a/include/way_store.h b/include/way_store.h
index 8650cbea..36862344 100644
--- a/include/way_store.h
+++ b/include/way_store.h
@@ -17,10 +17,15 @@ class WayStore {
 	virtual std::vector<LatpLon> at(WayID wayid) const = 0;
 	virtual bool requiresNodes() const = 0;
 	virtual void insertLatpLons(std::vector<ll_element_t>& newWays) = 0;
-	virtual const void insertNodes(const std::vector<std::pair<WayID, std::vector<NodeID>>>& newWays) = 0;
+	virtual void insertNodes(const std::vector<std::pair<WayID, std::vector<NodeID>>>& newWays) = 0;
 	virtual void clear() = 0;
 	virtual std::size_t size() const = 0;
 	virtual void finalize(unsigned int threadNum) = 0;
+
+	virtual bool contains(size_t shard, WayID id) const = 0;
+	virtual WayStore& shard(size_t shard) = 0;
+	virtual const WayStore& shard(size_t shard) const = 0;
+	virtual size_t shards() const = 0;
 };
 
 #endif
diff --git a/include/way_stores.h b/include/way_stores.h
index dfb5f74c..0f94e845 100644
--- a/include/way_stores.h
+++ b/include/way_stores.h
@@ -5,6 +5,7 @@
 #include <mutex>
 #include "way_store.h"
 #include "sorted_way_store.h"
+#include "sharded_way_store.h"
 
 class BinarySearchWayStore: public WayStore {
 
@@ -16,11 +17,16 @@ class BinarySearchWayStore: public WayStore {
 	std::vector<LatpLon> at(WayID wayid) const override;
 	bool requiresNodes() const override { return false; }
 	void insertLatpLons(std::vector<WayStore::ll_element_t> &newWays) override;
-	const void insertNodes(const std::vector<std::pair<WayID, std::vector<NodeID>>>& newWays) override;
+	void insertNodes(const std::vector<std::pair<WayID, std::vector<NodeID>>>& newWays) override;
 	void clear() override;
 	std::size_t size() const override;
 	void finalize(unsigned int threadNum) override;
 
+	bool contains(size_t shard, WayID id) const override;
+	WayStore& shard(size_t shard) override { return *this; }
+	const WayStore& shard(size_t shard) const override { return *this; }
+	size_t shards() const override { return 1; }
+
 private:
 	mutable std::mutex mutex;
 	std::unique_ptr<map_t> mLatpLonLists;
diff --git a/src/attribute_store.cpp b/src/attribute_store.cpp
index f4f9f299..6fbacbe9 100644
--- a/src/attribute_store.cpp
+++ b/src/attribute_store.cpp
@@ -55,19 +55,38 @@ const std::string& AttributeKeyStore::getKeyUnsafe(uint16_t index) const {
 	return keys[index];
 }
 
+// AttributePair
+void AttributePair::ensureStringIsOwned() {
+	// An AttributePair headed for long-term storage must not keep pointing
+	// at a non-long-lived std::string. Bool and Float pairs are skipped.
+	const bool holdsString =
+		valueType != AttributePairType::Bool && valueType != AttributePairType::Float;
+	if (holdsString)
+		stringValue_.ensureStringIsOwned();
+}
+
 // AttributePairStore
-thread_local boost::container::flat_map<const AttributePair*, uint32_t, AttributePairStore::key_value_less_ptr> tlsHotShardMap;
-thread_local uint16_t tlsHotShardSize = 0;
+thread_local DequeMap<AttributePair> tlsHotShard(1 << 16);
 const AttributePair& AttributePairStore::getPair(uint32_t i) const {
 	uint32_t shard = i >> (32 - SHARD_BITS);
 	uint32_t offset = i & (~(~0u << (32 - SHARD_BITS)));
 
-	if (shard == 0)
-		return hotShard[offset];
+	if (shard == 0) {
+		if (offset < tlsHotShard.size()) // served from this thread's copy, no lock taken
+			return tlsHotShard.at(offset);
+		// Not in the thread-local copy yet: re-copy pairs[0] under its mutex, then retry.
+		{
+			std::lock_guard<std::mutex> lock(pairsMutex[0]);
+			tlsHotShard = pairs[0];
+		}
+
+		return tlsHotShard.at(offset);
+	}
 
 	std::lock_guard<std::mutex> lock(pairsMutex[shard]);
 	return pairs[shard].at(offset);
 };
+
 const AttributePair& AttributePairStore::getPairUnsafe(uint32_t i) const {
 	// NB: This is unsafe if called before the PBF has been fully read.
 	// If called during the output phase, it's safe.
@@ -75,44 +94,36 @@ const AttributePair& AttributePairStore::getPairUnsafe(uint32_t i) const {
 	uint32_t shard = i >> (32 - SHARD_BITS);
 	uint32_t offset = i & (~(~0u << (32 - SHARD_BITS)));
 
-	if (shard == 0)
-		return hotShard[offset];
-
 	return pairs[shard].at(offset);
 };
 
-uint32_t AttributePairStore::addPair(const AttributePair& pair, bool isHot) {
+uint32_t AttributePairStore::addPair(AttributePair& pair, bool isHot) {
 	if (isHot) {
 		{
 			// First, check our thread-local map.
-			const auto& it = tlsHotShardMap.find(&pair);
-			if (it != tlsHotShardMap.end())
-				return it->second;
+			const auto& index = tlsHotShard.find(pair);
+			if (index != -1)
+				return index;
 		}
+
 		// Not found, ensure our local map is up-to-date for future calls,
 		// and fall through to the main map.
-		//
-		// Note that we can read `hotShard` without a lock
-		while (tlsHotShardSize < hotShardSize.load()) {
-			tlsHotShardSize++;
-			tlsHotShardMap[&hotShard[tlsHotShardSize]] = tlsHotShardSize;
+		if (!tlsHotShard.full()) {
+			std::lock_guard<std::mutex> lock(pairsMutex[0]);
+			tlsHotShard = pairs[0];
 		}
 
 		// This might be a popular pair, worth re-using.
 		// Have we already assigned it a hot ID?
 		std::lock_guard<std::mutex> lock(pairsMutex[0]);
-		const auto& it = pairsMaps[0].find(&pair);
-		if (it != pairsMaps[0].end())
-			return it->second;
+		const auto& index = pairs[0].find(pair);
+		if (index != -1)
+			return index;
 
-		if (hotShardSize.load() < 1 << 16) {
-			hotShardSize++;
-			uint32_t offset = hotShardSize.load();
-
-			hotShard[offset] = pair;
-			const AttributePair* ptr = &hotShard[offset];
+		if (!pairs[0].full()) {
+			pair.ensureStringIsOwned();
+			uint32_t offset = pairs[0].add(pair);
 			uint32_t rv = (0 << (32 - SHARD_BITS)) + offset;
-			pairsMaps[0][ptr] = rv;
 			return rv;
 		}
 	}
@@ -129,20 +140,17 @@ uint32_t AttributePairStore::addPair(const AttributePair& pair, bool isHot) {
 	if (shard == 0) shard = 1;
 
 	std::lock_guard<std::mutex> lock(pairsMutex[shard]);
-	const auto& it = pairsMaps[shard].find(&pair);
-	if (it != pairsMaps[shard].end())
-		return it->second;
+	const auto& index = pairs[shard].find(pair);
+	if (index != -1)
+		return (shard << (32 - SHARD_BITS)) + index;
 
-	uint32_t offset = pairs[shard].size();
+	pair.ensureStringIsOwned();
+	uint32_t offset = pairs[shard].add(pair);
 
 	if (offset >= (1 << (32 - SHARD_BITS)))
 		throw std::out_of_range("pair shard overflow");
 
-	pairs[shard].push_back(pair);
-	const AttributePair* ptr = &pairs[shard][offset];
 	uint32_t rv = (shard << (32 - SHARD_BITS)) + offset;
-
-	pairsMaps[shard][ptr] = rv;
 	return rv;
 };
 
@@ -199,20 +207,21 @@ void AttributeSet::removePairWithKey(const AttributePairStore& pairStore, uint32
 }
 
 void AttributeStore::addAttribute(AttributeSet& attributeSet, std::string const &key, const std::string& v, char minzoom) {
-	AttributePair kv(keyStore.key2index(key),v,minzoom);
-	bool isHot = AttributePair::isHot(kv, key);
+	PooledString ps(&v);
+	AttributePair kv(keyStore.key2index(key), ps, minzoom);
+	bool isHot = AttributePair::isHot(key, v);
 	attributeSet.removePairWithKey(pairStore, kv.keyIndex);
 	attributeSet.addPair(pairStore.addPair(kv, isHot));
 }
 void AttributeStore::addAttribute(AttributeSet& attributeSet, std::string const &key, bool v, char minzoom) {
 	AttributePair kv(keyStore.key2index(key),v,minzoom);
-	bool isHot = AttributePair::isHot(kv, key);
+	bool isHot = true; // All bools are eligible to be hot pairs
 	attributeSet.removePairWithKey(pairStore, kv.keyIndex);
 	attributeSet.addPair(pairStore.addPair(kv, isHot));
 }
 void AttributeStore::addAttribute(AttributeSet& attributeSet, std::string const &key, float v, char minzoom) {
 	AttributePair kv(keyStore.key2index(key),v,minzoom);
-	bool isHot = AttributePair::isHot(kv, key);
+	bool isHot = v >= 0 && v <= 25 && ceil(v) == v; // Whole numbers in 0..25 are eligible to be hot pairs
 	attributeSet.removePairWithKey(pairStore, kv.keyIndex);
 	attributeSet.addPair(pairStore.addPair(kv, isHot));
 }
@@ -268,19 +277,11 @@ AttributeIndex AttributeStore::add(AttributeSet &attributes) {
 	std::lock_guard<std::mutex> lock(setsMutex[shard]);
 	lookups++;
 
-	// Do we already have it?
-	const auto& existing = setsMaps[shard].find(&attributes);
-	if (existing != setsMaps[shard].end()) return existing->second;
-
-	// No, so add and return the index
-	uint32_t offset = sets[shard].size();
+	const uint32_t offset = sets[shard].add(attributes);
 	if (offset >= (1 << (32 - SHARD_BITS)))
 		throw std::out_of_range("set shard overflow");
-	sets[shard].push_back(attributes);
 
-	const AttributeSet* ptr = &sets[shard][offset];
 	uint32_t rv = (shard << (32 - SHARD_BITS)) + offset;
-	setsMaps[shard][ptr] = rv;
 	return rv;
 }
 
@@ -307,16 +308,21 @@ std::vector<const AttributePair*> AttributeStore::getUnsafe(AttributeIndex index
 	}
 }
 
-void AttributeStore::reportSize() const {
+size_t AttributeStore::size() const {
 	size_t numAttributeSets = 0;
 	for (int i = 0; i < ATTRIBUTE_SHARDS; i++)
 		numAttributeSets += sets[i].size();
-	std::cout << "Attributes: " << numAttributeSets << " sets from " << lookups.load() << " objects" << std::endl;
+
+	return numAttributeSets;
+}
+
+void AttributeStore::reportSize() const {
+	std::cout << "Attributes: " << size() << " sets from " << lookups.load() << " objects" << std::endl;
 
 	// Print detailed histogram of frequencies of attributes.
 	if (false) {
 		for (int i = 0; i < ATTRIBUTE_SHARDS; i++) {
-			std::cout << "pairsMaps[" << i << "] has " << pairStore.pairsMaps[i].size() << " entries" << std::endl;
+			std::cout << "pairs[" << i << "] has " << pairStore.pairs[i].size() << " entries" << std::endl;
 		}
 
 		std::map<uint32_t, uint32_t> tagCountDist;
@@ -368,6 +374,14 @@ void AttributeStore::reportSize() const {
 	}
 }
 
+// Test-only helper: empties tlsKeys2Index and tlsHotShard and zeroes the size counter.
+void AttributeStore::reset() {
+	tlsHotShard.clear();
+
+	tlsKeys2IndexSize = 0;
+	tlsKeys2Index.clear();
+}
+
 void AttributeStore::finalize() {
 	finalized = true;
 	keyStore.finalize();
diff --git a/src/helpers.cpp b/src/helpers.cpp
index 444ddcf0..4af04612 100644
--- a/src/helpers.cpp
+++ b/src/helpers.cpp
@@ -4,6 +4,8 @@
 #include <iomanip>
 #include <sstream>
 #include <cstring>
+#include <boost/lexical_cast.hpp>
+#include <boost/algorithm/string.hpp>
 
 #include "helpers.h"
 
@@ -11,7 +13,6 @@
 #define MOD_GZIP_ZLIB_CFACTOR 9
 #define MOD_GZIP_ZLIB_BSIZE 8096
 
-namespace geom = boost::geometry;
 using namespace std;
 
 // Bounding box string parsing
diff --git a/src/mmap_allocator.cpp b/src/mmap_allocator.cpp
index dc71f687..2b5e26fd 100644
--- a/src/mmap_allocator.cpp
+++ b/src/mmap_allocator.cpp
@@ -79,10 +79,10 @@ thread_local mmap_shm_ptr mmap_shm_thread_region_ptr;
 std::mutex mmap_allocator_mutex;
 
 mmap_file::mmap_file(std::string const &filename, std::size_t offset)
-	: mapping(filename.c_str(), boost::interprocess::read_write)
+	: filename(filename)
+	, mapping(filename.c_str(), boost::interprocess::read_write)
 	, region(mapping, boost::interprocess::read_write)
 	, buffer(boost::interprocess::create_only, reinterpret_cast<uint8_t *>(region.get_address()) + offset, region.get_size() - offset)
-	, filename(filename)
 { }
 
 mmap_file::~mmap_file()
diff --git a/src/node_stores.cpp b/src/node_stores.cpp
index 8c84b811..06e2fc5e 100644
--- a/src/node_stores.cpp
+++ b/src/node_stores.cpp
@@ -14,6 +14,17 @@ void BinarySearchNodeStore::reopen()
 	}
 }
 
+// Reports whether node `i` is stored; `shard` is ignored (the ID's own shardPart() is used).
+bool BinarySearchNodeStore::contains(size_t shard, NodeID i) const {
+	auto entries = mLatpLons[shardPart(i)];
+	const auto wantedId = idPart(i);
+
+	// Binary search for the first entry whose ID is not below the wanted one.
+	const auto byId = [](auto const &entry, auto needle) { return entry.first < needle; };
+	auto pos = std::lower_bound(entries->begin(), entries->end(), wantedId, byId);
+	return pos != entries->end() && pos->first == wantedId;
+}
+
 LatpLon BinarySearchNodeStore::at(NodeID i) const {
 	auto shard = mLatpLons[shardPart(i)];
 	auto id = idPart(i);
diff --git a/src/options_parser.cpp b/src/options_parser.cpp
new file mode 100644
index 00000000..529e5f4a
--- /dev/null
+++ b/src/options_parser.cpp
@@ -0,0 +1,114 @@
+#include "options_parser.h"
+
+#include <thread>
+#include <boost/filesystem.hpp>
+#include <boost/program_options.hpp>
+#include <iostream>
+#include "helpers.h"
+
+#ifndef TM_VERSION
+#define TM_VERSION (version not set)
+#endif
+#define STR1(x)  #x
+#define STR(x)  STR1(x)
+
+using namespace std;
+namespace po = boost::program_options;
+
+po::options_description getParser(OptionsParser::Options& options) { // options that take values store them directly into fields of `options`
+	po::options_description desc("tilemaker " STR(TM_VERSION) "\nConvert OpenStreetMap .pbf files into vector tiles\n\nAvailable options");
+	desc.add_options()
+		("help",                                                                 "show help message")
+		("input",  po::value< vector<string> >(&options.inputFiles),                     "source .osm.pbf file")
+		("output", po::value< string >(&options.outputFile),                             "target directory or .mbtiles/.pmtiles file")
+		("bbox",   po::value< string >(&options.bbox),                                   "bounding box to use if input file does not have a bbox header set, example: minlon,minlat,maxlon,maxlat")
+		("merge"  ,po::bool_switch(&options.mergeSqlite),                                "merge with existing .mbtiles (overwrites otherwise)")
+		("config", po::value< string >(&options.jsonFile)->default_value("config.json"), "config JSON file")
+		("process",po::value< string >(&options.luaFile)->default_value("process.lua"),  "tag-processing Lua file")
+		("verbose",po::bool_switch(&options.verbose),                                   "verbose error output")
+		("skip-integrity",po::bool_switch(&options.osm.skipIntegrity),                       "don't enforce way/node integrity")
+		("log-tile-timings", po::bool_switch(&options.logTileTimings), "log how long each tile takes");
+	po::options_description performance("Performance options");
+	performance.add_options()
+		("store",  po::value< string >(&options.osm.storeFile),  "temporary storage for node/ways/relations data")
+		("fast",   po::bool_switch(&options.osm.fast), "prefer speed at the expense of memory")
+		("compact",po::bool_switch(&options.osm.compact),  "use faster data structure for node lookups\nNOTE: This requires the input to be renumbered (osmium renumber)")
+		("no-compress-nodes", po::bool_switch(&options.osm.uncompressedNodes),  "store nodes uncompressed")
+		("no-compress-ways", po::bool_switch(&options.osm.uncompressedWays),  "store ways uncompressed")
+		("lazy-geometries", po::bool_switch(&options.osm.lazyGeometries),  "generate geometries from the OSM stores; uses less memory")
+		("materialize-geometries", po::bool_switch(&options.osm.materializeGeometries),  "materialize geometries; uses more memory")
+		("shard-stores", po::bool_switch(&options.osm.shardStores),  "use an alternate reading/writing strategy for low-memory machines")
+		("threads",po::value<uint32_t>(&options.threadNum)->default_value(0),              "number of threads (automatically detected if 0)")
+			;
+
+	desc.add(performance); // the performance group is nested under the main description
+	return desc;
+}
+
+// Prints the full option summary to stdout.
+void OptionsParser::showHelp() {
+	Options scratch; // getParser needs an Options object to bind the option storage to
+	std::cout << getParser(scratch) << std::endl;
+}
+
+// Parses the command line into an Options value, fills in derived settings and
+// checks that the config and Lua files exist. Throws OptionException on error.
+OptionsParser::Options OptionsParser::parse(const int argc, const char* argv[]) {
+	Options options;
+	po::options_description desc = getParser(options);
+
+	// The first two bare arguments are taken as the input and the output.
+	po::positional_options_description positional;
+	positional.add("input", 1);
+	positional.add("output", 1);
+
+	po::variables_map vm;
+	try {
+		auto parsed = po::command_line_parser(argc, argv).options(desc).positional(positional).run();
+		po::store(parsed, vm);
+	} catch (const po::unknown_option& ex) {
+		throw OptionException{"Unknown option: " + ex.get_option_name()};
+	}
+	po::notify(vm);
+
+	// Without a store file, geometries are materialized; with one, stores are
+	// sharded unless --fast was requested.
+	if (options.osm.storeFile.empty())
+		options.osm.materializeGeometries = true;
+	else if (!options.osm.fast)
+		options.osm.shardStores = true;
+
+	// --lazy-geometries overrides the materialized default of the non-store case.
+	if (options.osm.lazyGeometries)
+		options.osm.materializeGeometries = false;
+
+	// --help short-circuits all further validation.
+	const bool helpRequested = vm.count("help") != 0;
+	if (helpRequested) {
+		options.showHelp = true;
+		return options;
+	}
+	if (vm.count("output") == 0)
+		throw OptionException{ "You must specify an output file or directory. Run with --help to find out more." };
+	if (vm.count("input") == 0)
+		throw OptionException{ "No source .osm.pbf file supplied" };
+
+	// Pick the output mode from the file extension; otherwise outputMode is left untouched.
+	const std::string& target = options.outputFile;
+	if (ends_with(target, ".mbtiles") || ends_with(target, ".sqlite"))
+		options.outputMode = OutputMode::MBTiles;
+	else if (ends_with(target, ".pmtiles"))
+		options.outputMode = OutputMode::PMTiles;
+
+	// A thread count of 0 means "detect", with a floor of one thread.
+	if (options.threadNum == 0)
+		options.threadNum = max(thread::hardware_concurrency(), 1u);
+
+	// ---- Check config
+	if (!boost::filesystem::exists(options.jsonFile))
+		throw OptionException{ "Couldn't open .json config: " + options.jsonFile };
+	if (!boost::filesystem::exists(options.luaFile))
+		throw OptionException{"Couldn't open .lua script: " + options.luaFile };
+
+	return options;
+}
diff --git a/src/osm_lua_processing.cpp b/src/osm_lua_processing.cpp
index a1bc2536..faf69ec7 100644
--- a/src/osm_lua_processing.cpp
+++ b/src/osm_lua_processing.cpp
@@ -350,7 +350,9 @@ void OsmLuaProcessing::Layer(const string &layerName, bool area) {
 
 			if(CorrectGeometry(p) == CorrectGeometryResult::Invalid) return;
 
-			NodeID id = osmMemTiles.storePoint(p);
+			NodeID id = USE_NODE_STORE | originalOsmID;
+			if (materializeGeometries)
+				id = osmMemTiles.storePoint(p);
 			OutputObject oo(geomType, layers.layerMap[layerName], id, 0, layerMinZoom);
 			outputs.push_back(std::make_pair(std::move(oo), attributes));
 			return;
@@ -466,7 +468,21 @@ void OsmLuaProcessing::LayerAsCentroid(const string &layerName) {
 		return;
 	}
 
-	NodeID id = osmMemTiles.storePoint(geomp);
+	NodeID id = 0;
+	// We don't do lazy centroids for relations - calculating their centroid
+	// can be quite expensive, and there's not as many of them as there are
+	// ways.
+	if (materializeGeometries || isRelation) {
+		id = osmMemTiles.storePoint(geomp);
+	} else if (!isRelation && !isWay) {
+		// Sometimes people call LayerAsCentroid(...) on a node, because they're
+		// writing a generic handler that doesn't know if it's a node or a way,
+		// e.g. POIs.
+		id = USE_NODE_STORE | originalOsmID;
+	} else {
+		id = USE_WAY_STORE | originalOsmID;
+		wayEmitted = true;
+	}
 	OutputObject oo(POINT_, layers.layerMap[layerName], id, 0, layerMinZoom);
 	outputs.push_back(std::make_pair(std::move(oo), attributes));
 }
diff --git a/src/osm_mem_tiles.cpp b/src/osm_mem_tiles.cpp
index f5527d0e..7dc03f45 100644
--- a/src/osm_mem_tiles.cpp
+++ b/src/osm_mem_tiles.cpp
@@ -18,6 +18,30 @@ OsmMemTiles::OsmMemTiles(
 {
 }
 
+// Resolves a point: base-class lookup below OSM_THRESHOLD, node store for nodes, centroid for ways.
+LatpLon OsmMemTiles::buildNodeGeometry(
+	NodeID const objectID,
+	const TileBbox &bbox
+) const {
+	if (objectID < OSM_THRESHOLD)
+		return TileDataSource::buildNodeGeometry(objectID, bbox);
+	if (IS_NODE(objectID))
+		return nodeStore.at(OSM_ID(objectID));
+
+	const bool isWay = IS_WAY(objectID);
+	if (!isWay)
+		throw std::runtime_error("OsmMemTiles::buildNodeGeometry: unsupported objectID");
+
+	// Treat the way's points as a polygon ring and take its centroid.
+	Polygon ring;
+	geom::assign_points(ring, getOrBuildLinestring(objectID));
+	Point center;
+	geom::centroid(ring, center);
+	const int32_t latp = static_cast<int32_t>(center.y()*10000000.0);
+	const int32_t lon = static_cast<int32_t>(center.x()*10000000.0);
+	return LatpLon{latp, lon};
+}
+
 Geometry OsmMemTiles::buildWayGeometry(
 	const OutputGeometryType geomType, 
 	const NodeID objectID,
@@ -58,7 +82,7 @@ Geometry OsmMemTiles::buildWayGeometry(
 	throw std::runtime_error("buildWayGeometry: unexpected objectID: " + std::to_string(objectID));
 }
 
-void OsmMemTiles::populateLinestring(Linestring& ls, NodeID objectID) {
+void OsmMemTiles::populateLinestring(Linestring& ls, NodeID objectID) const {
 	std::vector<LatpLon> nodes = wayStore.at(OSM_ID(objectID));
 
 	for (const LatpLon& node : nodes) {
@@ -66,7 +90,7 @@ void OsmMemTiles::populateLinestring(Linestring& ls, NodeID objectID) {
 	}
 }
 
-Linestring& OsmMemTiles::getOrBuildLinestring(NodeID objectID) {
+Linestring& OsmMemTiles::getOrBuildLinestring(NodeID objectID) const {
 	// Note: this function returns a reference, not a shared_ptr.
 	//
 	// This is safe, because this function is the only thing that can
diff --git a/src/output_object.cpp b/src/output_object.cpp
index b68fb27f..7f9f0edb 100644
--- a/src/output_object.cpp
+++ b/src/output_object.cpp
@@ -87,9 +87,12 @@ void OutputObject::writeAttributes(
 int OutputObject::findValue(const vector<vector_tile::Tile_Value>* valueList, const AttributePair& value) const {
 	for (size_t i=0; i<valueList->size(); i++) {
 		const vector_tile::Tile_Value& v = valueList->at(i);
-		if (v.has_string_value() && value.hasStringValue() && v.string_value()==value.stringValue()) { return i; }
-		if (v.has_float_value()  && value.hasFloatValue()  && v.float_value() ==value.floatValue() ) { return i; }
-		if (v.has_bool_value()	 && value.hasBoolValue()   && v.bool_value()  ==value.boolValue()	) { return i; }
+		if (v.has_string_value() && value.hasStringValue()) {
+			const size_t valueSize = value.pooledString().size();
+			if (valueSize == v.string_value().size() && memcmp(v.string_value().data(), value.pooledString().data(), valueSize) == 0)
+				return i;
+		} else if (v.has_float_value()  && value.hasFloatValue()  && v.float_value() ==value.floatValue() ) { return i; }
+		else if (v.has_bool_value()	 && value.hasBoolValue()   && v.bool_value()  ==value.boolValue()	) { return i; }
 	}
 	return -1;
 }
diff --git a/src/pooled_string.cpp b/src/pooled_string.cpp
new file mode 100644
index 00000000..500408d4
--- /dev/null
+++ b/src/pooled_string.cpp
@@ -0,0 +1,170 @@
+#include "pooled_string.h"
+#include <mutex>
+#include <cstring>
+
+namespace PooledStringNS {
+	std::vector<char*> tables; // heap tables; a heap PooledString records an index into this vector
+	std::mutex mutex; // held while a new table is pushed onto `tables`
+
+	const uint8_t ShortString = 0b00; // kind tags, read back as storage[0] >> 6
+	const uint8_t HeapString = 0b10;
+	const uint8_t StdString = 0b11;
+
+	// Each thread has its own string table, we only take a lock
+	// to push a new table onto the vector.
+	thread_local int64_t tableIndex = -1;
+	thread_local int64_t spaceLeft = -1;
+}
+
+// Copies `str` in: up to 15 bytes inline, longer strings into this thread's 64K heap table.
+PooledString::PooledString(const std::string& str) {
+	// Cached table pointer, so the copy below never indexes the shared `tables` vector unlocked.
+	thread_local char* currentTable = nullptr;
+	const size_t len = str.size();
+	if (len >= 65536)
+		throw std::runtime_error("cannot store string longer than 64K");
+
+	if (len <= 15) {
+		storage[0] = len;
+		memcpy(storage + 1, str.data(), len);
+		memset(storage + 1 + len, 0, 16 - 1 - len);
+		return;
+	}
+
+	memset(storage + 8, 0, 8);
+	storage[0] = HeapString << 6;
+	if (currentTable == nullptr || spaceLeft < 0 || (size_t)spaceLeft < len) {
+		// Allocate before touching the bookkeeping, so a failed malloc leaves it intact.
+		char* buffer = (char*)malloc(65536);
+		if (buffer == nullptr)
+			throw std::runtime_error("PooledString could not malloc");
+		std::lock_guard<std::mutex> lock(mutex);
+		tables.push_back(buffer);
+		tableIndex = tables.size() - 1;
+		currentTable = buffer;
+		spaceLeft = 65536;
+	}
+	const uint16_t offset = 65536 - spaceLeft;
+	storage[1] = tableIndex >> 16;
+	storage[2] = tableIndex >> 8;
+	storage[3] = tableIndex;
+	storage[4] = offset >> 8;
+	storage[5] = offset;
+	storage[6] = len >> 8;
+	storage[7] = len;
+	memcpy(currentTable + offset, str.data(), len);
+	spaceLeft -= len;
+}
+
+PooledString::PooledString(const std::string* str) {
+	// Borrowed std::string: tag the kind, then keep the pointer from byte 8 onwards.
+	storage[0] = StdString << 6;
+	memcpy(storage + 8, &str, sizeof(str)); // memcpy instead of a type-punned store
+}
+
+bool PooledStringNS::PooledString::operator==(const PooledString& other) const {
+	// Equality here is deliberately not uniform:
+	//
+	// - If either side is a StdString, the character contents are compared.
+	// - Otherwise the raw 16-byte representations are compared. For short
+	//   strings that is the same as comparing contents.
+	// - For heap strings it means "same table, offset and length", so two
+	//   heap strings with equal text but separate storage are NOT equal.
+	//   Deduplicating them is left to whoever creates the strings (the
+	//   original note names AttributePairStore).
+	const bool involvesStdString =
+		(storage[0] >> 6) == StdString || (other.storage[0] >> 6) == StdString;
+
+	if (!involvesStdString)
+		return memcmp(storage, other.storage, 16) == 0;
+
+	// Value comparison: lengths first, then bytes.
+	const size_t lhsSize = size();
+	const size_t rhsSize = other.size();
+	if (lhsSize != rhsSize)
+		return false;
+	return memcmp(data(), other.data(), lhsSize) == 0;
+}
+
+bool PooledStringNS::PooledString::operator!=(const PooledString& other) const {
+	return !operator==(other); // defined purely as the negation of operator==
+}
+
+// Pointer to the string's bytes; pair with size(), since no NUL terminator is guaranteed.
+const char* PooledStringNS::PooledString::data() const {
+	switch (storage[0] >> 6) {
+		case ShortString:
+			return (char *)(storage + 1);
+		case StdString: {
+			const std::string* borrowed = *(const std::string**)((void*)(storage + 8));
+			return borrowed->data();
+		}
+		default:
+			break;
+	}
+	// Heap string: bytes 1..3 select the table, bytes 4..5 the offset in it.
+	const uint32_t table = (storage[1] << 16) + (storage[2] << 8) + storage[3];
+	const uint16_t start = (storage[4] << 8) + storage[5];
+	return tables[table] + start;
+}
+
+// Length in bytes, decoded according to the kind tag in the top bits of storage[0].
+size_t PooledStringNS::PooledString::size() const {
+	const uint8_t kind = storage[0] >> 6;
+
+	if (kind == ShortString) {
+		// Inline string: the length sits in the low 7 bits of the first byte.
+		return storage[0] & 0b01111111;
+	}
+	if (kind == HeapString) {
+		// Heap string: 16-bit big-endian length in bytes 6..7.
+		return static_cast<uint16_t>((storage[6] << 8) + storage[7]);
+	}
+
+	return (*(const std::string**)((void*)(storage + 8)))->size();
+}
+
+// Materializes the contents as an owning std::string, whatever the storage kind.
+std::string PooledStringNS::PooledString::toString() const {
+	const uint8_t kind = storage[0] >> 6;
+
+	if (kind == ShortString) {
+		// Inline: the characters follow the length byte directly.
+		const char* inlineChars = (const char*)(storage + 1);
+		return std::string(inlineChars, inlineChars + storage[0]);
+	}
+
+	if (kind == HeapString) {
+		// Heap: locate the table (bytes 1..3) and the offset (bytes 4..5).
+		const uint32_t table = (storage[1] << 16) + (storage[2] << 8) + storage[3];
+		const uint16_t start = (storage[4] << 8) + storage[5];
+
+		std::string copy;
+		copy.reserve(size());
+		copy.append(tables[table] + start, size());
+		return copy;
+	}
+
+	// StdString: copy the borrowed std::string.
+	return *(*(const std::string**)((void*)(storage + 8)));
+}
+
+// Swaps a borrowed std::string pointer for a pooled copy; other kinds are left alone.
+void PooledStringNS::PooledString::ensureStringIsOwned() {
+	const bool borrowsStdString = (storage[0] >> 6) == StdString;
+	if (borrowsStdString) {
+		const std::string owned = toString();
+		*this = PooledString(owned);
+	}
+}
+
+// Orders by length first, then bytewise among strings of equal length.
+bool PooledStringNS::PooledString::operator<(const PooledString& other) const {
+	const size_t lhsSize = size();
+	const size_t rhsSize = other.size();
+
+	return lhsSize == rhsSize
+		? memcmp(data(), other.data(), lhsSize) < 0
+		: lhsSize < rhsSize;
+}
+
diff --git a/src/read_pbf.cpp b/src/read_pbf.cpp
index 605618fa..9b8b2f15 100644
--- a/src/read_pbf.cpp
+++ b/src/read_pbf.cpp
@@ -73,7 +73,14 @@ bool PbfReader::ReadNodes(OsmLuaProcessing &output, PrimitiveGroup &pg, Primitiv
 	return false;
 }
 
-bool PbfReader::ReadWays(OsmLuaProcessing &output, PrimitiveGroup &pg, PrimitiveBlock const &pb, bool locationsOnWays) {
+bool PbfReader::ReadWays(
+	OsmLuaProcessing &output,
+	PrimitiveGroup &pg,
+	PrimitiveBlock const &pb,
+	bool locationsOnWays,
+	uint shard,
+	uint effectiveShards
+) {
 	// ----	Read ways
 
 	if (pg.ways_size() > 0) {
@@ -83,15 +90,18 @@ bool PbfReader::ReadWays(OsmLuaProcessing &output, PrimitiveGroup &pg, Primitive
 
 		std::vector<WayStore::ll_element_t> llWays;
 		std::vector<std::pair<WayID, std::vector<NodeID>>> nodeWays;
+		LatpLonVec llVec;
+		std::vector<NodeID> nodeVec;
 
 		for (int j=0; j<pg.ways_size(); j++) {
+			llVec.clear();
+			nodeVec.clear();
+
 			pbfWay = pg.ways(j);
 			WayID wayId = static_cast<WayID>(pbfWay.id());
 			if (wayId >= pow(2,42)) throw std::runtime_error("Way ID negative or too large: "+std::to_string(wayId));
 
 			// Assemble nodelist
-			LatpLonVec llVec;
-			std::vector<NodeID> nodeVec;
 			if (locationsOnWays) {
 				int lat=0, lon=0;
 				llVec.reserve(pbfWay.lats_size());
@@ -105,8 +115,17 @@ bool PbfReader::ReadWays(OsmLuaProcessing &output, PrimitiveGroup &pg, Primitive
 				int64_t nodeId = 0;
 				llVec.reserve(pbfWay.refs_size());
 				nodeVec.reserve(pbfWay.refs_size());
+
+				bool skipToNext = false;
+
 				for (int k=0; k<pbfWay.refs_size(); k++) {
 					nodeId += pbfWay.refs(k);
+
+					if (k == 0 && effectiveShards > 1 && !osmStore.nodes.contains(shard, nodeId)) {
+						skipToNext = true;
+						break;
+					}
+
 					try {
 						llVec.push_back(osmStore.nodes.at(static_cast<NodeID>(nodeId)));
 						nodeVec.push_back(nodeId);
@@ -114,6 +133,9 @@ bool PbfReader::ReadWays(OsmLuaProcessing &output, PrimitiveGroup &pg, Primitive
 						if (osmStore.integrity_enforced()) throw err;
 					}
 				}
+
+				if (skipToNext)
+					continue;
 			}
 			if (llVec.empty()) continue;
 
@@ -138,9 +160,9 @@ bool PbfReader::ReadWays(OsmLuaProcessing &output, PrimitiveGroup &pg, Primitive
 		}
 
 		if (wayStoreRequiresNodes) {
-			osmStore.ways.insertNodes(nodeWays);
+			osmStore.ways.shard(shard).insertNodes(nodeWays);
 		} else {
-			osmStore.ways.insertLatpLons(llWays);
+			osmStore.ways.shard(shard).insertLatpLons(llWays);
 		}
 
 		return true;
@@ -184,7 +206,9 @@ bool PbfReader::ReadRelations(
 	OsmLuaProcessing& output,
 	PrimitiveGroup& pg,
 	const PrimitiveBlock& pb,
-	const BlockMetadata& blockMetadata
+	const BlockMetadata& blockMetadata,
+	uint shard,
+	uint effectiveShards
 ) {
 	// ----	Read relations
 
@@ -210,15 +234,27 @@ bool PbfReader::ReadRelations(
 				WayVec outerWayVec, innerWayVec;
 				int64_t lastID = 0;
 				bool isInnerOuter = isBoundary || isMultiPolygon;
+				bool skipToNext = false;
+				bool firstWay = true;
 				for (int n=0; n < pbfRelation.memids_size(); n++) {
 					lastID += pbfRelation.memids(n);
 					if (pbfRelation.types(n) != Relation_MemberType_WAY) { continue; }
 					int32_t role = pbfRelation.roles_sid(n);
 					if (role==innerKey || role==outerKey) isInnerOuter=true;
 					WayID wayId = static_cast<WayID>(lastID);
+
+					if (firstWay && effectiveShards > 1 && !osmStore.ways.contains(shard, wayId)) {
+						skipToNext = true;
+						break;
+					}
+					if (firstWay)
+						firstWay = false;
 					(role == innerKey ? innerWayVec : outerWayVec).push_back(wayId);
 				}
 
+				if (skipToNext)
+					continue;
+
 				try {
 					tag_map_t tags;
 					readTags(pbfRelation, pb, tags);
@@ -244,7 +280,9 @@ bool PbfReader::ReadBlock(
 	const BlockMetadata& blockMetadata,
 	const unordered_set<string>& nodeKeys,
 	bool locationsOnWays,
-	ReadPhase phase
+	ReadPhase phase,
+	uint shard,
+	uint effectiveShards
 ) 
 {
 	infile.seekg(blockMetadata.offset);
@@ -272,8 +310,12 @@ bool PbfReader::ReadBlock(
 		{
 			if (ioMutex.try_lock()) {
 				std::ostringstream str;
+				str << "\r";
 				void_mmap_allocator::reportStoreSize(str);
-				str << "Block " << blocksProcessed.load() << "/" << blocksToProcess.load() << " ways " << pg.ways_size() << " relations " << pg.relations_size() << "                  \r";
+				if (effectiveShards > 1)
+					str << std::to_string(shard + 1) << "/" << std::to_string(effectiveShards) << " ";
+
+				str << "Block " << blocksProcessed.load() << "/" << blocksToProcess.load() << " ways " << pg.ways_size() << " relations " << pg.relations_size() << "                  ";
 				std::cout << str.str();
 				std::cout.flush();
 				ioMutex.unlock();
@@ -293,14 +335,17 @@ bool PbfReader::ReadBlock(
 			osmStore.ensureUsedWaysInited();
 			bool done = ScanRelations(output, pg, pb);
 			if(done) { 
-				std::cout << "(Scanning for ways used in relations: " << (100*blocksProcessed.load()/blocksToProcess.load()) << "%)\r";
-				std::cout.flush();
+				if (ioMutex.try_lock()) {
+					std::cout << "\r(Scanning for ways used in relations: " << (100*blocksProcessed.load()/blocksToProcess.load()) << "%)           ";
+					std::cout.flush();
+					ioMutex.unlock();
+				}
 				continue;
 			}
 		}
 	
 		if(phase == ReadPhase::Ways) {
-			bool done = ReadWays(output, pg, pb, locationsOnWays);
+			bool done = ReadWays(output, pg, pb, locationsOnWays, shard, effectiveShards);
 			if(done) { 
 				output_progress();
 				++read_groups;
@@ -309,7 +354,7 @@ bool PbfReader::ReadBlock(
 		}
 
 		if(phase == ReadPhase::Relations) {
-			bool done = ReadRelations(output, pg, pb, blockMetadata);
+			bool done = ReadRelations(output, pg, pb, blockMetadata, shard, effectiveShards);
 			if(done) { 
 				output_progress();
 				++read_groups;
@@ -332,7 +377,7 @@ bool PbfReader::ReadBlock(
 
 	// We can only delete blocks if we're confident we've processed everything,
 	// which is not possible in the case of subdivided blocks.
-	return blockMetadata.chunks == 1;
+	return (shard + 1 == effectiveShards) && blockMetadata.chunks == 1;
 }
 
 bool blockHasPrimitiveGroupSatisfying(
@@ -362,11 +407,14 @@ bool blockHasPrimitiveGroupSatisfying(
 }
 
 int PbfReader::ReadPbfFile(
+	uint shards,
 	bool hasSortTypeThenID,
 	unordered_set<string> const& nodeKeys,
 	unsigned int threadNum,
 	const pbfreader_generate_stream& generate_stream,
-	const pbfreader_generate_output& generate_output
+	const pbfreader_generate_output& generate_output,
+	const NodeStore& nodeStore,
+	const WayStore& wayStore
 )
 {
 	auto infile = generate_stream();
@@ -459,84 +507,113 @@ int PbfReader::ReadPbfFile(
 
 	std::vector<ReadPhase> all_phases = { ReadPhase::Nodes, ReadPhase::RelationScan, ReadPhase::Ways, ReadPhase::Relations };
 	for(auto phase: all_phases) {
-		// Launch the pool with threadNum threads
-		boost::asio::thread_pool pool(threadNum);
-		std::mutex block_mutex;
-
-		// If we're in ReadPhase::Relations and there aren't many blocks left
-		// to read, increase parallelism by letting each thread only process
-		// a portion of the block.
-		if (phase == ReadPhase::Relations && blocks.size() < threadNum * 2) {
-			std::cout << "only " << blocks.size() << " relation blocks; subdividing for better parallelism" << std::endl;
-			std::map<std::size_t, BlockMetadata> moreBlocks;
-			for (const auto& block : blocks) {
-				BlockMetadata newBlock = block.second;
-				newBlock.chunks = threadNum;
-				for (size_t i = 0; i < threadNum; i++) {
-					newBlock.chunk = i;
-					moreBlocks[moreBlocks.size()] = newBlock;
+		uint effectiveShards = 1;
+
+		// On memory-constrained machines, we might read ways/relations
+		// multiple times in order to keep the working set of nodes limited.
+		if (phase == ReadPhase::Ways || phase == ReadPhase::Relations)
+			effectiveShards = shards;
+
+		for (int shard = 0; shard < effectiveShards; shard++) {
+			// If we're in ReadPhase::Ways, only do a pass if there is at least one
+			// entry in the pass's shard.
+			if (phase == ReadPhase::Ways && nodeStore.shard(shard).size() == 0)
+				continue;
+
+			// Ditto, but for relations
+			if (phase == ReadPhase::Relations && wayStore.shard(shard).size() == 0)
+				continue;
+
+#ifdef CLOCK_MONOTONIC
+			timespec start, end;
+			clock_gettime(CLOCK_MONOTONIC, &start);
+#endif
+
+			// Launch the pool with threadNum threads
+			boost::asio::thread_pool pool(threadNum);
+			std::mutex block_mutex;
+
+			// If we're in ReadPhase::Relations and there aren't many blocks left
+			// to read, increase parallelism by letting each thread only process
+			// a portion of the block.
+			if (phase == ReadPhase::Relations && blocks.size() < threadNum * 2) {
+				std::cout << "only " << blocks.size() << " relation blocks; subdividing for better parallelism" << std::endl;
+				std::map<std::size_t, BlockMetadata> moreBlocks;
+				for (const auto& block : blocks) {
+					BlockMetadata newBlock = block.second;
+					newBlock.chunks = threadNum;
+					for (size_t i = 0; i < threadNum; i++) {
+						newBlock.chunk = i;
+						moreBlocks[moreBlocks.size()] = newBlock;
+					}
 				}
+				blocks = moreBlocks;
 			}
-			blocks = moreBlocks;
-		}
 
-		std::deque<std::vector<IndexedBlockMetadata>> blockRanges;
-		std::map<std::size_t, BlockMetadata> filteredBlocks;
-		for (const auto& entry : blocks) {
-			if ((phase == ReadPhase::Nodes && entry.second.hasNodes) ||
-					(phase == ReadPhase::RelationScan && entry.second.hasRelations) ||
-					(phase == ReadPhase::Ways && entry.second.hasWays) ||
-					(phase == ReadPhase::Relations && entry.second.hasRelations))
-				filteredBlocks[entry.first] = entry.second;
-		}
+			std::deque<std::vector<IndexedBlockMetadata>> blockRanges;
+			std::map<std::size_t, BlockMetadata> filteredBlocks;
+			for (const auto& entry : blocks) {
+				if ((phase == ReadPhase::Nodes && entry.second.hasNodes) ||
+						(phase == ReadPhase::RelationScan && entry.second.hasRelations) ||
+						(phase == ReadPhase::Ways && entry.second.hasWays) ||
+						(phase == ReadPhase::Relations && entry.second.hasRelations))
+					filteredBlocks[entry.first] = entry.second;
+			}
 
-		blocksToProcess = filteredBlocks.size();
-		blocksProcessed = 0;
-
-		// When processing blocks, we try to give each worker large batches
-		// of contiguous blocks, so that they might benefit from long runs
-		// of sorted indexes, and locality of nearby IDs.
-		const size_t batchSize = (filteredBlocks.size() / (threadNum * 8)) + 1;
-
-		size_t consumed = 0;
-		auto it = filteredBlocks.begin();
-		while(it != filteredBlocks.end()) {
-			std::vector<IndexedBlockMetadata> blockRange;
-			blockRange.reserve(batchSize);
-			size_t max = consumed + batchSize;
-			for (; consumed < max && it != filteredBlocks.end(); consumed++) {
-				IndexedBlockMetadata ibm;
-				memcpy(&ibm, &it->second, sizeof(BlockMetadata));
-				ibm.index = it->first;
-				blockRange.push_back(ibm);
-				it++;
+			blocksToProcess = filteredBlocks.size();
+			blocksProcessed = 0;
+
+			// When processing blocks, we try to give each worker large batches
+			// of contiguous blocks, so that they might benefit from long runs
+			// of sorted indexes, and locality of nearby IDs.
+			const size_t batchSize = (filteredBlocks.size() / (threadNum * 8)) + 1;
+
+			size_t consumed = 0;
+			auto it = filteredBlocks.begin();
+			while(it != filteredBlocks.end()) {
+				std::vector<IndexedBlockMetadata> blockRange;
+				blockRange.reserve(batchSize);
+				size_t max = consumed + batchSize;
+				for (; consumed < max && it != filteredBlocks.end(); consumed++) {
+					IndexedBlockMetadata ibm;
+					memcpy(&ibm, &it->second, sizeof(BlockMetadata));
+					ibm.index = it->first;
+					blockRange.push_back(ibm);
+					it++;
+				}
+				blockRanges.push_back(blockRange);
 			}
-			blockRanges.push_back(blockRange);
-		}
 
-		{
-			for(const std::vector<IndexedBlockMetadata>& blockRange: blockRanges) {
-				boost::asio::post(pool, [=, &blockRange, &blocks, &block_mutex, &nodeKeys]() {
-					if (phase == ReadPhase::Nodes)
-						osmStore.nodes.batchStart();
-					if (phase == ReadPhase::Ways)
-						osmStore.ways.batchStart();
-
-					for (const IndexedBlockMetadata& indexedBlockMetadata: blockRange) {
-						auto infile = generate_stream();
-						auto output = generate_output();
-
-						if(ReadBlock(*infile, *output, indexedBlockMetadata, nodeKeys, locationsOnWays, phase)) {
-							const std::lock_guard<std::mutex> lock(block_mutex);
-							blocks.erase(indexedBlockMetadata.index);	
+			{
+				for(const std::vector<IndexedBlockMetadata>& blockRange: blockRanges) {
+					boost::asio::post(pool, [=, &blockRange, &blocks, &block_mutex, &nodeKeys]() {
+						if (phase == ReadPhase::Nodes)
+							osmStore.nodes.batchStart();
+						if (phase == ReadPhase::Ways)
+							osmStore.ways.batchStart();
+
+						for (const IndexedBlockMetadata& indexedBlockMetadata: blockRange) {
+							auto infile = generate_stream();
+							auto output = generate_output();
+
+							if(ReadBlock(*infile, *output, indexedBlockMetadata, nodeKeys, locationsOnWays, phase, shard, effectiveShards)) {
+								const std::lock_guard<std::mutex> lock(block_mutex);
+								blocks.erase(indexedBlockMetadata.index);	
+							}
 							blocksProcessed++;
 						}
-					}
-				});
+					});
+				}
 			}
+		
+			pool.join();
+
+#ifdef CLOCK_MONOTONIC
+			clock_gettime(CLOCK_MONOTONIC, &end);
+			uint64_t elapsedNs = 1e9 * (end.tv_sec - start.tv_sec) + end.tv_nsec - start.tv_nsec;
+			std::cout << "(" << std::to_string((uint32_t)(elapsedNs / 1e6)) << " ms)" << std::endl;
+#endif
 		}
-	
-		pool.join();
 
 		if(phase == ReadPhase::Nodes) {
 			osmStore.nodes.finalize(threadNum);
diff --git a/src/sharded_node_store.cpp b/src/sharded_node_store.cpp
new file mode 100644
index 00000000..0d915fbd
--- /dev/null
+++ b/src/sharded_node_store.cpp
@@ -0,0 +1,103 @@
+#include "sharded_node_store.h"
+
+thread_local size_t lastNodeShard = 0;
+
+ShardedNodeStore::ShardedNodeStore(std::function<std::shared_ptr<NodeStore>()> createNodeStore):
+	createNodeStore(createNodeStore) {
+	// One backing store per shard, created in shard order.
+	for (size_t made = 0; made < shards(); ++made) stores.push_back(createNodeStore());
+}
+
+ShardedNodeStore::~ShardedNodeStore() { // empty body: no explicit cleanup is performed here
+}
+
+// Reopens every shard's backing store.
+void ShardedNodeStore::reopen() {
+	for (auto& shardStore : stores) shardStore->reopen();
+}
+
+// Finalizes each shard in turn, forwarding the same thread count.
+void ShardedNodeStore::finalize(size_t threadNum) {
+	for (auto& shardStore : stores) shardStore->finalize(threadNum);
+}
+
+// Probes the shards round-robin, starting with the one that answered this thread's last lookup.
+LatpLon ShardedNodeStore::at(NodeID id) const {
+	const size_t shardCount = shards();
+	const size_t firstShard = lastNodeShard;
+	for (size_t step = 0; step < shardCount; ++step) {
+		const size_t candidate = (firstShard + step) % shardCount;
+		if (!stores[candidate]->contains(0, id)) continue;
+		lastNodeShard = candidate;
+		return stores[candidate]->at(id);
+	}
+	// No shard claims the ID: defer to the last shard's at() for the outcome.
+	return stores[shardCount - 1]->at(id);
+}
+
+// Sum of size() over all shards.
+size_t ShardedNodeStore::size() const {
+	size_t total = 0;
+	for (const auto& shardStore : stores)
+		total += shardStore->size();
+	return total;
+}
+
+// Signals the start of a batch to every shard.
+void ShardedNodeStore::batchStart() {
+	for (auto& shardStore : stores) shardStore->batchStart();
+}
+
+size_t pickStore(const LatpLon& el) { // maps a location to a shard number in 0..5
+	// Assign the element to a shard. This is a pretty naive division
+	// of the globe, tuned to have max ~10GB of nodes/ways per shard.
+
+	const size_t z5x = lon2tilex(el.lon / 10000000, 5); // NOTE(review): if lon/latp are integers, this division drops fractional degrees - confirm intended
+	const size_t z5y = latp2tiley(el.latp / 10000000, 5);
+
+	const size_t z4x = z5x / 2; // each halving steps one zoom level out
+	const size_t z4y = z5y / 2;
+
+	const size_t z3x = z4x / 2;
+	const size_t z3y = z4y / 2;
+
+	if (z3x == 5 && z3y == 2) return 5; // Western Russia
+	if (z3x == 4 && z3y == 3) return 5; // North Africa
+	if (z3x == 5 && z3y == 3) return 5; // India
+
+	if ((z5x == 16 && z5y == 10) || (z5x == 16 && z5y == 11)) return 4; // some of Central Europe
+	if ((z5x == 17 && z5y == 10) || (z5x == 17 && z5y == 11)) return 1; // some more of Central Europe
+
+	if (z3x == 4 && z3y == 2) return 3; // rest of Central Europe
+
+	const size_t z2x = z3x / 2;
+	const size_t z2y = z3y / 2;
+
+	if (z2x == 3 && z2y == 1) return 3; // Asia, Russia
+	if (z2x == 1 && z2y == 1) return 2; // North Atlantic Ocean and bordering countries
+	if (z2x == 0 && z2y == 1) return 1; // North America
+
+//	std::cout << "z2x=" << std::to_string(z2x) << ", z2y=" << std::to_string(z2y) << std::endl;
+	return 0; // Arctic, Antarctica, Oceania, South Africa, South America
+}
+
+// Buckets elements by the shard pickStore() assigns, then inserts each non-empty bucket.
+void ShardedNodeStore::insert(const std::vector<element_t>& elements) {
+	const size_t shardCount = shards();
+	std::vector<std::vector<element_t>> buckets(shardCount);
+	for (const auto& element : elements)
+		buckets[pickStore(element.second)].push_back(element);
+
+	for (size_t shardIndex = 0; shardIndex < shardCount; ++shardIndex) {
+		if (buckets[shardIndex].empty()) continue;
+		stores[shardIndex]->insert(buckets[shardIndex]);
+	}
+}
+
+bool ShardedNodeStore::contains(size_t shard, NodeID id) const {
+	return stores[shard]->contains(0, id); // asks only the requested shard; the inner shard argument is always 0
+}
+
+size_t ShardedNodeStore::shards() const {
+	return 6; // pickStore() hands out shard numbers 0..5
+}
diff --git a/src/sharded_way_store.cpp b/src/sharded_way_store.cpp
new file mode 100644
index 00000000..d9741082
--- /dev/null
+++ b/src/sharded_way_store.cpp
@@ -0,0 +1,103 @@
+#include "sharded_way_store.h"
+#include "node_store.h"
+
+// Shard in which the calling thread most recently found a way; at() starts
+// its search there.
+thread_local size_t lastWayShard = 0;
+
+// Builds one underlying WayStore per shard by calling the createWayStore
+// factory. The shard count is taken from nodeStore (see shards()).
+ShardedWayStore::ShardedWayStore(std::function<std::shared_ptr<WayStore>()> createWayStore, const NodeStore& nodeStore):
+	createWayStore(createWayStore),
+	nodeStore(nodeStore) {
+	for (size_t i = 0; i < shards(); i++)
+		stores.push_back(createWayStore());
+}
+
+ShardedWayStore::~ShardedWayStore() {
+}
+
+// Calls reopen() on every shard.
+void ShardedWayStore::reopen() {
+	for (auto& store : stores)
+		store->reopen();
+}
+
+// Calls batchStart() on every shard.
+void ShardedWayStore::batchStart() {
+	for (auto& store : stores)
+		store->batchStart();
+}
+
+// Looks up a way by probing each shard in turn, beginning with the shard
+// that satisfied this thread's previous lookup.
+std::vector<LatpLon> ShardedWayStore::at(WayID wayid) const {
+	const size_t shardCount = shards();
+	for (size_t i = 0; i < shardCount; i++) {
+		const size_t index = (lastWayShard + i) % shardCount;
+		if (stores[index]->contains(0, wayid)) {
+			lastWayShard = index;
+			return stores[index]->at(wayid);
+		}
+	}
+
+	// No shard contains the way. Delegate to the last shard so that the
+	// underlying store decides how a missing way is reported; this also gives
+	// every path a return statement, which silences a compiler warning.
+	return stores[shardCount - 1]->at(wayid);
+}
+
+// Reports the first shard's answer. NOTE(review): presumably every shard is
+// built by the same factory and so agrees -- confirm.
+bool ShardedWayStore::requiresNodes() const {
+	return stores[0]->requiresNodes();
+}
+
+// Always throws std::runtime_error: per the message, this must not be called on the sharded store.
+void ShardedWayStore::insertLatpLons(std::vector<WayStore::ll_element_t> &newWays) {
+	throw std::runtime_error("ShardedWayStore::insertLatpLons: don't call this directly");
+}
+
+// Always throws std::runtime_error: per the message, this must not be called on the sharded store.
+void ShardedWayStore::insertNodes(const std::vector<std::pair<WayID, std::vector<NodeID>>>& newWays) {
+	throw std::runtime_error("ShardedWayStore::insertNodes: don't call this directly");
+}
+
+// Calls clear() on every shard.
+void ShardedWayStore::clear() {
+	for (auto& store : stores)
+		store->clear();
+}
+
+// Sum of size() over all shards.
+std::size_t ShardedWayStore::size() const {
+	size_t rv = 0;
+	for (auto& store : stores)
+		rv += store->size();
+	return rv;
+}
+
+// Calls finalize(threadNum) on every shard, one after another.
+void ShardedWayStore::finalize(unsigned int threadNum) {
+	for (auto& store : stores)
+		store->finalize(threadNum);
+}
+
+// True if shard `shard` holds way `id`. The shard index handed to the
+// underlying store is always 0.
+bool ShardedWayStore::contains(size_t shard, WayID id) const {
+	return stores[shard]->contains(0, id);
+}
+
+// Direct access to the store backing one shard.
+WayStore& ShardedWayStore::shard(size_t shard) {
+	return *stores[shard].get();
+}
+
+const WayStore& ShardedWayStore::shard(size_t shard) const {
+	return *stores[shard].get();
+}
+
+// The shard count is whatever the node store reports.
+size_t ShardedWayStore::shards() const { return nodeStore.shards(); }
+
diff --git a/src/shared_data.cpp b/src/shared_data.cpp
index 78cfe11d..da9787d8 100644
--- a/src/shared_data.cpp
+++ b/src/shared_data.cpp
@@ -10,7 +10,7 @@ using namespace rapidjson;
 
 SharedData::SharedData(Config &configIn, const class LayerDefinition &layers)
 	: layers(layers), config(configIn) {
-	outputMode=OUTPUT_FILE;
+	outputMode=OptionsParser::OutputMode::File;
 	mergeSqlite=false;
 }
 
diff --git a/src/sorted_node_store.cpp b/src/sorted_node_store.cpp
index 76aa81b8..82dccb55 100644
--- a/src/sorted_node_store.cpp
+++ b/src/sorted_node_store.cpp
@@ -2,7 +2,6 @@
 #include <algorithm>
 #include <cstring>
 #include <string>
-#include <atomic>
 #include <map>
 #include <bitset>
 #include "sorted_node_store.h"
@@ -16,40 +15,51 @@ namespace SortedNodeStoreTypes {
 	const uint16_t ChunkAlignment = 16;
 	const uint32_t ChunkCompressed = 1 << 31;
 
-	std::atomic<uint64_t> totalGroups;
-	std::atomic<uint64_t> totalNodes;
-	std::atomic<uint64_t> totalGroupSpace;
-	std::atomic<uint64_t> totalAllocatedSpace;
-	std::atomic<uint64_t> totalChunks;
-	std::atomic<uint64_t> chunkSizeFreqs[257];
-	std::atomic<uint64_t> groupSizeFreqs[257];
-
-
-	// When SortedNodeStore first starts, it's not confident that it has seen an
-	// entire segment, so it's in "collecting orphans" mode. Once it crosses a
-	// threshold of 64K elements, it ceases to be in this mode.
-	//
-	// Orphans are rounded up across multiple threads, and dealt with in
-	// the finalize step.
-	thread_local bool collectingOrphans = true;
-	thread_local uint64_t groupStart = -1;
-	thread_local std::vector<NodeStore::element_t>* localNodes = nullptr;
-
-	thread_local int64_t cachedChunk = -1;
-	thread_local std::vector<int32_t> cacheChunkLons;
-	thread_local std::vector<int32_t> cacheChunkLatps;
-
-	thread_local uint32_t arenaSpace = 0;
-	thread_local char* arenaPtr = nullptr;
+	// Per-thread working state for one SortedNodeStore instance. Every scalar
+	// field carries a default member initializer, so a value-initialized
+	// ThreadStorage() is the "fresh" state.
+	struct ThreadStorage {
+		// When SortedNodeStore first starts, it's not confident that it has seen an
+		// entire segment, so it's in "collecting orphans" mode. Once it crosses a
+		// threshold of 64K elements, it ceases to be in this mode.
+		//
+		// Orphans are rounded up across multiple threads, and dealt with in
+		// the finalize step.
+		bool collectingOrphans = true;
+		// ID at which the group being collected starts; -1 means not yet determined.
+		uint64_t groupStart = -1;
+		// This thread's buffer inside the store's workerBuffers; assigned on first insert().
+		std::vector<NodeStore::element_t>* localNodes = nullptr;
+
+		// Single-entry cache of the most recently decoded compressed chunk (-1: none).
+		int64_t cachedChunk = -1;
+		std::vector<int32_t> cacheChunkLons;
+		std::vector<int32_t> cacheChunkLatps;
+
+		// Bytes remaining in, and next free byte of, this thread's current arena.
+		uint32_t arenaSpace = 0;
+		char* arenaPtr = nullptr;
+	};
+
+	thread_local std::deque<std::pair<const SortedNodeStore*, ThreadStorage>> threadStorage;
+
+	// Returns the calling thread's state for store `who`, creating it on first use.
+	ThreadStorage& s(const SortedNodeStore* who) {
+		for (auto& slot : threadStorage) {
+			if (slot.first == who)
+				return slot.second;
+		}
+		// Not seen on this thread yet: append a fresh entry and hand it back.
+		threadStorage.emplace_back(who, ThreadStorage());
+		return threadStorage.back().second;
+	}
 }
 
 using namespace SortedNodeStoreTypes;
 
 SortedNodeStore::SortedNodeStore(bool compressNodes): compressNodes(compressNodes) {
-	// Each group can store 64K nodes. If we allocate 256K slots
-	// for groups, we support 2^34 = 17B nodes, or about twice
-	// the number used by OSM as of November 2023.
-	groups.resize(256 * 1024);
+	s(this); // allocate our ThreadStorage before multi-threading
+	reopen();
 }
 
 void SortedNodeStore::reopen()
@@ -61,11 +71,16 @@ void SortedNodeStore::reopen()
 	totalNodes = 0;
 	totalGroups = 0;
 	totalGroupSpace = 0;
+	totalAllocatedSpace = 0;
 	totalChunks = 0;
 	memset(chunkSizeFreqs, 0, sizeof(chunkSizeFreqs));
 	memset(groupSizeFreqs, 0, sizeof(groupSizeFreqs));
 	orphanage.clear();
 	workerBuffers.clear();
+
+	// Each group can store 64K nodes. If we allocate 256K slots
+	// for groups, we support 2^34 = 17B nodes, or about twice
+	// the number used by OSM as of November 2023.
 	groups.clear();
 	groups.resize(256 * 1024);
 }
@@ -73,6 +88,48 @@ void SortedNodeStore::reopen()
 SortedNodeStore::~SortedNodeStore() {
 	for (const auto entry: allocatedMemory)
 		void_mmap_allocator::deallocate(entry.first, entry.second);
+
+	s(this) = ThreadStorage();
+}
+
+// Returns true if node `id` is present in this store, false otherwise.
+// `shard` is ignored: the body never reads it. An ID whose group falls
+// outside the `groups` table is reported as absent.
+bool SortedNodeStore::contains(size_t shard, NodeID id) const {
+	const size_t groupIndex = id / (GroupSize * ChunkSize);
+	const size_t chunk = (id % (GroupSize * ChunkSize)) / ChunkSize;
+	const uint64_t chunkMaskByte = chunk / 8;
+	const uint64_t chunkMaskBit = chunk % 8;
+
+	const uint64_t nodeMaskByte = (id % ChunkSize) / 8;
+	const uint64_t nodeMaskBit = id % 8;
+
+	// Guard the table lookup: indexing past the end of `groups` would be
+	// undefined behaviour rather than a clean "not found".
+	if (groupIndex >= groups.size())
+		return false;
+
+	GroupInfo* groupPtr = groups[groupIndex];
+	if (groupPtr == nullptr)
+		return false;
+
+	// The group has no chunk in the slot this ID maps to.
+	if (!(groupPtr->chunkMask[chunkMaskByte] & (1 << chunkMaskBit)))
+		return false;
+
+	// The chunk's index into chunkOffsets is the number of chunkMask bits set
+	// below its own bit.
+	size_t chunkOffset = popcnt(groupPtr->chunkMask, chunkMaskByte);
+	uint8_t maskByte = groupPtr->chunkMask[chunkMaskByte];
+	maskByte = maskByte & ((1 << chunkMaskBit) - 1);
+	chunkOffset += popcnt(&maskByte, 1);
+
+	uint16_t scaledOffset = groupPtr->chunkOffsets[chunkOffset];
+	ChunkInfoBase* basePtr = (ChunkInfoBase*)(((char *)(groupPtr->chunkOffsets + popcnt(groupPtr->chunkMask, 32))) + (scaledOffset * ChunkAlignment));
+
+	// Only presence is needed, so test the node's bit in the chunk's nodeMask
+	// directly; no offset into the chunk's node data has to be computed.
+	return (basePtr->nodeMask[nodeMaskByte] & (1 << nodeMaskBit)) != 0;
 }
 
 LatpLon SortedNodeStore::at(const NodeID id) const {
@@ -109,29 +166,30 @@ LatpLon SortedNodeStore::at(const NodeID id) const {
 		size_t latpSize = (ptr->flags >> 10) & ((1 << 10) - 1);
 		// TODO: we don't actually need the lonSize to decompress the data.
 		//       May as well store it as a sanity check for now.
-		size_t lonSize = ptr->flags & ((1 << 10) - 1);
+		// size_t lonSize = ptr->flags & ((1 << 10) - 1);
 		size_t n = popcnt(ptr->nodeMask, 32) - 1;
 
 		const size_t neededChunk = groupIndex * ChunkSize + chunk;
 
 		// Really naive caching strategy - just cache the last-used chunk.
 		// Probably good enough?
-		if (cachedChunk != neededChunk) {
-			cachedChunk = neededChunk;
-			cacheChunkLons.reserve(256);
-			cacheChunkLatps.reserve(256);
+		ThreadStorage& tls = s(this);
+		if (tls.cachedChunk != neededChunk) {
+			tls.cachedChunk = neededChunk;
+			tls.cacheChunkLons.reserve(256);
+			tls.cacheChunkLatps.reserve(256);
 
 			uint8_t* latpData = ptr->data;
 			uint8_t* lonData = ptr->data + latpSize;
 			uint32_t recovdata[256] = {0};
 
 			streamvbyte_decode(latpData, recovdata, n);
-			cacheChunkLatps[0] = ptr->firstLatp;
-			zigzag_delta_decode(recovdata, &cacheChunkLatps[1], n, cacheChunkLatps[0]);
+			tls.cacheChunkLatps[0] = ptr->firstLatp;
+			zigzag_delta_decode(recovdata, &tls.cacheChunkLatps[1], n, tls.cacheChunkLatps[0]);
 
 			streamvbyte_decode(lonData, recovdata, n);
-			cacheChunkLons[0] = ptr->firstLon;
-			zigzag_delta_decode(recovdata, &cacheChunkLons[1], n, cacheChunkLons[0]);
+			tls.cacheChunkLons[0] = ptr->firstLon;
+			zigzag_delta_decode(recovdata, &tls.cacheChunkLons[1], n, tls.cacheChunkLons[0]);
 		}
 
 		size_t nodeOffset = 0;
@@ -142,7 +200,7 @@ LatpLon SortedNodeStore::at(const NodeID id) const {
 		if (!(ptr->nodeMask[nodeMaskByte] & (1 << nodeMaskBit)))
 			throw std::out_of_range("SortedNodeStore: node " + std::to_string(id) + " missing, no node");
 
-		return { cacheChunkLatps[nodeOffset], cacheChunkLons[nodeOffset] };
+		return { tls.cacheChunkLatps[nodeOffset], tls.cacheChunkLons[nodeOffset] };
 	}
 
 	UncompressedChunkInfo* ptr = (UncompressedChunkInfo*)basePtr;
@@ -184,58 +242,60 @@ size_t SortedNodeStore::size() const {
 }
 
 void SortedNodeStore::insert(const std::vector<element_t>& elements) {
-	if (localNodes == nullptr) {
+	ThreadStorage& tls = s(this);
+	if (tls.localNodes == nullptr) {
 		std::lock_guard<std::mutex> lock(orphanageMutex);
 		if (workerBuffers.size() == 0)
 			workerBuffers.reserve(256);
 		else if (workerBuffers.size() == workerBuffers.capacity())
 			throw std::runtime_error("SortedNodeStore doesn't support more than 256 cores");
 		workerBuffers.push_back(std::vector<element_t>());
-		localNodes = &workerBuffers.back();
+		tls.localNodes = &workerBuffers.back();
 	}
 
-	if (groupStart == -1) {
+	if (tls.groupStart == -1 && !elements.empty()) { // elements[0] below requires a non-empty batch
 		// Mark where the first full group starts, so we know when to transition
 		// out of collecting orphans.
-		groupStart = elements[0].first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize);
+		tls.groupStart = elements[0].first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize);
 	}
 
 	int i = 0;
-	while (collectingOrphans && i < elements.size()) {
+	while (tls.collectingOrphans && i < elements.size()) {
 		const element_t& el = elements[i];
-		if (el.first >= groupStart + (GroupSize * ChunkSize)) {
-			collectingOrphans = false;
+		if (el.first >= tls.groupStart + (GroupSize * ChunkSize)) {
+			tls.collectingOrphans = false;
 			// Calculate new groupStart, rounding to previous boundary.
-			groupStart = el.first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize);
-			collectOrphans(*localNodes);
-			localNodes->clear();
+			tls.groupStart = el.first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize);
+			collectOrphans(*tls.localNodes);
+			tls.localNodes->clear();
 		}
-		localNodes->push_back(el);
+		tls.localNodes->push_back(el);
 		i++;
 	}
 
 	while(i < elements.size()) {
 		const element_t& el = elements[i];
 
-		if (el.first >= groupStart + (GroupSize * ChunkSize)) {
-			publishGroup(*localNodes);
-			localNodes->clear();
-			groupStart = el.first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize);
+		if (el.first >= tls.groupStart + (GroupSize * ChunkSize)) {
+			publishGroup(*tls.localNodes);
+			tls.localNodes->clear();
+			tls.groupStart = el.first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize);
 		}
 
-		localNodes->push_back(el);
+		tls.localNodes->push_back(el);
 		i++;
 	}
 }
 
 void SortedNodeStore::batchStart() {
-	collectingOrphans = true;
-	groupStart = -1;
-	if (localNodes == nullptr || localNodes->size() == 0)
+	ThreadStorage& tls = s(this);
+	tls.collectingOrphans = true;
+	tls.groupStart = -1;
+	if (tls.localNodes == nullptr || tls.localNodes->size() == 0)
 		return;
 
-	collectOrphans(*localNodes);
-	localNodes->clear();
+	collectOrphans(*tls.localNodes);
+	tls.localNodes->clear();
 }
 
 void SortedNodeStore::finalize(size_t threadNum) {
@@ -264,7 +324,7 @@ void SortedNodeStore::finalize(size_t threadNum) {
 
 	orphanage.clear();
 
-	std::cout << "SortedNodeStore: " << totalGroups << " groups, " << totalChunks << " chunks, " << totalNodes.load() << " nodes, " << totalGroupSpace.load() << " bytes (" << (1000ull * (totalAllocatedSpace.load() - totalGroupSpace.load()) / totalAllocatedSpace.load()) / 10.0 << "% wasted)" << std::endl;
+	std::cout << "SortedNodeStore: " << totalGroups << " groups, " << totalChunks << " chunks, " << totalNodes.load() << " nodes, " << totalGroupSpace.load() << " bytes (" << (1000ull * (totalAllocatedSpace.load() - totalGroupSpace.load()) / (totalAllocatedSpace.load() + 1)) / 10.0 << "% wasted)" << std::endl;
 	/*
 	for (int i = 0; i < 257; i++)
 		std::cout << "chunkSizeFreqs[ " << i << " ]= " << chunkSizeFreqs[i].load() << std::endl;
@@ -410,22 +470,23 @@ void SortedNodeStore::publishGroup(const std::vector<element_t>& nodes) {
 
 	GroupInfo* groupInfo = nullptr;
 
-	if (arenaSpace < groupSpace) {
+	ThreadStorage& tls = s(this);
+	if (tls.arenaSpace < groupSpace) {
 		// A full group takes ~330KB. Nodes are read _fast_, and there ends
 		// up being contention calling the allocator when reading the
 		// planet on a machine with 48 cores -- so allocate in large chunks.
-		arenaSpace = 4 * 1024 * 1024;
-		totalAllocatedSpace += arenaSpace;
-		arenaPtr = (char*)void_mmap_allocator::allocate(arenaSpace);
-		if (arenaPtr == nullptr)
+		tls.arenaSpace = 4 * 1024 * 1024;
+		totalAllocatedSpace += tls.arenaSpace;
+		tls.arenaPtr = (char*)void_mmap_allocator::allocate(tls.arenaSpace);
+		if (tls.arenaPtr == nullptr)
 			throw std::runtime_error("SortedNodeStore: failed to allocate arena");
 		std::lock_guard<std::mutex> lock(orphanageMutex);
-		allocatedMemory.push_back(std::make_pair((void*)arenaPtr, arenaSpace));
+		allocatedMemory.push_back(std::make_pair((void*)tls.arenaPtr, tls.arenaSpace));
 	}
 
-	arenaSpace -= groupSpace;
-	groupInfo = (GroupInfo*)arenaPtr;
-	arenaPtr += groupSpace;
+	tls.arenaSpace -= groupSpace;
+	groupInfo = (GroupInfo*)tls.arenaPtr;
+	tls.arenaPtr += groupSpace;
 
 	if (groups[groupIndex] != nullptr)
 		throw std::runtime_error("SortedNodeStore: group already present");
diff --git a/src/sorted_way_store.cpp b/src/sorted_way_store.cpp
index 8fdaa806..450a4bcc 100644
--- a/src/sorted_way_store.cpp
+++ b/src/sorted_way_store.cpp
@@ -1,4 +1,3 @@
-#include <atomic>
 #include <algorithm>
 #include <bitset>
 #include <cstring>
@@ -19,40 +18,56 @@ namespace SortedWayStoreTypes {
 	const uint16_t ClosedWay = 1 << 14;
 	const uint16_t UniformUpperBits = 1 << 13;
 
-	thread_local bool collectingOrphans = true;
-	thread_local uint64_t groupStart = -1;
-	thread_local std::vector<std::pair<WayID, std::vector<NodeID>>>* localWays = NULL;
+	struct ThreadStorage {
+		ThreadStorage():
+			collectingOrphans(true),
+			groupStart(-1),
+			localWays(nullptr) {}
 
-	thread_local std::vector<uint8_t> encodedWay;
+		bool collectingOrphans;
+		uint64_t groupStart;
+		std::vector<std::pair<WayID, std::vector<NodeID>>>* localWays;
+		std::vector<uint8_t> encodedWay;
+	};
+
+	thread_local std::deque<std::pair<const SortedWayStore*, ThreadStorage>> threadStorage;
+
+	// Returns the calling thread's state for store `who`, creating it on first use.
+	inline ThreadStorage& s(const SortedWayStore* who) {
+		for (auto& slot : threadStorage) {
+			if (slot.first == who)
+				return slot.second;
+		}
+		// Not seen on this thread yet: append a fresh entry and hand it back.
+		threadStorage.emplace_back(who, ThreadStorage());
+		return threadStorage.back().second;
+	}
 
 	// C++ doesn't support variable length arrays declared on stack.
 	// g++ and clang support it, but msvc doesn't. Rather than pay the
 	// cost of a vector for every decode, we use a thread_local with room for at
 	// least 2,000 nodes.
+	//
+	// Note: these are scratch buffers, so they remain as true thread-locals,
+	// and aren't part of ThreadStorage.
 	thread_local uint64_t highBytes[2000];
 	thread_local uint32_t uint32Buffer[2000];
 	thread_local int32_t int32Buffer[2000];
 	thread_local uint8_t uint8Buffer[8192];
-
-	std::atomic<uint64_t> totalWays;
-	std::atomic<uint64_t> totalNodes;
-	std::atomic<uint64_t> totalGroups;
-	std::atomic<uint64_t> totalGroupSpace;
-	std::atomic<uint64_t> totalChunks;
 }
 
 using namespace SortedWayStoreTypes;
 
 SortedWayStore::SortedWayStore(bool compressWays, const NodeStore& nodeStore): compressWays(compressWays), nodeStore(nodeStore) {
-	// Each group can store 64K ways. If we allocate 32K slots,
-	// we support 2^31 = 2B ways, or about twice the number used
-	// by OSM as of December 2023.
-	groups.resize(32 * 1024);
+	s(this); // allocate our ThreadStorage before multi-threading
+	reopen();
 }
 
 SortedWayStore::~SortedWayStore() {
 	for (const auto entry: allocatedMemory)
 		void_mmap_allocator::deallocate(entry.first, entry.second);
+
+	s(this) = ThreadStorage();
 }
 
 void SortedWayStore::reopen() {
@@ -67,11 +82,64 @@ void SortedWayStore::reopen() {
 	totalChunks = 0;
 	orphanage.clear();
 	workerBuffers.clear();
+
+	// Each group can store 64K ways. If we allocate 32K slots,
+	// we support 2^31 = 2B ways, or about twice the number used
+	// by OSM as of December 2023.
 	groups.clear();
-	groups.resize(256 * 1024);
+	groups.resize(32 * 1024);
 
 }
 
+// Returns true if way `id` is present in this store, whether it is recorded
+// in its chunk's smallWayMask or bigWayMask. `shard` is ignored: the body
+// never reads it. An ID whose group falls outside the `groups` table is
+// reported as absent.
+bool SortedWayStore::contains(size_t shard, WayID id) const {
+	const size_t groupIndex = id / (GroupSize * ChunkSize);
+	const size_t chunk = (id % (GroupSize * ChunkSize)) / ChunkSize;
+	const uint64_t chunkMaskByte = chunk / 8;
+	const uint64_t chunkMaskBit = chunk % 8;
+
+	const uint64_t wayMaskByte = (id % ChunkSize) / 8;
+	const uint64_t wayMaskBit = id % 8;
+
+	// Guard the table lookup: indexing past the end of `groups` would be
+	// undefined behaviour rather than a clean "not found".
+	if (groupIndex >= groups.size())
+		return false;
+
+	GroupInfo* groupPtr = groups[groupIndex];
+	if (groupPtr == nullptr)
+		return false;
+
+	// The group has no chunk in the slot this ID maps to.
+	if (!(groupPtr->chunkMask[chunkMaskByte] & (1 << chunkMaskBit)))
+		return false;
+
+	// The chunk's index into chunkOffsets is the number of chunkMask bits set
+	// below its own bit.
+	size_t chunkOffset = popcnt(groupPtr->chunkMask, chunkMaskByte);
+	uint8_t maskByte = groupPtr->chunkMask[chunkMaskByte];
+	maskByte = maskByte & ((1 << chunkMaskBit) - 1);
+	chunkOffset += popcnt(&maskByte, 1);
+
+	// chunkOffsets entries are byte offsets measured from the start of the group.
+	ChunkInfo* chunkPtr = (ChunkInfo*)((char*)groupPtr + groupPtr->chunkOffsets[chunkOffset]);
+
+	// Only presence is needed here, so test the way's bit in each mask
+	// directly; no offset into the chunk's way data has to be computed.
+	const uint8_t wayBit = 1 << wayMaskBit;
+	if (chunkPtr->smallWayMask[wayMaskByte] & wayBit)
+		return true;
+
+	// Not recorded as a small way; it may still be recorded as a big way.
+	if (chunkPtr->bigWayMask[wayMaskByte] & wayBit)
+		return true;
+
+	return false;
+}
+
 std::vector<LatpLon> SortedWayStore::at(WayID id) const {
 	const size_t groupIndex = id / (GroupSize * ChunkSize);
 	const size_t chunk = (id % (GroupSize * ChunkSize)) / ChunkSize;
@@ -140,52 +208,53 @@ void SortedWayStore::insertLatpLons(std::vector<WayStore::ll_element_t> &newWays
 	throw std::runtime_error("SortedWayStore does not support insertLatpLons");
 }
 
-const void SortedWayStore::insertNodes(const std::vector<std::pair<WayID, std::vector<NodeID>>>& newWays) {
+void SortedWayStore::insertNodes(const std::vector<std::pair<WayID, std::vector<NodeID>>>& newWays) {
 	// read_pbf can call with an empty array if the only ways it read were unable to
 	// be processed due to missing nodes, so be robust against empty way vector.
 	if (newWays.empty())
 		return;
 
-	if (localWays == nullptr) {
+	ThreadStorage& tls = s(this);
+	if (tls.localWays == nullptr) {
 		std::lock_guard<std::mutex> lock(orphanageMutex);
 		if (workerBuffers.size() == 0)
 			workerBuffers.reserve(256);
 		else if (workerBuffers.size() == workerBuffers.capacity())
 			throw std::runtime_error("SortedWayStore doesn't support more than 256 cores");
 		workerBuffers.push_back(std::vector<std::pair<WayID, std::vector<NodeID>>>());
-		localWays = &workerBuffers.back();
+		tls.localWays = &workerBuffers.back();
 	}
 
-	if (groupStart == -1) {
+	if (tls.groupStart == -1) {
 		// Mark where the first full group starts, so we know when to transition
 		// out of collecting orphans.
-		groupStart = newWays[0].first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize);
+		tls.groupStart = newWays[0].first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize);
 	}
 
 	int i = 0;
-	while (collectingOrphans && i < newWays.size()) {
+	while (tls.collectingOrphans && i < newWays.size()) {
 		const auto& el = newWays[i];
-		if (el.first >= groupStart + (GroupSize * ChunkSize)) {
-			collectingOrphans = false;
+		if (el.first >= tls.groupStart + (GroupSize * ChunkSize)) {
+			tls.collectingOrphans = false;
 			// Calculate new groupStart, rounding to previous boundary.
-			groupStart = el.first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize);
-			collectOrphans(*localWays);
-			localWays->clear();
+			tls.groupStart = el.first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize);
+			collectOrphans(*tls.localWays);
+			tls.localWays->clear();
 		}
-		localWays->push_back(el);
+		tls.localWays->push_back(el);
 		i++;
 	}
 
 	while(i < newWays.size()) {
 		const auto& el = newWays[i];
 
-		if (el.first >= groupStart + (GroupSize * ChunkSize)) {
-			publishGroup(*localWays);
-			localWays->clear();
-			groupStart = el.first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize);
+		if (el.first >= tls.groupStart + (GroupSize * ChunkSize)) {
+			publishGroup(*tls.localWays);
+			tls.localWays->clear();
+			tls.groupStart = el.first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize);
 		}
 
-		localWays->push_back(el);
+		tls.localWays->push_back(el);
 		i++;
 	}
 }
@@ -229,13 +298,14 @@ void SortedWayStore::finalize(unsigned int threadNum) {
 }
 
 void SortedWayStore::batchStart() {
-	collectingOrphans = true;
-	groupStart = -1;
-	if (localWays == nullptr || localWays->size() == 0)
+	ThreadStorage& tls = s(this);
+	tls.collectingOrphans = true;
+	tls.groupStart = -1;
+	if (tls.localWays == nullptr || tls.localWays->size() == 0)
 		return;
 
-	collectOrphans(*localWays);
-	localWays->clear();
+	collectOrphans(*tls.localWays);
+	tls.localWays->clear();
 }
 
 void SortedWayStore::collectOrphans(const std::vector<std::pair<WayID, std::vector<NodeID>>>& orphans) {
@@ -244,6 +314,7 @@ void SortedWayStore::collectOrphans(const std::vector<std::pair<WayID, std::vect
 
 	std::vector<std::pair<WayID, std::vector<NodeID>>>& vec = orphanage[groupIndex];
 	const size_t i = vec.size();
+
 	vec.resize(i + orphans.size());
 	std::copy(orphans.begin(), orphans.end(), vec.begin() + i);
 }
@@ -284,7 +355,6 @@ std::vector<NodeID> SortedWayStore::decodeWay(uint16_t flags, const uint8_t* inp
 		for (int i = 0; i < length; i++)
 			rv.push_back(highBytes[i] | lowIntData[i]);
 	} else {
-		uint16_t compressedLength = *(uint16_t*)input;
 		input += 2;
 
 		uint32_t firstInt = *(uint32_t*)(input);
@@ -408,6 +478,7 @@ void populateMask(uint8_t* mask, const std::vector<uint8_t>& ids) {
 }
 
 void SortedWayStore::publishGroup(const std::vector<std::pair<WayID, std::vector<NodeID>>>& ways) {
+	ThreadStorage& tls = s(this);
 	totalWays += ways.size();
 	if (ways.size() == 0) {
 		throw std::runtime_error("SortedWayStore: group is empty");
@@ -451,12 +522,12 @@ void SortedWayStore::publishGroup(const std::vector<std::pair<WayID, std::vector
 		const WayID id = way.first;
 		lastChunk->wayIds.push_back(id % ChunkSize);
 
-		uint16_t flags = encodeWay(way.second, encodedWay, compressWays && way.second.size() >= 4);
+		uint16_t flags = encodeWay(way.second, tls.encodedWay, compressWays && way.second.size() >= 4);
 		lastChunk->wayFlags.push_back(flags);
 
 		std::vector<uint8_t> encoded;
-		encoded.resize(encodedWay.size());
-		memcpy(encoded.data(), encodedWay.data(), encodedWay.size());
+		encoded.resize(tls.encodedWay.size());
+		memcpy(encoded.data(), tls.encodedWay.data(), tls.encodedWay.size());
 
 		lastChunk->encodedWays.push_back(std::move(encoded));
 	}
diff --git a/src/tile_data.cpp b/src/tile_data.cpp
index 696ed333..f78bbdda 100644
--- a/src/tile_data.cpp
+++ b/src/tile_data.cpp
@@ -47,12 +47,14 @@ TileDataSource::TileDataSource(size_t threadNum, unsigned int baseZoom, bool inc
 	z6OffsetDivisor(baseZoom >= CLUSTER_ZOOM ? (1 << (baseZoom - CLUSTER_ZOOM)) : 1),
 	objectsMutex(threadNum * 4),
 	objects(CLUSTER_ZOOM_AREA),
+	lowZoomObjects(CLUSTER_ZOOM_AREA),
 	objectsWithIds(CLUSTER_ZOOM_AREA),
+	lowZoomObjectsWithIds(CLUSTER_ZOOM_AREA),
 	baseZoom(baseZoom),
 	pointStores(threadNum),
 	linestringStores(threadNum),
-	multipolygonStores(threadNum),
 	multilinestringStores(threadNum),
+	multipolygonStores(threadNum),
 	multiPolygonClipCache(ClipCache<MultiPolygon>(threadNum, baseZoom)),
 	multiLinestringClipCache(ClipCache<MultiLinestring>(threadNum, baseZoom))
 {
@@ -72,8 +74,9 @@ TileDataSource::TileDataSource(size_t threadNum, unsigned int baseZoom, bool inc
 }
 
 void TileDataSource::finalize(size_t threadNum) {
-	finalizeObjects<OutputObjectXY>(threadNum, baseZoom, objects.begin(), objects.end());
-	finalizeObjects<OutputObjectXYID>(threadNum, baseZoom, objectsWithIds.begin(), objectsWithIds.end());
+	finalizeObjects<OutputObjectXY>(name(), threadNum, baseZoom, objects.begin(), objects.end(), lowZoomObjects);
+	finalizeObjects<OutputObjectXYID>(name(), threadNum, baseZoom, objectsWithIds.begin(), objectsWithIds.end(), lowZoomObjectsWithIds);
+
 }
 
 void TileDataSource::addObjectToSmallIndex(const TileCoordinates& index, const OutputObject& oo, uint64_t id) {
@@ -105,32 +108,39 @@ void TileDataSource::addObjectToSmallIndex(const TileCoordinates& index, const O
 		});
 }
 
-void TileDataSource::collectTilesWithObjectsAtZoom(uint zoom, TileCoordinatesSet& output) {
+void TileDataSource::collectTilesWithObjectsAtZoom(std::vector<TileCoordinatesSet>& zooms) {
 	// Scan through all shards. Convert to base zoom, then convert to the requested zoom.
-	collectTilesWithObjectsAtZoomTemplate<OutputObjectXY>(baseZoom, objects.begin(), objects.size(), zoom, output);
-	collectTilesWithObjectsAtZoomTemplate<OutputObjectXYID>(baseZoom, objectsWithIds.begin(), objectsWithIds.size(), zoom, output);
+	collectTilesWithObjectsAtZoomTemplate<OutputObjectXY>(baseZoom, objects.begin(), objects.size(), zooms);
+	collectTilesWithObjectsAtZoomTemplate<OutputObjectXYID>(baseZoom, objectsWithIds.begin(), objectsWithIds.size(), zooms);
 }
 
-void addCoveredTilesToOutput(const uint baseZoom, const uint zoom, const Box& box, TileCoordinatesSet& output) {
-	int scale = pow(2, baseZoom-zoom);
+void addCoveredTilesToOutput(const uint baseZoom, std::vector<TileCoordinatesSet>& zooms, const Box& box) {
+	size_t maxZoom = zooms.size() - 1;
+	int scale = pow(2, baseZoom - maxZoom);
 	TileCoordinate minx = box.min_corner().x() / scale;
 	TileCoordinate maxx = box.max_corner().x() / scale;
 	TileCoordinate miny = box.min_corner().y() / scale;
 	TileCoordinate maxy = box.max_corner().y() / scale;
 	for (int x=minx; x<=maxx; x++) {
 		for (int y=miny; y<=maxy; y++) {
-			output.set(x, y);
+			size_t zx = x, zy = y;
+
+			for (int zoom = maxZoom; zoom >= 0; zoom--) {
+				zooms[zoom].set(zx, zy);
+				zx /= 2;
+				zy /= 2;
+			}
 		}
 	}
 }
 
 // Find the tiles used by the "large objects" from the rtree index
-void TileDataSource::collectTilesWithLargeObjectsAtZoom(uint zoom, TileCoordinatesSet &output) {
+void TileDataSource::collectTilesWithLargeObjectsAtZoom(std::vector<TileCoordinatesSet>& zooms) {
 	for(auto const &result: boxRtree)
-		addCoveredTilesToOutput(baseZoom, zoom, result.first, output);
+		addCoveredTilesToOutput(baseZoom, zooms, result.first);
 
 	for(auto const &result: boxRtreeWithIds)
-		addCoveredTilesToOutput(baseZoom, zoom, result.first, output);
+		addCoveredTilesToOutput(baseZoom, zooms, result.first);
 }
 
 // Copy objects from the tile at dstIndex (in the dataset srcTiles) into output
@@ -139,11 +149,15 @@ void TileDataSource::collectObjectsForTile(
 	TileCoordinates dstIndex,
 	std::vector<OutputObjectID>& output
 ) {
+	if (zoom < CLUSTER_ZOOM) {
+		collectLowZoomObjectsForTile<OutputObjectXY>(baseZoom, lowZoomObjects, zoom, dstIndex, output);
+		collectLowZoomObjectsForTile<OutputObjectXYID>(baseZoom, lowZoomObjectsWithIds, zoom, dstIndex, output);
+		return;
+	}
+
 	size_t iStart = 0;
 	size_t iEnd = objects.size();
 
-	// TODO: we could also narrow the search space for z1..z5, too.
-	//       They're less important, as they have fewer tiles.
 	if (zoom >= CLUSTER_ZOOM) {
 		// Compute the x, y at the base zoom level
 		TileCoordinate z6x = dstIndex.x / (1 << (zoom - CLUSTER_ZOOM));
@@ -188,11 +202,7 @@ Geometry TileDataSource::buildWayGeometry(OutputGeometryType const geomType,
                                           NodeID const objectID, const TileBbox &bbox) {
 	switch(geomType) {
 		case POINT_: {
-			auto p = retrievePoint(objectID);
-			if (geom::within(p, bbox.clippingBox)) {
-				return p;
-			} 
-			return MultiLinestring();
+			throw std::runtime_error("unexpected geomType in buildWayGeometry");
 		}
 
 		case LINESTRING_: {
@@ -329,22 +339,12 @@ Geometry TileDataSource::buildWayGeometry(OutputGeometryType const geomType,
 	}
 }
 
-LatpLon TileDataSource::buildNodeGeometry(OutputGeometryType const geomType, 
-                                          NodeID const objectID, const TileBbox &bbox) const {
-	switch(geomType) {
-		case POINT_: {
-			auto p = retrievePoint(objectID);
-			LatpLon out;
-			out.latp = p.y();
-			out.lon  = p.x();
-			return out;
-		}
-
-		default:
-			break;
-	}
-
-	throw std::runtime_error("Geometry type is not point");			
+LatpLon TileDataSource::buildNodeGeometry(NodeID const objectID, const TileBbox &bbox) const {
+	auto p = retrievePoint(objectID);
+	LatpLon out;
+	out.latp = p.y();
+	out.lon  = p.x();
+	return out;
 }
 
 
@@ -366,18 +366,14 @@ void TileDataSource::reportSize() const {
 	std::cout << "Generated points: " << (points - 1) << ", lines: " << (linestrings - 2) << ", polygons: " << (polygons - 1) << std::endl;
 }
 
-TileCoordinatesSet getTilesAtZoom(
+void populateTilesAtZoom(
 	const std::vector<class TileDataSource *>& sources,
-	unsigned int zoom
+	std::vector<TileCoordinatesSet>& zooms
 ) {
-	TileCoordinatesSet tileCoordinates(zoom);
-
 	for(size_t i=0; i<sources.size(); i++) {
-		sources[i]->collectTilesWithObjectsAtZoom(zoom, tileCoordinates);
-		sources[i]->collectTilesWithLargeObjectsAtZoom(zoom, tileCoordinates);
+		sources[i]->collectTilesWithObjectsAtZoom(zooms);
+		sources[i]->collectTilesWithLargeObjectsAtZoom(zooms);
 	}
-
-	return tileCoordinates;
 }
 
 std::vector<OutputObjectID> TileDataSource::getObjectsForTile(
@@ -532,7 +528,7 @@ NodeID TileDataSource::storePoint(const Point& input) {
 
 	NodeID offset = store.second->size();
 	store.second->emplace_back(input);
-	NodeID rv = (store.first << (35 - shardBits)) + offset;
+	NodeID rv = (store.first << (TILE_DATA_ID_SIZE - shardBits)) + offset;
 	return rv;
 }
 
@@ -542,7 +538,7 @@ NodeID TileDataSource::storeLinestring(const Linestring& src) {
 
 	NodeID offset = store.second->size();
 	store.second->emplace_back(std::move(dst));
-	NodeID rv = (store.first << (35 - shardBits)) + offset;
+	NodeID rv = (store.first << (TILE_DATA_ID_SIZE - shardBits)) + offset;
 	return rv;
 }
 
@@ -564,7 +560,7 @@ NodeID TileDataSource::storeMultiPolygon(const MultiPolygon& src) {
 
 	NodeID offset = store.second->size();
 	store.second->emplace_back(std::move(dst));
-	NodeID rv = (store.first << (35 - shardBits)) + offset;
+	NodeID rv = (store.first << (TILE_DATA_ID_SIZE - shardBits)) + offset;
 	return rv;
 }
 
@@ -579,7 +575,7 @@ NodeID TileDataSource::storeMultiLinestring(const MultiLinestring& src) {
 
 	NodeID offset = store.second->size();
 	store.second->emplace_back(std::move(dst));
-	NodeID rv = (store.first << (35 - shardBits)) + offset;
+	NodeID rv = (store.first << (TILE_DATA_ID_SIZE - shardBits)) + offset;
 	return rv;
 }
 
diff --git a/src/tile_worker.cpp b/src/tile_worker.cpp
index 5f5c48b2..7951fcaf 100644
--- a/src/tile_worker.cpp
+++ b/src/tile_worker.cpp
@@ -176,7 +176,7 @@ void ProcessObjects(
 
 		if (oo.oo.geomType == POINT_) {
 			vector_tile::Tile_Feature *featurePtr = vtLayer->add_features();
-			LatpLon pos = source->buildNodeGeometry(oo.oo.geomType, oo.oo.objectID, bbox);
+			LatpLon pos = source->buildNodeGeometry(oo.oo.objectID, bbox);
 			featurePtr->add_geometry(9);					// moveTo, repeat x1
 			pair<int,int> xy = bbox.scaleLatpLon(pos.latp/10000000.0, pos.lon/10000000.0);
 			featurePtr->add_geometry((xy.first  << 1) ^ (xy.first  >> 31));
@@ -378,13 +378,13 @@ void outputProc(
 
 	// Write to file or sqlite
 	string outputdata, compressed;
-	if (sharedData.outputMode == OUTPUT_MBTILES) {
+	if (sharedData.outputMode == OptionsParser::OutputMode::MBTiles) {
 		// Write to sqlite
 		tile.SerializeToString(&outputdata);
 		if (sharedData.config.compress) { compressed = compress_string(outputdata, Z_DEFAULT_COMPRESSION, sharedData.config.gzip); }
 		sharedData.mbtiles.saveTile(zoom, bbox.index.x, bbox.index.y, sharedData.config.compress ? &compressed : &outputdata, sharedData.mergeSqlite);
 
-	} else if (sharedData.outputMode == OUTPUT_PMTILES) {
+	} else if (sharedData.outputMode == OptionsParser::OutputMode::PMTiles) {
 		// Write to pmtiles
 		tile.SerializeToString(&outputdata);
 		sharedData.pmtiles.saveTile(zoom, bbox.index.x, bbox.index.y, outputdata);
diff --git a/src/tilemaker.cpp b/src/tilemaker.cpp
index 852be49b..1c821001 100644
--- a/src/tilemaker.cpp
+++ b/src/tilemaker.cpp
@@ -48,6 +48,7 @@
 #include "osm_lua_processing.h"
 #include "mbtiles.h"
 
+#include "options_parser.h"
 #include "shared_data.h"
 #include "read_pbf.h"
 #include "read_shp.h"
@@ -80,89 +81,46 @@ bool verbose = false;
  *
  * Worker threads write the output tiles, and start in the outputProc function.
  */
-int main(int argc, char* argv[]) {
-
+int main(const int argc, const char* argv[]) {
 	// ----	Read command-line options
-	vector<string> inputFiles;
-	string luaFile;
-	string osmStoreFile;
-	string jsonFile;
-	uint threadNum;
-	string outputFile;
-	string bbox;
-	bool _verbose = false, mergeSqlite = false, mapsplit = false, osmStoreCompact = false, skipIntegrity = false, osmStoreUncompressedNodes = false, osmStoreUncompressedWays = false, materializeGeometries = false;
-	int outputMode = OUTPUT_FILE;
-	bool logTileTimings = false;
-
-	po::options_description desc("tilemaker " STR(TM_VERSION) "\nConvert OpenStreetMap .pbf files into vector tiles\n\nAvailable options");
-	desc.add_options()
-		("help",                                                                 "show help message")
-		("input",  po::value< vector<string> >(&inputFiles),                     "source .osm.pbf file")
-		("output", po::value< string >(&outputFile),                             "target directory or .mbtiles/.pmtiles file")
-		("bbox",   po::value< string >(&bbox),                                   "bounding box to use if input file does not have a bbox header set, example: minlon,minlat,maxlon,maxlat")
-		("merge"  ,po::bool_switch(&mergeSqlite),                                "merge with existing .mbtiles (overwrites otherwise)")
-		("config", po::value< string >(&jsonFile)->default_value("config.json"), "config JSON file")
-		("process",po::value< string >(&luaFile)->default_value("process.lua"),  "tag-processing Lua file")
-		("store",  po::value< string >(&osmStoreFile),  "temporary storage for node/ways/relations data")
-		("compact",po::bool_switch(&osmStoreCompact),  "Reduce overall memory usage (compact mode).\nNOTE: This requires the input to be renumbered (osmium renumber)")
-		("no-compress-nodes", po::bool_switch(&osmStoreUncompressedNodes),  "Store nodes uncompressed")
-		("no-compress-ways", po::bool_switch(&osmStoreUncompressedWays),  "Store ways uncompressed")
-		("materialize-geometries", po::bool_switch(&materializeGeometries),  "Materialize geometries - faster, but requires more memory")
-		("verbose",po::bool_switch(&_verbose),                                   "verbose error output")
-		("skip-integrity",po::bool_switch(&skipIntegrity),                       "don't enforce way/node integrity")
-		("log-tile-timings", po::bool_switch(&logTileTimings), "log how long each tile takes")
-		("threads",po::value< uint >(&threadNum)->default_value(0),              "number of threads (automatically detected if 0)");
-	po::positional_options_description p;
-	p.add("input", 1).add("output", 1);
-	po::variables_map vm;
+	OptionsParser::Options options;
 	try {
-		po::store(po::command_line_parser(argc, argv).options(desc).positional(p).run(), vm);
-	} catch (const po::unknown_option& ex) {
-		cerr << "Unknown option: " << ex.get_option_name() << endl;
-		return -1;
+		options = OptionsParser::parse(argc, argv);
+	} catch (OptionsParser::OptionException& e) {
+		cerr << e.what() << endl;
+		return 1;
 	}
-	po::notify(vm);
-	
-	if (vm.count("help")) { cout << desc << endl; return 0; }
-	if (vm.count("output")==0) { cerr << "You must specify an output file or directory. Run with --help to find out more." << endl; return -1; }
-	if (vm.count("input")==0) { cout << "No source .osm.pbf file supplied" << endl; }
 
-	vector<string> bboxElements = parseBox(bbox);
+	if (options.showHelp) { OptionsParser::showHelp(); return 0; }
 
-	if (ends_with(outputFile, ".mbtiles") || ends_with(outputFile, ".sqlite")) { outputMode = OUTPUT_MBTILES; }
-	else if (ends_with(outputFile, ".pmtiles")) { outputMode = OUTPUT_PMTILES; }
-	if (threadNum == 0) { threadNum = max(thread::hardware_concurrency(), 1u); }
-	verbose = _verbose;
+	verbose = options.verbose;
 
-
-	// ---- Check config
-	
-	if (!boost::filesystem::exists(jsonFile)) { cerr << "Couldn't open .json config: " << jsonFile << endl; return -1; }
-	if (!boost::filesystem::exists(luaFile )) { cerr << "Couldn't open .lua script: "  << luaFile  << endl; return -1; }
+	vector<string> bboxElements = parseBox(options.bbox);
 
 	// ---- Remove existing .mbtiles if it exists
-
-	if ((outputMode==OUTPUT_MBTILES || outputMode==OUTPUT_PMTILES) && !mergeSqlite && static_cast<bool>(std::ifstream(outputFile))) {
+	if ((options.outputMode == OptionsParser::OutputMode::MBTiles || options.outputMode == OptionsParser::OutputMode::PMTiles) && !options.mergeSqlite && static_cast<bool>(std::ifstream(options.outputFile))) {
 		cout << "Output file exists, will overwrite (Ctrl-C to abort";
-		if (outputMode==OUTPUT_MBTILES) cout << ", rerun with --merge to keep";
+		if (options.outputMode == OptionsParser::OutputMode::MBTiles) cout << ", rerun with --merge to keep";
 		cout << ")" << endl;
 		std::this_thread::sleep_for(std::chrono::milliseconds(2000));
-		if (remove(outputFile.c_str()) != 0) {
+		if (remove(options.outputFile.c_str()) != 0) {
 			cerr << "Couldn't remove existing file" << endl;
 			return 0;
 		}
-	} else if (mergeSqlite && outputMode!=OUTPUT_MBTILES) {
+	} else if (options.mergeSqlite && options.outputMode != OptionsParser::OutputMode::MBTiles) {
 		cerr << "--merge only works with .mbtiles" << endl;
 		return 0;
-	} else if (mergeSqlite && !static_cast<bool>(std::ifstream(outputFile))) {
+	} else if (options.mergeSqlite && !static_cast<bool>(std::ifstream(options.outputFile))) {
 		cout << "--merge specified but .mbtiles file doesn't already exist, ignoring" << endl;
-		mergeSqlite = false;
+		options.mergeSqlite = false;
 	}
 
+
 	// ----	Read bounding box from first .pbf (if there is one) or mapsplit file
 
 	bool hasClippingBox = false;
 	Box clippingBox;
+	bool mapsplit = false;
 	MBTiles mapsplitFile;
 	double minLon=0.0, maxLon=0.0, minLat=0.0, maxLat=0.0;
 	if (!bboxElements.empty()) {
@@ -172,14 +130,14 @@ int main(int argc, char* argv[]) {
 		maxLon = bboxElementFromStr(bboxElements.at(2));
 		maxLat = bboxElementFromStr(bboxElements.at(3));
 
-	} else if (inputFiles.size()==1 && (ends_with(inputFiles[0], ".mbtiles") || ends_with(inputFiles[0], ".sqlite") || ends_with(inputFiles[0], ".msf"))) {
+	} else if (options.inputFiles.size()==1 && (ends_with(options.inputFiles[0], ".mbtiles") || ends_with(options.inputFiles[0], ".sqlite") || ends_with(options.inputFiles[0], ".msf"))) {
 		mapsplit = true;
-		mapsplitFile.openForReading(inputFiles[0]);
+		mapsplitFile.openForReading(options.inputFiles[0]);
 		mapsplitFile.readBoundingBox(minLon, maxLon, minLat, maxLat);
 		hasClippingBox = true;
 
-	} else if (inputFiles.size()>0) {
-		int ret = ReadPbfBoundingBox(inputFiles[0], minLon, maxLon, minLat, maxLat, hasClippingBox);
+	} else if (options.inputFiles.size()>0) {
+		int ret = ReadPbfBoundingBox(options.inputFiles[0], minLon, maxLon, minLat, maxLat, hasClippingBox);
 		if(ret != 0) return ret;
 	}
 
@@ -193,7 +151,7 @@ int main(int argc, char* argv[]) {
 	rapidjson::Document jsonConfig;
 	class Config config;
 	try {
-		FILE* fp = fopen(jsonFile.c_str(), "r");
+		FILE* fp = fopen(options.jsonFile.c_str(), "r");
 		char readBuffer[65536];
 		rapidjson::FileReadStream is(fp, readBuffer, sizeof(readBuffer));
 		jsonConfig.ParseStream(is);
@@ -211,52 +169,73 @@ int main(int argc, char* argv[]) {
 	}
 
 	// For each tile, objects to be used in processing
-	shared_ptr<NodeStore> nodeStore;
-
 	bool allPbfsHaveSortTypeThenID = true;
 	bool anyPbfHasLocationsOnWays = false;
 
-	for (const std::string& file: inputFiles) {
+	for (const std::string& file: options.inputFiles) {
 		if (ends_with(file, ".pbf")) {
 			allPbfsHaveSortTypeThenID = allPbfsHaveSortTypeThenID && PbfHasOptionalFeature(file, OptionSortTypeThenID);
 			anyPbfHasLocationsOnWays = anyPbfHasLocationsOnWays || PbfHasOptionalFeature(file, OptionLocationsOnWays);
 		}
 	}
 
-	if (osmStoreCompact)
-		nodeStore = make_shared<CompactNodeStore>();
-	else {
-		if (allPbfsHaveSortTypeThenID)
-			nodeStore = make_shared<SortedNodeStore>(!osmStoreUncompressedNodes);
-		else
-			nodeStore = make_shared<BinarySearchNodeStore>();
+	auto createNodeStore = [allPbfsHaveSortTypeThenID, options]() {
+		if (options.osm.compact) {
+			std::shared_ptr<NodeStore> rv = make_shared<CompactNodeStore>();
+			return rv;
+		}
+
+		if (allPbfsHaveSortTypeThenID) {
+			std::shared_ptr<NodeStore> rv = make_shared<SortedNodeStore>(!options.osm.uncompressedNodes);
+			return rv;
+		}
+		std::shared_ptr<NodeStore> rv =  make_shared<BinarySearchNodeStore>();
+		return rv;
+	};
+
+	shared_ptr<NodeStore> nodeStore;
+
+	if (options.osm.shardStores) {
+		nodeStore = std::make_shared<ShardedNodeStore>(createNodeStore);
+	} else {
+		nodeStore = createNodeStore();
 	}
 
+	auto createWayStore = [anyPbfHasLocationsOnWays, allPbfsHaveSortTypeThenID, options, &nodeStore]() {
+		if (!anyPbfHasLocationsOnWays && allPbfsHaveSortTypeThenID) {
+			std::shared_ptr<WayStore> rv = make_shared<SortedWayStore>(!options.osm.uncompressedWays, *nodeStore.get());
+			return rv;
+		}
+
+		std::shared_ptr<WayStore> rv = make_shared<BinarySearchWayStore>();
+		return rv;
+	};
+
 	shared_ptr<WayStore> wayStore;
-	if (!anyPbfHasLocationsOnWays && allPbfsHaveSortTypeThenID) {
-		wayStore = make_shared<SortedWayStore>(!osmStoreUncompressedNodes, *nodeStore.get());
+	if (options.osm.shardStores) {
+		wayStore = std::make_shared<ShardedWayStore>(createWayStore, *nodeStore.get());
 	} else {
-		wayStore = make_shared<BinarySearchWayStore>();
+		wayStore = createWayStore();
 	}
 
 	OSMStore osmStore(*nodeStore.get(), *wayStore.get());
-	osmStore.use_compact_store(osmStoreCompact);
-	osmStore.enforce_integrity(!skipIntegrity);
-	if(!osmStoreFile.empty()) {
-		std::cout << "Using osm store file: " << osmStoreFile << std::endl;
-		osmStore.open(osmStoreFile);
+	osmStore.use_compact_store(options.osm.compact);
+	osmStore.enforce_integrity(!options.osm.skipIntegrity);
+	if(!options.osm.storeFile.empty()) {
+		std::cout << "Using osm store file: " << options.osm.storeFile << std::endl;
+		osmStore.open(options.osm.storeFile);
 	}
 
 	AttributeStore attributeStore;
 
 	class LayerDefinition layers(config.layers);
-	class OsmMemTiles osmMemTiles(threadNum, config.baseZoom, config.includeID, *nodeStore, *wayStore);
-	class ShpMemTiles shpMemTiles(threadNum, config.baseZoom);
+	class OsmMemTiles osmMemTiles(options.threadNum, config.baseZoom, config.includeID, *nodeStore, *wayStore);
+	class ShpMemTiles shpMemTiles(options.threadNum, config.baseZoom);
 	osmMemTiles.open();
 	shpMemTiles.open();
 
-	OsmLuaProcessing osmLuaProcessing(osmStore, config, layers, luaFile, 
-		shpMemTiles, osmMemTiles, attributeStore, materializeGeometries);
+	OsmLuaProcessing osmLuaProcessing(osmStore, config, layers, options.luaFile, 
+		shpMemTiles, osmMemTiles, attributeStore, options.osm.materializeGeometries);
 
 	// ---- Load external shp files
 
@@ -274,7 +253,7 @@ int main(int argc, char* argv[]) {
 			readShapefile(clippingBox,
 			              layers,
 			              config.baseZoom, layerNum,
-			              threadNum,
+			              options.threadNum,
 			              shpMemTiles, osmLuaProcessing);
 		}
 	}
@@ -291,24 +270,27 @@ int main(int argc, char* argv[]) {
 	std::vector<bool> sortOrders = layers.getSortOrders();
 
 	if (!mapsplit) {
-		for (auto inputFile : inputFiles) {
+		for (auto inputFile : options.inputFiles) {
 			cout << "Reading .pbf " << inputFile << endl;
 			ifstream infile(inputFile, ios::in | ios::binary);
 			if (!infile) { cerr << "Couldn't open .pbf file " << inputFile << endl; return -1; }
 			
 			const bool hasSortTypeThenID = PbfHasOptionalFeature(inputFile, OptionSortTypeThenID);
 			int ret = pbfReader.ReadPbfFile(
+				nodeStore->shards(),
 				hasSortTypeThenID,
 				nodeKeys,
-				threadNum,
+				options.threadNum,
 				[&]() {
 					thread_local std::shared_ptr<ifstream> pbfStream(new ifstream(inputFile, ios::in | ios::binary));
 					return pbfStream;
 				},
 				[&]() {
-					thread_local std::shared_ptr<OsmLuaProcessing> osmLuaProcessing(new OsmLuaProcessing(osmStore, config, layers, luaFile, shpMemTiles, osmMemTiles, attributeStore, materializeGeometries));
+					thread_local std::shared_ptr<OsmLuaProcessing> osmLuaProcessing(new OsmLuaProcessing(osmStore, config, layers, options.luaFile, shpMemTiles, osmMemTiles, attributeStore, options.osm.materializeGeometries));
 					return osmLuaProcessing;
-				}
+				},
+				*nodeStore,
+				*wayStore
 			);
 			if (ret != 0) return ret;
 		} 
@@ -319,16 +301,16 @@ int main(int argc, char* argv[]) {
 	// ----	Initialise SharedData
 	SourceList sources = {&osmMemTiles, &shpMemTiles};
 	class SharedData sharedData(config, layers);
-	sharedData.outputFile = outputFile;
-	sharedData.outputMode = outputMode;
-	sharedData.mergeSqlite = mergeSqlite;
+	sharedData.outputFile = options.outputFile;
+	sharedData.outputMode = options.outputMode;
+	sharedData.mergeSqlite = options.mergeSqlite;
 
 	// ----	Initialise mbtiles/pmtiles if required
 	
-	if (sharedData.outputMode==OUTPUT_MBTILES) {
+	if (sharedData.outputMode == OptionsParser::OutputMode::MBTiles) {
 		sharedData.mbtiles.openForWriting(sharedData.outputFile);
 		sharedData.writeMBTilesProjectData();
-	} else if (sharedData.outputMode==OUTPUT_PMTILES) {
+	} else if (sharedData.outputMode == OptionsParser::OutputMode::PMTiles) {
 		sharedData.pmtiles.open(sharedData.outputFile);
 	}
 
@@ -362,6 +344,7 @@ int main(int argc, char* argv[]) {
 			vector<char> pbf = mapsplitFile.readTile(srcZ,srcX,tmsY);
 
 			int ret = pbfReader.ReadPbfFile(
+				nodeStore->shards(),
 				false,
 				nodeKeys,
 				1,
@@ -369,8 +352,10 @@ int main(int argc, char* argv[]) {
 					return make_unique<boost::interprocess::bufferstream>(pbf.data(), pbf.size(),  ios::in | ios::binary);
 				},
 				[&]() {
-					return std::make_unique<OsmLuaProcessing>(osmStore, config, layers, luaFile, shpMemTiles, osmMemTiles, attributeStore, materializeGeometries);
-				}
+					return std::make_unique<OsmLuaProcessing>(osmStore, config, layers, options.luaFile, shpMemTiles, osmMemTiles, attributeStore, options.osm.materializeGeometries);
+				},
+				*nodeStore,
+				*wayStore
 			);
 			if (ret != 0) return ret;
 
@@ -378,7 +363,7 @@ int main(int argc, char* argv[]) {
 		}
 
 		// Launch the pool with threadNum threads
-		boost::asio::thread_pool pool(threadNum);
+		boost::asio::thread_pool pool(options.threadNum);
 
 		// Mutex is hold when IO is performed
 		std::mutex io_mutex;
@@ -387,14 +372,14 @@ int main(int argc, char* argv[]) {
 		std::atomic<uint64_t> tilesWritten(0);
 
 		for (auto source : sources) {
-			source->finalize(threadNum);
+			source->finalize(options.threadNum);
 		}
 		// tiles by zoom level
 
 		// The clipping bbox check is expensive - as an optimization, compute the set of
 		// z6 tiles that are wholly covered by the clipping box. Membership in this
 		// set is quick to test.
-		std::set<TileCoordinates> coveredZ6Tiles;
+		TileCoordinatesSet coveredZ6Tiles(6);
 		if (hasClippingBox) {
 			for (int x = 0; x < 1 << 6; x++) {
 				for (int y = 0; y < 1 << 6; y++) {
@@ -402,20 +387,47 @@ int main(int argc, char* argv[]) {
 								TileBbox(TileCoordinates(x, y), 6, false, false).getTileBox(),
 								clippingBox
 							))
-						coveredZ6Tiles.insert(TileCoordinates(x, y));
+						coveredZ6Tiles.set(x, y);
 				}
 			}
 		}
 
 		// For large areas (arbitrarily defined as 100 z6 tiles), use a dense index for pmtiles
-		if (coveredZ6Tiles.size()>100 && outputMode==OUTPUT_PMTILES) {
+		if (coveredZ6Tiles.size()>100 && options.outputMode == OptionsParser::OutputMode::PMTiles) {
 			std::cout << "Using dense index for .pmtiles" << std::endl;
 			sharedData.pmtiles.isSparse = false;
 		}
 
 		std::deque<std::pair<unsigned int, TileCoordinates>> tileCoordinates;
+		std::vector<TileCoordinatesSet> zoomResults;
+		for (uint zoom = 0; zoom <= sharedData.config.endZoom; zoom++) {
+			zoomResults.push_back(TileCoordinatesSet(zoom));
+		}
+
+		{
+#ifdef CLOCK_MONOTONIC
+			timespec start, end;
+			clock_gettime(CLOCK_MONOTONIC, &start);
+#endif
+			std::cout << "collecting tiles" << std::flush;
+			populateTilesAtZoom(sources, zoomResults);
+#ifdef CLOCK_MONOTONIC
+			clock_gettime(CLOCK_MONOTONIC, &end);
+			uint64_t tileNs = 1e9 * (end.tv_sec - start.tv_sec) + end.tv_nsec - start.tv_nsec;
+			std::cout << ": " << (uint32_t)(tileNs / 1e6) << "ms";
+#endif
+		}
+
+		std::cout << ", filtering tiles:" << std::flush;
 		for (uint zoom=sharedData.config.startZoom; zoom <= sharedData.config.endZoom; zoom++) {
-			auto zoomResult = getTilesAtZoom(sources, zoom);
+			std::cout << " z" << std::to_string(zoom) << std::flush;
+#ifdef CLOCK_MONOTONIC
+			timespec start, end;
+			clock_gettime(CLOCK_MONOTONIC, &start);
+#endif
+
+			const auto& zoomResult = zoomResults[zoom];
+			int numTiles = 0;
 			for (int x = 0; x < 1 << zoom; x++) {
 				for (int y = 0; y < 1 << zoom; y++) {
 					if (!zoomResult.test(x, y))
@@ -433,7 +445,7 @@ int main(int argc, char* argv[]) {
 						if (zoom >= 6) {
 							TileCoordinate z6x = x / (1 << (zoom - 6));
 							TileCoordinate z6y = y / (1 << (zoom - 6));
-							isInAWhollyCoveredZ6Tile = coveredZ6Tiles.find(TileCoordinates(z6x, z6y)) != coveredZ6Tiles.end();
+							isInAWhollyCoveredZ6Tile = coveredZ6Tiles.test(z6x, z6y);
 						}
 
 						if(!isInAWhollyCoveredZ6Tile && !boost::geometry::intersects(TileBbox(TileCoordinates(x, y), zoom, false, false).getTileBox(), clippingBox)) 
@@ -441,9 +453,22 @@ int main(int argc, char* argv[]) {
 					}
 
 					tileCoordinates.push_back(std::make_pair(zoom, TileCoordinates(x, y)));
+					numTiles++;
 				}
 			}
+
+			std::cout << " (" << numTiles;
+#ifdef CLOCK_MONOTONIC
+			clock_gettime(CLOCK_MONOTONIC, &end);
+			uint64_t tileNs = 1e9 * (end.tv_sec - start.tv_sec) + end.tv_nsec - start.tv_nsec;
+			std::cout << ", " << (uint32_t)(tileNs / 1e6) << "ms";
+
+#endif
+			std::cout << ")" << std::flush;
 		}
+		zoomResults.clear();
+
+		std::cout << std::endl;
 
 		// Cluster tiles: breadth-first for z0..z5, depth-first for z6
 		const size_t baseZoom = config.baseZoom;
@@ -494,7 +519,7 @@ int main(int argc, char* argv[]) {
 
 				return false;
 			}, 
-			threadNum);
+			options.threadNum);
 
 		std::size_t batchSize = 0;
 		for(std::size_t startIndex = 0; startIndex < tileCoordinates.size(); startIndex += batchSize) {
@@ -523,9 +548,9 @@ int main(int argc, char* argv[]) {
 					unsigned int zoom = tileCoordinates[i].first;
 					TileCoordinates coords = tileCoordinates[i].second;
 
-#ifndef _WIN32
+#ifdef CLOCK_MONOTONIC
 					timespec start, end;
-					if (logTileTimings)
+					if (options.logTileTimings)
 						clock_gettime(CLOCK_MONOTONIC, &start);
 #endif
 
@@ -535,8 +560,8 @@ int main(int argc, char* argv[]) {
 					}
 					outputProc(sharedData, sources, attributeStore, data, coords, zoom);
 
-#ifndef _WIN32
-					if (logTileTimings) {
+#ifdef CLOCK_MONOTONIC
+					if (options.logTileTimings) {
 						clock_gettime(CLOCK_MONOTONIC, &end);
 						uint64_t tileNs = 1e9 * (end.tv_sec - start.tv_sec) + end.tv_nsec - start.tv_nsec;
 						std::string output = "z" + std::to_string(zoom) + "/" + std::to_string(coords.x) + "/" + std::to_string(coords.y) + " took " + std::to_string(tileNs/1e6) + " ms";
@@ -545,7 +570,7 @@ int main(int argc, char* argv[]) {
 #endif
 				}
 
-				if (logTileTimings) {
+				if (options.logTileTimings) {
 					const std::lock_guard<std::mutex> lock(io_mutex);
 					std::cout << std::endl;
 					for (const auto& output : tileTimings)
@@ -575,10 +600,10 @@ int main(int argc, char* argv[]) {
 
 	// ----	Close tileset
 
-	if (outputMode==OUTPUT_MBTILES) {
+	if (options.outputMode == OptionsParser::OutputMode::MBTiles) {
 		sharedData.writeMBTilesMetadata(jsonConfig);
 		sharedData.mbtiles.closeForWriting();
-	} else if (outputMode==OUTPUT_PMTILES) {
+	} else if (options.outputMode == OptionsParser::OutputMode::PMTiles) {
 		sharedData.writePMTilesBounds();
 		std::string metadata = sharedData.pmTilesMetadata();
 		sharedData.pmtiles.close(metadata);
diff --git a/src/way_stores.cpp b/src/way_stores.cpp
index 05d884d0..790ad816 100644
--- a/src/way_stores.cpp
+++ b/src/way_stores.cpp
@@ -14,6 +14,14 @@ void BinarySearchWayStore::reopen() {
 	mLatpLonLists = std::make_unique<map_t>();
 }
 
+// Reports whether `id` has an entry in mLatpLonLists; the `shard` argument is not consulted.
+bool BinarySearchWayStore::contains(size_t shard, WayID id) const {
+	const auto last = mLatpLonLists->end();
+	// std::lower_bound compares on `.first`, so this presumes the list is already ordered by id.
+	const auto pos = std::lower_bound(mLatpLonLists->begin(), last, id, [](auto const &entry, auto wanted) { return entry.first < wanted; });
+	return pos != last && pos->first == id;
+}
+
 std::vector<LatpLon> BinarySearchWayStore::at(WayID wayid) const {
 	std::lock_guard<std::mutex> lock(mutex);
 	
@@ -39,7 +47,7 @@ void BinarySearchWayStore::insertLatpLons(std::vector<WayStore::ll_element_t> &n
 	std::copy(std::make_move_iterator(newWays.begin()), std::make_move_iterator(newWays.end()), mLatpLonLists->begin() + i); 
 }
 
-const void BinarySearchWayStore::insertNodes(const std::vector<std::pair<WayID, std::vector<NodeID>>>& newWays) {
+void BinarySearchWayStore::insertNodes(const std::vector<std::pair<WayID, std::vector<NodeID>>>& newWays) {
 	throw std::runtime_error("BinarySearchWayStore does not support insertNodes");
 }
 
diff --git a/test/append_vector.test.cpp b/test/append_vector.test.cpp
new file mode 100644
index 00000000..db4949e2
--- /dev/null
+++ b/test/append_vector.test.cpp
@@ -0,0 +1,98 @@
+#include <iostream>
+#include <boost/sort/sort.hpp>
+#include "external/minunit.h"
+#include "append_vector.h"
+
+using namespace AppendVectorNS;
+
+MU_TEST(test_append_vector) { // Checks AppendVector size, indexing, iterator arithmetic, sorting and binary search.
+	AppendVector<int32_t> vec;
+	AppendVector<int32_t> vec2;
+	mu_check(vec.size() == 0);
+	mu_check(vec.begin() == vec.end());
+	mu_check(vec.begin() != vec2.begin());
+	// Append 0..9999 so that every element's value equals its index.
+	for (int i = 0; i < 10000; i++) {
+		vec.push_back(i);
+	}
+	mu_check(vec.size() == 10000);
+
+	mu_check(vec[25] == 25);
+	// Iterator arithmetic: offsets from begin() and end(), including negative operands.
+	const AppendVector<int32_t>::Iterator& it = vec.begin();
+	mu_check(*it == 0);
+	mu_check(*(it + 1) == 1);
+	mu_check(*(it + 2) == 2);
+	mu_check(*(it + 9000) == 9000);
+	mu_check(*(it + 1 - 1) == 0);
+	mu_check(*(vec.end() + -1) == 9999);
+	mu_check(*(vec.end() - 1) == 9999);
+	mu_check(*(vec.end() - 2) == 9998);
+	mu_check(*(vec.end() - 9000) == 1000);
+	mu_check(*(vec.begin() - -1) == 1);
+	// Sort descending (comparator is b < a); the trailing 1 is block_indirect_sort's thread-count argument.
+	boost::sort::block_indirect_sort(
+		vec.begin(),
+		vec.end(),
+		[](auto const &a, auto const&b) { return b < a; },
+		1
+	);
+
+	mu_check(vec[0] == 9999);
+	mu_check(vec[9999] == 0);
+	// Sort back into ascending order.
+	boost::sort::block_indirect_sort(
+		vec.begin(),
+		vec.end(),
+		[](auto const &a, auto const&b) { return a < b; },
+		1
+	);
+
+	mu_check(vec[0] == 0);
+	mu_check(vec[9999] == 9999);
+	// lower_bound for a value that is present lands on that value.
+	auto iter = std::lower_bound(
+		vec.begin(),
+		vec.end(),
+		123,
+		[](const int32_t& a, const int32_t& toFind) {
+			return a < toFind;
+		}
+	);
+
+	mu_check(iter != vec.end());
+	mu_check(*iter == 123);
+	// A value above every element yields end().
+	iter = std::lower_bound(
+		vec.begin(),
+		vec.end(),
+		123123,
+		[](const int32_t& a, const int32_t& toFind) {
+			return a < toFind;
+		}
+	);
+
+	mu_check(iter == vec.end());
+	// A value below every element yields begin().
+	iter = std::lower_bound(
+		vec.begin(),
+		vec.end(),
+		-2,
+		[](const int32_t& a, const int32_t& toFind) {
+			return a < toFind;
+		}
+	);
+
+	mu_check(iter == vec.begin());
+}
+
+MU_TEST_SUITE(test_suite_append_vector) { // Suite holding the single AppendVector test.
+	MU_RUN_TEST(test_append_vector);
+}
+
+int main() { // Runs the AppendVector suite, emits the minunit report, and returns MU_EXIT_CODE as the process status.
+	MU_RUN_SUITE(test_suite_append_vector);
+	MU_REPORT();
+	return MU_EXIT_CODE;
+}
+
diff --git a/test/attribute_store.test.cpp b/test/attribute_store.test.cpp
new file mode 100644
index 00000000..db104a14
--- /dev/null
+++ b/test/attribute_store.test.cpp
@@ -0,0 +1,103 @@
+#include <iostream>
+#include <algorithm>
+#include "external/minunit.h"
+#include "attribute_store.h"
+
+MU_TEST(test_attribute_store) { // Stores one attribute set with five values and checks each can be read back with its type and value.
+	AttributeStore store;
+	store.reset();
+
+	mu_check(store.size() == 0);
+	// Build one set holding two strings, two bools and a float.
+	AttributeSet s1;
+	store.addAttribute(s1, "str1", std::string("someval"), 0);
+	store.addAttribute(s1, "str2", std::string("a very long string"), 0);
+	store.addAttribute(s1, "bool1", false, 0);
+	store.addAttribute(s1, "bool2", true, 0);
+	store.addAttribute(s1, "float1", (float)42.0, 0);
+
+	const auto s1Index = store.add(s1);
+
+	mu_check(store.size() == 1);
+	// Fetch the stored pairs and locate each one by the index of its key.
+	const auto s1Pairs = store.getUnsafe(s1Index);
+	mu_check(s1Pairs.size() == 5);
+	const auto str1 = std::find_if(s1Pairs.begin(), s1Pairs.end(), [&store](auto ap) {
+			return ap->keyIndex == store.keyStore.key2index("str1");
+	});
+	mu_check(str1 != s1Pairs.end());
+	mu_check((*str1)->hasStringValue());
+	mu_check((*str1)->stringValue() == "someval");
+
+	const auto str2 = std::find_if(s1Pairs.begin(), s1Pairs.end(), [&store](auto ap) {
+			return ap->keyIndex == store.keyStore.key2index("str2");
+	});
+	mu_check(str2 != s1Pairs.end());
+	mu_check((*str2)->hasStringValue());
+	mu_check((*str2)->stringValue() == "a very long string");
+
+	const auto bool1 = std::find_if(s1Pairs.begin(), s1Pairs.end(), [&store](auto ap) {
+			return ap->keyIndex == store.keyStore.key2index("bool1");
+	});
+	mu_check(bool1 != s1Pairs.end());
+	mu_check((*bool1)->hasBoolValue());
+	mu_check((*bool1)->boolValue() == false);
+
+	const auto bool2 = std::find_if(s1Pairs.begin(), s1Pairs.end(), [&store](auto ap) {
+			return ap->keyIndex == store.keyStore.key2index("bool2");
+	});
+	mu_check(bool2 != s1Pairs.end());
+	mu_check((*bool2)->hasBoolValue());
+	mu_check((*bool2)->boolValue() == true);
+
+	const auto float1 = std::find_if(s1Pairs.begin(), s1Pairs.end(), [&store](auto ap) {
+			return ap->keyIndex == store.keyStore.key2index("float1");
+	});
+	mu_check(float1 != s1Pairs.end());
+	mu_check((*float1)->hasFloatValue());
+	mu_check((*float1)->floatValue() == 42);
+}
+
+MU_TEST(test_attribute_store_reuses) { // Adding two sets with identical contents is expected to yield the same index.
+	AttributeStore store;
+	store.reset();
+
+	mu_check(store.size() == 0);
+	// Case 1: both sets carry the value "someval".
+	{
+		AttributeSet s1a;
+		store.addAttribute(s1a, "str1", std::string("someval"), 0);
+		const auto s1aIndex = store.add(s1a);
+
+		AttributeSet s1b;
+		store.addAttribute(s1b, "str1", std::string("someval"), 0);
+		const auto s1bIndex = store.add(s1b);
+
+		mu_check(s1aIndex == s1bIndex);
+	}
+	// Case 2: both sets carry a longer string value.
+	{
+		AttributeSet s1a;
+		store.addAttribute(s1a, "str1", std::string("this is a very long string"), 0);
+		const auto s1aIndex = store.add(s1a);
+
+		AttributeSet s1b;
+		store.addAttribute(s1b, "str1", std::string("this is a very long string"), 0);
+		const auto s1bIndex = store.add(s1b);
+
+		mu_check(s1aIndex == s1bIndex);
+	}
+
+
+}
+
+MU_TEST_SUITE(test_suite_attribute_store) { // Suite running both AttributeStore tests in order.
+	MU_RUN_TEST(test_attribute_store);
+	MU_RUN_TEST(test_attribute_store_reuses);
+}
+
+int main() { // Runs the AttributeStore suite, emits the minunit report, and returns MU_EXIT_CODE as the process status.
+	MU_RUN_SUITE(test_suite_attribute_store);
+	MU_REPORT();
+	return MU_EXIT_CODE;
+}
diff --git a/test/deque_map.test.cpp b/test/deque_map.test.cpp
new file mode 100644
index 00000000..23a3d3cc
--- /dev/null
+++ b/test/deque_map.test.cpp
@@ -0,0 +1,63 @@
+#include <iostream>
+#include <algorithm>
+#include "external/minunit.h"
+#include "deque_map.h"
+
+MU_TEST(test_deque_map) { // Checks DequeMap add/find/at, iteration order, and a map constructed with a bound of 1.
+	DequeMap<std::string> strs;
+	// Expected: add() returns a key's index (reused for a repeated key) and find() gives -1 for an absent key.
+	mu_check(strs.size() == 0);
+	mu_check(!strs.full());
+	mu_check(strs.find("foo") == -1);
+	mu_check(strs.add("foo") == 0);
+	mu_check(!strs.full());
+	mu_check(strs.find("foo") == 0);
+	mu_check(strs.size() == 1);
+	mu_check(strs.add("foo") == 0);
+	mu_check(strs.size() == 1);
+	mu_check(strs.add("bar") == 1);
+	mu_check(strs.size() == 2);
+	mu_check(strs.add("aardvark") == 2);
+	mu_check(strs.size() == 3);
+	mu_check(strs.add("foo") == 0);
+	mu_check(strs.add("bar") == 1);
+	mu_check(strs.add("quux") == 3);
+	mu_check(strs.size() == 4);
+	// at() maps an index back to its key, in the order the keys were first added.
+	mu_check(strs.at(0) == "foo");
+	mu_check(strs.at(1) == "bar");
+	mu_check(strs.at(2) == "aardvark");
+	mu_check(strs.at(3) == "quux");
+	// Iteration is expected to visit the keys in sorted order rather than insertion order.
+	std::vector<std::string> rv;
+	for (std::string x : strs) {
+		rv.push_back(x);
+	}
+	mu_check(rv[0] == "aardvark");
+	mu_check(rv[1] == "bar");
+	mu_check(rv[2] == "foo");
+	mu_check(rv[3] == "quux");
+	// Map constructed with 1: full after one distinct key, add() of another returns -1, clear() empties it again.
+	DequeMap<std::string> boundedMap(1);
+	mu_check(!boundedMap.full());
+	mu_check(boundedMap.add("foo") == 0);
+	mu_check(boundedMap.add("foo") == 0);
+	mu_check(boundedMap.full());
+	mu_check(boundedMap.add("bar") == -1);
+	boundedMap.clear();
+	mu_check(!boundedMap.full());
+	mu_check(boundedMap.find("foo") == -1);
+	mu_check(boundedMap.add("bar") == 0);
+	mu_check(boundedMap.add("bar") == 0);
+	mu_check(boundedMap.full());
+}
+
+MU_TEST_SUITE(test_suite_deque_map) { // Suite holding the single DequeMap test.
+	MU_RUN_TEST(test_deque_map);
+}
+
+int main() { // Runs the DequeMap suite, emits the minunit report, and returns MU_EXIT_CODE as the process status.
+	MU_RUN_SUITE(test_suite_deque_map);
+	MU_REPORT();
+	return MU_EXIT_CODE;
+}
diff --git a/test/options_parser.test.cpp b/test/options_parser.test.cpp
new file mode 100644
index 00000000..e230fc0d
--- /dev/null
+++ b/test/options_parser.test.cpp
@@ -0,0 +1,107 @@
+#include <iostream>
+#include "external/minunit.h"
+#include "options_parser.h"
+
+const char* PROGRAM_NAME = "./tilemaker";  // placed in argv[0] by the parse() test helper
+using namespace OptionsParser;  // lets the tests name Options, OutputMode and parse() unqualified
+
+// Test helper: prepends PROGRAM_NAME to args and hands the resulting argv to the argc/argv parse() overload.
+Options parse(std::vector<std::string>& args) {
+	// Sized to fit any number of arguments; a fixed 100-slot array would overflow past 99 of them.
+	std::vector<const char*> argv;
+	argv.push_back(PROGRAM_NAME);
+	for (const std::string& arg : args)
+		argv.push_back(arg.c_str());  // stays valid: args outlives the call below
+	return parse(static_cast<int>(argv.size()), argv.data());
+}
+
+// Trips a mu_check unless parsing { __VA_ARGS__ } throws an OptionException whose what() contains MESSAGE.
+#define ASSERT_THROWS(MESSAGE, ...) { \
+	std::vector<std::string> argList = { __VA_ARGS__ }; \
+	bool sawExpected = false; \
+	try { \
+		parse(argList); \
+	} catch(OptionsParser::OptionException& ex) { \
+		sawExpected = std::string(ex.what()).find(MESSAGE) != std::string::npos; \
+	} \
+	if (!sawExpected) mu_check((std::string("expected exception with ") + MESSAGE).empty()); \
+}
+
+MU_TEST(test_options_parser) {
+	// With no arguments at all, parsing fails: an output file is required.
+	ASSERT_THROWS("You must specify an output file");
+
+	// An output file without any input is rejected as well.
+	ASSERT_THROWS("No source .osm.pbf", "--output", "foo.mbtiles");
+
+	// --help on its own parses successfully and sets showHelp.
+	{
+		std::vector<std::string> args = {"--help"};
+		auto opts = parse(args);
+		mu_check(opts.showHelp);
+	}
+
+	// Minimal valid invocation: one input, one output. Defaults: geometries materialized, stores not sharded.
+	{
+		std::vector<std::string> args = {"--output", "foo.mbtiles", "--input", "ontario.pbf"};
+		auto opts = parse(args);
+		mu_check(opts.inputFiles.size() == 1);
+		mu_check(opts.inputFiles[0] == "ontario.pbf");
+		mu_check(opts.outputFile == "foo.mbtiles");
+		mu_check(opts.outputMode == OutputMode::MBTiles);
+		mu_check(opts.osm.materializeGeometries);
+		mu_check(!opts.osm.shardStores);
+	}
+
+	// --lazy-geometries turns materializeGeometries off and lazyGeometries on.
+	{
+		std::vector<std::string> args = {"--output", "foo.mbtiles", "--input", "ontario.pbf", "--lazy-geometries"};
+		auto opts = parse(args);
+		mu_check(opts.inputFiles.size() == 1);
+		mu_check(opts.inputFiles[0] == "ontario.pbf");
+		mu_check(opts.outputFile == "foo.mbtiles");
+		mu_check(opts.outputMode == OutputMode::MBTiles);
+		mu_check(!opts.osm.materializeGeometries);
+		mu_check(opts.osm.lazyGeometries);
+		mu_check(!opts.osm.shardStores);
+	}
+
+	// --store alone should optimize for reduced memory: no materialized geometries, sharded stores.
+	{
+		std::vector<std::string> args = {"--output", "foo.mbtiles", "--input", "ontario.pbf", "--store", "/tmp/store"};
+		auto opts = parse(args);
+		mu_check(opts.inputFiles.size() == 1);
+		mu_check(opts.inputFiles[0] == "ontario.pbf");
+		mu_check(opts.outputFile == "foo.mbtiles");
+		mu_check(opts.outputMode == OutputMode::MBTiles);
+		mu_check(opts.osm.storeFile == "/tmp/store");
+		mu_check(!opts.osm.materializeGeometries);
+		mu_check(opts.osm.shardStores);
+	}
+
+	// Adding --fast to --store should optimize for speed: sharding is expected to be off again.
+	{
+		std::vector<std::string> args = {"--output", "foo.pmtiles", "--input", "ontario.pbf", "--store", "/tmp/store", "--fast"};
+		auto opts = parse(args);
+		mu_check(opts.inputFiles.size() == 1);
+		mu_check(opts.inputFiles[0] == "ontario.pbf");
+		mu_check(opts.outputFile == "foo.pmtiles");
+		mu_check(opts.outputMode == OutputMode::PMTiles);  // the output file here is foo.pmtiles
+		mu_check(opts.osm.storeFile == "/tmp/store");
+		mu_check(!opts.osm.materializeGeometries);
+		mu_check(!opts.osm.shardStores);
+	}
+	// --config and --process paths that cannot be opened are expected to raise an error.
+	ASSERT_THROWS("Couldn't open .json config", "--input", "foo", "--output", "bar", "--config", "nonexistent-config.json");
+	ASSERT_THROWS("Couldn't open .lua script", "--input", "foo", "--output", "bar", "--process", "nonexistent-script.lua");
+}
+
+MU_TEST_SUITE(test_suite_options_parser) {  // suite made up of the single options-parser test case
+	MU_RUN_TEST(test_options_parser);
+}
+
+int main() {  // entry point of the options_parser test binary
+	MU_RUN_SUITE(test_suite_options_parser);
+	MU_REPORT();
+	return MU_EXIT_CODE;  // becomes the process exit status
+}
diff --git a/test/pooled_string.test.cpp b/test/pooled_string.test.cpp
new file mode 100644
index 00000000..91fb2da5
--- /dev/null
+++ b/test/pooled_string.test.cpp
@@ -0,0 +1,55 @@
+#include <iostream>
+#include "external/minunit.h"
+#include "pooled_string.h"
+
+MU_TEST(test_pooled_string) {  // short literals first: size(), toString() and ==/!= must follow the text
+	mu_check(PooledString("").size() == 0);
+	mu_check(PooledString("").toString() == "");
+	mu_check(PooledString("f").size() == 1);
+	mu_check(PooledString("f").toString() == "f");
+	mu_check(PooledString("hi").size() == 2);
+	mu_check(PooledString("f") == PooledString("f"));
+	mu_check(PooledString("f") != PooledString("g"));
+	// A longer literal (31 bytes here) must report its size and compare unequal to a short one.
+	mu_check(PooledString("this is more than fifteen bytes").size() == 31);
+	mu_check(PooledString("this is more than fifteen bytes") != PooledString("f"));
+	// A long string equals itself and round-trips through toString().
+	PooledString big("this is also a really long string");
+	mu_check(big == big);
+	mu_check(big.toString() == "this is also a really long string");
+	// Two long strings that differ only in the middle must compare unequal.
+	PooledString big2("this is also a quite long string");
+	mu_check(big != big2);
+	mu_check(big.toString() != big2.toString());
+	// PooledString can also be constructed from a pointer to a std::string.
+	std::string shortString("short");
+	std::string longString("this is a very long string");
+
+	PooledString stdShortString(&shortString);
+	mu_check(stdShortString.size() == 5);
+	mu_check(stdShortString.toString() == "short");
+
+	PooledString stdLongString(&longString);
+	mu_check(stdLongString.size() == 26);
+	mu_check(stdLongString.toString() == "this is a very long string");
+
+	// PooledStrings that are backed by std::string have the usual
+	// == semantics, in either operand order.
+	mu_check(stdShortString == PooledString("short"));
+	mu_check(PooledString("short") == stdShortString);
+
+	mu_check(stdLongString == PooledString("this is a very long string"));
+	mu_check(PooledString("this is a very long string") == stdLongString);
+
+	mu_check(stdShortString != stdLongString);
+}
+
+MU_TEST_SUITE(test_suite_pooled_string) {  // suite made up of the single PooledString test case
+	MU_RUN_TEST(test_pooled_string);
+}
+
+int main() {  // entry point of the pooled_string test binary
+	MU_RUN_SUITE(test_suite_pooled_string);
+	MU_REPORT();
+	return MU_EXIT_CODE;  // becomes the process exit status
+}
diff --git a/test/sorted_node_store.test.cpp b/test/sorted_node_store.test.cpp
new file mode 100644
index 00000000..de66445f
--- /dev/null
+++ b/test/sorted_node_store.test.cpp
@@ -0,0 +1,41 @@
+#include <iostream>
+#include "external/minunit.h"
+#include "sorted_node_store.h"
+
+MU_TEST(test_sorted_node_store) {
+	// Run the scenario twice: uncompressed on the first pass, compressed on the second.
+	// Two stores are used so that one store's node can be shown absent from the other.
+	for (int pass = 0; pass < 2; pass++) {
+		const bool compressed = (pass == 1);
+		SortedNodeStore s1(compressed), s2(compressed);
+		mu_check(s1.size() == 0);
+		mu_check(s2.size() == 0);
+
+		s1.batchStart();
+		s2.batchStart();
+
+		s1.insert({ {1, {2, 3 } } });
+		s2.insert({ {2, {3, 4 } } });
+
+		s1.finalize(1);
+		s2.finalize(1);
+
+		mu_check(s1.size() == 1);
+		mu_check(s1.at(1) == LatpLon({2, 3}));
+		mu_check(s1.contains(0, 1));
+		mu_check(!s1.contains(0, 2));
+		mu_check(!s1.contains(0, 1ull << 34));
+		mu_check(s2.size() == 1);
+		mu_check(s2.at(2) == LatpLon({3, 4}));
+	}
+}
+
+MU_TEST_SUITE(test_suite_sorted_node_store) {  // suite made up of the single SortedNodeStore test case
+	MU_RUN_TEST(test_sorted_node_store);
+}
+
+int main() {  // entry point of the sorted_node_store test binary
+	MU_RUN_SUITE(test_suite_sorted_node_store);
+	MU_REPORT();
+	return MU_EXIT_CODE;  // becomes the process exit status
+}
diff --git a/test/sorted_way_store.test.cpp b/test/sorted_way_store.test.cpp
index 1c50a494..65d34816 100644
--- a/test/sorted_way_store.test.cpp
+++ b/test/sorted_way_store.test.cpp
@@ -13,6 +13,10 @@ class TestNodeStore : public NodeStore {
 		return { (int32_t)id, -(int32_t)id };
 	}
 	void insert(const std::vector<std::pair<NodeID, LatpLon>>& elements) override {}
+
+	bool contains(size_t shard, NodeID id) const override { return true; }  // stub: claims to hold every node ID
+	size_t shard() const override { return 0; }  // stub: always reports shard 0
+	size_t shards() const override { return 1; }  // stub: always reports a single shard
 };
 
 void roundtripWay(const std::vector<NodeID>& way) {
@@ -70,6 +74,39 @@ MU_TEST(test_encode_way) {
 	}
 }
 
+MU_TEST(test_multiple_stores) {
+	// Two way stores sharing one node store must keep their contents separate.
+	// The scenario runs twice: compressed on the first pass, uncompressed on the second.
+	for (int pass = 0; pass < 2; pass++) {
+		const bool compressed = (pass == 0);
+		TestNodeStore ns;
+		SortedWayStore s1(compressed, ns), s2(compressed, ns);
+		s1.batchStart();
+		s2.batchStart();
+
+		s1.insertNodes({{ 1, { 1 } }});
+
+		// Small and large ways are stored differently, so s1 also gets
+		// a long way built from a non-monotonic sequence of node IDs.
+		std::vector<NodeID> longWay;
+		for (int n = 200; n < 2048; n++)
+			longWay.push_back(n + 3 * (n % 37));
+
+		s1.insertNodes({{ 42, longWay }});
+		s2.insertNodes({{ 2, { 2 } }});
+
+		s1.finalize(1);
+		s2.finalize(1);
+
+		mu_check(s1.size() == 2);
+		mu_check(s2.size() == 1);
+
+		mu_check(s1.contains(0, 1));
+		mu_check(s1.contains(0, 42));
+		mu_check(!s1.contains(0, 2));
+	}
+}
+
 MU_TEST(test_way_store) {
 	TestNodeStore ns;
 	SortedWayStore sws(true, ns);
@@ -178,6 +215,7 @@ MU_TEST(test_populate_mask) {
 
 MU_TEST_SUITE(test_suite_sorted_way_store) {
 	MU_RUN_TEST(test_encode_way);
+	MU_RUN_TEST(test_multiple_stores);
 	MU_RUN_TEST(test_way_store);
 }