From 5f30a30eb34ba2184c90f6a2084bbfbdcb30a65d Mon Sep 17 00:00:00 2001 From: Colin Dellow Date: Fri, 15 Dec 2023 18:28:11 -0500 Subject: [PATCH 01/49] move OutputObjects to mmap store For the planet, we need 1.3B output objects, 12 bytes per, so ~15GB of RAM. --- include/tile_data.h | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/include/tile_data.h b/include/tile_data.h index 814b53ce..c7b808ab 100644 --- a/include/tile_data.h +++ b/include/tile_data.h @@ -9,6 +9,7 @@ #include #include "output_object.h" #include "clip_cache.h" +#include "mmap_allocator.h" typedef std::vector SourceList; @@ -47,10 +48,10 @@ struct OutputObjectXYID { template void finalizeObjects( const size_t& threadNum, const unsigned int& baseZoom, - typename std::vector>::iterator begin, - typename std::vector>::iterator end + typename std::vector>>::iterator begin, + typename std::vector>>::iterator end ) { - for (typename std::vector>::iterator it = begin; it != end; it++) { + for (auto it = begin; it != end; it++) { if (it->size() == 0) continue; @@ -108,7 +109,7 @@ template void finalizeObjects( template void collectTilesWithObjectsAtZoomTemplate( const unsigned int& baseZoom, - const typename std::vector>::iterator objects, + const typename std::vector>>::iterator objects, const size_t size, const unsigned int zoom, TileCoordinatesSet& output @@ -150,7 +151,7 @@ inline OutputObjectID outputObjectWithId(const OutputObjectXYI template void collectObjectsForTileTemplate( const unsigned int& baseZoom, - typename std::vector>::iterator objects, + typename std::vector>>::iterator objects, size_t iStart, size_t iEnd, unsigned int zoom, @@ -292,8 +293,8 @@ class TileDataSource { // // If config.include_ids is true, objectsWithIds will be populated. // Otherwise, objects. 
- std::vector> objects; - std::vector> objectsWithIds; + std::vector>> objects; + std::vector>> objectsWithIds; // rtree index of large objects using oo_rtree_param_type = boost::geometry::index::quadratic<128>; From b9187693992e130f6037314f153947ca8002a831 Mon Sep 17 00:00:00 2001 From: Colin Dellow Date: Fri, 15 Dec 2023 18:45:27 -0500 Subject: [PATCH 02/49] treat objects at low zoom specially For GB, ~0.3% of objects are visible at low zooms. I noticed in previous planet runs that fetching the objects for tiles in the low zooms was quite slow - I think it's because we're scanning 1.3B objects each time, only to discard most of them. Now we'll only be scanning ~4M objects per tile, which is still an absurd number, but should mitigate most of the speed issue without having to properly index things. This will also help us maintain performance for memory-constrained users, as we won't be scanning all 15GB of data on disk, just a smaller ~45MB chunk. --- include/tile_data.h | 10 ++++++++-- src/tile_data.cpp | 12 ++++++++++-- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/include/tile_data.h b/include/tile_data.h index c7b808ab..13d61dbe 100644 --- a/include/tile_data.h +++ b/include/tile_data.h @@ -49,7 +49,8 @@ template void finalizeObjects( const size_t& threadNum, const unsigned int& baseZoom, typename std::vector>>::iterator begin, - typename std::vector>>::iterator end + typename std::vector>>::iterator end, + typename std::vector>>& lowZoom ) { for (auto it = begin; it != end; it++) { if (it->size() == 0) @@ -57,6 +58,10 @@ template void finalizeObjects( it->shrink_to_fit(); + for (auto objectIt = it->begin(); objectIt != it->end(); objectIt++) + if (objectIt->oo.minZoom < CLUSTER_ZOOM) + lowZoom[0].push_back(*objectIt); + // If the user is doing a a small extract, there are few populated // entries in `object`. 
// @@ -103,7 +108,6 @@ template void finalizeObjects( }, threadNum ); - } } @@ -294,7 +298,9 @@ class TileDataSource { // If config.include_ids is true, objectsWithIds will be populated. // Otherwise, objects. std::vector>> objects; + std::vector>> lowZoomObjects; std::vector>> objectsWithIds; + std::vector>> lowZoomObjectsWithIds; // rtree index of large objects using oo_rtree_param_type = boost::geometry::index::quadratic<128>; diff --git a/src/tile_data.cpp b/src/tile_data.cpp index 696ed333..234288fd 100644 --- a/src/tile_data.cpp +++ b/src/tile_data.cpp @@ -47,7 +47,9 @@ TileDataSource::TileDataSource(size_t threadNum, unsigned int baseZoom, bool inc z6OffsetDivisor(baseZoom >= CLUSTER_ZOOM ? (1 << (baseZoom - CLUSTER_ZOOM)) : 1), objectsMutex(threadNum * 4), objects(CLUSTER_ZOOM_AREA), + lowZoomObjects(1), objectsWithIds(CLUSTER_ZOOM_AREA), + lowZoomObjectsWithIds(1), baseZoom(baseZoom), pointStores(threadNum), linestringStores(threadNum), @@ -72,8 +74,9 @@ TileDataSource::TileDataSource(size_t threadNum, unsigned int baseZoom, bool inc } void TileDataSource::finalize(size_t threadNum) { - finalizeObjects(threadNum, baseZoom, objects.begin(), objects.end()); - finalizeObjects(threadNum, baseZoom, objectsWithIds.begin(), objectsWithIds.end()); + finalizeObjects(threadNum, baseZoom, objects.begin(), objects.end(), lowZoomObjects); + finalizeObjects(threadNum, baseZoom, objectsWithIds.begin(), objectsWithIds.end(), lowZoomObjectsWithIds); + } void TileDataSource::addObjectToSmallIndex(const TileCoordinates& index, const OutputObject& oo, uint64_t id) { @@ -139,6 +142,11 @@ void TileDataSource::collectObjectsForTile( TileCoordinates dstIndex, std::vector& output ) { + if (zoom < CLUSTER_ZOOM) { + collectObjectsForTileTemplate(baseZoom, lowZoomObjects.begin(), 0, 1, zoom, dstIndex, output); + collectObjectsForTileTemplate(baseZoom, lowZoomObjectsWithIds.begin(), 0, 1, zoom, dstIndex, output); + } + size_t iStart = 0; size_t iEnd = objects.size(); From 
d7caf1024bc361bcc4a5f836c255e44211cf53d6 Mon Sep 17 00:00:00 2001 From: Colin Dellow Date: Sat, 16 Dec 2023 10:46:05 -0500 Subject: [PATCH 03/49] make more explicit that this is unexpected --- src/tile_data.cpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/tile_data.cpp b/src/tile_data.cpp index 234288fd..a50ddfea 100644 --- a/src/tile_data.cpp +++ b/src/tile_data.cpp @@ -196,11 +196,7 @@ Geometry TileDataSource::buildWayGeometry(OutputGeometryType const geomType, NodeID const objectID, const TileBbox &bbox) { switch(geomType) { case POINT_: { - auto p = retrievePoint(objectID); - if (geom::within(p, bbox.clippingBox)) { - return p; - } - return MultiLinestring(); + throw std::runtime_error("unexpected geomType in buildWayGeometry"); } case LINESTRING_: { From 8dff5bf1c1f1af7298b85d482ee4e88d14e09197 Mon Sep 17 00:00:00 2001 From: Colin Dellow Date: Sat, 16 Dec 2023 10:46:27 -0500 Subject: [PATCH 04/49] extend --materialize-geometries to nodes For Points stored via Layer(...) calls, store the node ID in the OSM store, unless `--materialize-geometries` is present. This saves ~200MB of RAM for North America, so perhaps 1 GB for the planet if NA has similar characteristics as the planet. Also fix the OSM_ID(...) macro - it was lopping off many more bits than needed, due to some previous experiments. Now that we want to track nodes, we need at least 34 bits. This may pose a problem down the road when we try to address thrashing. The mechanism I hoped to use was to divide the OSM stores into multiple stores covering different low zoom tiles. Ideally, we'd be able to recall which store to look in -- but we only have 36 bits, we need 34 to store the Node ID, so that leaves us with 1.5 bits => can divide into 3 stores. Since the node store for the planet is 44GB, dividing into 3 stores doesn't give us very much headroom on a 32 GB box. Ah well, we can sort this out later. 
--- include/osm_mem_tiles.h | 8 +++++++- include/tile_data.h | 2 +- src/osm_lua_processing.cpp | 4 +++- src/osm_mem_tiles.cpp | 21 +++++++++++++++++++++ 4 files changed, 32 insertions(+), 3 deletions(-) diff --git a/include/osm_mem_tiles.h b/include/osm_mem_tiles.h index a6266ea3..74aeb18f 100644 --- a/include/osm_mem_tiles.h +++ b/include/osm_mem_tiles.h @@ -6,10 +6,15 @@ #include "osm_store.h" #include "geometry_cache.h" +// NB: Currently, USE_NODE_STORE and USE_WAY_STORE are equivalent. +// If we permit LayerAsCentroid to be generated from the OSM stores, +// this will have to change. #define OSM_THRESHOLD (1ull << 35) +#define USE_NODE_STORE (1ull << 35) +#define IS_NODE(x) (((x) >> 35) == (USE_NODE_STORE >> 35)) #define USE_WAY_STORE (1ull << 35) #define IS_WAY(x) (((x) >> 35) == (USE_WAY_STORE >> 35)) -#define OSM_ID(x) ((x) & 0b111111111111111111111111111111111) +#define OSM_ID(x) ((x) & 0b11111111111111111111111111111111111) class NodeStore; class WayStore; @@ -37,6 +42,7 @@ class OsmMemTiles : public TileDataSource { const NodeID objectID, const TileBbox &bbox ) override; + LatpLon buildNodeGeometry(OutputGeometryType const geomType, NodeID const objectID, const TileBbox &bbox) const override; void Clear(); diff --git a/include/tile_data.h b/include/tile_data.h index 13d61dbe..e02a0255 100644 --- a/include/tile_data.h +++ b/include/tile_data.h @@ -362,7 +362,7 @@ class TileDataSource { ); virtual Geometry buildWayGeometry(OutputGeometryType const geomType, NodeID const objectID, const TileBbox &bbox); - LatpLon buildNodeGeometry(OutputGeometryType const geomType, NodeID const objectID, const TileBbox &bbox) const; + virtual LatpLon buildNodeGeometry(OutputGeometryType const geomType, NodeID const objectID, const TileBbox &bbox) const; void open() { // Put something at index 0 of all stores so that 0 can be used diff --git a/src/osm_lua_processing.cpp b/src/osm_lua_processing.cpp index a1bc2536..a90c8b6a 100644 --- a/src/osm_lua_processing.cpp +++ 
b/src/osm_lua_processing.cpp @@ -350,7 +350,9 @@ void OsmLuaProcessing::Layer(const string &layerName, bool area) { if(CorrectGeometry(p) == CorrectGeometryResult::Invalid) return; - NodeID id = osmMemTiles.storePoint(p); + NodeID id = USE_NODE_STORE | originalOsmID; + if (materializeGeometries) + id = osmMemTiles.storePoint(p); OutputObject oo(geomType, layers.layerMap[layerName], id, 0, layerMinZoom); outputs.push_back(std::make_pair(std::move(oo), attributes)); return; diff --git a/src/osm_mem_tiles.cpp b/src/osm_mem_tiles.cpp index f5527d0e..5cfc3c3d 100644 --- a/src/osm_mem_tiles.cpp +++ b/src/osm_mem_tiles.cpp @@ -18,6 +18,27 @@ OsmMemTiles::OsmMemTiles( { } +LatpLon OsmMemTiles::buildNodeGeometry( + OutputGeometryType const geomType, + NodeID const objectID, + const TileBbox &bbox +) const { + if (objectID < OSM_THRESHOLD) { + return TileDataSource::buildNodeGeometry(geomType, objectID, bbox); + } + + switch(geomType) { + case POINT_: { + return nodeStore.at(OSM_ID(objectID)); + } + + default: + break; + } + + throw std::runtime_error("Geometry type is not point"); +} + Geometry OsmMemTiles::buildWayGeometry( const OutputGeometryType geomType, const NodeID objectID, From b86fddc307e173ca03edac9b229318f0a3660ace Mon Sep 17 00:00:00 2001 From: Colin Dellow Date: Sat, 16 Dec 2023 12:44:49 -0500 Subject: [PATCH 05/49] rejig AttributePair layout On g++, this reduces the size from 48 bytes to 34 bytes. 
There aren't _that_ many attribute pairs, even on the planet scale, but this plus a better encoding of string attributes might save us ~2GB at the planet level, which is meaningful for a 32GB box --- include/attribute_store.h | 65 ++++++++++++++++++++++++++++++++------- include/output_object.h | 3 -- 2 files changed, 54 insertions(+), 14 deletions(-) diff --git a/include/attribute_store.h b/include/attribute_store.h index ad1aa4e1..9d606139 100644 --- a/include/attribute_store.h +++ b/include/attribute_store.h @@ -39,17 +39,20 @@ class AttributeKeyStore { std::map keys2index; }; -enum class AttributePairType: char { False = 0, True = 1, Float = 2, String = 3 }; +enum class AttributePairType: char { Bool = 0, Float = 1, String = 2 }; // AttributePair is a key/value pair (with minzoom) +#pragma pack(push, 1) struct AttributePair { - std::string stringValue_; - float floatValue_; - short keyIndex; - char minzoom; - AttributePairType valueType; + short keyIndex : 9; + AttributePairType valueType : 3; + char minzoom : 4; + union { + float floatValue_; + std::string stringValue_; + }; AttributePair(uint32_t keyIndex, bool value, char minzoom) - : keyIndex(keyIndex), valueType(value ? AttributePairType::True : AttributePairType::False), minzoom(minzoom) + : keyIndex(keyIndex), valueType(AttributePairType::Bool), minzoom(minzoom), floatValue_(value ? 
1 : 0) { } AttributePair(uint32_t keyIndex, const std::string& value, char minzoom) @@ -57,16 +60,55 @@ struct AttributePair { { } AttributePair(uint32_t keyIndex, float value, char minzoom) - : keyIndex(keyIndex), valueType(AttributePairType::Float), floatValue_(value), minzoom(minzoom) + : keyIndex(keyIndex), valueType(AttributePairType::Float), minzoom(minzoom), floatValue_(value) { } + ~AttributePair() { + if (valueType == AttributePairType::Bool || valueType == AttributePairType::Float) + return; + + stringValue_.~basic_string(); + } + + AttributePair(const AttributePair& other): + keyIndex(other.keyIndex), valueType(other.valueType), minzoom(other.minzoom) + { + if (valueType == AttributePairType::Bool || valueType == AttributePairType::Float) { + floatValue_ = other.floatValue_; + return; + } + + new (&stringValue_) std::string; + stringValue_ = other.stringValue_; + } + + AttributePair& operator=(const AttributePair& other) { + if (!(valueType == AttributePairType::Bool || valueType == AttributePairType::Float)) { + stringValue_.~basic_string(); + } + + keyIndex = other.keyIndex; + valueType = other.valueType; + minzoom = other.minzoom; + + if (valueType == AttributePairType::Bool || valueType == AttributePairType::Float) { + floatValue_ = other.floatValue_; + return *this; + } + + new (&stringValue_) std::string; + stringValue_ = other.stringValue_; + + return *this; + } + bool operator==(const AttributePair &other) const { if (minzoom!=other.minzoom || keyIndex!=other.keyIndex || valueType!=other.valueType) return false; if (valueType == AttributePairType::String) return stringValue_ == other.stringValue_; - if (valueType == AttributePairType::Float) + if (valueType == AttributePairType::Float || valueType == AttributePairType::Bool) return floatValue_ == other.floatValue_; return true; @@ -74,11 +116,11 @@ struct AttributePair { bool hasStringValue() const { return valueType == AttributePairType::String; } bool hasFloatValue() const { return valueType == 
AttributePairType::Float; } - bool hasBoolValue() const { return valueType == AttributePairType::True || valueType == AttributePairType::False; }; + bool hasBoolValue() const { return valueType == AttributePairType::Bool; } const std::string& stringValue() const { return stringValue_; } float floatValue() const { return floatValue_; } - bool boolValue() const { return valueType == AttributePairType::True; } + bool boolValue() const { return floatValue_; } static bool isHot(const AttributePair& pair, const std::string& keyName) { // Is this pair a candidate for the hot pool? @@ -137,6 +179,7 @@ struct AttributePair { return rv; } }; +#pragma pack(pop) // We shard the cold pools to reduce the odds of lock contention on diff --git a/include/output_object.h b/include/output_object.h index 3d2d862e..385fd46d 100644 --- a/include/output_object.h +++ b/include/output_object.h @@ -22,9 +22,6 @@ std::ostream& operator<<(std::ostream& os, OutputGeometryType geomType); /** * \brief OutputObject - any object (node, linestring, polygon) to be outputted to tiles - - * Possible future improvements to save memory: - * - use a global dictionary for attribute key/values */ #pragma pack(push, 4) class OutputObject { From a54938e3db84658af148a633d8ea077357642b9d Mon Sep 17 00:00:00 2001 From: Colin Dellow Date: Sat, 16 Dec 2023 14:48:42 -0500 Subject: [PATCH 06/49] fix initialization order warning --- src/mmap_allocator.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mmap_allocator.cpp b/src/mmap_allocator.cpp index dc71f687..2b5e26fd 100644 --- a/src/mmap_allocator.cpp +++ b/src/mmap_allocator.cpp @@ -79,10 +79,10 @@ thread_local mmap_shm_ptr mmap_shm_thread_region_ptr; std::mutex mmap_allocator_mutex; mmap_file::mmap_file(std::string const &filename, std::size_t offset) - : mapping(filename.c_str(), boost::interprocess::read_write) + : filename(filename) + , mapping(filename.c_str(), boost::interprocess::read_write) , region(mapping, 
boost::interprocess::read_write) , buffer(boost::interprocess::create_only, reinterpret_cast(region.get_address()) + offset, region.get_size() - offset) - , filename(filename) { } mmap_file::~mmap_file() From fa5a2bf858ba9733a3db4724a847c9ad3fd8f0e6 Mon Sep 17 00:00:00 2001 From: Colin Dellow Date: Sat, 16 Dec 2023 14:49:08 -0500 Subject: [PATCH 07/49] add PooledString Not used by anything yet. Given Tilemaker's limited needs, we can get away with a stripped-down string class that is less flexible than std::string, in exchange for memory savings. The key benefits - 16 bytes, not 32 bytes (g++) or 24 bytes (clang). When it does allocate (for strings longer than 15 bytes), it allocates from a pool so there's less per-allocation overhead. --- CMakeLists.txt | 1 + Makefile | 13 +++-- include/pooled_string.h | 45 +++++++++++++++++ src/attribute_store.cpp | 2 +- src/pooled_string.cpp | 98 +++++++++++++++++++++++++++++++++++++ test/pooled_string.test.cpp | 34 +++++++++++++ 6 files changed, 189 insertions(+), 4 deletions(-) create mode 100644 include/pooled_string.h create mode 100644 src/pooled_string.cpp create mode 100644 test/pooled_string.test.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 203e68e8..d69e61ed 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -102,6 +102,7 @@ file(GLOB tilemaker_src_files src/osm_store.cpp src/output_object.cpp src/pbf_blocks.cpp + src/pooled_string.cpp src/read_pbf.cpp src/read_shp.cpp src/shared_data.cpp diff --git a/Makefile b/Makefile index 234dca6a..84fa084c 100644 --- a/Makefile +++ b/Makefile @@ -111,6 +111,7 @@ tilemaker: \ src/osm_store.o \ src/output_object.o \ src/pbf_blocks.o \ + src/pooled_string.o \ src/read_pbf.o \ src/read_shp.o \ src/shared_data.o \ @@ -124,7 +125,13 @@ tilemaker: \ src/write_geometry.o $(CXX) $(CXXFLAGS) -o tilemaker $^ $(INC) $(LIB) $(LDFLAGS) -test: test_sorted_way_store +test: test_pooled_string test_sorted_way_store + +test_pooled_string: \ + src/mmap_allocator.o \ + src/pooled_string.o \ 
+ test/pooled_string.test.o + $(CXX) $(CXXFLAGS) -o test.pooled_string $^ $(INC) $(LIB) $(LDFLAGS) && ./test.pooled_string test_sorted_way_store: \ src/external/streamvbyte_decode.o \ @@ -132,7 +139,7 @@ test_sorted_way_store: \ src/external/streamvbyte_zigzag.o \ src/mmap_allocator.o \ src/sorted_way_store.o \ - src/sorted_way_store.test.o + test/sorted_way_store.test.o $(CXX) $(CXXFLAGS) -o test.sorted_way_store $^ $(INC) $(LIB) $(LDFLAGS) && ./test.sorted_way_store @@ -152,6 +159,6 @@ install: install docs/man/tilemaker.1 ${DESTDIR}${MANPREFIX}/man1/ clean: - rm -f tilemaker src/*.o src/external/*.o include/*.o include/*.pb.h + rm -f tilemaker src/*.o src/external/*.o include/*.o include/*.pb.h test/*.o .PHONY: install diff --git a/include/pooled_string.h b/include/pooled_string.h new file mode 100644 index 00000000..2fbeca37 --- /dev/null +++ b/include/pooled_string.h @@ -0,0 +1,45 @@ +#ifndef _POOLED_STRING_H +#define _POOLED_STRING_H + +// std::string allows the allocated size to differ from the used size, +// which means it needs an extra pointer. It also supports large strings. +// +// Our use case does not require this: we have immutable strings and always +// know their exact size, which fit in 64K. +// +// Further, g++'s implementation of std::string is inefficient - it takes 32 +// bytes (vs clang's 24 bytes), while only allowing a small-string optimization +// for strings of length 15 or less. +// +// std::string also needs to be able to free its allocated memory -- in our case, +// we're fine with the memory living until the process dies. +// +// Instead, we implemented `PooledString`. It has a size of 16 bytes, and a small +// string optimization for strings <= 15 bytes. (We will separately teach +// AttributePair to encode Latin-character strings more efficiently, so that many +// strings of size 24 or less fit in 15 bytes.) +// +// If it needs to allocate memory, it does so from a shared pool. It is unable +// to free the memory once allocated. 
+ +#include +#include + +namespace PooledStringNS { + class PooledString { + public: + PooledString(const std::string& str); + size_t size() const; + bool operator==(const PooledString& other) const; + bool operator!=(const PooledString& other) const; + std::string toString() const; + + private: + // 0..3 is index into table, 4..5 is offset, 6..7 is length + uint8_t storage[16]; + }; +} + +using PooledString = PooledStringNS::PooledString; + +#endif diff --git a/src/attribute_store.cpp b/src/attribute_store.cpp index f4f9f299..83acc1ac 100644 --- a/src/attribute_store.cpp +++ b/src/attribute_store.cpp @@ -92,7 +92,7 @@ uint32_t AttributePairStore::addPair(const AttributePair& pair, bool isHot) { // Not found, ensure our local map is up-to-date for future calls, // and fall through to the main map. // - // Note that we can read `hotShard` without a lock + // Note that we can read `hotShard` without a lock, its size is fixed while (tlsHotShardSize < hotShardSize.load()) { tlsHotShardSize++; tlsHotShardMap[&hotShard[tlsHotShardSize]] = tlsHotShardSize; diff --git a/src/pooled_string.cpp b/src/pooled_string.cpp new file mode 100644 index 00000000..031a5c06 --- /dev/null +++ b/src/pooled_string.cpp @@ -0,0 +1,98 @@ +#include "pooled_string.h" +#include +#include + +namespace PooledStringNS { + std::vector tables; + std::mutex mutex; + + // Each thread has its own string table, we only take a lock + // to push a new table onto the vector. 
+ thread_local int64_t tableIndex = -1; + thread_local int64_t spaceLeft = -1; +} + +PooledString::PooledString(const std::string& str) { + if (str.size() >= 65536) + throw std::runtime_error("cannot store string longer than 64K"); + + if (str.size() <= 15) { + storage[0] = str.size(); + memcpy(storage + 1, str.data(), str.size()); + memset(storage + 1 + str.size(), 0, 16 - 1 - str.size()); + } else { + memset(storage + 8, 0, 8); + storage[0] = 1 << 7; + + if (spaceLeft < 0 || spaceLeft < str.size()) { + std::lock_guard lock(mutex); + spaceLeft = 65536; + char* buffer = (char*)malloc(spaceLeft); + if (buffer == 0) + throw std::runtime_error("PooledString could not malloc"); + tables.push_back(buffer); + tableIndex = tables.size() - 1; + } + + storage[1] = tableIndex >> 16; + storage[2] = tableIndex >> 8; + storage[3] = tableIndex; + + uint16_t offset = 65536 - spaceLeft; + storage[4] = offset >> 8; + storage[5] = offset; + + uint16_t length = str.size(); + storage[6] = length >> 8; + storage[7] = length; + + memcpy(tables[tableIndex] + offset, str.data(), str.size()); + + spaceLeft -= str.size(); + } +} + +bool PooledStringNS::PooledString::operator==(const PooledString& other) const { + // NOTE: We have surprising equality semantics! + // + // For short strings, you are equal if the strings are equal. + // + // For large strings, you are equal if you use the same heap memory locations. + // This implies that someone outside of PooledString is managing pooling! In our + // case, it is the responsibility of AttributePairStore. + return memcmp(storage, other.storage, 16) == 0; +} + +bool PooledStringNS::PooledString::operator!=(const PooledString& other) const { + return !(*this == other); +} + +size_t PooledStringNS::PooledString::size() const { + // If the uppermost bit is set, we're in heap. + if (storage[0] >> 7) { + uint16_t length = (storage[6] << 8) + storage[7]; + return length; + } + + // Otherwise it's stored in the lower 7 bits of the highest byte. 
+ return storage[0] & 0b01111111; +} + +std::string PooledStringNS::PooledString::toString() const { + std::string rv; + if (storage[0] == 1 << 7) { + // heap + rv.reserve(size()); + + uint32_t tableIndex = (storage[1] << 16) + (storage[2] << 8) + storage[3]; + uint16_t offset = (storage[4] << 8) + storage[5]; + + char* data = tables[tableIndex] + offset; + rv.append(data, size()); + return rv; + } + + for (int i = 0; i < storage[0]; i++) + rv += storage[i + 1]; + return rv; +} diff --git a/test/pooled_string.test.cpp b/test/pooled_string.test.cpp new file mode 100644 index 00000000..d32d1ccd --- /dev/null +++ b/test/pooled_string.test.cpp @@ -0,0 +1,34 @@ +#include +#include "external/minunit.h" +#include "pooled_string.h" + +MU_TEST(test_pooled_string) { + mu_check(PooledString("").size() == 0); + mu_check(PooledString("").toString() == ""); + mu_check(PooledString("f").size() == 1); + mu_check(PooledString("f").toString() == "f"); + mu_check(PooledString("hi").size() == 2); + mu_check(PooledString("f") == PooledString("f")); + mu_check(PooledString("f") != PooledString("g")); + + mu_check(PooledString("this is more than fifteen bytes").size() == 31); + mu_check(PooledString("this is more than fifteen bytes") != PooledString("f")); + + PooledString big("this is also a really long string"); + mu_check(big == big); + mu_check(big.toString() == "this is also a really long string"); + + PooledString big2("this is also a quite long string"); + mu_check(big != big2); + mu_check(big.toString() != big2.toString()); +} + +MU_TEST_SUITE(test_suite_pooled_string) { + MU_RUN_TEST(test_pooled_string); +} + +int main() { + MU_RUN_SUITE(test_suite_pooled_string); + MU_REPORT(); + return MU_EXIT_CODE; +} From 3eb07c2cb15236dae3d557f097a9743d1cc0c3e5 Mon Sep 17 00:00:00 2001 From: Colin Dellow Date: Sat, 16 Dec 2023 15:47:57 -0500 Subject: [PATCH 08/49] add tests for attribute store ...I'm going to replace the string implementation, so let's have some backstop to make sure I 
don't break things --- Makefile | 8 +++- include/attribute_store.h | 2 + src/attribute_store.cpp | 9 ++++- test/attribute_store.test.cpp | 70 +++++++++++++++++++++++++++++++++++ 4 files changed, 86 insertions(+), 3 deletions(-) create mode 100644 test/attribute_store.test.cpp diff --git a/Makefile b/Makefile index 84fa084c..8bd26611 100644 --- a/Makefile +++ b/Makefile @@ -125,7 +125,13 @@ tilemaker: \ src/write_geometry.o $(CXX) $(CXXFLAGS) -o tilemaker $^ $(INC) $(LIB) $(LDFLAGS) -test: test_pooled_string test_sorted_way_store +test: test_attribute_store test_pooled_string test_sorted_way_store + +test_attribute_store: \ + src/mmap_allocator.o \ + src/attribute_store.o \ + test/attribute_store.test.o + $(CXX) $(CXXFLAGS) -o test.attribute_store $^ $(INC) $(LIB) $(LDFLAGS) && ./test.attribute_store test_pooled_string: \ src/mmap_allocator.o \ diff --git a/include/attribute_store.h b/include/attribute_store.h index 9d606139..1482db23 100644 --- a/include/attribute_store.h +++ b/include/attribute_store.h @@ -10,6 +10,7 @@ #include #include #include +#include "pooled_string.h" /* AttributeStore - global dictionary for attributes */ @@ -423,6 +424,7 @@ struct AttributeSet { struct AttributeStore { AttributeIndex add(AttributeSet &attributes); std::vector getUnsafe(AttributeIndex index) const; + size_t size() const; void reportSize() const; void finalize(); diff --git a/src/attribute_store.cpp b/src/attribute_store.cpp index 83acc1ac..46801668 100644 --- a/src/attribute_store.cpp +++ b/src/attribute_store.cpp @@ -307,11 +307,16 @@ std::vector AttributeStore::getUnsafe(AttributeIndex index } } -void AttributeStore::reportSize() const { +size_t AttributeStore::size() const { size_t numAttributeSets = 0; for (int i = 0; i < ATTRIBUTE_SHARDS; i++) numAttributeSets += sets[i].size(); - std::cout << "Attributes: " << numAttributeSets << " sets from " << lookups.load() << " objects" << std::endl; + + return numAttributeSets; +} + +void AttributeStore::reportSize() const { + 
std::cout << "Attributes: " << size() << " sets from " << lookups.load() << " objects" << std::endl; // Print detailed histogram of frequencies of attributes. if (false) { diff --git a/test/attribute_store.test.cpp b/test/attribute_store.test.cpp new file mode 100644 index 00000000..4fb1f979 --- /dev/null +++ b/test/attribute_store.test.cpp @@ -0,0 +1,70 @@ +#include +#include +#include "external/minunit.h" +#include "attribute_store.h" + +MU_TEST(test_attribute_store) { + AttributeStore store; + + mu_check(store.size() == 0); + + AttributeSet s1; + store.addAttribute(s1, "str1", std::string("someval"), 0); + store.addAttribute(s1, "str2", std::string("a very long string"), 0); + store.addAttribute(s1, "bool1", false, 0); + store.addAttribute(s1, "bool2", true, 0); + store.addAttribute(s1, "float1", (float)42.0, 0); + + const auto s1Index = store.add(s1); + + mu_check(store.size() == 1); + + const auto s1Pairs = store.getUnsafe(s1Index); + mu_check(s1Pairs.size() == 5); + + const auto str1 = std::find_if(s1Pairs.begin(), s1Pairs.end(), [&store](auto ap) { + return ap->keyIndex == store.keyStore.key2index("str1"); + }); + mu_check(str1 != s1Pairs.end()); + mu_check((*str1)->hasStringValue()); + mu_check((*str1)->stringValue() == "someval"); + + const auto str2 = std::find_if(s1Pairs.begin(), s1Pairs.end(), [&store](auto ap) { + return ap->keyIndex == store.keyStore.key2index("str2"); + }); + mu_check(str2 != s1Pairs.end()); + mu_check((*str2)->hasStringValue()); + mu_check((*str2)->stringValue() == "a very long string"); + + const auto bool1 = std::find_if(s1Pairs.begin(), s1Pairs.end(), [&store](auto ap) { + return ap->keyIndex == store.keyStore.key2index("bool1"); + }); + mu_check(bool1 != s1Pairs.end()); + mu_check((*bool1)->hasBoolValue()); + mu_check((*bool1)->boolValue() == false); + + const auto bool2 = std::find_if(s1Pairs.begin(), s1Pairs.end(), [&store](auto ap) { + return ap->keyIndex == store.keyStore.key2index("bool2"); + }); + mu_check(bool2 != 
s1Pairs.end()); + mu_check((*bool2)->hasBoolValue()); + mu_check((*bool2)->boolValue() == true); + + const auto float1 = std::find_if(s1Pairs.begin(), s1Pairs.end(), [&store](auto ap) { + return ap->keyIndex == store.keyStore.key2index("float1"); + }); + mu_check(float1 != s1Pairs.end()); + mu_check((*float1)->hasFloatValue()); + mu_check((*float1)->floatValue() == 42); + +} + +MU_TEST_SUITE(test_suite_attribute_store) { + MU_RUN_TEST(test_attribute_store); +} + +int main() { + MU_RUN_SUITE(test_suite_attribute_store); + MU_REPORT(); + return MU_EXIT_CODE; +} From b3eac9958caee521d1f8d5275d214072e15696cc Mon Sep 17 00:00:00 2001 From: Colin Dellow Date: Sat, 16 Dec 2023 16:37:05 -0500 Subject: [PATCH 09/49] rejig isHot Break dependency on AttributePair, just work on std::string --- include/attribute_store.h | 20 +++----------------- src/attribute_store.cpp | 6 +++--- 2 files changed, 6 insertions(+), 20 deletions(-) diff --git a/include/attribute_store.h b/include/attribute_store.h index 1482db23..345fc81d 100644 --- a/include/attribute_store.h +++ b/include/attribute_store.h @@ -123,7 +123,7 @@ struct AttributePair { float floatValue() const { return floatValue_; } bool boolValue() const { return floatValue_; } - static bool isHot(const AttributePair& pair, const std::string& keyName) { + static bool isHot(const std::string& keyName, const std::string& value) { // Is this pair a candidate for the hot pool? // Hot pairs are pairs that we think are likely to be re-used, like @@ -132,25 +132,11 @@ struct AttributePair { // The trick is that we commit to putting them in the hot pool // before we know if we were right. - // All boolean pairs are eligible. - if (pair.hasBoolValue()) - return true; - - // Small integers are eligible. - if (pair.hasFloatValue()) { - float v = pair.floatValue(); - - if (ceil(v) == v && v >= 0 && v <= 25) - return true; - } - - // The remaining things should be strings, but just in case... 
- if (!pair.hasStringValue()) - return false; + // The rules for floats/booleans are managed in their addAttribute call. // Only strings that are IDish are eligible: only lowercase letters. bool ok = true; - for (const auto& c: pair.stringValue()) { + for (const auto& c: value) { if (c != '-' && c != '_' && (c < 'a' || c > 'z')) return false; } diff --git a/src/attribute_store.cpp b/src/attribute_store.cpp index 46801668..beb68cf7 100644 --- a/src/attribute_store.cpp +++ b/src/attribute_store.cpp @@ -200,19 +200,19 @@ void AttributeSet::removePairWithKey(const AttributePairStore& pairStore, uint32 void AttributeStore::addAttribute(AttributeSet& attributeSet, std::string const &key, const std::string& v, char minzoom) { AttributePair kv(keyStore.key2index(key),v,minzoom); - bool isHot = AttributePair::isHot(kv, key); + bool isHot = AttributePair::isHot(key, v); attributeSet.removePairWithKey(pairStore, kv.keyIndex); attributeSet.addPair(pairStore.addPair(kv, isHot)); } void AttributeStore::addAttribute(AttributeSet& attributeSet, std::string const &key, bool v, char minzoom) { AttributePair kv(keyStore.key2index(key),v,minzoom); - bool isHot = AttributePair::isHot(kv, key); + bool isHot = true; // All bools are eligible to be hot pairs attributeSet.removePairWithKey(pairStore, kv.keyIndex); attributeSet.addPair(pairStore.addPair(kv, isHot)); } void AttributeStore::addAttribute(AttributeSet& attributeSet, std::string const &key, float v, char minzoom) { AttributePair kv(keyStore.key2index(key),v,minzoom); - bool isHot = AttributePair::isHot(kv, key); + bool isHot = v >= 0 && v <= 25 && ceil(v) == v; // Whole numbers in 0..25 are eligible to be hot pairs attributeSet.removePairWithKey(pairStore, kv.keyIndex); attributeSet.addPair(pairStore.addPair(kv, isHot)); } From 2784903b3945969abb5e0cb470139f669a4d1171 Mon Sep 17 00:00:00 2001 From: Colin Dellow Date: Sat, 16 Dec 2023 16:37:53 -0500 Subject: [PATCH 10/49] teach PooledString to work with std::string ...this will 
be useful for doing map lookups when testing if an AttributePair has already been created with the given value. --- include/pooled_string.h | 36 ++++++++++++++------ src/pooled_string.cpp | 68 ++++++++++++++++++++++++++++++++----- test/pooled_string.test.cpp | 21 ++++++++++++ 3 files changed, 106 insertions(+), 19 deletions(-) diff --git a/include/pooled_string.h b/include/pooled_string.h index 2fbeca37..95edd454 100644 --- a/include/pooled_string.h +++ b/include/pooled_string.h @@ -1,20 +1,19 @@ #ifndef _POOLED_STRING_H #define _POOLED_STRING_H -// std::string allows the allocated size to differ from the used size, -// which means it needs an extra pointer. It also supports large strings. +// std::string is quite general: +// - mutable +// - unlimited length +// - capacity can differ from size +// - can deallocate its dynamic memory // -// Our use case does not require this: we have immutable strings and always -// know their exact size, which fit in 64K. +// Our use case, by contrast is immutable, bounded strings that live for the +// duration of the process. // -// Further, g++'s implementation of std::string is inefficient - it takes 32 -// bytes (vs clang's 24 bytes), while only allowing a small-string optimization -// for strings of length 15 or less. +// This gives us some room to have less memory overhead, especially on +// g++, whose implementation of std::string requires 32 bytes. // -// std::string also needs to be able to free its allocated memory -- in our case, -// we're fine with the memory living until the process dies. -// -// Instead, we implemented `PooledString`. It has a size of 16 bytes, and a small +// Thus, we implement `PooledString`. It has a size of 16 bytes, and a small // string optimization for strings <= 15 bytes. (We will separately teach // AttributePair to encode Latin-character strings more efficiently, so that many // strings of size 24 or less fit in 15 bytes.) 
@@ -22,17 +21,32 @@ // If it needs to allocate memory, it does so from a shared pool. It is unable // to free the memory once allocated. +// PooledString has one of three modes: +// - [126:127] = 00: small-string, length is in [120:125], lower 15 bytes are string +// - [126:127] = 10: pooled string, table is in bytes 1..3, offset in bytes 4..5, length in bytes 6..7 +// - [126:127] = 11: pointer to std::string, pointer is in bytes 8..15 +// +// Note that the pointer mode is not safe to be stored. It exists just to allow +// lookups in the AttributePair map before deciding to allocate a string. + #include #include namespace PooledStringNS { class PooledString { public: + // Create a short string or heap string, long-lived. PooledString(const std::string& str); + + + // Create a std string - only valid so long as the string that is + // pointed to is valid. + PooledString(const std::string* str); size_t size() const; bool operator==(const PooledString& other) const; bool operator!=(const PooledString& other) const; std::string toString() const; + const char* data() const; private: // 0..3 is index into table, 4..5 is offset, 6..7 is length diff --git a/src/pooled_string.cpp b/src/pooled_string.cpp index 031a5c06..cc4532f4 100644 --- a/src/pooled_string.cpp +++ b/src/pooled_string.cpp @@ -6,6 +6,10 @@ namespace PooledStringNS { std::vector tables; std::mutex mutex; + const uint8_t ShortString = 0b00; + const uint8_t HeapString = 0b10; + const uint8_t StdString = 0b11; + // Each thread has its own string table, we only take a lock // to push a new table onto the vector. thread_local int64_t tableIndex = -1; @@ -52,14 +56,33 @@ PooledString::PooledString(const std::string& str) { } } +PooledString::PooledString(const std::string* str) { + storage[0] = StdString << 6; + + *(const std::string**)((void*)(storage + 8)) = str; +} + bool PooledStringNS::PooledString::operator==(const PooledString& other) const { // NOTE: We have surprising equality semantics! 
// - // For short strings, you are equal if the strings are equal. + // If one of the strings is a StdString, it's value equality. + // + // Else, for short strings, you are equal if the strings are equal. // // For large strings, you are equal if you use the same heap memory locations. // This implies that someone outside of PooledString is managing pooling! In our // case, it is the responsibility of AttributePairStore. + uint8_t kind = storage[0] >> 6; + uint8_t otherKind = other.storage[0] >> 6; + + if (kind == StdString || otherKind == StdString) { + size_t mySize = size(); + if (mySize != other.size()) + return false; + + return memcmp(data(), other.data(), mySize) == 0; + } + return memcmp(storage, other.storage, 16) == 0; } @@ -67,20 +90,44 @@ bool PooledStringNS::PooledString::operator!=(const PooledString& other) const { return !(*this == other); } +const char* PooledStringNS::PooledString::data() const { + uint8_t kind = storage[0] >> 6; + + if (kind == ShortString) + return (char *)(storage + 1); + + if (kind == StdString) { + const std::string* str = *(const std::string**)((void*)(storage + 8)); + return str->data(); + } + + uint32_t tableIndex = (storage[1] << 16) + (storage[2] << 8) + storage[3]; + uint16_t offset = (storage[4] << 8) + storage[5]; + + const char* data = tables[tableIndex] + offset; + return data; +} + size_t PooledStringNS::PooledString::size() const { + uint8_t kind = storage[0] >> 6; // If the uppermost bit is set, we're in heap. - if (storage[0] >> 7) { + if (kind == HeapString) { uint16_t length = (storage[6] << 8) + storage[7]; return length; } - // Otherwise it's stored in the lower 7 bits of the highest byte. - return storage[0] & 0b01111111; + if (kind == ShortString) + // Otherwise it's stored in the lower 7 bits of the highest byte. 
+ return storage[0] & 0b01111111; + + const std::string* str = *(const std::string**)((void*)(storage + 8)); + return str->size(); } std::string PooledStringNS::PooledString::toString() const { std::string rv; - if (storage[0] == 1 << 7) { + uint8_t kind = storage[0] >> 6; + if (kind == HeapString) { // heap rv.reserve(size()); @@ -92,7 +139,12 @@ std::string PooledStringNS::PooledString::toString() const { return rv; } - for (int i = 0; i < storage[0]; i++) - rv += storage[i + 1]; - return rv; + if (kind == ShortString) { + for (int i = 0; i < storage[0]; i++) + rv += storage[i + 1]; + return rv; + } + + const std::string* str = *(const std::string**)((void*)(storage + 8)); + return *str; } diff --git a/test/pooled_string.test.cpp b/test/pooled_string.test.cpp index d32d1ccd..91fb2da5 100644 --- a/test/pooled_string.test.cpp +++ b/test/pooled_string.test.cpp @@ -21,6 +21,27 @@ MU_TEST(test_pooled_string) { PooledString big2("this is also a quite long string"); mu_check(big != big2); mu_check(big.toString() != big2.toString()); + + std::string shortString("short"); + std::string longString("this is a very long string"); + + PooledString stdShortString(&shortString); + mu_check(stdShortString.size() == 5); + mu_check(stdShortString.toString() == "short"); + + PooledString stdLongString(&longString); + mu_check(stdLongString.size() == 26); + mu_check(stdLongString.toString() == "this is a very long string"); + + // PooledStrings that are backed by std::string have the usual + // == semantics. 
+ mu_check(stdShortString == PooledString("short")); + mu_check(PooledString("short") == stdShortString); + + mu_check(stdLongString == PooledString("this is a very long string")); + mu_check(PooledString("this is a very long string") == stdLongString); + + mu_check(stdShortString != stdLongString); } MU_TEST_SUITE(test_suite_pooled_string) { From efe6af959884304fe6783360ed94c8e29346dd07 Mon Sep 17 00:00:00 2001 From: Colin Dellow Date: Sat, 16 Dec 2023 18:02:12 -0500 Subject: [PATCH 11/49] use PooledString in AttributePair AttributePair has now been trimmed from 48 bytes to 18 bytes. There are 40M AttributeSets for the planet. That suggests there's probably ~30M AttributePairs, so hopefully this is a savings of ~900MB at the planet level. Runtime doesn't seem affected. There's a further opportunity for savings if we can make more strings qualify for the short string optimization. Only about 40% of strings fit in the 15 byte short string optimization. Of the remaining 60%, many are Latin-alphabet title cased strings like `Wellington Avenue` -- this could be encoded using 5 bits per letter, saving us an allocation. Even in the most optimistic case where: - there are 30M AttributePairs - of these, 90% are strings (= 27M) - of these, 60% don't fit in SSO (=16m) - of these, we can make 100% fit in SSO ...we only save about 256MB at the planet level, but at some significant complexity cost. So probably not worth pursuing at the moment. 
--- Makefile | 1 + include/attribute_store.h | 35 +++++++++++++---------------------- include/pooled_string.h | 2 ++ src/attribute_store.cpp | 26 ++++++++++++++++++++++++-- src/output_object.cpp | 9 ++++++--- src/pooled_string.cpp | 20 ++++++++++++++++++++ src/tilemaker.cpp | 1 - test/attribute_store.test.cpp | 34 ++++++++++++++++++++++++++++++++++ 8 files changed, 100 insertions(+), 28 deletions(-) diff --git a/Makefile b/Makefile index 8bd26611..f8ff37c3 100644 --- a/Makefile +++ b/Makefile @@ -130,6 +130,7 @@ test: test_attribute_store test_pooled_string test_sorted_way_store test_attribute_store: \ src/mmap_allocator.o \ src/attribute_store.o \ + src/pooled_string.o \ test/attribute_store.test.o $(CXX) $(CXXFLAGS) -o test.attribute_store $^ $(INC) $(LIB) $(LDFLAGS) && ./test.attribute_store diff --git a/include/attribute_store.h b/include/attribute_store.h index 345fc81d..194f62f6 100644 --- a/include/attribute_store.h +++ b/include/attribute_store.h @@ -49,14 +49,14 @@ struct AttributePair { char minzoom : 4; union { float floatValue_; - std::string stringValue_; + PooledString stringValue_; }; AttributePair(uint32_t keyIndex, bool value, char minzoom) : keyIndex(keyIndex), valueType(AttributePairType::Bool), minzoom(minzoom), floatValue_(value ? 
1 : 0) { } - AttributePair(uint32_t keyIndex, const std::string& value, char minzoom) + AttributePair(uint32_t keyIndex, const PooledString& value, char minzoom) : keyIndex(keyIndex), valueType(AttributePairType::String), stringValue_(value), minzoom(minzoom) { } @@ -65,13 +65,6 @@ struct AttributePair { { } - ~AttributePair() { - if (valueType == AttributePairType::Bool || valueType == AttributePairType::Float) - return; - - stringValue_.~basic_string(); - } - AttributePair(const AttributePair& other): keyIndex(other.keyIndex), valueType(other.valueType), minzoom(other.minzoom) { @@ -80,15 +73,10 @@ struct AttributePair { return; } - new (&stringValue_) std::string; stringValue_ = other.stringValue_; } AttributePair& operator=(const AttributePair& other) { - if (!(valueType == AttributePairType::Bool || valueType == AttributePairType::Float)) { - stringValue_.~basic_string(); - } - keyIndex = other.keyIndex; valueType = other.valueType; minzoom = other.minzoom; @@ -98,9 +86,7 @@ struct AttributePair { return *this; } - new (&stringValue_) std::string; stringValue_ = other.stringValue_; - return *this; } @@ -119,10 +105,13 @@ struct AttributePair { bool hasFloatValue() const { return valueType == AttributePairType::Float; } bool hasBoolValue() const { return valueType == AttributePairType::Bool; } - const std::string& stringValue() const { return stringValue_; } + const PooledString& pooledString() const { return stringValue_; } + const std::string stringValue() const { return stringValue_.toString(); } float floatValue() const { return floatValue_; } bool boolValue() const { return floatValue_; } + void ensureStringIsOwned(); + static bool isHot(const std::string& keyName, const std::string& value) { // Is this pair a candidate for the hot pool? 
@@ -153,9 +142,10 @@ struct AttributePair { boost::hash_combine(rv, keyIndex); boost::hash_combine(rv, valueType); - if(hasStringValue()) - boost::hash_combine(rv, stringValue()); - else if(hasFloatValue()) + if(hasStringValue()) { + const char* data = pooledString().data(); + boost::hash_range(rv, data, data + pooledString().size()); + } else if(hasFloatValue()) boost::hash_combine(rv, floatValue()); else if(hasBoolValue()) boost::hash_combine(rv, boolValue()); @@ -198,7 +188,7 @@ class AttributePairStore { void finalize() { finalized = true; } const AttributePair& getPair(uint32_t i) const; const AttributePair& getPairUnsafe(uint32_t i) const; - uint32_t addPair(const AttributePair& pair, bool isHot); + uint32_t addPair(AttributePair& pair, bool isHot); struct key_value_less_ptr { bool operator()(AttributePair const* lhs, AttributePair const* rhs) const { @@ -208,7 +198,7 @@ class AttributePairStore { return lhs->keyIndex < rhs->keyIndex; if (lhs->valueType != rhs->valueType) return lhs->valueType < rhs->valueType; - if (lhs->hasStringValue()) return lhs->stringValue() < rhs->stringValue(); + if (lhs->hasStringValue()) return lhs->pooledString() < rhs->pooledString(); if (lhs->hasBoolValue()) return lhs->boolValue() < rhs->boolValue(); if (lhs->hasFloatValue()) return lhs->floatValue() < rhs->floatValue(); throw std::runtime_error("Invalid type in attribute store"); @@ -410,6 +400,7 @@ struct AttributeSet { struct AttributeStore { AttributeIndex add(AttributeSet &attributes); std::vector getUnsafe(AttributeIndex index) const; + void reset(); // used for testing size_t size() const; void reportSize() const; void finalize(); diff --git a/include/pooled_string.h b/include/pooled_string.h index 95edd454..56d44453 100644 --- a/include/pooled_string.h +++ b/include/pooled_string.h @@ -43,10 +43,12 @@ namespace PooledStringNS { // pointed to is valid. 
PooledString(const std::string* str); size_t size() const; + bool operator<(const PooledString& other) const; bool operator==(const PooledString& other) const; bool operator!=(const PooledString& other) const; std::string toString() const; const char* data() const; + void ensureStringIsOwned(); private: // 0..3 is index into table, 4..5 is offset, 6..7 is length diff --git a/src/attribute_store.cpp b/src/attribute_store.cpp index beb68cf7..71c0925b 100644 --- a/src/attribute_store.cpp +++ b/src/attribute_store.cpp @@ -55,6 +55,16 @@ const std::string& AttributeKeyStore::getKeyUnsafe(uint16_t index) const { return keys[index]; } +// AttributePair +void AttributePair::ensureStringIsOwned() { + // Before we store an AttributePair in our long-term storage, we need + // to make sure it's not pointing to a non-long-lived std::string. + if (valueType == AttributePairType::Bool || valueType == AttributePairType::Float) + return; + + stringValue_.ensureStringIsOwned(); +} + // AttributePairStore thread_local boost::container::flat_map tlsHotShardMap; thread_local uint16_t tlsHotShardSize = 0; @@ -68,6 +78,7 @@ const AttributePair& AttributePairStore::getPair(uint32_t i) const { std::lock_guard lock(pairsMutex[shard]); return pairs[shard].at(offset); }; + const AttributePair& AttributePairStore::getPairUnsafe(uint32_t i) const { // NB: This is unsafe if called before the PBF has been fully read. // If called during the output phase, it's safe. @@ -81,7 +92,7 @@ const AttributePair& AttributePairStore::getPairUnsafe(uint32_t i) const { return pairs[shard].at(offset); }; -uint32_t AttributePairStore::addPair(const AttributePair& pair, bool isHot) { +uint32_t AttributePairStore::addPair(AttributePair& pair, bool isHot) { if (isHot) { { // First, check our thread-local map. 
@@ -109,6 +120,7 @@ uint32_t AttributePairStore::addPair(const AttributePair& pair, bool isHot) { hotShardSize++; uint32_t offset = hotShardSize.load(); + pair.ensureStringIsOwned(); hotShard[offset] = pair; const AttributePair* ptr = &hotShard[offset]; uint32_t rv = (0 << (32 - SHARD_BITS)) + offset; @@ -138,6 +150,7 @@ uint32_t AttributePairStore::addPair(const AttributePair& pair, bool isHot) { if (offset >= (1 << (32 - SHARD_BITS))) throw std::out_of_range("pair shard overflow"); + pair.ensureStringIsOwned(); pairs[shard].push_back(pair); const AttributePair* ptr = &pairs[shard][offset]; uint32_t rv = (shard << (32 - SHARD_BITS)) + offset; @@ -199,7 +212,8 @@ void AttributeSet::removePairWithKey(const AttributePairStore& pairStore, uint32 } void AttributeStore::addAttribute(AttributeSet& attributeSet, std::string const &key, const std::string& v, char minzoom) { - AttributePair kv(keyStore.key2index(key),v,minzoom); + PooledString ps(&v); + AttributePair kv(keyStore.key2index(key), ps, minzoom); bool isHot = AttributePair::isHot(key, v); attributeSet.removePairWithKey(pairStore, kv.keyIndex); attributeSet.addPair(pairStore.addPair(kv, isHot)); @@ -373,6 +387,14 @@ void AttributeStore::reportSize() const { } } +void AttributeStore::reset() { + // This is only used for tests. 
+ tlsKeys2Index.clear(); + tlsKeys2IndexSize = 0; + tlsHotShardMap.clear(); + tlsHotShardSize = 0; +} + void AttributeStore::finalize() { finalized = true; keyStore.finalize(); diff --git a/src/output_object.cpp b/src/output_object.cpp index b68fb27f..7f9f0edb 100644 --- a/src/output_object.cpp +++ b/src/output_object.cpp @@ -87,9 +87,12 @@ void OutputObject::writeAttributes( int OutputObject::findValue(const vector* valueList, const AttributePair& value) const { for (size_t i=0; isize(); i++) { const vector_tile::Tile_Value& v = valueList->at(i); - if (v.has_string_value() && value.hasStringValue() && v.string_value()==value.stringValue()) { return i; } - if (v.has_float_value() && value.hasFloatValue() && v.float_value() ==value.floatValue() ) { return i; } - if (v.has_bool_value() && value.hasBoolValue() && v.bool_value() ==value.boolValue() ) { return i; } + if (v.has_string_value() && value.hasStringValue()) { + const size_t valueSize = value.pooledString().size(); + if (valueSize == v.string_value().size() && memcmp(v.string_value().data(), value.pooledString().data(), valueSize) == 0) + return i; + } else if (v.has_float_value() && value.hasFloatValue() && v.float_value() ==value.floatValue() ) { return i; } + else if (v.has_bool_value() && value.hasBoolValue() && v.bool_value() ==value.boolValue() ) { return i; } } return -1; } diff --git a/src/pooled_string.cpp b/src/pooled_string.cpp index cc4532f4..500408d4 100644 --- a/src/pooled_string.cpp +++ b/src/pooled_string.cpp @@ -148,3 +148,23 @@ std::string PooledStringNS::PooledString::toString() const { const std::string* str = *(const std::string**)((void*)(storage + 8)); return *str; } + +void PooledStringNS::PooledString::ensureStringIsOwned() { + uint8_t kind = storage[0] >> 6; + + if (kind != StdString) + return; + + *this = PooledString(toString()); +} + +bool PooledStringNS::PooledString::operator<(const PooledString& other) const { + size_t mySize = size(); + size_t otherSize = other.size(); + + if 
(mySize != otherSize) + return mySize < otherSize; + + return memcmp(data(), other.data(), mySize) < 0; +} + diff --git a/src/tilemaker.cpp b/src/tilemaker.cpp index 68540384..cdc01975 100644 --- a/src/tilemaker.cpp +++ b/src/tilemaker.cpp @@ -163,7 +163,6 @@ vector parseBox(const string& bbox) { * Worker threads write the output tiles, and start in the outputProc function. */ int main(int argc, char* argv[]) { - // ---- Read command-line options vector inputFiles; string luaFile; diff --git a/test/attribute_store.test.cpp b/test/attribute_store.test.cpp index 4fb1f979..3f2e28e5 100644 --- a/test/attribute_store.test.cpp +++ b/test/attribute_store.test.cpp @@ -5,6 +5,7 @@ MU_TEST(test_attribute_store) { AttributeStore store; + store.reset(); mu_check(store.size() == 0); @@ -56,11 +57,44 @@ MU_TEST(test_attribute_store) { mu_check(float1 != s1Pairs.end()); mu_check((*float1)->hasFloatValue()); mu_check((*float1)->floatValue() == 42); +} + +MU_TEST(test_attribute_store_reuses) { + AttributeStore store; + store.reset(); + + mu_check(store.size() == 0); + + { + AttributeSet s1a; + store.addAttribute(s1a, "str1", std::string("someval"), 0); + const auto s1aIndex = store.add(s1a); + + AttributeSet s1b; + store.addAttribute(s1b, "str1", std::string("someval"), 0); + const auto s1bIndex = store.add(s1b); + + mu_check(s1aIndex == s1bIndex); + } + + { + AttributeSet s1a; + store.addAttribute(s1a, "str1", std::string("this is a very long string"), 0); + const auto s1aIndex = store.add(s1a); + + AttributeSet s1b; + store.addAttribute(s1b, "str1", std::string("this is a very long string"), 0); + const auto s1bIndex = store.add(s1b); + + mu_check(s1aIndex == s1bIndex); + } + } MU_TEST_SUITE(test_suite_attribute_store) { MU_RUN_TEST(test_attribute_store); + MU_RUN_TEST(test_attribute_store_reuses); } int main() { From 9394bc75c845ae1f475184162609c0a62fa920db Mon Sep 17 00:00:00 2001 From: Colin Dellow Date: Sat, 16 Dec 2023 22:15:22 -0500 Subject: [PATCH 12/49] log timings When 
doing the planet, especially on a box with limited memory, there are long periods with no output. Show some output so the user doesn't think things are hung. This also might be useful in detecting perf regressions more granularly. --- include/osm_mem_tiles.h | 2 ++ include/shp_mem_tiles.h | 2 ++ include/tile_data.h | 21 +++++++++++++++++++++ src/read_pbf.cpp | 16 +++++++++++++--- src/tile_data.cpp | 4 ++-- src/tilemaker.cpp | 24 ++++++++++++++++++++++-- 6 files changed, 62 insertions(+), 7 deletions(-) diff --git a/include/osm_mem_tiles.h b/include/osm_mem_tiles.h index 74aeb18f..e7aff7ee 100644 --- a/include/osm_mem_tiles.h +++ b/include/osm_mem_tiles.h @@ -37,6 +37,8 @@ class OsmMemTiles : public TileDataSource { const WayStore& wayStore ); + std::string name() const override { return "osm"; } + Geometry buildWayGeometry( const OutputGeometryType geomType, const NodeID objectID, diff --git a/include/shp_mem_tiles.h b/include/shp_mem_tiles.h index 267a0090..508921ff 100644 --- a/include/shp_mem_tiles.h +++ b/include/shp_mem_tiles.h @@ -11,6 +11,8 @@ class ShpMemTiles : public TileDataSource public: ShpMemTiles(size_t threadNum, uint baseZoom); + std::string name() const override { return "shp"; } + void CreateNamedLayerIndex(const std::string& layerName); // Used in shape file loading diff --git a/include/tile_data.h b/include/tile_data.h index e02a0255..f40c754c 100644 --- a/include/tile_data.h +++ b/include/tile_data.h @@ -46,13 +46,31 @@ struct OutputObjectXYID { }; template void finalizeObjects( + const std::string& name, const size_t& threadNum, const unsigned int& baseZoom, typename std::vector>>::iterator begin, typename std::vector>>::iterator end, typename std::vector>>& lowZoom ) { +#ifdef CLOCK_MONOTONIC + timespec startTs, endTs; + clock_gettime(CLOCK_MONOTONIC, &startTs); +#endif + + int i = 0; for (auto it = begin; it != end; it++) { + i++; + if (i % 10 == 0 || i == 4096) { + std::cout << "\r" << name << ": finalizing z6 tile " << i << "/" << 
CLUSTER_ZOOM_AREA; + +#ifdef CLOCK_MONOTONIC + clock_gettime(CLOCK_MONOTONIC, &endTs); + uint64_t elapsedNs = 1e9 * (endTs.tv_sec - startTs.tv_sec) + endTs.tv_nsec - startTs.tv_nsec; + std::cout << " (" << std::to_string((uint32_t)(elapsedNs / 1e6)) << " ms)"; +#endif + std::cout << std::flush; + } if (it->size() == 0) continue; @@ -109,6 +127,8 @@ template void finalizeObjects( threadNum ); } + + std::cout << std::endl; } template void collectTilesWithObjectsAtZoomTemplate( @@ -280,6 +300,7 @@ class TileDataSource { std::vector> availableMultiLinestringStoreLeases; std::vector> availableMultiPolygonStoreLeases; + virtual std::string name() const = 0; protected: size_t numShards; diff --git a/src/read_pbf.cpp b/src/read_pbf.cpp index 605618fa..2eb1795c 100644 --- a/src/read_pbf.cpp +++ b/src/read_pbf.cpp @@ -273,7 +273,7 @@ bool PbfReader::ReadBlock( if (ioMutex.try_lock()) { std::ostringstream str; void_mmap_allocator::reportStoreSize(str); - str << "Block " << blocksProcessed.load() << "/" << blocksToProcess.load() << " ways " << pg.ways_size() << " relations " << pg.relations_size() << " \r"; + str << "\rBlock " << blocksProcessed.load() << "/" << blocksToProcess.load() << " ways " << pg.ways_size() << " relations " << pg.relations_size() << " "; std::cout << str.str(); std::cout.flush(); ioMutex.unlock(); @@ -293,7 +293,7 @@ bool PbfReader::ReadBlock( osmStore.ensureUsedWaysInited(); bool done = ScanRelations(output, pg, pb); if(done) { - std::cout << "(Scanning for ways used in relations: " << (100*blocksProcessed.load()/blocksToProcess.load()) << "%)\r"; + std::cout << "\r(Scanning for ways used in relations: " << (100*blocksProcessed.load()/blocksToProcess.load()) << "%) "; std::cout.flush(); continue; } @@ -459,6 +459,11 @@ int PbfReader::ReadPbfFile( std::vector all_phases = { ReadPhase::Nodes, ReadPhase::RelationScan, ReadPhase::Ways, ReadPhase::Relations }; for(auto phase: all_phases) { +#ifdef CLOCK_MONOTONIC + timespec start, end; + 
clock_gettime(CLOCK_MONOTONIC, &start); +#endif + // Launch the pool with threadNum threads boost::asio::thread_pool pool(threadNum); std::mutex block_mutex; @@ -529,8 +534,8 @@ int PbfReader::ReadPbfFile( if(ReadBlock(*infile, *output, indexedBlockMetadata, nodeKeys, locationsOnWays, phase)) { const std::lock_guard lock(block_mutex); blocks.erase(indexedBlockMetadata.index); - blocksProcessed++; } + blocksProcessed++; } }); } @@ -538,6 +543,11 @@ int PbfReader::ReadPbfFile( pool.join(); +#ifdef CLOCK_MONOTONIC + clock_gettime(CLOCK_MONOTONIC, &end); + uint64_t elapsedNs = 1e9 * (end.tv_sec - start.tv_sec) + end.tv_nsec - start.tv_nsec; + std::cout << "(" << std::to_string((uint32_t)(elapsedNs / 1e6)) << " ms)" << std::endl; +#endif if(phase == ReadPhase::Nodes) { osmStore.nodes.finalize(threadNum); } diff --git a/src/tile_data.cpp b/src/tile_data.cpp index a50ddfea..d3fc15c2 100644 --- a/src/tile_data.cpp +++ b/src/tile_data.cpp @@ -74,8 +74,8 @@ TileDataSource::TileDataSource(size_t threadNum, unsigned int baseZoom, bool inc } void TileDataSource::finalize(size_t threadNum) { - finalizeObjects(threadNum, baseZoom, objects.begin(), objects.end(), lowZoomObjects); - finalizeObjects(threadNum, baseZoom, objectsWithIds.begin(), objectsWithIds.end(), lowZoomObjectsWithIds); + finalizeObjects(name(), threadNum, baseZoom, objects.begin(), objects.end(), lowZoomObjects); + finalizeObjects(name(), threadNum, baseZoom, objectsWithIds.begin(), objectsWithIds.end(), lowZoomObjectsWithIds); } diff --git a/src/tilemaker.cpp b/src/tilemaker.cpp index cdc01975..8a3f6419 100644 --- a/src/tilemaker.cpp +++ b/src/tilemaker.cpp @@ -506,8 +506,16 @@ int main(int argc, char* argv[]) { } std::deque> tileCoordinates; + std::cout << "collecting tiles:"; for (uint zoom=sharedData.config.startZoom; zoom <= sharedData.config.endZoom; zoom++) { + std::cout << " z" << std::to_string(zoom) << std::flush; +#ifdef CLOCK_MONOTONIC + timespec start, end; + clock_gettime(CLOCK_MONOTONIC, &start); 
+#endif + auto zoomResult = getTilesAtZoom(sources, zoom); + int numTiles = 0; for (int x = 0; x < 1 << zoom; x++) { for (int y = 0; y < 1 << zoom; y++) { if (!zoomResult.test(x, y)) @@ -533,10 +541,22 @@ int main(int argc, char* argv[]) { } tileCoordinates.push_back(std::make_pair(zoom, TileCoordinates(x, y))); + numTiles++; } } + + std::cout << " (" << numTiles; +#ifdef CLOCK_MONOTONIC + clock_gettime(CLOCK_MONOTONIC, &end); + uint64_t tileNs = 1e9 * (end.tv_sec - start.tv_sec) + end.tv_nsec - start.tv_nsec; + std::cout << ", " << (uint32_t)(tileNs / 1e6) << "ms"; + +#endif + std::cout << ")" << std::flush; } + std::cout << std::endl; + // Cluster tiles: breadth-first for z0..z5, depth-first for z6 const size_t baseZoom = config.baseZoom; boost::sort::block_indirect_sort( @@ -615,7 +635,7 @@ int main(int argc, char* argv[]) { unsigned int zoom = tileCoordinates[i].first; TileCoordinates coords = tileCoordinates[i].second; -#ifndef _WIN32 +#ifdef CLOCK_MONOTONIC timespec start, end; if (logTileTimings) clock_gettime(CLOCK_MONOTONIC, &start); @@ -627,7 +647,7 @@ int main(int argc, char* argv[]) { } outputProc(sharedData, sources, attributeStore, data, coords, zoom); -#ifndef _WIN32 +#ifdef CLOCK_MONOTONIC if (logTileTimings) { clock_gettime(CLOCK_MONOTONIC, &end); uint64_t tileNs = 1e9 * (end.tv_sec - start.tv_sec) + end.tv_nsec - start.tv_nsec; From 3020011054c4813d84918b03493dd902e7961bd5 Mon Sep 17 00:00:00 2001 From: Colin Dellow Date: Sun, 17 Dec 2023 00:37:04 -0500 Subject: [PATCH 13/49] AppendVector: an append-only chunked vector When using --store, deque is nice because growing doesn't require invalidating the old storage and copying it to a new location. However, it's also bad, because deque allocates in 512-byte chunks, which causes each 4KB OS page to have data from different z6 tiles. Instead, use our own container that tries to get the best of both worlds. Writing a random access iterator is new for me, so I don't trust this code that much. 
The saving grace is that the container is very limited, so errors in the iterator implementation may not get exercised in practice. --- Makefile | 7 +- include/append_vector.h | 193 ++++++++++++++++++++++++++++++++++++ include/tile_data.h | 21 ++-- test/append_vector.test.cpp | 85 ++++++++++++++++ 4 files changed, 294 insertions(+), 12 deletions(-) create mode 100644 include/append_vector.h create mode 100644 test/append_vector.test.cpp diff --git a/Makefile b/Makefile index f8ff37c3..1bddb089 100644 --- a/Makefile +++ b/Makefile @@ -125,7 +125,12 @@ tilemaker: \ src/write_geometry.o $(CXX) $(CXXFLAGS) -o tilemaker $^ $(INC) $(LIB) $(LDFLAGS) -test: test_attribute_store test_pooled_string test_sorted_way_store +test: test_append_vector test_attribute_store test_pooled_string test_sorted_way_store + +test_append_vector: \ + src/mmap_allocator.o \ + test/append_vector.test.o + $(CXX) $(CXXFLAGS) -o test.append_vector $^ $(INC) $(LIB) $(LDFLAGS) && ./test.append_vector test_attribute_store: \ src/mmap_allocator.o \ diff --git a/include/append_vector.h b/include/append_vector.h new file mode 100644 index 00000000..3fe9b907 --- /dev/null +++ b/include/append_vector.h @@ -0,0 +1,193 @@ +#ifndef _APPEND_VECTOR_H +#define _APPEND_VECTOR_H + +#include "mmap_allocator.h" +#include +#include + +// Tilemaker collects OutputObjects in a list that +// - spills to disk +// - only gets appended to +// +// Vector is great for linear access, but resizes cause expensive disk I/O to +// copy elements. +// +// Deque is great for growing without disk I/O, but it allocates in blocks of 512, +// which is inefficient for linear access. +// +// Instead, we author a limited vector-of-vectors class that allocates in bigger chunks, +// to get the best of both worlds. 
+ +#define APPEND_VECTOR_SIZE 8192 +namespace AppendVectorNS { + template + class AppendVector { + public: + struct Iterator { + using iterator_category = std::random_access_iterator_tag; + using difference_type = std::ptrdiff_t; + using value_type = T; + using pointer = T*; + using reference = T&; + + Iterator(AppendVector& appendVector, uint16_t vec, uint16_t offset): + appendVector(&appendVector), vec(vec), offset(offset) {} + + Iterator(): + appendVector(nullptr), vec(0), offset(0) {} + + + bool operator<(const Iterator& other) const { + if (vec < other.vec) + return true; + + if (vec > other.vec) + return false; + + return offset < other.offset; + } + + bool operator>=(const Iterator& other) const { + return !(*this < other); + } + + Iterator operator-(int delta) const { + int64_t absolute = vec * APPEND_VECTOR_SIZE + offset; + absolute -= delta; + return Iterator(*appendVector, absolute / APPEND_VECTOR_SIZE, absolute % APPEND_VECTOR_SIZE); + } + + Iterator operator+(int delta) const { + int64_t absolute = vec * APPEND_VECTOR_SIZE + offset; + absolute += delta; + return Iterator(*appendVector, absolute / APPEND_VECTOR_SIZE, absolute % APPEND_VECTOR_SIZE); + } + + bool operator==(const Iterator& other) const { + return appendVector == other.appendVector && vec == other.vec && offset == other.offset; + } + + bool operator!=(const Iterator& other) const { + return !(*this == other); + } + + std::ptrdiff_t operator-(const Iterator& other) const { + int64_t absolute = vec * APPEND_VECTOR_SIZE + offset; + int64_t otherAbsolute = other.vec * APPEND_VECTOR_SIZE + other.offset; + + return absolute - otherAbsolute; + } + + reference operator*() const { + auto& vector = appendVector->vecs[vec]; + auto& el = vector[offset]; + return el; + } + + pointer operator->() const { + auto& vector = appendVector->vecs[vec]; + auto& el = vector[offset]; + return ⪙ + } + + Iterator& operator+= (int delta) { + int64_t absolute = vec * APPEND_VECTOR_SIZE + offset; + absolute += delta; 
+ + vec = absolute / APPEND_VECTOR_SIZE; + offset = absolute % APPEND_VECTOR_SIZE; + return *this; + } + + Iterator& operator-= (int delta) { + int64_t absolute = vec * APPEND_VECTOR_SIZE + offset; + absolute -= delta; + + vec = absolute / APPEND_VECTOR_SIZE; + offset = absolute % APPEND_VECTOR_SIZE; + return *this; + } + + // Prefix increment + Iterator& operator++() { + offset++; + if (offset == APPEND_VECTOR_SIZE) { + offset = 0; + vec++; + } + return *this; + } + + // Postfix increment + Iterator operator++(int) { Iterator tmp = *this; ++(*this); return tmp; } + + // Prefix decrement + Iterator& operator--() { + if (offset > 0) { + offset--; + } else { + vec--; + offset = APPEND_VECTOR_SIZE - 1; + } + + return *this; + } + + // Postfix decrement + Iterator operator--(int) { Iterator tmp = *this; --(*this); return tmp; } + + private: + mutable AppendVector* appendVector; + int32_t vec, offset; + }; + + AppendVector(): + count(0), + vecs(1) { + } + + void clear() { + count = 0; + vecs.clear(); + vecs.push_back(std::vector>()); + vecs.back().reserve(APPEND_VECTOR_SIZE); + } + + size_t size() const { + return count; + } + + T& operator [](int idx) { + return vecs[idx / APPEND_VECTOR_SIZE][idx % APPEND_VECTOR_SIZE]; + } + + Iterator begin() { + return Iterator(*this, 0, 0); + } + + Iterator end() { + return Iterator(*this, vecs.size() - 1, count % APPEND_VECTOR_SIZE); + } + + void push_back(const T& el) { + if (vecs.back().capacity() == 0) + vecs.back().reserve(APPEND_VECTOR_SIZE); + + vecs.back().push_back(el); + + if (vecs.back().size() == vecs.back().capacity()) { + vecs.push_back(std::vector>()); + vecs.back().reserve(APPEND_VECTOR_SIZE); + } + + count++; + } + + size_t count; + std::deque>> vecs; + }; +} + +#undef APPEND_VECTOR_SIZE + +#endif diff --git a/include/tile_data.h b/include/tile_data.h index f40c754c..78793c27 100644 --- a/include/tile_data.h +++ b/include/tile_data.h @@ -8,6 +8,7 @@ #include #include #include "output_object.h" +#include 
"append_vector.h" #include "clip_cache.h" #include "mmap_allocator.h" @@ -49,9 +50,9 @@ template void finalizeObjects( const std::string& name, const size_t& threadNum, const unsigned int& baseZoom, - typename std::vector>>::iterator begin, - typename std::vector>>::iterator end, - typename std::vector>>& lowZoom + typename std::vector>::iterator begin, + typename std::vector>::iterator end, + typename std::vector>& lowZoom ) { #ifdef CLOCK_MONOTONIC timespec startTs, endTs; @@ -74,8 +75,6 @@ template void finalizeObjects( if (it->size() == 0) continue; - it->shrink_to_fit(); - for (auto objectIt = it->begin(); objectIt != it->end(); objectIt++) if (objectIt->oo.minZoom < CLUSTER_ZOOM) lowZoom[0].push_back(*objectIt); @@ -133,7 +132,7 @@ template void finalizeObjects( template void collectTilesWithObjectsAtZoomTemplate( const unsigned int& baseZoom, - const typename std::vector>>::iterator objects, + const typename std::vector>::iterator objects, const size_t size, const unsigned int zoom, TileCoordinatesSet& output @@ -175,7 +174,7 @@ inline OutputObjectID outputObjectWithId(const OutputObjectXYI template void collectObjectsForTileTemplate( const unsigned int& baseZoom, - typename std::vector>>::iterator objects, + typename std::vector>::iterator objects, size_t iStart, size_t iEnd, unsigned int zoom, @@ -318,10 +317,10 @@ class TileDataSource { // // If config.include_ids is true, objectsWithIds will be populated. // Otherwise, objects. 
- std::vector>> objects; - std::vector>> lowZoomObjects; - std::vector>> objectsWithIds; - std::vector>> lowZoomObjectsWithIds; + std::vector> objects; + std::vector> lowZoomObjects; + std::vector> objectsWithIds; + std::vector> lowZoomObjectsWithIds; // rtree index of large objects using oo_rtree_param_type = boost::geometry::index::quadratic<128>; diff --git a/test/append_vector.test.cpp b/test/append_vector.test.cpp new file mode 100644 index 00000000..300f6e30 --- /dev/null +++ b/test/append_vector.test.cpp @@ -0,0 +1,85 @@ +#include +#include +#include "external/minunit.h" +#include "append_vector.h" + +using namespace AppendVectorNS; + +MU_TEST(test_append_vector) { + AppendVector vec; + mu_check(vec.size() == 0); + + for (int i = 0; i < 10000; i++) { + vec.push_back(i); + } + mu_check(vec.size() == 10000); + + mu_check(vec[25] == 25); + + const AppendVector::Iterator& it = vec.begin(); + mu_check(*it == 0); + mu_check(*(it + 1) == 1); + mu_check(*(it + 2) == 2); + mu_check(*(it + 9000) == 9000); + mu_check(*(it + 1 - 1) == 0); + mu_check(*(vec.end() + -1) == 9999); + mu_check(*(vec.end() - 1) == 9999); + mu_check(*(vec.end() - 2) == 9998); + mu_check(*(vec.end() - 9000) == 1000); + mu_check(*(vec.begin() - -1) == 1); + + boost::sort::block_indirect_sort( + vec.begin(), + vec.end(), + [](auto const &a, auto const&b) { return b < a; }, + 1 + ); + + mu_check(vec[0] == 9999); + mu_check(vec[9999] == 0); + + boost::sort::block_indirect_sort( + vec.begin(), + vec.end(), + [](auto const &a, auto const&b) { return a < b; }, + 1 + ); + + mu_check(vec[0] == 0); + mu_check(vec[9999] == 9999); + + auto iter = std::lower_bound( + vec.begin(), + vec.end(), + 123, + [](const uint32_t& a, const uint32_t& toFind) { + return a < toFind; + } + ); + + mu_check(iter != vec.end()); + mu_check(*iter == 123); + + iter = std::lower_bound( + vec.begin(), + vec.end(), + 123123, + [](const uint32_t& a, const uint32_t& toFind) { + return a < toFind; + } + ); + + mu_check(iter == 
vec.end()); + +} + +MU_TEST_SUITE(test_suite_append_vector) { + MU_RUN_TEST(test_append_vector); +} + +int main() { + MU_RUN_SUITE(test_suite_append_vector); + MU_REPORT(); + return MU_EXIT_CODE; +} + From 330b0a79a7e815df9af4bc0050b24a9d0f699647 Mon Sep 17 00:00:00 2001 From: Colin Dellow Date: Sun, 17 Dec 2023 01:22:47 -0500 Subject: [PATCH 14/49] fix progress when --store present --- src/read_pbf.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/read_pbf.cpp b/src/read_pbf.cpp index 2eb1795c..ff585d03 100644 --- a/src/read_pbf.cpp +++ b/src/read_pbf.cpp @@ -272,8 +272,9 @@ bool PbfReader::ReadBlock( { if (ioMutex.try_lock()) { std::ostringstream str; + str << "\r"; void_mmap_allocator::reportStoreSize(str); - str << "\rBlock " << blocksProcessed.load() << "/" << blocksToProcess.load() << " ways " << pg.ways_size() << " relations " << pg.relations_size() << " "; + str << "Block " << blocksProcessed.load() << "/" << blocksToProcess.load() << " ways " << pg.ways_size() << " relations " << pg.relations_size() << " "; std::cout << str.str(); std::cout.flush(); ioMutex.unlock(); From 9d97d30f8999c31b2af456b7ec7c21197eec606d Mon Sep 17 00:00:00 2001 From: Colin Dellow Date: Sun, 17 Dec 2023 08:51:14 -0500 Subject: [PATCH 15/49] mutex on RelationScan progress output --- src/read_pbf.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/read_pbf.cpp b/src/read_pbf.cpp index ff585d03..0202a67d 100644 --- a/src/read_pbf.cpp +++ b/src/read_pbf.cpp @@ -294,8 +294,11 @@ bool PbfReader::ReadBlock( osmStore.ensureUsedWaysInited(); bool done = ScanRelations(output, pg, pb); if(done) { - std::cout << "\r(Scanning for ways used in relations: " << (100*blocksProcessed.load()/blocksToProcess.load()) << "%) "; - std::cout.flush(); + if (ioMutex.try_lock()) { + std::cout << "\r(Scanning for ways used in relations: " << (100*blocksProcessed.load()/blocksToProcess.load()) << "%) "; + std::cout.flush(); + ioMutex.unlock(); + } 
continue; } } From f9993cf9534cd933903f4396ee8d0c91ffe3efd5 Mon Sep 17 00:00:00 2001 From: Colin Dellow Date: Sun, 17 Dec 2023 09:49:00 -0500 Subject: [PATCH 16/49] make NodeStore/WayStore shardable This adds three methods to the stores: - `shard()` returns which shard you are - `shards()` returns how many shards total - `contains(shard, id)` returns whether or not shard N has an item with id X SortedNodeStore/SortedWayStore are not implemented yet, that'll come in a future commit. This will allow us to create a `ShardedNodeStore` and `ShardedWayStore` that contain N stores. We will try to ensure that each store has data that is geographically close to each other. Then, when reading, we'll do multiple passes of the PBF to populate each store. This should let us reduce the working set used to populate the stores, at the cost of additional linear scans of the PBF. Linear scans of disk are much less painful than random scans, so that should be a good trade. --- include/node_store.h | 4 ++++ include/node_stores.h | 11 +++++++++++ include/sorted_node_store.h | 4 ++++ include/sorted_way_store.h | 4 ++++ include/way_store.h | 4 ++++ include/way_stores.h | 4 ++++ src/node_stores.cpp | 11 +++++++++++ src/way_stores.cpp | 8 ++++++++ test/sorted_way_store.test.cpp | 4 ++++ 9 files changed, 54 insertions(+) diff --git a/include/node_store.h b/include/node_store.h index cc84aba2..9ef2a4c6 100644 --- a/include/node_store.h +++ b/include/node_store.h @@ -23,6 +23,10 @@ class NodeStore // Accessors virtual size_t size() const = 0; virtual LatpLon at(NodeID i) const = 0; + + virtual bool contains(size_t shard, NodeID id) const = 0; + virtual size_t shard() const = 0; + virtual size_t shards() const = 0; }; #endif diff --git a/include/node_stores.h b/include/node_stores.h index c5151bec..f093081f 100644 --- a/include/node_stores.h +++ b/include/node_stores.h @@ -24,6 +24,11 @@ class BinarySearchNodeStore : public NodeStore } void batchStart() {} + bool contains(size_t shard, NodeID 
id) const override; + size_t shard() const override { return 0; } + size_t shards() const override { return 1; } + + private: mutable std::mutex mutex; std::vector> mLatpLons; @@ -51,6 +56,12 @@ class CompactNodeStore : public NodeStore void finalize(size_t numThreads) override {} void batchStart() {} + // CompactNodeStore has no metadata to know whether or not it contains + // a node, so it's not suitable for used in sharded scenarios. + bool contains(size_t shard, NodeID id) const override { return true; } + size_t shard() const override { return 0; } + size_t shards() const override { return 1; } + private: // @brief Insert a latp/lon pair. // @param i OSM ID of a node diff --git a/include/sorted_node_store.h b/include/sorted_node_store.h index 5c156ad3..20bad4e0 100644 --- a/include/sorted_node_store.h +++ b/include/sorted_node_store.h @@ -69,6 +69,10 @@ class SortedNodeStore : public NodeStore reopen(); } + bool contains(size_t shard, NodeID ID) const override { throw std::runtime_error("SortedNodeStore::contains not implemented"); } + size_t shard() const override { return 0; } + size_t shards() const override { return 1; } + private: // When true, store chunks compressed. Only store compressed if the // chunk is sufficiently large. 
diff --git a/include/sorted_way_store.h b/include/sorted_way_store.h index 145e467b..448fffda 100644 --- a/include/sorted_way_store.h +++ b/include/sorted_way_store.h @@ -93,6 +93,10 @@ class SortedWayStore: public WayStore { void clear() override; std::size_t size() const override; void finalize(unsigned int threadNum) override; + + bool contains(size_t shard, WayID id) const override { throw std::runtime_error("SortedWayStore::contains not implemented"); } + size_t shard() const override { return 0; } + size_t shards() const override { return 1; } static uint16_t encodeWay( const std::vector& way, diff --git a/include/way_store.h b/include/way_store.h index 8650cbea..5e274a5c 100644 --- a/include/way_store.h +++ b/include/way_store.h @@ -21,6 +21,10 @@ class WayStore { virtual void clear() = 0; virtual std::size_t size() const = 0; virtual void finalize(unsigned int threadNum) = 0; + + virtual bool contains(size_t shard, WayID id) const = 0; + virtual size_t shard() const = 0; + virtual size_t shards() const = 0; }; #endif diff --git a/include/way_stores.h b/include/way_stores.h index dfb5f74c..4ed8db7e 100644 --- a/include/way_stores.h +++ b/include/way_stores.h @@ -21,6 +21,10 @@ class BinarySearchWayStore: public WayStore { std::size_t size() const override; void finalize(unsigned int threadNum) override; + bool contains(size_t shard, WayID id) const override; + size_t shard() const override { return 0; } + size_t shards() const override { return 1; } + private: mutable std::mutex mutex; std::unique_ptr mLatpLonLists; diff --git a/src/node_stores.cpp b/src/node_stores.cpp index 8c84b811..06e2fc5e 100644 --- a/src/node_stores.cpp +++ b/src/node_stores.cpp @@ -14,6 +14,17 @@ void BinarySearchNodeStore::reopen() } } +bool BinarySearchNodeStore::contains(size_t shard, NodeID i) const { + auto internalShard = mLatpLons[shardPart(i)]; + auto id = idPart(i); + + auto iter = std::lower_bound(internalShard->begin(), internalShard->end(), id, [](auto const &e, auto i) { 
+ return e.first < i; + }); + + return !(iter == internalShard->end() || iter->first != id); +} + LatpLon BinarySearchNodeStore::at(NodeID i) const { auto shard = mLatpLons[shardPart(i)]; auto id = idPart(i); diff --git a/src/way_stores.cpp b/src/way_stores.cpp index 05d884d0..e19cbf5a 100644 --- a/src/way_stores.cpp +++ b/src/way_stores.cpp @@ -14,6 +14,14 @@ void BinarySearchWayStore::reopen() { mLatpLonLists = std::make_unique(); } +bool BinarySearchWayStore::contains(size_t shard, WayID id) const { + auto iter = std::lower_bound(mLatpLonLists->begin(), mLatpLonLists->end(), id, [](auto const &e, auto id) { + return e.first < id; + }); + + return !(iter == mLatpLonLists->end() || iter->first != id); +} + std::vector BinarySearchWayStore::at(WayID wayid) const { std::lock_guard lock(mutex); diff --git a/test/sorted_way_store.test.cpp b/test/sorted_way_store.test.cpp index 1c50a494..217a1110 100644 --- a/test/sorted_way_store.test.cpp +++ b/test/sorted_way_store.test.cpp @@ -13,6 +13,10 @@ class TestNodeStore : public NodeStore { return { (int32_t)id, -(int32_t)id }; } void insert(const std::vector>& elements) override {} + + bool contains(size_t shard, NodeID id) const override { return true; } + size_t shard() const override { return 0; } + size_t shards() const override { return 1; } }; void roundtripWay(const std::vector& way) { From b49b1e7da4f8293518451e4a367e9cd846074177 Mon Sep 17 00:00:00 2001 From: Colin Dellow Date: Sun, 17 Dec 2023 10:08:02 -0500 Subject: [PATCH 17/49] add minimal SortedNodeStore test I'm going to rejig the innards of this class, so let's have some tests. 
--- Makefile | 17 ++++++++++++++++- test/sorted_node_store.test.cpp | 27 +++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 1 deletion(-) create mode 100644 test/sorted_node_store.test.cpp diff --git a/Makefile b/Makefile index 1bddb089..9eae44c3 100644 --- a/Makefile +++ b/Makefile @@ -125,7 +125,12 @@ tilemaker: \ src/write_geometry.o $(CXX) $(CXXFLAGS) -o tilemaker $^ $(INC) $(LIB) $(LDFLAGS) -test: test_append_vector test_attribute_store test_pooled_string test_sorted_way_store +test: \ + test_append_vector \ + test_attribute_store \ + test_pooled_string \ + test_sorted_node_store \ + test_sorted_way_store test_append_vector: \ src/mmap_allocator.o \ @@ -145,6 +150,16 @@ test_pooled_string: \ test/pooled_string.test.o $(CXX) $(CXXFLAGS) -o test.pooled_string $^ $(INC) $(LIB) $(LDFLAGS) && ./test.pooled_string +test_sorted_node_store: \ + src/external/streamvbyte_decode.o \ + src/external/streamvbyte_encode.o \ + src/external/streamvbyte_zigzag.o \ + src/mmap_allocator.o \ + src/sorted_node_store.o \ + test/sorted_node_store.test.o + $(CXX) $(CXXFLAGS) -o test.sorted_node_store $^ $(INC) $(LIB) $(LDFLAGS) && ./test.sorted_node_store + + test_sorted_way_store: \ src/external/streamvbyte_decode.o \ src/external/streamvbyte_encode.o \ diff --git a/test/sorted_node_store.test.cpp b/test/sorted_node_store.test.cpp new file mode 100644 index 00000000..ea6956d6 --- /dev/null +++ b/test/sorted_node_store.test.cpp @@ -0,0 +1,27 @@ +#include +#include "external/minunit.h" +#include "sorted_node_store.h" + +MU_TEST(test_sorted_node_store) { + SortedNodeStore sns(true); + mu_check(sns.size() == 0); + + sns.batchStart(); + + sns.insert({ {1, {2, 3 } } }); + + sns.finalize(1); + + mu_check(sns.size() == 1); + +} + +MU_TEST_SUITE(test_suite_sorted_node_store) { + MU_RUN_TEST(test_sorted_node_store); +} + +int main() { + MU_RUN_SUITE(test_suite_sorted_node_store); + MU_REPORT(); + return MU_EXIT_CODE; +} From e81c6ee0c5fcc6fe82de9c26f487dd9151624882 Mon Sep 17 
00:00:00 2001 From: Colin Dellow Date: Sun, 17 Dec 2023 10:13:28 -0500 Subject: [PATCH 18/49] stop using internal linkage for atomics In order to shard the stores, we need to have multiple instances of the class. Two things block this currently: atomics at file-level, and thread-locals. Moving the atomics to the class is easy. Making the thread-locals per-class will require an approach similar to that adopted in https://github.com/systemed/tilemaker/blob/52b62dfbd5b6f8e4feb6cad4e3de86ba27874b3a/include/leased_store.h#L48, where we have a container that tracks the per-class data. --- include/sorted_node_store.h | 10 ++++++++++ include/sorted_way_store.h | 8 ++++++++ src/sorted_node_store.cpp | 20 ++++++-------------- src/sorted_way_store.cpp | 17 ++++++----------- 4 files changed, 30 insertions(+), 25 deletions(-) diff --git a/include/sorted_node_store.h b/include/sorted_node_store.h index 20bad4e0..8f276f4a 100644 --- a/include/sorted_node_store.h +++ b/include/sorted_node_store.h @@ -3,6 +3,7 @@ #include "node_store.h" #include "mmap_allocator.h" +#include #include #include #include @@ -86,6 +87,15 @@ class SortedNodeStore : public NodeStore // multiple threads. They'll get folded into the index during finalize() std::map> orphanage; std::vector> workerBuffers; + + std::atomic totalGroups; + std::atomic totalNodes; + std::atomic totalGroupSpace; + std::atomic totalAllocatedSpace; + std::atomic totalChunks; + std::atomic chunkSizeFreqs[257]; + std::atomic groupSizeFreqs[257]; + void collectOrphans(const std::vector& orphans); void publishGroup(const std::vector& nodes); }; diff --git a/include/sorted_way_store.h b/include/sorted_way_store.h index 448fffda..b28c4257 100644 --- a/include/sorted_way_store.h +++ b/include/sorted_way_store.h @@ -1,6 +1,7 @@ #ifndef _SORTED_WAY_STORE_H #define _SORTED_WAY_STORE_H +#include #include #include #include @@ -117,6 +118,13 @@ class SortedWayStore: public WayStore { // multiple threads. 
They'll get folded into the index during finalize() std::map>>> orphanage; std::vector>>> workerBuffers; + + std::atomic totalWays; + std::atomic totalNodes; + std::atomic totalGroups; + std::atomic totalGroupSpace; + std::atomic totalChunks; + void collectOrphans(const std::vector>>& orphans); void publishGroup(const std::vector>>& ways); }; diff --git a/src/sorted_node_store.cpp b/src/sorted_node_store.cpp index 76aa81b8..0f856af6 100644 --- a/src/sorted_node_store.cpp +++ b/src/sorted_node_store.cpp @@ -2,7 +2,6 @@ #include #include #include -#include #include #include #include "sorted_node_store.h" @@ -16,15 +15,6 @@ namespace SortedNodeStoreTypes { const uint16_t ChunkAlignment = 16; const uint32_t ChunkCompressed = 1 << 31; - std::atomic totalGroups; - std::atomic totalNodes; - std::atomic totalGroupSpace; - std::atomic totalAllocatedSpace; - std::atomic totalChunks; - std::atomic chunkSizeFreqs[257]; - std::atomic groupSizeFreqs[257]; - - // When SortedNodeStore first starts, it's not confident that it has seen an // entire segment, so it's in "collecting orphans" mode. Once it crosses a // threshold of 64K elements, it ceases to be in this mode. @@ -46,10 +36,7 @@ namespace SortedNodeStoreTypes { using namespace SortedNodeStoreTypes; SortedNodeStore::SortedNodeStore(bool compressNodes): compressNodes(compressNodes) { - // Each group can store 64K nodes. If we allocate 256K slots - // for groups, we support 2^34 = 17B nodes, or about twice - // the number used by OSM as of November 2023. - groups.resize(256 * 1024); + reopen(); } void SortedNodeStore::reopen() @@ -61,11 +48,16 @@ void SortedNodeStore::reopen() totalNodes = 0; totalGroups = 0; totalGroupSpace = 0; + totalAllocatedSpace = 0; totalChunks = 0; memset(chunkSizeFreqs, 0, sizeof(chunkSizeFreqs)); memset(groupSizeFreqs, 0, sizeof(groupSizeFreqs)); orphanage.clear(); workerBuffers.clear(); + + // Each group can store 64K nodes. 
If we allocate 256K slots + // for groups, we support 2^34 = 17B nodes, or about twice + // the number used by OSM as of November 2023. groups.clear(); groups.resize(256 * 1024); } diff --git a/src/sorted_way_store.cpp b/src/sorted_way_store.cpp index 8fdaa806..d0d05f00 100644 --- a/src/sorted_way_store.cpp +++ b/src/sorted_way_store.cpp @@ -1,4 +1,3 @@ -#include #include #include #include @@ -34,20 +33,12 @@ namespace SortedWayStoreTypes { thread_local int32_t int32Buffer[2000]; thread_local uint8_t uint8Buffer[8192]; - std::atomic totalWays; - std::atomic totalNodes; - std::atomic totalGroups; - std::atomic totalGroupSpace; - std::atomic totalChunks; } using namespace SortedWayStoreTypes; SortedWayStore::SortedWayStore(bool compressWays, const NodeStore& nodeStore): compressWays(compressWays), nodeStore(nodeStore) { - // Each group can store 64K ways. If we allocate 32K slots, - // we support 2^31 = 2B ways, or about twice the number used - // by OSM as of December 2023. - groups.resize(32 * 1024); + reopen(); } SortedWayStore::~SortedWayStore() { @@ -67,8 +58,12 @@ void SortedWayStore::reopen() { totalChunks = 0; orphanage.clear(); workerBuffers.clear(); + + // Each group can store 64K ways. If we allocate 32K slots, + // we support 2^31 = 2B ways, or about twice the number used + // by OSM as of December 2023. groups.clear(); - groups.resize(256 * 1024); + groups.resize(32 * 1024); } From 1c4174d21c61cde3c59c96332aaf413da6ed5902 Mon Sep 17 00:00:00 2001 From: Colin Dellow Date: Sun, 17 Dec 2023 10:29:12 -0500 Subject: [PATCH 19/49] SortedNodeStore: abstract TLS behind storage() Still only supports 1 class, but this is a step along the path. 
--- src/sorted_node_store.cpp | 127 +++++++++++++++++++++----------------- 1 file changed, 71 insertions(+), 56 deletions(-) diff --git a/src/sorted_node_store.cpp b/src/sorted_node_store.cpp index 0f856af6..0d7b43aa 100644 --- a/src/sorted_node_store.cpp +++ b/src/sorted_node_store.cpp @@ -15,26 +15,41 @@ namespace SortedNodeStoreTypes { const uint16_t ChunkAlignment = 16; const uint32_t ChunkCompressed = 1 << 31; - // When SortedNodeStore first starts, it's not confident that it has seen an - // entire segment, so it's in "collecting orphans" mode. Once it crosses a - // threshold of 64K elements, it ceases to be in this mode. - // - // Orphans are rounded up across multiple threads, and dealt with in - // the finalize step. - thread_local bool collectingOrphans = true; - thread_local uint64_t groupStart = -1; - thread_local std::vector* localNodes = nullptr; - - thread_local int64_t cachedChunk = -1; - thread_local std::vector cacheChunkLons; - thread_local std::vector cacheChunkLatps; - - thread_local uint32_t arenaSpace = 0; - thread_local char* arenaPtr = nullptr; + struct ThreadStorage { + ThreadStorage(): + collectingOrphans(true), + groupStart(-1), + localNodes(nullptr), + cachedChunk(-1), + arenaSpace(0), + arenaPtr(nullptr) {} + // When SortedNodeStore first starts, it's not confident that it has seen an + // entire segment, so it's in "collecting orphans" mode. Once it crosses a + // threshold of 64K elements, it ceases to be in this mode. + // + // Orphans are rounded up across multiple threads, and dealt with in + // the finalize step. 
+ bool collectingOrphans = true; + uint64_t groupStart = -1; + std::vector* localNodes = nullptr; + + int64_t cachedChunk = -1; + std::vector cacheChunkLons; + std::vector cacheChunkLatps; + + uint32_t arenaSpace = 0; + char* arenaPtr = nullptr; + }; + + thread_local ThreadStorage threadStorage; } using namespace SortedNodeStoreTypes; +ThreadStorage& storage() { + return threadStorage; +} + SortedNodeStore::SortedNodeStore(bool compressNodes): compressNodes(compressNodes) { reopen(); } @@ -101,29 +116,29 @@ LatpLon SortedNodeStore::at(const NodeID id) const { size_t latpSize = (ptr->flags >> 10) & ((1 << 10) - 1); // TODO: we don't actually need the lonSize to decompress the data. // May as well store it as a sanity check for now. - size_t lonSize = ptr->flags & ((1 << 10) - 1); + // size_t lonSize = ptr->flags & ((1 << 10) - 1); size_t n = popcnt(ptr->nodeMask, 32) - 1; const size_t neededChunk = groupIndex * ChunkSize + chunk; // Really naive caching strategy - just cache the last-used chunk. // Probably good enough? 
- if (cachedChunk != neededChunk) { - cachedChunk = neededChunk; - cacheChunkLons.reserve(256); - cacheChunkLatps.reserve(256); + if (storage().cachedChunk != neededChunk) { + storage().cachedChunk = neededChunk; + storage().cacheChunkLons.reserve(256); + storage().cacheChunkLatps.reserve(256); uint8_t* latpData = ptr->data; uint8_t* lonData = ptr->data + latpSize; uint32_t recovdata[256] = {0}; streamvbyte_decode(latpData, recovdata, n); - cacheChunkLatps[0] = ptr->firstLatp; - zigzag_delta_decode(recovdata, &cacheChunkLatps[1], n, cacheChunkLatps[0]); + storage().cacheChunkLatps[0] = ptr->firstLatp; + zigzag_delta_decode(recovdata, &storage().cacheChunkLatps[1], n, storage().cacheChunkLatps[0]); streamvbyte_decode(lonData, recovdata, n); - cacheChunkLons[0] = ptr->firstLon; - zigzag_delta_decode(recovdata, &cacheChunkLons[1], n, cacheChunkLons[0]); + storage().cacheChunkLons[0] = ptr->firstLon; + zigzag_delta_decode(recovdata, &storage().cacheChunkLons[1], n, storage().cacheChunkLons[0]); } size_t nodeOffset = 0; @@ -134,7 +149,7 @@ LatpLon SortedNodeStore::at(const NodeID id) const { if (!(ptr->nodeMask[nodeMaskByte] & (1 << nodeMaskBit))) throw std::out_of_range("SortedNodeStore: node " + std::to_string(id) + " missing, no node"); - return { cacheChunkLatps[nodeOffset], cacheChunkLons[nodeOffset] }; + return { storage().cacheChunkLatps[nodeOffset], storage().cacheChunkLons[nodeOffset] }; } UncompressedChunkInfo* ptr = (UncompressedChunkInfo*)basePtr; @@ -176,58 +191,58 @@ size_t SortedNodeStore::size() const { } void SortedNodeStore::insert(const std::vector& elements) { - if (localNodes == nullptr) { + if (storage().localNodes == nullptr) { std::lock_guard lock(orphanageMutex); if (workerBuffers.size() == 0) workerBuffers.reserve(256); else if (workerBuffers.size() == workerBuffers.capacity()) throw std::runtime_error("SortedNodeStore doesn't support more than 256 cores"); workerBuffers.push_back(std::vector()); - localNodes = &workerBuffers.back(); + 
storage().localNodes = &workerBuffers.back(); } - if (groupStart == -1) { + if (storage().groupStart == -1) { // Mark where the first full group starts, so we know when to transition // out of collecting orphans. - groupStart = elements[0].first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); + storage().groupStart = elements[0].first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); } int i = 0; - while (collectingOrphans && i < elements.size()) { + while (storage().collectingOrphans && i < elements.size()) { const element_t& el = elements[i]; - if (el.first >= groupStart + (GroupSize * ChunkSize)) { - collectingOrphans = false; + if (el.first >= storage().groupStart + (GroupSize * ChunkSize)) { + storage().collectingOrphans = false; // Calculate new groupStart, rounding to previous boundary. - groupStart = el.first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); - collectOrphans(*localNodes); - localNodes->clear(); + storage().groupStart = el.first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); + collectOrphans(*storage().localNodes); + storage().localNodes->clear(); } - localNodes->push_back(el); + storage().localNodes->push_back(el); i++; } while(i < elements.size()) { const element_t& el = elements[i]; - if (el.first >= groupStart + (GroupSize * ChunkSize)) { - publishGroup(*localNodes); - localNodes->clear(); - groupStart = el.first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); + if (el.first >= storage().groupStart + (GroupSize * ChunkSize)) { + publishGroup(*storage().localNodes); + storage().localNodes->clear(); + storage().groupStart = el.first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); } - localNodes->push_back(el); + storage().localNodes->push_back(el); i++; } } void SortedNodeStore::batchStart() { - collectingOrphans = true; - groupStart = -1; - if (localNodes == nullptr || localNodes->size() == 0) + storage().collectingOrphans = true; + storage().groupStart = -1; + if (storage().localNodes == nullptr || 
storage().localNodes->size() == 0) return; - collectOrphans(*localNodes); - localNodes->clear(); + collectOrphans(*storage().localNodes); + storage().localNodes->clear(); } void SortedNodeStore::finalize(size_t threadNum) { @@ -402,22 +417,22 @@ void SortedNodeStore::publishGroup(const std::vector& nodes) { GroupInfo* groupInfo = nullptr; - if (arenaSpace < groupSpace) { + if (storage().arenaSpace < groupSpace) { // A full group takes ~330KB. Nodes are read _fast_, and there ends // up being contention calling the allocator when reading the // planet on a machine with 48 cores -- so allocate in large chunks. - arenaSpace = 4 * 1024 * 1024; - totalAllocatedSpace += arenaSpace; - arenaPtr = (char*)void_mmap_allocator::allocate(arenaSpace); - if (arenaPtr == nullptr) + storage().arenaSpace = 4 * 1024 * 1024; + totalAllocatedSpace += storage().arenaSpace; + storage().arenaPtr = (char*)void_mmap_allocator::allocate(storage().arenaSpace); + if (storage().arenaPtr == nullptr) throw std::runtime_error("SortedNodeStore: failed to allocate arena"); std::lock_guard lock(orphanageMutex); - allocatedMemory.push_back(std::make_pair((void*)arenaPtr, arenaSpace)); + allocatedMemory.push_back(std::make_pair((void*)storage().arenaPtr, storage().arenaSpace)); } - arenaSpace -= groupSpace; - groupInfo = (GroupInfo*)arenaPtr; - arenaPtr += groupSpace; + storage().arenaSpace -= groupSpace; + groupInfo = (GroupInfo*)storage().arenaPtr; + storage().arenaPtr += groupSpace; if (groups[groupIndex] != nullptr) throw std::runtime_error("SortedNodeStore: group already present"); From 99b5912524d015e59ecc94ac36b22a662fc2ee20 Mon Sep 17 00:00:00 2001 From: Colin Dellow Date: Sun, 17 Dec 2023 10:42:09 -0500 Subject: [PATCH 20/49] SortedWayStore: abstract TLS behind storage() --- src/sorted_node_store.cpp | 8 ++--- src/sorted_way_store.cpp | 71 +++++++++++++++++++++++---------------- 2 files changed, 46 insertions(+), 33 deletions(-) diff --git a/src/sorted_node_store.cpp 
b/src/sorted_node_store.cpp index 0d7b43aa..2f079755 100644 --- a/src/sorted_node_store.cpp +++ b/src/sorted_node_store.cpp @@ -42,14 +42,14 @@ namespace SortedNodeStoreTypes { }; thread_local ThreadStorage threadStorage; + + ThreadStorage& storage() { + return threadStorage; + } } using namespace SortedNodeStoreTypes; -ThreadStorage& storage() { - return threadStorage; -} - SortedNodeStore::SortedNodeStore(bool compressNodes): compressNodes(compressNodes) { reopen(); } diff --git a/src/sorted_way_store.cpp b/src/sorted_way_store.cpp index d0d05f00..f87dde21 100644 --- a/src/sorted_way_store.cpp +++ b/src/sorted_way_store.cpp @@ -18,21 +18,35 @@ namespace SortedWayStoreTypes { const uint16_t ClosedWay = 1 << 14; const uint16_t UniformUpperBits = 1 << 13; - thread_local bool collectingOrphans = true; - thread_local uint64_t groupStart = -1; - thread_local std::vector>>* localWays = NULL; + struct ThreadStorage { + ThreadStorage(): + collectingOrphans(true), + groupStart(-1), + localWays(nullptr) {} + + bool collectingOrphans; + uint64_t groupStart; + std::vector>>* localWays; + std::vector encodedWay; + }; - thread_local std::vector encodedWay; + thread_local ThreadStorage threadStorage; // C++ doesn't support variable length arrays declared on stack. // g++ and clang support it, but msvc doesn't. Rather than pay the // cost of a vector for every decode, we use a thread_local with room for at // least 2,000 nodes. + // + // Note: these are scratch buffers, so they remain as true thread-locals, + // and aren't part of ThreadStorage. 
thread_local uint64_t highBytes[2000]; thread_local uint32_t uint32Buffer[2000]; thread_local int32_t int32Buffer[2000]; thread_local uint8_t uint8Buffer[8192]; + ThreadStorage& storage() { + return threadStorage; + } } using namespace SortedWayStoreTypes; @@ -141,46 +155,46 @@ const void SortedWayStore::insertNodes(const std::vector lock(orphanageMutex); if (workerBuffers.size() == 0) workerBuffers.reserve(256); else if (workerBuffers.size() == workerBuffers.capacity()) throw std::runtime_error("SortedWayStore doesn't support more than 256 cores"); workerBuffers.push_back(std::vector>>()); - localWays = &workerBuffers.back(); + storage().localWays = &workerBuffers.back(); } - if (groupStart == -1) { + if (storage().groupStart == -1) { // Mark where the first full group starts, so we know when to transition // out of collecting orphans. - groupStart = newWays[0].first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); + storage().groupStart = newWays[0].first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); } int i = 0; - while (collectingOrphans && i < newWays.size()) { + while (storage().collectingOrphans && i < newWays.size()) { const auto& el = newWays[i]; - if (el.first >= groupStart + (GroupSize * ChunkSize)) { - collectingOrphans = false; + if (el.first >= storage().groupStart + (GroupSize * ChunkSize)) { + storage().collectingOrphans = false; // Calculate new groupStart, rounding to previous boundary. 
- groupStart = el.first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); - collectOrphans(*localWays); - localWays->clear(); + storage().groupStart = el.first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); + collectOrphans(*storage().localWays); + storage().localWays->clear(); } - localWays->push_back(el); + storage().localWays->push_back(el); i++; } while(i < newWays.size()) { const auto& el = newWays[i]; - if (el.first >= groupStart + (GroupSize * ChunkSize)) { - publishGroup(*localWays); - localWays->clear(); - groupStart = el.first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); + if (el.first >= storage().groupStart + (GroupSize * ChunkSize)) { + publishGroup(*storage().localWays); + storage().localWays->clear(); + storage().groupStart = el.first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); } - localWays->push_back(el); + storage().localWays->push_back(el); i++; } } @@ -224,13 +238,13 @@ void SortedWayStore::finalize(unsigned int threadNum) { } void SortedWayStore::batchStart() { - collectingOrphans = true; - groupStart = -1; - if (localWays == nullptr || localWays->size() == 0) + storage().collectingOrphans = true; + storage().groupStart = -1; + if (storage().localWays == nullptr || storage().localWays->size() == 0) return; - collectOrphans(*localWays); - localWays->clear(); + collectOrphans(*storage().localWays); + storage().localWays->clear(); } void SortedWayStore::collectOrphans(const std::vector>>& orphans) { @@ -279,7 +293,6 @@ std::vector SortedWayStore::decodeWay(uint16_t flags, const uint8_t* inp for (int i = 0; i < length; i++) rv.push_back(highBytes[i] | lowIntData[i]); } else { - uint16_t compressedLength = *(uint16_t*)input; input += 2; uint32_t firstInt = *(uint32_t*)(input); @@ -446,12 +459,12 @@ void SortedWayStore::publishGroup(const std::vectorwayIds.push_back(id % ChunkSize); - uint16_t flags = encodeWay(way.second, encodedWay, compressWays && way.second.size() >= 4); + uint16_t flags = encodeWay(way.second, 
storage().encodedWay, compressWays && way.second.size() >= 4); lastChunk->wayFlags.push_back(flags); std::vector encoded; - encoded.resize(encodedWay.size()); - memcpy(encoded.data(), encodedWay.data(), encodedWay.size()); + encoded.resize(storage().encodedWay.size()); + memcpy(encoded.data(), storage().encodedWay.data(), storage().encodedWay.size()); lastChunk->encodedWays.push_back(std::move(encoded)); } From f225ebdb8d44f25624162ef17bd7b5f652e33c15 Mon Sep 17 00:00:00 2001 From: Colin Dellow Date: Sun, 17 Dec 2023 10:54:42 -0500 Subject: [PATCH 21/49] SortedNodeStore: support multiple instances --- src/sorted_node_store.cpp | 91 ++++++++++++++++++--------------- test/sorted_node_store.test.cpp | 20 +++++--- 2 files changed, 62 insertions(+), 49 deletions(-) diff --git a/src/sorted_node_store.cpp b/src/sorted_node_store.cpp index 2f079755..0194ca8a 100644 --- a/src/sorted_node_store.cpp +++ b/src/sorted_node_store.cpp @@ -41,10 +41,17 @@ namespace SortedNodeStoreTypes { char* arenaPtr = nullptr; }; - thread_local ThreadStorage threadStorage; + thread_local std::deque> threadStorage; - ThreadStorage& storage() { - return threadStorage; + ThreadStorage& s(const SortedNodeStore* who) { + for (auto& entry : threadStorage) + if (entry.first == who) + return entry.second; + + threadStorage.push_back(std::make_pair(who, ThreadStorage())); + + auto& rv = threadStorage.back(); + return rv.second; } } @@ -123,22 +130,22 @@ LatpLon SortedNodeStore::at(const NodeID id) const { // Really naive caching strategy - just cache the last-used chunk. // Probably good enough? 
- if (storage().cachedChunk != neededChunk) { - storage().cachedChunk = neededChunk; - storage().cacheChunkLons.reserve(256); - storage().cacheChunkLatps.reserve(256); + if (s(this).cachedChunk != neededChunk) { + s(this).cachedChunk = neededChunk; + s(this).cacheChunkLons.reserve(256); + s(this).cacheChunkLatps.reserve(256); uint8_t* latpData = ptr->data; uint8_t* lonData = ptr->data + latpSize; uint32_t recovdata[256] = {0}; streamvbyte_decode(latpData, recovdata, n); - storage().cacheChunkLatps[0] = ptr->firstLatp; - zigzag_delta_decode(recovdata, &storage().cacheChunkLatps[1], n, storage().cacheChunkLatps[0]); + s(this).cacheChunkLatps[0] = ptr->firstLatp; + zigzag_delta_decode(recovdata, &s(this).cacheChunkLatps[1], n, s(this).cacheChunkLatps[0]); streamvbyte_decode(lonData, recovdata, n); - storage().cacheChunkLons[0] = ptr->firstLon; - zigzag_delta_decode(recovdata, &storage().cacheChunkLons[1], n, storage().cacheChunkLons[0]); + s(this).cacheChunkLons[0] = ptr->firstLon; + zigzag_delta_decode(recovdata, &s(this).cacheChunkLons[1], n, s(this).cacheChunkLons[0]); } size_t nodeOffset = 0; @@ -149,7 +156,7 @@ LatpLon SortedNodeStore::at(const NodeID id) const { if (!(ptr->nodeMask[nodeMaskByte] & (1 << nodeMaskBit))) throw std::out_of_range("SortedNodeStore: node " + std::to_string(id) + " missing, no node"); - return { storage().cacheChunkLatps[nodeOffset], storage().cacheChunkLons[nodeOffset] }; + return { s(this).cacheChunkLatps[nodeOffset], s(this).cacheChunkLons[nodeOffset] }; } UncompressedChunkInfo* ptr = (UncompressedChunkInfo*)basePtr; @@ -191,58 +198,58 @@ size_t SortedNodeStore::size() const { } void SortedNodeStore::insert(const std::vector& elements) { - if (storage().localNodes == nullptr) { + if (s(this).localNodes == nullptr) { std::lock_guard lock(orphanageMutex); if (workerBuffers.size() == 0) workerBuffers.reserve(256); else if (workerBuffers.size() == workerBuffers.capacity()) throw std::runtime_error("SortedNodeStore doesn't support more 
than 256 cores"); workerBuffers.push_back(std::vector()); - storage().localNodes = &workerBuffers.back(); + s(this).localNodes = &workerBuffers.back(); } - if (storage().groupStart == -1) { + if (s(this).groupStart == -1) { // Mark where the first full group starts, so we know when to transition // out of collecting orphans. - storage().groupStart = elements[0].first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); + s(this).groupStart = elements[0].first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); } int i = 0; - while (storage().collectingOrphans && i < elements.size()) { + while (s(this).collectingOrphans && i < elements.size()) { const element_t& el = elements[i]; - if (el.first >= storage().groupStart + (GroupSize * ChunkSize)) { - storage().collectingOrphans = false; + if (el.first >= s(this).groupStart + (GroupSize * ChunkSize)) { + s(this).collectingOrphans = false; // Calculate new groupStart, rounding to previous boundary. - storage().groupStart = el.first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); - collectOrphans(*storage().localNodes); - storage().localNodes->clear(); + s(this).groupStart = el.first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); + collectOrphans(*s(this).localNodes); + s(this).localNodes->clear(); } - storage().localNodes->push_back(el); + s(this).localNodes->push_back(el); i++; } while(i < elements.size()) { const element_t& el = elements[i]; - if (el.first >= storage().groupStart + (GroupSize * ChunkSize)) { - publishGroup(*storage().localNodes); - storage().localNodes->clear(); - storage().groupStart = el.first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); + if (el.first >= s(this).groupStart + (GroupSize * ChunkSize)) { + publishGroup(*s(this).localNodes); + s(this).localNodes->clear(); + s(this).groupStart = el.first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); } - storage().localNodes->push_back(el); + s(this).localNodes->push_back(el); i++; } } void SortedNodeStore::batchStart() { - 
storage().collectingOrphans = true; - storage().groupStart = -1; - if (storage().localNodes == nullptr || storage().localNodes->size() == 0) + s(this).collectingOrphans = true; + s(this).groupStart = -1; + if (s(this).localNodes == nullptr || s(this).localNodes->size() == 0) return; - collectOrphans(*storage().localNodes); - storage().localNodes->clear(); + collectOrphans(*s(this).localNodes); + s(this).localNodes->clear(); } void SortedNodeStore::finalize(size_t threadNum) { @@ -417,22 +424,22 @@ void SortedNodeStore::publishGroup(const std::vector& nodes) { GroupInfo* groupInfo = nullptr; - if (storage().arenaSpace < groupSpace) { + if (s(this).arenaSpace < groupSpace) { // A full group takes ~330KB. Nodes are read _fast_, and there ends // up being contention calling the allocator when reading the // planet on a machine with 48 cores -- so allocate in large chunks. - storage().arenaSpace = 4 * 1024 * 1024; - totalAllocatedSpace += storage().arenaSpace; - storage().arenaPtr = (char*)void_mmap_allocator::allocate(storage().arenaSpace); - if (storage().arenaPtr == nullptr) + s(this).arenaSpace = 4 * 1024 * 1024; + totalAllocatedSpace += s(this).arenaSpace; + s(this).arenaPtr = (char*)void_mmap_allocator::allocate(s(this).arenaSpace); + if (s(this).arenaPtr == nullptr) throw std::runtime_error("SortedNodeStore: failed to allocate arena"); std::lock_guard lock(orphanageMutex); - allocatedMemory.push_back(std::make_pair((void*)storage().arenaPtr, storage().arenaSpace)); + allocatedMemory.push_back(std::make_pair((void*)s(this).arenaPtr, s(this).arenaSpace)); } - storage().arenaSpace -= groupSpace; - groupInfo = (GroupInfo*)storage().arenaPtr; - storage().arenaPtr += groupSpace; + s(this).arenaSpace -= groupSpace; + groupInfo = (GroupInfo*)s(this).arenaPtr; + s(this).arenaPtr += groupSpace; if (groups[groupIndex] != nullptr) throw std::runtime_error("SortedNodeStore: group already present"); diff --git a/test/sorted_node_store.test.cpp b/test/sorted_node_store.test.cpp 
index ea6956d6..ba7edb2d 100644 --- a/test/sorted_node_store.test.cpp +++ b/test/sorted_node_store.test.cpp @@ -3,17 +3,23 @@ #include "sorted_node_store.h" MU_TEST(test_sorted_node_store) { - SortedNodeStore sns(true); - mu_check(sns.size() == 0); + SortedNodeStore s1(true), s2(true); + mu_check(s1.size() == 0); + mu_check(s2.size() == 0); - sns.batchStart(); + s1.batchStart(); + s2.batchStart(); - sns.insert({ {1, {2, 3 } } }); + s1.insert({ {1, {2, 3 } } }); + s2.insert({ {2, {3, 4 } } }); - sns.finalize(1); - - mu_check(sns.size() == 1); + s1.finalize(1); + s2.finalize(1); + mu_check(s1.size() == 1); + mu_check(s1.at(1) == LatpLon({2, 3})); + mu_check(s2.size() == 1); + mu_check(s2.at(2) == LatpLon({3, 4})); } MU_TEST_SUITE(test_suite_sorted_node_store) { From 6c7917b996b79275549253d3f77eb1337f7c6652 Mon Sep 17 00:00:00 2001 From: Colin Dellow Date: Sun, 17 Dec 2023 11:17:13 -0500 Subject: [PATCH 22/49] SortedWayStorage: support multiple instances --- src/sorted_node_store.cpp | 2 + src/sorted_way_store.cpp | 68 +++++++++++++++++++--------------- test/sorted_way_store.test.cpp | 17 +++++++++ 3 files changed, 58 insertions(+), 29 deletions(-) diff --git a/src/sorted_node_store.cpp b/src/sorted_node_store.cpp index 0194ca8a..6761ad00 100644 --- a/src/sorted_node_store.cpp +++ b/src/sorted_node_store.cpp @@ -87,6 +87,8 @@ void SortedNodeStore::reopen() SortedNodeStore::~SortedNodeStore() { for (const auto entry: allocatedMemory) void_mmap_allocator::deallocate(entry.first, entry.second); + + s(this) = ThreadStorage(); } LatpLon SortedNodeStore::at(const NodeID id) const { diff --git a/src/sorted_way_store.cpp b/src/sorted_way_store.cpp index f87dde21..c1dd9ae2 100644 --- a/src/sorted_way_store.cpp +++ b/src/sorted_way_store.cpp @@ -30,7 +30,18 @@ namespace SortedWayStoreTypes { std::vector encodedWay; }; - thread_local ThreadStorage threadStorage; + thread_local std::deque> threadStorage; + + ThreadStorage& s(const SortedWayStore* who) { + for (auto& entry : 
threadStorage) + if (entry.first == who) + return entry.second; + + threadStorage.push_back(std::make_pair(who, ThreadStorage())); + + auto& rv = threadStorage.back(); + return rv.second; + } // C++ doesn't support variable length arrays declared on stack. // g++ and clang support it, but msvc doesn't. Rather than pay the @@ -43,10 +54,6 @@ namespace SortedWayStoreTypes { thread_local uint32_t uint32Buffer[2000]; thread_local int32_t int32Buffer[2000]; thread_local uint8_t uint8Buffer[8192]; - - ThreadStorage& storage() { - return threadStorage; - } } using namespace SortedWayStoreTypes; @@ -58,6 +65,8 @@ SortedWayStore::SortedWayStore(bool compressWays, const NodeStore& nodeStore): c SortedWayStore::~SortedWayStore() { for (const auto entry: allocatedMemory) void_mmap_allocator::deallocate(entry.first, entry.second); + + s(this) = ThreadStorage(); } void SortedWayStore::reopen() { @@ -155,46 +164,46 @@ const void SortedWayStore::insertNodes(const std::vector lock(orphanageMutex); if (workerBuffers.size() == 0) workerBuffers.reserve(256); else if (workerBuffers.size() == workerBuffers.capacity()) throw std::runtime_error("SortedWayStore doesn't support more than 256 cores"); workerBuffers.push_back(std::vector>>()); - storage().localWays = &workerBuffers.back(); + s(this).localWays = &workerBuffers.back(); } - if (storage().groupStart == -1) { + if (s(this).groupStart == -1) { // Mark where the first full group starts, so we know when to transition // out of collecting orphans. 
- storage().groupStart = newWays[0].first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); + s(this).groupStart = newWays[0].first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); } int i = 0; - while (storage().collectingOrphans && i < newWays.size()) { + while (s(this).collectingOrphans && i < newWays.size()) { const auto& el = newWays[i]; - if (el.first >= storage().groupStart + (GroupSize * ChunkSize)) { - storage().collectingOrphans = false; + if (el.first >= s(this).groupStart + (GroupSize * ChunkSize)) { + s(this).collectingOrphans = false; // Calculate new groupStart, rounding to previous boundary. - storage().groupStart = el.first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); - collectOrphans(*storage().localWays); - storage().localWays->clear(); + s(this).groupStart = el.first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); + collectOrphans(*s(this).localWays); + s(this).localWays->clear(); } - storage().localWays->push_back(el); + s(this).localWays->push_back(el); i++; } while(i < newWays.size()) { const auto& el = newWays[i]; - if (el.first >= storage().groupStart + (GroupSize * ChunkSize)) { - publishGroup(*storage().localWays); - storage().localWays->clear(); - storage().groupStart = el.first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); + if (el.first >= s(this).groupStart + (GroupSize * ChunkSize)) { + publishGroup(*s(this).localWays); + s(this).localWays->clear(); + s(this).groupStart = el.first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); } - storage().localWays->push_back(el); + s(this).localWays->push_back(el); i++; } } @@ -238,13 +247,13 @@ void SortedWayStore::finalize(unsigned int threadNum) { } void SortedWayStore::batchStart() { - storage().collectingOrphans = true; - storage().groupStart = -1; - if (storage().localWays == nullptr || storage().localWays->size() == 0) + s(this).collectingOrphans = true; + s(this).groupStart = -1; + if (s(this).localWays == nullptr || s(this).localWays->size() == 0) return; 
- collectOrphans(*storage().localWays); - storage().localWays->clear(); + collectOrphans(*s(this).localWays); + s(this).localWays->clear(); } void SortedWayStore::collectOrphans(const std::vector>>& orphans) { @@ -253,6 +262,7 @@ void SortedWayStore::collectOrphans(const std::vector>>& vec = orphanage[groupIndex]; const size_t i = vec.size(); + vec.resize(i + orphans.size()); std::copy(orphans.begin(), orphans.end(), vec.begin() + i); } @@ -459,12 +469,12 @@ void SortedWayStore::publishGroup(const std::vectorwayIds.push_back(id % ChunkSize); - uint16_t flags = encodeWay(way.second, storage().encodedWay, compressWays && way.second.size() >= 4); + uint16_t flags = encodeWay(way.second, s(this).encodedWay, compressWays && way.second.size() >= 4); lastChunk->wayFlags.push_back(flags); std::vector encoded; - encoded.resize(storage().encodedWay.size()); - memcpy(encoded.data(), storage().encodedWay.data(), storage().encodedWay.size()); + encoded.resize(s(this).encodedWay.size()); + memcpy(encoded.data(), s(this).encodedWay.data(), s(this).encodedWay.size()); lastChunk->encodedWays.push_back(std::move(encoded)); } diff --git a/test/sorted_way_store.test.cpp b/test/sorted_way_store.test.cpp index 217a1110..8c4c432d 100644 --- a/test/sorted_way_store.test.cpp +++ b/test/sorted_way_store.test.cpp @@ -74,6 +74,22 @@ MU_TEST(test_encode_way) { } } +MU_TEST(test_multiple_stores) { + TestNodeStore ns; + SortedWayStore s1(true, ns), s2(true, ns); + s1.batchStart(); + s2.batchStart(); + + s1.insertNodes({{ 1, { 1 } }}); + s2.insertNodes({{ 2, { 2 } }}); + + s1.finalize(1); + s2.finalize(1); + + mu_check(s1.size() == 1); + mu_check(s2.size() == 1); +} + MU_TEST(test_way_store) { TestNodeStore ns; SortedWayStore sws(true, ns); @@ -182,6 +198,7 @@ MU_TEST(test_populate_mask) { MU_TEST_SUITE(test_suite_sorted_way_store) { MU_RUN_TEST(test_encode_way); + MU_RUN_TEST(test_multiple_stores); MU_RUN_TEST(test_way_store); } From 5d9ca2b8fdba64f6616f111bf51ed43ba66b575e Mon Sep 17 00:00:00 
2001 From: Colin Dellow Date: Sun, 17 Dec 2023 11:40:43 -0500 Subject: [PATCH 23/49] actually fix the low zoom object collection D'oh, this "worked" due to two bugs cancelling each other: (a) the code to find things in the low zoom list never found anything, because it assumed a base z6 tile of 0/0 (b) we weren't returning early, so the normal code still ran Rejigged to actually do what I was intending --- include/tile_data.h | 53 ++++++++++++++++++++++++++++++++++++++------- src/tile_data.cpp | 7 +++--- 2 files changed, 48 insertions(+), 12 deletions(-) diff --git a/include/tile_data.h b/include/tile_data.h index 78793c27..b5641ace 100644 --- a/include/tile_data.h +++ b/include/tile_data.h @@ -52,8 +52,9 @@ template void finalizeObjects( const unsigned int& baseZoom, typename std::vector>::iterator begin, typename std::vector>::iterator end, - typename std::vector>& lowZoom + typename AppendVectorNS::AppendVector>& lowZoom ) { + size_t z6OffsetDivisor = baseZoom >= CLUSTER_ZOOM ? (1 << (baseZoom - CLUSTER_ZOOM)) : 1; #ifdef CLOCK_MONOTONIC timespec startTs, endTs; clock_gettime(CLOCK_MONOTONIC, &startTs); @@ -75,9 +76,20 @@ template void finalizeObjects( if (it->size() == 0) continue; + // We track a separate copy of low zoom objects to avoid scanning large + // lists of objects that may be on slow disk storage. for (auto objectIt = it->begin(); objectIt != it->end(); objectIt++) - if (objectIt->oo.minZoom < CLUSTER_ZOOM) - lowZoom[0].push_back(*objectIt); + if (objectIt->oo.minZoom < CLUSTER_ZOOM) { + size_t z6x = i / CLUSTER_ZOOM_WIDTH; + size_t z6y = i % CLUSTER_ZOOM_WIDTH; + lowZoom.push_back(std::make_pair( + TileCoordinates( + z6OffsetDivisor * z6x + objectIt->x, + z6OffsetDivisor * z6y + objectIt->y + ), + *objectIt + )); + } // If the user is doing a a small extract, there are few populated // entries in `object`. 
@@ -172,6 +184,31 @@ inline OutputObjectID outputObjectWithId(const OutputObjectXYI return OutputObjectID({ input.oo, input.id }); } +template void collectLowZoomObjectsForTile( + const unsigned int& baseZoom, + typename AppendVectorNS::AppendVector> objects, + unsigned int zoom, + const TileCoordinates& dstIndex, + std::vector& output +) { + for (size_t j = 0; j < objects.size(); j++) { + const auto& object = objects[j]; + + TileCoordinate baseX = object.first.x; + TileCoordinate baseY = object.first.y; + + // Translate the x, y at the requested zoom level + TileCoordinate x = baseX / (1 << (baseZoom - zoom)); + TileCoordinate y = baseY / (1 << (baseZoom - zoom)); + + if (dstIndex.x == x && dstIndex.y == y) { + if (object.second.oo.minZoom <= zoom) { + output.push_back(outputObjectWithId(object.second)); + } + } + } +} + template void collectObjectsForTileTemplate( const unsigned int& baseZoom, typename std::vector>::iterator objects, @@ -184,9 +221,6 @@ template void collectObjectsForTileTemplate( uint16_t z6OffsetDivisor = baseZoom >= CLUSTER_ZOOM ? (1 << (baseZoom - CLUSTER_ZOOM)) : 1; for (size_t i = iStart; i < iEnd; i++) { - const size_t z6x = i / CLUSTER_ZOOM_WIDTH; - const size_t z6y = i % CLUSTER_ZOOM_WIDTH; - if (zoom >= CLUSTER_ZOOM) { // If z >= 6, we can compute the exact bounds within the objects array. // Translate to the base zoom, then do a binary search to find @@ -258,6 +292,9 @@ template void collectObjectsForTileTemplate( } } else { + const size_t z6x = i / CLUSTER_ZOOM_WIDTH; + const size_t z6y = i % CLUSTER_ZOOM_WIDTH; + for (size_t j = 0; j < objects[i].size(); j++) { // Compute the x, y at the base zoom level TileCoordinate baseX = z6x * z6OffsetDivisor + objects[i][j].x; @@ -318,9 +355,9 @@ class TileDataSource { // If config.include_ids is true, objectsWithIds will be populated. // Otherwise, objects. 
std::vector> objects; - std::vector> lowZoomObjects; + AppendVectorNS::AppendVector> lowZoomObjects; std::vector> objectsWithIds; - std::vector> lowZoomObjectsWithIds; + AppendVectorNS::AppendVector> lowZoomObjectsWithIds; // rtree index of large objects using oo_rtree_param_type = boost::geometry::index::quadratic<128>; diff --git a/src/tile_data.cpp b/src/tile_data.cpp index d3fc15c2..b56e97e7 100644 --- a/src/tile_data.cpp +++ b/src/tile_data.cpp @@ -47,9 +47,7 @@ TileDataSource::TileDataSource(size_t threadNum, unsigned int baseZoom, bool inc z6OffsetDivisor(baseZoom >= CLUSTER_ZOOM ? (1 << (baseZoom - CLUSTER_ZOOM)) : 1), objectsMutex(threadNum * 4), objects(CLUSTER_ZOOM_AREA), - lowZoomObjects(1), objectsWithIds(CLUSTER_ZOOM_AREA), - lowZoomObjectsWithIds(1), baseZoom(baseZoom), pointStores(threadNum), linestringStores(threadNum), @@ -143,8 +141,9 @@ void TileDataSource::collectObjectsForTile( std::vector& output ) { if (zoom < CLUSTER_ZOOM) { - collectObjectsForTileTemplate(baseZoom, lowZoomObjects.begin(), 0, 1, zoom, dstIndex, output); - collectObjectsForTileTemplate(baseZoom, lowZoomObjectsWithIds.begin(), 0, 1, zoom, dstIndex, output); + collectLowZoomObjectsForTile(baseZoom, lowZoomObjects, zoom, dstIndex, output); + collectLowZoomObjectsForTile(baseZoom, lowZoomObjectsWithIds, zoom, dstIndex, output); + return; } size_t iStart = 0; From 24b73f1f434d28098ddd68150aad2da75ed139b3 Mon Sep 17 00:00:00 2001 From: Colin Dellow Date: Sun, 17 Dec 2023 12:53:39 -0500 Subject: [PATCH 24/49] AppendVector tweaks --- include/append_vector.h | 4 +++- test/append_vector.test.cpp | 21 +++++++++++++++++---- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/include/append_vector.h b/include/append_vector.h index 3fe9b907..07531217 100644 --- a/include/append_vector.h +++ b/include/append_vector.h @@ -158,7 +158,9 @@ namespace AppendVectorNS { } T& operator [](int idx) { - return vecs[idx / APPEND_VECTOR_SIZE][idx % APPEND_VECTOR_SIZE]; + auto& vec = vecs[idx 
/ APPEND_VECTOR_SIZE]; + auto& el = vec[idx % APPEND_VECTOR_SIZE]; + return el; } Iterator begin() { diff --git a/test/append_vector.test.cpp b/test/append_vector.test.cpp index 300f6e30..db4949e2 100644 --- a/test/append_vector.test.cpp +++ b/test/append_vector.test.cpp @@ -6,8 +6,11 @@ using namespace AppendVectorNS; MU_TEST(test_append_vector) { - AppendVector vec; + AppendVector vec; + AppendVector vec2; mu_check(vec.size() == 0); + mu_check(vec.begin() == vec.end()); + mu_check(vec.begin() != vec2.begin()); for (int i = 0; i < 10000; i++) { vec.push_back(i); @@ -16,7 +19,7 @@ MU_TEST(test_append_vector) { mu_check(vec[25] == 25); - const AppendVector::Iterator& it = vec.begin(); + const AppendVector::Iterator& it = vec.begin(); mu_check(*it == 0); mu_check(*(it + 1) == 1); mu_check(*(it + 2) == 2); @@ -52,7 +55,7 @@ MU_TEST(test_append_vector) { vec.begin(), vec.end(), 123, - [](const uint32_t& a, const uint32_t& toFind) { + [](const int32_t& a, const int32_t& toFind) { return a < toFind; } ); @@ -64,13 +67,23 @@ MU_TEST(test_append_vector) { vec.begin(), vec.end(), 123123, - [](const uint32_t& a, const uint32_t& toFind) { + [](const int32_t& a, const int32_t& toFind) { return a < toFind; } ); mu_check(iter == vec.end()); + iter = std::lower_bound( + vec.begin(), + vec.end(), + -2, + [](const int32_t& a, const int32_t& toFind) { + return a < toFind; + } + ); + + mu_check(iter == vec.begin()); } MU_TEST_SUITE(test_suite_append_vector) { From 2a053652cf3d14f91b45e695213d637277dbe525 Mon Sep 17 00:00:00 2001 From: Colin Dellow Date: Sun, 17 Dec 2023 13:01:09 -0500 Subject: [PATCH 25/49] more low zoom fixes --- include/tile_data.h | 209 ++++++++++++++++++++------------------------ src/tile_data.cpp | 4 +- 2 files changed, 98 insertions(+), 115 deletions(-) diff --git a/include/tile_data.h b/include/tile_data.h index b5641ace..804761c8 100644 --- a/include/tile_data.h +++ b/include/tile_data.h @@ -52,7 +52,7 @@ template void finalizeObjects( const unsigned int& 
baseZoom, typename std::vector>::iterator begin, typename std::vector>::iterator end, - typename AppendVectorNS::AppendVector>& lowZoom + typename std::vector>& lowZoom ) { size_t z6OffsetDivisor = baseZoom >= CLUSTER_ZOOM ? (1 << (baseZoom - CLUSTER_ZOOM)) : 1; #ifdef CLOCK_MONOTONIC @@ -60,11 +60,11 @@ template void finalizeObjects( clock_gettime(CLOCK_MONOTONIC, &startTs); #endif - int i = 0; + int i = -1; for (auto it = begin; it != end; it++) { i++; - if (i % 10 == 0 || i == 4096) { - std::cout << "\r" << name << ": finalizing z6 tile " << i << "/" << CLUSTER_ZOOM_AREA; + if (it->size() > 0 || i % 10 == 0 || i == 4095) { + std::cout << "\r" << name << ": finalizing z6 tile " << (i + 1) << "/" << CLUSTER_ZOOM_AREA; #ifdef CLOCK_MONOTONIC clock_gettime(CLOCK_MONOTONIC, &endTs); @@ -79,17 +79,8 @@ template void finalizeObjects( // We track a separate copy of low zoom objects to avoid scanning large // lists of objects that may be on slow disk storage. for (auto objectIt = it->begin(); objectIt != it->end(); objectIt++) - if (objectIt->oo.minZoom < CLUSTER_ZOOM) { - size_t z6x = i / CLUSTER_ZOOM_WIDTH; - size_t z6y = i % CLUSTER_ZOOM_WIDTH; - lowZoom.push_back(std::make_pair( - TileCoordinates( - z6OffsetDivisor * z6x + objectIt->x, - z6OffsetDivisor * z6y + objectIt->y - ), - *objectIt - )); - } + if (objectIt->oo.minZoom < CLUSTER_ZOOM) + lowZoom[i].push_back(*objectIt); // If the user is doing a a small extract, there are few populated // entries in `object`. 
@@ -186,24 +177,33 @@ inline OutputObjectID outputObjectWithId(const OutputObjectXYI template void collectLowZoomObjectsForTile( const unsigned int& baseZoom, - typename AppendVectorNS::AppendVector> objects, + typename std::vector> objects, unsigned int zoom, const TileCoordinates& dstIndex, std::vector& output ) { - for (size_t j = 0; j < objects.size(); j++) { - const auto& object = objects[j]; + if (zoom >= CLUSTER_ZOOM) + throw std::runtime_error("collectLowZoomObjectsForTile should not be called for high zooms"); + + uint16_t z6OffsetDivisor = baseZoom >= CLUSTER_ZOOM ? (1 << (baseZoom - CLUSTER_ZOOM)) : 1; + + for (size_t i = 0; i < objects.size(); i++) { + const size_t z6x = i / CLUSTER_ZOOM_WIDTH; + const size_t z6y = i % CLUSTER_ZOOM_WIDTH; - TileCoordinate baseX = object.first.x; - TileCoordinate baseY = object.first.y; + for (size_t j = 0; j < objects[i].size(); j++) { + // Compute the x, y at the base zoom level + TileCoordinate baseX = z6x * z6OffsetDivisor + objects[i][j].x; + TileCoordinate baseY = z6y * z6OffsetDivisor + objects[i][j].y; - // Translate the x, y at the requested zoom level - TileCoordinate x = baseX / (1 << (baseZoom - zoom)); - TileCoordinate y = baseY / (1 << (baseZoom - zoom)); + // Translate the x, y at the requested zoom level + TileCoordinate x = baseX / (1 << (baseZoom - zoom)); + TileCoordinate y = baseY / (1 << (baseZoom - zoom)); - if (dstIndex.x == x && dstIndex.y == y) { - if (object.second.oo.minZoom <= zoom) { - output.push_back(outputObjectWithId(object.second)); + if (dstIndex.x == x && dstIndex.y == y) { + if (objects[i][j].oo.minZoom <= zoom) { + output.push_back(outputObjectWithId(objects[i][j])); + } } } } @@ -218,98 +218,81 @@ template void collectObjectsForTileTemplate( const TileCoordinates& dstIndex, std::vector& output ) { + if (zoom < CLUSTER_ZOOM) + throw std::runtime_error("collectObjectsForTileTemplate should not be called for low zooms"); + uint16_t z6OffsetDivisor = baseZoom >= CLUSTER_ZOOM ? 
(1 << (baseZoom - CLUSTER_ZOOM)) : 1; for (size_t i = iStart; i < iEnd; i++) { - if (zoom >= CLUSTER_ZOOM) { - // If z >= 6, we can compute the exact bounds within the objects array. - // Translate to the base zoom, then do a binary search to find - // the starting point. - TileCoordinate z6x = dstIndex.x / (1 << (zoom - CLUSTER_ZOOM)); - TileCoordinate z6y = dstIndex.y / (1 << (zoom - CLUSTER_ZOOM)); - - TileCoordinate baseX = dstIndex.x * (1 << (baseZoom - zoom)); - TileCoordinate baseY = dstIndex.y * (1 << (baseZoom - zoom)); - - Z6Offset needleX = baseX - z6x * z6OffsetDivisor; - Z6Offset needleY = baseY - z6y * z6OffsetDivisor; - - // Kind of gross that we have to do this. Might be better if we split - // into two arrays, one of x/y and one of OOs. Would have better locality for - // searching, too. - OutputObject dummyOo(POINT_, 0, 0, 0, 0); - const size_t bz = baseZoom; - - const OO targetXY = {dummyOo, needleX, needleY }; - auto iter = std::lower_bound( - objects[i].begin(), - objects[i].end(), - targetXY, - [bz](const OO& a, const OO& b) { - // Cluster by parent zoom, so that a subsequent search - // can find a contiguous range of entries for any tile - // at zoom 6 or higher. 
- const size_t aX = a.x; - const size_t aY = a.y; - const size_t bX = b.x; - const size_t bY = b.y; - for (size_t z = CLUSTER_ZOOM; z <= bz; z++) { - const auto aXz = aX / (1 << (bz - z)); - const auto aYz = aY / (1 << (bz - z)); - const auto bXz = bX / (1 << (bz - z)); - const auto bYz = bY / (1 << (bz - z)); - - if (aXz != bXz) - return aXz < bXz; - - if (aYz != bYz) - return aYz < bYz; - } - return false; - } - ); - for (; iter != objects[i].end(); iter++) { - // Compute the x, y at the base zoom level - TileCoordinate baseX = z6x * z6OffsetDivisor + iter->x; - TileCoordinate baseY = z6y * z6OffsetDivisor + iter->y; - - // Translate the x, y at the requested zoom level - TileCoordinate x = baseX / (1 << (baseZoom - zoom)); - TileCoordinate y = baseY / (1 << (baseZoom - zoom)); - - if (dstIndex.x == x && dstIndex.y == y) { - if (iter->oo.minZoom <= zoom) { - output.push_back(outputObjectWithId(*iter)); - } - } else { - // Short-circuit when we're confident we'd no longer see relevant matches. - // We've ordered the entries in `objects` such that all objects that - // share the same tile at any zoom are in contiguous runs. - // - // Thus, as soon as we fail to find a match, we can stop looking. - break; - } + // If z >= 6, we can compute the exact bounds within the objects array. + // Translate to the base zoom, then do a binary search to find + // the starting point. + TileCoordinate z6x = dstIndex.x / (1 << (zoom - CLUSTER_ZOOM)); + TileCoordinate z6y = dstIndex.y / (1 << (zoom - CLUSTER_ZOOM)); + + TileCoordinate baseX = dstIndex.x * (1 << (baseZoom - zoom)); + TileCoordinate baseY = dstIndex.y * (1 << (baseZoom - zoom)); + + Z6Offset needleX = baseX - z6x * z6OffsetDivisor; + Z6Offset needleY = baseY - z6y * z6OffsetDivisor; + + // Kind of gross that we have to do this. Might be better if we split + // into two arrays, one of x/y and one of OOs. Would have better locality for + // searching, too. 
+ OutputObject dummyOo(POINT_, 0, 0, 0, 0); + const size_t bz = baseZoom; + + const OO targetXY = {dummyOo, needleX, needleY }; + auto iter = std::lower_bound( + objects[i].begin(), + objects[i].end(), + targetXY, + [bz](const OO& a, const OO& b) { + // Cluster by parent zoom, so that a subsequent search + // can find a contiguous range of entries for any tile + // at zoom 6 or higher. + const size_t aX = a.x; + const size_t aY = a.y; + const size_t bX = b.x; + const size_t bY = b.y; + for (size_t z = CLUSTER_ZOOM; z <= bz; z++) { + const auto aXz = aX / (1 << (bz - z)); + const auto aYz = aY / (1 << (bz - z)); + const auto bXz = bX / (1 << (bz - z)); + const auto bYz = bY / (1 << (bz - z)); + + if (aXz != bXz) + return aXz < bXz; + if (aYz != bYz) + return aYz < bYz; + } + return false; } - } else { - const size_t z6x = i / CLUSTER_ZOOM_WIDTH; - const size_t z6y = i % CLUSTER_ZOOM_WIDTH; - - for (size_t j = 0; j < objects[i].size(); j++) { - // Compute the x, y at the base zoom level - TileCoordinate baseX = z6x * z6OffsetDivisor + objects[i][j].x; - TileCoordinate baseY = z6y * z6OffsetDivisor + objects[i][j].y; - - // Translate the x, y at the requested zoom level - TileCoordinate x = baseX / (1 << (baseZoom - zoom)); - TileCoordinate y = baseY / (1 << (baseZoom - zoom)); - - if (dstIndex.x == x && dstIndex.y == y) { - if (objects[i][j].oo.minZoom <= zoom) { - output.push_back(outputObjectWithId(objects[i][j])); - } + ); + + for (; iter != objects[i].end(); iter++) { + // Compute the x, y at the base zoom level + TileCoordinate baseX = z6x * z6OffsetDivisor + iter->x; + TileCoordinate baseY = z6y * z6OffsetDivisor + iter->y; + + // Translate the x, y at the requested zoom level + TileCoordinate x = baseX / (1 << (baseZoom - zoom)); + TileCoordinate y = baseY / (1 << (baseZoom - zoom)); + + if (dstIndex.x == x && dstIndex.y == y) { + if (iter->oo.minZoom <= zoom) { + output.push_back(outputObjectWithId(*iter)); } + } else { + // Short-circuit when we're confident 
we'd no longer see relevant matches. + // We've ordered the entries in `objects` such that all objects that + // share the same tile at any zoom are in contiguous runs. + // + // Thus, as soon as we fail to find a match, we can stop looking. + break; } + } } } @@ -355,9 +338,9 @@ class TileDataSource { // If config.include_ids is true, objectsWithIds will be populated. // Otherwise, objects. std::vector> objects; - AppendVectorNS::AppendVector> lowZoomObjects; + std::vector> lowZoomObjects; std::vector> objectsWithIds; - AppendVectorNS::AppendVector> lowZoomObjectsWithIds; + std::vector> lowZoomObjectsWithIds; // rtree index of large objects using oo_rtree_param_type = boost::geometry::index::quadratic<128>; diff --git a/src/tile_data.cpp b/src/tile_data.cpp index b56e97e7..fbae2038 100644 --- a/src/tile_data.cpp +++ b/src/tile_data.cpp @@ -47,7 +47,9 @@ TileDataSource::TileDataSource(size_t threadNum, unsigned int baseZoom, bool inc z6OffsetDivisor(baseZoom >= CLUSTER_ZOOM ? (1 << (baseZoom - CLUSTER_ZOOM)) : 1), objectsMutex(threadNum * 4), objects(CLUSTER_ZOOM_AREA), + lowZoomObjects(CLUSTER_ZOOM_AREA), objectsWithIds(CLUSTER_ZOOM_AREA), + lowZoomObjectsWithIds(CLUSTER_ZOOM_AREA), baseZoom(baseZoom), pointStores(threadNum), linestringStores(threadNum), @@ -149,8 +151,6 @@ void TileDataSource::collectObjectsForTile( size_t iStart = 0; size_t iEnd = objects.size(); - // TODO: we could also narrow the search space for z1..z5, too. - // They're less important, as they have fewer tiles. 
if (zoom >= CLUSTER_ZOOM) { // Compute the x, y at the base zoom level TileCoordinate z6x = dstIndex.x / (1 << (zoom - CLUSTER_ZOOM)); From 00bb73b5a98b9efde003d57b74ebb29278174d00 Mon Sep 17 00:00:00 2001 From: Colin Dellow Date: Sun, 17 Dec 2023 13:16:43 -0500 Subject: [PATCH 26/49] implement SortedNodeStore::contains --- include/sorted_node_store.h | 2 +- src/sorted_node_store.cpp | 40 +++++++++++++++++++++++++++++++++ test/sorted_node_store.test.cpp | 34 +++++++++++++++++----------- 3 files changed, 62 insertions(+), 14 deletions(-) diff --git a/include/sorted_node_store.h b/include/sorted_node_store.h index 8f276f4a..0d12c460 100644 --- a/include/sorted_node_store.h +++ b/include/sorted_node_store.h @@ -70,7 +70,7 @@ class SortedNodeStore : public NodeStore reopen(); } - bool contains(size_t shard, NodeID ID) const override { throw std::runtime_error("SortedNodeStore::contains not implemented"); } + bool contains(size_t shard, NodeID id) const override; size_t shard() const override { return 0; } size_t shards() const override { return 1; } diff --git a/src/sorted_node_store.cpp b/src/sorted_node_store.cpp index 6761ad00..f99baa58 100644 --- a/src/sorted_node_store.cpp +++ b/src/sorted_node_store.cpp @@ -91,6 +91,46 @@ SortedNodeStore::~SortedNodeStore() { s(this) = ThreadStorage(); } +bool SortedNodeStore::contains(size_t shard, NodeID id) const { + const size_t groupIndex = id / (GroupSize * ChunkSize); + const size_t chunk = (id % (GroupSize * ChunkSize)) / ChunkSize; + const uint64_t chunkMaskByte = chunk / 8; + const uint64_t chunkMaskBit = chunk % 8; + + const uint64_t nodeMaskByte = (id % ChunkSize) / 8; + const uint64_t nodeMaskBit = id % 8; + + GroupInfo* groupPtr = groups[groupIndex]; + + if (groupPtr == nullptr) + return false; + + size_t chunkOffset = 0; + { + chunkOffset = popcnt(groupPtr->chunkMask, chunkMaskByte); + uint8_t maskByte = groupPtr->chunkMask[chunkMaskByte]; + maskByte = maskByte & ((1 << chunkMaskBit) - 1); + chunkOffset += 
popcnt(&maskByte, 1); + + if (!(groupPtr->chunkMask[chunkMaskByte] & (1 << chunkMaskBit))) + return false; + } + + uint16_t scaledOffset = groupPtr->chunkOffsets[chunkOffset]; + ChunkInfoBase* basePtr = (ChunkInfoBase*)(((char *)(groupPtr->chunkOffsets + popcnt(groupPtr->chunkMask, 32))) + (scaledOffset * ChunkAlignment)); + + size_t nodeOffset = 0; + nodeOffset = popcnt(basePtr->nodeMask, nodeMaskByte); + uint8_t maskByte = basePtr->nodeMask[nodeMaskByte]; + maskByte = maskByte & ((1 << nodeMaskBit) - 1); + nodeOffset += popcnt(&maskByte, 1); + if (!(basePtr->nodeMask[nodeMaskByte] & (1 << nodeMaskBit))) + return false; + + + return true; +} + LatpLon SortedNodeStore::at(const NodeID id) const { const size_t groupIndex = id / (GroupSize * ChunkSize); const size_t chunk = (id % (GroupSize * ChunkSize)) / ChunkSize; diff --git a/test/sorted_node_store.test.cpp b/test/sorted_node_store.test.cpp index ba7edb2d..de66445f 100644 --- a/test/sorted_node_store.test.cpp +++ b/test/sorted_node_store.test.cpp @@ -3,23 +3,31 @@ #include "sorted_node_store.h" MU_TEST(test_sorted_node_store) { - SortedNodeStore s1(true), s2(true); - mu_check(s1.size() == 0); - mu_check(s2.size() == 0); + bool compressed = true; - s1.batchStart(); - s2.batchStart(); + for (int i = 0; i < 2; i++) { + compressed = !compressed; + SortedNodeStore s1(compressed), s2(compressed); + mu_check(s1.size() == 0); + mu_check(s2.size() == 0); - s1.insert({ {1, {2, 3 } } }); - s2.insert({ {2, {3, 4 } } }); + s1.batchStart(); + s2.batchStart(); - s1.finalize(1); - s2.finalize(1); + s1.insert({ {1, {2, 3 } } }); + s2.insert({ {2, {3, 4 } } }); - mu_check(s1.size() == 1); - mu_check(s1.at(1) == LatpLon({2, 3})); - mu_check(s2.size() == 1); - mu_check(s2.at(2) == LatpLon({3, 4})); + s1.finalize(1); + s2.finalize(1); + + mu_check(s1.size() == 1); + mu_check(s1.at(1) == LatpLon({2, 3})); + mu_check(s1.contains(0, 1)); + mu_check(!s1.contains(0, 2)); + mu_check(!s1.contains(0, 1ull << 34)); + mu_check(s2.size() == 1); 
+ mu_check(s2.at(2) == LatpLon({3, 4})); + } } MU_TEST_SUITE(test_suite_sorted_node_store) { From e8be59ca998b6557061927365271cb5c9e5ac7b4 Mon Sep 17 00:00:00 2001 From: Colin Dellow Date: Sun, 17 Dec 2023 13:23:32 -0500 Subject: [PATCH 27/49] implement SortedWayStore::contains --- include/sorted_way_store.h | 2 +- src/sorted_way_store.cpp | 50 ++++++++++++++++++++++++++++++++++ test/sorted_way_store.test.cpp | 37 ++++++++++++++++++------- 3 files changed, 78 insertions(+), 11 deletions(-) diff --git a/include/sorted_way_store.h b/include/sorted_way_store.h index b28c4257..23271deb 100644 --- a/include/sorted_way_store.h +++ b/include/sorted_way_store.h @@ -95,7 +95,7 @@ class SortedWayStore: public WayStore { std::size_t size() const override; void finalize(unsigned int threadNum) override; - bool contains(size_t shard, WayID id) const override { throw std::runtime_error("SortedWayStore::contains not implemented"); } + bool contains(size_t shard, WayID id) const override; size_t shard() const override { return 0; } size_t shards() const override { return 1; } diff --git a/src/sorted_way_store.cpp b/src/sorted_way_store.cpp index c1dd9ae2..669d01a2 100644 --- a/src/sorted_way_store.cpp +++ b/src/sorted_way_store.cpp @@ -90,6 +90,56 @@ void SortedWayStore::reopen() { } +bool SortedWayStore::contains(size_t shard, WayID id) const { + const size_t groupIndex = id / (GroupSize * ChunkSize); + const size_t chunk = (id % (GroupSize * ChunkSize)) / ChunkSize; + const uint64_t chunkMaskByte = chunk / 8; + const uint64_t chunkMaskBit = chunk % 8; + + const uint64_t wayMaskByte = (id % ChunkSize) / 8; + const uint64_t wayMaskBit = id % 8; + + GroupInfo* groupPtr = groups[groupIndex]; + + if (groupPtr == nullptr) + return false; + + size_t chunkOffset = 0; + { + chunkOffset = popcnt(groupPtr->chunkMask, chunkMaskByte); + uint8_t maskByte = groupPtr->chunkMask[chunkMaskByte]; + maskByte = maskByte & ((1 << chunkMaskBit) - 1); + chunkOffset += popcnt(&maskByte, 1); + + if 
(!(groupPtr->chunkMask[chunkMaskByte] & (1 << chunkMaskBit))) + return false; + } + + ChunkInfo* chunkPtr = (ChunkInfo*)((char*)groupPtr + groupPtr->chunkOffsets[chunkOffset]); + const size_t numWays = popcnt(chunkPtr->smallWayMask, 32) + popcnt(chunkPtr->bigWayMask, 32); + + { + size_t wayOffset = 0; + wayOffset = popcnt(chunkPtr->smallWayMask, wayMaskByte); + uint8_t maskByte = chunkPtr->smallWayMask[wayMaskByte]; + maskByte = maskByte & ((1 << wayMaskBit) - 1); + wayOffset += popcnt(&maskByte, 1); + if (chunkPtr->smallWayMask[wayMaskByte] & (1 << wayMaskBit)) + return true; + } + + size_t wayOffset = 0; + wayOffset += popcnt(chunkPtr->smallWayMask, 32); + wayOffset += popcnt(chunkPtr->bigWayMask, wayMaskByte); + uint8_t maskByte = chunkPtr->bigWayMask[wayMaskByte]; + maskByte = maskByte & ((1 << wayMaskBit) - 1); + wayOffset += popcnt(&maskByte, 1); + if (!(chunkPtr->bigWayMask[wayMaskByte] & (1 << wayMaskBit))) + return false; + + return true; +} + std::vector SortedWayStore::at(WayID id) const { const size_t groupIndex = id / (GroupSize * ChunkSize); const size_t chunk = (id % (GroupSize * ChunkSize)) / ChunkSize; diff --git a/test/sorted_way_store.test.cpp b/test/sorted_way_store.test.cpp index 8c4c432d..65d34816 100644 --- a/test/sorted_way_store.test.cpp +++ b/test/sorted_way_store.test.cpp @@ -75,19 +75,36 @@ MU_TEST(test_encode_way) { } MU_TEST(test_multiple_stores) { - TestNodeStore ns; - SortedWayStore s1(true, ns), s2(true, ns); - s1.batchStart(); - s2.batchStart(); + bool compressed = false; + + for (int i = 0; i < 2; i++) { + compressed = !compressed; + TestNodeStore ns; + SortedWayStore s1(compressed, ns), s2(compressed, ns); + s1.batchStart(); + s2.batchStart(); + + s1.insertNodes({{ 1, { 1 } }}); - s1.insertNodes({{ 1, { 1 } }}); - s2.insertNodes({{ 2, { 2 } }}); + // We store small ways differently than large ways, so + // store both kinds for testing. 
+ std::vector longWay; + for (int i = 200; i < 2048; i++) + longWay.push_back(i + 3 * (i % 37)); - s1.finalize(1); - s2.finalize(1); + s1.insertNodes({{ 42, longWay }}); + s2.insertNodes({{ 2, { 2 } }}); - mu_check(s1.size() == 1); - mu_check(s2.size() == 1); + s1.finalize(1); + s2.finalize(1); + + mu_check(s1.size() == 2); + mu_check(s2.size() == 1); + + mu_check(s1.contains(0, 1)); + mu_check(s1.contains(0, 42)); + mu_check(!s1.contains(0, 2)); + } } MU_TEST(test_way_store) { From 792d1b367c489a5ee785aabf4656c9085d2a17a2 Mon Sep 17 00:00:00 2001 From: Colin Dellow Date: Sun, 17 Dec 2023 18:27:32 -0500 Subject: [PATCH 28/49] use TileCoordinatesSet --- src/tilemaker.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/tilemaker.cpp b/src/tilemaker.cpp index 8a3f6419..d30d556f 100644 --- a/src/tilemaker.cpp +++ b/src/tilemaker.cpp @@ -492,7 +492,7 @@ int main(int argc, char* argv[]) { // The clipping bbox check is expensive - as an optimization, compute the set of // z6 tiles that are wholly covered by the clipping box. Membership in this // set is quick to test. 
- std::set coveredZ6Tiles; + TileCoordinatesSet coveredZ6Tiles(6); if (hasClippingBox) { for (int x = 0; x < 1 << 6; x++) { for (int y = 0; y < 1 << 6; y++) { @@ -500,7 +500,7 @@ int main(int argc, char* argv[]) { TileBbox(TileCoordinates(x, y), 6, false, false).getTileBox(), clippingBox )) - coveredZ6Tiles.insert(TileCoordinates(x, y)); + coveredZ6Tiles.set(x, y); } } } @@ -533,7 +533,7 @@ int main(int argc, char* argv[]) { if (zoom >= 6) { TileCoordinate z6x = x / (1 << (zoom - 6)); TileCoordinate z6y = y / (1 << (zoom - 6)); - isInAWhollyCoveredZ6Tile = coveredZ6Tiles.find(TileCoordinates(z6x, z6y)) != coveredZ6Tiles.end(); + isInAWhollyCoveredZ6Tile = coveredZ6Tiles.test(z6x, z6y); } if(!isInAWhollyCoveredZ6Tile && !boost::geometry::intersects(TileBbox(TileCoordinates(x, y), zoom, false, false).getTileBox(), clippingBox)) From 2df30816759c35c714608cbe7a4a52b15e64dfed Mon Sep 17 00:00:00 2001 From: Colin Dellow Date: Sun, 17 Dec 2023 18:57:51 -0500 Subject: [PATCH 29/49] faster covered tile enumeration Do a single pass, rather than one pass per zoom. --- include/tile_data.h | 23 ++++++++++++++--------- src/tile_data.cpp | 39 +++++++++++++++++++++------------------ src/tilemaker.cpp | 24 ++++++++++++++++++++++-- 3 files changed, 57 insertions(+), 29 deletions(-) diff --git a/include/tile_data.h b/include/tile_data.h index 804761c8..2c5fe4df 100644 --- a/include/tile_data.h +++ b/include/tile_data.h @@ -137,9 +137,9 @@ template void collectTilesWithObjectsAtZoomTemplate( const unsigned int& baseZoom, const typename std::vector>::iterator objects, const size_t size, - const unsigned int zoom, - TileCoordinatesSet& output + std::vector& zooms ) { + size_t maxZoom = zooms.size() - 1; uint16_t z6OffsetDivisor = baseZoom >= CLUSTER_ZOOM ? 
(1 << (baseZoom - CLUSTER_ZOOM)) : 1; int64_t lastX = -1; int64_t lastY = -1; @@ -153,13 +153,18 @@ template void collectTilesWithObjectsAtZoomTemplate( TileCoordinate baseY = z6y * z6OffsetDivisor + objects[i][j].y; // Translate the x, y at the requested zoom level - TileCoordinate x = baseX / (1 << (baseZoom - zoom)); - TileCoordinate y = baseY / (1 << (baseZoom - zoom)); + TileCoordinate x = baseX / (1 << (baseZoom - maxZoom)); + TileCoordinate y = baseY / (1 << (baseZoom - maxZoom)); if (lastX != x || lastY != y) { - output.set(x, y); lastX = x; lastY = y; + + for (int zoom = maxZoom; zoom >= 0; zoom--) { + zooms[zoom].set(x, y); + x /= 2; + y /= 2; + } } } } @@ -360,9 +365,9 @@ class TileDataSource { public: TileDataSource(size_t threadNum, unsigned int baseZoom, bool includeID); - void collectTilesWithObjectsAtZoom(uint zoom, TileCoordinatesSet& output); + void collectTilesWithObjectsAtZoom(std::vector& zooms); - void collectTilesWithLargeObjectsAtZoom(uint zoom, TileCoordinatesSet& output); + void collectTilesWithLargeObjectsAtZoom(std::vector& zooms); void collectObjectsForTile(uint zoom, TileCoordinates dstIndex, std::vector& output); void finalize(size_t threadNum); @@ -473,9 +478,9 @@ class TileDataSource { } }; -TileCoordinatesSet getTilesAtZoom( +void populateTilesAtZoom( const std::vector& sources, - unsigned int zoom + std::vector& zooms ); #endif //_TILE_DATA_H diff --git a/src/tile_data.cpp b/src/tile_data.cpp index fbae2038..8a8053bf 100644 --- a/src/tile_data.cpp +++ b/src/tile_data.cpp @@ -53,8 +53,8 @@ TileDataSource::TileDataSource(size_t threadNum, unsigned int baseZoom, bool inc baseZoom(baseZoom), pointStores(threadNum), linestringStores(threadNum), - multipolygonStores(threadNum), multilinestringStores(threadNum), + multipolygonStores(threadNum), multiPolygonClipCache(ClipCache(threadNum, baseZoom)), multiLinestringClipCache(ClipCache(threadNum, baseZoom)) { @@ -108,32 +108,39 @@ void TileDataSource::addObjectToSmallIndex(const 
TileCoordinates& index, const O }); } -void TileDataSource::collectTilesWithObjectsAtZoom(uint zoom, TileCoordinatesSet& output) { +void TileDataSource::collectTilesWithObjectsAtZoom(std::vector& zooms) { // Scan through all shards. Convert to base zoom, then convert to the requested zoom. - collectTilesWithObjectsAtZoomTemplate(baseZoom, objects.begin(), objects.size(), zoom, output); - collectTilesWithObjectsAtZoomTemplate(baseZoom, objectsWithIds.begin(), objectsWithIds.size(), zoom, output); + collectTilesWithObjectsAtZoomTemplate(baseZoom, objects.begin(), objects.size(), zooms); + collectTilesWithObjectsAtZoomTemplate(baseZoom, objectsWithIds.begin(), objectsWithIds.size(), zooms); } -void addCoveredTilesToOutput(const uint baseZoom, const uint zoom, const Box& box, TileCoordinatesSet& output) { - int scale = pow(2, baseZoom-zoom); +void addCoveredTilesToOutput(const uint baseZoom, std::vector& zooms, const Box& box) { + size_t maxZoom = zooms.size() - 1; + int scale = pow(2, baseZoom - maxZoom); TileCoordinate minx = box.min_corner().x() / scale; TileCoordinate maxx = box.max_corner().x() / scale; TileCoordinate miny = box.min_corner().y() / scale; TileCoordinate maxy = box.max_corner().y() / scale; for (int x=minx; x<=maxx; x++) { for (int y=miny; y<=maxy; y++) { - output.set(x, y); + size_t zx = x, zy = y; + + for (int zoom = maxZoom; zoom >= 0; zoom--) { + zooms[zoom].set(zx, zy); + zx /= 2; + zy /= 2; + } } } } // Find the tiles used by the "large objects" from the rtree index -void TileDataSource::collectTilesWithLargeObjectsAtZoom(uint zoom, TileCoordinatesSet &output) { +void TileDataSource::collectTilesWithLargeObjectsAtZoom(std::vector& zooms) { for(auto const &result: boxRtree) - addCoveredTilesToOutput(baseZoom, zoom, result.first, output); + addCoveredTilesToOutput(baseZoom, zooms, result.first); for(auto const &result: boxRtreeWithIds) - addCoveredTilesToOutput(baseZoom, zoom, result.first, output); + addCoveredTilesToOutput(baseZoom, zooms, 
result.first); } // Copy objects from the tile at dstIndex (in the dataset srcTiles) into output @@ -369,18 +376,14 @@ void TileDataSource::reportSize() const { std::cout << "Generated points: " << (points - 1) << ", lines: " << (linestrings - 2) << ", polygons: " << (polygons - 1) << std::endl; } -TileCoordinatesSet getTilesAtZoom( +void populateTilesAtZoom( const std::vector& sources, - unsigned int zoom + std::vector& zooms ) { - TileCoordinatesSet tileCoordinates(zoom); - for(size_t i=0; icollectTilesWithObjectsAtZoom(zoom, tileCoordinates); - sources[i]->collectTilesWithLargeObjectsAtZoom(zoom, tileCoordinates); + sources[i]->collectTilesWithObjectsAtZoom(zooms); + sources[i]->collectTilesWithLargeObjectsAtZoom(zooms); } - - return tileCoordinates; } std::vector TileDataSource::getObjectsForTile( diff --git a/src/tilemaker.cpp b/src/tilemaker.cpp index d30d556f..6ebfef0d 100644 --- a/src/tilemaker.cpp +++ b/src/tilemaker.cpp @@ -506,7 +506,26 @@ int main(int argc, char* argv[]) { } std::deque> tileCoordinates; - std::cout << "collecting tiles:"; + std::vector zoomResults; + for (uint zoom = 0; zoom <= sharedData.config.endZoom; zoom++) { + zoomResults.push_back(TileCoordinatesSet(zoom)); + } + + { +#ifdef CLOCK_MONOTONIC + timespec start, end; + clock_gettime(CLOCK_MONOTONIC, &start); +#endif + std::cout << "collecting tiles" << std::flush; + populateTilesAtZoom(sources, zoomResults); +#ifdef CLOCK_MONOTONIC + clock_gettime(CLOCK_MONOTONIC, &end); + uint64_t tileNs = 1e9 * (end.tv_sec - start.tv_sec) + end.tv_nsec - start.tv_nsec; + std::cout << ": " << (uint32_t)(tileNs / 1e6) << "ms"; +#endif + } + + std::cout << ", filtering tiles:" << std::flush; for (uint zoom=sharedData.config.startZoom; zoom <= sharedData.config.endZoom; zoom++) { std::cout << " z" << std::to_string(zoom) << std::flush; #ifdef CLOCK_MONOTONIC @@ -514,7 +533,7 @@ int main(int argc, char* argv[]) { clock_gettime(CLOCK_MONOTONIC, &start); #endif - auto zoomResult = getTilesAtZoom(sources, 
zoom); + const auto& zoomResult = zoomResults[zoom]; int numTiles = 0; for (int x = 0; x < 1 << zoom; x++) { for (int y = 0; y < 1 << zoom; y++) { @@ -554,6 +573,7 @@ int main(int argc, char* argv[]) { #endif std::cout << ")" << std::flush; } + zoomResults.clear(); std::cout << std::endl; From b9434f2c65ac26dc60b332cf8ccdebb724550262 Mon Sep 17 00:00:00 2001 From: Colin Dellow Date: Sun, 17 Dec 2023 21:28:25 -0500 Subject: [PATCH 30/49] add ShardedNodeStore This distributes nodes into one of 8 shards, trying to roughly group parts of the globe by complexity. This should help with locality when writing tiles. A future commit will add a ShardedWayStore and teach read_pbf to read in a locality-aware manner, which should help when reading ways. --- CMakeLists.txt | 1 + Makefile | 1 + include/node_store.h | 1 - include/node_stores.h | 3 +- include/sharded_node_store.h | 30 +++++++++++++ include/sorted_node_store.h | 1 - src/sharded_node_store.cpp | 86 ++++++++++++++++++++++++++++++++++++ src/sorted_node_store.cpp | 3 +- src/sorted_way_store.cpp | 2 +- src/tilemaker.cpp | 45 +++++++++++++------ 10 files changed, 153 insertions(+), 20 deletions(-) create mode 100644 include/sharded_node_store.h create mode 100644 src/sharded_node_store.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index d69e61ed..3c301534 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -105,6 +105,7 @@ file(GLOB tilemaker_src_files src/pooled_string.cpp src/read_pbf.cpp src/read_shp.cpp + src/sharded_node_store.cpp src/shared_data.cpp src/shp_mem_tiles.cpp src/sorted_node_store.cpp diff --git a/Makefile b/Makefile index 9eae44c3..0a63416f 100644 --- a/Makefile +++ b/Makefile @@ -114,6 +114,7 @@ tilemaker: \ src/pooled_string.o \ src/read_pbf.o \ src/read_shp.o \ + src/sharded_node_store.o \ src/shared_data.o \ src/shp_mem_tiles.o \ src/sorted_node_store.o \ diff --git a/include/node_store.h b/include/node_store.h index 9ef2a4c6..a2547fd5 100644 --- a/include/node_store.h +++ b/include/node_store.h 
@@ -25,7 +25,6 @@ class NodeStore virtual LatpLon at(NodeID i) const = 0; virtual bool contains(size_t shard, NodeID id) const = 0; - virtual size_t shard() const = 0; virtual size_t shards() const = 0; }; diff --git a/include/node_stores.h b/include/node_stores.h index f093081f..80a94868 100644 --- a/include/node_stores.h +++ b/include/node_stores.h @@ -5,6 +5,7 @@ #include #include "node_store.h" #include "sorted_node_store.h" +#include "sharded_node_store.h" #include "mmap_allocator.h" class BinarySearchNodeStore : public NodeStore @@ -25,7 +26,6 @@ class BinarySearchNodeStore : public NodeStore void batchStart() {} bool contains(size_t shard, NodeID id) const override; - size_t shard() const override { return 0; } size_t shards() const override { return 1; } @@ -59,7 +59,6 @@ class CompactNodeStore : public NodeStore // CompactNodeStore has no metadata to know whether or not it contains // a node, so it's not suitable for used in sharded scenarios. bool contains(size_t shard, NodeID id) const override { return true; } - size_t shard() const override { return 0; } size_t shards() const override { return 1; } private: diff --git a/include/sharded_node_store.h b/include/sharded_node_store.h new file mode 100644 index 00000000..44938126 --- /dev/null +++ b/include/sharded_node_store.h @@ -0,0 +1,30 @@ +#ifndef _SHARDED_NODE_STORE +#define _SHARDED_NODE_STORE + +#include +#include +#include "node_store.h" + +class ShardedNodeStore : public NodeStore { +public: + ShardedNodeStore(std::function()> createNodeStore); + ~ShardedNodeStore(); + void reopen() override; + void finalize(size_t threadNum) override; + LatpLon at(NodeID i) const override; + size_t size() const override; + void batchStart() override; + void insert(const std::vector& elements) override; + void clear() { + reopen(); + } + + bool contains(size_t shard, NodeID id) const override; + size_t shards() const override; + +private: + std::function()> createNodeStore; + std::vector> stores; +}; + +#endif 
diff --git a/include/sorted_node_store.h b/include/sorted_node_store.h index 0d12c460..0e8d2e24 100644 --- a/include/sorted_node_store.h +++ b/include/sorted_node_store.h @@ -71,7 +71,6 @@ class SortedNodeStore : public NodeStore } bool contains(size_t shard, NodeID id) const override; - size_t shard() const override { return 0; } size_t shards() const override { return 1; } private: diff --git a/src/sharded_node_store.cpp b/src/sharded_node_store.cpp new file mode 100644 index 00000000..92169986 --- /dev/null +++ b/src/sharded_node_store.cpp @@ -0,0 +1,86 @@ +#include "sharded_node_store.h" + +ShardedNodeStore::ShardedNodeStore(std::function()> createNodeStore): + createNodeStore(createNodeStore) { + for (int i = 0; i < shards(); i++) + stores.push_back(createNodeStore()); +} + +ShardedNodeStore::~ShardedNodeStore() { +} + +void ShardedNodeStore::reopen() { + for (auto& store : stores) + store->reopen(); +} + +void ShardedNodeStore::finalize(size_t threadNum) { + for (auto& store : stores) + store->finalize(threadNum); +} + +LatpLon ShardedNodeStore::at(NodeID id) const { + // TODO: look in the last store we successfully found something, using + // a thread local + for (int i = 0; i < shards(); i++) + if (stores[i]->contains(0, id) || i == shards() - 1) + return stores[i]->at(id); +} + +size_t ShardedNodeStore::size() const { + size_t rv = 0; + for (auto& store : stores) + rv += store->size(); + + return rv; +} + +void ShardedNodeStore::batchStart() { + for (auto& store : stores) + store->batchStart(); +} + +size_t pickStore(const LatpLon& el) { + // Assign the element to a store. This is pretty naive, we could likely do better-- + // Europe still basically gets its own bucket, but probably should be split up + // more. 
+ + const size_t z3x = lon2tilex(el.lon / 10000000, 3); + const size_t z3y = latp2tiley(el.latp / 10000000, 3); + + if (z3x == 4 && z3y == 2) return 4; // Central Europe + if (z3x == 5 && z3y == 2) return 5; // Western Russia + if (z3x == 4 && z3y == 3) return 6; // North Africa + if (z3x == 5 && z3y == 3) return 7; // India + + const size_t z2x = z3x / 2; + const size_t z2y = z3y / 2; + + if (z2x == 3 && z2y == 1) return 3; // Asia, Russia + if (z2x == 1 && z2y == 1) return 2; // North Atlantic Ocean and bordering countries + if (z2x == 0 && z2y == 1) return 1; // North America + +// std::cout << "z2x=" << std::to_string(z2x) << ", z2y=" << std::to_string(z2y) << std::endl; + return 0; // Artic, Antartcica, Oceania +} + +void ShardedNodeStore::insert(const std::vector& elements) { + std::vector> perStore(shards()); + + for (const auto& el : elements) { + perStore[pickStore(el.second)].push_back(el); + } + + for (int i = 0; i < shards(); i++) { + if (!perStore[i].empty()) + stores[i]->insert(perStore[i]); + } +} + +bool ShardedNodeStore::contains(size_t shard, NodeID id) const { + return stores[shard]->contains(0, id); +} + +size_t ShardedNodeStore::shards() const { + return 8; +} diff --git a/src/sorted_node_store.cpp b/src/sorted_node_store.cpp index f99baa58..174664c3 100644 --- a/src/sorted_node_store.cpp +++ b/src/sorted_node_store.cpp @@ -58,6 +58,7 @@ namespace SortedNodeStoreTypes { using namespace SortedNodeStoreTypes; SortedNodeStore::SortedNodeStore(bool compressNodes): compressNodes(compressNodes) { + s(this); // allocate our ThreadStorage before multi-threading reopen(); } @@ -320,7 +321,7 @@ void SortedNodeStore::finalize(size_t threadNum) { orphanage.clear(); - std::cout << "SortedNodeStore: " << totalGroups << " groups, " << totalChunks << " chunks, " << totalNodes.load() << " nodes, " << totalGroupSpace.load() << " bytes (" << (1000ull * (totalAllocatedSpace.load() - totalGroupSpace.load()) / totalAllocatedSpace.load()) / 10.0 << "% wasted)" << 
std::endl; + std::cout << "SortedNodeStore: " << totalGroups << " groups, " << totalChunks << " chunks, " << totalNodes.load() << " nodes, " << totalGroupSpace.load() << " bytes (" << (1000ull * (totalAllocatedSpace.load() - totalGroupSpace.load()) / (totalAllocatedSpace.load() + 1)) / 10.0 << "% wasted)" << std::endl; /* for (int i = 0; i < 257; i++) std::cout << "chunkSizeFreqs[ " << i << " ]= " << chunkSizeFreqs[i].load() << std::endl; diff --git a/src/sorted_way_store.cpp b/src/sorted_way_store.cpp index 669d01a2..e7ff4841 100644 --- a/src/sorted_way_store.cpp +++ b/src/sorted_way_store.cpp @@ -59,6 +59,7 @@ namespace SortedWayStoreTypes { using namespace SortedWayStoreTypes; SortedWayStore::SortedWayStore(bool compressWays, const NodeStore& nodeStore): compressWays(compressWays), nodeStore(nodeStore) { + s(this); // allocate our ThreadStorage before multi-threading reopen(); } @@ -116,7 +117,6 @@ bool SortedWayStore::contains(size_t shard, WayID id) const { } ChunkInfo* chunkPtr = (ChunkInfo*)((char*)groupPtr + groupPtr->chunkOffsets[chunkOffset]); - const size_t numWays = popcnt(chunkPtr->smallWayMask, 32) + popcnt(chunkPtr->bigWayMask, 32); { size_t wayOffset = 0; diff --git a/src/tilemaker.cpp b/src/tilemaker.cpp index 6ebfef0d..3a32168a 100644 --- a/src/tilemaker.cpp +++ b/src/tilemaker.cpp @@ -285,8 +285,6 @@ int main(int argc, char* argv[]) { } // For each tile, objects to be used in processing - shared_ptr nodeStore; - bool allPbfsHaveSortTypeThenID = true; bool anyPbfHasLocationsOnWays = false; @@ -297,22 +295,41 @@ int main(int argc, char* argv[]) { } } - if (osmStoreCompact) - nodeStore = make_shared(); - else { - if (allPbfsHaveSortTypeThenID) - nodeStore = make_shared(!osmStoreUncompressedNodes); - else - nodeStore = make_shared(); - } + auto createNodeStore = [allPbfsHaveSortTypeThenID, osmStoreCompact, osmStoreUncompressedNodes]() { + if (osmStoreCompact) { + std::shared_ptr rv = make_shared(); + return rv; + } + + if (allPbfsHaveSortTypeThenID) 
{ + std::shared_ptr rv = make_shared(!osmStoreUncompressedNodes); + return rv; + } + std::shared_ptr rv = make_shared(); + return rv; + }; - shared_ptr wayStore; - if (!anyPbfHasLocationsOnWays && allPbfsHaveSortTypeThenID) { - wayStore = make_shared(!osmStoreUncompressedNodes, *nodeStore.get()); + shared_ptr nodeStore; + + // TODO: make this a flag + if (true) { + nodeStore = std::make_shared(createNodeStore); } else { - wayStore = make_shared(); + nodeStore = createNodeStore(); } + auto createWayStore = [anyPbfHasLocationsOnWays, allPbfsHaveSortTypeThenID, osmStoreUncompressedWays, &nodeStore]() { + if (!anyPbfHasLocationsOnWays && allPbfsHaveSortTypeThenID) { + std::shared_ptr rv = make_shared(!osmStoreUncompressedWays, *nodeStore.get()); + return rv; + } + + std::shared_ptr rv = make_shared(); + return rv; + }; + + shared_ptr wayStore = createWayStore(); + OSMStore osmStore(*nodeStore.get(), *wayStore.get()); osmStore.use_compact_store(osmStoreCompact); osmStore.enforce_integrity(!skipIntegrity); From e968b400f73261a42c9f5add61d104342f0e1116 Mon Sep 17 00:00:00 2001 From: Colin Dellow Date: Sun, 17 Dec 2023 23:21:51 -0500 Subject: [PATCH 31/49] add ShardedWayStore Add `--shard-stores` flag. It's not clear yet this'll be a win, will need to benchmark. The cost of reading the PBF blocks repeatedly is a bit higher than I was expecting. It might be worth seeing if we can index the blocks to skip fruitless reads. 
--- CMakeLists.txt | 1 + Makefile | 1 + include/read_pbf.h | 14 ++- include/sharded_way_store.h | 34 ++++++ include/sorted_way_store.h | 4 +- include/way_store.h | 4 +- include/way_stores.h | 5 +- src/read_pbf.cpp | 204 +++++++++++++++++++++--------------- src/sharded_node_store.cpp | 18 +++- src/sharded_way_store.cpp | 77 ++++++++++++++ src/sorted_way_store.cpp | 2 +- src/tilemaker.cpp | 15 ++- src/way_stores.cpp | 2 +- 13 files changed, 279 insertions(+), 102 deletions(-) create mode 100644 include/sharded_way_store.h create mode 100644 src/sharded_way_store.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 3c301534..dd3179bb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -106,6 +106,7 @@ file(GLOB tilemaker_src_files src/read_pbf.cpp src/read_shp.cpp src/sharded_node_store.cpp + src/sharded_way_store.cpp src/shared_data.cpp src/shp_mem_tiles.cpp src/sorted_node_store.cpp diff --git a/Makefile b/Makefile index 0a63416f..81779d79 100644 --- a/Makefile +++ b/Makefile @@ -115,6 +115,7 @@ tilemaker: \ src/read_pbf.o \ src/read_shp.o \ src/sharded_node_store.o \ + src/sharded_way_store.o \ src/shared_data.o \ src/shp_mem_tiles.o \ src/sorted_node_store.o \ diff --git a/include/read_pbf.h b/include/read_pbf.h index b934a563..4ab44612 100644 --- a/include/read_pbf.h +++ b/include/read_pbf.h @@ -53,6 +53,7 @@ class PbfReader using pbfreader_generate_stream = std::function< std::shared_ptr () >; int ReadPbfFile( + uint shards, bool hasSortTypeThenID, const std::unordered_set& nodeKeys, unsigned int threadNum, @@ -79,11 +80,20 @@ class PbfReader const BlockMetadata& blockMetadata, const std::unordered_set& nodeKeys, bool locationsOnWays, - ReadPhase phase + ReadPhase phase, + uint shard, + uint effectiveShard ); bool ReadNodes(OsmLuaProcessing &output, PrimitiveGroup &pg, PrimitiveBlock const &pb, const std::unordered_set &nodeKeyPositions); - bool ReadWays(OsmLuaProcessing &output, PrimitiveGroup &pg, PrimitiveBlock const &pb, bool locationsOnWays); + bool 
ReadWays( + OsmLuaProcessing &output, + PrimitiveGroup &pg, + PrimitiveBlock const &pb, + bool locationsOnWays, + uint shard, + uint effectiveShards + ); bool ScanRelations(OsmLuaProcessing &output, PrimitiveGroup &pg, PrimitiveBlock const &pb); bool ReadRelations( OsmLuaProcessing& output, diff --git a/include/sharded_way_store.h b/include/sharded_way_store.h new file mode 100644 index 00000000..b57d03e0 --- /dev/null +++ b/include/sharded_way_store.h @@ -0,0 +1,34 @@ +#ifndef _SHARDED_WAY_STORE +#define _SHARDED_WAY_STORE + +#include +#include +#include "way_store.h" + +class NodeStore; + +class ShardedWayStore : public WayStore { +public: + ShardedWayStore(std::function()> createWayStore, const NodeStore& nodeStore); + ~ShardedWayStore(); + void reopen() override; + void batchStart() override; + std::vector at(WayID wayid) const override; + bool requiresNodes() const override; + void insertLatpLons(std::vector &newWays) override; + void insertNodes(const std::vector>>& newWays) override; + void clear() override; + std::size_t size() const override; + void finalize(unsigned int threadNum) override; + + bool contains(size_t shard, WayID id) const override; + WayStore& shard(size_t shard) override; + size_t shards() const override; + +private: + std::function()> createWayStore; + const NodeStore& nodeStore; + std::vector> stores; +}; + +#endif diff --git a/include/sorted_way_store.h b/include/sorted_way_store.h index 23271deb..890a9a53 100644 --- a/include/sorted_way_store.h +++ b/include/sorted_way_store.h @@ -90,13 +90,13 @@ class SortedWayStore: public WayStore { std::vector at(WayID wayid) const override; bool requiresNodes() const override { return true; } void insertLatpLons(std::vector &newWays) override; - const void insertNodes(const std::vector>>& newWays) override; + void insertNodes(const std::vector>>& newWays) override; void clear() override; std::size_t size() const override; void finalize(unsigned int threadNum) override; bool contains(size_t shard, 
WayID id) const override; - size_t shard() const override { return 0; } + WayStore& shard(size_t shard) override { return *this; } size_t shards() const override { return 1; } static uint16_t encodeWay( diff --git a/include/way_store.h b/include/way_store.h index 5e274a5c..c2b959c7 100644 --- a/include/way_store.h +++ b/include/way_store.h @@ -17,13 +17,13 @@ class WayStore { virtual std::vector at(WayID wayid) const = 0; virtual bool requiresNodes() const = 0; virtual void insertLatpLons(std::vector& newWays) = 0; - virtual const void insertNodes(const std::vector>>& newWays) = 0; + virtual void insertNodes(const std::vector>>& newWays) = 0; virtual void clear() = 0; virtual std::size_t size() const = 0; virtual void finalize(unsigned int threadNum) = 0; virtual bool contains(size_t shard, WayID id) const = 0; - virtual size_t shard() const = 0; + virtual WayStore& shard(size_t shard) = 0; virtual size_t shards() const = 0; }; diff --git a/include/way_stores.h b/include/way_stores.h index 4ed8db7e..f66e3939 100644 --- a/include/way_stores.h +++ b/include/way_stores.h @@ -5,6 +5,7 @@ #include #include "way_store.h" #include "sorted_way_store.h" +#include "sharded_way_store.h" class BinarySearchWayStore: public WayStore { @@ -16,13 +17,13 @@ class BinarySearchWayStore: public WayStore { std::vector at(WayID wayid) const override; bool requiresNodes() const override { return false; } void insertLatpLons(std::vector &newWays) override; - const void insertNodes(const std::vector>>& newWays) override; + void insertNodes(const std::vector>>& newWays) override; void clear() override; std::size_t size() const override; void finalize(unsigned int threadNum) override; bool contains(size_t shard, WayID id) const override; - size_t shard() const override { return 0; } + WayStore& shard(size_t shard) override { return *this; } size_t shards() const override { return 1; } private: diff --git a/src/read_pbf.cpp b/src/read_pbf.cpp index 0202a67d..f371cded 100644 --- 
a/src/read_pbf.cpp +++ b/src/read_pbf.cpp @@ -73,7 +73,14 @@ bool PbfReader::ReadNodes(OsmLuaProcessing &output, PrimitiveGroup &pg, Primitiv return false; } -bool PbfReader::ReadWays(OsmLuaProcessing &output, PrimitiveGroup &pg, PrimitiveBlock const &pb, bool locationsOnWays) { +bool PbfReader::ReadWays( + OsmLuaProcessing &output, + PrimitiveGroup &pg, + PrimitiveBlock const &pb, + bool locationsOnWays, + uint shard, + uint effectiveShards +) { // ---- Read ways if (pg.ways_size() > 0) { @@ -83,15 +90,18 @@ bool PbfReader::ReadWays(OsmLuaProcessing &output, PrimitiveGroup &pg, Primitive std::vector llWays; std::vector>> nodeWays; + LatpLonVec llVec; + std::vector nodeVec; for (int j=0; j(pbfWay.id()); if (wayId >= pow(2,42)) throw std::runtime_error("Way ID negative or too large: "+std::to_string(wayId)); // Assemble nodelist - LatpLonVec llVec; - std::vector nodeVec; if (locationsOnWays) { int lat=0, lon=0; llVec.reserve(pbfWay.lats_size()); @@ -105,8 +115,17 @@ bool PbfReader::ReadWays(OsmLuaProcessing &output, PrimitiveGroup &pg, Primitive int64_t nodeId = 0; llVec.reserve(pbfWay.refs_size()); nodeVec.reserve(pbfWay.refs_size()); + + bool skipToNext = false; + for (int k=0; k 1 && !osmStore.nodes.contains(shard, nodeId)) { + skipToNext = true; + break; + } + try { llVec.push_back(osmStore.nodes.at(static_cast(nodeId))); nodeVec.push_back(nodeId); @@ -114,6 +133,9 @@ bool PbfReader::ReadWays(OsmLuaProcessing &output, PrimitiveGroup &pg, Primitive if (osmStore.integrity_enforced()) throw err; } } + + if (skipToNext) + continue; } if (llVec.empty()) continue; @@ -138,9 +160,9 @@ bool PbfReader::ReadWays(OsmLuaProcessing &output, PrimitiveGroup &pg, Primitive } if (wayStoreRequiresNodes) { - osmStore.ways.insertNodes(nodeWays); + osmStore.ways.shard(shard).insertNodes(nodeWays); } else { - osmStore.ways.insertLatpLons(llWays); + osmStore.ways.shard(shard).insertLatpLons(llWays); } return true; @@ -244,7 +266,9 @@ bool PbfReader::ReadBlock( const BlockMetadata& 
blockMetadata, const unordered_set& nodeKeys, bool locationsOnWays, - ReadPhase phase + ReadPhase phase, + uint shard, + uint effectiveShards ) { infile.seekg(blockMetadata.offset); @@ -274,6 +298,9 @@ bool PbfReader::ReadBlock( std::ostringstream str; str << "\r"; void_mmap_allocator::reportStoreSize(str); + if (effectiveShards > 1) + str << std::to_string(shard + 1) << "/" << std::to_string(effectiveShards) << " "; + str << "Block " << blocksProcessed.load() << "/" << blocksToProcess.load() << " ways " << pg.ways_size() << " relations " << pg.relations_size() << " "; std::cout << str.str(); std::cout.flush(); @@ -304,7 +331,7 @@ bool PbfReader::ReadBlock( } if(phase == ReadPhase::Ways) { - bool done = ReadWays(output, pg, pb, locationsOnWays); + bool done = ReadWays(output, pg, pb, locationsOnWays, shard, effectiveShards); if(done) { output_progress(); ++read_groups; @@ -336,7 +363,7 @@ bool PbfReader::ReadBlock( // We can only delete blocks if we're confident we've processed everything, // which is not possible in the case of subdivided blocks. - return blockMetadata.chunks == 1; + return (shard + 1 == effectiveShards) && blockMetadata.chunks == 1; } bool blockHasPrimitiveGroupSatisfying( @@ -366,6 +393,7 @@ bool blockHasPrimitiveGroupSatisfying( } int PbfReader::ReadPbfFile( + uint shards, bool hasSortTypeThenID, unordered_set const& nodeKeys, unsigned int threadNum, @@ -463,95 +491,105 @@ int PbfReader::ReadPbfFile( std::vector all_phases = { ReadPhase::Nodes, ReadPhase::RelationScan, ReadPhase::Ways, ReadPhase::Relations }; for(auto phase: all_phases) { + uint effectiveShards = 1; + + // On memory-constrained machines, we might read ways multiple times in order + // to keep the working set of nodes limited. 
+ if (phase == ReadPhase::Ways) + effectiveShards = shards; + + for (int shard = 0; shard < effectiveShards; shard++) { #ifdef CLOCK_MONOTONIC - timespec start, end; - clock_gettime(CLOCK_MONOTONIC, &start); + timespec start, end; + clock_gettime(CLOCK_MONOTONIC, &start); #endif - // Launch the pool with threadNum threads - boost::asio::thread_pool pool(threadNum); - std::mutex block_mutex; - - // If we're in ReadPhase::Relations and there aren't many blocks left - // to read, increase parallelism by letting each thread only process - // a portion of the block. - if (phase == ReadPhase::Relations && blocks.size() < threadNum * 2) { - std::cout << "only " << blocks.size() << " relation blocks; subdividing for better parallelism" << std::endl; - std::map moreBlocks; - for (const auto& block : blocks) { - BlockMetadata newBlock = block.second; - newBlock.chunks = threadNum; - for (size_t i = 0; i < threadNum; i++) { - newBlock.chunk = i; - moreBlocks[moreBlocks.size()] = newBlock; + // Launch the pool with threadNum threads + boost::asio::thread_pool pool(threadNum); + std::mutex block_mutex; + + // If we're in ReadPhase::Relations and there aren't many blocks left + // to read, increase parallelism by letting each thread only process + // a portion of the block. 
+ if (phase == ReadPhase::Relations && blocks.size() < threadNum * 2) { + std::cout << "only " << blocks.size() << " relation blocks; subdividing for better parallelism" << std::endl; + std::map moreBlocks; + for (const auto& block : blocks) { + BlockMetadata newBlock = block.second; + newBlock.chunks = threadNum; + for (size_t i = 0; i < threadNum; i++) { + newBlock.chunk = i; + moreBlocks[moreBlocks.size()] = newBlock; + } } + blocks = moreBlocks; } - blocks = moreBlocks; - } - std::deque> blockRanges; - std::map filteredBlocks; - for (const auto& entry : blocks) { - if ((phase == ReadPhase::Nodes && entry.second.hasNodes) || - (phase == ReadPhase::RelationScan && entry.second.hasRelations) || - (phase == ReadPhase::Ways && entry.second.hasWays) || - (phase == ReadPhase::Relations && entry.second.hasRelations)) - filteredBlocks[entry.first] = entry.second; - } + std::deque> blockRanges; + std::map filteredBlocks; + for (const auto& entry : blocks) { + if ((phase == ReadPhase::Nodes && entry.second.hasNodes) || + (phase == ReadPhase::RelationScan && entry.second.hasRelations) || + (phase == ReadPhase::Ways && entry.second.hasWays) || + (phase == ReadPhase::Relations && entry.second.hasRelations)) + filteredBlocks[entry.first] = entry.second; + } - blocksToProcess = filteredBlocks.size(); - blocksProcessed = 0; - - // When processing blocks, we try to give each worker large batches - // of contiguous blocks, so that they might benefit from long runs - // of sorted indexes, and locality of nearby IDs. 
- const size_t batchSize = (filteredBlocks.size() / (threadNum * 8)) + 1; - - size_t consumed = 0; - auto it = filteredBlocks.begin(); - while(it != filteredBlocks.end()) { - std::vector blockRange; - blockRange.reserve(batchSize); - size_t max = consumed + batchSize; - for (; consumed < max && it != filteredBlocks.end(); consumed++) { - IndexedBlockMetadata ibm; - memcpy(&ibm, &it->second, sizeof(BlockMetadata)); - ibm.index = it->first; - blockRange.push_back(ibm); - it++; + blocksToProcess = filteredBlocks.size(); + blocksProcessed = 0; + + // When processing blocks, we try to give each worker large batches + // of contiguous blocks, so that they might benefit from long runs + // of sorted indexes, and locality of nearby IDs. + const size_t batchSize = (filteredBlocks.size() / (threadNum * 8)) + 1; + + size_t consumed = 0; + auto it = filteredBlocks.begin(); + while(it != filteredBlocks.end()) { + std::vector blockRange; + blockRange.reserve(batchSize); + size_t max = consumed + batchSize; + for (; consumed < max && it != filteredBlocks.end(); consumed++) { + IndexedBlockMetadata ibm; + memcpy(&ibm, &it->second, sizeof(BlockMetadata)); + ibm.index = it->first; + blockRange.push_back(ibm); + it++; + } + blockRanges.push_back(blockRange); } - blockRanges.push_back(blockRange); - } - { - for(const std::vector& blockRange: blockRanges) { - boost::asio::post(pool, [=, &blockRange, &blocks, &block_mutex, &nodeKeys]() { - if (phase == ReadPhase::Nodes) - osmStore.nodes.batchStart(); - if (phase == ReadPhase::Ways) - osmStore.ways.batchStart(); - - for (const IndexedBlockMetadata& indexedBlockMetadata: blockRange) { - auto infile = generate_stream(); - auto output = generate_output(); - - if(ReadBlock(*infile, *output, indexedBlockMetadata, nodeKeys, locationsOnWays, phase)) { - const std::lock_guard lock(block_mutex); - blocks.erase(indexedBlockMetadata.index); + { + for(const std::vector& blockRange: blockRanges) { + boost::asio::post(pool, [=, &blockRange, &blocks, 
&block_mutex, &nodeKeys]() { + if (phase == ReadPhase::Nodes) + osmStore.nodes.batchStart(); + if (phase == ReadPhase::Ways) + osmStore.ways.batchStart(); + + for (const IndexedBlockMetadata& indexedBlockMetadata: blockRange) { + auto infile = generate_stream(); + auto output = generate_output(); + + if(ReadBlock(*infile, *output, indexedBlockMetadata, nodeKeys, locationsOnWays, phase, shard, effectiveShards)) { + const std::lock_guard lock(block_mutex); + blocks.erase(indexedBlockMetadata.index); + } + blocksProcessed++; } - blocksProcessed++; - } - }); + }); + } } - } - - pool.join(); + + pool.join(); #ifdef CLOCK_MONOTONIC - clock_gettime(CLOCK_MONOTONIC, &end); - uint64_t elapsedNs = 1e9 * (end.tv_sec - start.tv_sec) + end.tv_nsec - start.tv_nsec; - std::cout << "(" << std::to_string((uint32_t)(elapsedNs / 1e6)) << " ms)" << std::endl; + clock_gettime(CLOCK_MONOTONIC, &end); + uint64_t elapsedNs = 1e9 * (end.tv_sec - start.tv_sec) + end.tv_nsec - start.tv_nsec; + std::cout << "(" << std::to_string((uint32_t)(elapsedNs / 1e6)) << " ms)" << std::endl; #endif + } + if(phase == ReadPhase::Nodes) { osmStore.nodes.finalize(threadNum); } diff --git a/src/sharded_node_store.cpp b/src/sharded_node_store.cpp index 92169986..3bb38563 100644 --- a/src/sharded_node_store.cpp +++ b/src/sharded_node_store.cpp @@ -1,5 +1,7 @@ #include "sharded_node_store.h" +thread_local size_t lastNodeShard = 0; + ShardedNodeStore::ShardedNodeStore(std::function()> createNodeStore): createNodeStore(createNodeStore) { for (int i = 0; i < shards(); i++) @@ -20,11 +22,17 @@ void ShardedNodeStore::finalize(size_t threadNum) { } LatpLon ShardedNodeStore::at(NodeID id) const { - // TODO: look in the last store we successfully found something, using - // a thread local - for (int i = 0; i < shards(); i++) - if (stores[i]->contains(0, id) || i == shards() - 1) - return stores[i]->at(id); + for (int i = 0; i < shards(); i++) { + size_t index = (lastNodeShard + i) % shards(); + + if 
(stores[index]->contains(0, id)) { + lastNodeShard = index; + return stores[index]->at(id); + } + } + + // Superfluous return to silence a compiler warning + return stores[shards() - 1]->at(id); } size_t ShardedNodeStore::size() const { diff --git a/src/sharded_way_store.cpp b/src/sharded_way_store.cpp new file mode 100644 index 00000000..f4285ff5 --- /dev/null +++ b/src/sharded_way_store.cpp @@ -0,0 +1,77 @@ +#include "sharded_way_store.h" +#include "node_store.h" + +thread_local size_t lastWayShard = 0; + +ShardedWayStore::ShardedWayStore(std::function()> createWayStore, const NodeStore& nodeStore): + createWayStore(createWayStore), + nodeStore(nodeStore) { + for (int i = 0; i < shards(); i++) + stores.push_back(createWayStore()); +} + +ShardedWayStore::~ShardedWayStore() { +} + +void ShardedWayStore::reopen() { + for (auto& store : stores) + store->reopen(); +} + +void ShardedWayStore::batchStart() { + for (auto& store : stores) + store->batchStart(); +} + +std::vector ShardedWayStore::at(WayID wayid) const { + for (int i = 0; i < shards(); i++) { + size_t index = (lastWayShard + i) % shards(); + if (stores[index]->contains(0, wayid)) { + lastWayShard = index; + return stores[index]->at(wayid); + } + } + + // Superfluous return to silence a compiler warning + return stores[shards() - 1]->at(wayid); +} + +bool ShardedWayStore::requiresNodes() const { + return stores[0]->requiresNodes(); +} + +void ShardedWayStore::insertLatpLons(std::vector &newWays) { + throw std::runtime_error("ShardedWayStore::insertLatpLons: don't call this directly"); +} + +void ShardedWayStore::insertNodes(const std::vector>>& newWays) { + throw std::runtime_error("ShardedWayStore::insertNodes: don't call this directly"); +} + +void ShardedWayStore::clear() { + for (auto& store : stores) + store->clear(); +} + +std::size_t ShardedWayStore::size() const { + size_t rv = 0; + for (auto& store : stores) + rv += store->size(); + return rv; +} + +void ShardedWayStore::finalize(unsigned int 
threadNum) { + for (auto& store : stores) + store->finalize(threadNum); +} + +bool ShardedWayStore::contains(size_t shard, WayID id) const { + return stores[shard]->contains(0, id); +} + +WayStore& ShardedWayStore::shard(size_t shard) { + return *stores[shard].get(); +} + +size_t ShardedWayStore::shards() const { return nodeStore.shards(); } + diff --git a/src/sorted_way_store.cpp b/src/sorted_way_store.cpp index e7ff4841..27ae6ae2 100644 --- a/src/sorted_way_store.cpp +++ b/src/sorted_way_store.cpp @@ -208,7 +208,7 @@ void SortedWayStore::insertLatpLons(std::vector &newWays throw std::runtime_error("SortedWayStore does not support insertLatpLons"); } -const void SortedWayStore::insertNodes(const std::vector>>& newWays) { +void SortedWayStore::insertNodes(const std::vector>>& newWays) { // read_pbf can call with an empty array if the only ways it read were unable to // be processed due to missing nodes, so be robust against empty way vector. if (newWays.empty()) diff --git a/src/tilemaker.cpp b/src/tilemaker.cpp index 3a32168a..e6f791de 100644 --- a/src/tilemaker.cpp +++ b/src/tilemaker.cpp @@ -171,7 +171,7 @@ int main(int argc, char* argv[]) { uint threadNum; string outputFile; string bbox; - bool _verbose = false, sqlite= false, mergeSqlite = false, mapsplit = false, osmStoreCompact = false, skipIntegrity = false, osmStoreUncompressedNodes = false, osmStoreUncompressedWays = false, materializeGeometries = false; + bool _verbose = false, sqlite= false, mergeSqlite = false, mapsplit = false, osmStoreCompact = false, skipIntegrity = false, osmStoreUncompressedNodes = false, osmStoreUncompressedWays = false, materializeGeometries = false, shardStores = false; bool logTileTimings = false; po::options_description desc("tilemaker " STR(TM_VERSION) "\nConvert OpenStreetMap .pbf files into vector tiles\n\nAvailable options"); @@ -188,6 +188,7 @@ int main(int argc, char* argv[]) { ("no-compress-nodes", po::bool_switch(&osmStoreUncompressedNodes), "Store nodes 
uncompressed") ("no-compress-ways", po::bool_switch(&osmStoreUncompressedWays), "Store ways uncompressed") ("materialize-geometries", po::bool_switch(&materializeGeometries), "Materialize geometries - faster, but requires more memory") + ("shard-stores", po::bool_switch(&shardStores), "Shard stores - use an alternate reading/writing strategy for low-memory machines") ("verbose",po::bool_switch(&_verbose), "verbose error output") ("skip-integrity",po::bool_switch(&skipIntegrity), "don't enforce way/node integrity") ("log-tile-timings", po::bool_switch(&logTileTimings), "log how long each tile takes") @@ -311,8 +312,7 @@ int main(int argc, char* argv[]) { shared_ptr nodeStore; - // TODO: make this a flag - if (true) { + if (shardStores) { nodeStore = std::make_shared(createNodeStore); } else { nodeStore = createNodeStore(); @@ -328,7 +328,12 @@ int main(int argc, char* argv[]) { return rv; }; - shared_ptr wayStore = createWayStore(); + shared_ptr wayStore; + if (shardStores) { + wayStore = std::make_shared(createWayStore, *nodeStore.get()); + } else { + wayStore = createWayStore(); + } OSMStore osmStore(*nodeStore.get(), *wayStore.get()); osmStore.use_compact_store(osmStoreCompact); @@ -389,6 +394,7 @@ int main(int argc, char* argv[]) { const bool hasSortTypeThenID = PbfHasOptionalFeature(inputFile, OptionSortTypeThenID); int ret = pbfReader.ReadPbfFile( + nodeStore->shards(), hasSortTypeThenID, nodeKeys, threadNum, @@ -477,6 +483,7 @@ int main(int argc, char* argv[]) { vector pbf = mapsplitFile.readTile(srcZ,srcX,tmsY); int ret = pbfReader.ReadPbfFile( + nodeStore->shards(), false, nodeKeys, 1, diff --git a/src/way_stores.cpp b/src/way_stores.cpp index e19cbf5a..790ad816 100644 --- a/src/way_stores.cpp +++ b/src/way_stores.cpp @@ -47,7 +47,7 @@ void BinarySearchWayStore::insertLatpLons(std::vector &n std::copy(std::make_move_iterator(newWays.begin()), std::make_move_iterator(newWays.end()), mLatpLonLists->begin() + i); } -const void 
return 0; // Arctic, Antarctica, Oceania, South Africa, South America
bool ShardedNodeStore::contains(size_t shard, NodeID id) const { } size_t ShardedNodeStore::shards() const { - return 8; + return 6; } From ffbd1942bd44bf743dfa1cc018e9e26be869d6d4 Mon Sep 17 00:00:00 2001 From: Colin Dellow Date: Mon, 18 Dec 2023 09:17:32 -0500 Subject: [PATCH 33/49] skip ReadPhase::Ways passes if node store is empty --- include/node_store.h | 2 ++ include/node_stores.h | 4 ++++ include/read_pbf.h | 4 +++- include/sharded_node_store.h | 2 ++ include/sorted_node_store.h | 2 ++ src/read_pbf.cpp | 9 ++++++++- src/tilemaker.cpp | 8 ++++++-- 7 files changed, 27 insertions(+), 4 deletions(-) diff --git a/include/node_store.h b/include/node_store.h index a2547fd5..76fe18b3 100644 --- a/include/node_store.h +++ b/include/node_store.h @@ -25,6 +25,8 @@ class NodeStore virtual LatpLon at(NodeID i) const = 0; virtual bool contains(size_t shard, NodeID id) const = 0; + virtual NodeStore& shard(size_t shard) = 0; + virtual const NodeStore& shard(size_t shard) const = 0; virtual size_t shards() const = 0; }; diff --git a/include/node_stores.h b/include/node_stores.h index 80a94868..2ef14b70 100644 --- a/include/node_stores.h +++ b/include/node_stores.h @@ -26,6 +26,8 @@ class BinarySearchNodeStore : public NodeStore void batchStart() {} bool contains(size_t shard, NodeID id) const override; + NodeStore& shard(size_t shard) override { return *this; } + const NodeStore& shard(size_t shard) const override { return *this; } size_t shards() const override { return 1; } @@ -59,6 +61,8 @@ class CompactNodeStore : public NodeStore // CompactNodeStore has no metadata to know whether or not it contains // a node, so it's not suitable for used in sharded scenarios. 
bool contains(size_t shard, NodeID id) const override { return true; } + NodeStore& shard(size_t shard) override { return *this; } + const NodeStore& shard(size_t shard) const override { return *this; } size_t shards() const override { return 1; } private: diff --git a/include/read_pbf.h b/include/read_pbf.h index 4ab44612..a9ffe29c 100644 --- a/include/read_pbf.h +++ b/include/read_pbf.h @@ -58,7 +58,9 @@ class PbfReader const std::unordered_set& nodeKeys, unsigned int threadNum, const pbfreader_generate_stream& generate_stream, - const pbfreader_generate_output& generate_output + const pbfreader_generate_output& generate_output, + const NodeStore& nodeStore, + const WayStore& wayStore ); // Read tags into a map from a way/node/relation diff --git a/include/sharded_node_store.h b/include/sharded_node_store.h index 44938126..ef001347 100644 --- a/include/sharded_node_store.h +++ b/include/sharded_node_store.h @@ -20,6 +20,8 @@ class ShardedNodeStore : public NodeStore { } bool contains(size_t shard, NodeID id) const override; + NodeStore& shard(size_t shard) override { return *stores[shard]; } + const NodeStore& shard(size_t shard) const override { return *stores[shard]; } size_t shards() const override; private: diff --git a/include/sorted_node_store.h b/include/sorted_node_store.h index 0e8d2e24..e2832df8 100644 --- a/include/sorted_node_store.h +++ b/include/sorted_node_store.h @@ -71,6 +71,8 @@ class SortedNodeStore : public NodeStore } bool contains(size_t shard, NodeID id) const override; + NodeStore& shard(size_t shard) override { return *this; } + const NodeStore& shard(size_t shard) const override { return *this; } size_t shards() const override { return 1; } private: diff --git a/src/read_pbf.cpp b/src/read_pbf.cpp index f371cded..d2395bde 100644 --- a/src/read_pbf.cpp +++ b/src/read_pbf.cpp @@ -398,7 +398,9 @@ int PbfReader::ReadPbfFile( unordered_set const& nodeKeys, unsigned int threadNum, const pbfreader_generate_stream& generate_stream, - const 
pbfreader_generate_output& generate_output + const pbfreader_generate_output& generate_output, + const NodeStore& nodeStore, + const WayStore& wayStore ) { auto infile = generate_stream(); @@ -499,6 +501,11 @@ int PbfReader::ReadPbfFile( effectiveShards = shards; for (int shard = 0; shard < effectiveShards; shard++) { + // If we're in ReadPhase::Ways, only do a pass if there is at least one + // entry in the pass's shard. + if (phase == ReadPhase::Ways && nodeStore.shard(shard).size() == 0) + continue; + #ifdef CLOCK_MONOTONIC timespec start, end; clock_gettime(CLOCK_MONOTONIC, &start); diff --git a/src/tilemaker.cpp b/src/tilemaker.cpp index e6f791de..1cb95a95 100644 --- a/src/tilemaker.cpp +++ b/src/tilemaker.cpp @@ -405,7 +405,9 @@ int main(int argc, char* argv[]) { [&]() { thread_local std::shared_ptr osmLuaProcessing(new OsmLuaProcessing(osmStore, config, layers, luaFile, shpMemTiles, osmMemTiles, attributeStore, materializeGeometries)); return osmLuaProcessing; - } + }, + *nodeStore, + *wayStore ); if (ret != 0) return ret; } @@ -492,7 +494,9 @@ int main(int argc, char* argv[]) { }, [&]() { return std::make_unique(osmStore, config, layers, luaFile, shpMemTiles, osmMemTiles, attributeStore, materializeGeometries); - } + }, + *nodeStore, + *wayStore ); if (ret != 0) return ret; From 0affec49fdd64f29137f1039d205076170eaed2b Mon Sep 17 00:00:00 2001 From: Colin Dellow Date: Mon, 18 Dec 2023 09:28:11 -0500 Subject: [PATCH 34/49] support multiple passes for ReadPhase::Relations --- include/read_pbf.h | 4 +++- include/sharded_way_store.h | 1 + include/sorted_way_store.h | 1 + include/way_store.h | 1 + include/way_stores.h | 1 + src/read_pbf.cpp | 25 ++++++++++++++++++++----- src/sharded_way_store.cpp | 4 ++++ 7 files changed, 31 insertions(+), 6 deletions(-) diff --git a/include/read_pbf.h b/include/read_pbf.h index a9ffe29c..94adb8e0 100644 --- a/include/read_pbf.h +++ b/include/read_pbf.h @@ -101,7 +101,9 @@ class PbfReader OsmLuaProcessing& output, 
PrimitiveGroup& pg, const PrimitiveBlock& pb, - const BlockMetadata& blockMetadata + const BlockMetadata& blockMetadata, + uint shard, + uint effectiveShards ); inline bool RelationIsType(Relation const &rel, int typeKey, int val) { diff --git a/include/sharded_way_store.h b/include/sharded_way_store.h index b57d03e0..40a3d331 100644 --- a/include/sharded_way_store.h +++ b/include/sharded_way_store.h @@ -23,6 +23,7 @@ class ShardedWayStore : public WayStore { bool contains(size_t shard, WayID id) const override; WayStore& shard(size_t shard) override; + const WayStore& shard(size_t shard) const override; size_t shards() const override; private: diff --git a/include/sorted_way_store.h b/include/sorted_way_store.h index 890a9a53..b99ba7de 100644 --- a/include/sorted_way_store.h +++ b/include/sorted_way_store.h @@ -97,6 +97,7 @@ class SortedWayStore: public WayStore { bool contains(size_t shard, WayID id) const override; WayStore& shard(size_t shard) override { return *this; } + const WayStore& shard(size_t shard) const override { return *this; } size_t shards() const override { return 1; } static uint16_t encodeWay( diff --git a/include/way_store.h b/include/way_store.h index c2b959c7..36862344 100644 --- a/include/way_store.h +++ b/include/way_store.h @@ -24,6 +24,7 @@ class WayStore { virtual bool contains(size_t shard, WayID id) const = 0; virtual WayStore& shard(size_t shard) = 0; + virtual const WayStore& shard(size_t shard) const = 0; virtual size_t shards() const = 0; }; diff --git a/include/way_stores.h b/include/way_stores.h index f66e3939..0f94e845 100644 --- a/include/way_stores.h +++ b/include/way_stores.h @@ -24,6 +24,7 @@ class BinarySearchWayStore: public WayStore { bool contains(size_t shard, WayID id) const override; WayStore& shard(size_t shard) override { return *this; } + const WayStore& shard(size_t shard) const override { return *this; } size_t shards() const override { return 1; } private: diff --git a/src/read_pbf.cpp b/src/read_pbf.cpp index 
d2395bde..55d2a6f3 100644 --- a/src/read_pbf.cpp +++ b/src/read_pbf.cpp @@ -206,7 +206,9 @@ bool PbfReader::ReadRelations( OsmLuaProcessing& output, PrimitiveGroup& pg, const PrimitiveBlock& pb, - const BlockMetadata& blockMetadata + const BlockMetadata& blockMetadata, + uint shard, + uint effectiveShards ) { // ---- Read relations @@ -232,15 +234,24 @@ bool PbfReader::ReadRelations( WayVec outerWayVec, innerWayVec; int64_t lastID = 0; bool isInnerOuter = isBoundary || isMultiPolygon; + bool skipToNext = false; for (int n=0; n < pbfRelation.memids_size(); n++) { lastID += pbfRelation.memids(n); if (pbfRelation.types(n) != Relation_MemberType_WAY) { continue; } int32_t role = pbfRelation.roles_sid(n); if (role==innerKey || role==outerKey) isInnerOuter=true; WayID wayId = static_cast(lastID); + + if (n == 0 && effectiveShards > 0 && !osmStore.ways.contains(shard, wayId)) { + skipToNext = true; + break; + } (role == innerKey ? innerWayVec : outerWayVec).push_back(wayId); } + if (skipToNext) + continue; + try { tag_map_t tags; readTags(pbfRelation, pb, tags); @@ -340,7 +351,7 @@ bool PbfReader::ReadBlock( } if(phase == ReadPhase::Relations) { - bool done = ReadRelations(output, pg, pb, blockMetadata); + bool done = ReadRelations(output, pg, pb, blockMetadata, shard, effectiveShards); if(done) { output_progress(); ++read_groups; @@ -495,9 +506,9 @@ int PbfReader::ReadPbfFile( for(auto phase: all_phases) { uint effectiveShards = 1; - // On memory-constrained machines, we might read ways multiple times in order - // to keep the working set of nodes limited. - if (phase == ReadPhase::Ways) + // On memory-constrained machines, we might read ways/relations + // multiple times in order to keep the working set of nodes limited. 
+ if (phase == ReadPhase::Ways || phase == ReadPhase::Relations) effectiveShards = shards; for (int shard = 0; shard < effectiveShards; shard++) { @@ -506,6 +517,10 @@ int PbfReader::ReadPbfFile( if (phase == ReadPhase::Ways && nodeStore.shard(shard).size() == 0) continue; + // Ditto, but for relations + if (phase == ReadPhase::Relations && wayStore.shard(shard).size() == 0) + continue; + #ifdef CLOCK_MONOTONIC timespec start, end; clock_gettime(CLOCK_MONOTONIC, &start); diff --git a/src/sharded_way_store.cpp b/src/sharded_way_store.cpp index f4285ff5..d9741082 100644 --- a/src/sharded_way_store.cpp +++ b/src/sharded_way_store.cpp @@ -73,5 +73,9 @@ WayStore& ShardedWayStore::shard(size_t shard) { return *stores[shard].get(); } +const WayStore& ShardedWayStore::shard(size_t shard) const { + return *stores[shard].get(); +} + size_t ShardedWayStore::shards() const { return nodeStore.shards(); } From 3a2c87aab8c24308934bf7e2f9ea0c40591c63f2 Mon Sep 17 00:00:00 2001 From: Colin Dellow Date: Mon, 18 Dec 2023 09:32:16 -0500 Subject: [PATCH 35/49] fix check for first way --- src/read_pbf.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/read_pbf.cpp b/src/read_pbf.cpp index 55d2a6f3..2d1e73fc 100644 --- a/src/read_pbf.cpp +++ b/src/read_pbf.cpp @@ -235,6 +235,7 @@ bool PbfReader::ReadRelations( int64_t lastID = 0; bool isInnerOuter = isBoundary || isMultiPolygon; bool skipToNext = false; + bool firstWay = true; for (int n=0; n < pbfRelation.memids_size(); n++) { lastID += pbfRelation.memids(n); if (pbfRelation.types(n) != Relation_MemberType_WAY) { continue; } @@ -242,10 +243,12 @@ bool PbfReader::ReadRelations( if (role==innerKey || role==outerKey) isInnerOuter=true; WayID wayId = static_cast(lastID); - if (n == 0 && effectiveShards > 0 && !osmStore.ways.contains(shard, wayId)) { + if (firstWay && effectiveShards > 0 && !osmStore.ways.contains(shard, wayId)) { skipToNext = true; break; } + if (firstWay) + firstWay = false; (role == innerKey ? 
innerWayVec : outerWayVec).push_back(wayId); } From f499e344a8b90ec9aa24f8cd75c6338a92e86b45 Mon Sep 17 00:00:00 2001 From: Colin Dellow Date: Mon, 18 Dec 2023 17:47:49 -0500 Subject: [PATCH 36/49] adjust shards With this distribution, no node shard is more than ~8.5GB. --- src/sharded_node_store.cpp | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/src/sharded_node_store.cpp b/src/sharded_node_store.cpp index 4c222187..964d61fa 100644 --- a/src/sharded_node_store.cpp +++ b/src/sharded_node_store.cpp @@ -49,21 +49,24 @@ void ShardedNodeStore::batchStart() { } size_t pickStore(const LatpLon& el) { - // Assign the element to a store. This is pretty naive, we could likely do better-- - // Europe still basically gets its own bucket, but probably should be split up - // more. + // Assign the element to a shard. This is a pretty naive division + // of the globe, tuned to have max ~10GB of nodes/ways per shard. - const size_t z4x = lon2tilex(el.lon / 10000000, 4); - const size_t z4y = latp2tiley(el.latp / 10000000, 4); + const size_t z5x = lon2tilex(el.lon / 10000000, 5); + const size_t z5y = latp2tiley(el.latp / 10000000, 5); + + const size_t z4x = z5x / 2; + const size_t z4y = z5y / 2; const size_t z3x = z4x / 2; const size_t z3y = z4y / 2; - if (z3x == 5 && z3y == 2) return 5; // Western Russia - if (z3x == 4 && z3y == 3) return 5; // North Africa - if (z3x == 5 && z3y == 3) return 5; // India + if (z3x == 5 && z3y == 2) return 6; // Western Russia + if (z3x == 4 && z3y == 3) return 6; // North Africa + if (z3x == 5 && z3y == 3) return 6; // India - if (z4x == 8 && z4y == 5) return 4; // some of Central Europe + if ((z5x == 16 && z5y == 10) || (z5x == 16 && z5y == 11)) return 5; // some of Central Europe + if ((z5x == 17 && z5y == 10) || (z5x == 17 && z5y == 11)) return 4; // some more of Central Europe if (z3x == 4 && z3y == 2) return 3; // rest of Central Europe @@ -96,5 +99,5 @@ bool ShardedNodeStore::contains(size_t shard, 
NodeID id) const { } size_t ShardedNodeStore::shards() const { - return 6; + return 7; } From bbf0957c1eb1cca7e35e1aa36e8a672e22a65034 Mon Sep 17 00:00:00 2001 From: Colin Dellow Date: Tue, 19 Dec 2023 00:28:40 -0500 Subject: [PATCH 37/49] Relations: fix effectiveShards > 1 check Oops, bug that very moderately affected performance in the non `--shard-stores` case --- src/read_pbf.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/read_pbf.cpp b/src/read_pbf.cpp index 2d1e73fc..9b8b2f15 100644 --- a/src/read_pbf.cpp +++ b/src/read_pbf.cpp @@ -243,7 +243,7 @@ bool PbfReader::ReadRelations( if (role==innerKey || role==outerKey) isInnerOuter=true; WayID wayId = static_cast(lastID); - if (firstWay && effectiveShards > 0 && !osmStore.ways.contains(shard, wayId)) { + if (firstWay && effectiveShards > 1 && !osmStore.ways.contains(shard, wayId)) { skipToNext = true; break; } From 4130f513c55d79943dff48232fd038800a1314d8 Mon Sep 17 00:00:00 2001 From: Colin Dellow Date: Fri, 22 Dec 2023 00:03:02 -0500 Subject: [PATCH 38/49] extend --materialize-geometries to LayerAsCentroid It turns out that about 20% of LayerAsCentroid calls are for nodes, which this branch could already do. The remaining calls are predominantly ways, e.g. housenumbers. We always materialize relation centroids, as they're expensive to compute. In GB, this saves about 6.4M points, ~102M. Scaled to the planet, it's perhaps a 4.5GB savings, which should let us use a more aggressive shard strategy. It seems to add 3-4 seconds to the time to process GB. 
--- include/osm_mem_tiles.h | 18 +++++++++--------- include/tile_data.h | 12 +++++++----- src/osm_lua_processing.cpp | 16 +++++++++++++++- src/osm_mem_tiles.cpp | 25 ++++++++++++++----------- src/tile_data.cpp | 30 ++++++++++-------------------- src/tile_worker.cpp | 2 +- 6 files changed, 56 insertions(+), 47 deletions(-) diff --git a/include/osm_mem_tiles.h b/include/osm_mem_tiles.h index e7aff7ee..3c920b08 100644 --- a/include/osm_mem_tiles.h +++ b/include/osm_mem_tiles.h @@ -9,12 +9,12 @@ // NB: Currently, USE_NODE_STORE and USE_WAY_STORE are equivalent. // If we permit LayerAsCentroid to be generated from the OSM stores, // this will have to change. -#define OSM_THRESHOLD (1ull << 35) -#define USE_NODE_STORE (1ull << 35) -#define IS_NODE(x) (((x) >> 35) == (USE_NODE_STORE >> 35)) -#define USE_WAY_STORE (1ull << 35) -#define IS_WAY(x) (((x) >> 35) == (USE_WAY_STORE >> 35)) -#define OSM_ID(x) ((x) & 0b11111111111111111111111111111111111) +#define OSM_THRESHOLD (1ull << TILE_DATA_ID_SIZE) +#define USE_NODE_STORE (2ull << TILE_DATA_ID_SIZE) +#define IS_NODE(x) (((x) >> TILE_DATA_ID_SIZE) == (USE_NODE_STORE >> TILE_DATA_ID_SIZE)) +#define USE_WAY_STORE (1ull << TILE_DATA_ID_SIZE) +#define IS_WAY(x) (((x) >> TILE_DATA_ID_SIZE) == (USE_WAY_STORE >> TILE_DATA_ID_SIZE)) +#define OSM_ID(x) ((x) & 0b1111111111111111111111111111111111) class NodeStore; class WayStore; @@ -44,14 +44,14 @@ class OsmMemTiles : public TileDataSource { const NodeID objectID, const TileBbox &bbox ) override; - LatpLon buildNodeGeometry(OutputGeometryType const geomType, NodeID const objectID, const TileBbox &bbox) const override; + LatpLon buildNodeGeometry(NodeID const objectID, const TileBbox &bbox) const override; void Clear(); private: - void populateLinestring(Linestring& ls, NodeID objectID); - Linestring& getOrBuildLinestring(NodeID objectID); + void populateLinestring(Linestring& ls, NodeID objectID) const; + Linestring& getOrBuildLinestring(NodeID objectID) const; void 
populateMultiPolygon(MultiPolygon& dst, NodeID objectID) override; const NodeStore& nodeStore; diff --git a/include/tile_data.h b/include/tile_data.h index 2c5fe4df..6b59ee3f 100644 --- a/include/tile_data.h +++ b/include/tile_data.h @@ -12,6 +12,8 @@ #include "clip_cache.h" #include "mmap_allocator.h" +#define TILE_DATA_ID_SIZE 34 + typedef std::vector SourceList; class TileBbox; @@ -407,7 +409,7 @@ class TileDataSource { ); virtual Geometry buildWayGeometry(OutputGeometryType const geomType, NodeID const objectID, const TileBbox &bbox); - virtual LatpLon buildNodeGeometry(OutputGeometryType const geomType, NodeID const objectID, const TileBbox &bbox) const; + virtual LatpLon buildNodeGeometry(NodeID const objectID, const TileBbox &bbox) const; void open() { // Put something at index 0 of all stores so that 0 can be used @@ -425,18 +427,18 @@ class TileDataSource { NodeID storePoint(Point const &input); inline size_t getShard(NodeID id) const { - // Note: we only allocate 35 bits for the IDs. This allows us to - // use bit 36 for TileDataSource-specific handling (e.g., + // Note: we only allocate 34 bits for the IDs. This allows us to + // use bits 35 and 36 for TileDataSource-specific handling (e.g., // OsmMemTiles may want to generate points/ways on the fly by // referring to the WayStore). 
- return id >> (35 - shardBits); + return id >> (TILE_DATA_ID_SIZE - shardBits); } virtual void populateMultiPolygon(MultiPolygon& dst, NodeID objectID); inline size_t getId(NodeID id) const { - return id & (~(~0ull << (35 - shardBits))); + return id & (~(~0ull << (TILE_DATA_ID_SIZE - shardBits))); } const Point& retrievePoint(NodeID id) const { diff --git a/src/osm_lua_processing.cpp b/src/osm_lua_processing.cpp index a90c8b6a..faf69ec7 100644 --- a/src/osm_lua_processing.cpp +++ b/src/osm_lua_processing.cpp @@ -468,7 +468,21 @@ void OsmLuaProcessing::LayerAsCentroid(const string &layerName) { return; } - NodeID id = osmMemTiles.storePoint(geomp); + NodeID id = 0; + // We don't do lazy centroids for relations - calculating their centroid + // can be quite expensive, and there's not as many of them as there are + // ways. + if (materializeGeometries || isRelation) { + id = osmMemTiles.storePoint(geomp); + } else if (!isRelation && !isWay) { + // Sometimes people call LayerAsCentroid(...) on a node, because they're + // writing a generic handler that doesn't know if it's a node or a way, + // e.g. POIs. 
+ id = USE_NODE_STORE | originalOsmID; + } else { + id = USE_WAY_STORE | originalOsmID; + wayEmitted = true; + } OutputObject oo(POINT_, layers.layerMap[layerName], id, 0, layerMinZoom); outputs.push_back(std::make_pair(std::move(oo), attributes)); } diff --git a/src/osm_mem_tiles.cpp b/src/osm_mem_tiles.cpp index 5cfc3c3d..7dc03f45 100644 --- a/src/osm_mem_tiles.cpp +++ b/src/osm_mem_tiles.cpp @@ -19,24 +19,27 @@ OsmMemTiles::OsmMemTiles( } LatpLon OsmMemTiles::buildNodeGeometry( - OutputGeometryType const geomType, NodeID const objectID, const TileBbox &bbox ) const { if (objectID < OSM_THRESHOLD) { - return TileDataSource::buildNodeGeometry(geomType, objectID, bbox); + return TileDataSource::buildNodeGeometry(objectID, bbox); } - switch(geomType) { - case POINT_: { - return nodeStore.at(OSM_ID(objectID)); - } + if (IS_NODE(objectID)) + return nodeStore.at(OSM_ID(objectID)); + - default: - break; + if (IS_WAY(objectID)) { + Linestring& ls = getOrBuildLinestring(objectID); + Point centroid; + Polygon p; + geom::assign_points(p, ls); + geom::centroid(p, centroid); + return LatpLon{(int32_t)(centroid.y()*10000000.0), (int32_t)(centroid.x()*10000000.0)}; } - throw std::runtime_error("Geometry type is not point"); + throw std::runtime_error("OsmMemTiles::buildNodeGeometry: unsupported objectID"); } Geometry OsmMemTiles::buildWayGeometry( @@ -79,7 +82,7 @@ Geometry OsmMemTiles::buildWayGeometry( throw std::runtime_error("buildWayGeometry: unexpected objectID: " + std::to_string(objectID)); } -void OsmMemTiles::populateLinestring(Linestring& ls, NodeID objectID) { +void OsmMemTiles::populateLinestring(Linestring& ls, NodeID objectID) const { std::vector nodes = wayStore.at(OSM_ID(objectID)); for (const LatpLon& node : nodes) { @@ -87,7 +90,7 @@ void OsmMemTiles::populateLinestring(Linestring& ls, NodeID objectID) { } } -Linestring& OsmMemTiles::getOrBuildLinestring(NodeID objectID) { +Linestring& OsmMemTiles::getOrBuildLinestring(NodeID objectID) const { // Note: this 
function returns a reference, not a shared_ptr. // // This is safe, because this function is the only thing that can diff --git a/src/tile_data.cpp b/src/tile_data.cpp index 8a8053bf..f78bbdda 100644 --- a/src/tile_data.cpp +++ b/src/tile_data.cpp @@ -339,22 +339,12 @@ Geometry TileDataSource::buildWayGeometry(OutputGeometryType const geomType, } } -LatpLon TileDataSource::buildNodeGeometry(OutputGeometryType const geomType, - NodeID const objectID, const TileBbox &bbox) const { - switch(geomType) { - case POINT_: { - auto p = retrievePoint(objectID); - LatpLon out; - out.latp = p.y(); - out.lon = p.x(); - return out; - } - - default: - break; - } - - throw std::runtime_error("Geometry type is not point"); +LatpLon TileDataSource::buildNodeGeometry(NodeID const objectID, const TileBbox &bbox) const { + auto p = retrievePoint(objectID); + LatpLon out; + out.latp = p.y(); + out.lon = p.x(); + return out; } @@ -538,7 +528,7 @@ NodeID TileDataSource::storePoint(const Point& input) { NodeID offset = store.second->size(); store.second->emplace_back(input); - NodeID rv = (store.first << (35 - shardBits)) + offset; + NodeID rv = (store.first << (TILE_DATA_ID_SIZE - shardBits)) + offset; return rv; } @@ -548,7 +538,7 @@ NodeID TileDataSource::storeLinestring(const Linestring& src) { NodeID offset = store.second->size(); store.second->emplace_back(std::move(dst)); - NodeID rv = (store.first << (35 - shardBits)) + offset; + NodeID rv = (store.first << (TILE_DATA_ID_SIZE - shardBits)) + offset; return rv; } @@ -570,7 +560,7 @@ NodeID TileDataSource::storeMultiPolygon(const MultiPolygon& src) { NodeID offset = store.second->size(); store.second->emplace_back(std::move(dst)); - NodeID rv = (store.first << (35 - shardBits)) + offset; + NodeID rv = (store.first << (TILE_DATA_ID_SIZE - shardBits)) + offset; return rv; } @@ -585,7 +575,7 @@ NodeID TileDataSource::storeMultiLinestring(const MultiLinestring& src) { NodeID offset = store.second->size(); 
store.second->emplace_back(std::move(dst)); - NodeID rv = (store.first << (35 - shardBits)) + offset; + NodeID rv = (store.first << (TILE_DATA_ID_SIZE - shardBits)) + offset; return rv; } diff --git a/src/tile_worker.cpp b/src/tile_worker.cpp index 07d8320a..d59e7fef 100644 --- a/src/tile_worker.cpp +++ b/src/tile_worker.cpp @@ -176,7 +176,7 @@ void ProcessObjects( if (oo.oo.geomType == POINT_) { vector_tile::Tile_Feature *featurePtr = vtLayer->add_features(); - LatpLon pos = source->buildNodeGeometry(oo.oo.geomType, oo.oo.objectID, bbox); + LatpLon pos = source->buildNodeGeometry(oo.oo.objectID, bbox); featurePtr->add_geometry(9); // moveTo, repeat x1 pair xy = bbox.scaleLatpLon(pos.latp/10000000.0, pos.lon/10000000.0); featurePtr->add_geometry((xy.first << 1) ^ (xy.first >> 31)); From d6d3f0ee3f86cf63eaf7e6060c932c4c9ea2c819 Mon Sep 17 00:00:00 2001 From: Colin Dellow Date: Sat, 23 Dec 2023 17:26:55 -0500 Subject: [PATCH 39/49] add `DequeMap`, change AttributeStore to use it This implements the idea in https://github.com/systemed/tilemaker/issues/622#issuecomment-1866813888 Rather than storing a `deque` and a `flat_map`, store a `deque` and `vector`, to save 8 bytes per AttributePair and AttributeSet. 
--- Makefile | 6 ++ include/attribute_store.h | 93 +++++++++++------------- include/deque_map.h | 128 ++++++++++++++++++++++++++++++++++ src/attribute_store.cpp | 79 +++++++++------------ test/attribute_store.test.cpp | 1 - test/deque_map.test.cpp | 63 +++++++++++++++++ 6 files changed, 272 insertions(+), 98 deletions(-) create mode 100644 include/deque_map.h create mode 100644 test/deque_map.test.cpp diff --git a/Makefile b/Makefile index 81779d79..0cdd9935 100644 --- a/Makefile +++ b/Makefile @@ -130,6 +130,7 @@ tilemaker: \ test: \ test_append_vector \ test_attribute_store \ + test_deque_map \ test_pooled_string \ test_sorted_node_store \ test_sorted_way_store @@ -146,6 +147,11 @@ test_attribute_store: \ test/attribute_store.test.o $(CXX) $(CXXFLAGS) -o test.attribute_store $^ $(INC) $(LIB) $(LDFLAGS) && ./test.attribute_store +test_deque_map: \ + test/deque_map.test.o + $(CXX) $(CXXFLAGS) -o test.deque_map $^ $(INC) $(LIB) $(LDFLAGS) && ./test.deque_map + + test_pooled_string: \ src/mmap_allocator.o \ src/pooled_string.o \ diff --git a/include/attribute_store.h b/include/attribute_store.h index 194f62f6..3aea19cf 100644 --- a/include/attribute_store.h +++ b/include/attribute_store.h @@ -11,6 +11,7 @@ #include #include #include "pooled_string.h" +#include "deque_map.h" /* AttributeStore - global dictionary for attributes */ @@ -90,6 +91,19 @@ struct AttributePair { return *this; } + bool operator<(const AttributePair& other) const { + if (minzoom != other.minzoom) + return minzoom < other.minzoom; + if (keyIndex != other.keyIndex) + return keyIndex < other.keyIndex; + if (valueType != other.valueType) return valueType < other.valueType; + + if (hasStringValue()) return pooledString() < other.pooledString(); + if (hasBoolValue()) return boolValue() < other.boolValue(); + if (hasFloatValue()) return floatValue() < other.floatValue(); + throw std::runtime_error("Invalid type in attribute store"); + } + bool operator==(const AttributePair &other) const { if 
(minzoom!=other.minzoom || keyIndex!=other.keyIndex || valueType!=other.valueType) return false; if (valueType == AttributePairType::String) @@ -173,16 +187,14 @@ class AttributePairStore { public: AttributePairStore(): finalized(false), - pairs(ATTRIBUTE_SHARDS), - pairsMaps(ATTRIBUTE_SHARDS), - pairsMutex(ATTRIBUTE_SHARDS), - hotShardSize(0) + pairsMutex(ATTRIBUTE_SHARDS) { - // NB: the hot shard is stored in its own, pre-allocated vector. - // pairs[0] is _not_ the hot shard - hotShard.reserve(1 << 16); - for (size_t i = 0; i < 1 << 16; i++) - hotShard.push_back(AttributePair(0, false, 0)); + // The "hot" shard has a capacity of 64K, the others are unbounded. + pairs.push_back(DequeMap(1 << 16)); + // Reserve offset 0 as a sentinel + pairs[0].add(AttributePair(0, false, 0)); + for (size_t i = 1; i < ATTRIBUTE_SHARDS; i++) + pairs.push_back(DequeMap()); } void finalize() { finalized = true; } @@ -190,23 +202,7 @@ class AttributePairStore { const AttributePair& getPairUnsafe(uint32_t i) const; uint32_t addPair(AttributePair& pair, bool isHot); - struct key_value_less_ptr { - bool operator()(AttributePair const* lhs, AttributePair const* rhs) const { - if (lhs->minzoom != rhs->minzoom) - return lhs->minzoom < rhs->minzoom; - if (lhs->keyIndex != rhs->keyIndex) - return lhs->keyIndex < rhs->keyIndex; - if (lhs->valueType != rhs->valueType) return lhs->valueType < rhs->valueType; - - if (lhs->hasStringValue()) return lhs->pooledString() < rhs->pooledString(); - if (lhs->hasBoolValue()) return lhs->boolValue() < rhs->boolValue(); - if (lhs->hasFloatValue()) return lhs->floatValue() < rhs->floatValue(); - throw std::runtime_error("Invalid type in attribute store"); - } - }; - - std::vector> pairs; - std::vector> pairsMaps; + std::vector> pairs; private: bool finalized; @@ -218,41 +214,37 @@ class AttributePairStore { // we suspect will be popular. It only ever has 64KB items, // so that we can reference it with a short. 
mutable std::vector pairsMutex; - std::atomic hotShardSize; - std::vector hotShard; }; // AttributeSet is a set of AttributePairs // = the complete attributes for one object struct AttributeSet { - struct less_ptr { - bool operator()(const AttributeSet* lhs, const AttributeSet* rhs) const { - if (lhs->useVector != rhs->useVector) - return lhs->useVector < rhs->useVector; - - if (lhs->useVector) { - if (lhs->intValues.size() != rhs->intValues.size()) - return lhs->intValues.size() < rhs->intValues.size(); - - for (int i = 0; i < lhs->intValues.size(); i++) { - if (lhs->intValues[i] != rhs->intValues[i]) { - return lhs->intValues[i] < rhs->intValues[i]; - } - } + bool operator<(const AttributeSet& other) const { + if (useVector != other.useVector) + return useVector < other.useVector; - return false; - } + if (useVector) { + if (intValues.size() != other.intValues.size()) + return intValues.size() < other.intValues.size(); - for (int i = 0; i < sizeof(lhs->shortValues)/sizeof(lhs->shortValues[0]); i++) { - if (lhs->shortValues[i] != rhs->shortValues[i]) { - return lhs->shortValues[i] < rhs->shortValues[i]; + for (int i = 0; i < intValues.size(); i++) { + if (intValues[i] != other.intValues[i]) { + return intValues[i] < other.intValues[i]; } } return false; } - }; + + for (int i = 0; i < sizeof(shortValues)/sizeof(shortValues[0]); i++) { + if (shortValues[i] != other.shortValues[i]) { + return shortValues[i] < other.shortValues[i]; + } + } + + return false; + } size_t hash() const { // Values are in canonical form after finalizeSet is called, so @@ -273,6 +265,7 @@ struct AttributeSet { return idx; } + bool operator!=(const AttributeSet& other) const { return !(*this == other); } bool operator==(const AttributeSet &other) const { // Equivalent if, for every value in values, there is a value in other.values // whose pair is the same. 
@@ -412,7 +405,6 @@ struct AttributeStore { AttributeStore(): finalized(false), sets(ATTRIBUTE_SHARDS), - setsMaps(ATTRIBUTE_SHARDS), setsMutex(ATTRIBUTE_SHARDS), lookups(0) { } @@ -422,8 +414,7 @@ struct AttributeStore { private: bool finalized; - std::vector> sets; - std::vector> setsMaps; + std::vector> sets; mutable std::vector setsMutex; mutable std::mutex mutex; diff --git a/include/deque_map.h b/include/deque_map.h new file mode 100644 index 00000000..2ec20387 --- /dev/null +++ b/include/deque_map.h @@ -0,0 +1,128 @@ +#ifndef DEQUE_MAP_H +#define DEQUE_MAP_H + +#include +#include +#include +#include +#include + +// A class which looks deep within the soul of some instance of +// a class T and assigns it a number based on the order in which +// it joined (or reminds it of its number). +// +// Used to translate an 8-byte pointer into a 4-byte ID that can be +// used repeatedly. +template +class DequeMap { +public: + DequeMap(): maxSize(0) {} + DequeMap(uint32_t maxSize): maxSize(maxSize) {} + + bool full() const { + return maxSize != 0 && size() == maxSize; + } + + // If `entry` is already in the map, return its index. + // Otherwise, if maxSize is `0`, or greater than the number of entries in the map, + // add the item and return its index. + // Otherwise, return -1. + int32_t add(const T& entry) { + // Search to see if we've already got this entry. + const auto offsets = boost::irange(0, keys.size()); + const auto it = std::lower_bound( + offsets.begin(), + offsets.end(), + entry, + [&](const auto &e, auto id) { + return objects.at(keys[e]) < id; + } + ); + + // We do, return its index. + if (it != offsets.end() && objects[keys[*it]] == entry) + return keys[*it]; + + if (maxSize > 0 && objects.size() >= maxSize) + return -1; + + // We don't, so store it... + const uint32_t newIndex = objects.size(); + objects.push_back(entry); + + // ...and add its index to our keys vector. + const uint32_t keysOffset = it == offsets.end() ? 
offsets.size() : *it; + + const uint32_t desiredSize = keys.size() + 1; + + // Amortize growth + if (keys.capacity() < desiredSize) + keys.reserve(keys.capacity() * 1.5); + + keys.resize(desiredSize); + + // Unless we're adding to the end, we need to shuffle existing keys down + // to make room for our new index. + if (keysOffset != newIndex) { + std::memmove(&keys[keysOffset + 1], &keys[keysOffset], sizeof(uint32_t) * (keys.size() - 1 - keysOffset)); + } + + keys[keysOffset] = newIndex; + return newIndex; + } + + void clear() { + objects.clear(); + keys.clear(); + } + + // Returns the index of `entry` if present, -1 otherwise. + int32_t find(const T& entry) const { + const auto offsets = boost::irange(0, keys.size()); + const auto it = std::lower_bound( + offsets.begin(), + offsets.end(), + entry, + [&](const auto &e, auto id) { + return objects.at(keys[e]) < id; + } + ); + + // We do, return its index. + if (it != offsets.end() && objects[keys[*it]] == entry) + return keys[*it]; + + return -1; + } + + const T& at(uint32_t index) const { + return objects.at(index); + } + + size_t size() const { return objects.size(); } + + struct iterator { + const DequeMap& dm; + int offset; + iterator(const DequeMap& dm, int offset): dm(dm), offset(offset) {} + void operator++() { offset++; } + bool operator!=(iterator& other) { return offset != other.offset; } + const T& operator*() const { return dm.objects[dm.keys[offset]]; } + }; + + iterator begin() const { return iterator{*this, 0}; } + iterator end() const { return iterator{*this, keys.size()}; } + +private: + uint32_t maxSize; + + // Using a deque is necessary, as it provides pointer-stability for previously + // added objects when it grows the storage (as opposed to, e.g., vector). + std::deque objects; + + // Whereas `objects` is ordered by insertion-time, `keys` is sorted such that + // objects[key[0]] < objects[key[1]] < ... < objects[key[$]] + // operator< of T. 
+ std::vector keys; +}; +#endif diff --git a/src/attribute_store.cpp b/src/attribute_store.cpp index 71c0925b..6fbacbe9 100644 --- a/src/attribute_store.cpp +++ b/src/attribute_store.cpp @@ -66,14 +66,22 @@ void AttributePair::ensureStringIsOwned() { } // AttributePairStore -thread_local boost::container::flat_map tlsHotShardMap; -thread_local uint16_t tlsHotShardSize = 0; +thread_local DequeMap tlsHotShard(1 << 16); const AttributePair& AttributePairStore::getPair(uint32_t i) const { uint32_t shard = i >> (32 - SHARD_BITS); uint32_t offset = i & (~(~0u << (32 - SHARD_BITS))); - if (shard == 0) - return hotShard[offset]; + if (shard == 0) { + if (offset < tlsHotShard.size()) + return tlsHotShard.at(offset); + + { + std::lock_guard lock(pairsMutex[0]); + tlsHotShard = pairs[0]; + } + + return tlsHotShard.at(offset); + } std::lock_guard lock(pairsMutex[shard]); return pairs[shard].at(offset); @@ -86,9 +94,6 @@ const AttributePair& AttributePairStore::getPairUnsafe(uint32_t i) const { uint32_t shard = i >> (32 - SHARD_BITS); uint32_t offset = i & (~(~0u << (32 - SHARD_BITS))); - if (shard == 0) - return hotShard[offset]; - return pairs[shard].at(offset); }; @@ -96,35 +101,29 @@ uint32_t AttributePairStore::addPair(AttributePair& pair, bool isHot) { if (isHot) { { // First, check our thread-local map. - const auto& it = tlsHotShardMap.find(&pair); - if (it != tlsHotShardMap.end()) - return it->second; + const auto& index = tlsHotShard.find(pair); + if (index != -1) + return index; } + // Not found, ensure our local map is up-to-date for future calls, // and fall through to the main map. - // - // Note that we can read `hotShard` without a lock, its size is fixed - while (tlsHotShardSize < hotShardSize.load()) { - tlsHotShardSize++; - tlsHotShardMap[&hotShard[tlsHotShardSize]] = tlsHotShardSize; + if (!tlsHotShard.full()) { + std::lock_guard lock(pairsMutex[0]); + tlsHotShard = pairs[0]; } // This might be a popular pair, worth re-using. 
// Have we already assigned it a hot ID? std::lock_guard lock(pairsMutex[0]); - const auto& it = pairsMaps[0].find(&pair); - if (it != pairsMaps[0].end()) - return it->second; - - if (hotShardSize.load() < 1 << 16) { - hotShardSize++; - uint32_t offset = hotShardSize.load(); + const auto& index = pairs[0].find(pair); + if (index != -1) + return index; + if (!pairs[0].full()) { pair.ensureStringIsOwned(); - hotShard[offset] = pair; - const AttributePair* ptr = &hotShard[offset]; + uint32_t offset = pairs[0].add(pair); uint32_t rv = (0 << (32 - SHARD_BITS)) + offset; - pairsMaps[0][ptr] = rv; return rv; } } @@ -141,21 +140,17 @@ uint32_t AttributePairStore::addPair(AttributePair& pair, bool isHot) { if (shard == 0) shard = 1; std::lock_guard lock(pairsMutex[shard]); - const auto& it = pairsMaps[shard].find(&pair); - if (it != pairsMaps[shard].end()) - return it->second; + const auto& index = pairs[shard].find(pair); + if (index != -1) + return (shard << (32 - SHARD_BITS)) + index; - uint32_t offset = pairs[shard].size(); + pair.ensureStringIsOwned(); + uint32_t offset = pairs[shard].add(pair); if (offset >= (1 << (32 - SHARD_BITS))) throw std::out_of_range("pair shard overflow"); - pair.ensureStringIsOwned(); - pairs[shard].push_back(pair); - const AttributePair* ptr = &pairs[shard][offset]; uint32_t rv = (shard << (32 - SHARD_BITS)) + offset; - - pairsMaps[shard][ptr] = rv; return rv; }; @@ -282,19 +277,11 @@ AttributeIndex AttributeStore::add(AttributeSet &attributes) { std::lock_guard lock(setsMutex[shard]); lookups++; - // Do we already have it? 
- const auto& existing = setsMaps[shard].find(&attributes); - if (existing != setsMaps[shard].end()) return existing->second; - - // No, so add and return the index - uint32_t offset = sets[shard].size(); + const uint32_t offset = sets[shard].add(attributes); if (offset >= (1 << (32 - SHARD_BITS))) throw std::out_of_range("set shard overflow"); - sets[shard].push_back(attributes); - const AttributeSet* ptr = &sets[shard][offset]; uint32_t rv = (shard << (32 - SHARD_BITS)) + offset; - setsMaps[shard][ptr] = rv; return rv; } @@ -335,7 +322,7 @@ void AttributeStore::reportSize() const { // Print detailed histogram of frequencies of attributes. if (false) { for (int i = 0; i < ATTRIBUTE_SHARDS; i++) { - std::cout << "pairsMaps[" << i << "] has " << pairStore.pairsMaps[i].size() << " entries" << std::endl; + std::cout << "pairs[" << i << "] has " << pairStore.pairs[i].size() << " entries" << std::endl; } std::map tagCountDist; @@ -391,8 +378,8 @@ void AttributeStore::reset() { // This is only used for tests. 
tlsKeys2Index.clear(); tlsKeys2IndexSize = 0; - tlsHotShardMap.clear(); - tlsHotShardSize = 0; + + tlsHotShard.clear(); } void AttributeStore::finalize() { diff --git a/test/attribute_store.test.cpp b/test/attribute_store.test.cpp index 3f2e28e5..db104a14 100644 --- a/test/attribute_store.test.cpp +++ b/test/attribute_store.test.cpp @@ -22,7 +22,6 @@ MU_TEST(test_attribute_store) { const auto s1Pairs = store.getUnsafe(s1Index); mu_check(s1Pairs.size() == 5); - const auto str1 = std::find_if(s1Pairs.begin(), s1Pairs.end(), [&store](auto ap) { return ap->keyIndex == store.keyStore.key2index("str1"); }); diff --git a/test/deque_map.test.cpp b/test/deque_map.test.cpp new file mode 100644 index 00000000..23a3d3cc --- /dev/null +++ b/test/deque_map.test.cpp @@ -0,0 +1,63 @@ +#include +#include +#include "external/minunit.h" +#include "deque_map.h" + +MU_TEST(test_deque_map) { + DequeMap strs; + + mu_check(strs.size() == 0); + mu_check(!strs.full()); + mu_check(strs.find("foo") == -1); + mu_check(strs.add("foo") == 0); + mu_check(!strs.full()); + mu_check(strs.find("foo") == 0); + mu_check(strs.size() == 1); + mu_check(strs.add("foo") == 0); + mu_check(strs.size() == 1); + mu_check(strs.add("bar") == 1); + mu_check(strs.size() == 2); + mu_check(strs.add("aardvark") == 2); + mu_check(strs.size() == 3); + mu_check(strs.add("foo") == 0); + mu_check(strs.add("bar") == 1); + mu_check(strs.add("quux") == 3); + mu_check(strs.size() == 4); + + mu_check(strs.at(0) == "foo"); + mu_check(strs.at(1) == "bar"); + mu_check(strs.at(2) == "aardvark"); + mu_check(strs.at(3) == "quux"); + + std::vector rv; + for (std::string x : strs) { + rv.push_back(x); + } + mu_check(rv[0] == "aardvark"); + mu_check(rv[1] == "bar"); + mu_check(rv[2] == "foo"); + mu_check(rv[3] == "quux"); + + DequeMap boundedMap(1); + mu_check(!boundedMap.full()); + mu_check(boundedMap.add("foo") == 0); + mu_check(boundedMap.add("foo") == 0); + mu_check(boundedMap.full()); + mu_check(boundedMap.add("bar") == -1); + 
boundedMap.clear(); + mu_check(!boundedMap.full()); + mu_check(boundedMap.find("foo") == -1); + mu_check(boundedMap.add("bar") == 0); + mu_check(boundedMap.add("bar") == 0); + mu_check(boundedMap.full()); +} + +MU_TEST_SUITE(test_suite_deque_map) { + MU_RUN_TEST(test_deque_map); +} + +int main() { + MU_RUN_SUITE(test_suite_deque_map); + MU_REPORT(); + return MU_EXIT_CODE; +} From f22cfdfe61e191d1b5185c8018fea74b7b5764cd Mon Sep 17 00:00:00 2001 From: Colin Dellow Date: Sat, 23 Dec 2023 17:54:58 -0500 Subject: [PATCH 40/49] capture s(this) Seems to save ~1.5 seconds on GB --- src/sorted_node_store.cpp | 82 ++++++++++++++++++++------------------- src/sorted_way_store.cpp | 53 +++++++++++++------------ 2 files changed, 71 insertions(+), 64 deletions(-) diff --git a/src/sorted_node_store.cpp b/src/sorted_node_store.cpp index 174664c3..82dccb55 100644 --- a/src/sorted_node_store.cpp +++ b/src/sorted_node_store.cpp @@ -173,22 +173,23 @@ LatpLon SortedNodeStore::at(const NodeID id) const { // Really naive caching strategy - just cache the last-used chunk. // Probably good enough? 
- if (s(this).cachedChunk != neededChunk) { - s(this).cachedChunk = neededChunk; - s(this).cacheChunkLons.reserve(256); - s(this).cacheChunkLatps.reserve(256); + ThreadStorage& tls = s(this); + if (tls.cachedChunk != neededChunk) { + tls.cachedChunk = neededChunk; + tls.cacheChunkLons.reserve(256); + tls.cacheChunkLatps.reserve(256); uint8_t* latpData = ptr->data; uint8_t* lonData = ptr->data + latpSize; uint32_t recovdata[256] = {0}; streamvbyte_decode(latpData, recovdata, n); - s(this).cacheChunkLatps[0] = ptr->firstLatp; - zigzag_delta_decode(recovdata, &s(this).cacheChunkLatps[1], n, s(this).cacheChunkLatps[0]); + tls.cacheChunkLatps[0] = ptr->firstLatp; + zigzag_delta_decode(recovdata, &tls.cacheChunkLatps[1], n, tls.cacheChunkLatps[0]); streamvbyte_decode(lonData, recovdata, n); - s(this).cacheChunkLons[0] = ptr->firstLon; - zigzag_delta_decode(recovdata, &s(this).cacheChunkLons[1], n, s(this).cacheChunkLons[0]); + tls.cacheChunkLons[0] = ptr->firstLon; + zigzag_delta_decode(recovdata, &tls.cacheChunkLons[1], n, tls.cacheChunkLons[0]); } size_t nodeOffset = 0; @@ -199,7 +200,7 @@ LatpLon SortedNodeStore::at(const NodeID id) const { if (!(ptr->nodeMask[nodeMaskByte] & (1 << nodeMaskBit))) throw std::out_of_range("SortedNodeStore: node " + std::to_string(id) + " missing, no node"); - return { s(this).cacheChunkLatps[nodeOffset], s(this).cacheChunkLons[nodeOffset] }; + return { tls.cacheChunkLatps[nodeOffset], tls.cacheChunkLons[nodeOffset] }; } UncompressedChunkInfo* ptr = (UncompressedChunkInfo*)basePtr; @@ -241,58 +242,60 @@ size_t SortedNodeStore::size() const { } void SortedNodeStore::insert(const std::vector& elements) { - if (s(this).localNodes == nullptr) { + ThreadStorage& tls = s(this); + if (tls.localNodes == nullptr) { std::lock_guard lock(orphanageMutex); if (workerBuffers.size() == 0) workerBuffers.reserve(256); else if (workerBuffers.size() == workerBuffers.capacity()) throw std::runtime_error("SortedNodeStore doesn't support more than 256 
cores"); workerBuffers.push_back(std::vector()); - s(this).localNodes = &workerBuffers.back(); + tls.localNodes = &workerBuffers.back(); } - if (s(this).groupStart == -1) { + if (tls.groupStart == -1) { // Mark where the first full group starts, so we know when to transition // out of collecting orphans. - s(this).groupStart = elements[0].first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); + tls.groupStart = elements[0].first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); } int i = 0; - while (s(this).collectingOrphans && i < elements.size()) { + while (tls.collectingOrphans && i < elements.size()) { const element_t& el = elements[i]; - if (el.first >= s(this).groupStart + (GroupSize * ChunkSize)) { - s(this).collectingOrphans = false; + if (el.first >= tls.groupStart + (GroupSize * ChunkSize)) { + tls.collectingOrphans = false; // Calculate new groupStart, rounding to previous boundary. - s(this).groupStart = el.first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); - collectOrphans(*s(this).localNodes); - s(this).localNodes->clear(); + tls.groupStart = el.first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); + collectOrphans(*tls.localNodes); + tls.localNodes->clear(); } - s(this).localNodes->push_back(el); + tls.localNodes->push_back(el); i++; } while(i < elements.size()) { const element_t& el = elements[i]; - if (el.first >= s(this).groupStart + (GroupSize * ChunkSize)) { - publishGroup(*s(this).localNodes); - s(this).localNodes->clear(); - s(this).groupStart = el.first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); + if (el.first >= tls.groupStart + (GroupSize * ChunkSize)) { + publishGroup(*tls.localNodes); + tls.localNodes->clear(); + tls.groupStart = el.first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); } - s(this).localNodes->push_back(el); + tls.localNodes->push_back(el); i++; } } void SortedNodeStore::batchStart() { - s(this).collectingOrphans = true; - s(this).groupStart = -1; - if (s(this).localNodes == nullptr || 
s(this).localNodes->size() == 0) + ThreadStorage& tls = s(this); + tls.collectingOrphans = true; + tls.groupStart = -1; + if (tls.localNodes == nullptr || tls.localNodes->size() == 0) return; - collectOrphans(*s(this).localNodes); - s(this).localNodes->clear(); + collectOrphans(*tls.localNodes); + tls.localNodes->clear(); } void SortedNodeStore::finalize(size_t threadNum) { @@ -467,22 +470,23 @@ void SortedNodeStore::publishGroup(const std::vector& nodes) { GroupInfo* groupInfo = nullptr; - if (s(this).arenaSpace < groupSpace) { + ThreadStorage& tls = s(this); + if (tls.arenaSpace < groupSpace) { // A full group takes ~330KB. Nodes are read _fast_, and there ends // up being contention calling the allocator when reading the // planet on a machine with 48 cores -- so allocate in large chunks. - s(this).arenaSpace = 4 * 1024 * 1024; - totalAllocatedSpace += s(this).arenaSpace; - s(this).arenaPtr = (char*)void_mmap_allocator::allocate(s(this).arenaSpace); - if (s(this).arenaPtr == nullptr) + tls.arenaSpace = 4 * 1024 * 1024; + totalAllocatedSpace += tls.arenaSpace; + tls.arenaPtr = (char*)void_mmap_allocator::allocate(tls.arenaSpace); + if (tls.arenaPtr == nullptr) throw std::runtime_error("SortedNodeStore: failed to allocate arena"); std::lock_guard lock(orphanageMutex); - allocatedMemory.push_back(std::make_pair((void*)s(this).arenaPtr, s(this).arenaSpace)); + allocatedMemory.push_back(std::make_pair((void*)tls.arenaPtr, tls.arenaSpace)); } - s(this).arenaSpace -= groupSpace; - groupInfo = (GroupInfo*)s(this).arenaPtr; - s(this).arenaPtr += groupSpace; + tls.arenaSpace -= groupSpace; + groupInfo = (GroupInfo*)tls.arenaPtr; + tls.arenaPtr += groupSpace; if (groups[groupIndex] != nullptr) throw std::runtime_error("SortedNodeStore: group already present"); diff --git a/src/sorted_way_store.cpp b/src/sorted_way_store.cpp index 27ae6ae2..450a4bcc 100644 --- a/src/sorted_way_store.cpp +++ b/src/sorted_way_store.cpp @@ -32,7 +32,7 @@ namespace SortedWayStoreTypes { 
thread_local std::deque> threadStorage; - ThreadStorage& s(const SortedWayStore* who) { + inline ThreadStorage& s(const SortedWayStore* who) { for (auto& entry : threadStorage) if (entry.first == who) return entry.second; @@ -214,46 +214,47 @@ void SortedWayStore::insertNodes(const std::vector lock(orphanageMutex); if (workerBuffers.size() == 0) workerBuffers.reserve(256); else if (workerBuffers.size() == workerBuffers.capacity()) throw std::runtime_error("SortedWayStore doesn't support more than 256 cores"); workerBuffers.push_back(std::vector>>()); - s(this).localWays = &workerBuffers.back(); + tls.localWays = &workerBuffers.back(); } - if (s(this).groupStart == -1) { + if (tls.groupStart == -1) { // Mark where the first full group starts, so we know when to transition // out of collecting orphans. - s(this).groupStart = newWays[0].first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); + tls.groupStart = newWays[0].first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); } int i = 0; - while (s(this).collectingOrphans && i < newWays.size()) { + while (tls.collectingOrphans && i < newWays.size()) { const auto& el = newWays[i]; - if (el.first >= s(this).groupStart + (GroupSize * ChunkSize)) { - s(this).collectingOrphans = false; + if (el.first >= tls.groupStart + (GroupSize * ChunkSize)) { + tls.collectingOrphans = false; // Calculate new groupStart, rounding to previous boundary. 
- s(this).groupStart = el.first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); - collectOrphans(*s(this).localWays); - s(this).localWays->clear(); + tls.groupStart = el.first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); + collectOrphans(*tls.localWays); + tls.localWays->clear(); } - s(this).localWays->push_back(el); + tls.localWays->push_back(el); i++; } while(i < newWays.size()) { const auto& el = newWays[i]; - if (el.first >= s(this).groupStart + (GroupSize * ChunkSize)) { - publishGroup(*s(this).localWays); - s(this).localWays->clear(); - s(this).groupStart = el.first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); + if (el.first >= tls.groupStart + (GroupSize * ChunkSize)) { + publishGroup(*tls.localWays); + tls.localWays->clear(); + tls.groupStart = el.first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); } - s(this).localWays->push_back(el); + tls.localWays->push_back(el); i++; } } @@ -297,13 +298,14 @@ void SortedWayStore::finalize(unsigned int threadNum) { } void SortedWayStore::batchStart() { - s(this).collectingOrphans = true; - s(this).groupStart = -1; - if (s(this).localWays == nullptr || s(this).localWays->size() == 0) + ThreadStorage& tls = s(this); + tls.collectingOrphans = true; + tls.groupStart = -1; + if (tls.localWays == nullptr || tls.localWays->size() == 0) return; - collectOrphans(*s(this).localWays); - s(this).localWays->clear(); + collectOrphans(*tls.localWays); + tls.localWays->clear(); } void SortedWayStore::collectOrphans(const std::vector>>& orphans) { @@ -476,6 +478,7 @@ void populateMask(uint8_t* mask, const std::vector& ids) { } void SortedWayStore::publishGroup(const std::vector>>& ways) { + ThreadStorage& tls = s(this); totalWays += ways.size(); if (ways.size() == 0) { throw std::runtime_error("SortedWayStore: group is empty"); @@ -519,12 +522,12 @@ void SortedWayStore::publishGroup(const std::vectorwayIds.push_back(id % ChunkSize); - uint16_t flags = encodeWay(way.second, s(this).encodedWay, compressWays && 
way.second.size() >= 4); + uint16_t flags = encodeWay(way.second, tls.encodedWay, compressWays && way.second.size() >= 4); lastChunk->wayFlags.push_back(flags); std::vector encoded; - encoded.resize(s(this).encodedWay.size()); - memcpy(encoded.data(), s(this).encodedWay.data(), s(this).encodedWay.size()); + encoded.resize(tls.encodedWay.size()); + memcpy(encoded.data(), tls.encodedWay.data(), tls.encodedWay.size()); lastChunk->encodedWays.push_back(std::move(encoded)); } From efd66bbfe82343621f314dcf0f39b6fb0dbb8d36 Mon Sep 17 00:00:00 2001 From: Colin Dellow Date: Sat, 23 Dec 2023 18:02:48 -0500 Subject: [PATCH 41/49] fix warning --- include/deque_map.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/deque_map.h b/include/deque_map.h index 2ec20387..283f8490 100644 --- a/include/deque_map.h +++ b/include/deque_map.h @@ -103,7 +103,7 @@ class DequeMap { struct iterator { const DequeMap& dm; - int offset; + size_t offset; iterator(const DequeMap& dm, int offset): dm(dm), offset(offset) {} void operator++() { offset++; } bool operator!=(iterator& other) { return offset != other.offset; } From db89f8bd3ceaca67c7c7e6cf8b84b63d82e49063 Mon Sep 17 00:00:00 2001 From: Colin Dellow Date: Sat, 23 Dec 2023 18:13:46 -0500 Subject: [PATCH 42/49] fix warning, really --- include/deque_map.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/deque_map.h b/include/deque_map.h index 283f8490..bcb4ddbc 100644 --- a/include/deque_map.h +++ b/include/deque_map.h @@ -104,7 +104,7 @@ class DequeMap { struct iterator { const DequeMap& dm; size_t offset; - iterator(const DequeMap& dm, int offset): dm(dm), offset(offset) {} + iterator(const DequeMap& dm, size_t offset): dm(dm), offset(offset) {} void operator++() { offset++; } bool operator!=(iterator& other) { return offset != other.offset; } const T& operator*() const { return dm.objects[dm.keys[offset]]; } From 60e5261bd15bef0a3eecf648e9219712e5508070 Mon Sep 17 00:00:00 2001 From: Colin 
Dellow Date: Sat, 23 Dec 2023 18:17:48 -0500 Subject: [PATCH 43/49] fewer shards Shard 1 (North America) is ~4.8GB of nodes, shard 4 (some of Europe) is 3.7GB. Even ignoring the memory savings in the recent commits, these could be merged. --- src/sharded_node_store.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/sharded_node_store.cpp b/src/sharded_node_store.cpp index 964d61fa..e9a6dc16 100644 --- a/src/sharded_node_store.cpp +++ b/src/sharded_node_store.cpp @@ -61,12 +61,12 @@ size_t pickStore(const LatpLon& el) { const size_t z3x = z4x / 2; const size_t z3y = z4y / 2; - if (z3x == 5 && z3y == 2) return 6; // Western Russia - if (z3x == 4 && z3y == 3) return 6; // North Africa - if (z3x == 5 && z3y == 3) return 6; // India + if (z3x == 5 && z3y == 2) return 5; // Western Russia + if (z3x == 4 && z3y == 3) return 5; // North Africa + if (z3x == 5 && z3y == 3) return 5; // India - if ((z5x == 16 && z5y == 10) || (z5x == 16 && z5y == 11)) return 5; // some of Central Europe - if ((z5x == 17 && z5y == 10) || (z5x == 17 && z5y == 11)) return 4; // some more of Central Europe + if ((z5x == 16 && z5y == 10) || (z5x == 16 && z5y == 11)) return 4; // some of Central Europe + if ((z5x == 17 && z5y == 10) || (z5x == 17 && z5y == 11)) return 1; // some more of Central Europe if (z3x == 4 && z3y == 2) return 3; // rest of Central Europe From 09abd3adb1988b71c204082c3567f410eca3efae Mon Sep 17 00:00:00 2001 From: Colin Dellow Date: Sun, 24 Dec 2023 12:12:00 -0500 Subject: [PATCH 44/49] extract option parsing to own file We'd like to have different defaults based on whether `--store` is present. Now that option parsing will have some more complex logic, let's pull it into its own class so it can be more easily tested. 
--- CMakeLists.txt | 1 + Makefile | 6 +- include/helpers.h | 3 +- include/options_parser.h | 54 ++++++++++++ include/shared_data.h | 7 +- src/helpers.cpp | 3 +- src/options_parser.cpp | 100 ++++++++++++++++++++++ src/shared_data.cpp | 2 +- src/tile_worker.cpp | 4 +- src/tilemaker.cpp | 160 +++++++++++++---------------------- test/options_parser.test.cpp | 66 +++++++++++++++ 11 files changed, 294 insertions(+), 112 deletions(-) create mode 100644 include/options_parser.h create mode 100644 src/options_parser.cpp create mode 100644 test/options_parser.test.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 8819b5e2..80a76e7e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -97,6 +97,7 @@ file(GLOB tilemaker_src_files src/mbtiles.cpp src/mmap_allocator.cpp src/node_stores.cpp + src/options_parser.cpp src/osm_lua_processing.cpp src/osm_mem_tiles.cpp src/osm_store.cpp diff --git a/Makefile b/Makefile index d0305cce..d44245ae 100644 --- a/Makefile +++ b/Makefile @@ -106,6 +106,7 @@ tilemaker: \ src/mbtiles.o \ src/mmap_allocator.o \ src/node_stores.o \ + src/options_parser.o \ src/osm_lua_processing.o \ src/osm_mem_tiles.o \ src/osm_store.o \ @@ -152,6 +153,10 @@ test_deque_map: \ test/deque_map.test.o $(CXX) $(CXXFLAGS) -o test.deque_map $^ $(INC) $(LIB) $(LDFLAGS) && ./test.deque_map +test_options_parser: \ + src/options_parser.o \ + test/options_parser.test.o + $(CXX) $(CXXFLAGS) -o test.options_parser $^ $(INC) $(LIB) $(LDFLAGS) && ./test.options_parser test_pooled_string: \ src/mmap_allocator.o \ @@ -168,7 +173,6 @@ test_sorted_node_store: \ test/sorted_node_store.test.o $(CXX) $(CXXFLAGS) -o test.sorted_node_store $^ $(INC) $(LIB) $(LDFLAGS) && ./test.sorted_node_store - test_sorted_way_store: \ src/external/streamvbyte_decode.o \ src/external/streamvbyte_encode.o \ diff --git a/include/helpers.h b/include/helpers.h index 7cb9c027..029a801d 100644 --- a/include/helpers.h +++ b/include/helpers.h @@ -3,7 +3,8 @@ #define _HELPERS_H #include -#include "geom.h" 
+#include +#include // General helper routines diff --git a/include/options_parser.h b/include/options_parser.h new file mode 100644 index 00000000..12b416dd --- /dev/null +++ b/include/options_parser.h @@ -0,0 +1,54 @@ +#ifndef OPTIONS_PARSER_H +#define OPTIONS_PARSER_H + +#include +#include +#include + +namespace OptionsParser { + struct OptionException : std::exception { + OptionException(std::string message): message(message) {} + + /// Returns the explanatory string. + const char* what() const noexcept override { + return message.data(); + } + + private: + std::string message; + }; + + enum class OutputMode: char { File = 0, MBTiles = 1, PMTiles = 2 }; + + struct OsmOptions { + std::string storeFile; + bool compact = false; + bool skipIntegrity = false; + bool uncompressedNodes = false; + bool uncompressedWays = false; + bool materializeGeometries = false; + bool shardStores = false; + }; + + struct Options { + std::vector inputFiles; + std::string luaFile; + std::string jsonFile; + uint threadNum = 0; + std::string outputFile; + std::string bbox; + + OsmOptions osm; + bool showHelp = false; + bool verbose = false; + bool mergeSqlite = false; + bool mapsplit = false; + OutputMode outputMode = OutputMode::File; + bool logTileTimings = false; + }; + + Options parse(const int argc, const char* argv[]); + void showHelp(); +}; + +#endif diff --git a/include/shared_data.h b/include/shared_data.h index 23ba9a06..45c6e34b 100644 --- a/include/shared_data.h +++ b/include/shared_data.h @@ -7,6 +7,7 @@ #include "rapidjson/document.h" +#include "options_parser.h" #include "osm_store.h" #include "output_object.h" #include "mbtiles.h" @@ -61,10 +62,6 @@ class LayerDefinition { std::string serialiseToJSON() const; }; -const int OUTPUT_FILE = 0; -const int OUTPUT_MBTILES = 1; -const int OUTPUT_PMTILES = 2; - ///\brief Config read from JSON to control behavior of program class Config { @@ -91,7 +88,7 @@ class SharedData { public: const class LayerDefinition &layers; - int 
outputMode; + OptionsParser::OutputMode outputMode; bool mergeSqlite; MBTiles mbtiles; PMTiles pmtiles; diff --git a/src/helpers.cpp b/src/helpers.cpp index 444ddcf0..4af04612 100644 --- a/src/helpers.cpp +++ b/src/helpers.cpp @@ -4,6 +4,8 @@ #include #include #include +#include +#include #include "helpers.h" @@ -11,7 +13,6 @@ #define MOD_GZIP_ZLIB_CFACTOR 9 #define MOD_GZIP_ZLIB_BSIZE 8096 -namespace geom = boost::geometry; using namespace std; // Bounding box string parsing diff --git a/src/options_parser.cpp b/src/options_parser.cpp new file mode 100644 index 00000000..f49c5129 --- /dev/null +++ b/src/options_parser.cpp @@ -0,0 +1,100 @@ +#include "options_parser.h" + +#include +#include +#include +#include +#include "helpers.h" + +#ifndef TM_VERSION +#define TM_VERSION (version not set) +#endif +#define STR1(x) #x +#define STR(x) STR1(x) + +using namespace std; +namespace po = boost::program_options; + +po::options_description getParser(OptionsParser::Options& options) { + po::options_description desc("tilemaker " STR(TM_VERSION) "\nConvert OpenStreetMap .pbf files into vector tiles\n\nAvailable options"); + desc.add_options() + ("help", "show help message") + ("input", po::value< vector >(&options.inputFiles), "source .osm.pbf file") + ("output", po::value< string >(&options.outputFile), "target directory or .mbtiles/.pmtiles file") + ("bbox", po::value< string >(&options.bbox), "bounding box to use if input file does not have a bbox header set, example: minlon,minlat,maxlon,maxlat") + ("merge" ,po::bool_switch(&options.mergeSqlite), "merge with existing .mbtiles (overwrites otherwise)") + ("config", po::value< string >(&options.jsonFile)->default_value("config.json"), "config JSON file") + ("process",po::value< string >(&options.luaFile)->default_value("process.lua"), "tag-processing Lua file") + ("store", po::value< string >(&options.osm.storeFile), "temporary storage for node/ways/relations data") + ("compact",po::bool_switch(&options.osm.compact), "Reduce 
overall memory usage (compact mode).\nNOTE: This requires the input to be renumbered (osmium renumber)") + ("no-compress-nodes", po::bool_switch(&options.osm.uncompressedNodes), "Store nodes uncompressed") + ("no-compress-ways", po::bool_switch(&options.osm.uncompressedWays), "Store ways uncompressed") + ("materialize-geometries", po::bool_switch(&options.osm.materializeGeometries), "Materialize geometries - faster, but requires more memory") + ("shard-stores", po::bool_switch(&options.osm.shardStores), "Shard stores - use an alternate reading/writing strategy for low-memory machines") + ("verbose",po::bool_switch(&options.verbose), "verbose error output") + ("skip-integrity",po::bool_switch(&options.osm.skipIntegrity), "don't enforce way/node integrity") + ("log-tile-timings", po::bool_switch(&options.logTileTimings), "log how long each tile takes") + ("threads",po::value< uint >(&options.threadNum)->default_value(0), "number of threads (automatically detected if 0)"); + po::options_description performance("Performance options"); + performance.add_options() + ("help-module", po::value(), + "produce a help for a given module") + ("version", "output the version number") + ; + + desc.add(performance); + return desc; +} + +void OptionsParser::showHelp() { + Options options; + auto parser = getParser(options); + std::cout << parser << std::endl; +} + +OptionsParser::Options OptionsParser::parse(const int argc, const char* argv[]) { + Options options; + po::options_description desc = getParser(options); + po::positional_options_description p; + p.add("input", 1).add("output", 1); + + po::variables_map vm; + try { + po::store(po::command_line_parser(argc, argv).options(desc).positional(p).run(), vm); + } catch (const po::unknown_option& ex) { + throw OptionException{"Unknown option: " + ex.get_option_name()}; + } + po::notify(vm); + + if (vm.count("help")) { + options.showHelp = true; + return options; + } + if (vm.count("output") == 0) { + throw OptionException{ "You 
must specify an output file or directory. Run with --help to find out more." }; + } + + if (vm.count("input") == 0) { + throw OptionException{ "No source .osm.pbf file supplied" }; + } + + if (ends_with(options.outputFile, ".mbtiles") || ends_with(options.outputFile, ".sqlite")) { + options.outputMode = OutputMode::MBTiles; + } else if (ends_with(options.outputFile, ".pmtiles")) { + options.outputMode = OutputMode::PMTiles; + } + + if (options.threadNum == 0) { + options.threadNum = max(thread::hardware_concurrency(), 1u); + } + + // ---- Check config + if (!boost::filesystem::exists(options.jsonFile)) { + throw OptionException{ "Couldn't open .json config: " + options.jsonFile }; + } + if (!boost::filesystem::exists(options.luaFile)) { + throw OptionException{"Couldn't open .lua script: " + options.luaFile }; + } + + return options; +} diff --git a/src/shared_data.cpp b/src/shared_data.cpp index 78cfe11d..da9787d8 100644 --- a/src/shared_data.cpp +++ b/src/shared_data.cpp @@ -10,7 +10,7 @@ using namespace rapidjson; SharedData::SharedData(Config &configIn, const class LayerDefinition &layers) : layers(layers), config(configIn) { - outputMode=OUTPUT_FILE; + outputMode=OptionsParser::OutputMode::File; mergeSqlite=false; } diff --git a/src/tile_worker.cpp b/src/tile_worker.cpp index cbdd33e3..7951fcaf 100644 --- a/src/tile_worker.cpp +++ b/src/tile_worker.cpp @@ -378,13 +378,13 @@ void outputProc( // Write to file or sqlite string outputdata, compressed; - if (sharedData.outputMode == OUTPUT_MBTILES) { + if (sharedData.outputMode == OptionsParser::OutputMode::MBTiles) { // Write to sqlite tile.SerializeToString(&outputdata); if (sharedData.config.compress) { compressed = compress_string(outputdata, Z_DEFAULT_COMPRESSION, sharedData.config.gzip); } sharedData.mbtiles.saveTile(zoom, bbox.index.x, bbox.index.y, sharedData.config.compress ? 
&compressed : &outputdata, sharedData.mergeSqlite); - } else if (sharedData.outputMode == OUTPUT_PMTILES) { + } else if (sharedData.outputMode == OptionsParser::OutputMode::PMTiles) { // Write to pmtiles tile.SerializeToString(&outputdata); sharedData.pmtiles.saveTile(zoom, bbox.index.x, bbox.index.y, outputdata); diff --git a/src/tilemaker.cpp b/src/tilemaker.cpp index 7825f65e..1c821001 100644 --- a/src/tilemaker.cpp +++ b/src/tilemaker.cpp @@ -48,6 +48,7 @@ #include "osm_lua_processing.h" #include "mbtiles.h" +#include "options_parser.h" #include "shared_data.h" #include "read_pbf.h" #include "read_shp.h" @@ -80,89 +81,46 @@ bool verbose = false; * * Worker threads write the output tiles, and start in the outputProc function. */ -int main(int argc, char* argv[]) { +int main(const int argc, const char* argv[]) { // ---- Read command-line options - vector inputFiles; - string luaFile; - string osmStoreFile; - string jsonFile; - uint threadNum; - string outputFile; - string bbox; - bool _verbose = false, mergeSqlite = false, mapsplit = false, osmStoreCompact = false, skipIntegrity = false, osmStoreUncompressedNodes = false, osmStoreUncompressedWays = false, materializeGeometries = false, shardStores = false; - int outputMode = OUTPUT_FILE; - bool logTileTimings = false; - - po::options_description desc("tilemaker " STR(TM_VERSION) "\nConvert OpenStreetMap .pbf files into vector tiles\n\nAvailable options"); - desc.add_options() - ("help", "show help message") - ("input", po::value< vector >(&inputFiles), "source .osm.pbf file") - ("output", po::value< string >(&outputFile), "target directory or .mbtiles/.pmtiles file") - ("bbox", po::value< string >(&bbox), "bounding box to use if input file does not have a bbox header set, example: minlon,minlat,maxlon,maxlat") - ("merge" ,po::bool_switch(&mergeSqlite), "merge with existing .mbtiles (overwrites otherwise)") - ("config", po::value< string >(&jsonFile)->default_value("config.json"), "config JSON file") - 
("process",po::value< string >(&luaFile)->default_value("process.lua"), "tag-processing Lua file") - ("store", po::value< string >(&osmStoreFile), "temporary storage for node/ways/relations data") - ("compact",po::bool_switch(&osmStoreCompact), "Reduce overall memory usage (compact mode).\nNOTE: This requires the input to be renumbered (osmium renumber)") - ("no-compress-nodes", po::bool_switch(&osmStoreUncompressedNodes), "Store nodes uncompressed") - ("no-compress-ways", po::bool_switch(&osmStoreUncompressedWays), "Store ways uncompressed") - ("materialize-geometries", po::bool_switch(&materializeGeometries), "Materialize geometries - faster, but requires more memory") - ("shard-stores", po::bool_switch(&shardStores), "Shard stores - use an alternate reading/writing strategy for low-memory machines") - ("verbose",po::bool_switch(&_verbose), "verbose error output") - ("skip-integrity",po::bool_switch(&skipIntegrity), "don't enforce way/node integrity") - ("log-tile-timings", po::bool_switch(&logTileTimings), "log how long each tile takes") - ("threads",po::value< uint >(&threadNum)->default_value(0), "number of threads (automatically detected if 0)"); - po::positional_options_description p; - p.add("input", 1).add("output", 1); - po::variables_map vm; + OptionsParser::Options options; try { - po::store(po::command_line_parser(argc, argv).options(desc).positional(p).run(), vm); - } catch (const po::unknown_option& ex) { - cerr << "Unknown option: " << ex.get_option_name() << endl; - return -1; + options = OptionsParser::parse(argc, argv); + } catch (OptionsParser::OptionException& e) { + cerr << e.what() << endl; + return 1; } - po::notify(vm); - - if (vm.count("help")) { cout << desc << endl; return 0; } - if (vm.count("output")==0) { cerr << "You must specify an output file or directory. Run with --help to find out more." 
<< endl; return -1; } - if (vm.count("input")==0) { cout << "No source .osm.pbf file supplied" << endl; } - - vector bboxElements = parseBox(bbox); - if (ends_with(outputFile, ".mbtiles") || ends_with(outputFile, ".sqlite")) { outputMode = OUTPUT_MBTILES; } - else if (ends_with(outputFile, ".pmtiles")) { outputMode = OUTPUT_PMTILES; } - if (threadNum == 0) { threadNum = max(thread::hardware_concurrency(), 1u); } - verbose = _verbose; + if (options.showHelp) { OptionsParser::showHelp(); return 0; } + verbose = options.verbose; - // ---- Check config - - if (!boost::filesystem::exists(jsonFile)) { cerr << "Couldn't open .json config: " << jsonFile << endl; return -1; } - if (!boost::filesystem::exists(luaFile )) { cerr << "Couldn't open .lua script: " << luaFile << endl; return -1; } + vector bboxElements = parseBox(options.bbox); // ---- Remove existing .mbtiles if it exists - - if ((outputMode==OUTPUT_MBTILES || outputMode==OUTPUT_PMTILES) && !mergeSqlite && static_cast(std::ifstream(outputFile))) { + if ((options.outputMode == OptionsParser::OutputMode::MBTiles || options.outputMode == OptionsParser::OutputMode::PMTiles) && !options.mergeSqlite && static_cast(std::ifstream(options.outputFile))) { cout << "Output file exists, will overwrite (Ctrl-C to abort"; - if (outputMode==OUTPUT_MBTILES) cout << ", rerun with --merge to keep"; + if (options.outputMode == OptionsParser::OutputMode::MBTiles) cout << ", rerun with --merge to keep"; cout << ")" << endl; std::this_thread::sleep_for(std::chrono::milliseconds(2000)); - if (remove(outputFile.c_str()) != 0) { + if (remove(options.outputFile.c_str()) != 0) { cerr << "Couldn't remove existing file" << endl; return 0; } - } else if (mergeSqlite && outputMode!=OUTPUT_MBTILES) { + } else if (options.mergeSqlite && options.outputMode != OptionsParser::OutputMode::MBTiles) { cerr << "--merge only works with .mbtiles" << endl; return 0; - } else if (mergeSqlite && !static_cast(std::ifstream(outputFile))) { + } else if 
(options.mergeSqlite && !static_cast(std::ifstream(options.outputFile))) { cout << "--merge specified but .mbtiles file doesn't already exist, ignoring" << endl; - mergeSqlite = false; + options.mergeSqlite = false; } + // ---- Read bounding box from first .pbf (if there is one) or mapsplit file bool hasClippingBox = false; Box clippingBox; + bool mapsplit = false; MBTiles mapsplitFile; double minLon=0.0, maxLon=0.0, minLat=0.0, maxLat=0.0; if (!bboxElements.empty()) { @@ -172,14 +130,14 @@ int main(int argc, char* argv[]) { maxLon = bboxElementFromStr(bboxElements.at(2)); maxLat = bboxElementFromStr(bboxElements.at(3)); - } else if (inputFiles.size()==1 && (ends_with(inputFiles[0], ".mbtiles") || ends_with(inputFiles[0], ".sqlite") || ends_with(inputFiles[0], ".msf"))) { + } else if (options.inputFiles.size()==1 && (ends_with(options.inputFiles[0], ".mbtiles") || ends_with(options.inputFiles[0], ".sqlite") || ends_with(options.inputFiles[0], ".msf"))) { mapsplit = true; - mapsplitFile.openForReading(inputFiles[0]); + mapsplitFile.openForReading(options.inputFiles[0]); mapsplitFile.readBoundingBox(minLon, maxLon, minLat, maxLat); hasClippingBox = true; - } else if (inputFiles.size()>0) { - int ret = ReadPbfBoundingBox(inputFiles[0], minLon, maxLon, minLat, maxLat, hasClippingBox); + } else if (options.inputFiles.size()>0) { + int ret = ReadPbfBoundingBox(options.inputFiles[0], minLon, maxLon, minLat, maxLat, hasClippingBox); if(ret != 0) return ret; } @@ -193,7 +151,7 @@ int main(int argc, char* argv[]) { rapidjson::Document jsonConfig; class Config config; try { - FILE* fp = fopen(jsonFile.c_str(), "r"); + FILE* fp = fopen(options.jsonFile.c_str(), "r"); char readBuffer[65536]; rapidjson::FileReadStream is(fp, readBuffer, sizeof(readBuffer)); jsonConfig.ParseStream(is); @@ -214,21 +172,21 @@ int main(int argc, char* argv[]) { bool allPbfsHaveSortTypeThenID = true; bool anyPbfHasLocationsOnWays = false; - for (const std::string& file: inputFiles) { + for (const 
std::string& file: options.inputFiles) { if (ends_with(file, ".pbf")) { allPbfsHaveSortTypeThenID = allPbfsHaveSortTypeThenID && PbfHasOptionalFeature(file, OptionSortTypeThenID); anyPbfHasLocationsOnWays = anyPbfHasLocationsOnWays || PbfHasOptionalFeature(file, OptionLocationsOnWays); } } - auto createNodeStore = [allPbfsHaveSortTypeThenID, osmStoreCompact, osmStoreUncompressedNodes]() { - if (osmStoreCompact) { + auto createNodeStore = [allPbfsHaveSortTypeThenID, options]() { + if (options.osm.compact) { std::shared_ptr rv = make_shared(); return rv; } if (allPbfsHaveSortTypeThenID) { - std::shared_ptr rv = make_shared(!osmStoreUncompressedNodes); + std::shared_ptr rv = make_shared(!options.osm.uncompressedNodes); return rv; } std::shared_ptr rv = make_shared(); @@ -237,15 +195,15 @@ int main(int argc, char* argv[]) { shared_ptr nodeStore; - if (shardStores) { + if (options.osm.shardStores) { nodeStore = std::make_shared(createNodeStore); } else { nodeStore = createNodeStore(); } - auto createWayStore = [anyPbfHasLocationsOnWays, allPbfsHaveSortTypeThenID, osmStoreUncompressedWays, &nodeStore]() { + auto createWayStore = [anyPbfHasLocationsOnWays, allPbfsHaveSortTypeThenID, options, &nodeStore]() { if (!anyPbfHasLocationsOnWays && allPbfsHaveSortTypeThenID) { - std::shared_ptr rv = make_shared(!osmStoreUncompressedWays, *nodeStore.get()); + std::shared_ptr rv = make_shared(!options.osm.uncompressedWays, *nodeStore.get()); return rv; } @@ -254,30 +212,30 @@ int main(int argc, char* argv[]) { }; shared_ptr wayStore; - if (shardStores) { + if (options.osm.shardStores) { wayStore = std::make_shared(createWayStore, *nodeStore.get()); } else { wayStore = createWayStore(); } OSMStore osmStore(*nodeStore.get(), *wayStore.get()); - osmStore.use_compact_store(osmStoreCompact); - osmStore.enforce_integrity(!skipIntegrity); - if(!osmStoreFile.empty()) { - std::cout << "Using osm store file: " << osmStoreFile << std::endl; - osmStore.open(osmStoreFile); + 
osmStore.use_compact_store(options.osm.compact); + osmStore.enforce_integrity(!options.osm.skipIntegrity); + if(!options.osm.storeFile.empty()) { + std::cout << "Using osm store file: " << options.osm.storeFile << std::endl; + osmStore.open(options.osm.storeFile); } AttributeStore attributeStore; class LayerDefinition layers(config.layers); - class OsmMemTiles osmMemTiles(threadNum, config.baseZoom, config.includeID, *nodeStore, *wayStore); - class ShpMemTiles shpMemTiles(threadNum, config.baseZoom); + class OsmMemTiles osmMemTiles(options.threadNum, config.baseZoom, config.includeID, *nodeStore, *wayStore); + class ShpMemTiles shpMemTiles(options.threadNum, config.baseZoom); osmMemTiles.open(); shpMemTiles.open(); - OsmLuaProcessing osmLuaProcessing(osmStore, config, layers, luaFile, - shpMemTiles, osmMemTiles, attributeStore, materializeGeometries); + OsmLuaProcessing osmLuaProcessing(osmStore, config, layers, options.luaFile, + shpMemTiles, osmMemTiles, attributeStore, options.osm.materializeGeometries); // ---- Load external shp files @@ -295,7 +253,7 @@ int main(int argc, char* argv[]) { readShapefile(clippingBox, layers, config.baseZoom, layerNum, - threadNum, + options.threadNum, shpMemTiles, osmLuaProcessing); } } @@ -312,7 +270,7 @@ int main(int argc, char* argv[]) { std::vector sortOrders = layers.getSortOrders(); if (!mapsplit) { - for (auto inputFile : inputFiles) { + for (auto inputFile : options.inputFiles) { cout << "Reading .pbf " << inputFile << endl; ifstream infile(inputFile, ios::in | ios::binary); if (!infile) { cerr << "Couldn't open .pbf file " << inputFile << endl; return -1; } @@ -322,13 +280,13 @@ int main(int argc, char* argv[]) { nodeStore->shards(), hasSortTypeThenID, nodeKeys, - threadNum, + options.threadNum, [&]() { thread_local std::shared_ptr pbfStream(new ifstream(inputFile, ios::in | ios::binary)); return pbfStream; }, [&]() { - thread_local std::shared_ptr osmLuaProcessing(new OsmLuaProcessing(osmStore, config, layers, luaFile, 
shpMemTiles, osmMemTiles, attributeStore, materializeGeometries)); + thread_local std::shared_ptr osmLuaProcessing(new OsmLuaProcessing(osmStore, config, layers, options.luaFile, shpMemTiles, osmMemTiles, attributeStore, options.osm.materializeGeometries)); return osmLuaProcessing; }, *nodeStore, @@ -343,16 +301,16 @@ int main(int argc, char* argv[]) { // ---- Initialise SharedData SourceList sources = {&osmMemTiles, &shpMemTiles}; class SharedData sharedData(config, layers); - sharedData.outputFile = outputFile; - sharedData.outputMode = outputMode; - sharedData.mergeSqlite = mergeSqlite; + sharedData.outputFile = options.outputFile; + sharedData.outputMode = options.outputMode; + sharedData.mergeSqlite = options.mergeSqlite; // ---- Initialise mbtiles/pmtiles if required - if (sharedData.outputMode==OUTPUT_MBTILES) { + if (sharedData.outputMode == OptionsParser::OutputMode::MBTiles) { sharedData.mbtiles.openForWriting(sharedData.outputFile); sharedData.writeMBTilesProjectData(); - } else if (sharedData.outputMode==OUTPUT_PMTILES) { + } else if (sharedData.outputMode == OptionsParser::OutputMode::PMTiles) { sharedData.pmtiles.open(sharedData.outputFile); } @@ -394,7 +352,7 @@ int main(int argc, char* argv[]) { return make_unique(pbf.data(), pbf.size(), ios::in | ios::binary); }, [&]() { - return std::make_unique(osmStore, config, layers, luaFile, shpMemTiles, osmMemTiles, attributeStore, materializeGeometries); + return std::make_unique(osmStore, config, layers, options.luaFile, shpMemTiles, osmMemTiles, attributeStore, options.osm.materializeGeometries); }, *nodeStore, *wayStore @@ -405,7 +363,7 @@ int main(int argc, char* argv[]) { } // Launch the pool with threadNum threads - boost::asio::thread_pool pool(threadNum); + boost::asio::thread_pool pool(options.threadNum); // Mutex is hold when IO is performed std::mutex io_mutex; @@ -414,7 +372,7 @@ int main(int argc, char* argv[]) { std::atomic tilesWritten(0); for (auto source : sources) { - 
source->finalize(threadNum); + source->finalize(options.threadNum); } // tiles by zoom level @@ -435,7 +393,7 @@ int main(int argc, char* argv[]) { } // For large areas (arbitrarily defined as 100 z6 tiles), use a dense index for pmtiles - if (coveredZ6Tiles.size()>100 && outputMode==OUTPUT_PMTILES) { + if (coveredZ6Tiles.size()>100 && options.outputMode == OptionsParser::OutputMode::PMTiles) { std::cout << "Using dense index for .pmtiles" << std::endl; sharedData.pmtiles.isSparse = false; } @@ -561,7 +519,7 @@ int main(int argc, char* argv[]) { return false; }, - threadNum); + options.threadNum); std::size_t batchSize = 0; for(std::size_t startIndex = 0; startIndex < tileCoordinates.size(); startIndex += batchSize) { @@ -592,7 +550,7 @@ int main(int argc, char* argv[]) { #ifdef CLOCK_MONOTONIC timespec start, end; - if (logTileTimings) + if (options.logTileTimings) clock_gettime(CLOCK_MONOTONIC, &start); #endif @@ -603,7 +561,7 @@ int main(int argc, char* argv[]) { outputProc(sharedData, sources, attributeStore, data, coords, zoom); #ifdef CLOCK_MONOTONIC - if (logTileTimings) { + if (options.logTileTimings) { clock_gettime(CLOCK_MONOTONIC, &end); uint64_t tileNs = 1e9 * (end.tv_sec - start.tv_sec) + end.tv_nsec - start.tv_nsec; std::string output = "z" + std::to_string(zoom) + "/" + std::to_string(coords.x) + "/" + std::to_string(coords.y) + " took " + std::to_string(tileNs/1e6) + " ms"; @@ -612,7 +570,7 @@ int main(int argc, char* argv[]) { #endif } - if (logTileTimings) { + if (options.logTileTimings) { const std::lock_guard lock(io_mutex); std::cout << std::endl; for (const auto& output : tileTimings) @@ -642,10 +600,10 @@ int main(int argc, char* argv[]) { // ---- Close tileset - if (outputMode==OUTPUT_MBTILES) { + if (options.outputMode == OptionsParser::OutputMode::MBTiles) { sharedData.writeMBTilesMetadata(jsonConfig); sharedData.mbtiles.closeForWriting(); - } else if (outputMode==OUTPUT_PMTILES) { + } else if (options.outputMode == 
OptionsParser::OutputMode::PMTiles) { sharedData.writePMTilesBounds(); std::string metadata = sharedData.pmTilesMetadata(); sharedData.pmtiles.close(metadata); diff --git a/test/options_parser.test.cpp b/test/options_parser.test.cpp new file mode 100644 index 00000000..77b4874d --- /dev/null +++ b/test/options_parser.test.cpp @@ -0,0 +1,66 @@ +#include +#include "external/minunit.h" +#include "options_parser.h" + +const char* PROGRAM_NAME = "./tilemaker"; +using namespace OptionsParser; + +Options parse(std::vector& args) { + const char* argv[100]; + + argv[0] = PROGRAM_NAME; + for(int i = 0; i < args.size(); i++) + argv[1 + i] = args[i].data(); + + return parse(1 + args.size(), argv); +} + +#define ASSERT_THROWS(MESSAGE, ...) \ +{ \ + std::vector args = { __VA_ARGS__ }; \ + bool threw = false; \ + try { \ + auto opts = parse(args); \ + } catch(OptionsParser::OptionException& e) { \ + threw = std::string(e.what()).find(MESSAGE) != std::string::npos; \ + } \ + if (!threw) mu_check((std::string("expected exception with ") + MESSAGE).empty()); \ +} + +MU_TEST(test_options_parser) { + // No args is invalid. + ASSERT_THROWS("You must specify an output file"); + + // Output without input is invalid + ASSERT_THROWS("No source .osm.pbf", "--output", "foo.mbtiles"); + + // You can ask for --help. 
+ { + std::vector args = {"--help"}; + auto opts = parse(args); + mu_check(opts.showHelp); + } + + // Minimal valid is output and input + { + std::vector args = {"--output", "foo.mbtiles", "--input", "ontario.pbf"}; + auto opts = parse(args); + mu_check(opts.inputFiles.size() == 1); + mu_check(opts.inputFiles[0] == "ontario.pbf"); + mu_check(opts.outputFile == "foo.mbtiles"); + mu_check(opts.outputMode == OutputMode::MBTiles); + } + + ASSERT_THROWS("Couldn't open .json config", "--input", "foo", "--output", "bar", "--config", "nonexistent-config.json"); + ASSERT_THROWS("Couldn't open .lua script", "--input", "foo", "--output", "bar", "--process", "nonexistent-script.lua"); +} + +MU_TEST_SUITE(test_suite_options_parser) { + MU_RUN_TEST(test_options_parser); +} + +int main() { + MU_RUN_SUITE(test_suite_options_parser); + MU_REPORT(); + return MU_EXIT_CODE; +} From 48305a4981e86b666b9da9ca4f60977400fa7559 Mon Sep 17 00:00:00 2001 From: Colin Dellow Date: Sun, 24 Dec 2023 13:00:24 -0500 Subject: [PATCH 45/49] use sensible defaults based on presence of --store --- include/options_parser.h | 1 + src/options_parser.cpp | 31 ++++++++++++++++++++----------- test/options_parser.test.cpp | 28 ++++++++++++++++++++++++++++ 3 files changed, 49 insertions(+), 11 deletions(-) diff --git a/include/options_parser.h b/include/options_parser.h index 12b416dd..d9441aef 100644 --- a/include/options_parser.h +++ b/include/options_parser.h @@ -22,6 +22,7 @@ namespace OptionsParser { struct OsmOptions { std::string storeFile; + bool fast = false; bool compact = false; bool skipIntegrity = false; bool uncompressedNodes = false; diff --git a/src/options_parser.cpp b/src/options_parser.cpp index f49c5129..274fd848 100644 --- a/src/options_parser.cpp +++ b/src/options_parser.cpp @@ -25,21 +25,19 @@ po::options_description getParser(OptionsParser::Options& options) { ("merge" ,po::bool_switch(&options.mergeSqlite), "merge with existing .mbtiles (overwrites otherwise)") ("config", po::value< 
string >(&options.jsonFile)->default_value("config.json"), "config JSON file") ("process",po::value< string >(&options.luaFile)->default_value("process.lua"), "tag-processing Lua file") - ("store", po::value< string >(&options.osm.storeFile), "temporary storage for node/ways/relations data") - ("compact",po::bool_switch(&options.osm.compact), "Reduce overall memory usage (compact mode).\nNOTE: This requires the input to be renumbered (osmium renumber)") - ("no-compress-nodes", po::bool_switch(&options.osm.uncompressedNodes), "Store nodes uncompressed") - ("no-compress-ways", po::bool_switch(&options.osm.uncompressedWays), "Store ways uncompressed") - ("materialize-geometries", po::bool_switch(&options.osm.materializeGeometries), "Materialize geometries - faster, but requires more memory") - ("shard-stores", po::bool_switch(&options.osm.shardStores), "Shard stores - use an alternate reading/writing strategy for low-memory machines") ("verbose",po::bool_switch(&options.verbose), "verbose error output") ("skip-integrity",po::bool_switch(&options.osm.skipIntegrity), "don't enforce way/node integrity") - ("log-tile-timings", po::bool_switch(&options.logTileTimings), "log how long each tile takes") - ("threads",po::value< uint >(&options.threadNum)->default_value(0), "number of threads (automatically detected if 0)"); + ("log-tile-timings", po::bool_switch(&options.logTileTimings), "log how long each tile takes"); po::options_description performance("Performance options"); performance.add_options() - ("help-module", po::value(), - "produce a help for a given module") - ("version", "output the version number") + ("store", po::value< string >(&options.osm.storeFile), "temporary storage for node/ways/relations data") + ("fast", po::bool_switch(&options.osm.fast), "prefer speed at the expense of memory") + ("compact",po::bool_switch(&options.osm.compact), "use faster data structure for node lookups\nNOTE: This requires the input to be renumbered (osmium renumber)") + 
("no-compress-nodes", po::bool_switch(&options.osm.uncompressedNodes), "store nodes uncompressed") + ("no-compress-ways", po::bool_switch(&options.osm.uncompressedWays), "store ways uncompressed") + ("materialize-geometries", po::bool_switch(&options.osm.materializeGeometries), "materialize geometries") + ("shard-stores", po::bool_switch(&options.osm.shardStores), "use an alternate reading/writing strategy for low-memory machines") + ("threads",po::value< uint >(&options.threadNum)->default_value(0), "number of threads (automatically detected if 0)") ; desc.add(performance); @@ -54,6 +52,7 @@ void OptionsParser::showHelp() { OptionsParser::Options OptionsParser::parse(const int argc, const char* argv[]) { Options options; + po::options_description desc = getParser(options); po::positional_options_description p; p.add("input", 1).add("output", 1); @@ -65,6 +64,16 @@ OptionsParser::Options OptionsParser::parse(const int argc, const char* argv[]) throw OptionException{"Unknown option: " + ex.get_option_name()}; } po::notify(vm); + + if (options.osm.storeFile.empty()) { + options.osm.materializeGeometries = true; + } else { + if (options.osm.fast) { + options.osm.materializeGeometries = true; + } else { + options.osm.shardStores = true; + } + } if (vm.count("help")) { options.showHelp = true; diff --git a/test/options_parser.test.cpp b/test/options_parser.test.cpp index 77b4874d..e3af0ad2 100644 --- a/test/options_parser.test.cpp +++ b/test/options_parser.test.cpp @@ -49,6 +49,34 @@ MU_TEST(test_options_parser) { mu_check(opts.inputFiles[0] == "ontario.pbf"); mu_check(opts.outputFile == "foo.mbtiles"); mu_check(opts.outputMode == OutputMode::MBTiles); + mu_check(opts.osm.materializeGeometries); + mu_check(!opts.osm.shardStores); + } + + // --store should optimize for reduced memory + { + std::vector args = {"--output", "foo.mbtiles", "--input", "ontario.pbf", "--store", "/tmp/store"}; + auto opts = parse(args); + mu_check(opts.inputFiles.size() == 1); + 
mu_check(opts.inputFiles[0] == "ontario.pbf"); + mu_check(opts.outputFile == "foo.mbtiles"); + mu_check(opts.outputMode == OutputMode::MBTiles); + mu_check(opts.osm.storeFile == "/tmp/store"); + mu_check(!opts.osm.materializeGeometries); + mu_check(opts.osm.shardStores); + } + + // --store --fast should optimize for speed + { + std::vector args = {"--output", "foo.mbtiles", "--input", "ontario.pbf", "--store", "/tmp/store", "--fast"}; + auto opts = parse(args); + mu_check(opts.inputFiles.size() == 1); + mu_check(opts.inputFiles[0] == "ontario.pbf"); + mu_check(opts.outputFile == "foo.mbtiles"); + mu_check(opts.outputMode == OutputMode::MBTiles); + mu_check(opts.osm.storeFile == "/tmp/store"); + mu_check(opts.osm.materializeGeometries); + mu_check(!opts.osm.shardStores); } ASSERT_THROWS("Couldn't open .json config", "--input", "foo", "--output", "bar", "--config", "nonexistent-config.json"); From 411b71ee031ca509ce723cfc192354a18f975115 Mon Sep 17 00:00:00 2001 From: Colin Dellow Date: Sun, 24 Dec 2023 13:02:13 -0500 Subject: [PATCH 46/49] improve test coverage --- test/options_parser.test.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/options_parser.test.cpp b/test/options_parser.test.cpp index e3af0ad2..10e09597 100644 --- a/test/options_parser.test.cpp +++ b/test/options_parser.test.cpp @@ -68,12 +68,12 @@ MU_TEST(test_options_parser) { // --store --fast should optimize for speed { - std::vector args = {"--output", "foo.mbtiles", "--input", "ontario.pbf", "--store", "/tmp/store", "--fast"}; + std::vector args = {"--output", "foo.pmtiles", "--input", "ontario.pbf", "--store", "/tmp/store", "--fast"}; auto opts = parse(args); mu_check(opts.inputFiles.size() == 1); mu_check(opts.inputFiles[0] == "ontario.pbf"); - mu_check(opts.outputFile == "foo.mbtiles"); - mu_check(opts.outputMode == OutputMode::MBTiles); + mu_check(opts.outputFile == "foo.pmtiles"); + mu_check(opts.outputMode == OutputMode::PMTiles); mu_check(opts.osm.storeFile 
== "/tmp/store"); mu_check(opts.osm.materializeGeometries); mu_check(!opts.osm.shardStores); From 3d89a78b5c23e4e333f5702eb86fdf31c405b443 Mon Sep 17 00:00:00 2001 From: Colin Dellow Date: Sun, 24 Dec 2023 13:06:30 -0500 Subject: [PATCH 47/49] fixes --- include/node_stores.h | 6 +++--- include/options_parser.h | 2 +- include/sharded_node_store.h | 2 +- include/sorted_node_store.h | 2 +- src/options_parser.cpp | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/include/node_stores.h b/include/node_stores.h index 2ef14b70..05d00f4e 100644 --- a/include/node_stores.h +++ b/include/node_stores.h @@ -20,10 +20,10 @@ class BinarySearchNodeStore : public NodeStore LatpLon at(NodeID i) const override; size_t size() const override; void insert(const std::vector& elements) override; - void clear() { + void clear() override { reopen(); } - void batchStart() {} + void batchStart() override {} bool contains(size_t shard, NodeID id) const override; NodeStore& shard(size_t shard) override { return *this; } @@ -56,7 +56,7 @@ class CompactNodeStore : public NodeStore void insert(const std::vector& elements) override; void clear() override; void finalize(size_t numThreads) override {} - void batchStart() {} + void batchStart() override {} // CompactNodeStore has no metadata to know whether or not it contains // a node, so it's not suitable for used in sharded scenarios. 
diff --git a/include/options_parser.h b/include/options_parser.h index d9441aef..c5307932 100644 --- a/include/options_parser.h +++ b/include/options_parser.h @@ -35,7 +35,7 @@ namespace OptionsParser { std::vector inputFiles; std::string luaFile; std::string jsonFile; - uint threadNum = 0; + uint32_t threadNum = 0; std::string outputFile; std::string bbox; diff --git a/include/sharded_node_store.h b/include/sharded_node_store.h index ef001347..836c34ef 100644 --- a/include/sharded_node_store.h +++ b/include/sharded_node_store.h @@ -15,7 +15,7 @@ class ShardedNodeStore : public NodeStore { size_t size() const override; void batchStart() override; void insert(const std::vector& elements) override; - void clear() { + void clear() override { reopen(); } diff --git a/include/sorted_node_store.h b/include/sorted_node_store.h index e2832df8..61fdfad3 100644 --- a/include/sorted_node_store.h +++ b/include/sorted_node_store.h @@ -66,7 +66,7 @@ class SortedNodeStore : public NodeStore size_t size() const override; void batchStart() override; void insert(const std::vector& elements) override; - void clear() { + void clear() override { reopen(); } diff --git a/src/options_parser.cpp b/src/options_parser.cpp index 274fd848..3ea60798 100644 --- a/src/options_parser.cpp +++ b/src/options_parser.cpp @@ -37,7 +37,7 @@ po::options_description getParser(OptionsParser::Options& options) { ("no-compress-ways", po::bool_switch(&options.osm.uncompressedWays), "store ways uncompressed") ("materialize-geometries", po::bool_switch(&options.osm.materializeGeometries), "materialize geometries") ("shard-stores", po::bool_switch(&options.osm.shardStores), "use an alternate reading/writing strategy for low-memory machines") - ("threads",po::value< uint >(&options.threadNum)->default_value(0), "number of threads (automatically detected if 0)") + ("threads",po::value(&options.threadNum)->default_value(0), "number of threads (automatically detected if 0)") ; desc.add(performance); From 
1edbfd6e9b2bd1e21860805f10f4038a99bc4c38 Mon Sep 17 00:00:00 2001 From: Colin Dellow Date: Sun, 24 Dec 2023 14:00:06 -0500 Subject: [PATCH 48/49] update number of shards to 6 This has no performance impact as we never put anything in the 7th shard, and so we skip doing the 7th pass in the ReadPhase::Ways and ReadPhase::Relations phases. The benefit is only to avoid emitting a noisy log about how the 7th store has 0 entries in it. Timings with 6 shards on Vultr's 16-core machine here: https://gist.github.com/cldellow/77991eb4074f6a0f31766cf901659efb The new peak memory is ~12.2GB. I am a little perplexed -- the runtime on a 16-core server was previously: ``` $ time tilemaker --store /tmp/store --input planet-latest.osm.pbf --output tiles.mbtiles --shard-stores real 195m7.819s user 2473m52.322s sys 73m13.116s ``` But with the most recent commits on this branch, it was: ``` real 118m50.098s user 1531m13.026s sys 34m7.252s ``` This is incredibly suspicious. I also tried re-running commit bbf0957c1eb1cca7e35e1aa36e8a672e22a65034, and got: ``` real 123m15.534s user 1546m25.196s sys 38m17.093s ``` ...so I can't explain why the earlier runs took 195 min. Ideas: - the planet changed between runs, and a horribly broken geometry was fixed - Vultr gives quite different machines for the same class of server - perhaps most likely: I failed to click "CPU-optimized" when picking the earlier server, and got a slow machine the first time, and a fast machine the second time. I'm pretty sure I paid the same $, so I'm not sure I believe this. I don't think I really believe that a 33% reduction in runtime is explained by any of those, though. Anyway, just another thing to be befuddled by. 
--- src/sharded_node_store.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sharded_node_store.cpp b/src/sharded_node_store.cpp index e9a6dc16..0d915fbd 100644 --- a/src/sharded_node_store.cpp +++ b/src/sharded_node_store.cpp @@ -99,5 +99,5 @@ bool ShardedNodeStore::contains(size_t shard, NodeID id) const { } size_t ShardedNodeStore::shards() const { - return 7; + return 6; } From 657da1ab92fcf65de3f5adafcceddc064ef5e73d Mon Sep 17 00:00:00 2001 From: Colin Dellow Date: Mon, 25 Dec 2023 23:02:08 -0500 Subject: [PATCH 49/49] --store uses lazy geometries; permit overriding I did some experiments on a Hetzner 48-core box with 192GB of RAM: --store, materialize geometries: real 65m34.327s user 2297m50.204s sys 65m0.901s The process often failed to use 100% of CPU--if you naively divide user+sys/real you get ~36, whereas the ideal would be ~48. Looking at stack traces, it seemed to coincide with calls to Boost's rbtree_best_fit allocator. Maybe: - we're doing disk I/O, and it's just slower than recomputing the geometries - we're using the Boost mmap library suboptimally -- maybe there's some other allocator we could be using. 
I think we use the mmap allocator like a simple bump allocator, so I don't know why we'd need a red-black tree --store, lazy geometries: real 55m33.979s user 2386m27.294s sys 23m58.973s Faster, but still some overhead (user+sys/real => ~43) no --store, materialize geometries: OOM no --store, lazy geometries (used 175GB): real 51m27.779s user 2306m25.309s sys 16m34.289s This was almost 100% CPU (user+sys/real => ~45) From this, I infer: - `--store` should always default to lazy geometries in order to minimize the I/O burden - `--materialize-geometries` is a good default for non-store usage, but it's still useful to be able to override and use lazy geometries, if it then means you can fit the data entirely in memory --- include/options_parser.h | 3 +++ src/options_parser.cpp | 13 +++++++++---- test/options_parser.test.cpp | 15 ++++++++++++++- 3 files changed, 26 insertions(+), 5 deletions(-) diff --git a/include/options_parser.h b/include/options_parser.h index c5307932..3ca73785 100644 --- a/include/options_parser.h +++ b/include/options_parser.h @@ -28,6 +28,9 @@ namespace OptionsParser { bool uncompressedNodes = false; bool uncompressedWays = false; bool materializeGeometries = false; + // lazyGeometries is the inverse of materializeGeometries. It can be passed + // to override an implicit materializeGeometries, as in the non-store case. 
+ bool lazyGeometries = false; bool shardStores = false; }; diff --git a/src/options_parser.cpp b/src/options_parser.cpp index 3ea60798..529e5f4a 100644 --- a/src/options_parser.cpp +++ b/src/options_parser.cpp @@ -35,7 +35,8 @@ po::options_description getParser(OptionsParser::Options& options) { ("compact",po::bool_switch(&options.osm.compact), "use faster data structure for node lookups\nNOTE: This requires the input to be renumbered (osmium renumber)") ("no-compress-nodes", po::bool_switch(&options.osm.uncompressedNodes), "store nodes uncompressed") ("no-compress-ways", po::bool_switch(&options.osm.uncompressedWays), "store ways uncompressed") - ("materialize-geometries", po::bool_switch(&options.osm.materializeGeometries), "materialize geometries") + ("lazy-geometries", po::bool_switch(&options.osm.lazyGeometries), "generate geometries from the OSM stores; uses less memory") + ("materialize-geometries", po::bool_switch(&options.osm.materializeGeometries), "materialize geometries; uses more memory") ("shard-stores", po::bool_switch(&options.osm.shardStores), "use an alternate reading/writing strategy for low-memory machines") ("threads",po::value(&options.threadNum)->default_value(0), "number of threads (automatically detected if 0)") ; @@ -68,12 +69,16 @@ OptionsParser::Options OptionsParser::parse(const int argc, const char* argv[]) if (options.osm.storeFile.empty()) { options.osm.materializeGeometries = true; } else { - if (options.osm.fast) { - options.osm.materializeGeometries = true; - } else { + if (!options.osm.fast) { options.osm.shardStores = true; } } + + // You can pass --lazy-geometries to override the default of materialized geometries for + // the non-store case. 
+ if (options.osm.lazyGeometries) + options.osm.materializeGeometries = false; + if (vm.count("help")) { options.showHelp = true; diff --git a/test/options_parser.test.cpp b/test/options_parser.test.cpp index 10e09597..e230fc0d 100644 --- a/test/options_parser.test.cpp +++ b/test/options_parser.test.cpp @@ -53,6 +53,19 @@ MU_TEST(test_options_parser) { mu_check(!opts.osm.shardStores); } + // --lazy-geometries overrides default + { + std::vector args = {"--output", "foo.mbtiles", "--input", "ontario.pbf", "--lazy-geometries"}; + auto opts = parse(args); + mu_check(opts.inputFiles.size() == 1); + mu_check(opts.inputFiles[0] == "ontario.pbf"); + mu_check(opts.outputFile == "foo.mbtiles"); + mu_check(opts.outputMode == OutputMode::MBTiles); + mu_check(!opts.osm.materializeGeometries); + mu_check(opts.osm.lazyGeometries); + mu_check(!opts.osm.shardStores); + } + // --store should optimize for reduced memory { std::vector args = {"--output", "foo.mbtiles", "--input", "ontario.pbf", "--store", "/tmp/store"}; @@ -75,7 +88,7 @@ MU_TEST(test_options_parser) { mu_check(opts.outputFile == "foo.pmtiles"); mu_check(opts.outputMode == OutputMode::PMTiles); mu_check(opts.osm.storeFile == "/tmp/store"); - mu_check(opts.osm.materializeGeometries); + mu_check(!opts.osm.materializeGeometries); mu_check(!opts.osm.shardStores); }