-
Notifications
You must be signed in to change notification settings - Fork 20
/
Copy pathcodecs.h
319 lines (266 loc) · 17.2 KB
/
codecs.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
#pragma once
#include "common.h"
#include "docidupdates.h"
#include "docset_iterators_base.h"
#include "docwordspace.h"
#include "runtime.h"
// Use of Codecs::Google results in a somewhat large index, while the access time is similar(maybe somewhat slower) to Lucene's codec
namespace Trinity {
struct candidate_document;
struct queryexec_ctx;
// Information about a term's posting list and number of documents it matches.
// We track the number of documents because it may be useful(and it is) to some codecs, and also
// is extremely useful during execution where we re-order the query nodes based on evaluation cost which
// is directly related to the a term's posting list size based on the number of documents it matches.
struct term_index_ctx final {
uint32_t documents;
// For each each term, the inverted index contains a `posting list`, where
// each posting contains the occurrences information (e.g frequences and positions)
// for documents that contain the trerm. This is the chunk in the index that
// holds the posting list.
// This is codec specific though -- for some codecs, this could mean something else
// e.g for a memory-resident index source/segment, you could use inexChunk to refer to some in-memory data
range32_t indexChunk;
term_index_ctx(const uint32_t d, const range32_t c)
: documents{d}, indexChunk{c} {
}
term_index_ctx(const term_index_ctx &o) {
documents = o.documents;
indexChunk = o.indexChunk;
}
term_index_ctx(term_index_ctx &&o) {
documents = o.documents;
indexChunk = o.indexChunk;
}
term_index_ctx &operator=(const term_index_ctx &o) {
documents = o.documents;
indexChunk = o.indexChunk;
return *this;
}
term_index_ctx &operator=(const term_index_ctx &&o) {
documents = o.documents;
indexChunk = o.indexChunk;
return *this;
}
term_index_ctx() = default;
};
namespace Codecs {
struct Encoder;
struct AccessProxy;
// Represents a new indexer session
// All indexer sessions have an `indexOut` that holds the inverted index(posting lists for each distinct term)
// but other codecs may e.g open/track more files or buffers depending on their needs.
//
// The base path makes sense for disk based storage, but some codecs may only operate on in-memory
// data so it may not be used in those designs.
struct IndexSession {
enum class Capabilities : uint8_t {
// append_index_chunk() is implemented
AppendIndexChunk = 1,
Merge = 1 << 1
};
const uint8_t caps;
IOBuffer indexOut;
// Whenever you flush indexOut to disk, say, every few MBs or GBs,
// you need to adjust indexOutFlushed, because the various codecs implementations need to compute some offset in the index
//
// Obviously, you are not required to flush, but it's an option.
// Doing this would also likely improve performance, for instead of expanding indexOut's underlying allocated memory and thus
// likely incurring a memmcpy() cost, by keeping the buffer small and flushing it periodically to a backing file, this is avoided, memory
// allocation remainins low/constant and no need for memcpy() is required (if no reallocations are required)
//
// TODO: consider an alternative idea, where instead of flushing to disk, we 'll just steal indexOut memory (via IOBuffer::release() ) and
// length, and track that in a vector, and flush indexOut, and in the end, we 'll just compile a new indexOut from those chunks.
// This would provide those benefits:
// - no need to allocate large chunks of memory to hold the whold index; will allocate smaller chunks (and maybe even in the end
// serialize all them to disk, free their memory, and allocate memory for the index and load it from disk)
// - no need to resize the IOBuffer, i.e no need for memcpy() the data to new buffers on reallocation
uint32_t indexOutFlushed;
char basePath[PATH_MAX];
// The segment name should be the generation
// e.g for path Trinity/Indices/Wikipedia/Segments/100
// the generation is extracted as 100, but, again, this is codec specific
IndexSession(const char *bp, const uint8_t capabilities = 0)
: caps{capabilities}, indexOutFlushed{0} {
strcpy(basePath, bp);
}
virtual ~IndexSession() {
}
// Utility method
// Demonstrates how you should update indexOutFlushed
void flush_index(int fd);
// Handy utility function
// see SegmentIndexSession::commit()
void persist_terms(std::vector<std::pair<str8_t, term_index_ctx>> &);
// Subclasses should e.g open files, allocate memory etc
virtual void begin() = 0;
// Subclasses should undo begin() ops. e.g close files and release resources
virtual void end() = 0;
// This may be stored in a segment directory in order to determine
// which codec to use to access it
virtual strwlen8_t codec_identifier() = 0;
// Constructs a new encoder
// Handy utility function
virtual Encoder *new_encoder() = 0;
// This is used during merge
// By providing the access to an index and a term's tctx, we should append to the current indexOut
// for most codecs, that's just a matter of copying the region in tctx.indexChunk into indexOut, but
// some other codecs may need to access other files in the src
// (e.g lucene codec uses an extra files for positions/attributes)
//
// must hold that (src->codec_identifier() == this->codec_identifier())
//
// XXX: If you want to filter a chunk's documents though, don't use this method.
// See MergeCandidatesCollection::merge()
//
// If you have a fancy codec that you don't expect to use for merging other segments
// (e.g some fancy in-memory wrapper that is
// really only used for queries as an IndexSource), then you can just implement this and merge() as no-ops.
//
// see MergeCandidatesCollection::merge() impl.
//
// UPDATE: this is now optional. If your codec implements it, make sure you set (Capabilities::AppendIndexChunk)
// in capabilities flags passed to Codecs::IndexSession::IndexSession(). Both Google and Lucene's do so.
// The default impl. does nothing/aborts
virtual range32_t append_index_chunk(const AccessProxy *src, const term_index_ctx srcTCTX) {
std::abort();
return {};
}
// If we have multiple postng lists for the same term(i.e merging 2+ segments
// and same term exists in 2+ of them)
// where we can't just use append_index_chunk() but instead we need to merge them,
// and to do that, we we need to use this codec-specific merge function
// that will do this for us. Use append_index_chunk() otherwise.
//
// You are expected to have encoder->begin_term() before invoking this method,
// and to invoke encoder->end_term() after the method has returned
//
// The input pairs hold an AccessProxy for the data and a range in the the index of that data, and
// the encoder the target codec encoder
//
// See MergeCandidatesCollection::merge() impl.
struct merge_participant final {
AccessProxy * ap;
term_index_ctx tctx;
masked_documents_registry *maskedDocsReg;
};
// UPDATE: now optional; if you override and implement it, make sure you set Capabilities::Merge in the constructo
virtual void merge(merge_participant *participants, const uint16_t participantsCnt, Encoder *const encoder) {
}
};
// Encoder interface for encoding a single term's posting list
struct Encoder {
IndexSession *const sess;
// You should have s->begin() before you use encode any postlists for that session
Encoder(IndexSession *s)
: sess{s} {
}
virtual ~Encoder() {
}
virtual void begin_term() = 0;
virtual void begin_document(const isrc_docid_t documentID) = 0;
// If you want to register a hit for a special token (e.g site:foo.com) where position makes no sense,
// you should use position 0(or a very high position, but 0 is preferrable)
// You will likely only need to set payload for special terms (e.g site:foo.com).
// Payload can be upto 8 byte sin size(sizeof(uint64_t)). You should try to keep that as low as possible.
virtual void new_hit(const uint32_t position, const range_base<const uint8_t *, const uint8_t> payload) = 0;
virtual void end_document() = 0;
virtual void end_term(term_index_ctx *) = 0;
};
// This is how postings lists are accessed and hits are materialized.
// It is created by a Codecs::Decoder, and it should in practice hold a reference
// to the decoder and delegate work to it, but that should depend on your codec's impl.
//
// In the past, Decoder's implemented next() and advance()/seek(), but it turned to be a bad idea
// because it was somewhat challenging to support multiple terms in the query, where you want to access
// the same underlying decoder state, but traverse them independently.
struct Decoder;
struct PostingsListIterator
: public Trinity::DocsSetIterators::Iterator {
// decoder that created this iterator
Decoder *const dec;
// For current document
tokenpos_t freq;
PostingsListIterator(Decoder *const d)
: Iterator{Trinity::DocsSetIterators::Type::PostingsListIterator}, dec{d} {
}
virtual ~PostingsListIterator() {
}
// Materializes hits for the _current_ document in the postings list
// You must also dwspace->set(termID, pos) for positions != 0
//
// XXX: if you materialize, it's likely that curDocument.freq will be reset to 0
// so you should not materialize if have already done so.
virtual void materialize_hits(DocWordsSpace *dwspace, term_hit *out) = 0;
inline auto decoder() noexcept {
return dec;
}
inline const Decoder *decoder() const noexcept {
return dec;
}
#ifdef RDP_NEED_TOTAL_MATCHES
inline uint32_t total_matches() override final {
return freq;
}
#endif
};
// Decoder interface for decoding a single term's posting list.
// If a decoder makes use of skiplist data, they are expected to be serialized in the index
// and the decoder is responsible for deserializing and using them from there, although this is specific to the codec
//
// Decoder is responsible for maintaining term-specific segment state, and for creating new Posting Lists iterators, which in turn
// should just contain iterator-specific state and logic working in conjuction with the Decoder that created/provided it.
struct Decoder {
// Term specific data for the term of this decode
// see e.g DocsSetIterators::cost()
term_index_ctx indexTermCtx;
// use set_exec() to set those
exec_term_id_t execCtxTermID{0};
queryexec_ctx *rctx{nullptr};
constexpr auto exec_ctx_termid() const noexcept {
return execCtxTermID;
}
// Initialise the decoding state for accessing the postings list
//
// Not going to rely on a constructor for initialization because
// we want to force subclasses to define a method with this signature
virtual void init(const term_index_ctx &, AccessProxy *) = 0;
// This is how you are going to access the postings list
virtual PostingsListIterator *new_iterator() = 0;
Decoder() {
}
virtual ~Decoder() {
}
void set_exec(exec_term_id_t tid, queryexec_ctx *const r) {
execCtxTermID = tid;
rctx = r;
}
};
// Responsible for initializing segment/codec specific state and for generating(factory) decoders for terms
// All AccessProxy instances have a pointer to the actual index(posts lists) in common
struct AccessProxy {
const char *basePath;
// Index of the posting lists for all distinct terms
// This is typically a pointer to memory mapped index file, or a memory-resident anything, or
// something other specific to the codec
const uint8_t *const indexPtr;
// Utility function: returns an initialised new decoder for a term's posting list
// Some codecs(e.g lucene's) may need to access the filesystem and/or other codec specific state
// AccessProxy faciliates that (this is effectively a pointer to self)
//
// XXX: Make sure you Decoder::init() before you return from the method
// see e.g Google::AccessProxy::new_decoder()
//
// See Codecs::Decoder for execCtxTermID
virtual Decoder *new_decoder(const term_index_ctx &tctx) = 0;
AccessProxy(const char *bp, const uint8_t *index_ptr)
: basePath{bp}, indexPtr{index_ptr} {
// Subclasses should open files, etc
}
// See IndexSession::codec_identifier()
virtual strwlen8_t codec_identifier() = 0;
virtual ~AccessProxy() {
}
};
} // namespace Codecs
} // namespace Trinity