Skip to content

Commit

Permalink
New version based on intensive round of reviewing together with Johannes
Browse files Browse the repository at this point in the history
In particular, there is no longer a "construction" phase now, at the
cost of using a slower hash map (absl::node_hash_map), slightly more
space (the hash map and a vector to the strings stored in the hash map),
and an indirection when looking up the word for an index (we have to
follow the pointer to the actual string stored in the hash map).
  • Loading branch information
Hannah Bast committed Nov 10, 2022
1 parent 77419df commit de97c03
Show file tree
Hide file tree
Showing 17 changed files with 167 additions and 169 deletions.
29 changes: 20 additions & 9 deletions src/engine/Bind.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -160,20 +160,31 @@ void Bind::computeExpressionBind(
*resultType =
sparqlExpression::detail::expressionResultTypeToQleverResultType<T>();

size_t i = 0;
for (auto&& resultValue : resultGenerator) {
output(i, inCols) =
sparqlExpression::detail::constantExpressionResultToId(
resultValue, *(outputResultTable->_localVocab),
isConstant && i > 0);
i++;
if (isConstant) {
auto it = resultGenerator.begin();
if (it != resultGenerator.end()) {
Id constantId =
sparqlExpression::detail::constantExpressionResultToId(
*it, *(outputResultTable->_localVocab));
for (size_t i = 0; i < inSize; ++i) {
output(i, inCols) = constantId;
}
}
} else {
size_t i = 0;
for (auto&& resultValue : resultGenerator) {
output(i, inCols) =
sparqlExpression::detail::constantExpressionResultToId(
resultValue, *(outputResultTable->_localVocab));
i++;
}
}
}
};

outputResultTable->_localVocab->startConstructionPhase();
// outputResultTable->_localVocab->startConstructionPhase();
std::visit(visitor, std::move(expressionResult));
outputResultTable->_localVocab->endConstructionPhase();
// outputResultTable->_localVocab->endConstructionPhase();

outputResultTable->_idTable = output.moveToDynamic();
}
1 change: 1 addition & 0 deletions src/engine/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ add_library(engine
../util/Socket.h
Comparators.h
ResultTable.h ResultTable.cpp
LocalVocab.h LocalVocab.cpp
QueryExecutionContext.h
IndexScan.h IndexScan.cpp
Join.h Join.cpp
Expand Down
10 changes: 5 additions & 5 deletions src/engine/GroupBy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ void GroupBy::processGroup(
*resultType =
sparqlExpression::detail::expressionResultTypeToQleverResultType<T>();
resultEntry = sparqlExpression::detail::constantExpressionResultToId(
singleResult, *(outTable->_localVocab), false);
singleResult, *(outTable->_localVocab));
} else {
// This should never happen since aggregates always return constants.
AD_FAIL()
Expand Down Expand Up @@ -235,9 +235,9 @@ void GroupBy::doGroupBy(const IdTable& dynInput,

if (groupByCols.empty()) {
// The entire input is a single group
outTable->_localVocab->startConstructionPhase();
// outTable->_localVocab->startConstructionPhase();
processNextBlock(0, input.size());
outTable->_localVocab->endConstructionPhase();
// outTable->_localVocab->endConstructionPhase();
*dynResult = result.moveToDynamic();
return;
}
Expand All @@ -251,7 +251,7 @@ void GroupBy::doGroupBy(const IdTable& dynInput,
size_t blockStart = 0;
auto checkTimeoutAfterNCalls = checkTimeoutAfterNCallsFactory(32000);

outTable->_localVocab->startConstructionPhase();
// outTable->_localVocab->startConstructionPhase();
for (size_t pos = 1; pos < input.size(); pos++) {
checkTimeoutAfterNCalls(currentGroupBlock.size());
bool rowMatchesCurrentBlock =
Expand All @@ -269,7 +269,7 @@ void GroupBy::doGroupBy(const IdTable& dynInput,
}
}
processNextBlock(blockStart, input.size());
outTable->_localVocab->endConstructionPhase();
// outTable->_localVocab->endConstructionPhase();
*dynResult = result.moveToDynamic();
}

Expand Down
17 changes: 3 additions & 14 deletions src/engine/Join.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -145,21 +145,10 @@ void Join::computeResult(ResultTable* result) {
&result->_idTable);

// If only one of the two operands has a local vocab, pass it on.
bool leftLocalVocabEmpty = leftRes->_localVocab->empty();
bool rightLocalVocabEmpty = rightRes->_localVocab->empty();
if (!leftLocalVocabEmpty || !rightLocalVocabEmpty) {
if (!leftLocalVocabEmpty && rightLocalVocabEmpty) {
result->_localVocab = std::move(leftRes->_localVocab);
} else if (leftLocalVocabEmpty && !rightLocalVocabEmpty) {
result->_localVocab = std::move(rightRes->_localVocab);
} else {
throw std::runtime_error(
"JOIN of two results, where both have a non-empty vocabulary, is "
"currently not supported");
}
}
result->_localVocab = LocalVocab::mergeLocalVocabsIfOneIsEmpty(
leftRes->_localVocab, rightRes->_localVocab);

LOG(DEBUG) << "Join result computation done." << endl;
LOG(DEBUG) << "Join result computation done" << endl;
}

// _____________________________________________________________________________
Expand Down
51 changes: 51 additions & 0 deletions src/engine/LocalVocab.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
// Copyright 2022, University of Freiburg
// Chair of Algorithms and Data Structures
// Author: Hannah Bast <bast@cs.uni-freiburg.de>

#include "engine/LocalVocab.h"

#include "absl/strings/str_cat.h"
#include "global/Id.h"
#include "global/ValueId.h"

// _____________________________________________________________________________
Id LocalVocab::getIdAndAddIfNotContained(const std::string& word) {
  // Look up `word` and insert it with the next free ID if it is not yet
  // contained. `try_emplace` computes the hash of `word` only once for both
  // cases (note that hashing a string is not cheap) and, in contrast to
  // `insert({word, id})`, does not build a temporary key-value pair (and hence
  // does not copy `word`) when the word is already contained. The return value
  // is a pair of an iterator to the (already existing or newly inserted)
  // key-value pair and a `bool`, which is `true` if and only if the word was
  // newly inserted.
  auto [entryIt, isNewWord] = wordsToIdsMap_.try_emplace(word, nextFreeId_);
  if (isNewWord) {
    // Remember the address of the key stored in the hash map, so that the word
    // can be found by its ID without storing the string a second time.
    idsToWordsMap_.push_back(&(entryIt->first));
    // The IDs are contiguous, so the next free ID is the new number of words.
    nextFreeId_ = Id::makeFromLocalVocabIndex(
        LocalVocabIndex::make(idsToWordsMap_.size()));
  }
  return entryIt->second;
}

// _____________________________________________________________________________
const std::string& LocalVocab::getWord(LocalVocabIndex localVocabIndex) const {
  // Return the word with the given local vocab index. Valid indices are
  // 0, ..., size - 1, so an index equal to the size is already out of range
  // (hence `>=` and not `>`). Throws `std::runtime_error` for an invalid index.
  if (localVocabIndex.get() >= idsToWordsMap_.size()) {
    throw std::runtime_error(absl::StrCat(
        "LocalVocab error: request for word with local vocab index ",
        localVocabIndex.get(), ", but size of local vocab is only ",
        idsToWordsMap_.size(), ", please contact the developers"));
  }
  // The vector stores pointers to the words, so one dereference is needed.
  return *(idsToWordsMap_[localVocabIndex.get()]);
}

// _____________________________________________________________________________
std::shared_ptr<LocalVocab> LocalVocab::mergeLocalVocabsIfOneIsEmpty(
    std::shared_ptr<LocalVocab> localVocab1,
    std::shared_ptr<LocalVocab> localVocab2) {
  // Inspect both vocabularies first, then decide which one to pass on.
  const bool firstHasWords = !localVocab1->empty();
  const bool secondHasWords = !localVocab2->empty();
  if (firstHasWords) {
    if (secondHasWords) {
      // A real merge would require re-assigning local IDs, which is not
      // implemented.
      throw std::runtime_error(
          "Merging of two non-empty local vocabularies is currently not "
          "supported, please contact the developers");
    }
    return localVocab1;
  }
  // Either only the second vocabulary has words, or both are empty. In both
  // cases, the second one is the result.
  return localVocab2;
}
69 changes: 69 additions & 0 deletions src/engine/LocalVocab.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
// Copyright 2022, University of Freiburg
// Chair of Algorithms and Data Structures
// Author: Hannah Bast <bast@cs.uni-freiburg.de>

#pragma once

#include "absl/container/node_hash_map.h"

// A class for maintaining a local vocabulary with contiguous (local) IDs. This
// is meant for words that are not part of the normal vocabulary (constructed
// from the input data at indexing time).
//
// TODO: This is a first version of this class with basic functionality. Note
// that the local vocabulary used to be a simple `std::vector<std::string>`
// defined inside of the `ResultTable` class. You gotta start somewhere.
class LocalVocab {
 public:
  // Create a new, empty local vocabulary.
  LocalVocab() = default;

  // Prevent accidental copying of a local vocabulary (it can be quite large),
  // but moving it is OK. All copy and move operations are declared explicitly:
  // with only the move constructor declared, move assignment would silently be
  // unavailable.
  //
  // NOTE(review): The defaulted move operations move both containers. This
  // presumably keeps the pointers in `idsToWordsMap_` valid, because a node
  // hash map stores its elements in separately allocated nodes, which are not
  // relocated when the map itself is moved. Please confirm against the Abseil
  // documentation.
  LocalVocab(const LocalVocab&) = delete;
  LocalVocab& operator=(const LocalVocab&) = delete;
  LocalVocab(LocalVocab&&) = default;
  LocalVocab& operator=(LocalVocab&&) = default;

  // Get ID of a word in the local vocabulary. If the word was already
  // contained, return the already existing ID. If the word was not yet
  // contained, add it, and return the new ID.
  Id getIdAndAddIfNotContained(const std::string& word);

  // The number of words in the vocabulary.
  size_t size() const { return idsToWordsMap_.size(); }

  // Return true if and only if the local vocabulary is empty.
  bool empty() const { return idsToWordsMap_.empty(); }

  // Return a const reference to the word with the given local vocab index.
  const std::string& getWord(LocalVocabIndex localVocabIndex) const;

  // Merge two local vocabularies if at least one of them is empty. If both are
  // non-empty, throws an exception.
  //
  // TODO: Eventually, we want to have one local vocab for the whole query to
  // which each operation writes (one after the other). Then we don't need a
  // merge function anymore.
  static std::shared_ptr<LocalVocab> mergeLocalVocabsIfOneIsEmpty(
      std::shared_ptr<LocalVocab> localVocab1,
      std::shared_ptr<LocalVocab> localVocab2);

 private:
  // A map of the words in the local vocabulary to their local IDs. This is a
  // node hash map because we need the addresses of the words (which are of type
  // `std::string`) to remain stable over their lifetime in the hash map because
  // we refer to them in `idsToWordsMap_` below.
  absl::node_hash_map<std::string, Id> wordsToIdsMap_;

  // A map of the local IDs to the words. Since the IDs are contiguous, we can
  // use a `std::vector`. We store pointers to the actual words in
  // `wordsToIdsMap_` to avoid storing every word twice. This saves space, but
  // costs us an indirection when looking up a word by its ID.
  std::vector<const std::string*> idsToWordsMap_;

  // The next free local ID (will be incremented by one each time we add a new
  // word).
  Id nextFreeId_ = Id::makeFromLocalVocabIndex(LocalVocabIndex::make(0));
};
11 changes: 3 additions & 8 deletions src/engine/QueryExecutionTree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -333,12 +333,8 @@ QueryExecutionTree::idToStringAndType(Id id,
return std::pair{std::move(entity.value()), nullptr};
}
case Datatype::LocalVocabIndex: {
auto optionalString =
resultTable.indexToOptionalString(id.getLocalVocabIndex());
if (!optionalString.has_value()) {
return std::nullopt;
}
return std::pair{optionalString.value(), nullptr};
return std::pair{
resultTable._localVocab->getWord(id.getLocalVocabIndex()), nullptr};
}
case Datatype::TextRecordIndex:
return std::pair{_qec->getIndex().getTextExcerpt(id.getTextRecordIndex()),
Expand Down Expand Up @@ -453,8 +449,7 @@ ad_utility::streams::stream_generator QueryExecutionTree::generateResults(
break;
case Datatype::LocalVocabIndex:
co_yield escapeFunction(
resultTable->indexToOptionalString(id.getLocalVocabIndex())
.value_or(""));
resultTable->_localVocab->getWord(id.getLocalVocabIndex()));
break;
case Datatype::TextRecordIndex:
co_yield escapeFunction(
Expand Down
41 changes: 1 addition & 40 deletions src/engine/ResultTable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,46 +7,7 @@

#include <cassert>

#include "global/Id.h"
#include "global/ValueId.h"

// _____________________________________________________________________________
Id LocalVocab::getIdAndAddIfNotContained(const std::string& word) {
if (constructionHasFinished_) {
throw std::runtime_error(
"Invalid use of `LocalVocab`: You must not call "
"`getIdAndAddIfNotContained` after `endConstructionPhase` has been "
"called");
}
// The following code avoids computing the hash for `word` twice in case we
// see it for the first time (note that hashing a string is not cheap). The
// return value of the `insert` operation is a pair, where `result.first` is
// an iterator to the (already existing or newly inserted) key-value pair, and
// `result.second` is a `bool`, which is `true` if and only if the value was
// newly inserted.
auto result = wordsToIdsMap_.insert(
{word,
Id::makeFromLocalVocabIndex(LocalVocabIndex::make(words_.size()))});
if (result.second) {
words_.push_back(word);
}
return result.first->second;
}

// _____________________________________________________________________________
void LocalVocab::startConstructionPhase() {
if (!words_.empty()) {
throw std::runtime_error(
"Invalid use of `LocalVocab`: `startConstructionPhase` must currently "
"only be called when the vocabulary is still empty");
}
}

// _____________________________________________________________________________
void LocalVocab::endConstructionPhase() {
wordsToIdsMap_.clear();
constructionHasFinished_ = true;
}
#include "engine/LocalVocab.h"

// _____________________________________________________________________________
ResultTable::ResultTable(ad_utility::AllocatorWithLimit<Id> allocator)
Expand Down
61 changes: 1 addition & 60 deletions src/engine/ResultTable.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <vector>

#include "engine/IdTable.h"
#include "engine/LocalVocab.h"
#include "engine/ResultType.h"
#include "global/Id.h"
#include "global/ValueId.h"
Expand All @@ -25,59 +26,6 @@ using std::mutex;
using std::unique_lock;
using std::vector;

// The local vocabulary for a particular result table. It maps the IDs that are
// not part of the normal vocabulary
//
//
// It contains a map from
// (local vocab) ids
class LocalVocab {
public:
// Create a new, empty local vocabulary.
LocalVocab() {}

// Prevent accidental copying of a local vocabulary.
// TODO: Needed in SparqlExpressionTestHelpers.h:91.
// LocalVocab(const LocalVocab&) = delete;

// Get ID of a word in the local vocabulary. If the word was already
// contained, return the already existing ID. If the word was not yet
// contained, add it, and return the new ID.
[[maybe_unused]] Id getIdAndAddIfNotContained(const std::string& word);

// Start the construction of a local vocabulary. This is currently allowed
// only once, when the vocabulary is still empty.
void startConstructionPhase();

// Signal that the construction of the local vocabulary is done. This call
// will clear the `wordsToIdsMap_` (to save space) and afterwards,
// `getIdAndAddIfNotContained` can no longer be called.
void endConstructionPhase();

// The number of words in the vocabulary.
size_t size() const { return words_.size(); }

// Return true if and only if the local vocabulary is empty.
bool empty() const { return words_.empty(); }

// Return a const reference to the i-th word.
const std::string& operator[](size_t i) const { return words_[i]; }

private:
// The words of the local vocabulary. The index of a word in the `std::vector`
// corresponds to its ID in the local vocabulary.
std::vector<string> words_;

// Remember which words are already in the vocabulary and with which ID. This
// map is only used during the construction of a local vocabulary and can (and
// should) be cleared when the construction is done (to save space).
ad_utility::HashMap<std::string, Id> wordsToIdsMap_;

// Indicator whether the vocabulary is still under construction (only then can
// `getIdAndAddIfNotContained` be called) or done.
bool constructionHasFinished_ = false;
};

class ResultTable {
public:
enum Status { IN_PROGRESS = 0, FINISHED = 1, ABORTED = 2 };
Expand Down Expand Up @@ -109,13 +57,6 @@ class ResultTable {

virtual ~ResultTable();

std::optional<std::string> indexToOptionalString(LocalVocabIndex idx) const {
if (idx.get() < _localVocab->size()) {
return (*_localVocab)[idx.get()];
}
return std::nullopt;
}

size_t size() const;
size_t width() const { return _idTable.cols(); }

Expand Down
1 change: 0 additions & 1 deletion src/engine/Sort.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,6 @@ void Sort::computeResult(ResultTable* result) {
result->_resultTypes.insert(result->_resultTypes.end(),
subRes->_resultTypes.begin(),
subRes->_resultTypes.end());
// TODO: Shouldn't we use std::move here?
result->_localVocab = subRes->_localVocab;
result->_idTable.insert(result->_idTable.end(), subRes->_idTable.begin(),
subRes->_idTable.end());
Expand Down
Loading

0 comments on commit de97c03

Please sign in to comment.