Skip to content

Commit

Permalink
Address comments from Johannes' review + rebase to master
Browse files Browse the repository at this point in the history
Thank you very much for your comments, Johannes. It's not yet perfect,
but I think it's ready for a second round of reviewing now.
  • Loading branch information
Hannah Bast committed Nov 28, 2022
1 parent e48416e commit 4a348d4
Show file tree
Hide file tree
Showing 18 changed files with 315 additions and 175 deletions.
2 changes: 0 additions & 2 deletions src/engine/ResultTable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@
#include <cassert>

#include "engine/LocalVocab.h"
#include "global/Id.h"
#include "global/ValueId.h"

// _____________________________________________________________________________
ResultTable::ResultTable(ad_utility::AllocatorWithLimit<Id> allocator)
Expand Down
269 changes: 158 additions & 111 deletions src/engine/Service.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,6 @@
// Chair of Algorithms and Data Structures.
// Author: Hannah Bast (bast@cs.uni-freiburg.de)

// NOTE: The `util/HttpServer/beast.h` *must* come before one or several of the
// `<boost/...>` includes because the former defines some variables which, if
// undefined, are defined by the latter. If the `util/HttpServer/beast.h` include
// comes later (or in another file), this leads to a "redefinition" warning and,
// much worse, a nasty segmentation fault in the beast code below.
//
// TODO: Solve this in a more robust manner. The order of includes should not
// matter and it should certainly not cause segmentation faults.
#include "engine/Service.h"

#include <absl/strings/str_cat.h>
Expand All @@ -28,148 +20,203 @@
// ____________________________________________________________________________
Service::Service(QueryExecutionContext* qec,
parsedQuery::Service parsedServiceClause)
: Operation(qec) {
parsedServiceClause_ = parsedServiceClause;
using Url = ad_utility::httpUtils::Url;
: Operation(qec), parsedServiceClause_(parsedServiceClause) {}

// ____________________________________________________________________________
std::string Service::asStringImpl(size_t indent) const {
  // The result consists of `indent` spaces followed by the SERVICE clause:
  // the endpoint (rendered via `toSparql()`), then the prologue and the graph
  // pattern string enclosed in braces.
  //
  // TODO: This duplicates code in GraphPatternOperation.cpp .
  std::ostringstream stream;
  stream << std::string(indent, ' ');
  stream << "SERVICE " << parsedServiceClause_.varOrIri_.toSparql() << " {\n";
  stream << parsedServiceClause_.prologue_ << "\n";
  stream << parsedServiceClause_.graphPatternAsString_ << "\n}\n";
  return std::move(stream).str();
}

// ____________________________________________________________________________
std::string Service::getDescriptor() const {
  // Short description of this operation: a fixed prefix followed by the
  // SPARQL rendering of the SERVICE endpoint (variable or IRI).
  const auto& endpointAsSparql = parsedServiceClause_.varOrIri_.toSparql();
  return absl::StrCat("Service with IRI ", endpointAsSparql);
}

// TODO: The following code sends a SPARQL query to the remote endpoint and
// turns the TSV result into a `parsedQuery::Values` object. Is this
// constructor the right place for this code or should this be in
// `computeResult` or somewhere else?
// ____________________________________________________________________________
size_t Service::getResultWidth() const {
  // One result column per visible variable of the SERVICE clause.
  const auto& visibleVariables = parsedServiceClause_.visibleVariables_;
  return visibleVariables.size();
}

// ____________________________________________________________________________
std::vector<size_t> Service::resultSortedOn() const {
  // No sort columns are reported for the result of a SERVICE clause.
  return std::vector<size_t>{};
}

// ____________________________________________________________________________
VariableToColumnMap Service::computeVariableToColumnMap() const {
  // The i-th visible variable of the SERVICE clause is mapped to column i.
  VariableToColumnMap variableToColumn;
  size_t column = 0;
  for (const auto& variable : parsedServiceClause_.visibleVariables_) {
    variableToColumn[variable] = column;
    ++column;
  }
  return variableToColumn;
}

// ____________________________________________________________________________
float Service::getMultiplicity([[maybe_unused]] size_t col) {
  // TODO: No information about the multiplicities is available at query
  // planning time yet, so every column gets the same placeholder value `1`.
  constexpr float placeholderMultiplicity = 1.0f;
  return placeholderMultiplicity;
}

// ____________________________________________________________________________
size_t Service::getSizeEstimate() {
  // TODO: No information about the result size is available at query planning
  // time yet, so a fixed placeholder of `100'000` is returned.
  constexpr size_t placeholderSizeEstimate = 100'000;
  return placeholderSizeEstimate;
}

// ____________________________________________________________________________
size_t Service::getCostEstimate() {
// TODO: For now, we don't have any information about the cost at query
// planning time, so we just return ten times the estimated size.
return 10 * getSizeEstimate();
}

// ____________________________________________________________________________
void Service::computeResult(ResultTable* result) {
using Url = ad_utility::httpUtils::Url;

// Get the URL of the SPARQL endpoint (we can only do fixed IRIs at the
// moment).
std::string_view serviceIriString = parsedServiceClause_.serviceVarOrIri_;
if (serviceIriString.starts_with("?")) {
if (std::holds_alternative<Variable>(parsedServiceClause_.varOrIri_)) {
throw std::runtime_error(
"SERVICE with variable for endpoint not yet supported by QLever");
}
auto iri = std::get<GraphTerm>(parsedServiceClause_.varOrIri_);
// According to the grammar, this can only be of type `Iri`. See the comment
// in `Service.h` for `varOrIri_`.
AD_CHECK(std::holds_alternative<Iri>(iri));
std::string_view serviceIriString = std::get<Iri>(iri).iri();
AD_CHECK(serviceIriString.starts_with("<") &&
serviceIriString.ends_with(">"));
serviceIriString.remove_prefix(1);
serviceIriString.remove_suffix(1);
Url serviceUrl(serviceIriString);
Url serviceUrl{serviceIriString};

// Construct the query to be sent to the SPARQL endpoint.
std::string serviceQuery =
absl::StrCat(parsedServiceClause_.servicePrologue_, "\nSELECT * WHERE ",
parsedServiceClause_.serviceQueryBody_);
std::string variablesForSelectClause =
absl::StrJoin(parsedServiceClause_.visibleVariables_, " ",
[](std::string* out, const Variable& variable) {
out->append(variable.name());
});
std::string serviceQuery = absl::StrCat(
parsedServiceClause_.prologue_, "\nSELECT ", variablesForSelectClause,
" WHERE ", parsedServiceClause_.graphPatternAsString_);
LOG(INFO) << "Sending SERVICE query to remote endpoint "
<< "(protocol: " << serviceUrl.protocolAsString()
<< ", host: " << serviceUrl.host()
<< ", port: " << serviceUrl.port()
<< ", target: " << serviceUrl.target() << ")" << std::endl
<< serviceQuery << std::endl;

// Send the query and get the result.
// Send the query to the remote SPARQL endpoint and get the result as TSV.
//
// TODO: The following assignment is so complicated (immediately invoked
// lambda with another lambda inside) because we currently have two separate
// classes `HttpClient` and `HttpsClient`. Maybe also have a common class that
// picks the right client according to the protocol.
//
// TODO: We ask for the result as TSV because that is a compact and
// easy-to-parse format. It might not be the best choice regarding robustness
// and portability though. In particular, we are not sure how deterministic
// the TSV output is with respect to the precise encoding of literals.
std::istringstream tsvResult = [&]() {
if (serviceUrl.protocol() == Url::Protocol::HTTP) {
HttpClient client(serviceUrl.host(), serviceUrl.port());
auto sendRequest = [&]<typename Client>() {
Client client{serviceUrl.host(), serviceUrl.port()};
return client.sendRequest(boost::beast::http::verb::post,
serviceUrl.host(), serviceUrl.target(),
serviceQuery, "application/sparql-query",
"text/tab-separated-values");
};
if (serviceUrl.protocol() == Url::Protocol::HTTP) {
return sendRequest.operator()<HttpClient>();
} else {
HttpsClient client(serviceUrl.host(), serviceUrl.port());
return client.sendRequest(boost::beast::http::verb::post,
serviceUrl.host(), serviceUrl.target(),
serviceQuery, "application/sparql-query",
"text/tab-separated-values");
return sendRequest.operator()<HttpsClient>();
}
}();
LOG(DEBUG) << "TSV result is:" << std::endl << tsvResult.str() << std::endl;

// Turn TSV into VALUES clause, which will then be turned into a `ResultTable`
// by `parsedQuery::Values` (namely, method `writeValues`, which also
// constructs a `localVocab` if required, which it usually is for SERVICE
// queries).
//
// TODO: Is the time needed for turning the `parsedQuery::Values` into a
// `ResultTable` accounted for?
//
// TODO: Eventually, we should construct a `ResultTable` directly, with a mix
// of IDs from this endpoint and IDs from the remote endpoint (the latter with
// a bitmask, so that we can easily distinguish them).
parsedQuery::SparqlValues sparqlValues;
std::string variables;
if (!std::getline(tsvResult, variables)) {
// The first line of the TSV result contains the variable names.
std::string tsvHeaderRow;
if (!std::getline(tsvResult, tsvHeaderRow)) {
throw std::runtime_error(absl::StrCat("Response from SPARQL endpoint ",
serviceUrl.host(), " is empty"));
}
LOG(INFO) << "First line of response body: " << variables << std::endl;
for (const std::string_view& variableName : absl::StrSplit(variables, "\t")) {
sparqlValues._variables.push_back(Variable{std::string{variableName}});
LOG(INFO) << "Header row of TSV result: " << tsvHeaderRow << std::endl;

// Check that the variables in the header row agree with those requested by
// the SERVICE query.
std::string expectedHeaderRow =
absl::StrJoin(parsedServiceClause_.visibleVariables_, "\t",
[](std::string* out, const Variable& variable) {
out->append(variable.name());
});
if (tsvHeaderRow != expectedHeaderRow) {
throw std::runtime_error(absl::StrCat(
"Header row of TSV result for SERVICE query is \"", tsvHeaderRow,
"\", but expected \"", expectedHeaderRow, "\""));
}
// TODO: This duplicates code in SparqlQleverVisitor.cpp . Maybe it would be
// more meaningful to postpone the interpretation of the values from a VALUES
// clause (via `parseTripleObject`) to parsedQuery::Values ?

// The rest of the code is adapted from `Values::computeResult`.
const Index& index = getIndex();
result->_sortedBy = resultSortedOn();
result->_idTable.setCols(getResultWidth());
result->_resultTypes.resize(parsedServiceClause_.visibleVariables_.size(),
ResultTable::ResultType::KB);
size_t resWidth = getResultWidth();
CALL_FIXED_SIZE(resWidth, &Service::writeTsvResult, this, &result->_idTable,
index, std::move(tsvResult), result->_localVocab);
}

// ____________________________________________________________________________
template <size_t I>
void Service::writeTsvResult(IdTable* res, const Index& index,
std::istringstream tsvResult,
std::shared_ptr<LocalVocab> localVocab) {
IdTableStatic<I> result = res->moveToStatic<I>();
size_t rowIdx = 0;
std::vector<size_t> numLocalVocabPerColumn(result.cols());
std::string line;
std::string lastLine;
while (lastLine = std::move(line), std::getline(tsvResult, line)) {
if (sparqlValues._values.size() == 0) {
LOG(INFO) << "Second line of response body: " << line << std::endl;
// Print first line.
if (rowIdx == 0) {
LOG(INFO) << "First non-header row of TSV result: " << line << std::endl;
}
std::vector<TripleComponent> valueTuple;
for (auto value : absl::StrSplit(line, "\t")) {
valueTuple.push_back(
TurtleStringParser<TokenizerCtre>::parseTripleObject(value));
std::vector<std::string> valueStrings = absl::StrSplit(line, "\t");
AD_CHECK(valueStrings.size() ==
parsedServiceClause_.visibleVariables_.size());
result.resize(result.size() + 1);
for (size_t colIdx = 0; colIdx < valueStrings.size(); colIdx++) {
const TripleComponent& tc =
TurtleStringParser<TokenizerCtre>::parseTripleObject(
valueStrings[colIdx]);
std::optional<Id> id = tc.toValueId(index.getVocab());
if (!id) {
AD_CHECK(tc.isString());
auto& newWord = tc.getString();
++numLocalVocabPerColumn[colIdx];
id = Id::makeFromLocalVocabIndex(
localVocab->getIndexAndAddIfNotContained(std::move(newWord)));
}
result(rowIdx, colIdx) = id.value();
}
sparqlValues._values.push_back(valueTuple);
}
if (sparqlValues._values.size() > 1) {
LOG(INFO) << "Last line of response body: " << lastLine << std::endl;
rowIdx++;
}
resultAsValues_ = std::make_unique<Values>(qec, std::move(sparqlValues));
}

// ____________________________________________________________________________
string Service::asStringImpl(size_t indent) const {
std::ostringstream os;
for (size_t i = 0; i < indent; ++i) {
os << " ";
if (result.size() > 1) {
LOG(INFO) << "Last non-header row of TSV result: " << lastLine << std::endl;
}
// TODO: This duplicates code in GraphPatternOperation.cpp .
os << "SERVICE " << parsedServiceClause_.serviceVarOrIri_ << " {\n"
<< parsedServiceClause_.servicePrologue_ << "\n"
<< parsedServiceClause_.serviceQueryBody_ << "\n}\n";
return std::move(os).str();
}

// ____________________________________________________________________________
string Service::getDescriptor() const {
std::ostringstream os;
os << "Service with IRI " << parsedServiceClause_.serviceVarOrIri_;
// << " and body " << parsedServiceClause_.serviceQueryBody_;
return std::move(os).str();
}

// ____________________________________________________________________________
size_t Service::getResultWidth() const {
return resultAsValues_->getResultWidth();
}

// ____________________________________________________________________________
vector<size_t> Service::resultSortedOn() const {
return resultAsValues_->resultSortedOn();
}

// ____________________________________________________________________________
VariableToColumnMap Service::computeVariableToColumnMap() const {
return resultAsValues_->computeVariableToColumnMap();
}

// ____________________________________________________________________________
float Service::getMultiplicity(size_t col) {
return resultAsValues_->getMultiplicity(col);
}

// ____________________________________________________________________________
size_t Service::getSizeEstimate() { return resultAsValues_->getSizeEstimate(); }

// ____________________________________________________________________________
size_t Service::getCostEstimate() { return resultAsValues_->getCostEstimate(); }

// ____________________________________________________________________________
void Service::computeResult(ResultTable* result) {
resultAsValues_->computeResult(result);
AD_CHECK(rowIdx == result.size());
LOG(INFO) << "Number of rows in result: " << result.size() << std::endl;
LOG(INFO) << "Number of entries in local vocabulary per column: "
<< absl::StrJoin(numLocalVocabPerColumn, ", ") << std::endl;
*res = result.moveToDynamic();
}
20 changes: 14 additions & 6 deletions src/engine/Service.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,11 @@
// parse that result.
class Service : public Operation {
private:
// For each column, its multiplicity. Note that this can be computed
// precisely.
// TODO: For now, filled with random values.
// For each column, its multiplicity.
//
// TODO: We can either compute them precisely or have no idea at all. To
// compute them precisely, we would have to issue the SERVICE query at or
// before the query planning. Should we do that?
std::vector<size_t> multiplicities_;

// The parsed SERVICE clause.
Expand All @@ -32,9 +34,9 @@ class Service : public Operation {
Service(QueryExecutionContext* qec, parsedQuery::Service parsedServiceClause);

// Methods inherited from base class `Operation`.
string getDescriptor() const override;
std::string getDescriptor() const override;
size_t getResultWidth() const override;
vector<size_t> resultSortedOn() const override;
std::vector<size_t> resultSortedOn() const override;
float getMultiplicity(size_t col) override;
size_t getSizeEstimate() override;
size_t getCostEstimate() override;
Expand All @@ -51,8 +53,14 @@ class Service : public Operation {

private:
// Used as cache key.
string asStringImpl(size_t indent = 0) const override;
std::string asStringImpl(size_t indent = 0) const override;

// Method inherited from base class `Operation`.
void computeResult(ResultTable* result) override;

// Write TSV result to `IdTable`, analogous to `Values::writeValues`.
template <size_t I>
void writeTsvResult(IdTable* res, const Index& index,
std::istringstream tsvResult,
std::shared_ptr<LocalVocab> localVocab);
};
2 changes: 1 addition & 1 deletion src/parser/GraphPatternOperation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ void GraphPatternOperation::toString(std::ostringstream& os,
os << "VALUES (" << arg._inlineValues.variablesToString() << ") "
<< arg._inlineValues.valuesToString();
} else if constexpr (std::is_same_v<T, Service>) {
os << "SERVICE " << arg.serviceVarOrIri_ << " {";
os << "SERVICE " << arg.varOrIri_.toSparql() << " {";
// TODO: In other places, the interface is `os << ...asString(indent)`.
arg.graphPattern_.toString(os, indentation);
os << "}";
Expand Down
Loading

0 comments on commit 4a348d4

Please sign in to comment.