-
Notifications
You must be signed in to change notification settings - Fork 64
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Basic support of SERVICE clause (#793)
Support SERVICE queries to a remote endpoint specified via an IRI. The query is sent using a POST request with headers `Content-Type: application/sparql-query` and `Accept: text/tab-separated-values`. The result TSV is parsed and words that are not contained in the index vocabulary are added to the local vocabulary of the result of the SERVICE operation. Shortcomings of this implementation are: there is no timeout, the TSV is first read (and stored) completely and only then processed further, TSV is simple to parse but may be ambiguous with respect to how IRIs or literals are encoded, the `SILENT` keyword (telling the engine to ignore the SERVICE clause if the query fails) is currently not supported, variable endpoints are currently not supported. Also, the query is currently sent after query planning, so the query planner has no information about the size, cost, or multiplicities of the result.
- Loading branch information
1 parent
0e4a076
commit 857099c
Showing
20 changed files
with
675 additions
and
35 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,192 @@ | ||
// Copyright 2022 - 2023, University of Freiburg, | ||
// Chair of Algorithms and Data Structures. | ||
// Author: Hannah Bast (bast@cs.uni-freiburg.de) | ||
|
||
#include "engine/Service.h" | ||
|
||
#include "absl/strings/str_cat.h" | ||
#include "absl/strings/str_split.h" | ||
#include "engine/CallFixedSize.h" | ||
#include "engine/Values.h" | ||
#include "engine/VariableToColumnMap.h" | ||
#include "parser/TokenizerCtre.h" | ||
#include "parser/TurtleParser.h" | ||
#include "util/Exception.h" | ||
#include "util/HashSet.h" | ||
#include "util/http/HttpClient.h" | ||
#include "util/http/HttpUtils.h" | ||
|
||
// ____________________________________________________________________________ | ||
// Construct the operation from an already parsed SERVICE clause. Both the
// clause and the function used to fetch the TSV result are taken by value and
// moved into the corresponding members.
Service::Service(QueryExecutionContext* qec,
                 parsedQuery::Service parsedServiceClause,
                 GetTsvFunction getTsvFunction)
    : Operation(qec),
      parsedServiceClause_(std::move(parsedServiceClause)),
      getTsvFunction_(std::move(getTsvFunction)) {}
|
||
// ____________________________________________________________________________ | ||
// Render this SERVICE clause as a string: `indent` leading spaces, followed by
// the service IRI, the prologue, and the graph pattern.
std::string Service::asStringImpl(size_t indent) const {
  std::ostringstream stream;
  // One space per indentation level.
  stream << std::string(indent, ' ');
  // TODO: This duplicates code in GraphPatternOperation.cpp .
  stream << "SERVICE " << parsedServiceClause_.serviceIri_.toSparql() << " {\n";
  stream << parsedServiceClause_.prologue_ << "\n";
  stream << parsedServiceClause_.graphPatternAsString_ << "\n}\n";
  return std::move(stream).str();
}
|
||
// ____________________________________________________________________________ | ||
std::string Service::getDescriptor() const { | ||
return absl::StrCat("Service with IRI ", | ||
parsedServiceClause_.serviceIri_.toSparql()); | ||
} | ||
|
||
// ____________________________________________________________________________ | ||
size_t Service::getResultWidth() const { | ||
return parsedServiceClause_.visibleVariables_.size(); | ||
} | ||
|
||
// ____________________________________________________________________________ | ||
// Map each visible variable to its position in `visibleVariables_`, i.e. the
// i-th visible variable is assigned column index i.
VariableToColumnMap Service::computeVariableToColumnMap() const {
  VariableToColumnMap variableToColumn;
  size_t columnIndex = 0;
  for (const auto& variable : parsedServiceClause_.visibleVariables_) {
    variableToColumn[variable] = columnIndex;
    ++columnIndex;
  }
  return variableToColumn;
}
|
||
// ____________________________________________________________________________ | ||
// Dummy multiplicity, independent of the column.
float Service::getMultiplicity([[maybe_unused]] size_t col) {
  // TODO: The multiplicities are not known at query planning time, so every
  // column is reported with multiplicity `1` for now.
  return 1.0f;
}
|
||
// ____________________________________________________________________________ | ||
// Dummy size estimate.
size_t Service::getSizeEstimate() {
  // TODO: The result size is not known at query planning time, so a fixed
  // value of `100'000` is reported for now.
  constexpr size_t dummySizeEstimate = 100'000;
  return dummySizeEstimate;
}
|
||
// ____________________________________________________________________________ | ||
size_t Service::getCostEstimate() { | ||
// TODO: For now, we don't have any information about the cost at query | ||
// planning time, so we just return ten times the estimated size. | ||
return 10 * getSizeEstimate(); | ||
} | ||
|
||
// ____________________________________________________________________________ | ||
// Compute the result of the SERVICE clause: build a SELECT query from the
// parsed clause, obtain the TSV result for it via `getTsvFunction_`, validate
// the TSV header row against the visible variables, and write the remaining
// rows into `result` via `writeTsvResult`.
//
// Throws `std::runtime_error` if the response is empty or if its header row
// does not match the visible variables.
void Service::computeResult(ResultTable* result) {
  const auto& visibleVariables = parsedServiceClause_.visibleVariables_;
  const size_t resWidth = getResultWidth();

  // The service IRI is expected in the form `<...>`; the URL of the SPARQL
  // endpoint is what lies between the angle brackets.
  std::string_view serviceIriString = parsedServiceClause_.serviceIri_.iri();
  AD_CONTRACT_CHECK(serviceIriString.starts_with("<") &&
                    serviceIriString.ends_with(">"));
  serviceIriString.remove_prefix(1);
  serviceIriString.remove_suffix(1);
  ad_utility::httpUtils::Url serviceUrl{serviceIriString};

  // Assemble the query for the endpoint: the prologue, followed by a SELECT
  // over all visible variables with the graph pattern as WHERE clause.
  const std::string selectedVariables =
      absl::StrJoin(visibleVariables, " ", Variable::AbslFormatter);
  const std::string serviceQuery = absl::StrCat(
      parsedServiceClause_.prologue_, "\nSELECT ", selectedVariables,
      " WHERE ", parsedServiceClause_.graphPatternAsString_);
  LOG(INFO) << "Sending SERVICE query to remote endpoint "
            << "(protocol: " << serviceUrl.protocolAsString()
            << ", host: " << serviceUrl.host()
            << ", port: " << serviceUrl.port()
            << ", target: " << serviceUrl.target() << ")" << std::endl
            << serviceQuery << std::endl;

  // Obtain the result as TSV; the request is a POST with the query as body.
  //
  // TODO: We should support a timeout here.
  //
  // TODO: TSV is requested because it is compact and easy to parse. It might
  // not be the best choice regarding robustness and portability though. In
  // particular, we are not sure how deterministic the TSV output is with
  // respect to the precise encoding of literals.
  std::istringstream tsvResult =
      getTsvFunction_(serviceUrl, boost::beast::http::verb::post, serviceQuery,
                      "application/sparql-query", "text/tab-separated-values");

  // The variable names are in the first line of the TSV result. If not even
  // that line can be read, the response is considered empty.
  std::string tsvHeaderRow;
  const bool gotHeaderRow =
      static_cast<bool>(std::getline(tsvResult, tsvHeaderRow));
  if (!gotHeaderRow) {
    throw std::runtime_error(absl::StrCat("Response from SPARQL endpoint ",
                                          serviceUrl.host(), " is empty"));
  }
  LOG(INFO) << "Header row of TSV result: " << tsvHeaderRow << std::endl;

  // The header row must be exactly the tab-separated list of the variables
  // requested by the SERVICE query.
  const std::string expectedHeaderRow =
      absl::StrJoin(visibleVariables, "\t", Variable::AbslFormatter);
  if (tsvHeaderRow != expectedHeaderRow) {
    throw std::runtime_error(absl::StrCat(
        "Header row of TSV result for SERVICE query is \"", tsvHeaderRow,
        "\", but expected \"", expectedHeaderRow, "\""));
  }

  // Set basic properties of the result table (the `_resultTypes` don't matter,
  // as long as they have the right size, see `ResultTypes.h`).
  result->_sortedBy = resultSortedOn();
  result->_idTable.setNumColumns(resWidth);
  result->_resultTypes.resize(visibleVariables.size(),
                              ResultTable::ResultType::KB);

  // Dispatch to the `writeTsvResult` instantiation for the result width to
  // fill the result table.
  CALL_FIXED_SIZE(resWidth, &Service::writeTsvResult, this,
                  std::move(tsvResult), result);
}
|
||
// ____________________________________________________________________________ | ||
template <size_t I> | ||
void Service::writeTsvResult(std::istringstream tsvResult, | ||
ResultTable* result) { | ||
IdTableStatic<I> idTable = std::move(result->_idTable).toStatic<I>(); | ||
size_t rowIdx = 0; | ||
std::vector<size_t> numLocalVocabPerColumn(idTable.numColumns()); | ||
std::string line; | ||
std::string lastLine; | ||
const size_t numVariables = parsedServiceClause_.visibleVariables_.size(); | ||
while (lastLine = std::move(line), std::getline(tsvResult, line)) { | ||
// Print first line. | ||
if (rowIdx == 0) { | ||
LOG(INFO) << "First non-header row of TSV result: " << line << std::endl; | ||
} | ||
std::vector<std::string_view> valueStrings = absl::StrSplit(line, "\t"); | ||
if (valueStrings.size() != numVariables) { | ||
throw std::runtime_error(absl::StrCat( | ||
"Number of columns in ", rowIdx + 1, " of TSV result is ", | ||
valueStrings.size(), "but number of variables in header row is ", | ||
numVariables)); | ||
} | ||
idTable.emplace_back(); | ||
for (size_t colIdx = 0; colIdx < valueStrings.size(); colIdx++) { | ||
TripleComponent tc = TurtleStringParser<TokenizerCtre>::parseTripleObject( | ||
valueStrings[colIdx]); | ||
Id id = std::move(tc).toValueId(getIndex().getVocab(), | ||
result->localVocabNonConst()); | ||
idTable(rowIdx, colIdx) = id; | ||
if (id.getDatatype() == Datatype::LocalVocabIndex) { | ||
++numLocalVocabPerColumn[colIdx]; | ||
} | ||
} | ||
rowIdx++; | ||
} | ||
if (idTable.size() > 1) { | ||
LOG(INFO) << "Last non-header row of TSV result: " << lastLine << std::endl; | ||
} | ||
AD_CORRECTNESS_CHECK(rowIdx == idTable.size()); | ||
LOG(INFO) << "Number of rows in result: " << idTable.size() << std::endl; | ||
LOG(INFO) << "Number of entries in local vocabulary per column: " | ||
<< absl::StrJoin(numLocalVocabPerColumn, ", ") << std::endl; | ||
result->_idTable = std::move(idTable).toDynamic(); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
// Copyright 2022 - 2023, University of Freiburg, | ||
// Chair of Algorithms and Data Structures. | ||
// Author: Hannah Bast (bast@cs.uni-freiburg.de) | ||
|
||
#pragma once | ||
|
||
#include <functional> | ||
|
||
#include "engine/Operation.h" | ||
#include "engine/Values.h" | ||
#include "parser/ParsedQuery.h" | ||
#include "util/http/HttpClient.h" | ||
|
||
// The SERVICE operation. Sends a query to the remote endpoint specified by the | ||
// service IRI, gets the result as TSV, parses it, and writes it into a result | ||
// table. | ||
// | ||
// TODO: The current implementation works, but is preliminary in several | ||
// respects: | ||
// | ||
// 1. Reading the result as TSV has potential problems (see comment in | ||
// `computeResult` for details). | ||
// | ||
// 2. There should be a timeout. | ||
// | ||
// 3. A variable in place of the IRI is not yet supported (see comment in | ||
// `computeResult` for details). | ||
// | ||
// 4. The SERVICE is currently executed *after* the query planning. The | ||
// estimates of the result size, cost, and multiplicities are therefore dummy | ||
// values. | ||
// | ||
class Service : public Operation { | ||
public: | ||
// The type of the function used to obtain the results, see below. | ||
using GetTsvFunction = std::function<std::istringstream( | ||
ad_utility::httpUtils::Url, const boost::beast::http::verb&, | ||
std::string_view, std::string_view, std::string_view)>; | ||
|
||
private: | ||
// The parsed SERVICE clause. | ||
parsedQuery::Service parsedServiceClause_; | ||
|
||
// The function used to obtain the result from the remote endpoint. | ||
GetTsvFunction getTsvFunction_; | ||
|
||
public: | ||
// Construct from parsed Service clause. | ||
// | ||
// NOTE: The third argument is the function used to obtain the result from the | ||
// remote endpoint. The default is to use `httpUtils::sendHttpOrHttpsRequest`, | ||
// but in our tests (`ServiceTest`) we use a mock function that does not | ||
// require a running `HttpServer`. | ||
Service(QueryExecutionContext* qec, parsedQuery::Service parsedServiceClause, | ||
GetTsvFunction getTsvFunction = sendHttpOrHttpsRequest); | ||
|
||
// Methods inherited from base class `Operation`. | ||
std::string getDescriptor() const override; | ||
size_t getResultWidth() const override; | ||
std::vector<size_t> resultSortedOn() const override { return {}; } | ||
float getMultiplicity(size_t col) override; | ||
size_t getSizeEstimate() override; | ||
size_t getCostEstimate() override; | ||
VariableToColumnMap computeVariableToColumnMap() const override; | ||
|
||
// Not relevant for SERVICE. | ||
void setTextLimit([[maybe_unused]] size_t limit) override {} | ||
|
||
// We know nothing about the result at query planning time. | ||
bool knownEmptyResult() override { return false; } | ||
|
||
// A SERVICE clause has no children. | ||
vector<QueryExecutionTree*> getChildren() override { return {}; } | ||
|
||
private: | ||
// The string returned by this function is used as cache key. | ||
std::string asStringImpl(size_t indent = 0) const override; | ||
|
||
// Compute the result using `getTsvFunction_`. | ||
void computeResult(ResultTable* result) override; | ||
|
||
// Write the given TSV result to the given result object. The `I` is the width | ||
// of the result table. | ||
// | ||
// NOTE: This is similar to `Values::writeValues`, except that we have to | ||
// parse TSV here and not a VALUES clause. Note that the only reason that | ||
// `tsvResult` is not `const` here is because the method iterates over the | ||
// `std::istringstream` and thus changes it. | ||
template <size_t I> | ||
void writeTsvResult(std::istringstream tsvResult, ResultTable* result); | ||
}; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.