Skip to content

Commit

Permalink
Parse complete SPARQL queries using ANTLR (#790)
Browse files Browse the repository at this point in the history
The ANTLR-based parser now parses the complete query string into a `ParsedQuery`. The only thing that is still done via the old parser is the parsing of FILTERs. Also add many checks for the invariants of a valid SPARQL query to the `ParsedQuery` class and improve the unit tests for the parsing.
  • Loading branch information
Qup42 authored and Hannah Bast committed Sep 18, 2022
1 parent 1d468ce commit c30c101
Show file tree
Hide file tree
Showing 20 changed files with 903 additions and 956 deletions.
4 changes: 2 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,8 @@ if ("${CMAKE_GENERATOR}" STREQUAL "Ninja")
endif ()

if (("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") AND
(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "12") AND
(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "12.1"))
(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "12") AND
(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "12.2"))
message(STATUS "Adding -Wno-restrict for g++12.0 because of false positives")
add_compile_options(-Wno-restrict)
else()
Expand Down
6 changes: 3 additions & 3 deletions e2e/scientists_queries.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1118,10 +1118,10 @@ queries:
?t ql:contains-word "algo*"
}
checks:
- num_cols: 2
- num_cols: 3
- num_rows: 11
- selected: ["?x", "?t"]
- contains_row: ["<Grete_Hermann>","Hermann's algorithm for primary decomposition is still in use now."]
- selected: [ "?x", "?ql_textscore_t", "?t" ]
- contains_row: [ "<Grete_Hermann>",null,"Hermann's algorithm for primary decomposition is still in use now." ]


- query : select_asterisk_regex-lastname-stein
Expand Down
20 changes: 10 additions & 10 deletions src/engine/Server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -437,8 +437,8 @@ Awaitable<json> Server::composeResponseQleverJson(
query.hasSelectClause()
? qet.writeResultAsQLeverJson(query.selectClause(), limit, offset,
std::move(resultTable))
: qet.writeRdfGraphJson(query.constructClause(), limit, offset,
std::move(resultTable));
: qet.writeRdfGraphJson(query.constructClause().triples_, limit,
offset, std::move(resultTable));
requestTimer.stop();
}
j["resultsize"] = query.hasSelectClause() ? resultSize : j["res"].size();
Expand Down Expand Up @@ -510,11 +510,11 @@ Server::composeResponseSepValues(const ParsedQuery& query,
auto compute = [&] {
size_t limit = query._limitOffset._limit;
size_t offset = query._limitOffset._offset;
return query.hasSelectClause()
? qet.generateResults<format>(query.selectClause(), limit,
offset)
: qet.writeRdfGraphSeparatedValues<format>(
query.constructClause(), limit, offset, qet.getResult());
return query.hasSelectClause() ? qet.generateResults<format>(
query.selectClause(), limit, offset)
: qet.writeRdfGraphSeparatedValues<format>(
query.constructClause().triples_,
limit, offset, qet.getResult());
};
return computeInNewThread(compute);
}
Expand All @@ -530,8 +530,8 @@ ad_utility::streams::stream_generator Server::composeTurtleResponse(
}
size_t limit = query._limitOffset._limit;
size_t offset = query._limitOffset._offset;
return qet.writeRdfGraphTurtle(query.constructClause(), limit, offset,
qet.getResult());
return qet.writeRdfGraphTurtle(query.constructClause().triples_, limit,
offset, qet.getResult());
}

// _____________________________________________________________________________
Expand Down Expand Up @@ -646,7 +646,7 @@ boost::asio::awaitable<void> Server::processQuery(
<< (pinResult ? " [pin result]" : "")
<< (pinSubtrees ? " [pin subresults]" : "") << "\n"
<< query << std::endl;
ParsedQuery pq = SparqlParser(query).parse();
ParsedQuery pq = SparqlParser::parseQuery(query);

// The following code block determines the media type to be used for the
// result. The media type is either determined by the "Accept:" header of
Expand Down
6 changes: 5 additions & 1 deletion src/parser/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@ add_library(parser
SparqlParserHelpers.h SparqlParserHelpers.cpp
TripleComponent.h
GraphPatternOperation.cpp
PropertyPath.h PropertyPath.cpp Alias.h data/SolutionModifiers.h data/LimitOffsetClause.h data/SparqlFilter.h data/SparqlFilter.cpp data/OrderKey.h data/GroupKey.h ParseException.cpp SelectClause.cpp SelectClause.h GraphPatternOperation.cpp GraphPatternOperation.h GraphPattern.cpp GraphPattern.h)
PropertyPath.h PropertyPath.cpp Alias.h data/SolutionModifiers.h
data/LimitOffsetClause.h data/SparqlFilter.h data/SparqlFilter.cpp
data/OrderKey.h data/GroupKey.h ParseException.cpp SelectClause.cpp
SelectClause.h GraphPatternOperation.cpp GraphPatternOperation.h
GraphPattern.cpp GraphPattern.h ConstructClause.h)
target_link_libraries(parser sparqlParser parserData sparqlExpressions rdfEscaping re2 absl::flat_hash_map util)

30 changes: 30 additions & 0 deletions src/parser/ConstructClause.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
// Copyright 2022, University of Freiburg,
// Chair of Algorithms and Data Structures.
// Author: Julian Mundhahs (mundhahj@informatik.uni-freiburg.de)

#pragma once

#include "parser/SelectClause.h"
#include "parser/data/Types.h"

namespace parsedQuery {
// The CONSTRUCT clause of a parsed query. It stores the triples of the clause
// in `triples_` and derives from `ClauseBase`.
struct ConstructClause : ClauseBase {
  ad_utility::sparql_types::Triples triples_;

  ConstructClause() = default;

  // Build the clause from `triples`; the argument is moved into `triples_`.
  explicit ConstructClause(ad_utility::sparql_types::Triples triples)
      : triples_(std::move(triples)) {}

  // Lazily enumerate every `Variable` that occurs in any position of any
  // triple of this clause. No deduplication is performed: a variable that
  // occurs several times is yielded once per occurrence.
  cppcoro::generator<const Variable> containedVariables() const {
    for (const auto& triple : triples_) {
      for (const auto& component : triple) {
        // `std::get_if` returns `nullptr` for components that do not hold a
        // `Variable`; those are skipped.
        const auto* asVariable = std::get_if<Variable>(&component);
        if (asVariable == nullptr) {
          continue;
        }
        co_yield *asVariable;
      }
    }
  }
};
} // namespace parsedQuery
125 changes: 84 additions & 41 deletions src/parser/ParsedQuery.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,16 +22,6 @@ using std::vector;
string ParsedQuery::asString() const {
std::ostringstream os;

// PREFIX
os << "PREFIX: {";
for (size_t i = 0; i < _prefixes.size(); ++i) {
os << "\n\t" << _prefixes[i].asString();
if (i + 1 < _prefixes.size()) {
os << ',';
}
}
os << "\n}";

bool usesSelect = hasSelectClause();
bool usesAsterisk = usesSelect && selectClause().isAsterisk();

Expand All @@ -56,7 +46,7 @@ string ParsedQuery::asString() const {
os << "{";
}
} else if (hasConstructClause()) {
const auto& constructClause = this->constructClause();
const auto& constructClause = this->constructClause().triples_;
os << "\n CONSTRUCT {\n\t";
for (const auto& triple : constructClause) {
os << triple[0].toSparql();
Expand Down Expand Up @@ -130,24 +120,35 @@ Variable ParsedQuery::addInternalBind(

// ________________________________________________________________________
void ParsedQuery::addSolutionModifiers(SolutionModifiers modifiers) {
// Process groupClause
// TODO<qup42, joka921> Check that all variables that are part of an
// expression that is grouped on are visible in the Query Body.
auto processVariable = [this](const Variable& groupKey) {
// TODO: implement for `ConstructClause`
if (hasSelectClause()) {
if (!ad_utility::contains(selectClause().getVisibleVariables(),
groupKey)) {
throw ParseException(
"Variable " + groupKey.name() +
" was used in an GROUP BY but is not visible in the query body.");
}
auto checkVariableIsVisible = [this](const Variable& var,
const std::string& locationDescription) {
if (!ad_utility::contains(getVisibleVariables(), var)) {
throw ParseException("Variable " + var.name() + " was used in " +
locationDescription +
", but is not visible in the Query Body.");
}
};
auto checkUsedVariablesAreVisible =
[&checkVariableIsVisible](
const sparqlExpression::SparqlExpressionPimpl& expression,
const std::string& locationDescription) {
for (const auto* var : expression.containedVariables()) {
checkVariableIsVisible(*var, locationDescription + " in Expression " +
expression.getDescriptor());
}
};

// Process groupClause
auto processVariable = [this,
&checkVariableIsVisible](const Variable& groupKey) {
checkVariableIsVisible(groupKey, "GROUP BY");

_groupByVariables.emplace_back(groupKey.name());
};
auto processExpression =
[this](sparqlExpression::SparqlExpressionPimpl groupKey) {
[this, &checkUsedVariablesAreVisible](
sparqlExpression::SparqlExpressionPimpl groupKey) {
checkUsedVariablesAreVisible(groupKey, "Group Key");
auto helperTarget = addInternalBind(std::move(groupKey));
_groupByVariables.emplace_back(helperTarget.name());
};
Expand All @@ -167,21 +168,27 @@ void ParsedQuery::addSolutionModifiers(SolutionModifiers modifiers) {
}

// Process havingClause
// TODO<joka921, qup42> as soon as FILTER and HAVING support proper
// expressions, also add similar sanity checks for the HAVING clause here.
_havingClauses = std::move(modifiers.havingClauses_);

// Process orderClause
// TODO<qup42, joka921> Check that all variables that are part of an
// expression that is ordered on are visible in the Query Body.
auto processVariableOrderKey = [this](VariableOrderKey orderKey) {
auto processVariableOrderKey = [this, &checkVariableIsVisible](
VariableOrderKey orderKey) {
// Check whether grouping is done. The variable being ordered by
// must then be either grouped or the result of an alias in the select.
const vector<Variable>& groupByVariables = _groupByVariables;
if (!groupByVariables.empty() &&
!ad_utility::contains(groupByVariables, orderKey.variable_) &&
!ad_utility::contains_if(selectClause().getAliases(),
[&orderKey](const Alias& alias) {
return alias._target == orderKey.variable_;
})) {
if (groupByVariables.empty()) {
checkVariableIsVisible(orderKey.variable_, "ORDERY BY");
} else if (!ad_utility::contains(groupByVariables, orderKey.variable_) &&
// `ConstructClause` has no Aliases. So the variable can never be
// the result of an Alias.
(hasConstructClause() ||
!ad_utility::contains_if(selectClause().getAliases(),
[&orderKey](const Alias& alias) {
return alias._target ==
orderKey.variable_;
}))) {
throw ParseException(
"Variable " + orderKey.variable_.name() +
" was used in an ORDER BY "
Expand All @@ -195,8 +202,10 @@ void ParsedQuery::addSolutionModifiers(SolutionModifiers modifiers) {
// QLever currently only supports ordering by variables. To allow
// all `orderConditions`, the corresponding expression is bound to a new
// internal variable. Ordering is then done by this variable.
auto processExpressionOrderKey = [this](ExpressionOrderKey orderKey) {
if (!_groupByVariables.empty())
auto processExpressionOrderKey = [this, &checkUsedVariablesAreVisible](
ExpressionOrderKey orderKey) {
checkUsedVariablesAreVisible(orderKey.expression_, "Order Key");
if (!_groupByVariables.empty()) {
// TODO<qup42> Implement this by adding a hidden alias in the
// SELECT clause.
throw ParseException(
Expand All @@ -206,6 +215,7 @@ void ParsedQuery::addSolutionModifiers(SolutionModifiers modifiers) {
"\"). Please assign this expression to a "
"new variable in the SELECT clause and then order by this "
"variable.");
}
auto additionalVariable = addInternalBind(std::move(orderKey.expression_));
_orderBy.emplace_back(additionalVariable, orderKey.isDescending_);
};
Expand All @@ -219,6 +229,8 @@ void ParsedQuery::addSolutionModifiers(SolutionModifiers modifiers) {
// Process limitOffsetClause
_limitOffset = modifiers.limitOffset_;

// Check that the query is valid

auto checkAliasOutNamesHaveNoOverlapWith =
[this](const auto& container, const std::string& message) {
for (const auto& alias : selectClause().getAliases()) {
Expand All @@ -228,8 +240,6 @@ void ParsedQuery::addSolutionModifiers(SolutionModifiers modifiers) {
}
};

// Check that the query is valid

if (hasSelectClause()) {
if (!_groupByVariables.empty()) {
ad_utility::HashSet<string> groupVariables{};
Expand Down Expand Up @@ -287,15 +297,27 @@ void ParsedQuery::addSolutionModifiers(SolutionModifiers modifiers) {
throw ParseException("The variable name " + a._target.name() +
" used in an alias was already selected on.");
}
// TODO<qup42, joka921> Check that all variables used in the expression of
// Aliases are visible in the QueryBody.

checkUsedVariablesAreVisible(a._expression, "Alias");
}
} else if (hasConstructClause()) {
if (_groupByVariables.empty()) {
return;
}

for (const auto& variable : constructClause().containedVariables()) {
if (!ad_utility::contains(_groupByVariables, variable)) {
throw ParseException("Variable " + variable.name() +
" is used but not "
"aggregated despite the query not being "
"grouped by " +
variable.name() + ".");
}
}
}
}

void ParsedQuery::merge(const ParsedQuery& p) {
_prefixes.insert(_prefixes.begin(), p._prefixes.begin(), p._prefixes.end());

auto& children = _rootGraphPattern._graphPatterns;
auto& otherChildren = p._rootGraphPattern._graphPatterns;
children.insert(children.end(), otherChildren.begin(), otherChildren.end());
Expand All @@ -305,6 +327,27 @@ void ParsedQuery::merge(const ParsedQuery& p) {
_rootGraphPattern.recomputeIds(&_numGraphPatterns);
}

// _____________________________________________________________________________
const std::vector<Variable>& ParsedQuery::getVisibleVariables() const {
return std::visit(&parsedQuery::ClauseBase::getVisibleVariables, _clause);
}

// _____________________________________________________________________________
void ParsedQuery::registerVariablesVisibleInQueryBody(
const vector<Variable>& variables) {
for (const auto& var : variables) {
registerVariableVisibleInQueryBody(var);
}
}

// _____________________________________________________________________________
// Add `variable` as a visible variable to whichever clause `_clause` currently
// holds, by calling that clause's `addVisibleVariable`.
void ParsedQuery::registerVariableVisibleInQueryBody(const Variable& variable) {
  std::visit(
      [&variable](auto& activeClause) {
        activeClause.addVisibleVariable(variable);
      },
      _clause);
}

// _____________________________________________________________________________
void ParsedQuery::GraphPattern::toString(std::ostringstream& os,
int indentation) const {
Expand Down
Loading

0 comments on commit c30c101

Please sign in to comment.