Skip to content

Commit

Permalink
Blocking and Staging logic
Browse files Browse the repository at this point in the history
This pull request implements the blocking and staging logic; the two are entirely independent of each other.
- Blocking is implemented through cypher queries, see process.sh lines 56-57.
- Staging is everything else. Here we only care about the embargo logic of staging: when embargoing is enabled, what gets embargoed depends on whether we are in the prod or dev staging mode (see the STAGING variable in the Dockerfile). The logic corresponds to what was discussed [here](VirtualFlyBrain/neo4j2owl#52). It is implemented through two different sets of SPARQL queries (one for prod, one for dev), which apply embargo rules of different strictness. The prod queries are unchanged; in dev the conditions for embargoing are narrower (a dataset is embargoed only if it is neither in production nor staged), so less data gets embargoed.

see #8
see VirtualFlyBrain/neo4j2owl#52
  • Loading branch information
matentzn committed Oct 15, 2020
1 parent 51876f5 commit f303e4a
Show file tree
Hide file tree
Showing 12 changed files with 157 additions and 4 deletions.
4 changes: 4 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ ENV VALIDATESHEX=true
ENV VALIDATESHACL=true
ENV REMOVE_EMBARGOED_DATA=true

# Staging mode. Currently only "prod" and "dev" are supported. If set to dev,
# datasets will only be embargoed if they are not staged.
ENV STAGING=prod

# ENV CHUNK_SIZE=1000
# ENV PING_SLEEP=120s
# ENV BUILD_OUTPUT=${WORKSPACE}/build.out
Expand Down
12 changes: 8 additions & 4 deletions process.sh
Original file line number Diff line number Diff line change
Expand Up @@ -53,24 +53,28 @@ date
# --- KB export stage: prepare the Neo4j KB via Cypher, then export it ---
echo '** Exporting KB to OWL **'
# Remove the label_rdfs property from every node (response appended to a debug file).
curl -i -X POST ${KBserver}/db/data/transaction/commit -u ${KBuser}:${KBpassword} -H 'Content-Type: application/json' -d '{"statements": [{"statement": "MATCH (c) REMOVE c.label_rdfs RETURN c"}]}' >> ${VFB_DEBUG_DIR}/neo4j_remove_rdfs_label.txt
# For every node that has a label property, set label_rdfs to a list built from it ([] + p.label).
curl -i -X POST ${KBserver}/db/data/transaction/commit -u ${KBuser}:${KBpassword} -H 'Content-Type: application/json' -d '{"statements": [{"statement": "MATCH (p) WHERE EXISTS(p.label) SET p.label_rdfs=[] + p.label"}]}' >> ${VFB_DEBUG_DIR}/neo4j_change_label_to_rdfs.txt
# Blocking: delete every Entity node that has a block property, together with its relationships (DETACH DELETE).
# NOTE(review): this call and the next append their responses to neo4j_change_label_to_rdfs.txt,
# the same debug file as the label step above - confirm that sharing the file is intended.
curl -i -X POST ${KBserver}/db/data/transaction/commit -u ${KBuser}:${KBpassword} -H 'Content-Type: application/json' -d '{"statements": [{"statement": "MATCH (n:Entity) WHERE exists(n.block) DETACH DELETE n"}]}' >> ${VFB_DEBUG_DIR}/neo4j_change_label_to_rdfs.txt
# Blocking: delete every relationship that has a block property. Only the existence of the
# property is tested, not its value.
curl -i -X POST ${KBserver}/db/data/transaction/commit -u ${KBuser}:${KBpassword} -H 'Content-Type: application/json' -d '{"statements": [{"statement": "MATCH ()-[r]-() WHERE exists(r.block) DELETE r"}]}' >> ${VFB_DEBUG_DIR}/neo4j_change_label_to_rdfs.txt

# Run the KB export script against the same server and credentials.
# Presumably it writes the exported KB to ${KB_FILE}, which later steps read with robot - TODO confirm.
python3 ${SCRIPTS}neo4j_kb_export.py ${KBserver} ${KBuser} ${KBpassword} ${KB_FILE}

echo "VFBTIME:"
date


if [ "$REMOVE_EMBARGOED_DATA" = true ]; then
echo '** Deleting embargoed data.. **'
robot query -f csv -i ${KB_FILE} --query ${SPARQL_DIR}/embargoed_datasets.sparql ${VFB_FINAL}/embargoed_datasets.txt
robot query -f csv -i ${KB_FILE} --query ${SPARQL_DIR}/embargoed_datasets_${STAGING}.sparql ${VFB_FINAL}/embargoed_datasets.txt

echo 'First 10 embargoed datasets: '
head -10 ${VFB_FINAL}/embargoed_datasets.txt

echo 'Embargoed datasets: select_embargoed_channels'
robot query -f csv -i ${KB_FILE} --query ${SPARQL_DIR}/select_embargoed_channels.sparql ${VFB_DOWNLOAD_DIR}/embargoed_channels.txt
robot query -f csv -i ${KB_FILE} --query ${SPARQL_DIR}/select_embargoed_channels_${STAGING}.sparql ${VFB_DOWNLOAD_DIR}/embargoed_channels.txt
echo 'Embargoed datasets: select_embargoed_images'
robot query -f csv -i ${KB_FILE} --query ${SPARQL_DIR}/select_embargoed_images.sparql ${VFB_DOWNLOAD_DIR}/embargoed_images.txt
robot query -f csv -i ${KB_FILE} --query ${SPARQL_DIR}/select_embargoed_images_${STAGING}.sparql ${VFB_DOWNLOAD_DIR}/embargoed_images.txt
echo 'Embargoed datasets: select_embargoed_datasets'
robot query -f csv -i ${KB_FILE} --query ${SPARQL_DIR}/select_embargoed_datasets.sparql ${VFB_DOWNLOAD_DIR}/embargoed_datasets.txt
robot query -f csv -i ${KB_FILE} --query ${SPARQL_DIR}/select_embargoed_datasets_${STAGING}.sparql ${VFB_DOWNLOAD_DIR}/embargoed_datasets.txt

echo 'Embargoed data: Removing everything'
cat ${VFB_DOWNLOAD_DIR}/embargoed_channels.txt ${VFB_DOWNLOAD_DIR}/embargoed_images.txt ${VFB_DOWNLOAD_DIR}/embargoed_datasets.txt | sort | uniq > ${VFB_FINAL}/remove_embargoed.txt
Expand Down
25 changes: 25 additions & 0 deletions sparql/embargoed_datasets_dev.sparql
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX n2o: <http://n2o.neo/property/>
PREFIX n2oc: <http://n2o.neo/custom/>
PREFIX dct: <http://purl.org/dc/terms/>

# Dev-stage embargo list: returns every node labelled "DataSet" that is
# neither flagged for production nor staged.
#
# Embargo rule (pseudo-code):
#   IF (staging = false OR staging unbound) AND (production = false OR production unbound)
#   -----> EMBARGO

SELECT DISTINCT ?dataset

WHERE {

  # Every node with a node label; narrowed to datasets by the ?nodelabel FILTER below.
  ?dataset n2o:nodeLabel ?nodelabel .

  OPTIONAL {
    ?dataset n2oc:production ?production .
    # n2oc:production is a bit brittle because IRI might be changed (risk!)
  }

  OPTIONAL {
    ?dataset n2oc:staging ?staged .
  }

  # Comparing an unbound variable raises an error, but SPARQL's || still
  # evaluates to true when the other operand (!bound(...)) is true, so a
  # missing flag is treated the same as an explicit false.
  FILTER( (?production=false || !bound(?production)) && (?staged=false || !bound(?staged)) ) .
  FILTER(?nodelabel="DataSet")
}
File renamed without changes.
15 changes: 15 additions & 0 deletions sparql/select_blocked_entities.sparql
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX n2o: <http://n2o.neo/property/>
PREFIX n2oc: <http://n2o.neo/custom/>
PREFIX dct: <http://purl.org/dc/terms/>

# Selects every triple (?s ?p ?o) whose subject is an IRI that carries
# the custom block property with value true.
# The projection must not be followed by a '.'; that is a SPARQL syntax error.
SELECT ?s ?p ?o
WHERE {
  # Subjects flagged with the block property ...
  ?s <http://n2o.neo/custom/block> ?blocked .
  # ... and all triples with such a subject.
  ?s ?p ?o .
  FILTER(?blocked=true) .
  # Skip blank-node subjects.
  FILTER(isIRI(?s))
}

### EDIT: this was obsoleted in the end in favour of a cypher solution, see process.sh.
22 changes: 22 additions & 0 deletions sparql/select_blocked_relations.sparql
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX n2o: <http://n2o.neo/property/>
PREFIX n2oc: <http://n2o.neo/custom/>
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>


# Selects every asserted triple (?s ?p ?o) for which an owl:Axiom annotation
# node exists that points at the triple and carries the custom block
# property with value true.
SELECT ?s ?p ?o
WHERE {
  # The asserted triple itself.
  ?s ?p ?o .
  # The reified axiom that annotates the triple. The former trailing
  # "?bp ?bo" pattern is removed: its variables were never used and it
  # repeated each result once per property of the axiom node.
  ?r rdf:type owl:Axiom ;
     owl:annotatedSource ?s ;
     owl:annotatedProperty ?p ;
     owl:annotatedTarget ?o ;
     <http://n2o.neo/custom/block> ?blocked .

  FILTER(?blocked=true) .
}

### EDIT: this was obsoleted in the end in favour of a cypher solution, see process.sh.
29 changes: 29 additions & 0 deletions sparql/select_embargoed_channels_dev.sparql
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX n2o: <http://n2o.neo/property/>
PREFIX n2oc: <http://n2o.neo/custom/>
PREFIX dct: <http://purl.org/dc/terms/>

# Dev-stage embargo: channels of embargoed datasets.
# A dataset counts as embargoed when its production flag is false or absent
# AND its staging flag is false or absent. Returned are the channels that
# depict an image whose dct:source is such a dataset.

SELECT DISTINCT ?channel
WHERE {
  # Required patterns: dataset node -> image sourced from it -> channel depicting the image.
  ?dataset n2o:nodeLabel ?nodelabel .
  FILTER(?nodelabel="DataSet")

  ?image dct:source ?dataset .
  # An image without a channel yields no row here.
  ?channel <http://xmlns.com/foaf/0.1/depicts> ?image .

  # Optional release flags; either may be missing on a dataset.
  # NOTE: matching on the n2oc:production IRI is brittle - the IRI might change.
  OPTIONAL { ?dataset n2oc:production ?production . }
  OPTIONAL { ?dataset n2oc:staging ?staged . }

  # Keep only datasets that are neither in production nor staged.
  FILTER(
    (!bound(?production) || ?production=false)
    && (!bound(?staged) || ?staged=false)
  )
}
File renamed without changes.
26 changes: 26 additions & 0 deletions sparql/select_embargoed_datasets_dev.sparql
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX n2o: <http://n2o.neo/property/>
PREFIX n2oc: <http://n2o.neo/custom/>
PREFIX dct: <http://purl.org/dc/terms/>

# Dev-stage embargo: the embargoed datasets themselves.
# A dataset counts as embargoed when its production flag is false or absent
# AND its staging flag is false or absent.

SELECT DISTINCT ?dataset
WHERE {
  # Nodes labelled "DataSet".
  ?dataset n2o:nodeLabel ?nodelabel .
  FILTER(?nodelabel="DataSet")

  # Optional release flags; either may be missing on a dataset.
  # NOTE: matching on the n2oc:production IRI is brittle - the IRI might change.
  OPTIONAL { ?dataset n2oc:production ?production . }
  OPTIONAL { ?dataset n2oc:staging ?staged . }

  # Keep only datasets that are neither in production nor staged.
  FILTER(
    (!bound(?production) || ?production=false)
    && (!bound(?staged) || ?staged=false)
  )
}
File renamed without changes.
28 changes: 28 additions & 0 deletions sparql/select_embargoed_images_dev.sparql
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX n2o: <http://n2o.neo/property/>
PREFIX n2oc: <http://n2o.neo/custom/>
PREFIX dct: <http://purl.org/dc/terms/>

# Dev-stage embargo: images of embargoed datasets.
# A dataset counts as embargoed when its production flag is false or absent
# AND its staging flag is false or absent. Returned are the images whose
# dct:source is such a dataset.

SELECT DISTINCT ?image
WHERE {
  # Required patterns: dataset node -> image sourced from it.
  ?dataset n2o:nodeLabel ?nodelabel .
  FILTER(?nodelabel="DataSet")

  ?image dct:source ?dataset .

  # Optional release flags; either may be missing on a dataset.
  # NOTE: matching on the n2oc:production IRI is brittle - the IRI might change.
  OPTIONAL { ?dataset n2oc:production ?production . }
  OPTIONAL { ?dataset n2oc:staging ?staged . }

  # Keep only datasets that are neither in production nor staged.
  FILTER(
    (!bound(?production) || ?production=false)
    && (!bound(?staged) || ?staged=false)
  )
}
File renamed without changes.

0 comments on commit f303e4a

Please sign in to comment.