From 7ea46fe486bded70b52c8488907d07a00014f4b9 Mon Sep 17 00:00:00 2001 From: FredTheNoob <43958385+FredTheNoob@users.noreply.github.com> Date: Fri, 24 Nov 2023 15:29:49 +0100 Subject: [PATCH] update to endpoints - rewrite entity mentions endpoint response to be the file processed, not all entitymentions (formerly /{articlename}/entities) - added /entitymentions/all endpoint to get the entire entitymentions file --- components/EntityLinker.py | 24 ++--------------- components/GetSpacyData.py | 17 ++++++------ lib/JSONEntityOutput.py | 4 +++ main.py | 45 +++++++++++++++---------------- tests/integration/test_GetJSON.py | 17 +++++++++--- tests/unit/test_GetSpacyData.py | 4 ++- 6 files changed, 52 insertions(+), 59 deletions(-) create mode 100644 lib/JSONEntityOutput.py diff --git a/components/EntityLinker.py b/components/EntityLinker.py index cfde3e9..6158c4d 100644 --- a/components/EntityLinker.py +++ b/components/EntityLinker.py @@ -1,3 +1,4 @@ +from typing import List from Levenshtein import distance from components import Db from lib.EntityLinked import EntityLinked @@ -5,28 +6,7 @@ from fuzzywuzzy import fuzz -def GetAllEntities(entityMentions): - allEntities = [] - fileName = "" - for file in entityMentions: - fileName = file["fileName"] - for sentence in file["sentences"]: - for entity in sentence["entityMentions"]: - newEntity = Entity( - name=entity["name"], - startIndex=entity["startIndex"], - endIndex=entity["endIndex"], - sentence=sentence["sentence"], - sentenceStartIndex=sentence["sentenceStartIndex"], - sentenceEndIndex=sentence["sentenceEndIndex"], - label=entity["label"], - type=entity["type"], - ) - allEntities.append(newEntity) - return allEntities - - -async def entitylinkerFunc(entities, threshold=80): +async def entitylinkerFunc(entities: List[Entity], threshold:int=80): iri_dict = {} linked_entities = [] db_path = "./Database/DB.db" diff --git a/components/GetSpacyData.py b/components/GetSpacyData.py index 0d1c09c..1e29a1b 100644 --- a/components/GetSpacyData.py +++ b/components/GetSpacyData.py @@ -1,4 +1,4 @@ -import spacy, json, os +import json, os import sys from langdetect import detect from typing import List @@ -7,6 +7,7 @@ from lib.Exceptions.UndetectedLanguageException import ( UndetectedLanguageException, ) +from lib.JSONEntityOutput import JSONEntityOutput sys.path.append(".") from lib.Entity import Entity @@ -17,8 +18,8 @@ nlp_da = da_core_news_lg.load() -# GetText skal få text fra pipeline del A -def GetText(title): +# GetText shall get text from pipeline del A +def GetText(title: str): file = open(title, "r") stringWithText = file.read() @@ -27,7 +28,7 @@ def GetText(title): return stringWithText -def GetTokens(text): +def GetTokens(text: str): result = DetectLang(text) if result == "da": return nlp_da(text) @@ -37,14 +38,14 @@ def GetTokens(text): raise UndetectedLanguageException() -def DetectLang(text): +def DetectLang(text: str): stringdata = str(text) language = detect(stringdata) return language # Method to fully extract entity mentions, find the sentences and calculate indexes and finally create a final JSON -def BuildJSONFromEntities(entities: List[EntityLinked], doc, fileName: str): +def BuildJSONFromEntities(entities: List[EntityLinked], doc, fileName: str) -> JSONEntityOutput: # Create a list of sentences with their entities in the desired JSON format currentJson = open("./entity_mentions.json", "r") currentJson.seek(0, os.SEEK_END) @@ -88,12 +89,12 @@ def BuildJSONFromEntities(entities: List[EntityLinked], doc, fileName: str): if len(currentJson) != 0: for index in currentJson: if index["fileName"] == final_json["fileName"]: - return currentJson + return JSONEntityOutput(final_json, currentJson) else: currentJson.append(final_json) else: currentJson.append(final_json) - return currentJson + return JSONEntityOutput(final_json, currentJson) def GetEntities(doc) -> List[Entity]: entities = [] diff --git a/lib/JSONEntityOutput.py b/lib/JSONEntityOutput.py new file mode 100644 index 0000000..cd29d22 --- /dev/null +++ b/lib/JSONEntityOutput.py @@ -0,0 +1,4 @@ +class JSONEntityOutput: + def __init__(self, newFile, allFiles): + self.newFile = newFile + self.allFiles = allFiles \ No newline at end of file diff --git a/main.py b/main.py index 8da95d2..b8dd5a1 100644 --- a/main.py +++ b/main.py @@ -16,30 +16,42 @@ DIRECTORY_TO_WATCH = "data_from_A/" async def newFileCreated(file_path: str): - await main(file_path) + await processInput(file_path) dirWatcher = DirectoryWatcher(directory=DIRECTORY_TO_WATCH, async_callback=newFileCreated) + @app.on_event("startup") async def startEvent(): dirWatcher.start_watching() + @app.on_event("shutdown") def shutdown_event(): dirWatcher.stop_watching() + app.mount( "/static", StaticFiles(directory="static"), name="static", ) + @app.get('/') async def root(request: Request): return templates.TemplateResponse( "index.html", {"request": request} ) +@app.get("/entitymentions/all") +async def get_all_json(): + if not os.path.exists("entity_mentions.json"): + raise HTTPException(status_code=404, detail="mentions not found") + + with open("entity_mentions.json", "r") as entity_json: + entity_mentions = json.load(entity_json) + return entity_mentions @app.get("/entitymentions") async def get_json(article: str = Query(..., title="Article Filename")): @@ -47,24 +59,8 @@ async def get_json(article: str = Query(..., title="Article Filename")): if not os.path.exists(path): raise HTTPException(status_code=404, detail="Article not found") - await main(path) # Pass the article parameter to your main function - with open("entity_mentions.json", "r") as entity_json: - entity_mentions = json.load(entity_json) - return entity_mentions - - -@app.get("/{articlename}/entities") -async def getentities(articlename: str): - await main() - with open("entity_mentions.json", "r") as entityJson: - entityMentions = json.load(entityJson) - for elem in entityMentions: - path = elem["fileName"] - name = path.split("/") - if name[-1] == articlename: - return elem - raise HTTPException(status_code=404, detail="Article not found") - + newFile = await processInput(path) + return newFile @app.post("/detectlanguage") async def checklang(request: Request): @@ -79,8 +75,9 @@ async def checklang(request: Request): return language -async def main(file_path: str = "Artikel.txt"): - open("entity_mentions.json", "w").close() +async def processInput(file_path: str = "Artikel.txt"): + if not os.path.exists("entity_mentions.json"): + open("entity_mentions.json", "w").close() text = GetSpacyData.GetText( file_path @@ -104,8 +101,6 @@ async def main(file_path: str = "Artikel.txt"): doc ) # construct entities from text - # To prevent appending challenges, the final JSON is created in GetEntities() - # entMentions= GetSpacyData.entityMentionJson(ents) #Returns JSON object containing an array of entity mentions await Db.InitializeIndexDB( "./Database/DB.db" ) # makes the DB containing the entities of KG @@ -122,4 +117,6 @@ async def main(file_path: str = "Artikel.txt"): ) with open("entity_mentions.json", "w", encoding="utf8") as entityJson: - json.dump(entsJSON, entityJson, ensure_ascii=False, indent=4) \ No newline at end of file + json.dump(entsJSON.allFiles, entityJson, ensure_ascii=False, indent=4) + + return entsJSON.newFile \ No newline at end of file diff --git a/tests/integration/test_GetJSON.py b/tests/integration/test_GetJSON.py index 97b2e5a..2f59832 100644 --- a/tests/integration/test_GetJSON.py +++ b/tests/integration/test_GetJSON.py @@ -18,11 +18,20 @@ async def test_SlashEntityMentionsIsUp(): @pytest.mark.asyncio -async def test_SlashEntityMentionsReturnsJsonArray(): +async def test_SlashEntityMentionsAllReturnsJsonArray(): + with TestClient(app) as client: + res = client.get("/entitymentions/all") + print(type(res.json())) + assert type(res.json()) == list + assert type(res.json()[0]) == dict + client.__exit__ + client.close() + +@pytest.mark.asyncio +async def test_SlashEntityMentionsReturnsJson(): with patch('main.DIRECTORY_TO_WATCH', 'data_from_A/'): with TestClient(app) as client: res = client.get("/entitymentions?article=test.txt") - assert type(res.json()) == list - assert type(res.json()[0]) == dict + assert type(res.json()) == dict client.__exit__ - client.close() + client.close() \ No newline at end of file diff --git a/tests/unit/test_GetSpacyData.py b/tests/unit/test_GetSpacyData.py index 45e4224..452ced3 100644 --- a/tests/unit/test_GetSpacyData.py +++ b/tests/unit/test_GetSpacyData.py @@ -82,12 +82,14 @@ def test_GetEntities(): ) ) - entsJSON = GetSpacyData.BuildJSONFromEntities( + entsJSONOutput = GetSpacyData.BuildJSONFromEntities( entLinks, docFile, filename ) + entsJSON = entsJSONOutput.allFiles + testIndex = 0 for i in range(len(entsJSON)): if entsJSON[i]["fileName"] == "Testing2023":