Merge pull request #83 from Knox-AAU/75-train-danish-model-to-find-li…

…terals 75 train danish model to find literals
Knox-AAU · Nov 30, 2023 · 10cbc2c · 10cbc2c
2 parents ada3ff2 + 0b79904
commit 10cbc2c
Show file tree

Hide file tree

Showing 12 changed files with 915 additions and 37 deletions.
diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml
@@ -27,6 +27,7 @@ jobs:
       run: |
         python -m pip install --upgrade pip
         pip install flake8 pytest
+        if [ -f models.txt ]; then pip install -r models.txt; fi
         if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
     - name: Lint with flake8
       run: |

diff --git a/.gitignore b/.gitignore
@@ -1,5 +1,8 @@
+# Ignore Database file
 Database/*
-
+# Ignore trained SpaCy models
+training/model_packages/*
+training/trainedmodel/*
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

diff --git a/docs/pypi.md b/docs/pypi.md
@@ -0,0 +1,80 @@
+# Pypi Repository
+
+For the purpose of our pipeline, we have created a pip repository which is hosted in a docker container on the _knox-web01.srv.aau.dk_ server. It runs off of [pypiserver](https://github.com/pypiserver/pypiserver).
+
+At the time of writing, a ticket has been sent to ITS to create a domain name for the server at the internal domain [http://pypi.knox.cs.aau.dk](http://pypi.knox.cs.aau.dk). Internal means it is only accessible on campus or through the VPN.
+
+## Compose File Location
+
+The _compose.yml_ file is located in _/srv/data/pip-repo/_ in the _knox-web01.srv.aau.dk_ server.
+
+## Uploading
+
+Uploading can be done using [twine](https://github.com/pypa/twine). If the server is not yet set up under the domain [http://pypi.knox.cs.aau.dk](http://pypi.knox.cs.aau.dk), it should still be running on the web01 server. Because of this, there are two ways to upload to the pip repository:
+
+### By SSH
+
+First you need to SSH into the server using the following command:
+
+```BASH
+ssh USERNAME@student.aau.dk@knox-web01.srv.aau.dk -L 8081:localhost:8081
+```
+
+The above command logs you into the server over SSH and forwards port 8081 on the server to your local machine. You should now be able to go to <http://localhost:8081/simple> in your browser and see the repository.
+
+To upload using twine, run the following command:
+
+```BASH
+twine upload --repository-url http://localhost:8081 --sign PACKAGENAME.whl
+```
+
+Uploading requires no authentication, as the repository is only reachable from campus or through the VPN anyway.
+
+### By Domain (if domain is up)
+
+Once the domain is up, the following twine command can be used instead:
+
+```BASH
+twine upload --repository-url http://pypi.knox.cs.aau.dk --sign PACKAGENAME.whl
+```
+
+## Installing through the repository
+
+To install packages from the repository you simply use pip.
+Again, because at this stage we don't know when the domain will be available, two methods are possible.
+
+You can either connect to the web01 server with the command:
+
+```BASH
+ssh USERNAME@student.aau.dk@knox-web01.srv.aau.dk -L 8081:localhost:8081
+```
+
+And afterwards in another terminal run the pip command:
+
+```BASH
+pip3 install --index-url http://localhost:8081/simple PACKAGE-NAME
+```
+
+If the domain is available simply replace the localhost:8081 with the domain:
+
+```BASH
+pip3 install --index-url http://pypi.knox.cs.aau.dk/simple PACKAGE-NAME
+```
+
+## Creating a whl package from Spacy
+
+If you have trained a model, you can use spacy to create a whl package for the repository.
+
+This is done with the command:
+
+```BASH
+spacy package MODEL-FOLDER OUTPUT-LOCATION --name package-name --build wheel
+```
+
+Example command:
+
+```BASH
+spacy package trainedmodel/updated_da_model model_packages --name core_news_knox_lg --build wheel
+```
+
+Note that we have left out the "da" before "core" in the --name argument; it is added by default through the meta.json file in the model.
diff --git a/lib/DirectoryWatcher.py b/lib/DirectoryWatcher.py
@@ -3,6 +3,7 @@
 from watchdog.observers import Observer
 from watchdog.events import FileSystemEventHandler
 
+
 class DirectoryWatcher:
     def __init__(self, directory, async_callback):
         self.directory = directory
@@ -21,7 +22,9 @@ def on_created(self, event):
     def start_watching(self):
         # Define a thread target function
         def run_observer():
-            self.observer.schedule(self.event_handler, path=self.directory, recursive=False)
+            self.observer.schedule(
+                self.event_handler, path=self.directory, recursive=False
+            )
             self.observer.start()
             try:
                 while self.is_watching:
@@ -45,4 +48,5 @@ async def run_once(self):
 
     def stop_watching(self):
         self.is_watching = False
-        self.observer.stop()
+        self.observer.stop()
+        self.observer.join()
diff --git a/main.py b/main.py
@@ -16,12 +16,16 @@
 
 DIRECTORY_TO_WATCH = "data_from_A/"
 
+
 async def newFileCreated(file_path: str):  # awaits modifyTxt, then processInput, on file_path
     time.sleep(1)  # NOTE(review): blocking sleep inside a coroutine; it holds up whichever event loop runs this - consider an awaitable sleep
     await modifyTxt(file_path)
     await processInput(file_path)
 
-dirWatcher = DirectoryWatcher(directory=DIRECTORY_TO_WATCH, async_callback=newFileCreated)
+
+dirWatcher = DirectoryWatcher(
+    directory=DIRECTORY_TO_WATCH, async_callback=newFileCreated
+)  # module-level watcher wired to DIRECTORY_TO_WATCH and newFileCreated
 
 
 @app.on_event("startup")
@@ -49,27 +53,27 @@ def shutdown_event():
 )
 
 
-@app.get('/')
+@app.get("/")
 async def root(request: Request):
-    return templates.TemplateResponse(
-        "index.html", {"request": request}
-    )
+    return templates.TemplateResponse("index.html", {"request": request})
+
 
 @app.get("/entitymentions/all")
 async def get_all_json():
     if not os.path.exists("entity_mentions.json"):
         raise HTTPException(status_code=404, detail="mentions not found")
-    
+
     with open("entity_mentions.json", "r") as entity_json:
         entity_mentions = json.load(entity_json)
         return entity_mentions
 
+
 @app.get("/entitymentions")
 async def get_json(article: str = Query(..., title="Article Filename")):
     path = DIRECTORY_TO_WATCH + article
+    print(path)
     if not os.path.exists(path):
         raise HTTPException(status_code=404, detail="Article not found")
-
     try:
         newFile = await processInput(path)
     except Exception as e:
@@ -129,9 +133,7 @@ async def processInput(file_path: str = "Artikel.txt"):
     except UndetectedLanguageException:
         raise HTTPException(status_code=400, detail="Undetected language")
 
-    ents = GetSpacyData.GetEntities(
-        doc
-    )  # construct entities from text
+    ents = GetSpacyData.GetEntities(doc)  # construct entities from text
 
     await Db.InitializeIndexDB(
         "./Database/DB.db"
@@ -142,13 +144,9 @@ async def processInput(file_path: str = "Artikel.txt"):
         ents
     )  # Returns JSON object containing an array of entity links
 
-    entsJSON = GetSpacyData.BuildJSONFromEntities(
-        entLinks,
-        doc,
-        file_path
-    )
+    entsJSON = GetSpacyData.BuildJSONFromEntities(entLinks, doc, file_path)
 
     with open("entity_mentions.json", "w", encoding="utf8") as entityJson:
         json.dump(entsJSON.allFiles, entityJson, ensure_ascii=False, indent=4)
 
-    return entsJSON.newFile
+    return entsJSON.newFile
diff --git a/models.txt b/models.txt
@@ -0,0 +1,3 @@
+
+en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.0/en_core_web_lg-3.7.0-py3-none-any.whl
+da_core_news_lg @ https://github.com/explosion/spacy-models/releases/download/da_core_news_lg-3.7.0/da_core_news_lg-3.7.0-py3-none-any.whl
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,7 @@
 [tool.pytest.ini_options]
 minversion = "6.0"
-addopts = "-W ignore::DeprecationWarning --cov ."
+addopts = "-W ignore::DeprecationWarning --cov"
+testpaths = ["tests/unit", "tests/integration"]
 
 [tool.black]
-line-length = 79
+line-length = 79
diff --git a/requirements.txt b/requirements.txt
@@ -4,8 +4,6 @@ wheel==0.41.2
 spacy>=3.7.0, <3.8.0
 uvicorn==0.23.2
 levenshtein
-en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.0/en_core_web_lg-3.7.0-py3-none-any.whl
-da_core_news_lg @ https://github.com/explosion/spacy-models/releases/download/da_core_news_lg-3.7.0/da_core_news_lg-3.7.0-py3-none-any.whl
 pytest==7.4.2
 pytest_sugar==0.9.7
 pytest-asyncio==0.21.1

diff --git a/requirements_no_models.txt b/requirements_no_models.txt
diff --git a/training/test_model.py b/training/test_model.py
@@ -0,0 +1,142 @@
+import spacy
+
+# Load the trained model (the path is relative to the current working directory)
+nlp = spacy.load("trainedmodel/updated_da_model")
+
+class bcolors:  # ANSI escape sequences used to colour/format terminal output
+    HEADER = '\033[95m'
+    OKBLUE = '\033[94m'
+    OKCYAN = '\033[96m'
+    OKGREEN = '\033[92m'
+    WARNING = '\033[93m'
+    FAIL = '\033[91m'
+    ENDC = '\033[0m'  # resets formatting
+    BOLD = '\033[1m'
+    UNDERLINE = '\033[4m'
+
+
+# Evaluation data: (text, {"entities": [(start, end, label), ...]}); start/end are character offsets into text
+eval_data = [
+    (
+        "I 1976 blev Apple opfundet",
+        {"entities": [(0, 6, "LITERAL"), (12, 17, "ORG")]},
+    ),
+    (
+        "iPhone 12 blev udgivet i 2020",
+        {"entities": [(0, 9, "MISC"), (23, 29, "LITERAL")]},
+    ),
+    (
+        "Det koster 1000 kr. at købe denne ting.",
+        {"entities": [(11, 19, "LITERAL")]},
+    ),
+    (
+        "I morgen skal jeg i skole.",
+        {
+            "entities": [
+                (0, 8, "LITERAL"),
+            ]
+        },
+    ),
+    (
+        "I dag skulle vi møde kl. 08:15.",
+        {"entities": [(0, 5, "LITERAL"), (21, 30, "LITERAL")]},
+    ),
+    (
+        "Bussen kommer 13:00.",
+        {"entities": [(14, 19, "LITERAL")]},
+    ),
+    (
+        "Vi skal aflevere d. 21/12/2023.",
+        {"entities": [(17, 30, "LITERAL")]},
+    ),
+    (
+        "Vestjyllands finansminister Jørgen Kofoed og hans børn, blev mandag d. 3. December opkøbt af storkoncernen Apple for 20 kr.",
+        {
+            "entities": [
+                (0, 12, "LOCATION"),
+                (28, 41, "PERSON"),
+                (45, 54, "PERSON"),
+                (61, 82, "LITERAL"),
+                (107, 112, "ORG"),
+                (117, 123, "LITERAL"),
+            ]
+        },
+    ),
+    (
+        "George Bush var skyld i 9/11, og jeg skal til Struer d. 28/11/2023.",
+        {
+            "entities": [
+                (0, 11, "PERSON"),
+                (24, 28, "LITERAL"),
+                (46, 52, "LOCATION"),
+                (53, 66, "LITERAL"),
+            ]
+        },
+    ),
+    (
+        "Peter gik over vejen og købte mælk og Epstein dræbte ikke sig selv for 2 dage siden.",
+        {
+            "entities": [
+                (0, 5, "PERSON"),
+                (38, 45, "PERSON"),
+                (67, 83, "LITERAL"),
+            ]
+        },
+    ),
+]
+
+# Initialize evaluation metrics (NOTE(review): "incorrect" and "partial" are never updated below)
+eval_metrics = {
+    "correct": 0,
+    "incorrect": 0,
+    "missed": 0,
+    "partial": 0,
+    "spurious": 0,
+}
+
+# Evaluate the model (entities are matched by surface text only; labels are printed, not scored)
+for text, annotations in eval_data:
+    gold_entities = [
+        text[start:end] for start, end, _ in annotations.get("entities", [])
+    ]
+    gold_labels = [
+        label for start, end, label in annotations.get("entities", [])
+    ]
+    doc = nlp(text)
+
+    print(f"Text: {text}")
+    print("Gold Entities:", gold_entities)
+    print("Gold Labels", gold_labels)
+
+    recognized_entities = [ent.text for ent in doc.ents]
+    recognized_labels = [ent.label_ for ent in doc.ents]
+    print("Recognized Entities:", recognized_entities)
+    print("Recognized Labels", recognized_labels)
+
+    for ent in doc.ents:
+        if ent.text in gold_entities:  # text match only; the predicted label is not compared
+            eval_metrics["correct"] += 1
+        else:
+            eval_metrics["spurious"] += 1
+
+    for gold_entity in gold_entities:
+        if gold_entity not in recognized_entities:
+            eval_metrics["missed"] += 1
+    if recognized_entities == gold_entities:  # list equality: same texts in the same order
+        print(f"{bcolors.OKGREEN}PASSED!{bcolors.ENDC}")
+    else:
+        print(f"{bcolors.FAIL}FAILED{bcolors.ENDC}")
+    print("\n---\n")
+
+# Calculate precision, recall, and F1 score (the 1e-8 terms guard against division by zero)
+precision = eval_metrics["correct"] / (
+    eval_metrics["correct"] + eval_metrics["spurious"] + 1e-8
+)
+recall = eval_metrics["correct"] / (
+    eval_metrics["correct"] + eval_metrics["missed"] + 1e-8
+)
+f1_score = 2 * (precision * recall) / (precision + recall + 1e-8)
+
+print(
+    f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1_score:.2f}"
+)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@

		en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.0/en_core_web_lg-3.7.0-py3-none-any.whl
		da_core_news_lg @ https://github.com/explosion/spacy-models/releases/download/da_core_news_lg-3.7.0/da_core_news_lg-3.7.0-py3-none-any.whl