chore(deps): Update spacy to v3
celian-garcia committed Oct 2, 2022
1 parent c23b80a commit a5bbede
Showing 13 changed files with 114 additions and 216 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/python-app.yml
@@ -36,4 +36,4 @@ jobs:
        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
    - name: Test with pytest
      run: |
-       pytest
+       pytest -m "not integration"
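For context on the new flag: `-m "not integration"` deselects any test carrying the `integration` marker, so CI skips tests that need live services. A hypothetical example of such a test (only the marker name is guaranteed by this diff; the marker would also be registered in pytest configuration to avoid warnings):

```python
import pytest

@pytest.mark.integration  # deselected in CI by `pytest -m "not integration"`
def test_train_endpoint_against_real_services():
    ...
```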
4 changes: 2 additions & 2 deletions Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.7
+FROM python:3.10

LABEL maintainer="celian.garcia1@gmail.com"

@@ -34,4 +34,4 @@ RUN pip install --upgrade pip && \
RUN python -m spacy download fr_core_news_md

# Build the main command
CMD ["python", "manage.py", "runserver"]
CMD ["sanic", "cerebro.server.app"]
7 changes: 3 additions & 4 deletions Readme.md
@@ -4,7 +4,6 @@ Cerebro manage the NLU (Natural Language Understanding) part of Milobella.

## Installation
```bash
-python -m spacy download fr
python -m spacy download fr_core_news_md
pip install -r requirements.txt
pip install -e .
@@ -21,15 +20,15 @@ It might take a few minutes.

## Run
```bash
-$ python manage.py runserver --config-file cerebro.ini
+$ sanic cerebro.server.app --dev
```

## Upload and train a model
When the server is running, this is not over. You have two tasks to perform to make it work.

#### Upload the model (not necessary if the model is already in database)
```bash
-$ curl -iv -X PUT 'http://localhost:9444/models/default/samples' -d @samples.json
+$ curl -iv -X PUT 'http://localhost:9444/models/default/samples' -d samples.json
```
An example of ``samples.json`` file format is here : [./scripts/samples.json](./scripts/samples.json)
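The samples file itself is not part of this diff; judging from `register_samples` in cerebro/spacy/spacy_model_factory.py further down, each entry carries a text, its categories, and optional character-offset entities. A hypothetical entry with inferred field names (the authoritative format remains ./scripts/samples.json):

```python
# One training sample as register_samples() appears to expect it (inferred);
# start/end are character offsets, end-exclusive, as spaCy entity spans use.
sample = {
    "text": "Quel temps fait-il à Paris ?",
    "categories": ["get_weather"],
    "entities": [{"start": 21, "end": 26, "name": "city"}],
}
```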

@@ -40,7 +39,7 @@ $ curl -iv -X POST 'http://localhost:9444/models/default/train'

## Request example
```bash
-$ curl -iv http://localhost:9444/understand?query="Bonjour"
+$ curl -iv -X POST 'http://localhost:9444/understand' -d '{"text": "Bonjour"}'
```
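For reference, the equivalent call from Python, assuming the server from the Run step is listening on localhost:9444 and answers with JSON (the response shape is not shown in this diff):

```python
import requests

resp = requests.post(
    "http://localhost:9444/understand",
    json={"text": "Bonjour"},  # same payload as the curl call above
)
resp.raise_for_status()
print(resp.json())  # assumed to carry the recognized intent and entities
```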

## CHANGELOGS
17 changes: 0 additions & 17 deletions cerebro.ini

This file was deleted.

10 changes: 10 additions & 0 deletions cerebro.yaml
@@ -0,0 +1,10 @@
+cerebro:
+  spacy:
+    model: fr_core_news_md
+    iterations: 70
+    min_score: 0.1
+    chunk_size: 1000
+
+  features:
+    use_mongo: false
+    use_spacy: true
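Once loaded through the new YamlConfig (see cerebro/server.py below), every key is recursively uppercased into Sanic's config, so these values are read back as in this sketch (access pattern taken from the diff; assumes the process starts from the repository root so "cerebro.yaml" resolves):

```python
from cerebro.server import config  # importing builds the app and its config

assert config["CEREBRO"]["SPACY"]["MODEL"] == "fr_core_news_md"
assert config["CEREBRO"]["SPACY"]["ITERATIONS"] == 70
assert config["CEREBRO"]["FEATURES"]["USE_SPACY"] is True
```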
101 changes: 62 additions & 39 deletions cerebro/server.py
@@ -1,8 +1,11 @@
#!/usr/bin/env python
# coding: utf8
import logging
+from typing import Dict, Any

+import yaml as yaml
from sanic import Sanic
+from sanic.config import Config

from cerebro.repository.nlp_repository import Repository
from cerebro.repository.nlp_repository_fake import NLPRepositoryFake
@@ -15,48 +18,29 @@
from cerebro.views.web import HtmlView


-def run(**params):
-    logger = logging.getLogger()
-
-    # Initialize the sanic app
-    _app = Sanic(name="cerebro", configure_logging=False)
-
-    repository = build_repository(**params)
-
-    _app.add_route(HtmlView.as_view(), '/')
-    _app.add_route(SamplesView.as_view(repository), '/models/<model_id:string>/samples')
-
-    if params["use_spacy"]:
-        spacy_manager = SpaCyModelManager(
-            repository, model=params["model"],
-            iterations=params["iterations"],
-            chunk_size=params["chunk_size"]
-        )
-        spacy_request = SpaCyRequestService(
-            spacy_manager, min_score=params["min_score"]
-        )
-
-        _app.add_route(UnderstandingView.as_view(spacy_request), '/understand')
-        _app.add_route(TrainingView.as_view(spacy_manager), '/models/<model_id>/train')
-
-        # # Asynchronous call to SpaCy training
-        # spacy_manager.update_model("default")
-    else:
-        logger.warn(
-            "\n####### !! SpaCy has been disabled !! ########"
-            "\n Cerebro is not really interesting without SpaCy ;)."
-            "\n You can reactivate it with this line in config:"
-            "\n ================="
-            "\n\t[features]"
-            "\n\tuse_spacy = true"
-            "\n ================="
-            "\n################################################")
-
-    # Run the server
-    _app.run(
-        host=params["host"],
-        port=params["port"],
-    )
+class YamlConfig(Config):
+    def __init__(self, *args, path: str, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        with open(path, "r") as f:
+            self.apply(yaml.safe_load(f))
+
+    def apply(self, config):
+        self.update(self._to_uppercase(config))
+
+    def _to_uppercase(self, obj: Dict[str, Any]) -> Dict[str, Any]:
+        retval: Dict[str, Any] = {}
+        for key, value in obj.items():
+            upper_key = key.upper()
+            if isinstance(value, list):
+                retval[upper_key] = [
+                    self._to_uppercase(item) for item in value
+                ]
+            elif isinstance(value, dict):
+                retval[upper_key] = self._to_uppercase(value)
+            else:
+                retval[upper_key] = value
+        return retval


def build_repository(**params) -> Repository:
@@ -67,3 +51,42 @@ def build_repository(**params) -> Repository:
        )
    else:
        return NLPRepositoryFake()
+
+
+logger = logging.getLogger()
+
+# Initialize the sanic app
+config = YamlConfig(path="cerebro.yaml")
+app = Sanic(name="cerebro", config=config)
+
+# repository = build_repository(**params)
+repository = NLPRepositoryFake()
+
+app.add_route(HtmlView.as_view(), '/')
+app.add_route(SamplesView.as_view(repository), '/models/<model_id:str>/samples')
+
+if config["CEREBRO"]["FEATURES"]["USE_SPACY"]:
+    spacy_manager = SpaCyModelManager(
+        repository, model=config["CEREBRO"]["SPACY"]["MODEL"],
+        iterations=config["CEREBRO"]["SPACY"]["ITERATIONS"],
+        chunk_size=config["CEREBRO"]["SPACY"]["CHUNK_SIZE"]
+    )
+    spacy_request = SpaCyRequestService(
+        spacy_manager, min_score=config["CEREBRO"]["SPACY"]["MIN_SCORE"]
+    )
+
+    app.add_route(UnderstandingView.as_view(spacy_request), '/understand')
+    app.add_route(TrainingView.as_view(spacy_manager), '/models/<model_id:str>/train')
+
+    # # Asynchronous call to SpaCy training
+    # spacy_manager.update_model("default")
+else:
+    logger.warn(
+        "\n####### !! SpaCy has been disabled !! ########"
+        "\n Cerebro is not really interesting without SpaCy ;)."
+        "\n You can reactivate it with this line in config:"
+        "\n ================="
+        "\n\t[features]"
+        "\n\tuse_spacy = true"
+        "\n ================="
+        "\n################################################")
2 changes: 1 addition & 1 deletion cerebro/spacy/spacy_manager.py
@@ -80,7 +80,7 @@ def _update_model(self, model_id: str) -> None:
        categories = self._repository.get_categories(model_id)
        entities = self._repository.get_entities(model_id)
        nlp_factory = SpacyModelFactory(model=self._model_source, iterations=self._iterations,
-                                       categories=categories, entities=entities)
+                                       intents=categories, entities=entities)
        nlp_factory.load_model()
        start = 0
        while "Some samples are remaining":
95 changes: 26 additions & 69 deletions cerebro/spacy/spacy_model_factory.py
@@ -3,14 +3,18 @@
from typing import List, Dict

import spacy
+from spacy.training import Example
from spacy.util import compounding, minibatch

+PIPE_INTENT = "textcat"
+PIPE_ENTITY = "ner"
+

class SpacyModelFactory:
-    def __init__(self, model: str, iterations: int, categories: List[str], entities: List[str]):
+    def __init__(self, model: str, iterations: int, intents: List[str], entities: List[str]):
        self._logger = logging.getLogger(self.__class__.__name__)
        self._nlp = None
-        self._categories = categories
+        self._intents = intents
        self._entities = entities

        self._model = model
@@ -29,95 +33,48 @@ def load_model(self):
self._logger.debug("Successfully loaded Spacy Data !")

# === Load entities ===
# create the built-in pipeline components and add them to the pipeline
# nlp.create_pipe works for built-ins that are registered with spaCy
if 'ner' not in self._nlp.pipe_names:
ner = self._nlp.create_pipe('ner')
self._nlp.add_pipe(ner, last=True)
# otherwise, get it so we can add labels
else:
ner = self._nlp.get_pipe('ner')

for entity in self._entities:
ner.add_label(entity)
if PIPE_ENTITY not in self._nlp.pipe_names:
self._nlp.add_pipe(PIPE_ENTITY, last=True)

# === Load categories ===
# add the text classifier to the pipeline if it doesn't exist
# nlp.create_pipe works for built-ins that are registered with spaCy
if 'textcat' not in self._nlp.pipe_names:
textcat = self._nlp.create_pipe('textcat')
self._nlp.add_pipe(textcat, last=True)
# otherwise, get it, so we can add labels to it
else:
textcat = self._nlp.get_pipe('textcat')

for cat in self._categories:
textcat.add_label(cat)
if PIPE_INTENT not in self._nlp.pipe_names:
self._nlp.add_pipe(PIPE_INTENT, last=True)

def register_samples(self, samples: List[Dict]):
# Build the train data
train_categories_data = []
train_entities_data = []
train_intent_data = []
train_entity_data = []
for sample in samples:
# Append the train data for category
train_categories_data.append((
train_intent_data.append((
sample["text"],
{'cats': {cat: 1. if cat in sample["categories"] else 0. for cat in self._categories}}
{'cats': {cat: 1. if cat in sample["categories"] else 0. for cat in self._intents}}
))

# Append the train data for entities
if "entities" in sample and len(sample["entities"]) != 0:
train_entities_data.append((
train_entity_data.append((
sample["text"],
{'entities': [(ent["start"], ent["end"], ent["name"]) for ent in sample["entities"]]}
))

# Perform the trains
self._logger.debug(f"Training {len(samples)} samples on SpaCy Data Model : {self._model}... Could take time.")
self._train_categories(train_categories_data)
self._train_ner(train_entities_data)
self._train(PIPE_INTENT, train_intent_data)
self._train(PIPE_ENTITY, train_entity_data)
self._logger.debug("Successfully trained SpaCy Data !")

def _train_ner(self, train_entities_data):

# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in self._nlp.pipe_names if pipe != 'ner']
with self._nlp.disable_pipes(*other_pipes): # only train NER
optimizer = self._nlp.begin_training()
def _train(self, pipe_name: str, train_data):
with self._nlp.select_pipes(enable=[pipe_name]): # only train given pipe
examples = []
for text, annots in train_data:
examples.append(Example.from_dict(self._nlp.make_doc(text), annots))
self._nlp.initialize(lambda: examples)
losses = {}
for itn in range(self._iterations):
random.shuffle(train_entities_data)
random.shuffle(examples)

# batch up the examples using spaCy's minibatch
batches = minibatch(train_entities_data, size=compounding(4., 32., 1.001))
for batch in batches:
texts, annotations = zip(*batch)
self._nlp.update(
texts, # batch of texts
annotations, # batch of annotations
sgd=optimizer, # callable to update weights
losses=losses)
for batch in minibatch(examples, size=compounding(4., 32., 1.001)):
self._nlp.update(batch, losses=losses)
self._logger.debug('Losses ', losses)

def _train_categories(self, train_categories_data):

# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in self._nlp.pipe_names if pipe != 'textcat']
with self._nlp.disable_pipes(*other_pipes): # only train textcat
optimizer = self._nlp.begin_training()
# print("Training the model...")
# print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))

for itn in range(self._iterations):
losses = {}
batches = minibatch(train_categories_data, size=compounding(4., 32., 1.001))
for batch in batches:
texts, annotations = zip(*batch)
self._nlp.update(texts, annotations, sgd=optimizer, losses=losses)

# with textcat.model.use_params(optimizer.averages):
# # evaluate on the dev data split off in load_data()
# scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
# print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}' # print a simple table
# .format(losses['textcat'], scores['textcat_p'],
# scores['textcat_r'], scores['textcat_f']))
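The rewrite above is essentially the standard spaCy v2-to-v3 training migration: `create_pipe`, `begin_training` and `nlp.update(texts, annotations, sgd=...)` give way to string pipe names, `Example` objects and `nlp.initialize`. A self-contained sketch of the same pattern on a blank pipeline (labels and sentences invented for illustration):

```python
import random

import spacy
from spacy.training import Example
from spacy.util import compounding, minibatch

nlp = spacy.blank("fr")
nlp.add_pipe("textcat", last=True)  # added by name, as in load_model() above

train_data = [
    ("Bonjour", {"cats": {"greeting": 1.0, "get_weather": 0.0}}),
    ("Quel temps fait-il ?", {"cats": {"greeting": 0.0, "get_weather": 1.0}}),
]
examples = [Example.from_dict(nlp.make_doc(t), ann) for t, ann in train_data]

nlp.initialize(lambda: examples)  # label set is inferred from the examples
losses = {}
for _ in range(20):
    random.shuffle(examples)
    for batch in minibatch(examples, size=compounding(4., 32., 1.001)):
        nlp.update(batch, losses=losses)  # default optimizer from initialize()

print(nlp("Bonjour").cats)
```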
4 changes: 2 additions & 2 deletions helm/cerebro/templates/deployment.yaml
@@ -25,7 +25,7 @@ spec:
        - name: {{ .Chart.Name }}
          image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}"
          imagePullPolicy: {{ .Values.image.pullPolicy }}
-         command: ["python", "manage.py", "runserver", "--config-file", "/etc/k8s/cerebro.ini"]
+         command: ["python", "manage.py", "runserver", "--config-file", "/etc/k8s/cerebro.yaml"]
          ports:
            - containerPort: {{ .Values.containerPort }}
          volumeMounts:
@@ -43,7 +43,7 @@ spec:
              name: {{ include "cerebro.fullname" . }}
              items:
                - key: configuration-file
-                 path: cerebro.ini
+                 path: cerebro.yaml
    {{- with .Values.nodeSelector }}
      nodeSelector:
        {{- toYaml . | nindent 8 }}
