chore(deps): Update spacy to v3
celian-garcia committed Oct 2, 2022
1 parent c23b80a commit a5bbede
Showing 13 changed files with 114 additions and 216 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/python-app.yml
@@ -36,4 +36,4 @@ jobs:
        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
    - name: Test with pytest
      run: |
-       pytest
+       pytest -m "not integration"
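For context on the new flag: `-m "not integration"` deselects any test carrying the `integration` marker, so CI skips tests that need live services. A hypothetical example of such a test (only the marker name is guaranteed by this diff; the marker would also be registered in pytest configuration to avoid warnings):

```python
import pytest

@pytest.mark.integration  # deselected in CI by `pytest -m "not integration"`
def test_train_endpoint_against_real_services():
    ...
```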
4 changes: 2 additions & 2 deletions Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.7
+FROM python:3.10

LABEL maintainer="celian.garcia1@gmail.com"

@@ -34,4 +34,4 @@ RUN pip install --upgrade pip && \
RUN python -m spacy download fr_core_news_md

# Build the main command
CMD ["python", "manage.py", "runserver"]
CMD ["sanic", "cerebro.server.app"]
7 changes: 3 additions & 4 deletions Readme.md
@@ -4,7 +4,6 @@ Cerebro manage the NLU (Natural Language Understanding) part of Milobella.

## Installation
```bash
-python -m spacy download fr
python -m spacy download fr_core_news_md
pip install -r requirements.txt
pip install -e .
@@ -21,15 +20,15 @@ It might take a few minutes.

## Run
```bash
-$ python manage.py runserver --config-file cerebro.ini
+$ sanic cerebro.server.app --dev
```

## Upload and train a model
When the server is running, this is not over. You have two tasks to perform to make it work.

#### Upload the model (not necessary if the model is already in database)
```bash
-$ curl -iv -X PUT 'http://localhost:9444/models/default/samples' -d @samples.json
+$ curl -iv -X PUT 'http://localhost:9444/models/default/samples' -d samples.json
```
An example of ``samples.json`` file format is here : [./scripts/samples.json](./scripts/samples.json)
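The samples file itself is not part of this diff; judging from `register_samples` in cerebro/spacy/spacy_model_factory.py further down, each entry carries a text, its categories, and optional character-offset entities. A hypothetical entry with inferred field names (the authoritative format remains ./scripts/samples.json):

```python
# One training sample as register_samples() appears to expect it (inferred);
# start/end are character offsets, end-exclusive, as spaCy entity spans use.
sample = {
    "text": "Quel temps fait-il à Paris ?",
    "categories": ["get_weather"],
    "entities": [{"start": 21, "end": 26, "name": "city"}],
}
```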

@@ -40,7 +39,7 @@ $ curl -iv -X POST 'http://localhost:9444/models/default/train'

## Request example
```bash
-$ curl -iv http://localhost:9444/understand?query="Bonjour"
+$ curl -iv -X POST 'http://localhost:9444/understand' -d '{"text": "Bonjour"}'
```
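For reference, the equivalent call from Python, assuming the server from the Run step is listening on localhost:9444 and answers with JSON (the response shape is not shown in this diff):

```python
import requests

resp = requests.post(
    "http://localhost:9444/understand",
    json={"text": "Bonjour"},  # same payload as the curl call above
)
resp.raise_for_status()
print(resp.json())  # assumed to carry the recognized intent and entities
```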

## CHANGELOGS
17 changes: 0 additions & 17 deletions cerebro.ini

This file was deleted.

10 changes: 10 additions & 0 deletions cerebro.yaml
@@ -0,0 +1,10 @@
+cerebro:
+  spacy:
+    model: fr_core_news_md
+    iterations: 70
+    min_score: 0.1
+    chunk_size: 1000
+
+  features:
+    use_mongo: false
+    use_spacy: true
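Once loaded through the new YamlConfig (see cerebro/server.py below), every key is recursively uppercased into Sanic's config, so these values are read back as in this sketch (access pattern taken from the diff; assumes the process starts from the repository root so "cerebro.yaml" resolves):

```python
from cerebro.server import config  # importing builds the app and its config

assert config["CEREBRO"]["SPACY"]["MODEL"] == "fr_core_news_md"
assert config["CEREBRO"]["SPACY"]["ITERATIONS"] == 70
assert config["CEREBRO"]["FEATURES"]["USE_SPACY"] is True
```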
101 changes: 62 additions & 39 deletions cerebro/server.py
@@ -1,8 +1,11 @@
#!/usr/bin/env python
# coding: utf8
import logging
+from typing import Dict, Any

+import yaml as yaml
from sanic import Sanic
+from sanic.config import Config

from cerebro.repository.nlp_repository import Repository
from cerebro.repository.nlp_repository_fake import NLPRepositoryFake
@@ -15,48 +18,29 @@
from cerebro.views.web import HtmlView


-def run(**params):
-    logger = logging.getLogger()
-
-    # Initialize the sanic app
-    _app = Sanic(name="cerebro", configure_logging=False)
-
-    repository = build_repository(**params)
-
-    _app.add_route(HtmlView.as_view(), '/')
-    _app.add_route(SamplesView.as_view(repository), '/models/<model_id:string>/samples')
-
-    if params["use_spacy"]:
-        spacy_manager = SpaCyModelManager(
-            repository, model=params["model"],
-            iterations=params["iterations"],
-            chunk_size=params["chunk_size"]
-        )
-        spacy_request = SpaCyRequestService(
-            spacy_manager, min_score=params["min_score"]
-        )
-
-        _app.add_route(UnderstandingView.as_view(spacy_request), '/understand')
-        _app.add_route(TrainingView.as_view(spacy_manager), '/models/<model_id>/train')
-
-        # # Asynchronous call to SpaCy training
-        # spacy_manager.update_model("default")
-    else:
-        logger.warn(
-            "\n####### !! SpaCy has been disabled !! ########"
-            "\n Cerebro is not really interesting without SpaCy ;)."
-            "\n You can reactivate it with this line in config:"
-            "\n ================="
-            "\n\t[features]"
-            "\n\tuse_spacy = true"
-            "\n ================="
-            "\n################################################")
-
-    # Run the server
-    _app.run(
-        host=params["host"],
-        port=params["port"],
-    )
+class YamlConfig(Config):
+    def __init__(self, *args, path: str, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        with open(path, "r") as f:
+            self.apply(yaml.safe_load(f))
+
+    def apply(self, config):
+        self.update(self._to_uppercase(config))
+
+    def _to_uppercase(self, obj: Dict[str, Any]) -> Dict[str, Any]:
+        retval: Dict[str, Any] = {}
+        for key, value in obj.items():
+            upper_key = key.upper()
+            if isinstance(value, list):
+                retval[upper_key] = [
+                    self._to_uppercase(item) for item in value
+                ]
+            elif isinstance(value, dict):
+                retval[upper_key] = self._to_uppercase(value)
+            else:
+                retval[upper_key] = value
+        return retval


def build_repository(**params) -> Repository:
@@ -67,3 +51,42 @@ def build_repository(**params) -> Repository:
        )
    else:
        return NLPRepositoryFake()
+
+
+logger = logging.getLogger()
+
+# Initialize the sanic app
+config = YamlConfig(path="cerebro.yaml")
+app = Sanic(name="cerebro", config=config)
+
+# repository = build_repository(**params)
+repository = NLPRepositoryFake()
+
+app.add_route(HtmlView.as_view(), '/')
+app.add_route(SamplesView.as_view(repository), '/models/<model_id:str>/samples')
+
+if config["CEREBRO"]["FEATURES"]["USE_SPACY"]:
+    spacy_manager = SpaCyModelManager(
+        repository, model=config["CEREBRO"]["SPACY"]["MODEL"],
+        iterations=config["CEREBRO"]["SPACY"]["ITERATIONS"],
+        chunk_size=config["CEREBRO"]["SPACY"]["CHUNK_SIZE"]
+    )
+    spacy_request = SpaCyRequestService(
+        spacy_manager, min_score=config["CEREBRO"]["SPACY"]["MIN_SCORE"]
+    )
+
+    app.add_route(UnderstandingView.as_view(spacy_request), '/understand')
+    app.add_route(TrainingView.as_view(spacy_manager), '/models/<model_id:str>/train')
+
+    # # Asynchronous call to SpaCy training
+    # spacy_manager.update_model("default")
+else:
+    logger.warn(
+        "\n####### !! SpaCy has been disabled !! ########"
+        "\n Cerebro is not really interesting without SpaCy ;)."
+        "\n You can reactivate it with this line in config:"
+        "\n ================="
+        "\n\t[features]"
+        "\n\tuse_spacy = true"
+        "\n ================="
+        "\n################################################")
2 changes: 1 addition & 1 deletion cerebro/spacy/spacy_manager.py
@@ -80,7 +80,7 @@ def _update_model(self, model_id: str) -> None:
        categories = self._repository.get_categories(model_id)
        entities = self._repository.get_entities(model_id)
        nlp_factory = SpacyModelFactory(model=self._model_source, iterations=self._iterations,
-                                       categories=categories, entities=entities)
+                                       intents=categories, entities=entities)
        nlp_factory.load_model()
        start = 0
        while "Some samples are remaining":
95 changes: 26 additions & 69 deletions cerebro/spacy/spacy_model_factory.py
@@ -3,14 +3,18 @@
from typing import List, Dict

import spacy
+from spacy.training import Example
from spacy.util import compounding, minibatch

+PIPE_INTENT = "textcat"
+PIPE_ENTITY = "ner"
+

class SpacyModelFactory:
-    def __init__(self, model: str, iterations: int, categories: List[str], entities: List[str]):
+    def __init__(self, model: str, iterations: int, intents: List[str], entities: List[str]):
        self._logger = logging.getLogger(self.__class__.__name__)
        self._nlp = None
-        self._categories = categories
+        self._intents = intents
        self._entities = entities

        self._model = model
@@ -29,95 +33,48 @@ def load_model(self):
self._logger.debug("Successfully loaded Spacy Data !")

# === Load entities ===
# create the built-in pipeline components and add them to the pipeline
# nlp.create_pipe works for built-ins that are registered with spaCy
if 'ner' not in self._nlp.pipe_names:
ner = self._nlp.create_pipe('ner')
self._nlp.add_pipe(ner, last=True)
# otherwise, get it so we can add labels
else:
ner = self._nlp.get_pipe('ner')

for entity in self._entities:
ner.add_label(entity)
if PIPE_ENTITY not in self._nlp.pipe_names:
self._nlp.add_pipe(PIPE_ENTITY, last=True)

# === Load categories ===
# add the text classifier to the pipeline if it doesn't exist
# nlp.create_pipe works for built-ins that are registered with spaCy
if 'textcat' not in self._nlp.pipe_names:
textcat = self._nlp.create_pipe('textcat')
self._nlp.add_pipe(textcat, last=True)
# otherwise, get it, so we can add labels to it
else:
textcat = self._nlp.get_pipe('textcat')

for cat in self._categories:
textcat.add_label(cat)
if PIPE_INTENT not in self._nlp.pipe_names:
self._nlp.add_pipe(PIPE_INTENT, last=True)

def register_samples(self, samples: List[Dict]):
# Build the train data
train_categories_data = []
train_entities_data = []
train_intent_data = []
train_entity_data = []
for sample in samples:
# Append the train data for category
train_categories_data.append((
train_intent_data.append((
sample["text"],
{'cats': {cat: 1. if cat in sample["categories"] else 0. for cat in self._categories}}
{'cats': {cat: 1. if cat in sample["categories"] else 0. for cat in self._intents}}
))

# Append the train data for entities
if "entities" in sample and len(sample["entities"]) != 0:
train_entities_data.append((
train_entity_data.append((
sample["text"],
{'entities': [(ent["start"], ent["end"], ent["name"]) for ent in sample["entities"]]}
))

# Perform the trains
self._logger.debug(f"Training {len(samples)} samples on SpaCy Data Model : {self._model}... Could take time.")
self._train_categories(train_categories_data)
self._train_ner(train_entities_data)
self._train(PIPE_INTENT, train_intent_data)
self._train(PIPE_ENTITY, train_entity_data)
self._logger.debug("Successfully trained SpaCy Data !")

def _train_ner(self, train_entities_data):

# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in self._nlp.pipe_names if pipe != 'ner']
with self._nlp.disable_pipes(*other_pipes): # only train NER
optimizer = self._nlp.begin_training()
def _train(self, pipe_name: str, train_data):
with self._nlp.select_pipes(enable=[pipe_name]): # only train given pipe
examples = []
for text, annots in train_data:
examples.append(Example.from_dict(self._nlp.make_doc(text), annots))
self._nlp.initialize(lambda: examples)
losses = {}
for itn in range(self._iterations):
random.shuffle(train_entities_data)
random.shuffle(examples)

# batch up the examples using spaCy's minibatch
batches = minibatch(train_entities_data, size=compounding(4., 32., 1.001))
for batch in batches:
texts, annotations = zip(*batch)
self._nlp.update(
texts, # batch of texts
annotations, # batch of annotations
sgd=optimizer, # callable to update weights
losses=losses)
for batch in minibatch(examples, size=compounding(4., 32., 1.001)):
self._nlp.update(batch, losses=losses)
self._logger.debug('Losses ', losses)

def _train_categories(self, train_categories_data):

# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in self._nlp.pipe_names if pipe != 'textcat']
with self._nlp.disable_pipes(*other_pipes): # only train textcat
optimizer = self._nlp.begin_training()
# print("Training the model...")
# print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))

for itn in range(self._iterations):
losses = {}
batches = minibatch(train_categories_data, size=compounding(4., 32., 1.001))
for batch in batches:
texts, annotations = zip(*batch)
self._nlp.update(texts, annotations, sgd=optimizer, losses=losses)

# with textcat.model.use_params(optimizer.averages):
# # evaluate on the dev data split off in load_data()
# scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
# print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}' # print a simple table
# .format(losses['textcat'], scores['textcat_p'],
# scores['textcat_r'], scores['textcat_f']))
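The rewrite above is essentially the standard spaCy v2-to-v3 training migration: `create_pipe`, `begin_training` and `nlp.update(texts, annotations, sgd=...)` give way to string pipe names, `Example` objects and `nlp.initialize`. A self-contained sketch of the same pattern on a blank pipeline (labels and sentences invented for illustration):

```python
import random

import spacy
from spacy.training import Example
from spacy.util import compounding, minibatch

nlp = spacy.blank("fr")
nlp.add_pipe("textcat", last=True)  # added by name, as in load_model() above

train_data = [
    ("Bonjour", {"cats": {"greeting": 1.0, "get_weather": 0.0}}),
    ("Quel temps fait-il ?", {"cats": {"greeting": 0.0, "get_weather": 1.0}}),
]
examples = [Example.from_dict(nlp.make_doc(t), ann) for t, ann in train_data]

nlp.initialize(lambda: examples)  # label set is inferred from the examples
losses = {}
for _ in range(20):
    random.shuffle(examples)
    for batch in minibatch(examples, size=compounding(4., 32., 1.001)):
        nlp.update(batch, losses=losses)  # default optimizer from initialize()

print(nlp("Bonjour").cats)
```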
4 changes: 2 additions & 2 deletions helm/cerebro/templates/deployment.yaml
@@ -25,7 +25,7 @@ spec:
        - name: {{ .Chart.Name }}
          image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}"
          imagePullPolicy: {{ .Values.image.pullPolicy }}
-         command: ["python", "manage.py", "runserver", "--config-file", "/etc/k8s/cerebro.ini"]
+         command: ["python", "manage.py", "runserver", "--config-file", "/etc/k8s/cerebro.yaml"]
          ports:
            - containerPort: {{ .Values.containerPort }}
          volumeMounts:
@@ -43,7 +43,7 @@ spec:
              name: {{ include "cerebro.fullname" . }}
              items:
                - key: configuration-file
-                 path: cerebro.ini
+                 path: cerebro.yaml
    {{- with .Values.nodeSelector }}
      nodeSelector:
        {{- toYaml . | nindent 8 }}
