Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

modernize code and packaging, and make package installable from cython source #165

Merged
merged 5 commits into from
Sep 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .flake8
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[flake8]
max-line-length=160
extend-ignore = E203
8 changes: 1 addition & 7 deletions .github/workflows/build_and_upload.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,9 @@ jobs:
with:
submodules: true
- name: "Build wheels"
uses: "pypa/cibuildwheel@v2.16.2"
uses: "pypa/cibuildwheel@v2.21.1"
env:
CIBW_SKIP: "pp*" # FIXME
CIBW_BEFORE_BUILD: "pip install -U cython && ./update_cpp.sh"
CIBW_BEFORE_BUILD_WINDOWS: "pip install -U cython && update_cpp.sh"
CIBW_TEST_REQUIRES: "pytest"
CIBW_TEST_COMMAND: "pytest {project}/tests --doctest-modules"
- uses: "actions/upload-artifact@v3"
Expand All @@ -38,10 +36,6 @@ jobs:
- uses: "actions/checkout@v4"
with:
submodules: true
- name: "Install dependencies"
run: "python -m pip install --upgrade cython"
- name: "Rebuild CPP files using Cython"
run: "./update_cpp.sh"
- name: "Build source distribution"
run: "pipx run build --sdist"
- uses: "actions/upload-artifact@v3"
Expand Down
4 changes: 0 additions & 4 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,6 @@ jobs:
python -m site
python -m pip install --upgrade pip setuptools wheel
python -m pip install --upgrade virtualenv tox tox-gh-actions
python -m pip install --upgrade cython

- name: "Rebuild CPP files using Cython"
run: "./update_cpp.sh"

- name: "Run tox targets for ${{ matrix.python-version }}"
run: "python -m tox"
8 changes: 0 additions & 8 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -50,14 +50,6 @@ Contributing

Feel free to submit ideas, bug reports, pull requests or regular patches.

In order to run tests, install Cython_ (> 0.24.1) and tox_, then type

::

./update_cpp.sh; tox

from the source checkout.

Please don't commit generated cpp files in the same commit as other files.

.. _Cython: http://cython.org/
Expand Down
1 change: 0 additions & 1 deletion pycrfsuite/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1 @@
from __future__ import absolute_import
from ._pycrfsuite import *
17 changes: 9 additions & 8 deletions pycrfsuite/_dumpparser.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
import re


class ParsedDump(object):
class ParsedDump:
"""
CRFsuite model parameters. Objects of this type are returned by
:meth:`pycrfsuite.Tagger.info()` method.
Expand All @@ -27,6 +25,7 @@ class ParsedDump(object):
``{name: internal_id}`` dict with known attributes

"""

def __init__(self):
self.header = {}
self.labels = {}
Expand All @@ -35,7 +34,7 @@ def __init__(self):
self.state_features = {}


class CRFsuiteDumpParser(object):
class CRFsuiteDumpParser:
"""
A hack: parser for `crfsuite dump` results.

Expand All @@ -49,17 +48,19 @@ def __init__(self):

def feed(self, line):
# Strip initial ws and line terminator, but allow for ws at the end of feature names.
line = line.lstrip().rstrip('\r\n')
line = line.lstrip().rstrip("\r\n")
if not line:
return

m = re.match(r"(FILEHEADER|LABELS|ATTRIBUTES|TRANSITIONS|STATE_FEATURES) = {", line)
m = re.match(
r"(FILEHEADER|LABELS|ATTRIBUTES|TRANSITIONS|STATE_FEATURES) = {", line
)
if m:
self.state = m.group(1)
elif line == '}':
elif line == "}":
self.state = None
else:
getattr(self, 'parse_%s' % self.state)(line)
getattr(self, "parse_%s" % self.state)(line)

def parse_FILEHEADER(self, line):
m = re.match(r"(\w+): (.*)", line)
Expand Down
123 changes: 65 additions & 58 deletions pycrfsuite/_logparser.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,11 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
import re
import fractions
from collections import namedtuple

LabelScore = namedtuple('LabelScore', 'match model ref precision recall f1')
LabelScore = namedtuple("LabelScore", "match model ref precision recall f1")


class TrainLogParser(object):

class TrainLogParser:
def __init__(self):
self.state = None
self.featgen_percent = -2
Expand All @@ -26,31 +23,31 @@ def feed(self, line):
# if line != '\n':
self.log.append(line)
if self.state is None:
self.state = 'STARTING'
self.state = "STARTING"
self.handle_STARTING(line)
self.events.append(('start', 0, len(self.log)))
return 'start'
self.events.append(("start", 0, len(self.log)))
return "start"
event = getattr(self, "handle_" + self.state)(line)
if event is not None:
start, end = self.events[-1][2], len(self.log)
if event in ('prepared', 'optimization_end'):
if event in ("prepared", "optimization_end"):
end -= 1
self.events.append((event, start, end))
return event

@property
def last_log(self):
event, start, end = self.events[-1]
return ''.join(self.log[start:end])
return "".join(self.log[start:end])

def handle_STARTING(self, line):
if line.startswith('Feature generation'):
self.state = 'FEATGEN'
if line.startswith("Feature generation"):
self.state = "FEATGEN"

def handle_FEATGEN(self, line):
if line in "0123456789.10":
self.featgen_percent += 2
return 'featgen_progress'
return "featgen_progress"

m = re.match(r"Number of features: (\d+)", line)
if m:
Expand All @@ -59,29 +56,29 @@ def handle_FEATGEN(self, line):

if self._seconds(line) is not None:
self.featgen_seconds = self._seconds(line)
self.state = 'AFTER_FEATGEN'
return 'featgen_end'
self.state = "AFTER_FEATGEN"
return "featgen_end"

def handle_AFTER_FEATGEN(self, line):
if self._iteration_head(line) is not None:
self.state = 'ITERATION'
self.state = "ITERATION"
self.handle_ITERATION(line)
return 'prepared'
return "prepared"

if 'terminated with error' in line:
self.state = 'AFTER_ITERATION'
return 'prepare_error'
if "terminated with error" in line:
self.state = "AFTER_ITERATION"
return "prepare_error"

def handle_ITERATION(self, line):
if self._iteration_head(line) is not None:
self.last_iteration = {
'num': self._iteration_head(line),
'scores': {},
"num": self._iteration_head(line),
"scores": {},
}
self.iterations.append(self.last_iteration)
elif line == '\n':
self.state = 'AFTER_ITERATION'
return 'iteration'
elif line == "\n":
self.state = "AFTER_ITERATION"
return "iteration"

def add_re(key, pattern, typ):
m = re.match(pattern, line)
Expand All @@ -96,71 +93,81 @@ def add_re(key, pattern, typ):
add_re("linesearch_step", r"Line search step: (\d+\.\d+)", float)
add_re("time", r"Seconds required for this iteration: (\d+\.\d+)", float)

m = re.match(r"Macro-average precision, recall, F1: \((\d\.\d+), (\d\.\d+), (\d\.\d+)\)", line)
m = re.match(
r"Macro-average precision, recall, F1: \((\d\.\d+), (\d\.\d+), (\d\.\d+)\)",
line,
)
if m:
self.last_iteration['avg_precision'] = float(m.group(1))
self.last_iteration['avg_recall'] = float(m.group(2))
self.last_iteration['avg_f1'] = float(m.group(3))
self.last_iteration["avg_precision"] = float(m.group(1))
self.last_iteration["avg_recall"] = float(m.group(2))
self.last_iteration["avg_f1"] = float(m.group(3))

m = re.match(r"Item accuracy: (\d+) / (\d+)", line)
if m:
acc = fractions.Fraction(int(m.group(1)), int(m.group(2)))
self.last_iteration['item_accuracy'] = acc
self.last_iteration['item_accuracy_float'] = float(acc)
self.last_iteration["item_accuracy"] = acc
self.last_iteration["item_accuracy_float"] = float(acc)

m = re.match(r"Instance accuracy: (\d+) / (\d+)", line)
if m:
acc = fractions.Fraction(int(m.group(1)), int(m.group(2)))
self.last_iteration['instance_accuracy'] = acc
self.last_iteration['instance_accuracy_float'] = float(acc)
self.last_iteration["instance_accuracy"] = acc
self.last_iteration["instance_accuracy_float"] = float(acc)

m = re.match(r"\s{4}(.+): \((\d+), (\d+), (\d+)\) \((\d\.\d+), (\d\.\d+), (\d\.\d+)\)", line)
m = re.match(
r"\s{4}(.+): \((\d+), (\d+), (\d+)\) \((\d\.\d+), (\d\.\d+), (\d\.\d+)\)",
line,
)
if m:
self.last_iteration['scores'][m.group(1)] = LabelScore(**{
'match': int(m.group(2)),
'model': int(m.group(3)),
'ref': int(m.group(4)),
'precision': float(m.group(5)),
'recall': float(m.group(6)),
'f1': float(m.group(7)),
})
self.last_iteration["scores"][m.group(1)] = LabelScore(
**{
"match": int(m.group(2)),
"model": int(m.group(3)),
"ref": int(m.group(4)),
"precision": float(m.group(5)),
"recall": float(m.group(6)),
"f1": float(m.group(7)),
}
)

m = re.match(r"\s{4}(.+): \(0, 0, 0\) \(\*{6}, \*{6}, \*{6}\)", line)
if m:
self.last_iteration['scores'][m.group(1)] = LabelScore(**{
'match': 0,
'model': 0,
'ref': 0,
'precision': None,
'recall': None,
'f1': None,
})
self.last_iteration["scores"][m.group(1)] = LabelScore(
**{
"match": 0,
"model": 0,
"ref": 0,
"precision": None,
"recall": None,
"f1": None,
}
)

def handle_AFTER_ITERATION(self, line):
if self._iteration_head(line) is not None:
self.state = 'ITERATION'
self.state = "ITERATION"
return self.handle_ITERATION(line)

m = re.match(r"Total seconds required for training: (\d+\.\d+)", line)
if m:
self.training_seconds = float(m.group(1))

if line.startswith('Storing the model'):
self.state = 'STORING'
return 'optimization_end'
if line.startswith("Storing the model"):
self.state = "STORING"
return "optimization_end"

def handle_STORING(self, line):
if line == '\n':
return 'end'
if line == "\n":
return "end"
elif self._seconds(line):
self.storing_seconds = self._seconds(line)

def _iteration_head(self, line):
m = re.match(r'\*{5} (?:Iteration|Epoch) #(\d+) \*{5}\n', line)
m = re.match(r"\*{5} (?:Iteration|Epoch) #(\d+) \*{5}\n", line)
if m:
return int(m.group(1))

def _seconds(self, line):
m = re.match(r'Seconds required: (\d+\.\d+)', line)
m = re.match(r"Seconds required: (\d+\.\d+)", line)
if m:
return float(m.group(1))
Loading
Loading