Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Performance improvements #92

Merged
merged 8 commits into from
Apr 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 12 additions & 33 deletions clevercsv/detect_pattern.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,17 @@
"""

import collections
import re

from typing import Pattern

from .cabstraction import base_abstraction
from .cabstraction import c_merge_with_quotechar

DEFAULT_EPS_PAT = 1e-3

RE_MULTI_C: Pattern = re.compile(r"C{2,}")


def pattern_score(data, dialect, eps=DEFAULT_EPS_PAT):
"""
Expand Down Expand Up @@ -66,13 +72,13 @@ def make_abstraction(data, dialect):
A = base_abstraction(
data, dialect.delimiter, dialect.quotechar, dialect.escapechar
)
A = merge_with_quotechar(A, dialect)
A = merge_with_quotechar(A)
A = fill_empties(A)
A = strip_trailing(A)
return A


def merge_with_quotechar(S, dialect):
def merge_with_quotechar(S, dialect=None):
"""Merge quoted blocks in the abstraction

This function takes the abstract representation and merges quoted blocks
Expand All @@ -85,42 +91,16 @@ def merge_with_quotechar(S, dialect):
The data of a file as a string

dialect : SimpleDialect
The dialect used to make the abstraction.
The dialect used to make the abstraction. This is not used but kept for
backwards compatibility. Will be removed in a future version.

Returns
-------
abstraction : str
A simplified version of the abstraction with quoted blocks merged.

"""
in_quotes = False
i = 0
quote_pairs = []
while i < len(S):
s = S[i]
if not s == "Q":
i += 1
continue

if not in_quotes:
in_quotes = True
begin_quotes = i
else:
if i + 1 < len(S) and S[i + 1] == "Q":
i += 1
else:
end_quotes = i
quote_pairs.append((begin_quotes, end_quotes))
in_quotes = False
i += 1

# replace quoted blocks by C
Sl = list(S)
for begin, end in quote_pairs:
for i in range(begin, end + 1):
Sl[i] = "C"
S = "".join(Sl)
return S
return c_merge_with_quotechar(S)


def fill_empties(abstract):
Expand Down Expand Up @@ -152,8 +132,7 @@ def fill_empties(abstract):
while "RD" in abstract:
abstract = abstract.replace("RD", "RCD")

while "CC" in abstract:
abstract = abstract.replace("CC", "C")
abstract = RE_MULTI_C.sub("C", abstract)

if abstract.startswith("D"):
abstract = "C" + abstract
Expand Down
60 changes: 37 additions & 23 deletions clevercsv/escape.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,36 @@
"""

import codecs
import sys
import unicodedata

#: Punctuation characters that are *never* treated as a potential escape
#: character, even though they fall in the Unicode "Po" category.
DEFAULT_BLOCK_CHARS = {
    "!",
    "?",
    '"',
    "'",
    ".",
    ",",
    ";",
    ":",
    "%",
    "*",
    "&",
    "#",
}

#: Every character whose Unicode general category is "Po" (Punctuation,
#: other). Computed once at import time so membership tests are O(1).
UNICODE_PO_CHARS = {
    candidate
    for candidate in map(chr, range(sys.maxunicode + 1))
    if unicodedata.category(candidate) == "Po"
}


def is_potential_escapechar(char, encoding, block_char=None):
"""Check if a character is a potential escape character.
Expand All @@ -29,36 +57,22 @@ def is_potential_escapechar(char, encoding, block_char=None):
block_char : iterable
Characters that are in the Punctuation Other category but that should
not be considered as escape character. If None, the default set is
used, equal to::

["!", "?", '"', "'", ".", ",", ";", ":", "%", "*", "&", "#"
used, which is defined in :py:data:`DEFAULT_BLOCK_CHARS`.

Returns
-------
is_escape : bool
Whether the character is considered a potential escape or not.

"""
as_unicode = codecs.decode(bytes(char, encoding), encoding=encoding)
if encoding.lower() in set(["utf-8", "ascii"]):
uchar = char
else:
uchar = codecs.decode(bytes(char, encoding), encoding=encoding)

ctr = unicodedata.category(as_unicode)
if block_char is None:
block_char = [
"!",
"?",
'"',
"'",
".",
",",
";",
":",
"%",
"*",
"&",
"#",
]
if ctr == "Po":
if as_unicode in block_char:
return False
block_chars = (
DEFAULT_BLOCK_CHARS if block_char is None else set(block_char)
)
if uchar in UNICODE_PO_CHARS and uchar not in block_chars:
return True
return False
9 changes: 8 additions & 1 deletion clevercsv/potential_dialects.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
import itertools
import unicodedata

from typing import Dict

import regex

from .detect_type import PATTERNS
Expand Down Expand Up @@ -69,10 +71,15 @@ def get_dialects(
for delim, quotechar in itertools.product(delims, quotechars):
escapechars[(delim, quotechar)] = set([""])

is_escapechar_cache: Dict[str, bool] = {}

# escapechars are those that precede a delimiter or quotechar
for u, v in pairwise(data):
if not is_potential_escapechar(u, encoding):
if u not in is_escapechar_cache:
is_escapechar_cache[u] = is_potential_escapechar(u, encoding)
if not is_escapechar_cache[u]:
continue

for delim, quotechar in itertools.product(delims, quotechars):
if v == delim or v == quotechar:
escapechars[(delim, quotechar)].add(u)
Expand Down
140 changes: 139 additions & 1 deletion src/abstraction.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@

#define MODULE_VERSION "1.0"

#include <stdbool.h>

#include "Python.h"

static int _set_char(const char *name, Py_UCS4 *target, PyObject *src, Py_UCS4 dflt)
Expand Down Expand Up @@ -110,7 +112,7 @@ PyObject *base_abstraction(PyObject *self, PyObject *args)
} else {
if (escape_next)
escape_next = 0;
if (stack[len-1] != 'C')
if (len == 0 || stack[len-1] != 'C')
stack[len++] = 'C';
}
if (len == stack_size) {
Expand All @@ -136,17 +138,153 @@ PyObject *base_abstraction(PyObject *self, PyObject *args)
return stack_obj;
}

/*
 * Merge quoted blocks in an abstraction string.
 *
 * Python signature: c_merge_with_quotechar(S: str) -> str
 *
 * Scans S for 'Q' markers. The first 'Q' opens a quoted block. While inside
 * a block, a pair "QQ" is treated as an escaped quote and skipped; a single
 * 'Q' closes the block. Every character of a closed block (both delimiting
 * 'Q's included) is replaced by 'C'. A block that is never closed is left
 * untouched.
 *
 * Returns a new reference to the resulting string, or NULL with a Python
 * exception set (TypeError for a non-str argument, MemoryError if the
 * result cannot be allocated).
 */
PyObject *c_merge_with_quotechar(PyObject *self, PyObject *args)
{
	PyObject *S = NULL;
	PyObject *result = NULL;
	int in_kind, out_kind;
	void *in_data, *out_data;
	Py_ssize_t i, j, len;
	Py_ssize_t begin = 0;
	bool in_quotes = false;

	// "U" requires a str object and sets TypeError otherwise, so we never
	// touch the unicode internals of an arbitrary object. The exception is
	// already set on failure; nothing is printed to stdout.
	if (!PyArg_ParseTuple(args, "U", &S))
		return NULL;

	// check that the string is ready
	if (PyUnicode_READY(S) == -1)
		return NULL;

	len = PyUnicode_GET_LENGTH(S);

	// Nothing to merge in an empty string. S is a borrowed reference, so
	// it has to be INCREF'd before it is handed back to the caller.
	if (len == 0) {
		Py_INCREF(S);
		return S;
	}

	// Allocate the output with the same character width as the input so
	// that non-ASCII characters are preserved rather than truncated.
	result = PyUnicode_New(len, PyUnicode_MAX_CHAR_VALUE(S));
	if (result == NULL)
		return NULL;

	in_kind = PyUnicode_KIND(S);
	in_data = PyUnicode_DATA(S);
	out_kind = PyUnicode_KIND(result);
	out_data = PyUnicode_DATA(result);

	// Start from a verbatim copy; only closed quoted blocks are changed
	// below. Copying everything up front guarantees that characters that
	// the scan skips over (the second Q of an escaped pair) are present
	// in the output even when the quote is never closed.
	for (i = 0; i < len; i++)
		PyUnicode_WRITE(out_kind, out_data, i,
				PyUnicode_READ(in_kind, in_data, i));

	i = 0;
	while (i < len) {
		if (PyUnicode_READ(in_kind, in_data, i) != 'Q') {
			i++;
			continue;
		}

		// record that we're starting a quoted bit
		if (!in_quotes) {
			in_quotes = true;
			begin = i;
			i++;
			continue;
		}

		// a doubled quote inside a quoted block is an escaped quote:
		// skip both characters and stay inside the block
		if (i + 1 < len && PyUnicode_READ(in_kind, in_data, i + 1) == 'Q') {
			i += 2;
			continue;
		}

		// closing quote: overwrite the whole block [begin, i] with C
		for (j = begin; j <= i; j++)
			PyUnicode_WRITE(out_kind, out_data, j, 'C');
		in_quotes = false;
		i++;
	}

	// result is already a new reference owned by the caller; no extra
	// INCREF is needed (that would leak the string).
	return result;
}

/*
* MODULE
*/

/* Docstring for the extension module itself. */
PyDoc_STRVAR(cabstraction_module_doc,
"Helpers for abstraction computation in C\n");
/* Per-function docstrings; both are currently empty strings. */
PyDoc_STRVAR(cabstraction_base_abstraction_doc, "");
PyDoc_STRVAR(cabstraction_c_merge_with_quotechar_doc, "");

/*
 * Method table: maps each Python-visible function name to its C
 * implementation. Both functions use METH_VARARGS, i.e. they receive their
 * positional arguments as a tuple. The all-NULL entry is the sentinel that
 * terminates the table.
 * NOTE(review): presumably referenced by the module definition, which is
 * not visible in this hunk -- confirm.
 */
static struct PyMethodDef cabstraction_methods[] = {
{ "base_abstraction", (PyCFunction)base_abstraction, METH_VARARGS,
cabstraction_base_abstraction_doc },
{ "c_merge_with_quotechar", (PyCFunction)c_merge_with_quotechar, METH_VARARGS,
cabstraction_c_merge_with_quotechar_doc },
{ NULL, NULL, 0, NULL }
};

Expand Down
1 change: 0 additions & 1 deletion tests/test_integration/error.log
Original file line number Diff line number Diff line change
@@ -1,2 +1 @@
12f6fa751d2b2a491a54bc9e0e39d05f
13a6c86a18f053c593feda3d98755010
1 change: 1 addition & 0 deletions tests/test_integration/failed.log
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
104761c04f7278b2f5afce85c96db719
120b852c984ad304b3393c7beeea6491
1390ca6ccd8500cbbfbc5c7f64979004
13a6c86a18f053c593feda3d98755010
17c8007d6eb9baf19d075cb33759e313
17ccdf2fd0edef2d3bf5fca779cb2161
17e16b55d1d9ee2e13068db7cc69dbf9
Expand Down
1 change: 1 addition & 0 deletions tests/test_integration/method.log
Original file line number Diff line number Diff line change
Expand Up @@ -542,6 +542,7 @@
13859424315a15fbd24f92cc6e0a1b52,normal
1390ca6ccd8500cbbfbc5c7f64979004,consistency
13989b94a814dfe6b7b784a3a8c5c581,consistency
13a6c86a18f053c593feda3d98755010,consistency
13aae3390d51fa5cf407d77ed29e9b01,normal
13abadb255dbb605620b071f53b33ad3,normal
13ae16b2b3b628657aaa4a527bf6a532,normal
Expand Down
Loading