alan-turing-institute · GjjvdBurg · Apr 6, 2023 · Apr 1, 2023 · Apr 1, 2023 · Apr 2, 2023
diff --git a/clevercsv/detect_pattern.py b/clevercsv/detect_pattern.py
@@ -8,11 +8,17 @@
 """
 
 import collections
+import re
+
+from typing import Pattern
 
 from .cabstraction import base_abstraction
+from .cabstraction import c_merge_with_quotechar
 
 DEFAULT_EPS_PAT = 1e-3
 
+RE_MULTI_C: Pattern = re.compile(r"C{2,}")
+
 
 def pattern_score(data, dialect, eps=DEFAULT_EPS_PAT):
     """
@@ -66,13 +72,13 @@ def make_abstraction(data, dialect):
     A = base_abstraction(
         data, dialect.delimiter, dialect.quotechar, dialect.escapechar
     )
-    A = merge_with_quotechar(A, dialect)
+    A = merge_with_quotechar(A)
     A = fill_empties(A)
     A = strip_trailing(A)
     return A
 
 
-def merge_with_quotechar(S, dialect):
+def merge_with_quotechar(S, dialect=None):
     """Merge quoted blocks in the abstraction
 
     This function takes the abstract representation and merges quoted blocks
@@ -85,42 +91,16 @@ def merge_with_quotechar(S, dialect):
         The data of a file as a string
 
     dialect : SimpleDialect
-        The dialect used to make the abstraction.
+        The dialect used to make the abstraction. This is not used but kept for
+        backwards compatibility. Will be removed in a future version.
 
     Returns
     -------
     abstraction : str
         A simplified version of the abstraction with quoted blocks merged.
 
     """
-    in_quotes = False
-    i = 0
-    quote_pairs = []
-    while i < len(S):
-        s = S[i]
-        if not s == "Q":
-            i += 1
-            continue
-
-        if not in_quotes:
-            in_quotes = True
-            begin_quotes = i
-        else:
-            if i + 1 < len(S) and S[i + 1] == "Q":
-                i += 1
-            else:
-                end_quotes = i
-                quote_pairs.append((begin_quotes, end_quotes))
-                in_quotes = False
-        i += 1
-
-    # replace quoted blocks by C
-    Sl = list(S)
-    for begin, end in quote_pairs:
-        for i in range(begin, end + 1):
-            Sl[i] = "C"
-    S = "".join(Sl)
-    return S
+    return c_merge_with_quotechar(S)
 
 
 def fill_empties(abstract):
@@ -152,8 +132,7 @@ def fill_empties(abstract):
     while "RD" in abstract:
         abstract = abstract.replace("RD", "RCD")
 
-    while "CC" in abstract:
-        abstract = abstract.replace("CC", "C")
+    abstract = RE_MULTI_C.sub("C", abstract)
 
     if abstract.startswith("D"):
         abstract = "C" + abstract

diff --git a/clevercsv/escape.py b/clevercsv/escape.py
@@ -8,8 +8,36 @@
 """
 
 import codecs
+import sys
 import unicodedata
 
+#: Default set of "Po" characters to *never* consider as escape character
+DEFAULT_BLOCK_CHARS = set(
+    [
+        "!",
+        "?",
+        '"',
+        "'",
+        ".",
+        ",",
+        ";",
+        ":",
+        "%",
+        "*",
+        "&",
+        "#",
+    ]
+)
+
+#: All characters in Unicode category "Po" (Punctuation, Other)
+UNICODE_PO_CHARS = set(
+    [
+        c
+        for c in map(chr, range(sys.maxunicode + 1))
+        if unicodedata.category(c) == "Po"
+    ]
+)
+
 
 def is_potential_escapechar(char, encoding, block_char=None):
     """Check if a character is a potential escape character.
@@ -29,36 +57,22 @@ def is_potential_escapechar(char, encoding, block_char=None):
     block_char : iterable
         Characters that are in the Punctuation Other category but that should
         not be considered as escape character. If None, the default set is
-        used, equal to::
-
-        ["!", "?", '"', "'", ".", ",", ";", ":", "%", "*", "&", "#"
+        used, which is defined in :py:data:`DEFAULT_BLOCK_CHARS`.
 
     Returns
     -------
     is_escape : bool
         Whether the character is considered a potential escape or not.
 
     """
-    as_unicode = codecs.decode(bytes(char, encoding), encoding=encoding)
+    if encoding.lower() in set(["utf-8", "ascii"]):
+        uchar = char
+    else:
+        uchar = codecs.decode(bytes(char, encoding), encoding=encoding)
 
-    ctr = unicodedata.category(as_unicode)
-    if block_char is None:
-        block_char = [
-            "!",
-            "?",
-            '"',
-            "'",
-            ".",
-            ",",
-            ";",
-            ":",
-            "%",
-            "*",
-            "&",
-            "#",
-        ]
-    if ctr == "Po":
-        if as_unicode in block_char:
-            return False
+    block_chars = (
+        DEFAULT_BLOCK_CHARS if block_char is None else set(block_char)
+    )
+    if uchar in UNICODE_PO_CHARS and uchar not in block_chars:
         return True
     return False
diff --git a/clevercsv/potential_dialects.py b/clevercsv/potential_dialects.py
@@ -11,6 +11,8 @@
 import itertools
 import unicodedata
 
+from typing import Dict
+
 import regex
 
 from .detect_type import PATTERNS
@@ -69,10 +71,15 @@ def get_dialects(
     for delim, quotechar in itertools.product(delims, quotechars):
         escapechars[(delim, quotechar)] = set([""])
 
+    is_escapechar_cache: Dict[str, bool] = {}
+
     # escapechars are those that precede a delimiter or quotechar
     for u, v in pairwise(data):
-        if not is_potential_escapechar(u, encoding):
+        if u not in is_escapechar_cache:
+            is_escapechar_cache[u] = is_potential_escapechar(u, encoding)
+        if not is_escapechar_cache[u]:
             continue
+
         for delim, quotechar in itertools.product(delims, quotechars):
             if v == delim or v == quotechar:
                 escapechars[(delim, quotechar)].add(u)

diff --git a/src/abstraction.c b/src/abstraction.c
@@ -11,6 +11,8 @@
 
 #define MODULE_VERSION "1.0"
 
+#include <stdbool.h>
+
 #include "Python.h"
 
 static int _set_char(const char *name, Py_UCS4 *target, PyObject *src, Py_UCS4 dflt)
@@ -110,7 +112,7 @@ PyObject *base_abstraction(PyObject *self,  PyObject *args)
 		} else {
 			if (escape_next)
 				escape_next = 0;
-			if (stack[len-1] != 'C')
+			if (len == 0 || stack[len-1] != 'C')
 				stack[len++] = 'C';
 		}
 		if (len == stack_size) {
@@ -136,17 +138,153 @@ PyObject *base_abstraction(PyObject *self,  PyObject *args)
 	return stack_obj;
 }
 
+/*
+ * Merge the quoted blocks of an abstraction string into cells.
+ *
+ * Takes a single str argument. A block opens at a 'Q' and closes at the next
+ * 'Q' that is not followed by another 'Q' (a doubled "QQ" inside a block is
+ * skipped as a pair). Every character of a closed block, including both
+ * quotes, is replaced by 'C'; a block that is never closed is left as is.
+ *
+ * Returns a new reference to the merged string, or NULL with a Python
+ * exception set on failure.
+ */
+PyObject *c_merge_with_quotechar(PyObject *self, PyObject *args)
+{
+	int kind;
+	void *data;
+
+	bool in_quotes = false;
+	size_t *quote_idx_l = NULL;
+	size_t *quote_idx_r = NULL;
+	size_t *tmp = NULL;
+	size_t i, j, len, quote_idx = 0, quote_idx_size = 4;
+	char *new_S = NULL;
+
+	// single character read from the input
+	Py_UCS4 s;
+
+	PyObject *S = NULL;
+	PyObject *new_S_obj = NULL;
+
+	// "U" rejects non-str arguments with a TypeError; on failure the
+	// exception has already been set by PyArg_ParseTuple.
+	if (!PyArg_ParseTuple(args, "U", &S))
+		return NULL;
+
+	// check that the string is ready (sets an exception on failure)
+	if (PyUnicode_READY(S) == -1)
+		return NULL;
+
+	// extract kind, data, and length
+	kind = PyUnicode_KIND(S);
+	data = PyUnicode_DATA(S);
+	len = PyUnicode_GET_LENGTH(S);
+
+	// nothing to merge in an empty string; S is a borrowed reference, so
+	// take a new one before handing it back to the caller.
+	if (len == 0) {
+		Py_INCREF(S);
+		return S;
+	}
+
+	// arrays holding the start and end indices of the quoted blocks, and
+	// the output buffer. free(NULL) is a no-op, so a single cleanup label
+	// can release whatever has been allocated so far.
+	quote_idx_l = malloc(sizeof(size_t) * quote_idx_size);
+	quote_idx_r = malloc(sizeof(size_t) * quote_idx_size);
+	new_S = malloc(sizeof(char) * len);
+	if (quote_idx_l == NULL || quote_idx_r == NULL || new_S == NULL) {
+		PyErr_NoMemory();
+		goto cleanup;
+	}
+
+	// copy the whole input first, so that positions skipped by the scan
+	// below (the second 'Q' of a doubled pair) still hold their character
+	// when the enclosing block is never closed.
+	// NOTE(review): narrowing to char assumes the abstraction is ASCII.
+	for (i=0; i<len; i++) {
+		new_S[i] = (char)PyUnicode_READ(kind, data, i);
+	}
+
+	i = 0;
+	while (i < len) {
+		s = PyUnicode_READ(kind, data, i);
+		if (s != 'Q') {
+			i++;
+			continue;
+		}
+
+		// record that we're starting a quoted bit
+		if (!in_quotes) {
+			in_quotes = true;
+			quote_idx_l[quote_idx] = i;
+			i++;
+			continue;
+		}
+
+		// a doubled quote inside a block is an escaped quote: skip both
+		if (i + 1 < len && PyUnicode_READ(kind, data, i + 1) == 'Q') {
+			i += 2;
+			continue;
+		}
+
+		// otherwise this quote closes the block
+		quote_idx_r[quote_idx] = i;
+		quote_idx++;
+		in_quotes = false;
+		i++;
+
+		// grow the index arrays if the next block would not fit
+		if (quote_idx == quote_idx_size) {
+			quote_idx_size *= 2;
+			tmp = realloc(quote_idx_l, sizeof(size_t)*quote_idx_size);
+			if (tmp == NULL) {
+				PyErr_NoMemory();
+				goto cleanup;
+			}
+			quote_idx_l = tmp;
+			tmp = realloc(quote_idx_r, sizeof(size_t)*quote_idx_size);
+			if (tmp == NULL) {
+				PyErr_NoMemory();
+				goto cleanup;
+			}
+			quote_idx_r = tmp;
+		}
+	}
+
+	// overwrite the part of the output string that's in quotes
+	for (j=0; j<quote_idx; j++) {
+		for (i=quote_idx_l[j]; i<=quote_idx_r[j]; i++) {
+			new_S[i] = 'C';
+		}
+	}
+
+	// convert to Python object; this is already a new reference (or NULL
+	// with an exception set), so no extra Py_INCREF is needed.
+	new_S_obj = PyUnicode_FromStringAndSize(new_S, (Py_ssize_t)len);
+
+cleanup:
+	free(new_S);
+	free(quote_idx_l);
+	free(quote_idx_r);
+	return new_S_obj;
+}
+
 /*
  * MODULE
  */
 
 PyDoc_STRVAR(cabstraction_module_doc,
 		"Helpers for abstraction computation in C\n");
 PyDoc_STRVAR(cabstraction_base_abstraction_doc, "");
+PyDoc_STRVAR(cabstraction_c_merge_with_quotechar_doc, "");
 
 static struct PyMethodDef cabstraction_methods[] = {
 	{ "base_abstraction", (PyCFunction)base_abstraction, METH_VARARGS,
 		cabstraction_base_abstraction_doc },
+	{ "c_merge_with_quotechar", (PyCFunction)c_merge_with_quotechar, METH_VARARGS,
+		cabstraction_c_merge_with_quotechar_doc },
 	{ NULL, NULL, 0, NULL }
 };
 

diff --git a/tests/test_integration/error.log b/tests/test_integration/error.log
@@ -1,2 +1 @@
 12f6fa751d2b2a491a54bc9e0e39d05f
-13a6c86a18f053c593feda3d98755010
diff --git a/tests/test_integration/failed.log b/tests/test_integration/failed.log
@@ -18,6 +18,7 @@
 104761c04f7278b2f5afce85c96db719
 120b852c984ad304b3393c7beeea6491
 1390ca6ccd8500cbbfbc5c7f64979004
+13a6c86a18f053c593feda3d98755010
 17c8007d6eb9baf19d075cb33759e313
 17ccdf2fd0edef2d3bf5fca779cb2161
 17e16b55d1d9ee2e13068db7cc69dbf9

diff --git a/tests/test_integration/method.log b/tests/test_integration/method.log
@@ -542,6 +542,7 @@
 13859424315a15fbd24f92cc6e0a1b52,normal
 1390ca6ccd8500cbbfbc5c7f64979004,consistency
 13989b94a814dfe6b7b784a3a8c5c581,consistency
+13a6c86a18f053c593feda3d98755010,consistency
 13aae3390d51fa5cf407d77ed29e9b01,normal
 13abadb255dbb605620b071f53b33ad3,normal
 13ae16b2b3b628657aaa4a527bf6a532,normal
Original file line number	Diff line number	Diff line change
		@@ -1,2 +1 @@
		12f6fa751d2b2a491a54bc9e0e39d05f
		13a6c86a18f053c593feda3d98755010