Skip to content

Commit

Permalink
[3.12] pythongh-67877: Fix memory leaks in terminated RE matching (py…
Browse files Browse the repository at this point in the history
…thonGH-126840)

If SRE(match) function terminates abruptly, either because of a signal
or because memory allocation fails, allocated SRE_REPEAT blocks might
be never released.

(cherry picked from commit 7538e7f)

Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
Co-authored-by: <wjssz@users.noreply.github.com>
  • Loading branch information
serhiy-storchaka committed Nov 18, 2024
1 parent 9d986d9 commit 5b1b787
Show file tree
Hide file tree
Showing 6 changed files with 251 additions and 14 deletions.
44 changes: 44 additions & 0 deletions Lib/test/test_re.py
Original file line number Diff line number Diff line change
Expand Up @@ -2621,6 +2621,50 @@ def test_regression_gh94675(self):
p.terminate()
p.join()

def test_fail(self):
self.assertEqual(re.search(r'12(?!)|3', '123')[0], '3')

def test_character_set_any(self):
# The union of complementary character sets matches any character
# and is equivalent to "(?s:.)".
s = '1x\n'
for p in r'[\s\S]', r'[\d\D]', r'[\w\W]', r'[\S\s]', r'\s|\S':
with self.subTest(pattern=p):
self.assertEqual(re.findall(p, s), list(s))
self.assertEqual(re.fullmatch('(?:' + p + ')+', s).group(), s)

def test_character_set_none(self):
# Negation of the union of complementary character sets does not match
# any character.
s = '1x\n'
for p in r'[^\s\S]', r'[^\d\D]', r'[^\w\W]', r'[^\S\s]':
with self.subTest(pattern=p):
self.assertIsNone(re.search(p, s))
self.assertIsNone(re.search('(?s:.)' + p, s))

def check_interrupt(self, pattern, string, maxcount):
class Interrupt(Exception):
pass
p = re.compile(pattern)
for n in range(maxcount):
try:
p._fail_after(n, Interrupt)
p.match(string)
return n
except Interrupt:
pass
finally:
p._fail_after(-1, None)

@unittest.skipUnless(hasattr(re.Pattern, '_fail_after'), 'requires debug build')
def test_memory_leaks(self):
self.check_interrupt(r'(.)*:', 'abc:', 100)
self.check_interrupt(r'([^:])*?:', 'abc:', 100)
self.check_interrupt(r'([^:])*+:', 'abc:', 100)
self.check_interrupt(r'(.){2,4}:', 'abc:', 100)
self.check_interrupt(r'([^:]){2,4}?:', 'abc:', 100)
self.check_interrupt(r'([^:]){2,4}+:', 'abc:', 100)


def get_debug_out(pat):
with captured_stdout() as out:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Fix memory leaks when :mod:`regular expression <re>` matching terminates
abruptly, either because of a signal or because memory allocation fails.
44 changes: 43 additions & 1 deletion Modules/_sre/clinic/sre.c.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

132 changes: 127 additions & 5 deletions Modules/_sre/sre.c
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,85 @@ data_stack_grow(SRE_STATE* state, Py_ssize_t size)
return 0;
}

/* memory pool functions for SRE_REPEAT, this can avoid memory
leak when SRE(match) function terminates abruptly.
state->repeat_pool_used is a doubly-linked list, so that we
can remove a SRE_REPEAT node from it.
state->repeat_pool_unused is a singly-linked list, we put/get
node at the head. */
static SRE_REPEAT *
repeat_pool_malloc(SRE_STATE *state)
{
SRE_REPEAT *repeat;

if (state->repeat_pool_unused) {
/* remove from unused pool (singly-linked list) */
repeat = state->repeat_pool_unused;
state->repeat_pool_unused = repeat->pool_next;
}
else {
repeat = PyObject_Malloc(sizeof(SRE_REPEAT));
if (!repeat) {
return NULL;
}
}

/* add to used pool (doubly-linked list) */
SRE_REPEAT *temp = state->repeat_pool_used;
if (temp) {
temp->pool_prev = repeat;
}
repeat->pool_prev = NULL;
repeat->pool_next = temp;
state->repeat_pool_used = repeat;

return repeat;
}

static void
repeat_pool_free(SRE_STATE *state, SRE_REPEAT *repeat)
{
SRE_REPEAT *prev = repeat->pool_prev;
SRE_REPEAT *next = repeat->pool_next;

/* remove from used pool (doubly-linked list) */
if (prev) {
prev->pool_next = next;
}
else {
state->repeat_pool_used = next;
}
if (next) {
next->pool_prev = prev;
}

/* add to unused pool (singly-linked list) */
repeat->pool_next = state->repeat_pool_unused;
state->repeat_pool_unused = repeat;
}

static void
repeat_pool_clear(SRE_STATE *state)
{
/* clear used pool */
SRE_REPEAT *next = state->repeat_pool_used;
state->repeat_pool_used = NULL;
while (next) {
SRE_REPEAT *temp = next;
next = temp->pool_next;
PyObject_Free(temp);
}

/* clear unused pool */
next = state->repeat_pool_unused;
state->repeat_pool_unused = NULL;
while (next) {
SRE_REPEAT *temp = next;
next = temp->pool_next;
PyObject_Free(temp);
}
}

/* generate 8-bit version */

#define SRE_CHAR Py_UCS1
Expand Down Expand Up @@ -463,6 +542,11 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
state->pos = start;
state->endpos = end;

#ifdef Py_DEBUG
state->fail_after_count = pattern->fail_after_count;
state->fail_after_exc = pattern->fail_after_exc; // borrowed ref
#endif

return string;
err:
/* We add an explicit cast here because MSVC has a bug when
Expand All @@ -485,6 +569,8 @@ state_fini(SRE_STATE* state)
/* See above PyMem_Del for why we explicitly cast here. */
PyMem_Free((void*) state->mark);
state->mark = NULL;
/* SRE_REPEAT pool */
repeat_pool_clear(state);
}

/* calculate offset from start of string */
Expand Down Expand Up @@ -571,6 +657,9 @@ pattern_traverse(PatternObject *self, visitproc visit, void *arg)
Py_VISIT(self->groupindex);
Py_VISIT(self->indexgroup);
Py_VISIT(self->pattern);
#ifdef Py_DEBUG
Py_VISIT(self->fail_after_exc);
#endif
return 0;
}

Expand All @@ -580,6 +669,9 @@ pattern_clear(PatternObject *self)
Py_CLEAR(self->groupindex);
Py_CLEAR(self->indexgroup);
Py_CLEAR(self->pattern);
#ifdef Py_DEBUG
Py_CLEAR(self->fail_after_exc);
#endif
return 0;
}

Expand Down Expand Up @@ -642,7 +734,7 @@ _sre_SRE_Pattern_match_impl(PatternObject *self, PyTypeObject *cls,
Py_ssize_t status;
PyObject *match;

if (!state_init(&state, (PatternObject *)self, string, pos, endpos))
if (!state_init(&state, self, string, pos, endpos))
return NULL;

state.ptr = state.start;
Expand Down Expand Up @@ -1330,6 +1422,29 @@ _sre_SRE_Pattern___deepcopy__(PatternObject *self, PyObject *memo)
return Py_NewRef(self);
}

#ifdef Py_DEBUG
/*[clinic input]
_sre.SRE_Pattern._fail_after
count: int
exception: object
/
For debugging.
[clinic start generated code]*/

static PyObject *
_sre_SRE_Pattern__fail_after_impl(PatternObject *self, int count,
PyObject *exception)
/*[clinic end generated code: output=9a6bf12135ac50c2 input=ef80a45c66c5499d]*/
{
self->fail_after_count = count;
Py_INCREF(exception);
Py_XSETREF(self->fail_after_exc, exception);
Py_RETURN_NONE;
}
#endif /* Py_DEBUG */

static PyObject *
pattern_repr(PatternObject *obj)
{
Expand Down Expand Up @@ -1456,6 +1571,10 @@ _sre_compile_impl(PyObject *module, PyObject *pattern, int flags,
self->pattern = NULL;
self->groupindex = NULL;
self->indexgroup = NULL;
#ifdef Py_DEBUG
self->fail_after_count = -1;
self->fail_after_exc = NULL;
#endif

self->codesize = n;

Expand Down Expand Up @@ -2552,7 +2671,8 @@ pattern_new_match(_sremodulestate* module_state,
if (!match)
return NULL;

match->pattern = (PatternObject*)Py_NewRef(pattern);
Py_INCREF(pattern);
match->pattern = pattern;

match->string = Py_NewRef(state->string);

Expand Down Expand Up @@ -2688,7 +2808,7 @@ _sre_SRE_Scanner_match_impl(ScannerObject *self, PyTypeObject *cls)
return NULL;
}

match = pattern_new_match(module_state, (PatternObject*) self->pattern,
match = pattern_new_match(module_state, self->pattern,
state, status);

if (status == 0)
Expand Down Expand Up @@ -2738,7 +2858,7 @@ _sre_SRE_Scanner_search_impl(ScannerObject *self, PyTypeObject *cls)
return NULL;
}

match = pattern_new_match(module_state, (PatternObject*) self->pattern,
match = pattern_new_match(module_state, self->pattern,
state, status);

if (status == 0)
Expand Down Expand Up @@ -2774,7 +2894,8 @@ pattern_scanner(_sremodulestate *module_state,
return NULL;
}

scanner->pattern = Py_NewRef(self);
Py_INCREF(self);
scanner->pattern = self;

PyObject_GC_Track(scanner);
return (PyObject*) scanner;
Expand Down Expand Up @@ -2968,6 +3089,7 @@ static PyMethodDef pattern_methods[] = {
_SRE_SRE_PATTERN_SCANNER_METHODDEF
_SRE_SRE_PATTERN___COPY___METHODDEF
_SRE_SRE_PATTERN___DEEPCOPY___METHODDEF
_SRE_SRE_PATTERN__FAIL_AFTER_METHODDEF
{"__class_getitem__", Py_GenericAlias, METH_O|METH_CLASS,
PyDoc_STR("See PEP 585")},
{NULL, NULL}
Expand Down
17 changes: 16 additions & 1 deletion Modules/_sre/sre.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@ typedef struct {
int flags; /* flags used when compiling pattern source */
PyObject *weakreflist; /* List of weak references */
int isbytes; /* pattern type (1 - bytes, 0 - string, -1 - None) */
#ifdef Py_DEBUG
/* for simulation of user interruption */
int fail_after_count;
PyObject *fail_after_exc;
#endif
/* pattern code */
Py_ssize_t codesize;
SRE_CODE code[1];
Expand Down Expand Up @@ -68,6 +73,9 @@ typedef struct SRE_REPEAT_T {
const SRE_CODE* pattern; /* points to REPEAT operator arguments */
const void* last_ptr; /* helper to check for infinite loops */
struct SRE_REPEAT_T *prev; /* points to previous repeat context */
/* for SRE_REPEAT pool */
struct SRE_REPEAT_T *pool_prev;
struct SRE_REPEAT_T *pool_next;
} SRE_REPEAT;

typedef struct {
Expand All @@ -94,12 +102,19 @@ typedef struct {
size_t data_stack_base;
/* current repeat context */
SRE_REPEAT *repeat;
/* SRE_REPEAT pool */
SRE_REPEAT *repeat_pool_used;
SRE_REPEAT *repeat_pool_unused;
unsigned int sigcount;
#ifdef Py_DEBUG
int fail_after_count;
PyObject *fail_after_exc;
#endif
} SRE_STATE;

typedef struct {
PyObject_HEAD
PyObject* pattern;
PatternObject* pattern;
SRE_STATE state;
int executing;
} ScannerObject;
Expand Down
Loading

0 comments on commit 5b1b787

Please sign in to comment.