From c8b3a94c37cb0ccff994a46a4adce85a1823bd24 Mon Sep 17 00:00:00 2001 From: ch4n3-yoon Date: Thu, 19 Mar 2026 10:51:05 +0900 Subject: [PATCH 1/2] gh-149079: Fix O(n^2) canonical ordering in unicodedata.normalize() Replace the insertion sort used for canonical ordering of combining characters with a hybrid approach: insertion sort for short runs (< 20) and counting sort for longer runs, reducing worst-case complexity from O(n^2) to O(n). This prevents denial of service via crafted Unicode strings with many combining characters in alternating CCC order. Co-authored-by: Seokchan Yoon <13852925+ch4n3-yoon@users.noreply.github.com> --- Lib/test/test_unicodedata.py | 28 ++++ ...-04-27-16-36-11.gh-issue-149079.vKl-LM.rst | 5 + Modules/unicodedata.c | 153 ++++++++++++++---- 3 files changed, 159 insertions(+), 27 deletions(-) create mode 100644 Misc/NEWS.d/next/Security/2026-04-27-16-36-11.gh-issue-149079.vKl-LM.rst diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py index 8ecb0df2f8e5dd..3f569e6baee676 100644 --- a/Lib/test/test_unicodedata.py +++ b/Lib/test/test_unicodedata.py @@ -616,6 +616,34 @@ def test_issue10254(self): b = 'C\u0338' * 20 + '\xC7' self.assertEqual(self.db.normalize('NFC', a), b) + def test_long_combining_mark_run(self): + # GH-XXXXX: avoid quadratic canonical ordering. + payload = "a" + ("\u0300\u0327" * 32) + nfd = "a" + ("\u0327" * 32) + ("\u0300" * 32) + nfc = "\u00e0" + ("\u0327" * 32) + ("\u0300" * 31) + + self.assertEqual(self.db.normalize("NFD", payload), nfd) + self.assertEqual(self.db.normalize("NFKD", payload), nfd) + self.assertEqual(self.db.normalize("NFC", payload), nfc) + self.assertEqual(self.db.normalize("NFKC", payload), nfc) + + def test_combining_mark_run_fast_paths(self): + # GH-XXXXX: cover short runs and already-sorted long runs. + short_payload = "a" + ("\u0300\u0327" * 9) + "\u0300" + short_nfd = "a" + ("\u0327" * 9) + ("\u0300" * 10) + short_nfc = "\u00e0" + ("\u0327" * 9) + ("\u0300" * 9) + long_sorted = "a" + ("\u0327" * 30) + ("\u0300" * 30) + long_sorted_nfc = "\u00e0" + ("\u0327" * 30) + ("\u0300" * 29) + + self.assertEqual(self.db.normalize("NFD", short_payload), short_nfd) + self.assertEqual(self.db.normalize("NFKD", short_payload), short_nfd) + self.assertEqual(self.db.normalize("NFC", short_payload), short_nfc) + self.assertEqual(self.db.normalize("NFKC", short_payload), short_nfc) + self.assertEqual(self.db.normalize("NFD", long_sorted), long_sorted) + self.assertEqual(self.db.normalize("NFKD", long_sorted), long_sorted) + self.assertEqual(self.db.normalize("NFC", long_sorted), long_sorted_nfc) + self.assertEqual(self.db.normalize("NFKC", long_sorted), long_sorted_nfc) + def test_issue29456(self): # Fix #29456 u1176_str_a = '\u1100\u1176\u11a8' diff --git a/Misc/NEWS.d/next/Security/2026-04-27-16-36-11.gh-issue-149079.vKl-LM.rst b/Misc/NEWS.d/next/Security/2026-04-27-16-36-11.gh-issue-149079.vKl-LM.rst new file mode 100644 index 00000000000000..72e8374acffb1f --- /dev/null +++ b/Misc/NEWS.d/next/Security/2026-04-27-16-36-11.gh-issue-149079.vKl-LM.rst @@ -0,0 +1,5 @@ +Fix a potential denial of service in :func:`unicodedata.normalize`. The +canonical ordering step of Unicode normalization used an O(n²) insertion +sort for reordering combining characters, which could be exploited with +crafted input containing many combining characters in non-canonical order. +Replaced with a linear-time counting sort for long runs. diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c index 55b33a76e7af8a..9ac2faffb40bfb 100644 --- a/Modules/unicodedata.c +++ b/Modules/unicodedata.c @@ -552,19 +552,88 @@ get_decomp_record(PyObject *self, Py_UCS4 code, (*index)++; } +/* Small combining runs are usually cheaper with insertion sort. */ +#define CANONICAL_ORDERING_COUNTING_SORT_THRESHOLD 20 + +static void +canonical_ordering_sort_insertion(int kind, void *data, + Py_ssize_t start, Py_ssize_t end) +{ + for (Py_ssize_t i = start + 1; i < end; i++) { + Py_UCS4 code = PyUnicode_READ(kind, data, i); + unsigned char combining = _getrecord_ex(code)->combining; + Py_ssize_t j = i; + + while (j > start) { + Py_UCS4 previous = PyUnicode_READ(kind, data, j - 1); + if (_getrecord_ex(previous)->combining <= combining) { + break; + } + PyUnicode_WRITE(kind, data, j, previous); + j--; + } + if (j != i) { + PyUnicode_WRITE(kind, data, j, code); + } + } +} + +static void +canonical_ordering_sort_counting(int kind, void *data, + Py_ssize_t start, Py_ssize_t end, + Py_UCS4 *sortbuf) +{ + Py_ssize_t counts[256] = {0}; + Py_ssize_t run_length = end - start; + Py_ssize_t total = 0; + unsigned char min_combining = 255; + unsigned char max_combining = 0; + + for (Py_ssize_t i = start; i < end; i++) { + Py_UCS4 code = PyUnicode_READ(kind, data, i); + unsigned char combining = _getrecord_ex(code)->combining; + counts[combining]++; + if (combining < min_combining) { + min_combining = combining; + } + if (combining > max_combining) { + max_combining = combining; + } + } + + for (Py_ssize_t i = min_combining; i <= max_combining; i++) { + Py_ssize_t count = counts[i]; + counts[i] = total; + total += count; + } + + /* Reuse counts[] as the next output slot for each CCC. */ + for (Py_ssize_t i = start; i < end; i++) { + Py_UCS4 code = PyUnicode_READ(kind, data, i); + unsigned char combining = _getrecord_ex(code)->combining; + sortbuf[counts[combining]++] = code; + } + for (Py_ssize_t i = 0; i < run_length; i++) { + PyUnicode_WRITE(kind, data, start + i, sortbuf[i]); + } +} + static PyObject* nfd_nfkd(PyObject *self, PyObject *input, int k) { PyObject *result; Py_UCS4 *output; Py_ssize_t i, o, osize; - int kind; - const void *data; + int input_kind, result_kind; + const void *input_data; + void *result_data; /* Longest decomposition in Unicode 3.2: U+FDFA */ Py_UCS4 stack[20]; - Py_ssize_t space, isize; + Py_ssize_t space, isize, length; int index, prefix, count, stackptr; unsigned char prev, cur; + Py_UCS4 *sortbuf = NULL; + Py_ssize_t sortbuflen = 0; stackptr = 0; isize = PyUnicode_GET_LENGTH(input); @@ -584,11 +653,11 @@ nfd_nfkd(PyObject *self, PyObject *input, int k) return NULL; } i = o = 0; - kind = PyUnicode_KIND(input); - data = PyUnicode_DATA(input); + input_kind = PyUnicode_KIND(input); + input_data = PyUnicode_DATA(input); while (i < isize) { - stack[stackptr++] = PyUnicode_READ(kind, data, i++); + stack[stackptr++] = PyUnicode_READ(input_kind, input_data, i++); while(stackptr) { Py_UCS4 code = stack[--stackptr]; /* Hangul Decomposition adds three characters in @@ -656,34 +725,64 @@ nfd_nfkd(PyObject *self, PyObject *input, int k) if (!result) return NULL; - kind = PyUnicode_KIND(result); - data = PyUnicode_DATA(result); + result_kind = PyUnicode_KIND(result); + result_data = PyUnicode_DATA(result); + length = PyUnicode_GET_LENGTH(result); - /* Sort canonically. */ + /* Sort each consecutive combining-character run canonically. */ i = 0; - prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining; - for (i++; i < PyUnicode_GET_LENGTH(result); i++) { - cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining; - if (prev == 0 || cur == 0 || prev <= cur) { - prev = cur; + while (i < length) { + Py_ssize_t run_length, run_start; + int needs_sort = 0; + + prev = _getrecord_ex( + PyUnicode_READ(result_kind, result_data, i))->combining; + if (prev == 0) { + i++; continue; } - /* Non-canonical order. Need to switch *i with previous. */ - o = i - 1; - while (1) { - Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1); - PyUnicode_WRITE(kind, data, o+1, - PyUnicode_READ(kind, data, o)); - PyUnicode_WRITE(kind, data, o, tmp); - o--; - if (o < 0) - break; - prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining; - if (prev == 0 || prev <= cur) + + run_start = i++; + while (i < length) { + cur = _getrecord_ex( + PyUnicode_READ(result_kind, result_data, i))->combining; + if (cur == 0) { break; + } + if (prev > cur) { + needs_sort = 1; + } + prev = cur; + i++; + } + if (!needs_sort) { + continue; + } + + run_length = i - run_start; + if (run_length < CANONICAL_ORDERING_COUNTING_SORT_THRESHOLD) { + canonical_ordering_sort_insertion(result_kind, result_data, + run_start, i); + continue; } - prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining; + + if (run_length > sortbuflen) { + Py_UCS4 *new_sortbuf = PyMem_Realloc(sortbuf, + run_length * sizeof(Py_UCS4)); + if (new_sortbuf == NULL) { + PyErr_NoMemory(); + PyMem_Free(sortbuf); + Py_DECREF(result); + return NULL; + } + sortbuf = new_sortbuf; + sortbuflen = run_length; + } + + canonical_ordering_sort_counting(result_kind, result_data, + run_start, i, sortbuf); } + PyMem_Free(sortbuf); return result; } From 4a29545c8376ef2471ea934c934b7ba61829a60e Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Thu, 14 May 2026 20:47:45 +0100 Subject: [PATCH 2/2] Apply all review suggestions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Bénédikt Tran <10796600+picnixz@users.noreply.github.com> Co-authored-by: Petr Viktorin Co-authored-by: Serhiy Storchaka Co-authored-by: Maurycy Pawłowski-Wieroński --- Lib/test/test_unicodedata.py | 4 +-- ...-04-27-16-36-11.gh-issue-149079.vKl-LM.rst | 2 +- Modules/unicodedata.c | 30 +++++++------------ 3 files changed, 14 insertions(+), 22 deletions(-) diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py index 3f569e6baee676..0ffedd195b5017 100644 --- a/Lib/test/test_unicodedata.py +++ b/Lib/test/test_unicodedata.py @@ -617,7 +617,7 @@ def test_issue10254(self): self.assertEqual(self.db.normalize('NFC', a), b) def test_long_combining_mark_run(self): - # GH-XXXXX: avoid quadratic canonical ordering. + # gh-149079: avoid quadratic canonical ordering. payload = "a" + ("\u0300\u0327" * 32) nfd = "a" + ("\u0327" * 32) + ("\u0300" * 32) nfc = "\u00e0" + ("\u0327" * 32) + ("\u0300" * 31) @@ -628,7 +628,7 @@ def test_long_combining_mark_run(self): self.assertEqual(self.db.normalize("NFKC", payload), nfc) def test_combining_mark_run_fast_paths(self): - # GH-XXXXX: cover short runs and already-sorted long runs. + # gh-149079: cover short runs and already-sorted long runs. short_payload = "a" + ("\u0300\u0327" * 9) + "\u0300" short_nfd = "a" + ("\u0327" * 9) + ("\u0300" * 10) short_nfc = "\u00e0" + ("\u0327" * 9) + ("\u0300" * 9) diff --git a/Misc/NEWS.d/next/Security/2026-04-27-16-36-11.gh-issue-149079.vKl-LM.rst b/Misc/NEWS.d/next/Security/2026-04-27-16-36-11.gh-issue-149079.vKl-LM.rst index 72e8374acffb1f..4ed22b58f7405f 100644 --- a/Misc/NEWS.d/next/Security/2026-04-27-16-36-11.gh-issue-149079.vKl-LM.rst +++ b/Misc/NEWS.d/next/Security/2026-04-27-16-36-11.gh-issue-149079.vKl-LM.rst @@ -1,5 +1,5 @@ Fix a potential denial of service in :func:`unicodedata.normalize`. The -canonical ordering step of Unicode normalization used an O(n²) insertion +canonical ordering step of Unicode normalization used a quadratic-time insertion sort for reordering combining characters, which could be exploited with crafted input containing many combining characters in non-canonical order. Replaced with a linear-time counting sort for long runs. diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c index 9ac2faffb40bfb..d132e77030bf29 100644 --- a/Modules/unicodedata.c +++ b/Modules/unicodedata.c @@ -586,22 +586,14 @@ canonical_ordering_sort_counting(int kind, void *data, Py_ssize_t counts[256] = {0}; Py_ssize_t run_length = end - start; Py_ssize_t total = 0; - unsigned char min_combining = 255; - unsigned char max_combining = 0; for (Py_ssize_t i = start; i < end; i++) { Py_UCS4 code = PyUnicode_READ(kind, data, i); unsigned char combining = _getrecord_ex(code)->combining; counts[combining]++; - if (combining < min_combining) { - min_combining = combining; - } - if (combining > max_combining) { - max_combining = combining; - } } - for (Py_ssize_t i = min_combining; i <= max_combining; i++) { + for (size_t i = 0; i < Py_ARRAY_LENGTH(counts); i++) { Py_ssize_t count = counts[i]; counts[i] = total; total += count; @@ -629,7 +621,7 @@ nfd_nfkd(PyObject *self, PyObject *input, int k) void *result_data; /* Longest decomposition in Unicode 3.2: U+FDFA */ Py_UCS4 stack[20]; - Py_ssize_t space, isize, length; + Py_ssize_t space, isize; int index, prefix, count, stackptr; unsigned char prev, cur; Py_UCS4 *sortbuf = NULL; @@ -727,25 +719,24 @@ nfd_nfkd(PyObject *self, PyObject *input, int k) result_kind = PyUnicode_KIND(result); result_data = PyUnicode_DATA(result); - length = PyUnicode_GET_LENGTH(result); /* Sort each consecutive combining-character run canonically. */ i = 0; - while (i < length) { + while (i < o) { Py_ssize_t run_length, run_start; int needs_sort = 0; - prev = _getrecord_ex( - PyUnicode_READ(result_kind, result_data, i))->combining; + Py_UCS4 ch = PyUnicode_READ(result_kind, result_data, i); + prev = _getrecord_ex(ch)->combining; if (prev == 0) { i++; continue; } run_start = i++; - while (i < length) { - cur = _getrecord_ex( - PyUnicode_READ(result_kind, result_data, i))->combining; + while (i < o) { + Py_UCS4 ch = PyUnicode_READ(result_kind, result_data, i); + cur = _getrecord_ex(ch)->combining; if (cur == 0) { break; } @@ -767,8 +758,9 @@ nfd_nfkd(PyObject *self, PyObject *input, int k) } if (run_length > sortbuflen) { - Py_UCS4 *new_sortbuf = PyMem_Realloc(sortbuf, - run_length * sizeof(Py_UCS4)); + Py_UCS4 *new_sortbuf = PyMem_Resize(sortbuf, + Py_UCS4, + run_length); if (new_sortbuf == NULL) { PyErr_NoMemory(); PyMem_Free(sortbuf);