From 1fab4ea0fc39cc37c34f736a051bb7d077022d4a Mon Sep 17 00:00:00 2001 From: jmestwa-coder Date: Tue, 24 Mar 2026 15:52:05 +0530 Subject: [PATCH] python: validate offset ranges before pointer arithmetic --- python/_re2.cc | 26 ++++++++++++++++++++++++++ python/re2_test.py | 44 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) diff --git a/python/_re2.cc b/python/_re2.cc index 22f092b23..b2944b6e8 100644 --- a/python/_re2.cc +++ b/python/_re2.cc @@ -47,11 +47,36 @@ static inline int OneCharLen(const char* ptr) { return "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\3\4"[(*ptr & 0xFF) >> 4]; } +static inline void ValidatePosLenOrThrow(absl::string_view text, + ssize_t pos, + ssize_t len) { + const ssize_t text_size = static_cast<ssize_t>(text.size()); + if (pos < 0 || pos > text_size) { + throw py::value_error("pos out of range"); + } + if (len < 0) { + throw py::value_error("len must be non-negative"); + } +} + +static inline void ValidatePosEndposOrThrow(absl::string_view text, + ssize_t pos, + ssize_t endpos) { + const ssize_t text_size = static_cast<ssize_t>(text.size()); + if (pos < 0 || pos > text_size) { + throw py::value_error("pos out of range"); + } + if (endpos < pos || endpos > text_size) { + throw py::value_error("endpos out of range"); + } +} + // Helper function for when Python encodes str to bytes and then needs to // convert str offsets to bytes offsets. Assumes that text is valid UTF-8. 
ssize_t CharLenToBytes(py::buffer buffer, ssize_t pos, ssize_t len) { auto bytes = buffer.request(); auto text = FromBytes(bytes); + ValidatePosLenOrThrow(text, pos, len); auto ptr = text.data() + pos; auto end = text.data() + text.size(); while (ptr < end && len > 0) { @@ -66,6 +91,7 @@ ssize_t CharLenToBytes(py::buffer buffer, ssize_t pos, ssize_t len) { ssize_t BytesToCharLen(py::buffer buffer, ssize_t pos, ssize_t endpos) { auto bytes = buffer.request(); auto text = FromBytes(bytes); + ValidatePosEndposOrThrow(text, pos, endpos); auto ptr = text.data() + pos; auto end = text.data() + endpos; ssize_t len = 0; diff --git a/python/re2_test.py b/python/re2_test.py index 146b55b48..8347125c8 100644 --- a/python/re2_test.py +++ b/python/re2_test.py @@ -6,9 +6,14 @@ import collections import pickle import re +import unittest from absl.testing import absltest from absl.testing import parameterized +try: + import _re2 +except ImportError: + _re2 = None import re2 @@ -30,6 +35,45 @@ def test_option(self, name): self.assertEqual(value, getattr(options, name)) +@unittest.skipIf(_re2 is None, '_re2 extension module is unavailable') +class BindingOffsetConversionTest(absltest.TestCase): + + def test_charlentobytes_negative_pos(self): + with self.assertRaises(ValueError): + _re2.CharLenToBytes(b'abc', -1, 1) + + def test_charlentobytes_pos_beyond_buffer(self): + with self.assertRaises(ValueError): + _re2.CharLenToBytes(b'abc', 4, 1) + + def test_charlentobytes_negative_len(self): + with self.assertRaises(ValueError): + _re2.CharLenToBytes(b'abc', 0, -1) + + def test_bytestocharlen_negative_pos(self): + with self.assertRaises(ValueError): + _re2.BytesToCharLen(b'abc', -1, 1) + + def test_bytestocharlen_endpos_beyond_buffer(self): + with self.assertRaises(ValueError): + _re2.BytesToCharLen(b'abc', 0, 4) + + def test_bytestocharlen_endpos_before_pos(self): + with self.assertRaises(ValueError): + _re2.BytesToCharLen(b'abc', 2, 1) + + def test_valid_ascii_offsets_unchanged(self): 
+ self.assertEqual(2, _re2.CharLenToBytes(b'abc', 1, 2)) + self.assertEqual(2, _re2.BytesToCharLen(b'abc', 1, 3)) + self.assertEqual(0, _re2.CharLenToBytes(b'abc', 3, 0)) + self.assertEqual(0, _re2.BytesToCharLen(b'abc', 3, 3)) + + def test_valid_utf8_offsets_unchanged(self): + text = 'a\u2603b'.encode('utf-8') + self.assertEqual(4, _re2.CharLenToBytes(text, 0, 2)) + self.assertEqual(2, _re2.BytesToCharLen(text, 0, 4)) + + class Re2CompileTest(parameterized.TestCase): """Contains tests that apply to the re2 module only.