Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions python/_re2.cc
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,36 @@ static inline int OneCharLen(const char* ptr) {
return "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\3\4"[(*ptr & 0xFF) >> 4];
}

// Checks a (pos, len) pair against `text` before it is used for offset
// arithmetic.
//
// Throws py::value_error("pos out of range") unless 0 <= pos <= text.size();
// pos equal to text.size() is accepted. Throws
// py::value_error("len must be non-negative") when len is negative. The pos
// check runs first, so it decides the message when both arguments are bad.
// No upper bound is imposed on len here.
static inline void ValidatePosLenOrThrow(absl::string_view text,
                                         ssize_t pos,
                                         ssize_t len) {
  const auto limit = static_cast<ssize_t>(text.size());
  const bool pos_in_range = (0 <= pos) && (pos <= limit);
  if (!pos_in_range) {
    throw py::value_error("pos out of range");
  }
  if (!(len >= 0)) {
    throw py::value_error("len must be non-negative");
  }
}

// Checks a [pos, endpos) pair against `text` before it is used for offset
// arithmetic.
//
// Throws py::value_error("pos out of range") unless 0 <= pos <= text.size().
// Throws py::value_error("endpos out of range") unless
// pos <= endpos <= text.size(). An empty range (endpos == pos) is accepted.
// The pos check runs first, so it decides the message when both are bad.
static inline void ValidatePosEndposOrThrow(absl::string_view text,
                                            ssize_t pos,
                                            ssize_t endpos) {
  const auto limit = static_cast<ssize_t>(text.size());
  if (!(0 <= pos && pos <= limit)) {
    throw py::value_error("pos out of range");
  }
  if (!(pos <= endpos && endpos <= limit)) {
    throw py::value_error("endpos out of range");
  }
}

// Helper function for when Python encodes str to bytes and then needs to
// convert str offsets to bytes offsets. Assumes that text is valid UTF-8.
ssize_t CharLenToBytes(py::buffer buffer, ssize_t pos, ssize_t len) {
auto bytes = buffer.request();
auto text = FromBytes(bytes);
ValidatePosLenOrThrow(text, pos, len);
auto ptr = text.data() + pos;
auto end = text.data() + text.size();
while (ptr < end && len > 0) {
Expand All @@ -66,6 +91,7 @@ ssize_t CharLenToBytes(py::buffer buffer, ssize_t pos, ssize_t len) {
ssize_t BytesToCharLen(py::buffer buffer, ssize_t pos, ssize_t endpos) {
auto bytes = buffer.request();
auto text = FromBytes(bytes);
ValidatePosEndposOrThrow(text, pos, endpos);
auto ptr = text.data() + pos;
auto end = text.data() + endpos;
ssize_t len = 0;
Expand Down
44 changes: 44 additions & 0 deletions python/re2_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,14 @@
import collections
import pickle
import re
import unittest

from absl.testing import absltest
from absl.testing import parameterized
try:
import _re2
except ImportError:
_re2 = None
import re2


Expand All @@ -30,6 +35,45 @@ def test_option(self, name):
self.assertEqual(value, getattr(options, name))


@unittest.skipIf(_re2 is None, '_re2 extension module is unavailable')
class BindingOffsetConversionTest(absltest.TestCase):
  """Tests argument checking in _re2.CharLenToBytes and _re2.BytesToCharLen.

  The class is skipped entirely when `_re2` is None.
  """

  def test_charlentobytes_negative_pos(self):
    """CharLenToBytes raises ValueError for a negative pos."""
    self.assertRaises(ValueError, _re2.CharLenToBytes, b'abc', -1, 1)

  def test_charlentobytes_pos_beyond_buffer(self):
    """CharLenToBytes raises ValueError when pos exceeds the buffer size."""
    self.assertRaises(ValueError, _re2.CharLenToBytes, b'abc', 4, 1)

  def test_charlentobytes_negative_len(self):
    """CharLenToBytes raises ValueError for a negative len."""
    self.assertRaises(ValueError, _re2.CharLenToBytes, b'abc', 0, -1)

  def test_bytestocharlen_negative_pos(self):
    """BytesToCharLen raises ValueError for a negative pos."""
    self.assertRaises(ValueError, _re2.BytesToCharLen, b'abc', -1, 1)

  def test_bytestocharlen_endpos_beyond_buffer(self):
    """BytesToCharLen raises ValueError when endpos exceeds the buffer size."""
    self.assertRaises(ValueError, _re2.BytesToCharLen, b'abc', 0, 4)

  def test_bytestocharlen_endpos_before_pos(self):
    """BytesToCharLen raises ValueError when endpos is smaller than pos."""
    self.assertRaises(ValueError, _re2.BytesToCharLen, b'abc', 2, 1)

  def test_valid_ascii_offsets_unchanged(self):
    """In-range offsets on ASCII input produce the expected lengths."""
    data = b'abc'
    # Interior ranges.
    self.assertEqual(2, _re2.CharLenToBytes(data, 1, 2))
    self.assertEqual(2, _re2.BytesToCharLen(data, 1, 3))
    # Empty ranges positioned exactly at the end of the buffer.
    self.assertEqual(0, _re2.CharLenToBytes(data, 3, 0))
    self.assertEqual(0, _re2.BytesToCharLen(data, 3, 3))

  def test_valid_utf8_offsets_unchanged(self):
    """In-range offsets on multi-byte UTF-8 input produce expected lengths."""
    # U+2603 encodes to three bytes, so 'a' + U+2603 spans four bytes.
    encoded = 'a\u2603b'.encode('utf-8')
    self.assertEqual(4, _re2.CharLenToBytes(encoded, 0, 2))
    self.assertEqual(2, _re2.BytesToCharLen(encoded, 0, 4))


class Re2CompileTest(parameterized.TestCase):
"""Contains tests that apply to the re2 module only.

Expand Down