diff --git a/README.md b/README.md
index 77090f61ad..795b0692ac 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
-[](https://slack.mindee.com) [](LICENSE)  [](https://github.com/mindee/doctr/pkgs/container/doctr) [](https://codecov.io/gh/mindee/doctr) [](https://www.codefactor.io/repository/github/mindee/doctr) [](https://app.codacy.com/gh/mindee/doctr?utm_source=github.com&utm_medium=referral&utm_content=mindee/doctr&utm_campaign=Badge_Grade) [](https://mindee.github.io/doctr) [](https://pypi.org/project/python-doctr/) [](https://huggingface.co/spaces/mindee/doctr) [](https://colab.research.google.com/github/mindee/notebooks/blob/main/doctr/quicktour.ipynb) [](https://gurubase.io/g/doctr)
+[](https://slack.mindee.com) [](LICENSE)  [](https://github.com/mindee/doctr/pkgs/container/doctr) [](https://codecov.io/gh/mindee/doctr) [](https://www.codefactor.io/repository/github/mindee/doctr) [](https://app.codacy.com/gh/mindee/doctr?utm_source=github.com&utm_medium=referral&utm_content=mindee/doctr&utm_campaign=Badge_Grade) [](https://mindee.github.io/doctr) [](https://pypi.org/project/python-doctr/) [](https://huggingface.co/spaces/mindee/doctr) [](https://colab.research.google.com/github/mindee/notebooks/blob/main/doctr/quicktour.ipynb) [](https://gurubase.io/g/doctr)
**Optical Character Recognition made seamless & accessible to anyone, powered by PyTorch**
diff --git a/api/pyproject.toml b/api/pyproject.toml
index d4783eb2d9..f6829be556 100644
--- a/api/pyproject.toml
+++ b/api/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.masonry.api"
[tool.poetry]
name = "doctr-api"
-version = "0.12.1a0"
+version = "1.0.0a0"
description = "Backend template for your OCR API with docTR"
authors = ["Mindee "]
license = "Apache-2.0"
diff --git a/doctr/models/preprocessor/pytorch.py b/doctr/models/preprocessor/pytorch.py
index a8b3f3924d..fb2cc005d7 100644
--- a/doctr/models/preprocessor/pytorch.py
+++ b/doctr/models/preprocessor/pytorch.py
@@ -60,24 +60,21 @@ def batch_inputs(self, samples: list[torch.Tensor]) -> list[torch.Tensor]:
return batches
- def sample_transforms(self, x: np.ndarray | torch.Tensor) -> torch.Tensor:
+ def sample_transforms(self, x: np.ndarray) -> torch.Tensor:
if x.ndim != 3:
raise AssertionError("expected list of 3D Tensors")
- if isinstance(x, np.ndarray):
- if x.dtype not in (np.uint8, np.float32, np.float16):
- raise TypeError("unsupported data type for numpy.ndarray")
- x = torch.from_numpy(x.copy()).permute(2, 0, 1)
- elif x.dtype not in (torch.uint8, torch.float16, torch.float32):
- raise TypeError("unsupported data type for torch.Tensor")
+ if x.dtype not in (np.uint8, np.float32, np.float16):
+ raise TypeError("unsupported data type for numpy.ndarray")
+ tensor = torch.from_numpy(x.copy()).permute(2, 0, 1)
# Resizing
- x = self.resize(x)
+ tensor = self.resize(tensor)
# Data type
- if x.dtype == torch.uint8:
- x = x.to(dtype=torch.float32).div(255).clip(0, 1) # type: ignore[union-attr]
+ if tensor.dtype == torch.uint8:
+ tensor = tensor.to(dtype=torch.float32).div(255).clip(0, 1)
else:
- x = x.to(dtype=torch.float32) # type: ignore[union-attr]
+ tensor = tensor.to(dtype=torch.float32)
- return x
+ return tensor
def __call__(self, x: np.ndarray | list[np.ndarray]) -> list[torch.Tensor]:
"""Prepare document data for model forwarding
@@ -94,29 +91,29 @@ def __call__(self, x: np.ndarray | list[np.ndarray]) -> list[torch.Tensor]:
raise AssertionError("expected 4D Tensor")
if x.dtype not in (np.uint8, np.float32, np.float16):
raise TypeError("unsupported data type for numpy.ndarray")
- x = torch.from_numpy(x.copy()).permute(0, 3, 1, 2) # type: ignore[assignment]
+ tensor = torch.from_numpy(x.copy()).permute(0, 3, 1, 2)
# Resizing
- if x.shape[-2] != self.resize.size[0] or x.shape[-1] != self.resize.size[1]:
- x = F.resize(
- x, self.resize.size, interpolation=self.resize.interpolation, antialias=self.resize.antialias
+ if tensor.shape[-2] != self.resize.size[0] or tensor.shape[-1] != self.resize.size[1]:
+ tensor = F.resize(
+ tensor, self.resize.size, interpolation=self.resize.interpolation, antialias=self.resize.antialias
)
# Data type
- if x.dtype == torch.uint8: # type: ignore[union-attr]
- x = x.to(dtype=torch.float32).div(255).clip(0, 1) # type: ignore[union-attr]
+ if tensor.dtype == torch.uint8:
+ tensor = tensor.to(dtype=torch.float32).div(255).clip(0, 1)
else:
- x = x.to(dtype=torch.float32) # type: ignore[union-attr]
- batches = [x]
+ tensor = tensor.to(dtype=torch.float32)
+ batches = [tensor]
elif isinstance(x, list) and all(isinstance(sample, np.ndarray) for sample in x):
# Sample transform (to tensor, resize)
samples = list(multithread_exec(self.sample_transforms, x))
# Batching
- batches = self.batch_inputs(samples) # type: ignore[assignment]
+ batches = self.batch_inputs(samples)
else:
raise TypeError(f"invalid input type: {type(x)}")
# Batch transforms (normalize)
batches = list(multithread_exec(self.normalize, batches))
- return batches # type: ignore[return-value]
+ return batches
diff --git a/doctr/transforms/modules/pytorch.py b/doctr/transforms/modules/pytorch.py
index 9a54e16b1f..d9f0a07bb0 100644
--- a/doctr/transforms/modules/pytorch.py
+++ b/doctr/transforms/modules/pytorch.py
@@ -27,7 +27,21 @@
class Resize(T.Resize):
- """Resize the input image to the given size"""
+ """Resize the input image to the given size
+
+ >>> import torch
+ >>> from doctr.transforms import Resize
+ >>> transfo = Resize((64, 64), preserve_aspect_ratio=True, symmetric_pad=True)
+ >>> out = transfo(torch.rand((3, 64, 64)))
+
+ Args:
+ size: output size in pixels, either a tuple (height, width) or a single integer for square images
+ interpolation: interpolation mode to use for resizing, default is bilinear
+ preserve_aspect_ratio: whether to preserve the aspect ratio of the image;
+ if True, the image is resized to fit within the target size while keeping its aspect ratio
+ symmetric_pad: whether to symmetrically pad the image to the target size;
+ if True, the image is padded equally on both sides to fit the target size
+ """
def __init__(
self,
@@ -36,25 +50,19 @@ def __init__(
preserve_aspect_ratio: bool = False,
symmetric_pad: bool = False,
) -> None:
- super().__init__(size, interpolation, antialias=True)
+ super().__init__(size if isinstance(size, (list, tuple)) else (size, size), interpolation, antialias=True)
self.preserve_aspect_ratio = preserve_aspect_ratio
self.symmetric_pad = symmetric_pad
- if not isinstance(self.size, (int, tuple, list)):
- raise AssertionError("size should be either a tuple, a list or an int")
-
def forward(
self,
img: torch.Tensor,
target: np.ndarray | None = None,
) -> torch.Tensor | tuple[torch.Tensor, np.ndarray]:
- if isinstance(self.size, int):
- target_ratio = img.shape[-2] / img.shape[-1]
- else:
- target_ratio = self.size[0] / self.size[1]
+ target_ratio = self.size[0] / self.size[1]
actual_ratio = img.shape[-2] / img.shape[-1]
- if not self.preserve_aspect_ratio or (target_ratio == actual_ratio and (isinstance(self.size, (tuple, list)))):
+ if not self.preserve_aspect_ratio or (target_ratio == actual_ratio):
# If we don't preserve the aspect ratio or the wanted aspect ratio is the same than the original one
# We can use with the regular resize
if target is not None:
@@ -62,16 +70,10 @@ def forward(
return super().forward(img)
else:
# Resize
- if isinstance(self.size, (tuple, list)):
- if actual_ratio > target_ratio:
- tmp_size = (self.size[0], max(int(self.size[0] / actual_ratio), 1))
- else:
- tmp_size = (max(int(self.size[1] * actual_ratio), 1), self.size[1])
- elif isinstance(self.size, int): # self.size is the longest side, infer the other
- if img.shape[-2] <= img.shape[-1]:
- tmp_size = (max(int(self.size * actual_ratio), 1), self.size)
- else:
- tmp_size = (self.size, max(int(self.size / actual_ratio), 1))
+ if actual_ratio > target_ratio:
+ tmp_size = (self.size[0], max(int(self.size[0] / actual_ratio), 1))
+ else:
+ tmp_size = (max(int(self.size[1] * actual_ratio), 1), self.size[1])
# Scale image
img = F.resize(img, tmp_size, self.interpolation, antialias=True)
@@ -93,14 +95,14 @@ def forward(
if self.preserve_aspect_ratio:
# Get absolute coords
if target.shape[1:] == (4,):
- if isinstance(self.size, (tuple, list)) and self.symmetric_pad:
+ if self.symmetric_pad:
target[:, [0, 2]] = offset[0] + target[:, [0, 2]] * raw_shape[-1] / img.shape[-1]
target[:, [1, 3]] = offset[1] + target[:, [1, 3]] * raw_shape[-2] / img.shape[-2]
else:
target[:, [0, 2]] *= raw_shape[-1] / img.shape[-1]
target[:, [1, 3]] *= raw_shape[-2] / img.shape[-2]
elif target.shape[1:] == (4, 2):
- if isinstance(self.size, (tuple, list)) and self.symmetric_pad:
+ if self.symmetric_pad:
target[..., 0] = offset[0] + target[..., 0] * raw_shape[-1] / img.shape[-1]
target[..., 1] = offset[1] + target[..., 1] * raw_shape[-2] / img.shape[-2]
else:
diff --git a/tests/pytorch/test_transforms_pt.py b/tests/pytorch/test_transforms_pt.py
index 8ed38d846b..b29f88e119 100644
--- a/tests/pytorch/test_transforms_pt.py
+++ b/tests/pytorch/test_transforms_pt.py
@@ -27,60 +27,77 @@ def test_resize():
assert torch.all(out == 1)
assert out.shape[-2:] == output_size
- assert repr(transfo) == f"Resize(output_size={output_size}, interpolation='bilinear')"
+ assert repr(transfo) == "Resize(output_size=(32, 32), interpolation='bilinear')"
- transfo = Resize(output_size, preserve_aspect_ratio=True)
+ # Test with preserve_aspect_ratio
+ output_size = (32, 32)
input_t = torch.ones((3, 32, 64), dtype=torch.float32)
- out = transfo(input_t)
+ # Asymmetric padding
+ transfo = Resize(output_size, preserve_aspect_ratio=True)
+ out = transfo(input_t)
assert out.shape[-2:] == output_size
assert not torch.all(out == 1)
- # Asymetric padding
assert torch.all(out[:, -1] == 0) and torch.all(out[:, 0] == 1)
- # Symetric padding
- transfo = Resize(output_size, preserve_aspect_ratio=True, symmetric_pad=True)
- assert repr(transfo) == (
- f"Resize(output_size={output_size}, interpolation='bilinear', preserve_aspect_ratio=True, symmetric_pad=True)"
- )
+ # Symmetric padding
+ transfo = Resize(32, preserve_aspect_ratio=True, symmetric_pad=True)
out = transfo(input_t)
assert out.shape[-2:] == output_size
- # symetric padding
- assert torch.all(out[:, -1] == 0) and torch.all(out[:, 0] == 0)
+ assert torch.all(out[:, 0] == 0) and torch.all(out[:, -1] == 0)
- # Inverse aspect ratio
+ expected = "Resize(output_size=(32, 32), interpolation='bilinear', preserve_aspect_ratio=True, symmetric_pad=True)"
+ assert repr(transfo) == expected
+
+ # Test with inverse resize
input_t = torch.ones((3, 64, 32), dtype=torch.float32)
+ transfo = Resize(32, preserve_aspect_ratio=True, symmetric_pad=True)
out = transfo(input_t)
+ assert out.shape[-2:] == (32, 32)
- assert not torch.all(out == 1)
- assert out.shape[-2:] == output_size
-
- # Same aspect ratio
- output_size = (32, 128)
- transfo = Resize(output_size, preserve_aspect_ratio=True)
+ # Test resize with same ratio
+ transfo = Resize((32, 128), preserve_aspect_ratio=True)
out = transfo(torch.ones((3, 16, 64), dtype=torch.float32))
- assert out.shape[-2:] == output_size
+ assert out.shape[-2:] == (32, 128)
- # FP16
+ # Test with fp16 input
+ transfo = Resize((32, 128), preserve_aspect_ratio=True)
input_t = torch.ones((3, 64, 64), dtype=torch.float16)
out = transfo(input_t)
assert out.dtype == torch.float16
- # --- Test with target (bounding boxes) ---
-
- target_boxes = np.array([[0.1, 0.1, 0.9, 0.9], [0.2, 0.2, 0.8, 0.8]])
- output_size = (64, 64)
-
- transfo = Resize(output_size, preserve_aspect_ratio=True)
+ padding = [True, False]
+ for symmetric_pad in padding:
+ # Test with target boxes
+ target_boxes = np.array([[0.1, 0.1, 0.3, 0.4], [0.2, 0.2, 0.8, 0.8]])
+ transfo = Resize((64, 64), preserve_aspect_ratio=True, symmetric_pad=symmetric_pad)
+ input_t = torch.ones((3, 32, 64), dtype=torch.float32)
+ out, new_target = transfo(input_t, target_boxes)
+
+ assert out.shape[-2:] == (64, 64)
+ assert new_target.shape == target_boxes.shape
+ assert np.all((0 <= new_target) & (new_target <= 1))
+
+ # Test with target polygons
+ target_boxes = np.array([
+ [[0.1, 0.1], [0.9, 0.1], [0.9, 0.9], [0.1, 0.9]],
+ [[0.2, 0.2], [0.8, 0.2], [0.8, 0.8], [0.2, 0.8]],
+ ])
+ transfo = Resize((64, 64), preserve_aspect_ratio=True, symmetric_pad=symmetric_pad)
+ input_t = torch.ones((3, 32, 64), dtype=torch.float32)
+ out, new_target = transfo(input_t, target_boxes)
+
+ assert out.shape[-2:] == (64, 64)
+ assert new_target.shape == target_boxes.shape
+ assert np.all((0 <= new_target) & (new_target <= 1))
+
+ # Test with invalid target shape
input_t = torch.ones((3, 32, 64), dtype=torch.float32)
- out, new_target = transfo(input_t, target_boxes)
+ target = np.ones((2, 5)) # Invalid shape
- assert out.shape[-2:] == output_size
- assert new_target.shape == target_boxes.shape
- assert np.all(new_target >= 0) and np.all(new_target <= 1)
-
- out = transfo(input_t)
- assert out.shape[-2:] == output_size
+ transfo = Resize((64, 64), preserve_aspect_ratio=True)
+ with pytest.raises(AssertionError):
+ transfo(input_t, target)
@pytest.mark.parametrize(