mindee · felixdittrich92 · Jul 9, 2025 · Jul 9, 2025 · Jul 9, 2025 · Jul 9, 2025
diff --git a/README.md b/README.md
@@ -2,7 +2,7 @@
   <img src="https://github.com/mindee/doctr/raw/main/docs/images/Logo_doctr.gif" width="40%">
 </p>
 
-[![Slack Icon](https://img.shields.io/badge/Slack-Community-4A154B?style=flat-square&logo=slack&logoColor=white)](https://slack.mindee.com) [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](LICENSE) ![Build Status](https://github.com/mindee/doctr/workflows/builds/badge.svg) [![Docker Images](https://img.shields.io/badge/Docker-4287f5?style=flat&logo=docker&logoColor=white)](https://github.com/mindee/doctr/pkgs/container/doctr) [![codecov](https://codecov.io/gh/mindee/doctr/branch/main/graph/badge.svg?token=577MO567NM)](https://codecov.io/gh/mindee/doctr) [![CodeFactor](https://www.codefactor.io/repository/github/mindee/doctr/badge?s=bae07db86bb079ce9d6542315b8c6e70fa708a7e)](https://www.codefactor.io/repository/github/mindee/doctr) [![Codacy Badge](https://api.codacy.com/project/badge/Grade/340a76749b634586a498e1c0ab998f08)](https://app.codacy.com/gh/mindee/doctr?utm_source=github.com&utm_medium=referral&utm_content=mindee/doctr&utm_campaign=Badge_Grade) [![Doc Status](https://github.com/mindee/doctr/workflows/doc-status/badge.svg)](https://mindee.github.io/doctr) [![Pypi](https://img.shields.io/badge/pypi-v0.12.0-blue.svg)](https://pypi.org/project/python-doctr/) [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/mindee/doctr) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mindee/notebooks/blob/main/doctr/quicktour.ipynb) [![Gurubase](https://img.shields.io/badge/Gurubase-Ask%20docTR%20Guru-006BFF)](https://gurubase.io/g/doctr)
+[![Slack Icon](https://img.shields.io/badge/Slack-Community-4A154B?style=flat-square&logo=slack&logoColor=white)](https://slack.mindee.com) [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](LICENSE) ![Build Status](https://github.com/mindee/doctr/workflows/builds/badge.svg) [![Docker Images](https://img.shields.io/badge/Docker-4287f5?style=flat&logo=docker&logoColor=white)](https://github.com/mindee/doctr/pkgs/container/doctr) [![codecov](https://codecov.io/gh/mindee/doctr/branch/main/graph/badge.svg?token=577MO567NM)](https://codecov.io/gh/mindee/doctr) [![CodeFactor](https://www.codefactor.io/repository/github/mindee/doctr/badge?s=bae07db86bb079ce9d6542315b8c6e70fa708a7e)](https://www.codefactor.io/repository/github/mindee/doctr) [![Codacy Badge](https://api.codacy.com/project/badge/Grade/340a76749b634586a498e1c0ab998f08)](https://app.codacy.com/gh/mindee/doctr?utm_source=github.com&utm_medium=referral&utm_content=mindee/doctr&utm_campaign=Badge_Grade) [![Doc Status](https://github.com/mindee/doctr/workflows/doc-status/badge.svg)](https://mindee.github.io/doctr) [![Pypi](https://img.shields.io/badge/pypi-v1.0.0-blue.svg)](https://pypi.org/project/python-doctr/) [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/mindee/doctr) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mindee/notebooks/blob/main/doctr/quicktour.ipynb) [![Gurubase](https://img.shields.io/badge/Gurubase-Ask%20docTR%20Guru-006BFF)](https://gurubase.io/g/doctr)
 
 
 **Optical Character Recognition made seamless & accessible to anyone, powered by PyTorch**

diff --git a/api/pyproject.toml b/api/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.masonry.api"
 
 [tool.poetry]
 name = "doctr-api"
-version = "0.12.1a0"
+version = "1.0.0a0"
 description = "Backend template for your OCR API with docTR"
 authors = ["Mindee <contact@mindee.com>"]
 license = "Apache-2.0"

diff --git a/doctr/models/preprocessor/pytorch.py b/doctr/models/preprocessor/pytorch.py
@@ -60,24 +60,21 @@ def batch_inputs(self, samples: list[torch.Tensor]) -> list[torch.Tensor]:
 
         return batches
 
-    def sample_transforms(self, x: np.ndarray | torch.Tensor) -> torch.Tensor:
+    def sample_transforms(self, x: np.ndarray) -> torch.Tensor:
         if x.ndim != 3:
             raise AssertionError("expected list of 3D Tensors")
-        if isinstance(x, np.ndarray):
-            if x.dtype not in (np.uint8, np.float32, np.float16):
-                raise TypeError("unsupported data type for numpy.ndarray")
-            x = torch.from_numpy(x.copy()).permute(2, 0, 1)
-        elif x.dtype not in (torch.uint8, torch.float16, torch.float32):
-            raise TypeError("unsupported data type for torch.Tensor")
+        if x.dtype not in (np.uint8, np.float32, np.float16):
+            raise TypeError("unsupported data type for numpy.ndarray")
+        tensor = torch.from_numpy(x.copy()).permute(2, 0, 1)
         # Resizing
-        x = self.resize(x)
+        tensor = self.resize(tensor)
         # Data type
-        if x.dtype == torch.uint8:
-            x = x.to(dtype=torch.float32).div(255).clip(0, 1)  # type: ignore[union-attr]
+        if tensor.dtype == torch.uint8:
+            tensor = tensor.to(dtype=torch.float32).div(255).clip(0, 1)
         else:
-            x = x.to(dtype=torch.float32)  # type: ignore[union-attr]
+            tensor = tensor.to(dtype=torch.float32)
 
-        return x
+        return tensor
 
     def __call__(self, x: np.ndarray | list[np.ndarray]) -> list[torch.Tensor]:
         """Prepare document data for model forwarding
@@ -94,29 +91,29 @@ def __call__(self, x: np.ndarray | list[np.ndarray]) -> list[torch.Tensor]:
                 raise AssertionError("expected 4D Tensor")
             if x.dtype not in (np.uint8, np.float32, np.float16):
                 raise TypeError("unsupported data type for numpy.ndarray")
-            x = torch.from_numpy(x.copy()).permute(0, 3, 1, 2)  # type: ignore[assignment]
+            tensor = torch.from_numpy(x.copy()).permute(0, 3, 1, 2)
 
             # Resizing
-            if x.shape[-2] != self.resize.size[0] or x.shape[-1] != self.resize.size[1]:
-                x = F.resize(
-                    x, self.resize.size, interpolation=self.resize.interpolation, antialias=self.resize.antialias
+            if tensor.shape[-2] != self.resize.size[0] or tensor.shape[-1] != self.resize.size[1]:
+                tensor = F.resize(
+                    tensor, self.resize.size, interpolation=self.resize.interpolation, antialias=self.resize.antialias
                 )
             # Data type
-            if x.dtype == torch.uint8:  # type: ignore[union-attr]
-                x = x.to(dtype=torch.float32).div(255).clip(0, 1)  # type: ignore[union-attr]
+            if tensor.dtype == torch.uint8:
+                tensor = tensor.to(dtype=torch.float32).div(255).clip(0, 1)
             else:
-                x = x.to(dtype=torch.float32)  # type: ignore[union-attr]
-            batches = [x]
+                tensor = tensor.to(dtype=torch.float32)
+            batches = [tensor]
 
         elif isinstance(x, list) and all(isinstance(sample, np.ndarray) for sample in x):
             # Sample transform (to tensor, resize)
             samples = list(multithread_exec(self.sample_transforms, x))
             # Batching
-            batches = self.batch_inputs(samples)  # type: ignore[assignment]
+            batches = self.batch_inputs(samples)
         else:
             raise TypeError(f"invalid input type: {type(x)}")
 
         # Batch transforms (normalize)
         batches = list(multithread_exec(self.normalize, batches))
 
-        return batches  # type: ignore[return-value]
+        return batches
diff --git a/doctr/transforms/modules/pytorch.py b/doctr/transforms/modules/pytorch.py
@@ -27,7 +27,21 @@
 
 
 class Resize(T.Resize):
-    """Resize the input image to the given size"""
+    """Resize the input image to the given size
+
+    >>> import torch
+    >>> from doctr.transforms import Resize
+    >>> transfo = Resize((64, 64), preserve_aspect_ratio=True, symmetric_pad=True)
+    >>> out = transfo(torch.rand((3, 64, 64)))
+
+    Args:
+        size: output size in pixels, either a tuple (height, width) or a single integer for square images
+        interpolation: interpolation mode to use for resizing, default is bilinear
+        preserve_aspect_ratio: whether to preserve the aspect ratio of the image,
+            if True, the image will be resized to fit within the target size while maintaining its aspect ratio
+        symmetric_pad: whether to symmetrically pad the image to the target size,
+            if True, the image will be padded equally on both sides to fit the target size
+    """
 
     def __init__(
         self,
@@ -36,42 +50,30 @@ def __init__(
         preserve_aspect_ratio: bool = False,
         symmetric_pad: bool = False,
     ) -> None:
-        super().__init__(size, interpolation, antialias=True)
+        super().__init__(size if isinstance(size, (list, tuple)) else (size, size), interpolation, antialias=True)
         self.preserve_aspect_ratio = preserve_aspect_ratio
         self.symmetric_pad = symmetric_pad
 
-        if not isinstance(self.size, (int, tuple, list)):
-            raise AssertionError("size should be either a tuple, a list or an int")
-
     def forward(
         self,
         img: torch.Tensor,
         target: np.ndarray | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, np.ndarray]:
-        if isinstance(self.size, int):
-            target_ratio = img.shape[-2] / img.shape[-1]
-        else:
-            target_ratio = self.size[0] / self.size[1]
+        target_ratio = self.size[0] / self.size[1]
         actual_ratio = img.shape[-2] / img.shape[-1]
 
-        if not self.preserve_aspect_ratio or (target_ratio == actual_ratio and (isinstance(self.size, (tuple, list)))):
+        if not self.preserve_aspect_ratio or (target_ratio == actual_ratio):
             # If we don't preserve the aspect ratio or the wanted aspect ratio is the same than the original one
             # We can use with the regular resize
             if target is not None:
                 return super().forward(img), target
             return super().forward(img)
         else:
             # Resize
-            if isinstance(self.size, (tuple, list)):
-                if actual_ratio > target_ratio:
-                    tmp_size = (self.size[0], max(int(self.size[0] / actual_ratio), 1))
-                else:
-                    tmp_size = (max(int(self.size[1] * actual_ratio), 1), self.size[1])
-            elif isinstance(self.size, int):  # self.size is the longest side, infer the other
-                if img.shape[-2] <= img.shape[-1]:
-                    tmp_size = (max(int(self.size * actual_ratio), 1), self.size)
-                else:
-                    tmp_size = (self.size, max(int(self.size / actual_ratio), 1))
+            if actual_ratio > target_ratio:
+                tmp_size = (self.size[0], max(int(self.size[0] / actual_ratio), 1))
+            else:
+                tmp_size = (max(int(self.size[1] * actual_ratio), 1), self.size[1])
 
             # Scale image
             img = F.resize(img, tmp_size, self.interpolation, antialias=True)
@@ -93,14 +95,14 @@ def forward(
                 if self.preserve_aspect_ratio:
                     # Get absolute coords
                     if target.shape[1:] == (4,):
-                        if isinstance(self.size, (tuple, list)) and self.symmetric_pad:
+                        if self.symmetric_pad:
                             target[:, [0, 2]] = offset[0] + target[:, [0, 2]] * raw_shape[-1] / img.shape[-1]
                             target[:, [1, 3]] = offset[1] + target[:, [1, 3]] * raw_shape[-2] / img.shape[-2]
                         else:
                             target[:, [0, 2]] *= raw_shape[-1] / img.shape[-1]
                             target[:, [1, 3]] *= raw_shape[-2] / img.shape[-2]
                     elif target.shape[1:] == (4, 2):
-                        if isinstance(self.size, (tuple, list)) and self.symmetric_pad:
+                        if self.symmetric_pad:
                             target[..., 0] = offset[0] + target[..., 0] * raw_shape[-1] / img.shape[-1]
                             target[..., 1] = offset[1] + target[..., 1] * raw_shape[-2] / img.shape[-2]
                         else:

diff --git a/tests/pytorch/test_transforms_pt.py b/tests/pytorch/test_transforms_pt.py
@@ -27,60 +27,77 @@ def test_resize():
 
     assert torch.all(out == 1)
     assert out.shape[-2:] == output_size
-    assert repr(transfo) == f"Resize(output_size={output_size}, interpolation='bilinear')"
+    assert repr(transfo) == "Resize(output_size=(32, 32), interpolation='bilinear')"
 
-    transfo = Resize(output_size, preserve_aspect_ratio=True)
+    # Test with preserve_aspect_ratio
+    output_size = (32, 32)
     input_t = torch.ones((3, 32, 64), dtype=torch.float32)
-    out = transfo(input_t)
 
+    # Asymmetric padding
+    transfo = Resize(output_size, preserve_aspect_ratio=True)
+    out = transfo(input_t)
     assert out.shape[-2:] == output_size
     assert not torch.all(out == 1)
-    # Asymetric padding
     assert torch.all(out[:, -1] == 0) and torch.all(out[:, 0] == 1)
 
-    # Symetric padding
-    transfo = Resize(output_size, preserve_aspect_ratio=True, symmetric_pad=True)
-    assert repr(transfo) == (
-        f"Resize(output_size={output_size}, interpolation='bilinear', preserve_aspect_ratio=True, symmetric_pad=True)"
-    )
+    # Symmetric padding
+    transfo = Resize(32, preserve_aspect_ratio=True, symmetric_pad=True)
     out = transfo(input_t)
     assert out.shape[-2:] == output_size
-    # symetric padding
-    assert torch.all(out[:, -1] == 0) and torch.all(out[:, 0] == 0)
+    assert torch.all(out[:, 0] == 0) and torch.all(out[:, -1] == 0)
 
-    # Inverse aspect ratio
+    expected = "Resize(output_size=(32, 32), interpolation='bilinear', preserve_aspect_ratio=True, symmetric_pad=True)"
+    assert repr(transfo) == expected
+
+    # Test with inverse aspect ratio (portrait input, height > width)
     input_t = torch.ones((3, 64, 32), dtype=torch.float32)
+    transfo = Resize(32, preserve_aspect_ratio=True, symmetric_pad=True)
     out = transfo(input_t)
+    assert out.shape[-2:] == (32, 32)
 
-    assert not torch.all(out == 1)
-    assert out.shape[-2:] == output_size
-
-    # Same aspect ratio
-    output_size = (32, 128)
-    transfo = Resize(output_size, preserve_aspect_ratio=True)
+    # Test resize with same ratio
+    transfo = Resize((32, 128), preserve_aspect_ratio=True)
     out = transfo(torch.ones((3, 16, 64), dtype=torch.float32))
-    assert out.shape[-2:] == output_size
+    assert out.shape[-2:] == (32, 128)
 
-    # FP16
+    # Test with fp16 input
+    transfo = Resize((32, 128), preserve_aspect_ratio=True)
     input_t = torch.ones((3, 64, 64), dtype=torch.float16)
     out = transfo(input_t)
     assert out.dtype == torch.float16
 
-    # --- Test with target (bounding boxes) ---
-
-    target_boxes = np.array([[0.1, 0.1, 0.9, 0.9], [0.2, 0.2, 0.8, 0.8]])
-    output_size = (64, 64)
-
-    transfo = Resize(output_size, preserve_aspect_ratio=True)
+    padding = [True, False]
+    for symmetric_pad in padding:
+        # Test with target boxes
+        target_boxes = np.array([[0.1, 0.1, 0.3, 0.4], [0.2, 0.2, 0.8, 0.8]])
+        transfo = Resize((64, 64), preserve_aspect_ratio=True, symmetric_pad=symmetric_pad)
+        input_t = torch.ones((3, 32, 64), dtype=torch.float32)
+        out, new_target = transfo(input_t, target_boxes)
+
+        assert out.shape[-2:] == (64, 64)
+        assert new_target.shape == target_boxes.shape
+        assert np.all((0 <= new_target) & (new_target <= 1))
+
+        # Test with target polygons
+        target_boxes = np.array([
+            [[0.1, 0.1], [0.9, 0.1], [0.9, 0.9], [0.1, 0.9]],
+            [[0.2, 0.2], [0.8, 0.2], [0.8, 0.8], [0.2, 0.8]],
+        ])
+        transfo = Resize((64, 64), preserve_aspect_ratio=True, symmetric_pad=symmetric_pad)
+        input_t = torch.ones((3, 32, 64), dtype=torch.float32)
+        out, new_target = transfo(input_t, target_boxes)
+
+        assert out.shape[-2:] == (64, 64)
+        assert new_target.shape == target_boxes.shape
+        assert np.all((0 <= new_target) & (new_target <= 1))
+
+    # Test with invalid target shape
     input_t = torch.ones((3, 32, 64), dtype=torch.float32)
-    out, new_target = transfo(input_t, target_boxes)
+    target = np.ones((2, 5))  # Invalid shape
 
-    assert out.shape[-2:] == output_size
-    assert new_target.shape == target_boxes.shape
-    assert np.all(new_target >= 0) and np.all(new_target <= 1)
-
-    out = transfo(input_t)
-    assert out.shape[-2:] == output_size
+    transfo = Resize((64, 64), preserve_aspect_ratio=True)
+    with pytest.raises(AssertionError):
+        transfo(input_t, target)
 
 
 @pytest.mark.parametrize(