diff --git a/README.md b/README.md index 77090f61ad..795b0692ac 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@

-[![Slack Icon](https://img.shields.io/badge/Slack-Community-4A154B?style=flat-square&logo=slack&logoColor=white)](https://slack.mindee.com) [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](LICENSE) ![Build Status](https://github.com/mindee/doctr/workflows/builds/badge.svg) [![Docker Images](https://img.shields.io/badge/Docker-4287f5?style=flat&logo=docker&logoColor=white)](https://github.com/mindee/doctr/pkgs/container/doctr) [![codecov](https://codecov.io/gh/mindee/doctr/branch/main/graph/badge.svg?token=577MO567NM)](https://codecov.io/gh/mindee/doctr) [![CodeFactor](https://www.codefactor.io/repository/github/mindee/doctr/badge?s=bae07db86bb079ce9d6542315b8c6e70fa708a7e)](https://www.codefactor.io/repository/github/mindee/doctr) [![Codacy Badge](https://api.codacy.com/project/badge/Grade/340a76749b634586a498e1c0ab998f08)](https://app.codacy.com/gh/mindee/doctr?utm_source=github.com&utm_medium=referral&utm_content=mindee/doctr&utm_campaign=Badge_Grade) [![Doc Status](https://github.com/mindee/doctr/workflows/doc-status/badge.svg)](https://mindee.github.io/doctr) [![Pypi](https://img.shields.io/badge/pypi-v0.12.0-blue.svg)](https://pypi.org/project/python-doctr/) [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/mindee/doctr) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mindee/notebooks/blob/main/doctr/quicktour.ipynb) [![Gurubase](https://img.shields.io/badge/Gurubase-Ask%20docTR%20Guru-006BFF)](https://gurubase.io/g/doctr) +[![Slack Icon](https://img.shields.io/badge/Slack-Community-4A154B?style=flat-square&logo=slack&logoColor=white)](https://slack.mindee.com) [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](LICENSE) ![Build Status](https://github.com/mindee/doctr/workflows/builds/badge.svg) [![Docker Images](https://img.shields.io/badge/Docker-4287f5?style=flat&logo=docker&logoColor=white)](https://github.com/mindee/doctr/pkgs/container/doctr) [![codecov](https://codecov.io/gh/mindee/doctr/branch/main/graph/badge.svg?token=577MO567NM)](https://codecov.io/gh/mindee/doctr) [![CodeFactor](https://www.codefactor.io/repository/github/mindee/doctr/badge?s=bae07db86bb079ce9d6542315b8c6e70fa708a7e)](https://www.codefactor.io/repository/github/mindee/doctr) [![Codacy Badge](https://api.codacy.com/project/badge/Grade/340a76749b634586a498e1c0ab998f08)](https://app.codacy.com/gh/mindee/doctr?utm_source=github.com&utm_medium=referral&utm_content=mindee/doctr&utm_campaign=Badge_Grade) [![Doc Status](https://github.com/mindee/doctr/workflows/doc-status/badge.svg)](https://mindee.github.io/doctr) [![Pypi](https://img.shields.io/badge/pypi-v1.0.0-blue.svg)](https://pypi.org/project/python-doctr/) [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/mindee/doctr) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mindee/notebooks/blob/main/doctr/quicktour.ipynb) [![Gurubase](https://img.shields.io/badge/Gurubase-Ask%20docTR%20Guru-006BFF)](https://gurubase.io/g/doctr) **Optical Character Recognition made seamless & accessible to anyone, powered by PyTorch** diff --git a/api/pyproject.toml b/api/pyproject.toml index d4783eb2d9..f6829be556 100644 --- a/api/pyproject.toml +++ b/api/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "poetry.masonry.api" [tool.poetry] name = "doctr-api" -version = "0.12.1a0" +version = "1.0.0a0" description = "Backend template for your OCR API with docTR" authors = ["Mindee "] license = "Apache-2.0" diff --git a/doctr/models/preprocessor/pytorch.py b/doctr/models/preprocessor/pytorch.py index a8b3f3924d..fb2cc005d7 100644 --- a/doctr/models/preprocessor/pytorch.py +++ b/doctr/models/preprocessor/pytorch.py @@ -60,24 +60,21 @@ def batch_inputs(self, samples: list[torch.Tensor]) -> list[torch.Tensor]: return batches - def sample_transforms(self, x: np.ndarray | torch.Tensor) -> torch.Tensor: + def sample_transforms(self, x: np.ndarray) -> torch.Tensor: if x.ndim != 3: raise AssertionError("expected list of 3D Tensors") - if isinstance(x, np.ndarray): - if x.dtype not in (np.uint8, np.float32, np.float16): - raise TypeError("unsupported data type for numpy.ndarray") - x = torch.from_numpy(x.copy()).permute(2, 0, 1) - elif x.dtype not in (torch.uint8, torch.float16, torch.float32): - raise TypeError("unsupported data type for torch.Tensor") + if x.dtype not in (np.uint8, np.float32, np.float16): + raise TypeError("unsupported data type for numpy.ndarray") + tensor = torch.from_numpy(x.copy()).permute(2, 0, 1) # Resizing - x = self.resize(x) + tensor = self.resize(tensor) # Data type - if x.dtype == torch.uint8: - x = x.to(dtype=torch.float32).div(255).clip(0, 1) # type: ignore[union-attr] + if tensor.dtype == torch.uint8: + tensor = tensor.to(dtype=torch.float32).div(255).clip(0, 1) else: - x = x.to(dtype=torch.float32) # type: ignore[union-attr] + tensor = tensor.to(dtype=torch.float32) - return x + return tensor def __call__(self, x: np.ndarray | list[np.ndarray]) -> list[torch.Tensor]: """Prepare document data for model forwarding @@ -94,29 +91,29 @@ def __call__(self, x: np.ndarray | list[np.ndarray]) -> list[torch.Tensor]: raise AssertionError("expected 4D Tensor") if x.dtype not in (np.uint8, np.float32, np.float16): raise TypeError("unsupported data type for numpy.ndarray") - x = torch.from_numpy(x.copy()).permute(0, 3, 1, 2) # type: ignore[assignment] + tensor = torch.from_numpy(x.copy()).permute(0, 3, 1, 2) # Resizing - if x.shape[-2] != self.resize.size[0] or x.shape[-1] != self.resize.size[1]: - x = F.resize( - x, self.resize.size, interpolation=self.resize.interpolation, antialias=self.resize.antialias + if tensor.shape[-2] != self.resize.size[0] or tensor.shape[-1] != self.resize.size[1]: + tensor = F.resize( + tensor, self.resize.size, interpolation=self.resize.interpolation, antialias=self.resize.antialias ) # Data type - if x.dtype == torch.uint8: # type: ignore[union-attr] - x = x.to(dtype=torch.float32).div(255).clip(0, 1) # type: ignore[union-attr] + if tensor.dtype == torch.uint8: + tensor = tensor.to(dtype=torch.float32).div(255).clip(0, 1) else: - x = x.to(dtype=torch.float32) # type: ignore[union-attr] - batches = [x] + tensor = tensor.to(dtype=torch.float32) + batches = [tensor] elif isinstance(x, list) and all(isinstance(sample, np.ndarray) for sample in x): # Sample transform (to tensor, resize) samples = list(multithread_exec(self.sample_transforms, x)) # Batching - batches = self.batch_inputs(samples) # type: ignore[assignment] + batches = self.batch_inputs(samples) else: raise TypeError(f"invalid input type: {type(x)}") # Batch transforms (normalize) batches = list(multithread_exec(self.normalize, batches)) - return batches # type: ignore[return-value] + return batches diff --git a/doctr/transforms/modules/pytorch.py b/doctr/transforms/modules/pytorch.py index 9a54e16b1f..d9f0a07bb0 100644 --- a/doctr/transforms/modules/pytorch.py +++ b/doctr/transforms/modules/pytorch.py @@ -27,7 +27,21 @@ class Resize(T.Resize): - """Resize the input image to the given size""" + """Resize the input image to the given size + + >>> import torch + >>> from doctr.transforms import Resize + >>> transfo = Resize((64, 64), preserve_aspect_ratio=True, symmetric_pad=True) + >>> out = transfo(torch.rand((3, 64, 64))) + + Args: + size: output size in pixels, either a tuple (height, width) or a single integer for square images + interpolation: interpolation mode to use for resizing, default is bilinear + preserve_aspect_ratio: whether to preserve the aspect ratio of the image, + if True, the image will be resized to fit within the target size while maintaining its aspect ratio + symmetric_pad: whether to symmetrically pad the image to the target size, + if True, the image will be padded equally on both sides to fit the target size + """ def __init__( self, @@ -36,25 +50,19 @@ def __init__( preserve_aspect_ratio: bool = False, symmetric_pad: bool = False, ) -> None: - super().__init__(size, interpolation, antialias=True) + super().__init__(size if isinstance(size, (list, tuple)) else (size, size), interpolation, antialias=True) self.preserve_aspect_ratio = preserve_aspect_ratio self.symmetric_pad = symmetric_pad - if not isinstance(self.size, (int, tuple, list)): - raise AssertionError("size should be either a tuple, a list or an int") - def forward( self, img: torch.Tensor, target: np.ndarray | None = None, ) -> torch.Tensor | tuple[torch.Tensor, np.ndarray]: - if isinstance(self.size, int): - target_ratio = img.shape[-2] / img.shape[-1] - else: - target_ratio = self.size[0] / self.size[1] + target_ratio = self.size[0] / self.size[1] actual_ratio = img.shape[-2] / img.shape[-1] - if not self.preserve_aspect_ratio or (target_ratio == actual_ratio and (isinstance(self.size, (tuple, list)))): + if not self.preserve_aspect_ratio or (target_ratio == actual_ratio): # If we don't preserve the aspect ratio or the wanted aspect ratio is the same than the original one # We can use with the regular resize if target is not None: @@ -62,16 +70,10 @@ def forward( return super().forward(img) else: # Resize - if isinstance(self.size, (tuple, list)): - if actual_ratio > target_ratio: - tmp_size = (self.size[0], max(int(self.size[0] / actual_ratio), 1)) - else: - tmp_size = (max(int(self.size[1] * actual_ratio), 1), self.size[1]) - elif isinstance(self.size, int): # self.size is the longest side, infer the other - if img.shape[-2] <= img.shape[-1]: - tmp_size = (max(int(self.size * actual_ratio), 1), self.size) - else: - tmp_size = (self.size, max(int(self.size / actual_ratio), 1)) + if actual_ratio > target_ratio: + tmp_size = (self.size[0], max(int(self.size[0] / actual_ratio), 1)) + else: + tmp_size = (max(int(self.size[1] * actual_ratio), 1), self.size[1]) # Scale image img = F.resize(img, tmp_size, self.interpolation, antialias=True) @@ -93,14 +95,14 @@ def forward( if self.preserve_aspect_ratio: # Get absolute coords if target.shape[1:] == (4,): - if isinstance(self.size, (tuple, list)) and self.symmetric_pad: + if self.symmetric_pad: target[:, [0, 2]] = offset[0] + target[:, [0, 2]] * raw_shape[-1] / img.shape[-1] target[:, [1, 3]] = offset[1] + target[:, [1, 3]] * raw_shape[-2] / img.shape[-2] else: target[:, [0, 2]] *= raw_shape[-1] / img.shape[-1] target[:, [1, 3]] *= raw_shape[-2] / img.shape[-2] elif target.shape[1:] == (4, 2): - if isinstance(self.size, (tuple, list)) and self.symmetric_pad: + if self.symmetric_pad: target[..., 0] = offset[0] + target[..., 0] * raw_shape[-1] / img.shape[-1] target[..., 1] = offset[1] + target[..., 1] * raw_shape[-2] / img.shape[-2] else: diff --git a/tests/pytorch/test_transforms_pt.py b/tests/pytorch/test_transforms_pt.py index 8ed38d846b..b29f88e119 100644 --- a/tests/pytorch/test_transforms_pt.py +++ b/tests/pytorch/test_transforms_pt.py @@ -27,60 +27,77 @@ def test_resize(): assert torch.all(out == 1) assert out.shape[-2:] == output_size - assert repr(transfo) == f"Resize(output_size={output_size}, interpolation='bilinear')" + assert repr(transfo) == "Resize(output_size=(32, 32), interpolation='bilinear')" - transfo = Resize(output_size, preserve_aspect_ratio=True) + # Test with preserve_aspect_ratio + output_size = (32, 32) input_t = torch.ones((3, 32, 64), dtype=torch.float32) - out = transfo(input_t) + # Asymmetric padding + transfo = Resize(output_size, preserve_aspect_ratio=True) + out = transfo(input_t) assert out.shape[-2:] == output_size assert not torch.all(out == 1) - # Asymetric padding assert torch.all(out[:, -1] == 0) and torch.all(out[:, 0] == 1) - # Symetric padding - transfo = Resize(output_size, preserve_aspect_ratio=True, symmetric_pad=True) - assert repr(transfo) == ( - f"Resize(output_size={output_size}, interpolation='bilinear', preserve_aspect_ratio=True, symmetric_pad=True)" - ) + # Symmetric padding + transfo = Resize(32, preserve_aspect_ratio=True, symmetric_pad=True) out = transfo(input_t) assert out.shape[-2:] == output_size - # symetric padding - assert torch.all(out[:, -1] == 0) and torch.all(out[:, 0] == 0) + assert torch.all(out[:, 0] == 0) and torch.all(out[:, -1] == 0) - # Inverse aspect ratio + expected = "Resize(output_size=(32, 32), interpolation='bilinear', preserve_aspect_ratio=True, symmetric_pad=True)" + assert repr(transfo) == expected + + # Test with inverse resize input_t = torch.ones((3, 64, 32), dtype=torch.float32) + transfo = Resize(32, preserve_aspect_ratio=True, symmetric_pad=True) out = transfo(input_t) + assert out.shape[-2:] == (32, 32) - assert not torch.all(out == 1) - assert out.shape[-2:] == output_size - - # Same aspect ratio - output_size = (32, 128) - transfo = Resize(output_size, preserve_aspect_ratio=True) + # Test resize with same ratio + transfo = Resize((32, 128), preserve_aspect_ratio=True) out = transfo(torch.ones((3, 16, 64), dtype=torch.float32)) - assert out.shape[-2:] == output_size + assert out.shape[-2:] == (32, 128) - # FP16 + # Test with fp16 input + transfo = Resize((32, 128), preserve_aspect_ratio=True) input_t = torch.ones((3, 64, 64), dtype=torch.float16) out = transfo(input_t) assert out.dtype == torch.float16 - # --- Test with target (bounding boxes) --- - - target_boxes = np.array([[0.1, 0.1, 0.9, 0.9], [0.2, 0.2, 0.8, 0.8]]) - output_size = (64, 64) - - transfo = Resize(output_size, preserve_aspect_ratio=True) + padding = [True, False] + for symmetric_pad in padding: + # Test with target boxes + target_boxes = np.array([[0.1, 0.1, 0.3, 0.4], [0.2, 0.2, 0.8, 0.8]]) + transfo = Resize((64, 64), preserve_aspect_ratio=True, symmetric_pad=symmetric_pad) + input_t = torch.ones((3, 32, 64), dtype=torch.float32) + out, new_target = transfo(input_t, target_boxes) + + assert out.shape[-2:] == (64, 64) + assert new_target.shape == target_boxes.shape + assert np.all((0 <= new_target) & (new_target <= 1)) + + # Test with target polygons + target_boxes = np.array([ + [[0.1, 0.1], [0.9, 0.1], [0.9, 0.9], [0.1, 0.9]], + [[0.2, 0.2], [0.8, 0.2], [0.8, 0.8], [0.2, 0.8]], + ]) + transfo = Resize((64, 64), preserve_aspect_ratio=True, symmetric_pad=symmetric_pad) + input_t = torch.ones((3, 32, 64), dtype=torch.float32) + out, new_target = transfo(input_t, target_boxes) + + assert out.shape[-2:] == (64, 64) + assert new_target.shape == target_boxes.shape + assert np.all((0 <= new_target) & (new_target <= 1)) + + # Test with invalid target shape input_t = torch.ones((3, 32, 64), dtype=torch.float32) - out, new_target = transfo(input_t, target_boxes) + target = np.ones((2, 5)) # Invalid shape - assert out.shape[-2:] == output_size - assert new_target.shape == target_boxes.shape - assert np.all(new_target >= 0) and np.all(new_target <= 1) - - out = transfo(input_t) - assert out.shape[-2:] == output_size + transfo = Resize((64, 64), preserve_aspect_ratio=True) + with pytest.raises(AssertionError): + transfo(input_t, target) @pytest.mark.parametrize(