From e2ad2fb28875893fd1a412d48891ec17bbfa4bc8 Mon Sep 17 00:00:00 2001
From: Lukas Geiger <lukas.geiger94@gmail.com>
Date: Wed, 11 Mar 2026 15:52:24 +0000
Subject: [PATCH 1/4] Prevent data copy in `VideoFrame.to_ndarray()` for
 padded frames

---
 av/video/frame.py | 56 ++++++++++++++++++++++++++++-------------------
 1 file changed, 33 insertions(+), 23 deletions(-)

diff --git a/av/video/frame.py b/av/video/frame.py
index 74be1128d..30c16b0e0 100644
--- a/av/video/frame.py
+++ b/av/video/frame.py
@@ -433,19 +433,26 @@ def useful_array(
     plane: VideoPlane, bytes_per_pixel: cython.uint = 1, dtype: str = "uint8"
 ):
     """
-    Return the useful part of the VideoPlane as a single dimensional array.
+    Return the useful part of the VideoPlane as a strided array.
 
-    We are simply discarding any padding which was added for alignment.
+    We are simply creating a view that discards any padding which was added for
+    alignment.
     """
     import numpy as np
 
-    total_line_size: cython.size_t = abs(plane.line_size)
-    useful_line_size: cython.size_t = plane.width * bytes_per_pixel
-    if total_line_size == useful_line_size:
-        return np.frombuffer(plane, dtype=dtype)
-    arr = np.frombuffer(plane, np.uint8)
-    arr = arr.reshape(-1, total_line_size)[:, 0:useful_line_size].reshape(-1)
-    return arr.view(np.dtype(dtype))
+    dtype_obj = np.dtype(dtype)
+    total_line_size = abs(plane.frame.ptr.linesize[plane.index])
+    itemsize = dtype_obj.itemsize
+    channels = bytes_per_pixel // itemsize
+
+    if channels == 1:
+        shape = (plane.height, plane.width)
+        strides = (total_line_size, itemsize)
+    else:
+        shape = (plane.height, plane.width, channels)
+        strides = (total_line_size, bytes_per_pixel, itemsize)
+
+    return np.ndarray(shape, dtype=dtype_obj, buffer=plane, strides=strides)
 
 
 @cython.cfunc
@@ -755,19 +762,19 @@ def to_ndarray(self, channel_last=False, **kwargs):
             itemsize: cython.uint
             itemsize, dtype = _np_pix_fmt_dtypes[format_name]
             if len(planes) == 1:  # shortcut, avoid memory copy
-                array = useful_array(planes[0], itemsize, dtype).reshape(
-                    height, width, -1
-                )
+                array = useful_array(planes[0], itemsize, dtype)
+                if array.ndim == 2:
+                    array = array[:, :, None]
             else:  # general case
                 array = np.empty((height, width, len(planes)), dtype=dtype)
                 for i, plane in enumerate(planes):
-                    array[:, :, i] = useful_array(plane, itemsize, dtype).reshape(
-                        height, width
-                    )
+                    array[:, :, i] = useful_array(plane, itemsize, dtype)
             array = byteswap_array(array, format_name.endswith("be"))
             if array.shape[2] == 1:  # skip last channel for gray images
                 return array.squeeze(2)
             if format_name.startswith("gbr"):  # gbr -> rgb
+                if len(planes) == 1:
+                    array = array.copy()  # prevent mutating FFmpeg frame in-place
                 array[:, :, :3] = array[:, :, [2, 0, 1]]
             if not channel_last and format_name in {"yuv444p", "yuvj444p"}:
                 array = np.moveaxis(array, 2, 0)
@@ -777,16 +784,16 @@ def to_ndarray(self, channel_last=False, **kwargs):
         if format_name in {"yuv420p", "yuvj420p", "yuv422p"}:
             return np.hstack(
                 [
-                    useful_array(planes[0]),
-                    useful_array(planes[1]),
-                    useful_array(planes[2]),
+                    useful_array(planes[0]).reshape(-1),
+                    useful_array(planes[1]).reshape(-1),
+                    useful_array(planes[2]).reshape(-1),
                 ]
             ).reshape(-1, width)
         if format_name == "yuv422p10le":
             # Read planes as uint16 at their original width
-            y = useful_array(planes[0], 2, "uint16").reshape(height, width)
-            u = useful_array(planes[1], 2, "uint16").reshape(height, width // 2)
-            v = useful_array(planes[2], 2, "uint16").reshape(height, width // 2)
+            y = useful_array(planes[0], 2, "uint16")
+            u = useful_array(planes[1], 2, "uint16")
+            v = useful_array(planes[2], 2, "uint16")
 
             # Double the width of U and V by repeating each value
             u_full = np.repeat(u, 2, axis=1)
@@ -795,7 +802,7 @@ def to_ndarray(self, channel_last=False, **kwargs):
                 return np.stack([y, u_full, v_full], axis=2)
             return np.stack([y, u_full, v_full], axis=0)
         if format_name == "pal8":
-            image = useful_array(planes[0]).reshape(height, width)
+            image = useful_array(planes[0])
             palette = (
                 np.frombuffer(planes[1], "i4")
                 .astype(">i4")
@@ -805,7 +812,10 @@ def to_ndarray(self, channel_last=False, **kwargs):
             return image, palette
         if format_name == "nv12":
             return np.hstack(
-                [useful_array(planes[0]), useful_array(planes[1], 2)]
+                [
+                    useful_array(planes[0]).reshape(-1),
+                    useful_array(planes[1], 2).reshape(-1),
+                ]
             ).reshape(-1, width)
 
         raise ValueError(

From b105f6e745b89568c50f1702827dcb7c0e0cd64e Mon Sep 17 00:00:00 2001
From: Lukas Geiger <lukas.geiger94@gmail.com>
Date: Wed, 11 Mar 2026 18:22:55 +0000
Subject: [PATCH 2/4] Simplify special cases

---
 av/video/frame.py | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/av/video/frame.py b/av/video/frame.py
index 30c16b0e0..6f3e2e6dc 100644
--- a/av/video/frame.py
+++ b/av/video/frame.py
@@ -374,9 +374,13 @@ class PictureType(IntEnum):
     BI = lib.AV_PICTURE_TYPE_BI  # BI type
 
 
+_is_big_endian = cython.declare(cython.bint, sys.byteorder == "big")
+
+
 @cython.cfunc
+@cython.inline
 def byteswap_array(array, big_endian: cython.bint):
-    if (sys.byteorder == "big") != big_endian:
+    if _is_big_endian != big_endian:
         return array.byteswap()
     return array
 
@@ -761,21 +765,18 @@ def to_ndarray(self, channel_last=False, **kwargs):
         if format_name in _np_pix_fmt_dtypes:
             itemsize: cython.uint
             itemsize, dtype = _np_pix_fmt_dtypes[format_name]
-            if len(planes) == 1:  # shortcut, avoid memory copy
+            num_planes: cython.size_t = len(planes)
+            if num_planes == 1:  # shortcut, avoid memory copy
                 array = useful_array(planes[0], itemsize, dtype)
-                if array.ndim == 2:
-                    array = array[:, :, None]
             else:  # general case
-                array = np.empty((height, width, len(planes)), dtype=dtype)
-                for i, plane in enumerate(planes):
-                    array[:, :, i] = useful_array(plane, itemsize, dtype)
+                array = np.empty((height, width, num_planes), dtype=dtype)
+                if format_name.startswith("gbr"):
+                    plane_indices = (2, 0, 1, *range(3, num_planes))
+                else:
+                    plane_indices = range(num_planes)
+                for i, p_idx in enumerate(plane_indices):
+                    array[:, :, i] = useful_array(planes[p_idx], itemsize, dtype)
             array = byteswap_array(array, format_name.endswith("be"))
-            if array.shape[2] == 1:  # skip last channel for gray images
-                return array.squeeze(2)
-            if format_name.startswith("gbr"):  # gbr -> rgb
-                if len(planes) == 1:
-                    array = array.copy()  # prevent mutating FFmpeg frame in-place
-                array[:, :, :3] = array[:, :, [2, 0, 1]]
             if not channel_last and format_name in {"yuv444p", "yuvj444p"}:
                 array = np.moveaxis(array, 2, 0)
             return array

From cd5ffd0994b37069f14db0787b2b503e5dd4da5c Mon Sep 17 00:00:00 2001
From: Lukas Geiger <lukas.geiger94@gmail.com>
Date: Wed, 11 Mar 2026 18:41:04 +0000
Subject: [PATCH 3/4] Move duplicated if statements

---
 av/video/frame.py | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/av/video/frame.py b/av/video/frame.py
index 6f3e2e6dc..5a7ba9d07 100644
--- a/av/video/frame.py
+++ b/av/video/frame.py
@@ -433,6 +433,7 @@ def copy_array_to_plane(array, plane: VideoPlane, bytes_per_pixel: cython.uint):
 
 
 @cython.cfunc
+@cython.inline
 def useful_array(
     plane: VideoPlane, bytes_per_pixel: cython.uint = 1, dtype: str = "uint8"
 ):
@@ -755,21 +756,21 @@ def to_ndarray(self, channel_last=False, **kwargs):
 
         # check size
         format_name = frame.format.name
-        height, width = frame.ptr.height, frame.ptr.width
         planes: tuple[VideoPlane, ...] = frame.planes
-        if format_name in {"yuv420p", "yuvj420p", "yuyv422", "yuv422p10le", "yuv422p"}:
-            assert width % 2 == 0, "the width has to be even for this pixel format"
-            assert height % 2 == 0, "the height has to be even for this pixel format"
-
         # cases planes are simply concatenated in shape (height, width, channels)
         if format_name in _np_pix_fmt_dtypes:
+            if format_name == "yuyv422":
+                assert frame.ptr.width % 2 == 0, "width has to be even for yuyv422"
+                assert frame.ptr.height % 2 == 0, "height has to be even for yuyv422"
             itemsize: cython.uint
             itemsize, dtype = _np_pix_fmt_dtypes[format_name]
             num_planes: cython.size_t = len(planes)
             if num_planes == 1:  # shortcut, avoid memory copy
                 array = useful_array(planes[0], itemsize, dtype)
             else:  # general case
-                array = np.empty((height, width, num_planes), dtype=dtype)
+                array = np.empty(
+                    (frame.ptr.height, frame.ptr.width, num_planes), dtype=dtype
+                )
                 if format_name.startswith("gbr"):
                     plane_indices = (2, 0, 1, *range(3, num_planes))
                 else:
@@ -783,14 +784,18 @@ def to_ndarray(self, channel_last=False, **kwargs):
 
         # special cases
         if format_name in {"yuv420p", "yuvj420p", "yuv422p"}:
+            assert frame.ptr.width % 2 == 0, "width has to be even for this format"
+            assert frame.ptr.height % 2 == 0, "height has to be even for this format"
             return np.hstack(
                 [
                     useful_array(planes[0]).reshape(-1),
                     useful_array(planes[1]).reshape(-1),
                     useful_array(planes[2]).reshape(-1),
                 ]
-            ).reshape(-1, width)
+            ).reshape(-1, frame.ptr.width)
         if format_name == "yuv422p10le":
+            assert frame.ptr.width % 2 == 0, "width has to be even for this format"
+            assert frame.ptr.height % 2 == 0, "height has to be even for this format"
             # Read planes as uint16 at their original width
             y = useful_array(planes[0], 2, "uint16")
             u = useful_array(planes[1], 2, "uint16")
@@ -817,7 +822,7 @@ def to_ndarray(self, channel_last=False, **kwargs):
                     useful_array(planes[0]).reshape(-1),
                     useful_array(planes[1], 2).reshape(-1),
                 ]
-            ).reshape(-1, width)
+            ).reshape(-1, frame.ptr.width)
 
         raise ValueError(
             f"Conversion to numpy array with format `{format_name}` is not yet supported"

From 02ec746c425bff6fe36a6e774614d6d01654e570 Mon Sep 17 00:00:00 2001
From: Lukas Geiger <lukas.geiger94@gmail.com>
Date: Wed, 11 Mar 2026 18:55:20 +0000
Subject: [PATCH 4/4] Optimize planes

---
 av/video/frame.py | 2 ++
 av/video/plane.py | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/av/video/frame.py b/av/video/frame.py
index 5a7ba9d07..5212db292 100644
--- a/av/video/frame.py
+++ b/av/video/frame.py
@@ -539,6 +539,8 @@ def planes(self):
         plane_count: cython.int = 0
         while plane_count < max_plane_count and self.ptr.extended_data[plane_count]:
             plane_count += 1
+        if plane_count == 1:
+            return (VideoPlane(self, 0),)
         return tuple([VideoPlane(self, i) for i in range(plane_count)])
 
     @property
diff --git a/av/video/plane.py b/av/video/plane.py
index c4908c21d..2b169f9cc 100644
--- a/av/video/plane.py
+++ b/av/video/plane.py
@@ -26,7 +26,7 @@ def __cinit__(self, frame: VideoFrame, index: cython.int):
                 frames_ctx.sw_format, frame.ptr.width, frame.ptr.height
             )
 
-        if fmt.name == "pal8" and index == 1:
+        if index == 1 and fmt.name == "pal8":
             self.width = 256
             self.height = 1
             self.buffer_size = 256 * 4