From 232454655d062de9fdcc73dcee56d25c9dd99e67 Mon Sep 17 00:00:00 2001 From: Cristi Bleotiu Date: Wed, 13 May 2026 12:02:53 +0300 Subject: [PATCH 1/4] fix: harden container shutdown cleanup What changed: abort container restarts when runtime cleanup fails; preserve process, thread, and fixed-volume handles for retry; stop Docker log readers after container stop to avoid false cleanup failures. Why: prevent stuck or duplicate runtimes during restart and shutdown paths. --- .../container_apps/container_app_runner.py | 254 +++++++++++++----- .../business/container_apps/fixed_volume.py | 26 +- .../mixins/fixed_size_volumes.py | 15 +- 3 files changed, 217 insertions(+), 78 deletions(-) diff --git a/extensions/business/container_apps/container_app_runner.py b/extensions/business/container_apps/container_app_runner.py index dddb8aca..c262b2c9 100644 --- a/extensions/business/container_apps/container_app_runner.py +++ b/extensions/business/container_apps/container_app_runner.py @@ -578,6 +578,7 @@ def __reset_vars(self): # Container state machine self.container_state = ContainerState.UNINITIALIZED self.stop_reason = StopReason.UNKNOWN + self._cleanup_failed = False # Restart policy and retry logic self._consecutive_failures = 0 @@ -1193,15 +1194,17 @@ def on_command(self, data, **kwargs): self.P("Restarting container...") self._clear_manual_stop_state() # Clear persistent stop state self._set_container_state(ContainerState.RESTARTING, StopReason.CONFIG_UPDATE) - self._stop_container_and_save_logs_to_disk() self._restart_container(StopReason.CONFIG_UPDATE) return elif data == "STOP": self.P("Stopping container (manual stop - restart policy will not trigger)...") self._save_persistent_state(manually_stopped=True) # Save persistent stop state - self._stop_container_and_save_logs_to_disk() - self._set_container_state(ContainerState.PAUSED, StopReason.MANUAL_STOP) + cleanup_ok = self._stop_container_and_save_logs_to_disk() + if cleanup_ok: + self._set_container_state(ContainerState.PAUSED, StopReason.MANUAL_STOP) + else: + self._set_container_state(ContainerState.FAILED, StopReason.UNKNOWN) return else: self.P(f"Unknown plugin command: {data}") @@ -1249,7 +1252,12 @@ def _handle_config_restart(self, restart_callable): ) return - self._stop_container_and_save_logs_to_disk() + cleanup_ok = self._stop_container_and_save_logs_to_disk() + if not cleanup_ok: + self.P("Config restart aborted because previous runtime cleanup failed.", color='r') + self._set_container_state(ContainerState.FAILED, StopReason.UNKNOWN) + self._record_restart_failure() + return restart_callable() return @@ -1271,7 +1279,7 @@ def on_config(self, *args, **kwargs): ------- None """ - return self._handle_config_restart(lambda: self._restart_container(StopReason.CONFIG_UPDATE)) + return self._handle_config_restart(lambda: self._restart_container(StopReason.CONFIG_UPDATE, cleanup_first=False)) def on_post_container_start(self): @@ -1367,16 +1375,28 @@ def stop_tunnel_engine(self): Returns ------- - None + bool + True when the tunnel process and log readers stopped, False otherwise. 
""" if self.tunnel_process: engine_name = "Cloudflare" if self.use_cloudflare() else "ngrok" self.P(f"Stopping {engine_name} tunnel...") - self.stop_tunnel_command(self.tunnel_process) - self.tunnel_process = None - self.P(f"{engine_name} tunnel stopped") + process = self.tunnel_process + result = True + try: + result = self.stop_tunnel_command(process) + except Exception as exc: + result = False + self.P(f"Error stopping {engine_name} tunnel: {exc}", color='r') + finally: + if result: + self.tunnel_process = None + self.P(f"{engine_name} tunnel stopped") + else: + self.P(f"{engine_name} tunnel did not fully stop; preserving process handle for retry.", color='r') + return result # end if - return + return True def get_tunnel_engine_ping_data(self): @@ -1637,12 +1657,16 @@ def _start_extra_tunnel(self, container_port, tunnel_config): self.Pd(f" Command: {' '.join(command)}") # Use list-based subprocess to prevent shell injection - process = subprocess.Popen( - command, + popen_kwargs = dict( + args=command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, - bufsize=0 + bufsize=0, ) + if os.name != "nt": + popen_kwargs["start_new_session"] = True + process = subprocess.Popen(**popen_kwargs) + self._remember_process_group(process) # Create log readers for this tunnel logs_reader = self.LogReader(process.stdout, size=100, daemon=None) @@ -1711,12 +1735,16 @@ def _stop_extra_tunnel(self, container_port): Returns ------- - None + bool + True when the tunnel process and log readers stopped, False otherwise. """ process = self.extra_tunnel_processes.get(container_port) if not process: - return + return True + result = True + process_stopped = process.poll() is not None + readers_stopped = True try: self.P(f"Stopping extra tunnel for port {container_port}...") @@ -1725,12 +1753,11 @@ def _stop_extra_tunnel(self, container_port): # Stop process if process.poll() is None: # Still running - process.terminate() - try: - process.wait(timeout=5) - except Exception: - process.kill() - process.wait() + process_stopped = self._terminate_subprocess_tree( + process, + label=f"Extra tunnel for port {container_port}", + ) + result = process_stopped and result # Clean up log readers (following base class pattern) log_readers = self.extra_tunnel_log_readers.get(container_port, {}) @@ -1739,36 +1766,49 @@ def _stop_extra_tunnel(self, container_port): stdout_reader = log_readers.get("stdout") if stdout_reader: try: - stdout_reader.stop() + reader_stopped = stdout_reader.stop() + readers_stopped = reader_stopped and readers_stopped + result = reader_stopped and result # Read any remaining logs before cleanup remaining_logs = stdout_reader.get_next_characters() if remaining_logs: self._process_extra_tunnel_log(container_port, remaining_logs, is_error=False) except Exception as e: + readers_stopped = False + result = False self.Pd(f"Error stopping stdout reader: {e}") # Stop stderr reader and read remaining logs stderr_reader = log_readers.get("stderr") if stderr_reader: try: - stderr_reader.stop() + reader_stopped = stderr_reader.stop() + readers_stopped = reader_stopped and readers_stopped + result = reader_stopped and result # Read any remaining error logs before cleanup remaining_err_logs = stderr_reader.get_next_characters() if remaining_err_logs: self._process_extra_tunnel_log(container_port, remaining_err_logs, is_error=True) except Exception as e: + readers_stopped = False + result = False self.Pd(f"Error stopping stderr reader: {e}") - # Clean up references - self.extra_tunnel_processes.pop(container_port, 
None) - self.extra_tunnel_log_readers.pop(container_port, None) - self.extra_tunnel_urls.pop(container_port, None) - self.extra_tunnel_start_times.pop(container_port, None) + if result: + self.extra_tunnel_processes.pop(container_port, None) + self.extra_tunnel_log_readers.pop(container_port, None) + self.extra_tunnel_urls.pop(container_port, None) + self.extra_tunnel_start_times.pop(container_port, None) - self.P(f"Extra tunnel for port {container_port} stopped") + if result: + self.P(f"Extra tunnel for port {container_port} stopped") + else: + self.P(f"Extra tunnel for port {container_port} did not fully stop; preserving live handles for retry.", color='r') except Exception as e: + result = False self.P(f"Error stopping extra tunnel for port {container_port}: {e}", color='r') + return result def stop_extra_tunnels(self): @@ -1780,17 +1820,20 @@ def stop_extra_tunnels(self): Returns ------- - None + bool + True when all extra tunnels stopped, False otherwise. """ if not self.extra_tunnel_processes: - return + return True self.P(f"Stopping {len(self.extra_tunnel_processes)} extra tunnel(s)...") + result = True for container_port in list(self.extra_tunnel_processes.keys()): - self._stop_extra_tunnel(container_port) + result = self._stop_extra_tunnel(container_port) and result self.P("All extra tunnels stopped") + return result def _read_extra_tunnel_logs(self, container_port): @@ -2037,40 +2080,49 @@ def stop_container(self): Returns ------- - None + bool + True when the container was stopped and removed, False otherwise. Notes ----- If no container exists, logs a warning and returns. - Clears container and container_id attributes after removal. + Clears container and container_id attributes after successful removal. """ if not self.container: self.P("No container to stop", color='r') - return + return True + result = True + removed = False try: # Stop the container (gracefully) self.P(f"Stopping container {self.container.short_id}...") self.container.stop(timeout=5) self.P(f"Container {self.container.short_id} stopped successfully") except Exception as e: + result = False self.P(f"Error stopping container: {e}", color='r') # end try try: self.P(f"Removing container {self.container.short_id}...") self.container.remove() + removed = True self.P(f"Container {self.container.short_id} removed successfully") except Exception as e: + result = False self.P(f"Error removing container: {e}", color='r') finally: - self.container = None - self.container_id = None + if removed: + self.container = None + self.container_id = None + else: + self.P("Preserving container handle after failed stop/remove for retry.", color='r') # end try - return + return removed - def _stream_logs(self, log_stream): + def _stream_logs(self, log_stream, stop_event=None): """ Consume a log iterator from container logs and print its output. 
@@ -2087,6 +2139,9 @@ def _stream_logs(self, log_stream): self.P("No log stream provided", color='r') return + if stop_event is None: + stop_event = self._stop_event + try: for log_bytes in log_stream: if log_bytes is None: @@ -2100,7 +2155,7 @@ def _stream_logs(self, log_stream): self.P(f"[CONTAINER] {log_str}", end='') self.container_logs.append(log_str) - if self._stop_event.is_set(): + if stop_event.is_set(): self.P("Log streaming stopped by stop event") break except Exception as e: @@ -2127,7 +2182,7 @@ def _start_container_log_stream(self): log_stream = self.container.logs(stream=True, follow=True) self.log_thread = threading.Thread( target=self._stream_logs, - args=(log_stream,), + args=(log_stream, self._stop_event), daemon=True, ) self.log_thread.start() @@ -2214,7 +2269,7 @@ def _run_container_exec(self, shell_cmd): ) thread = threading.Thread( target=self._stream_logs, - args=(exec_result.output,), + args=(exec_result.output, self._stop_event), daemon=True, ) thread.start() @@ -2672,39 +2727,78 @@ def _stop_container_and_save_logs_to_disk(self): Returns ------- - None + bool + True when cleanup completed without required-step failures, False otherwise. """ self.P(f"Stopping container app '{self.container_id}' ...") + cleanup_errors = [] + + def safe_cleanup_step(step_name, callback): + try: + result = callback() + if result is False: + cleanup_errors.append(step_name) + self.P(f"Container cleanup step '{step_name}' reported failure.", color='r') + except Exception as exc: + cleanup_errors.append(step_name) + self.P(f"Container cleanup step '{step_name}' failed: {exc}", color='r') + # Clear semaphore and reset signaling state for potential restart - self._semaphore_reset_signal() + safe_cleanup_step("semaphore reset", self._semaphore_reset_signal) - # Stop log streaming - self._stop_event.set() - if self.log_thread: - self.log_thread.join(timeout=5) - self.log_thread = None + def signal_runtime_threads(): + self._stop_event.set() + self._commands_started = False + return True - if getattr(self, 'exec_threads', None): - for thread in self.exec_threads: - if thread and thread.is_alive(): - thread.join(timeout=5) - self.exec_threads = [] + def join_runtime_threads(): + result = True + stop_deadline = time.monotonic() + 5 - self._stop_event = threading.Event() - self._commands_started = False + if self.log_thread: + self.log_thread.join(timeout=max(0, stop_deadline - time.monotonic())) + if self.log_thread.is_alive(): + result = False + self.P("Container log thread is still alive after stop timeout.", color='r') + else: + self.log_thread = None + + if getattr(self, 'exec_threads', None): + alive_threads = [] + for thread in self.exec_threads: + remaining = max(0, stop_deadline - time.monotonic()) + if thread and thread.is_alive() and remaining > 0: + thread.join(timeout=remaining) + if thread and thread.is_alive(): + result = False + alive_threads.append(thread) + self.exec_threads = alive_threads + + if result: + self._stop_event = threading.Event() + self._commands_started = False + return result + + # Signal log/exec readers early, but join after Docker stop; quiet Docker + # streams usually unblock only when the container stops. 
+ safe_cleanup_step("runtime thread signal", signal_runtime_threads) # Stop tunnel engine if needed - self.stop_tunnel_engine() + safe_cleanup_step("main tunnel", self.stop_tunnel_engine) # Stop extra tunnels - self.stop_extra_tunnels() + safe_cleanup_step("extra tunnels", self.stop_extra_tunnels) # Stop the container if it's running - self.stop_container() + safe_cleanup_step("docker container", self.stop_container) + + # Stop log streaming threads after Docker stop has had a chance to unblock + # the log/exec streams. + safe_cleanup_step("runtime threads", join_runtime_threads) # Cleanup fixed-size volumes (unmount + detach loop devices) - self._cleanup_fixed_size_volumes() + safe_cleanup_step("fixed-size volumes", self._cleanup_fixed_size_volumes) # Save logs to disk under the instance's `logs/` sibling folder # (resolves to pipelines_data/{sid}/{iid}/logs/container_logs.pkl) @@ -2717,7 +2811,12 @@ def _stop_container_and_save_logs_to_disk(self): self.P("Container logs saved to disk.") except Exception as exc: self.P(f"Failed to save logs: {exc}", color='r') - return + if cleanup_errors: + self._cleanup_failed = True + self.P("Container cleanup completed with failed step(s): {}.".format(", ".join(cleanup_errors)), color='r') + return False + self._cleanup_failed = False + return True def on_close(self): @@ -3014,7 +3113,7 @@ def _check_image_updates(self, current_time=None): return - def _restart_container(self, stop_reason=None): + def _restart_container(self, stop_reason=None, cleanup_first=True): """ Restart the container from scratch. @@ -3022,10 +3121,15 @@ def _restart_container(self, stop_reason=None): ---------- stop_reason : StopReason, optional Optional StopReason enum indicating why restart was triggered + cleanup_first : bool, optional + If True, stop the existing runtime before resetting state. Set to False + when the caller already performed cleanup and checked its result. Returns ------- - None + bool + True when restart setup succeeded or was deferred waiting for + semaphores, False when cleanup or start failed. 
""" self.P("Restarting container from scratch...") @@ -3035,7 +3139,14 @@ def _restart_container(self, stop_reason=None): preserved_last_image_check = self._last_image_check preserved_current_hash = self.current_image_hash - self._stop_container_and_save_logs_to_disk() + if cleanup_first: + cleanup_ok = self._stop_container_and_save_logs_to_disk() + if not cleanup_ok: + self.P("Restart aborted because previous runtime cleanup failed.", color='r') + self._set_container_state(ContainerState.FAILED, stop_reason or StopReason.UNKNOWN) + self._record_restart_failure() + return False + self.__reset_vars() # Reset chainstore response for restart cycle @@ -3071,7 +3182,7 @@ def _restart_container(self, stop_reason=None): self._validate_extra_tunnels_config() self._validate_runner_config() self.P("Consumer container with semaphore dependencies: deferring start until providers are ready") - return + return True # Non-semaphored containers (providers): configure env and start immediately self._configure_dynamic_env() @@ -3087,17 +3198,17 @@ def _restart_container(self, stop_reason=None): self.P("Failed to ensure image availability during restart, cannot start container", color='r') self._set_container_state(ContainerState.FAILED, StopReason.CRASH) self._record_restart_failure() - return + return False self.container = self.start_container() if not self.container: # start_container already recorded the failure - return + return False self.container_start_time = self.time() self._start_container_log_stream() self._maybe_execute_build_and_run() - return + return True def _ensure_image_always_pull(self): @@ -3385,6 +3496,13 @@ def process(self): self._last_paused_log = current_time return + if self._cleanup_failed: + current_time = self.time() + if current_time - self._last_paused_log >= self.cfg_paused_state_log_interval: + self.P("Container cleanup previously failed; periodic launch/restart is blocked until cleanup succeeds.") + self._last_paused_log = current_time + return + if not self.container: # Check if we're in backoff period if self._is_restart_backoff_active(): diff --git a/extensions/business/container_apps/fixed_volume.py b/extensions/business/container_apps/fixed_volume.py index f584f08f..dd994e6f 100644 --- a/extensions/business/container_apps/fixed_volume.py +++ b/extensions/business/container_apps/fixed_volume.py @@ -387,15 +387,18 @@ def provision( def cleanup( vol: FixedVolume, logger: Optional[Callable] = None, -) -> None: +) -> bool: """Unmount and detach the loop device for a volume. Graceful -- never raises. All errors are caught and logged as warnings. + Returns False when unmount/detach could not be confirmed so callers can + preserve cleanup handles and retry later. 
""" _log( logger, "STEP", f"Cleaning up volume={vol.name} mount_path={vol.mount_path}", ) + result = True loop_dev = None if vol.meta_path.exists(): try: @@ -403,23 +406,34 @@ def cleanup( loop_dev = meta.get("loop_dev") _log(logger, "INFO", f"Loaded metadata loop_dev={loop_dev}") except Exception as exc: + result = False _log(logger, "WARN", f"Failed to read metadata error={exc}") - try: - _run(["umount", str(vol.mount_path)], logger=logger) - except Exception as exc: - _log(logger, "WARN", f"Unmount failed mount_path={vol.mount_path} error={exc}") + if _is_path_mounted(vol.mount_path): + try: + _run(["umount", str(vol.mount_path)], logger=logger) + except Exception as exc: + result = False + _log(logger, "WARN", f"Unmount failed mount_path={vol.mount_path} error={exc}") + else: + _log(logger, "INFO", f"Mount path is not mounted mount_path={vol.mount_path}") if loop_dev: try: _run(["losetup", "-d", loop_dev], logger=logger) except Exception as exc: + result = False _log(logger, "WARN", f"Detach loop failed loop_dev={loop_dev} error={exc}") + if _is_path_mounted(vol.mount_path): + result = False + _log(logger, "WARN", f"Mount path is still mounted mount_path={vol.mount_path}") + _log( logger, "INFO", - f"Cleanup complete mount_path={vol.mount_path} loop_dev={loop_dev}", + f"Cleanup complete mount_path={vol.mount_path} loop_dev={loop_dev} ok={result}", ) + return result def docker_bind_spec(vol: FixedVolume, container_target: str) -> Dict[str, Dict[str, str]]: diff --git a/extensions/business/container_apps/mixins/fixed_size_volumes.py b/extensions/business/container_apps/mixins/fixed_size_volumes.py index ef9bf868..b077aacd 100644 --- a/extensions/business/container_apps/mixins/fixed_size_volumes.py +++ b/extensions/business/container_apps/mixins/fixed_size_volumes.py @@ -226,12 +226,19 @@ def _cleanup_fixed_size_volumes(self): Called during container stop/close to free loop device resources. 
""" if not hasattr(self, '_fixed_volumes') or not self._fixed_volumes: - return + return True + result = True + remaining_volumes = [] for vol in self._fixed_volumes: try: - fixed_volume.cleanup(vol, logger=self.P) + cleaned = fixed_volume.cleanup(vol, logger=self.P) + if not cleaned: + result = False + remaining_volumes.append(vol) except Exception as exc: + result = False + remaining_volumes.append(vol) self.P(f"Failed to cleanup fixed volume '{vol.name}': {exc}", color='r') - self._fixed_volumes = [] - return + self._fixed_volumes = remaining_volumes + return result From fedd9255b4c4be2cce19b1a010b5a19051c4f81b Mon Sep 17 00:00:00 2001 From: Cristi Bleotiu Date: Fri, 15 May 2026 10:41:23 +0300 Subject: [PATCH 2/4] fix: harden container app shutdown cleanup What changed: - keep failed container cleanup candidates retryable instead of dropping handles - make manual STOP/RESTART/config handling preserve cleanup state safely - restore sync support files from develop and add lifecycle/fixed-volume/sync regression coverage Why: - avoid leaked container subprocesses and preserve existing sync behavior while resolving the PR branch against develop --- .../container_apps/container_app_runner.py | 397 +++- .../business/container_apps/fixed_volume.py | 74 +- .../business/container_apps/sync/__init__.py | 90 + .../business/container_apps/sync/constants.py | 53 + .../container_apps/sync/control_files.py | 327 +++ .../business/container_apps/sync/manager.py | 1959 +++++++++++++++++ .../business/container_apps/sync/mixin.py | 595 +++++ .../business/container_apps/tests/support.py | 41 + .../tests/test_container_lifecycle.py | 145 ++ .../container_apps/tests/test_fixed_volume.py | 76 +- .../tests/test_sync_control_files.py | 228 ++ .../container_apps/tests/test_sync_manager.py | 1869 ++++++++++++++++ .../container_apps/tests/test_sync_mixin.py | 744 +++++++ 13 files changed, 6556 insertions(+), 42 deletions(-) create mode 100644 extensions/business/container_apps/sync/__init__.py create mode 100644 extensions/business/container_apps/sync/constants.py create mode 100644 extensions/business/container_apps/sync/control_files.py create mode 100644 extensions/business/container_apps/sync/manager.py create mode 100644 extensions/business/container_apps/sync/mixin.py create mode 100644 extensions/business/container_apps/tests/test_sync_control_files.py create mode 100644 extensions/business/container_apps/tests/test_sync_manager.py create mode 100644 extensions/business/container_apps/tests/test_sync_mixin.py diff --git a/extensions/business/container_apps/container_app_runner.py b/extensions/business/container_apps/container_app_runner.py index c262b2c9..52601f64 100644 --- a/extensions/business/container_apps/container_app_runner.py +++ b/extensions/business/container_apps/container_app_runner.py @@ -66,6 +66,7 @@ import os import requests import shutil +import signal import threading import time import socket @@ -86,6 +87,7 @@ _RestartBackoffMixin, _TunnelBackoffMixin, ) +from .sync import _SyncMixin __VER__ = "0.7.1" @@ -313,6 +315,25 @@ def from_dict(cls, config_dict: dict) -> "HealthCheckConfig": # {"vol_name": {"SIZE": "100M", "MOUNTING_POINT": "/app/data", "FS_TYPE": "ext4", # "OWNER_UID": None, "OWNER_GID": None, "FORCE_RECREATE": False}} + # Volume-sync (cross-node state replication). Always-on /r1en_system system + # volume is provisioned regardless; SYNC.ENABLED only controls the + # provider/consumer orchestration on top of it. 
See the sync/ subpackage + # (constants.py + manager.py + mixin.py) for the full contract. + "SYNC": { + "ENABLED": False, # master switch + "KEY": None, # shared UUID across the sync set (provider+consumer) + "TYPE": None, # "provider" | "consumer" + "POLL_INTERVAL": 10, # seconds between sync ticks + "ALLOW_ONLINE_PROVIDER_CAPTURE": False, # provider-local opt-in for live container fs capture + "CONSUMER_APPLY_MODE": "offline_restart", # consumer-local apply lifecycle policy + "HSYNC_POLL_INTERVAL": 60, # seconds between chainstore_hsync refreshes + # (consumer only; provider only calls hset, never hsync). + # Clamped to min 10s. The cheap local-replica hget + # still runs on every tick; only the network round-trip + # is rate-limited here. Failed hsync attempts retry + # sooner than the full interval. + }, + # Health check configuration (consolidated) # Controls how app readiness is determined before starting tunnels # @@ -345,6 +366,9 @@ def from_dict(cls, config_dict: dict) -> "HealthCheckConfig": "MAX_LOG_LINES" : 10_000, # max lines to keep in memory # When container is STOPPED_MANUALLY (PAUSED state), this will define how often we log its existance "PAUSED_STATE_LOG_INTERVAL": 60, + # Container apps can need more than the core plugin default to stop Docker, + # tunnel processes, runtime readers, and loop-backed fixed volumes safely. + "PLUGIN_STOP_TIMEOUT": 45, # Semaphore synchronization for paired plugins # List of semaphore keys to wait for before starting container @@ -373,6 +397,7 @@ class ContainerAppRunnerPlugin( _ImagePullBackoffMixin, _TunnelBackoffMixin, _FixedSizeVolumesMixin, + _SyncMixin, _ContainerUtilsMixin, BasePlugin, ): @@ -579,6 +604,7 @@ def __reset_vars(self): self.container_state = ContainerState.UNINITIALIZED self.stop_reason = StopReason.UNKNOWN self._cleanup_failed = False + self._manual_stop_pending = False # Restart policy and retry logic self._consecutive_failures = 0 @@ -615,6 +641,13 @@ def __reset_vars(self): self._last_image_check = 0 self._last_extra_tunnels_ping = 0 self._last_paused_log = 0 # Track when we last logged the paused message + self._last_sync_check = 0 # _SyncMixin throttle + + # Volume-sync state. SyncManager is lazy-init'd by _ensure_sync_manager + # the first time a tick fires (or on_init for early provisioning). + self._sync_manager = None + self._sync_unavailable = False + self._runtime_stop_degraded = False # Image update tracking self.current_image_hash = None @@ -1049,6 +1082,39 @@ def _validate_runner_config(self): return + def _validate_sync_config(self): + """ + Validate the SYNC config block when ENABLED. Disables SYNC with a + warning rather than raising; the system volume itself is independent + and the rest of the plugin must keep running. + """ + if not self._sync_enabled(): + return + sync = self._sync_cfg() + key = sync.get("KEY") + role = sync.get("TYPE") + if not key or not isinstance(key, str): + self.P( + "[sync] SYNC.ENABLED but SYNC.KEY missing/empty; disabling SYNC.", + color="r", + ) + sync["ENABLED"] = False + return + if role not in ("provider", "consumer"): + self.P( + f"[sync] SYNC.TYPE must be 'provider' or 'consumer' (got {role!r}); disabling SYNC.", + color="r", + ) + sync["ENABLED"] = False + return + self.P( + f"[sync] SYNC enabled: role={role}, key={key}, " + f"poll={self._sync_poll_interval()}s", + color="g", + ) + return + + def _validate_subclass_config(self): """ Hook for subclasses to enforce additional validation. 
@@ -1119,11 +1185,18 @@ def on_init(self): self._configure_volumes() # setup container volumes (deprecated) self._configure_file_volumes() # setup file volumes with dynamic content self._configure_fixed_size_volumes() # setup fixed-size file-backed volumes + self._configure_system_volume() # always-on /r1en_system control-plane volume + + # If a prior plugin run crashed mid-publish, request.json.processing may + # be left over inside volume-sync/. Rename it back so the next tick retries. + self._recover_stale_processing() + self._validate_sync_config() # If we have semaphored keys, defer _setup_env_and_ports() until semaphores are ready # This ensures we get the env vars from provider plugins before starting the container if not self._semaphore_get_keys(): self._setup_env_and_ports() + self._inject_sync_env_vars() else: self.Pd("Deferring _setup_env_and_ports() until semaphores are ready") @@ -1193,17 +1266,26 @@ def on_command(self, data, **kwargs): if data == "RESTART": self.P("Restarting container...") self._clear_manual_stop_state() # Clear persistent stop state + # RESTART is an explicit operator override for a previously failed STOP. + # Clear the in-memory pause intent too, otherwise a later cleanup retry + # could incorrectly persist PAUSED instead of relaunching the container. + self._manual_stop_pending = False self._set_container_state(ContainerState.RESTARTING, StopReason.CONFIG_UPDATE) self._restart_container(StopReason.CONFIG_UPDATE) return elif data == "STOP": self.P("Stopping container (manual stop - restart policy will not trigger)...") - self._save_persistent_state(manually_stopped=True) # Save persistent stop state + self._manual_stop_pending = True cleanup_ok = self._stop_container_and_save_logs_to_disk() if cleanup_ok: + self._save_persistent_state(manually_stopped=True) # Persist only after cleanup succeeds. + self._manual_stop_pending = False self._set_container_state(ContainerState.PAUSED, StopReason.MANUAL_STOP) else: + # Keep the failed cleanup retryable without persisting a paused state + # that would later make config restarts look intentionally ignored. + self._clear_manual_stop_state() self._set_container_state(ContainerState.FAILED, StopReason.UNKNOWN) return else: @@ -1243,6 +1325,16 @@ def _handle_config_restart(self, restart_callable): ) return + if self._manual_stop_pending: + self.P( + "Manual STOP cleanup is still pending. Ignoring config restart; " + "send RESTART to override the pending stop intent.", + color='y', + ) + if self._cleanup_failed: + self._retry_failed_cleanup() + return + # Check persistent state as fallback (in case container_state not yet set) if self._load_manual_stop_state(): self.P( @@ -1367,6 +1459,112 @@ def get_cloudflare_protocol(self): return super(ContainerAppRunnerPlugin, self).get_cloudflare_protocol() + def _remember_process_group(self, process): + """ + Record tunnel process-group ids even when deployed with an older core. + + Newer cores provide this on ``BaseTunnelEnginePlugin``. Keeping a local + fallback lets the edge PR roll out before the matching core PR without + breaking extra tunnel startup after ``subprocess.Popen`` succeeds. 
+ """ + base_method = getattr(super(ContainerAppRunnerPlugin, self), "_remember_process_group", None) + if callable(base_method): + return base_method(process) + if process is not None and os.name != "nt": + try: + process._r1_process_group_id = os.getpgid(process.pid) + except Exception as exc: + self.P(f"Could not record tunnel process group: {exc}", color='r') + return process + + + def _terminate_subprocess_tree(self, process, label="subprocess", terminate_timeout=5, kill_timeout=5): + """ + Terminate a tunnel subprocess tree with a compatibility fallback. + + Prefer the core implementation when present; otherwise use the same bounded + POSIX process-group shutdown strategy locally so mixed-version deployments + do not leak extra tunnel children. + """ + base_method = getattr(super(ContainerAppRunnerPlugin, self), "_terminate_subprocess_tree", None) + if callable(base_method): + return base_method( + process, + label=label, + terminate_timeout=terminate_timeout, + kill_timeout=kill_timeout, + ) + if process is None: + return True + + pgid = getattr(process, "_r1_process_group_id", None) + + def is_process_group_alive(): + if os.name == "nt" or pgid is None: + return False + try: + os.killpg(pgid, 0) + return True + except ProcessLookupError: + return False + except Exception as exc: + self.P(f"Could not probe {label} process group {pgid}: {exc}", color='r') + return True + + def wait_process_tree(timeout): + deadline = time.monotonic() + timeout + process_stopped = process.poll() is not None + if not process_stopped: + try: + process.wait(timeout=timeout) + process_stopped = True + except subprocess.TimeoutExpired: + process_stopped = False + except Exception as exc: + self.P(f"Error waiting for {label}: {exc}", color='r') + process_stopped = process.poll() is not None + if os.name == "nt" or pgid is None: + return process_stopped + while time.monotonic() < deadline: + if not is_process_group_alive(): + return process_stopped + time.sleep(0.05) + return process_stopped and not is_process_group_alive() + + def send_signal(sig, fallback): + if os.name != "nt" and pgid is not None and sig is not None: + try: + os.killpg(pgid, sig) + return True + except ProcessLookupError: + return True + except Exception as exc: + self.P(f"Error signaling {label} process group {pgid}: {exc}", color='r') + if process.poll() is None: + try: + fallback() + return True + except Exception as exc: + self.P(f"Error signaling {label}: {exc}", color='r') + return False + return True + + if process.poll() is None or is_process_group_alive(): + if not send_signal(signal.SIGTERM, process.terminate): + return False + if wait_process_tree(terminate_timeout): + return True + + self.P(f"{label} did not stop after terminate; killing it.", color='r') + kill_signal = getattr(signal, "SIGKILL", None) + if not send_signal(kill_signal, process.kill): + return False + if wait_process_tree(kill_timeout): + return True + self.P(f"{label} did not exit after kill; continuing shutdown.", color='r') + return False + + def stop_tunnel_engine(self): """ Stop the main tunnel engine. @@ -2081,7 +2279,9 @@ def stop_container(self): Returns ------- bool - True when the container was stopped and removed, False otherwise. + True when there is no container or the container was removed from Docker. + False when Docker reported a remove failure and the container may still + exist/running. 
Notes ----- @@ -2092,34 +2292,40 @@ def stop_container(self): self.P("No container to stop", color='r') return True - result = True - removed = False + stopped_ok = True + removed_ok = True try: # Stop the container (gracefully) self.P(f"Stopping container {self.container.short_id}...") self.container.stop(timeout=5) self.P(f"Container {self.container.short_id} stopped successfully") except Exception as e: - result = False + stopped_ok = False self.P(f"Error stopping container: {e}", color='r') # end try try: self.P(f"Removing container {self.container.short_id}...") self.container.remove() - removed = True self.P(f"Container {self.container.short_id} removed successfully") except Exception as e: - result = False + removed_ok = False self.P(f"Error removing container: {e}", color='r') - finally: - if removed: - self.container = None - self.container_id = None - else: - self.P("Preserving container handle after failed stop/remove for retry.", color='r') + if removed_ok: + if not stopped_ok: + self.P( + "Container stop reported an error, but remove succeeded; treating " + "container as stopped for restart/cleanup purposes.", + color='y', + ) + self.container = None + self.container_id = None + else: + # Keep the handle so a later cleanup retry can remove the same Docker + # object instead of losing track of a possibly still-running container. + self.P("Preserving container handle after failed stop/remove for retry.", color='r') # end try - return removed + return removed_ok def _stream_logs(self, log_stream, stop_event=None): @@ -2713,22 +2919,22 @@ def _check_extra_tunnel_health(self): self._maybe_reset_tunnel_retry_counter(container_port) - def _stop_container_and_save_logs_to_disk(self): + def _stop_container_runtime_for_restart(self): """ - Stop the container and all tunnels, then save logs to disk. + Stop runtime sidecars and remove the Docker container. - Performs full shutdown sequence: + Performs the shared pre-restart shutdown sequence: - Clears semaphore (signals dependent plugins container is stopping) - Stops log streaming threads - Stops main tunnel engine - Stops all extra tunnels - Stops and removes container - - Saves logs to disk Returns ------- bool - True when cleanup completed without required-step failures, False otherwise. + True when the Docker container is stopped/removed or absent, False when + Docker reported a failure. """ self.P(f"Stopping container app '{self.container_id}' ...") @@ -2790,18 +2996,62 @@ def join_runtime_threads(): # Stop extra tunnels safe_cleanup_step("extra tunnels", self.stop_extra_tunnels) - # Stop the container if it's running - safe_cleanup_step("docker container", self.stop_container) + def stop_runtime_container(): + stopped = self.stop_container() + self._runtime_stop_degraded = not stopped + if not stopped: + self.P( + "Container runtime stop failed after sidecars were stopped; container " + "may still be running and volume mutation/cleanup must be skipped.", + color='r', + ) + return stopped + + # Stop the container if it's running. A false result preserves the Docker + # handle for retry and prevents volume mutation against a possibly live app. + safe_cleanup_step("docker container", stop_runtime_container) # Stop log streaming threads after Docker stop has had a chance to unblock # the log/exec streams. 
safe_cleanup_step("runtime threads", join_runtime_threads) - # Cleanup fixed-size volumes (unmount + detach loop devices) - safe_cleanup_step("fixed-size volumes", self._cleanup_fixed_size_volumes) + if cleanup_errors: + self._cleanup_failed = True + self.P("Container runtime cleanup completed with failed step(s): {}.".format(", ".join(cleanup_errors)), color='r') + return False + self._runtime_stop_degraded = False + self._cleanup_failed = False + return True + + + def _stop_container_and_save_logs_to_disk(self): + """ + Stop the container, all tunnels, fixed volumes, then save logs to disk. - # Save logs to disk under the instance's `logs/` sibling folder - # (resolves to pipelines_data/{sid}/{iid}/logs/container_logs.pkl) + Returns + ------- + bool + True when cleanup completed without required-step failures, False otherwise. + """ + runtime_ok = self._stop_container_runtime_for_restart() + cleanup_errors = [] + + if runtime_ok: + try: + if self._cleanup_fixed_size_volumes() is False: + cleanup_errors.append("fixed-size volumes") + except Exception as exc: + cleanup_errors.append("fixed-size volumes") + self.P(f"Container cleanup step 'fixed-size volumes' failed: {exc}", color='r') + else: + cleanup_errors.append("runtime") + self.P( + "Skipping fixed-size volume cleanup because container stop/remove failed.", + color='r', + ) + + # Save logs to disk even when cleanup is degraded; logs are diagnostic data + # and should not be lost just because Docker/tunnel teardown needs a retry. try: self.diskapi_save_pickle_to_data( obj=list(self.container_logs), @@ -2811,6 +3061,7 @@ def join_runtime_threads(): self.P("Container logs saved to disk.") except Exception as exc: self.P(f"Failed to save logs: {exc}", color='r') + if cleanup_errors: self._cleanup_failed = True self.P("Container cleanup completed with failed step(s): {}.".format(", ".join(cleanup_errors)), color='r') @@ -3113,6 +3364,36 @@ def _check_image_updates(self, current_time=None): return + def _reset_runtime_state_post_start(self): + """Bring per-restart runtime markers back to a fresh-boot baseline. + + Called after a successful ``start_container()`` to: + - stamp ``container_start_time`` so health-probe elapsed timers measure + from this new boot + - clear readiness gates (``_app_ready``, ``_health_probe_start``, + ``_tunnel_start_allowed``) so health checks re-run against the new + container's state and tunnels gate on the new readiness probe + - clear the command-rerun gate (``_commands_started``) so + BUILD_AND_RUN_COMMANDS rerun against the new container + - re-attach log capture (the prior log thread was stopped at + ``stop_container`` time) + - run image-defined build/run commands + + Shared between ``_restart_container`` and the volume-sync ticks + (``_SyncMixin._sync_safe_start_container``) so they stay in lockstep; + sync slices stop+start the container inline (to keep the system volume + mounted), and without this helper the readiness/probe state would still + point at the previous container instance. + """ + self.container_start_time = self.time() + self._app_ready = False + self._health_probe_start = None + self._tunnel_start_allowed = False + self._commands_started = False + self._start_container_log_stream() + self._maybe_execute_build_and_run() + + def _restart_container(self, stop_reason=None, cleanup_first=True): """ Restart the container from scratch. 
@@ -3169,6 +3450,9 @@ def _restart_container(self, stop_reason=None, cleanup_first=True): self._configure_volumes() self._configure_file_volumes() self._configure_fixed_size_volumes() + self._configure_system_volume() + self._recover_stale_processing() + self._validate_sync_config() # For semaphored containers (consumers), defer env setup and container start # to _handle_initial_launch() which properly waits for provider semaphores. @@ -3187,6 +3471,7 @@ def _restart_container(self, stop_reason=None, cleanup_first=True): # Non-semaphored containers (providers): configure env and start immediately self._configure_dynamic_env() self._setup_env_and_ports() + self._inject_sync_env_vars() # Revalidate extra tunnels self._validate_extra_tunnels_config() @@ -3205,9 +3490,7 @@ def _restart_container(self, stop_reason=None, cleanup_first=True): # start_container already recorded the failure return False - self.container_start_time = self.time() - self._start_container_log_stream() - self._maybe_execute_build_and_run() + self._reset_runtime_state_post_start() return True @@ -3370,6 +3653,7 @@ def _handle_initial_launch(self): # Semaphores ready - dynamic env to resolve shmem values, then setup env self._configure_dynamic_env() self._setup_env_and_ports() + self._inject_sync_env_vars() # end if try: @@ -3462,9 +3746,60 @@ def _perform_additional_checks(self, current_time): return StopReason.EXTERNAL_UPDATE return None """ + # Volume-sync drives stop_container/start_container INLINE so the loopback + # mount survives the archive/extract window. We must not return a + # StopReason from here because that would route through _restart_container, + # which calls _cleanup_fixed_size_volumes() and unmounts before our work + # can run. + if self._sync_enabled(): + role = self._sync_role() + if role == "provider": + self._sync_provider_tick(current_time) + elif role == "consumer": + self._sync_consumer_tick(current_time) return None + def _retry_failed_cleanup(self): + """ + Retry a previously failed cleanup cycle from the normal process loop. + + Cleanup failure is a backoff/retry state, not a permanent latch. This keeps + transient Docker, tunnel, log-reader, or fixed-volume failures visible while + still giving the plugin an automatic recovery path. + """ + if not self._cleanup_failed: + return True + + if self._has_exceeded_max_retries(): + self.P( + "Container cleanup retry abandoned after {} consecutive failure(s).".format( + self._consecutive_failures + ), + color='r', + ) + return False + + if self._is_restart_backoff_active(): + return False + + self.P("Retrying previously failed container cleanup...", color='y') + cleanup_ok = self._stop_container_and_save_logs_to_disk() + if cleanup_ok: + self._cleanup_failed = False + self.P("Previously failed container cleanup succeeded.", color='g') + if self._manual_stop_pending: + self._save_persistent_state(manually_stopped=True) + self._manual_stop_pending = False + self._set_container_state(ContainerState.PAUSED, StopReason.MANUAL_STOP) + return False + return True + + self._record_restart_failure() + self._set_container_state(ContainerState.FAILED, self.stop_reason or StopReason.UNKNOWN) + return False + + def process(self): """ Main process loop for the plugin. 
@@ -3496,11 +3831,7 @@ def process(self): self._last_paused_log = current_time return - if self._cleanup_failed: - current_time = self.time() - if current_time - self._last_paused_log >= self.cfg_paused_state_log_interval: - self.P("Container cleanup previously failed; periodic launch/restart is blocked until cleanup succeeds.") - self._last_paused_log = current_time + if self._cleanup_failed and not self._retry_failed_cleanup(): return if not self.container: diff --git a/extensions/business/container_apps/fixed_volume.py b/extensions/business/container_apps/fixed_volume.py index dd994e6f..6e38a5bc 100644 --- a/extensions/business/container_apps/fixed_volume.py +++ b/extensions/business/container_apps/fixed_volume.py @@ -48,6 +48,33 @@ def _log(logger: Optional[Callable], level: str, message: str) -> None: print(f"[FixedVolume] [{level}] {message}", flush=True) +def _decode_proc_mount_field(value: str) -> str: + """Decode the octal escapes used by /proc/mounts fields.""" + return (value.replace("\\040", " ") + .replace("\\011", "\t") + .replace("\\012", "\n") + .replace("\\134", "\\")) + + +def _get_mount_source(mount_path) -> Optional[str]: + """Return the exact source device mounted at ``mount_path``, if any.""" + try: + with open("/proc/mounts", "r", encoding="utf-8") as f: + lines = f.readlines() + except OSError: + return None + target = str(mount_path).rstrip("/") + for line in lines: + parts = line.split() + if len(parts) < 2: + continue + source = _decode_proc_mount_field(parts[0]) + mp = _decode_proc_mount_field(parts[1]) + if mp.rstrip("/") == target: + return source + return None + + def _is_path_mounted(mount_path) -> bool: """Return True iff `mount_path` is an exact mountpoint in /proc/mounts. @@ -72,11 +99,7 @@ def _is_path_mounted(mount_path) -> bool: parts = line.split() if len(parts) < 2: continue - mp = parts[1] - mp = (mp.replace("\\040", " ") - .replace("\\011", "\t") - .replace("\\012", "\n") - .replace("\\134", "\\")) + mp = _decode_proc_mount_field(parts[1]) if mp.rstrip("/") == target: return True return False @@ -400,16 +423,53 @@ def cleanup( ) result = True loop_dev = None + metadata_error = False if vol.meta_path.exists(): try: meta = json.loads(vol.meta_path.read_text(encoding="utf-8")) loop_dev = meta.get("loop_dev") _log(logger, "INFO", f"Loaded metadata loop_dev={loop_dev}") except Exception as exc: - result = False + metadata_error = True _log(logger, "WARN", f"Failed to read metadata error={exc}") - if _is_path_mounted(vol.mount_path): + mount_source = _get_mount_source(vol.mount_path) + mount_source_is_loop = mount_source and str(mount_source).startswith("/dev/loop") + if mount_source_is_loop and loop_dev is None: + # A mounted loop source is a stronger identity than the sidecar metadata: + # it lets us unmount and detach safely even when metadata was lost/corrupt. + loop_dev = mount_source + metadata_error = False + _log(logger, "WARN", f"Recovered loop device from /proc/mounts loop_dev={loop_dev}") + elif mount_source_is_loop and loop_dev != mount_source: + # Metadata can be stale after interrupted cleanup/restart. The mounted + # source is the device that must be detached after unmount, so prefer it. 
+ _log( + logger, "WARN", + f"Metadata loop_dev={loop_dev} differs from mounted source={mount_source}; using mounted source.", + ) + loop_dev = mount_source + elif mount_source and loop_dev is None: + # A mounted path without a positive loop-device identity must not be + # reported as a clean fixed-volume teardown; callers need to retain it for + # operator inspection/retry instead of dropping cleanup tracking. + result = False + _log(logger, "WARN", f"Mounted path has no loop metadata source={mount_source}") + elif mount_source: + # A fixed-size volume should be mounted from a loop device. If /proc/mounts + # says otherwise, fail closed instead of detaching a possibly unrelated + # metadata loop device and reporting success. + result = False + _log( + logger, "WARN", + f"Mounted path source is not a loop device source={mount_source}; refusing metadata loop detach.", + ) + loop_dev = None + + if metadata_error: + result = False + + if mount_source is not None: try: _run(["umount", str(vol.mount_path)], logger=logger) except Exception as exc: diff --git a/extensions/business/container_apps/sync/__init__.py b/extensions/business/container_apps/sync/__init__.py new file mode 100644 index 00000000..c9f06457 --- /dev/null +++ b/extensions/business/container_apps/sync/__init__.py @@ -0,0 +1,90 @@ +"""Volume-sync subpackage for the Container App Runner. + +The whole feature lives here: + * ``constants.py`` — file names, namespace strings, schema versions, + failure-stage labels. No code, just data. + * ``manager.py`` — ``SyncManager`` class plus host-side path helpers. + Pure I/O orchestration; takes the plugin as ``owner`` and delegates + network/storage to ``owner.r1fs`` / ``owner.chainstore_*``. + * ``mixin.py`` — ``_SyncMixin`` class. Plugin-class integration: + knows when sync work should happen (on_init, _restart_container, + _perform_additional_checks, _handle_initial_launch) and frames each + invocation around a ``stop_container → SyncManager.work → + start_container`` window. + +Re-exports below let callers import from the package root rather than +reaching into individual modules. 
+""" + +from .constants import ( + ARCHIVE_ENCRYPTION, + ARCHIVE_FORMAT, + CHAINSTORE_SYNC_HKEY, + MANIFEST_SCHEMA_VERSION, + STAGE_ARCHIVE_BUILD, + STAGE_CHAINSTORE_PUBLISH, + STAGE_EXTRACT, + STAGE_R1FS_UPLOAD, + STAGE_RUNTIME_STOP, + STAGE_VALIDATION, + SYNC_HISTORY_DIR, + SYNC_HISTORY_RECEIVED, + SYNC_HISTORY_SENT, + SYNC_INVALID_FILE, + SYNC_LAST_APPLY_FILE, + SYNC_PROCESSING_FILE, + SYNC_REQUEST_FILE, + SYNC_RESPONSE_FILE, + SYSTEM_VOLUME_FS, + SYSTEM_VOLUME_MOUNT, + SYSTEM_VOLUME_NAME, + SYSTEM_VOLUME_SIZE, + VOLUME_SYNC_SUBDIR, +) +from .manager import ( + SyncManager, + history_received_dir, + history_root, + history_sent_dir, + sync_state_dir, + system_volume_host_root, + volume_sync_dir, +) +from .mixin import _SyncMixin + +__all__ = [ + # constants + "ARCHIVE_ENCRYPTION", + "ARCHIVE_FORMAT", + "CHAINSTORE_SYNC_HKEY", + "MANIFEST_SCHEMA_VERSION", + "STAGE_ARCHIVE_BUILD", + "STAGE_CHAINSTORE_PUBLISH", + "STAGE_EXTRACT", + "STAGE_R1FS_UPLOAD", + "STAGE_RUNTIME_STOP", + "STAGE_VALIDATION", + "SYNC_HISTORY_DIR", + "SYNC_HISTORY_RECEIVED", + "SYNC_HISTORY_SENT", + "SYNC_INVALID_FILE", + "SYNC_LAST_APPLY_FILE", + "SYNC_PROCESSING_FILE", + "SYNC_REQUEST_FILE", + "SYNC_RESPONSE_FILE", + "SYSTEM_VOLUME_FS", + "SYSTEM_VOLUME_MOUNT", + "SYSTEM_VOLUME_NAME", + "SYSTEM_VOLUME_SIZE", + "VOLUME_SYNC_SUBDIR", + # path helpers + "history_received_dir", + "history_root", + "history_sent_dir", + "sync_state_dir", + "system_volume_host_root", + "volume_sync_dir", + # classes + "SyncManager", + "_SyncMixin", +] diff --git a/extensions/business/container_apps/sync/constants.py b/extensions/business/container_apps/sync/constants.py new file mode 100644 index 00000000..255e36c1 --- /dev/null +++ b/extensions/business/container_apps/sync/constants.py @@ -0,0 +1,53 @@ +"""Volume-sync constants and namespace conventions. + +Hard-coded values (no config knobs) shared by ``SyncManager``, +``_SyncMixin``, and the unit tests. Lives in its own module so a reader +can `cat sync/constants.py` to see the full data-plane vocabulary in one +place — file names, the ChainStore hkey, the stage labels, the schema +version. Anything tunable belongs in the plugin's ``SYNC`` config block, +not here. +""" + +# --------------------------------------------------------------------------- +# System volume — non-configurable defaults +# --------------------------------------------------------------------------- + +SYSTEM_VOLUME_NAME = "r1en_system" # logical name (host paths) +SYSTEM_VOLUME_MOUNT = "/r1en_system" # mount point inside container +SYSTEM_VOLUME_SIZE = "10M" # fixed-size ext4 image — control-plane only +SYSTEM_VOLUME_FS = "ext4" + +# Per-feature subdirectory under the system volume root, so future CAR ↔ app +# control-plane features (not just sync) can coexist without colliding. 
+VOLUME_SYNC_SUBDIR = "volume-sync" + +# Filenames inside // +SYNC_REQUEST_FILE = "request.json" +SYNC_PROCESSING_FILE = "request.json.processing" +SYNC_INVALID_FILE = "request.json.invalid" +SYNC_RESPONSE_FILE = "response.json" +SYNC_LAST_APPLY_FILE = "last_apply.json" + +# Persistent audit folders under /sync_history/ +SYNC_HISTORY_DIR = "sync_history" +SYNC_HISTORY_SENT = "sent" # provider — writes to R1FS +SYNC_HISTORY_RECEIVED = "received" # consumer — reads from R1FS + +# ChainStore namespace +CHAINSTORE_SYNC_HKEY = "CHAINSTORE_SYNC" + +# Manifest schema versioning so consumers can refuse newer-than-known formats +MANIFEST_SCHEMA_VERSION = 1 +ARCHIVE_FORMAT = "tar.gz" +ARCHIVE_ENCRYPTION = "r1fs-default" + +# Stages reported on failure (used in response.json + request.json.invalid) +STAGE_VALIDATION = "validation" +STAGE_ARCHIVE_BUILD = "archive_build" +STAGE_R1FS_UPLOAD = "r1fs_upload" +STAGE_CHAINSTORE_PUBLISH = "chainstore_publish" +STAGE_EXTRACT = "extract" +STAGE_RUNTIME_STOP = "runtime_stop" + +# History entry deletion sub-record default (filled in when superseded). +_UNDELETED = {"deleted_at": None, "deletion_succeeded": None, "deletion_error": None} diff --git a/extensions/business/container_apps/sync/control_files.py b/extensions/business/container_apps/sync/control_files.py new file mode 100644 index 00000000..b51d3e3b --- /dev/null +++ b/extensions/business/container_apps/sync/control_files.py @@ -0,0 +1,327 @@ +"""Helpers for CAR volume-sync JSON control files. + +The sync data plane uses small JSON files in the always-mounted system +volume as a control protocol between the app and CAR. This module owns the +file mechanics: atomic JSON writes, pending-to-processing claims, stale +processing recovery, and processing cleanup. SyncManager keeps the domain +validation and response payload shapes. +""" + +from __future__ import annotations + +import errno +import json +import os +import stat +import tempfile +import time as _time +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Optional + + +@dataclass(frozen=True) +class ClaimedJsonObject: + """A JSON object claimed from a pending control file.""" + + body: dict + raw_body: str + processing_path: Path + + +class JsonControlFileError(Exception): + """Base class for control-file mechanics errors.""" + + def __init__( + self, + message: str, + *, + raw_body: Optional[str] = None, + processing_path: Optional[Path] = None, + ) -> None: + super().__init__(message) + self.raw_body = raw_body + self.processing_path = processing_path + + +class JsonControlFileClaimError(JsonControlFileError): + """The pending file could not be renamed to its processing name.""" + + +class JsonControlFileReadError(JsonControlFileError): + """The processing file could not be read.""" + + +class JsonControlFileDecodeError(JsonControlFileError): + """The processing file was not valid JSON.""" + + +class JsonControlFileObjectError(JsonControlFileError): + """The processing file was JSON, but not a JSON object.""" + + +class JsonControlFileUnsafeError(JsonControlFileError): + """The processing file is not a regular no-follow-readable file.""" + + +def _ensure_real_directory(path: Path, *, create: bool) -> bool: + """Ensure ``path`` is a real directory, not a symlink. + + Returns False only when ``create`` is False and the path is absent. 
+ """ + path = Path(path) + if create: + try: + path.mkdir(parents=True, exist_ok=True) + except FileExistsError: + pass + try: + st = os.lstat(str(path)) + except FileNotFoundError: + if create: + raise + return False + if stat.S_ISLNK(st.st_mode): + raise JsonControlFileUnsafeError( + f"refusing symlink control directory: {path}" + ) + if not stat.S_ISDIR(st.st_mode): + raise JsonControlFileUnsafeError( + f"refusing non-directory control directory: {path}" + ) + return True + + +def write_json_atomic(path: Path, payload: Any) -> None: + """Write JSON to ``path`` atomically and make it app-readable. + + Creates the parent directory if missing. Uses a temporary file in the same + directory so ``os.replace`` is atomic within the filesystem. The final file + is chmod'd to 0o644 so apps can read CAR-owned status/control results + without being able to rewrite them. + """ + path = Path(path) + _ensure_real_directory(path.parent, create=True) + fd, tmp_name = tempfile.mkstemp( + dir=str(path.parent), prefix=f".{path.name}.", suffix=".tmp" + ) + try: + with os.fdopen(fd, "w", encoding="utf-8") as handle: + json.dump(payload, handle, indent=2, sort_keys=True) + handle.flush() + os.fsync(handle.fileno()) + os.chmod(tmp_name, 0o644) + os.replace(tmp_name, str(path)) + except Exception: + try: + os.unlink(tmp_name) + except OSError: + pass + raise + + +class JsonControlFile: + """File-mechanics helper for a single pending/processing JSON control file.""" + + def __init__(self, root: Path, pending_name: str, processing_name: str): + self.root = Path(root) + self.pending_name = pending_name + self.processing_name = processing_name + + @property + def pending_path(self) -> Path: + return self.root / self.pending_name + + @property + def processing_path(self) -> Path: + return self.root / self.processing_name + + def _root_is_safe(self) -> bool: + return _ensure_real_directory(self.root, create=False) + + def has_pending(self) -> bool: + if not self._root_is_safe(): + return False + try: + os.lstat(str(self.pending_path)) + return True + except FileNotFoundError: + return False + + @staticmethod + def _quarantine_directory(path: Path) -> None: + parent = path.parent + base_name = path.name + for _ in range(5): + target = parent / f"{base_name}.unsafe.{_time.time_ns()}" + try: + os.replace(str(path), str(target)) + return + except FileExistsError: + continue + raise JsonControlFileUnsafeError( + f"could not quarantine unsafe control directory: {base_name}", + processing_path=path, + ) + + @classmethod + def _remove_unsafe_entry(cls, path: Path) -> None: + st = os.lstat(str(path)) + if stat.S_ISDIR(st.st_mode): + try: + os.rmdir(str(path)) + except OSError as exc: + if getattr(exc, "errno", None) in (errno.ENOTEMPTY, errno.EEXIST): + cls._quarantine_directory(path) + return + raise + return + os.unlink(str(path)) + + @classmethod + def _reject_non_regular_control_file(cls, path: Path) -> None: + try: + st = os.lstat(str(path)) + except FileNotFoundError: + raise + if stat.S_ISREG(st.st_mode): + return + if stat.S_ISLNK(st.st_mode): + message = f"refusing symlink control file: {path.name}" + else: + message = f"refusing non-regular control file: {path.name}" + try: + cls._remove_unsafe_entry(path) + finally: + raise JsonControlFileUnsafeError( + message, + processing_path=path, + ) + + @staticmethod + def _read_text_no_follow(path: Path) -> str: + flags = os.O_RDONLY + if hasattr(os, "O_NOFOLLOW"): + flags |= os.O_NOFOLLOW + if hasattr(os, "O_NONBLOCK"): + flags |= os.O_NONBLOCK + + fd: Optional[int] = 
None + try: + fd = os.open(str(path), flags) + st = os.fstat(fd) + if not stat.S_ISREG(st.st_mode): + raise JsonControlFileUnsafeError( + f"refusing to read non-regular control file: {path.name}", + processing_path=path, + ) + with os.fdopen(fd, "r", encoding="utf-8") as handle: + fd = None + return handle.read() + except OSError as exc: + if getattr(exc, "errno", None) == errno.ELOOP: + raise JsonControlFileUnsafeError( + f"refusing to read symlink control file: {path.name}", + processing_path=path, + ) from exc + raise + finally: + if fd is not None: + try: + os.close(fd) + except OSError: + pass + + def claim_processing(self) -> Optional[Path]: + """Atomically rename pending -> processing, returning the processing path.""" + if not self.has_pending(): + return None + self._reject_non_regular_control_file(self.pending_path) + try: + os.replace(str(self.pending_path), str(self.processing_path)) + except OSError as exc: + raise JsonControlFileClaimError( + str(exc), processing_path=self.processing_path, + ) from exc + return self.processing_path + + def claim_object(self) -> Optional[ClaimedJsonObject]: + """Claim a pending JSON object control file. + + Returns None when no pending file exists. Raises a JsonControlFileError + subclass for mechanics, JSON decode, or JSON-shape failures. On decode or + shape failure, the processing file remains in place so callers can write + their own failure artifacts and then discard it. + """ + processing_path = self.claim_processing() + if processing_path is None: + return None + + try: + raw_body = self._read_text_no_follow(processing_path) + except UnicodeDecodeError as exc: + raise JsonControlFileDecodeError( + f"invalid UTF-8 in {self.pending_name}: {exc}", + processing_path=processing_path, + ) from exc + except OSError as exc: + raise JsonControlFileReadError( + str(exc), processing_path=processing_path, + ) from exc + + try: + body = json.loads(raw_body) + except json.JSONDecodeError as exc: + raise JsonControlFileDecodeError( + str(exc), raw_body=raw_body, processing_path=processing_path, + ) from exc + + if not isinstance(body, dict): + raise JsonControlFileObjectError( + f"{self.pending_name} must be a JSON object", + raw_body=raw_body, + processing_path=processing_path, + ) + + return ClaimedJsonObject( + body=body, + raw_body=raw_body, + processing_path=processing_path, + ) + + def discard_processing(self) -> None: + if os.path.lexists(str(self.processing_path)): + self._remove_unsafe_entry(self.processing_path) + + def recover_stale_processing(self) -> bool: + """Rename orphan processing -> pending without overwriting a pending file.""" + if not self._root_is_safe(): + return False + if not os.path.lexists(str(self.processing_path)): + return False + st = os.lstat(str(self.processing_path)) + if stat.S_ISLNK(st.st_mode): + os.unlink(str(self.processing_path)) + return False + if stat.S_ISREG(st.st_mode) and not os.path.lexists(str(self.pending_path)): + os.replace(str(self.processing_path), str(self.pending_path)) + return True + if not stat.S_ISREG(st.st_mode): + self._remove_unsafe_entry(self.processing_path) + return False + + def write_json(self, file_name: str, payload: Any) -> None: + write_json_atomic(self.root / file_name, payload) + + +__all__ = [ + "ClaimedJsonObject", + "JsonControlFile", + "JsonControlFileClaimError", + "JsonControlFileDecodeError", + "JsonControlFileError", + "JsonControlFileObjectError", + "JsonControlFileReadError", + "JsonControlFileUnsafeError", + "write_json_atomic", +] diff --git 
a/extensions/business/container_apps/sync/manager.py b/extensions/business/container_apps/sync/manager.py new file mode 100644 index 00000000..aacc2117 --- /dev/null +++ b/extensions/business/container_apps/sync/manager.py @@ -0,0 +1,1959 @@ +"""Volume-sync manager for the Container App Runner. + +Coordinates publishing app-state snapshots to R1FS+ChainStore (provider) and +applying them on remote nodes (consumer). The contract with the app inside +the container is file-based, mediated through the always-on system volume +mounted at ``/r1en_system``: + + app writes /r1en_system/volume-sync/request.json (one-shot) + CAR writes /r1en_system/volume-sync/response.json (provider, paired) + CAR writes /r1en_system/volume-sync/last_apply.json (consumer) + CAR writes /r1en_system/volume-sync/request.json.invalid (failed request body + diagnostics) + +Persistent per-plugin audit trail lives under +``/sync_history/{sent,received}/__.json`` +so both sides can be inspected with ``ls`` / ``cat`` / ``jq`` after the fact. + +See ``extensions/business/container_apps/README.md`` for the public +operator/app contract. +""" + +from __future__ import annotations + +import json +import os +import copy +import hashlib +import stat +import tarfile +import tempfile +import time as _time +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Optional + +from extensions.business.container_apps.container_utils import ( + CONTAINER_VOLUMES_PATH, +) + +from .control_files import ( + JsonControlFile, + JsonControlFileClaimError, + JsonControlFileDecodeError, + JsonControlFileObjectError, + JsonControlFileReadError, + JsonControlFileUnsafeError, + write_json_atomic, +) + +_HISTORY_WRITTEN_AT_NS = "history_written_at_ns" +_SYNC_STATE_DIR = "state" +_SYNC_APPLY_STATE_FILE = "current_apply.json" +_SYNC_QUARANTINE_DIR = "quarantine" +_BAD_CID_RETRY_BASE_SECONDS = 60.0 +_BAD_CID_RETRY_MAX_SECONDS = 3600.0 +PROVIDER_CAPTURE_OFFLINE = "offline" +PROVIDER_CAPTURE_ONLINE = "online" +CONSUMER_APPLY_OFFLINE_RESTART = "offline_restart" +CONSUMER_APPLY_ONLINE_NO_RESTART = "online_no_restart" +CONSUMER_APPLY_ONLINE_RESTART = "online_restart" +_PROVIDER_CAPTURE_MODES = {PROVIDER_CAPTURE_OFFLINE, PROVIDER_CAPTURE_ONLINE} +_CONSUMER_APPLY_MODES = { + CONSUMER_APPLY_OFFLINE_RESTART, + CONSUMER_APPLY_ONLINE_NO_RESTART, + CONSUMER_APPLY_ONLINE_RESTART, +} + +from .constants import ( + ARCHIVE_ENCRYPTION, + ARCHIVE_FORMAT, + CHAINSTORE_SYNC_HKEY, + MANIFEST_SCHEMA_VERSION, + STAGE_ARCHIVE_BUILD, + STAGE_CHAINSTORE_PUBLISH, + STAGE_EXTRACT, + STAGE_R1FS_UPLOAD, + STAGE_VALIDATION, + SYNC_HISTORY_DIR, + SYNC_HISTORY_RECEIVED, + SYNC_HISTORY_SENT, + SYNC_INVALID_FILE, + SYNC_LAST_APPLY_FILE, + SYNC_PROCESSING_FILE, + SYNC_REQUEST_FILE, + SYNC_RESPONSE_FILE, + SYSTEM_VOLUME_FS, + SYSTEM_VOLUME_MOUNT, + SYSTEM_VOLUME_NAME, + SYSTEM_VOLUME_SIZE, + VOLUME_SYNC_SUBDIR, + _UNDELETED, +) + + +@dataclass(frozen=True) +class SyncRuntimePolicy: + provider_capture: str = PROVIDER_CAPTURE_OFFLINE + consumer_apply: str = CONSUMER_APPLY_OFFLINE_RESTART + + +@dataclass(frozen=True) +class SyncRequest: + archive_paths: list[str] + metadata: dict + runtime: SyncRuntimePolicy + + +@dataclass(frozen=True) +class PlannedApplyMember: + container_name: str + host_path: str + host_root: str + staging_path: Optional[Path] + mode: int + is_dir: bool + + +@dataclass(frozen=True) +class PreparedApply: + record: dict + cid: str + version: int + local_path: str + staging_dir: Path + members: list[PlannedApplyMember] + manifest: dict + + 
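+# Illustrative request.json that an app might drop into the volume-sync
+# directory and that ``claim_request`` (below) validates into a ``SyncRequest``.
+# A sketch only: the field names come from ``SyncRequest`` and
+# ``_parse_runtime_policy``; the paths and metadata values are hypothetical.
+#
+#   {
+#     "archive_paths": ["/app/data"],
+#     "metadata": {"app_version": "1.2.3"},
+#     "runtime": {
+#       "provider_capture": "offline",
+#       "consumer_apply": "offline_restart"
+#     }
+#   }
+
+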
+@dataclass(frozen=True) +class ApplyResult: + success: bool + restart_safe: bool + state: str + extracted_paths: list[str] + error: Optional[str] = None + + +@dataclass(frozen=True) +class DirectoryMetadata: + uid: int + gid: int + mode: int + + +@dataclass(frozen=True) +class ApplyRollbackOp: + op: str + path: str + backup: Optional[str] = None + metadata: Optional[DirectoryMetadata] = None + + +def runtime_policy_to_dict(runtime: SyncRuntimePolicy) -> dict: + return { + "provider_capture": runtime.provider_capture, + "consumer_apply": runtime.consumer_apply, + } + + +# --------------------------------------------------------------------------- +# Path helpers (host-side) +# --------------------------------------------------------------------------- + +def system_volume_host_root(owner) -> Path: + """Host-side root of the system volume's loopback mount. + + The system volume is provisioned via the same machinery as + FIXED_SIZE_VOLUMES, so its mount lives at: + /fixed_volumes/mounts// + """ + return ( + Path(owner.get_data_folder()) + / owner._get_instance_data_subfolder() + / "fixed_volumes" / "mounts" / SYSTEM_VOLUME_NAME + ) + + +def volume_sync_dir(owner) -> Path: + """Host-side path of the volume-sync control-plane subdir.""" + return system_volume_host_root(owner) / VOLUME_SYNC_SUBDIR + + +def history_root(owner) -> Path: + """Host-side root of the per-plugin sync history folders.""" + return ( + Path(owner.get_data_folder()) + / owner._get_instance_data_subfolder() + / SYNC_HISTORY_DIR + ) + + +def history_sent_dir(owner) -> Path: + return history_root(owner) / SYNC_HISTORY_SENT + + +def history_received_dir(owner) -> Path: + return history_root(owner) / SYNC_HISTORY_RECEIVED + + +def sync_state_dir(owner) -> Path: + """Host-private sync state root; never mounted into the app container.""" + return history_root(owner) / _SYNC_STATE_DIR + + +def apply_state_path(owner) -> Path: + return sync_state_dir(owner) / _SYNC_APPLY_STATE_FILE + + +def quarantine_dir(owner) -> Path: + return sync_state_dir(owner) / _SYNC_QUARANTINE_DIR + + +# --------------------------------------------------------------------------- +# SyncManager +# --------------------------------------------------------------------------- + +class SyncManager: + """Pure orchestration layer driven by ``_SyncMixin`` ticks. + + All file I/O is rooted at host-side paths derived from the plugin's per- + instance data folder. Network/storage operations are delegated to the + plugin's ``self.r1fs`` and ``self.chainstore_*`` APIs. + + Required attributes on ``owner``: + - P, time (BasePlugin) + - get_data_folder, _get_instance_data_subfolder (BasePlugin) + - volumes (dict, populated by CAR) + - r1fs (R1FSEngine) + - chainstore_hset, chainstore_hget, chainstore_hsync (BasePlugin API) + - cfg_sync_key, cfg_sync_type (CAR config — propagated by mixin) + - ee_id (BasePlugin — node identity) + """ + + # Fallback used by fetch_latest when the owner doesn't expose + # cfg_sync_hsync_poll_interval (e.g. test fixtures or older configs). + # Mirrors _SyncMixin._HSYNC_POLL_INTERVAL_DEFAULT. + _DEFAULT_HSYNC_POLL_INTERVAL = 60.0 + _DEFAULT_HSYNC_FAILURE_RETRY_INTERVAL = 30.0 + + def __init__(self, owner): + self.owner = owner + # Timestamp (owner.time() units) of the last hsync attempt. Initial 0 + # guarantees the first ``fetch_latest`` call still hsyncs. 
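+        # (``fetch_latest`` re-runs ``chainstore_hsync`` only when
+        # now - self._last_hsync reaches the configured SYNC.HSYNC_POLL_INTERVAL.)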
+ self._last_hsync = 0.0 + + def _request_control_file(self) -> JsonControlFile: + return JsonControlFile( + volume_sync_dir(self.owner), SYNC_REQUEST_FILE, SYNC_PROCESSING_FILE + ) + + @staticmethod + def _validate_container_path_shape(container_path: str) -> None: + if not isinstance(container_path, str) or not container_path: + raise ValueError(f"archive_paths entry must be a non-empty string: {container_path!r}") + + parts = container_path.split("/") + if any(p == ".." for p in parts): + raise ValueError(f"archive_paths entries must not contain '..': {container_path!r}") + + cp = os.path.normpath(container_path) + if not cp.startswith("/"): + raise ValueError(f"archive_paths entries must be absolute: {container_path!r}") + + if cp == SYSTEM_VOLUME_MOUNT or cp.startswith(SYSTEM_VOLUME_MOUNT + "/"): + raise ValueError( + f"refusing to archive system volume content (anti-recursion): {container_path!r}" + ) + return + + # ----- path resolution ------------------------------------------------- + def resolve_container_path(self, container_path: str) -> tuple[str, str, str]: + """Map an app-perspective absolute path to a host path via owner.volumes. + + Enforces the six-rule check from the plan: + 1. absolute, 2. covered by a mount, 3. backed by a volume-managed + mount (fixed-size OR legacy VOLUMES — both are per-instance host + directories under known roots; anonymous Docker mounts and ephemeral + container fs are still rejected), 4. not inside the system volume, + 5. no ``..`` after normalization, 6. resolved host path stays within + its host_root. + + Returns ``(host_path, bind_root, host_root)`` on success, raises + ``ValueError`` on any rule violation. + """ + self._validate_container_path_shape(container_path) + cp = os.path.normpath(container_path) + + # Rule 3 allow-list — both eligible roots are bounded, per-instance, and + # inside the edge node's data root: + # - fixed_volumes/mounts/ : FIXED_SIZE_VOLUMES (ext4 loopbacks) + # - CONTAINER_VOLUMES_PATH : legacy VOLUMES (raw bind dirs, deprecated + # but still in use by some pipelines). These are functionally + # equivalent for sync purposes: a per-instance host directory + # identified by a known parent root. + # Anonymous Docker mounts, FILE_VOLUMES (content-injected single files), + # and ephemeral container fs all sit outside both roots and are rejected. + fixed_root_marker = os.sep + os.path.join("fixed_volumes", "mounts") + os.sep + legacy_root_marker = os.path.normpath(CONTAINER_VOLUMES_PATH) + os.sep + + # Collect every mount whose bind prefix covers cp, then pick the longest. + # Docker overlays the more specific mount on top of the broader one inside + # the container (e.g. /app/data is shadowed onto /app), so the longest- + # prefix match is the one that actually serves reads/writes for cp. The + # previous first-match-wins iteration used dict insertion order, which has + # no relationship to overlay specificity and could resolve to the wrong + # host root for nested mounts. + volumes = getattr(self.owner, "volumes", {}) or {} + matches: list[tuple[str, str]] = [] + for host_root, spec in volumes.items(): + if not isinstance(spec, dict): + continue + bind = str(spec.get("bind", "")).rstrip("/") + if not bind: + continue + # Rule 2: container path must fall under this mount's bind point. 
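+            # (Nested mounts can both cover cp, e.g. binds at /app and
+            # /app/data for /app/data/x; every covering mount is collected
+            # here and the longest bind wins in the max() below.)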
+ if cp != bind and not cp.startswith(bind + "/"): + continue + matches.append((str(host_root), bind)) + + if not matches: + raise ValueError(f"no mounted volume covers {container_path!r}") + + host_root, bind = max(matches, key=lambda hb: len(hb[1])) + host_root_n = os.path.normpath(host_root) + # Rule 3: the winning mount's host root must fall under a known + # volume-managed root (fixed-size or legacy VOLUMES). See the allow-list + # construction above for the rationale and the list of rejected cases. + host_root_with_sep = host_root_n + os.sep + if not ( + fixed_root_marker in host_root_with_sep + or host_root_with_sep.startswith(legacy_root_marker) + ): + raise ValueError( + f"refusing non-volume-backed mount for {container_path!r}: " + f"host_root={host_root_n!r} (only FIXED_SIZE_VOLUMES or legacy " + f"VOLUMES paths allowed; expected host root under " + f"{fixed_root_marker.strip(os.sep)!r} or " + f"{CONTAINER_VOLUMES_PATH!r})" + ) + + rel = "" if cp == bind else os.path.relpath(cp, bind) + host_path = os.path.normpath(os.path.join(host_root_n, rel)) + # Rule 6: resolved path must stay within host_root. + if not (host_path == host_root_n or host_path.startswith(host_root_n + os.sep)): + raise ValueError( + f"resolved host path escapes mount root: {container_path!r} -> {host_path!r}" + ) + return host_path, bind, host_root_n + + @staticmethod + def _is_within_root(path: str, root: str) -> bool: + path_n = os.path.normpath(path) + root_n = os.path.normpath(root) + return path_n == root_n or path_n.startswith(root_n + os.sep) + + @staticmethod + def _archive_arcname(container_root: str, rel_path: str) -> str: + root = os.path.normpath(container_root) + if rel_path in ("", "."): + return root + return os.path.normpath(os.path.join(root, rel_path)) + + @staticmethod + def _safe_extract_mode(member_mode: int, *, is_dir: bool) -> int: + normal_bits = member_mode & 0o777 + minimum = 0o755 if is_dir else 0o644 + return normal_bits | minimum + + def _validate_archive_source_path( + self, + host_path: str, + host_root: str, + container_path: str, + ) -> int: + """Validate an offline archive source without following symlinks.""" + host_path_n = os.path.normpath(host_path) + host_root_n = os.path.normpath(host_root) + if not self._is_within_root(host_path_n, host_root_n): + raise ValueError( + f"archive source escapes volume root: {container_path!r} -> {host_path_n!r}" + ) + rel = os.path.relpath(host_path_n, host_root_n) + current = host_root_n + for part in [] if rel == "." 
else rel.split(os.sep): + current = os.path.join(current, part) + try: + st = os.lstat(current) + except FileNotFoundError as exc: + raise FileNotFoundError( + f"archive_paths target does not exist on host: " + f"{container_path!r} -> {host_path_n!r}" + ) from exc + if stat.S_ISLNK(st.st_mode): + raise ValueError( + f"archive source contains symlink: {container_path!r} -> {current!r}" + ) + root_real = os.path.realpath(host_root_n) + path_real = os.path.realpath(host_path_n) + if not self._is_within_root(path_real, root_real): + raise ValueError( + f"archive source escapes volume root: {container_path!r} -> {host_path_n!r}" + ) + return os.lstat(host_path_n).st_mode + + def _add_offline_archive_path( + self, + tar: tarfile.TarFile, + container_path: str, + host_path: str, + host_root: str, + ) -> None: + mode = self._validate_archive_source_path( + host_path, host_root, container_path + ) + if stat.S_ISREG(mode): + tar.add(host_path, arcname=os.path.normpath(container_path), recursive=False) + return + if not stat.S_ISDIR(mode): + raise ValueError( + f"archive source is not a regular file or directory: {container_path!r}" + ) + + for current_root, dirnames, filenames in os.walk( + host_path, topdown=True, followlinks=False + ): + rel_root = os.path.relpath(current_root, host_path) + current_container = self._archive_arcname(container_path, rel_root) + current_mode = self._validate_archive_source_path( + current_root, host_root, current_container + ) + if stat.S_ISLNK(current_mode): + raise ValueError( + f"archive source contains symlink: {current_container!r}" + ) + tar.add(current_root, arcname=current_container, recursive=False) + + kept_dirs: list[str] = [] + for name in dirnames: + child = os.path.join(current_root, name) + child_container = self._archive_arcname( + container_path, os.path.relpath(child, host_path) + ) + child_mode = self._validate_archive_source_path( + child, host_root, child_container + ) + if stat.S_ISLNK(child_mode): + raise ValueError( + f"archive source contains symlink: {child_container!r}" + ) + if not stat.S_ISDIR(child_mode): + raise ValueError( + f"archive source is not a directory: {child_container!r}" + ) + kept_dirs.append(name) + dirnames[:] = kept_dirs + + for name in filenames: + child = os.path.join(current_root, name) + child_container = self._archive_arcname( + container_path, os.path.relpath(child, host_path) + ) + child_mode = self._validate_archive_source_path( + child, host_root, child_container + ) + if not stat.S_ISREG(child_mode): + raise ValueError( + f"archive source is not a regular file: {child_container!r}" + ) + tar.add(child, arcname=child_container, recursive=False) + + # ----- atomic I/O ------------------------------------------------------- + def _write_json_atomic(self, path: Path, payload: Any) -> None: + """Write JSON to ``path`` atomically (tmp + ``os.replace``). + + Creates the parent directory if missing. Uses a NamedTemporaryFile in + the same directory so ``os.replace`` is an atomic rename within one + filesystem. The final file is chmod'd to 0o644 because CAR runs as + root inside the edge node but the app inside the container typically + runs as a non-root user. Apps can read response.json / last_apply.json / + request.json.invalid, but cannot rewrite CAR-owned outputs. + """ + write_json_atomic(path, payload) + + # ----- history --------------------------------------------------------- + @staticmethod + def _history_filename(version: int, cid: str) -> str: + """Build the canonical filename for a history entry. 
+ + ``<10-digit-version>__<12-char-cid>.json`` so lexical sort matches + chronological order (version is a Unix timestamp). + """ + short_cid = (cid or "")[:12] or "no_cid" + # safe_path_component-like sanitisation kept simple — CIDs are base58. + safe_short = "".join(ch if ch.isalnum() else "_" for ch in short_cid) + return f"{int(version):010d}__{safe_short}.json" + + def _ensure_history_dirs(self) -> None: + history_sent_dir(self.owner).mkdir(parents=True, exist_ok=True) + history_received_dir(self.owner).mkdir(parents=True, exist_ok=True) + + def _append_history(self, history_dir: Path, entry: dict) -> Path: + self._ensure_history_dirs() + fname = self._history_filename(entry.get("version", 0), entry.get("cid", "")) + path = history_dir / fname + payload = dict(entry) + payload.setdefault(_HISTORY_WRITTEN_AT_NS, _time.time_ns()) + payload.setdefault("deletion", dict(_UNDELETED)) + self._write_json_atomic(path, payload) + return path + + def _read_history_entries(self, history_dir: Path) -> list[tuple[Path, dict, int]]: + """Read history JSON files with stable insertion-order metadata. + + ``history_written_at_ns`` is set when an entry is first appended and is + preserved by deletion updates. Older history files fall back to mtime. + """ + entries = [] + if not history_dir.is_dir(): + return entries + for path in history_dir.iterdir(): + if path.suffix != ".json": + continue + try: + with path.open("r", encoding="utf-8") as handle: + entry = json.load(handle) + except (OSError, json.JSONDecodeError) as exc: + self.owner.P(f"[sync] failed to read history file {path}: {exc}", color="r") + continue + written_at = entry.get(_HISTORY_WRITTEN_AT_NS) + if not isinstance(written_at, int): + written_at = path.stat().st_mtime_ns + entries.append((path, entry, written_at)) + return entries + + def append_sent(self, entry: dict) -> Path: + """Write a provider history entry to sync_history/sent/.""" + return self._append_history(history_sent_dir(self.owner), entry) + + def append_received(self, entry: dict) -> Path: + """Write a consumer history entry to sync_history/received/.""" + return self._append_history(history_received_dir(self.owner), entry) + + def _latest_in(self, history_dir: Path) -> Optional[dict]: + """Return the most recently *written* history entry. + + Sorts by the append-time marker, not by filename. Filenames are + version-prefixed for chronological browsability under normal operation, + but the consumer's "what did I last apply?" question is about insert + order, not about whatever ``version`` happens to be in the entry. + Older files without that marker fall back to mtime. 
+ """ + entries = self._read_history_entries(history_dir) + if not entries: + return None + _, latest, _ = max(entries, key=lambda item: item[2]) + return latest + + def latest_sent(self) -> Optional[dict]: + """Return the most recent provider history entry, or None if empty.""" + return self._latest_in(history_sent_dir(self.owner)) + + def latest_received(self) -> Optional[dict]: + """Return the most recent consumer history entry, or None if empty.""" + return self._latest_in(history_received_dir(self.owner)) + + def _write_apply_state( + self, + state: str, + record: dict, + **extra: Any, + ) -> dict: + payload = { + "state": state, + "cid": record.get("cid") if isinstance(record, dict) else None, + "version": record.get("version") if isinstance(record, dict) else None, + "timestamp": self.owner.time(), + } + payload.update(extra) + self._write_json_atomic(apply_state_path(self.owner), payload) + return payload + + def read_apply_state(self) -> Optional[dict]: + path = apply_state_path(self.owner) + try: + with path.open("r", encoding="utf-8") as handle: + state = json.load(handle) + except (OSError, json.JSONDecodeError): + return None + return state if isinstance(state, dict) else None + + def latest_applied(self) -> Optional[dict]: + """Return the durable last-applied state, falling back to old history.""" + state = self.read_apply_state() + if state and state.get("state") == "applied" and state.get("cid"): + return state + return self.latest_received() + + @staticmethod + def _record_digest(record: dict) -> str: + payload = { + "cid": record.get("cid"), + "manifest": record.get("manifest") if isinstance(record, dict) else None, + } + encoded = json.dumps(payload, sort_keys=True, separators=(",", ":"), default=str) + return hashlib.sha256(encoded.encode("utf-8")).hexdigest() + + def _quarantine_path(self, record: dict) -> Path: + cid = str(record.get("cid") or "no_cid") + safe_cid = "".join(ch if ch.isalnum() else "_" for ch in cid)[:32] or "no_cid" + return quarantine_dir(self.owner) / f"{safe_cid}__{self._record_digest(record)[:16]}.json" + + def _record_preflight_failure(self, record: dict, stage: str, error: str) -> None: + now = self.owner.time() + path = self._quarantine_path(record) + previous = {} + try: + with path.open("r", encoding="utf-8") as handle: + loaded = json.load(handle) + if isinstance(loaded, dict): + previous = loaded + except (OSError, json.JSONDecodeError): + previous = {} + + failure_count = int(previous.get("failure_count") or 0) + 1 + retry_after = min( + _BAD_CID_RETRY_MAX_SECONDS, + _BAD_CID_RETRY_BASE_SECONDS * (2 ** min(failure_count - 1, 5)), + ) + payload = { + "cid": record.get("cid"), + "version": record.get("version"), + "manifest_digest": self._record_digest(record), + "stage": stage, + "error": error, + "failure_count": failure_count, + "first_seen": previous.get("first_seen", now), + "last_failed": now, + "next_retry_after": now + retry_after, + } + try: + self._write_json_atomic(path, payload) + except Exception as exc: + self.owner.P(f"[sync] failed to write quarantine state: {exc}", color="r") + return + try: + self._write_apply_state("failed_preflight", record, stage=stage, error=error) + except Exception as exc: + self.owner.P(f"[sync] failed to write apply preflight state: {exc}", color="r") + + def quarantined_record(self, record: dict) -> Optional[dict]: + path = self._quarantine_path(record) + try: + with path.open("r", encoding="utf-8") as handle: + payload = json.load(handle) + except (OSError, json.JSONDecodeError): + return None + 
if not isinstance(payload, dict): + return None + next_retry_after = payload.get("next_retry_after") + if isinstance(next_retry_after, (int, float)) and self.owner.time() < next_retry_after: + return payload + return None + + def update_history_deletion( + self, history_dir: Path, entry: dict, succeeded: bool, error: Optional[str] + ) -> None: + """Update the deletion sub-record on an existing history entry. + + Atomic via tmp+rename. Identifies the file by its filename convention + (``__.json``) derived from the entry's fields. + Silently logs and returns if the file isn't found. + """ + fname = self._history_filename(entry.get("version", 0), entry.get("cid", "")) + path = Path(history_dir) / fname + if not path.is_file(): + self.owner.P( + f"[sync] history file missing for deletion update: {path}", color="y" + ) + return + try: + with path.open("r", encoding="utf-8") as handle: + data = json.load(handle) + except (OSError, json.JSONDecodeError) as exc: + self.owner.P( + f"[sync] failed to read history file for deletion update {path}: {exc}", + color="r", + ) + return + data["deletion"] = { + "deleted_at": self.owner.time() if succeeded else None, + "deletion_succeeded": bool(succeeded), + "deletion_error": error, + } + self._write_json_atomic(path, data) + + # ----- provider -------------------------------------------------------- + def _fail_request( + self, + request_body: Optional[dict], + stage: str, + error: str, + processing_path: Optional[Path], + raw_body: Optional[str] = None, + ) -> None: + """Write request.json.invalid + response.json (error), discard .processing. + + Used by both claim_request validation failures and publish_snapshot + execution failures so the artifact pair is consistent across stages. + """ + failed_ts = self.owner.time() + node_id = getattr(self.owner, "ee_id", None) or getattr(self.owner, "node_id", None) + invalid_payload: dict[str, Any] = { + "request": request_body, # may be None for malformed JSON + "_error": { + "stage": stage, + "error": error, + "failed_timestamp": failed_ts, + "node_id": node_id, + }, + } + if raw_body is not None and request_body is None: + invalid_payload["_error"]["raw_body"] = raw_body[:1024] + + control_file = self._request_control_file() + try: + control_file.write_json(SYNC_INVALID_FILE, invalid_payload) + except Exception as exc: + self.owner.P(f"[sync] failed to write request.json.invalid: {exc}", color="r") + + archive_paths: list[Any] = [] + if isinstance(request_body, dict): + ap = request_body.get("archive_paths") + if isinstance(ap, list): + archive_paths = ap + response_payload = { + "status": "error", + "stage": stage, + "error": error, + "failed_timestamp": failed_ts, + "archive_paths": archive_paths, + } + try: + control_file.write_json(SYNC_RESPONSE_FILE, response_payload) + except Exception as exc: + self.owner.P(f"[sync] failed to write response.json: {exc}", color="r") + + if processing_path is not None and os.path.lexists(str(processing_path)): + try: + control_file.discard_processing() + except OSError as exc: + self.owner.P( + f"[sync] failed to delete .processing after error: {exc}", color="r" + ) + + def _parse_runtime_policy(self, body: dict) -> SyncRuntimePolicy: + runtime = body.get("runtime") or {} + if not isinstance(runtime, dict): + raise ValueError("runtime must be a JSON object") + + provider_capture = runtime.get("provider_capture", PROVIDER_CAPTURE_OFFLINE) + consumer_apply = runtime.get("consumer_apply", CONSUMER_APPLY_OFFLINE_RESTART) + + if provider_capture not in _PROVIDER_CAPTURE_MODES: + 
allowed = ", ".join(sorted(_PROVIDER_CAPTURE_MODES)) + raise ValueError( + f"runtime.provider_capture must be one of [{allowed}], got {provider_capture!r}" + ) + if consumer_apply not in _CONSUMER_APPLY_MODES: + allowed = ", ".join(sorted(_CONSUMER_APPLY_MODES)) + raise ValueError( + f"runtime.consumer_apply must be one of [{allowed}], got {consumer_apply!r}" + ) + + return SyncRuntimePolicy( + provider_capture=provider_capture, + consumer_apply=consumer_apply, + ) + + def claim_request(self) -> Optional[SyncRequest]: + """Atomically claim the pending request.json, validate, return its payload. + + On success: renames ``request.json`` → ``request.json.processing``, + returns a ``SyncRequest``. + On any failure (no file, malformed JSON, validation): writes + ``request.json.invalid`` (request body + ``_error`` diagnostics) and + ``response.json`` (error shape), discards the ``.processing`` file, and + returns ``None``. + """ + control_file = self._request_control_file() + try: + claimed = control_file.claim_object() + except JsonControlFileClaimError as exc: + self.owner.P( + f"[sync] could not rename request.json -> .processing: {exc}", color="r" + ) + return None + except JsonControlFileReadError as exc: + self._fail_request( + None, STAGE_VALIDATION, + f"could not read .processing: {exc}", control_file.processing_path, + ) + return None + except JsonControlFileUnsafeError as exc: + self._fail_request( + None, STAGE_VALIDATION, + str(exc), control_file.processing_path, + ) + return None + except JsonControlFileDecodeError as exc: + self._fail_request( + None, STAGE_VALIDATION, + f"malformed JSON: {exc}", control_file.processing_path, + raw_body=exc.raw_body, + ) + return None + except JsonControlFileObjectError as exc: + self._fail_request( + None, STAGE_VALIDATION, + str(exc), control_file.processing_path, raw_body=exc.raw_body, + ) + return None + + if claimed is None: + return None # nothing pending + + body = claimed.body + proc_path = claimed.processing_path + + archive_paths = body.get("archive_paths") + metadata = body.get("metadata", {}) or {} + if not isinstance(metadata, dict): + self._fail_request( + body, STAGE_VALIDATION, "metadata must be a JSON object", proc_path + ) + return None + + try: + runtime = self._parse_runtime_policy(body) + except ValueError as exc: + self._fail_request(body, STAGE_VALIDATION, str(exc), proc_path) + return None + + if not isinstance(archive_paths, list) or not archive_paths: + self._fail_request( + body, STAGE_VALIDATION, + "archive_paths must be a non-empty list of container-absolute paths", + proc_path, + ) + return None + + for entry in archive_paths: + try: + if runtime.provider_capture == PROVIDER_CAPTURE_ONLINE: + if not bool( + getattr(self.owner, "cfg_sync_allow_online_provider_capture", False) + ): + raise ValueError( + "runtime.provider_capture='online' requires local " + "SYNC.ALLOW_ONLINE_PROVIDER_CAPTURE=True" + ) + self._validate_container_path_shape(entry) + else: + self.resolve_container_path(entry) + except ValueError as exc: + self._fail_request(body, STAGE_VALIDATION, str(exc), proc_path) + return None + + return SyncRequest( + archive_paths=list(archive_paths), + metadata=dict(metadata), + runtime=runtime, + ) + + @staticmethod + def _docker_member_arcname(container_path: str, docker_name: str, member_name: str) -> str: + target = os.path.normpath(container_path).rstrip("/") + base = (docker_name or os.path.basename(target)).strip("/") + raw = member_name.strip("/") + + if base and raw == base: + return target.lstrip("/") + if 
base and raw.startswith(base + "/"): + suffix = raw[len(base) + 1:] + return f"{target}/{suffix}".lstrip("/") + return f"{target}/{raw}".lstrip("/") + + def _append_docker_archive_path(self, tar: tarfile.TarFile, container_path: str) -> None: + container = getattr(self.owner, "container", None) + if container is None: + raise RuntimeError("online provider capture requires a running container") + + self._validate_container_path_shape(container_path) + bits, stat = container.get_archive(container_path) + + output_dir = Path(tempfile.gettempdir()) + get_output = getattr(self.owner, "get_output_folder", None) + if callable(get_output): + output_dir = Path(get_output()) + output_dir.mkdir(parents=True, exist_ok=True) + + fd, tmp_name = tempfile.mkstemp( + dir=str(output_dir), + prefix="sync_docker_archive_", + suffix=".tar", + ) + try: + with os.fdopen(fd, "wb") as handle: + if isinstance(bits, (bytes, bytearray)): + handle.write(bits) + else: + for chunk in bits: + handle.write(chunk) + + docker_name = (stat or {}).get("name") or os.path.basename( + os.path.normpath(container_path) + ) + with tarfile.open(tmp_name, "r:*") as src: + for member in src.getmembers(): + if any(part == ".." for part in member.name.split("/")): + raise ValueError(f"docker archive member name contains '..': {member.name!r}") + new_member = copy.copy(member) + new_member.name = self._docker_member_arcname( + container_path, docker_name, member.name + ) + fileobj = src.extractfile(member) if member.isfile() else None + tar.addfile(new_member, fileobj) + finally: + try: + os.unlink(tmp_name) + except OSError: + pass + return + + def make_archive( + self, + archive_paths: list[str], + provider_capture: str = PROVIDER_CAPTURE_OFFLINE, + ) -> tuple[str, int]: + """Build the snapshot tar.gz under the plugin output folder. + + Tar member names are the **container paths** (so consumers can reverse- + resolve via their own self.volumes). Returns ``(tar_path, size_bytes)``. + Offline capture re-runs ``resolve_container_path`` for each entry as + defence in depth. Online capture uses Docker's archive API against the + running container, allowing non-mounted provider paths. 
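+
+        Illustrative member naming (hypothetical paths): archiving
+        ``/app/data`` containing ``model.bin`` yields an arcname of
+        ``/app/data/model.bin``; ``tarfile`` stores it without the leading
+        ``/`` (``app/data/model.bin``), and the consumer re-prefixes ``/``
+        before reverse-resolving it against its own mounts.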
+ """ + output_dir: Path + get_output = getattr(self.owner, "get_output_folder", None) + if callable(get_output): + output_dir = Path(get_output()) + else: + output_dir = Path(tempfile.gettempdir()) + output_dir.mkdir(parents=True, exist_ok=True) + + ts = int(self.owner.time()) + tar_path = output_dir / f"sync_archive_{ts}_{os.getpid()}.tar.gz" + + with tarfile.open(str(tar_path), "w:gz") as tar: + for container_path in archive_paths: + if provider_capture == PROVIDER_CAPTURE_ONLINE: + if not bool( + getattr(self.owner, "cfg_sync_allow_online_provider_capture", False) + ): + raise ValueError( + "provider_capture='online' requires local " + "SYNC.ALLOW_ONLINE_PROVIDER_CAPTURE=True" + ) + self._append_docker_archive_path(tar, container_path) + else: + host_path, _bind, host_root = self.resolve_container_path(container_path) + self._add_offline_archive_path(tar, container_path, host_path, host_root) + + return str(tar_path), os.path.getsize(str(tar_path)) + + def _coerce_sync_request( + self, + request: SyncRequest | list[str], + metadata: Optional[dict] = None, + ) -> SyncRequest: + if isinstance(request, SyncRequest): + return request + return SyncRequest( + archive_paths=list(request), + metadata=dict(metadata or {}), + runtime=SyncRuntimePolicy(), + ) + + def _delete_uploaded_cid_best_effort( + self, + cid: str, + *, + cleanup_local_files: bool = False, + ) -> None: + try: + self.owner.r1fs.delete_file( + cid=cid, + unpin_remote=True, + cleanup_local_files=cleanup_local_files, + ) + except Exception as exc: # noqa: BLE001 - cleanup must not mask root failure + self.owner.P( + f"[sync] failed to clean up uploaded CID {cid}: {exc}", color="y" + ) + + def publish_snapshot( + self, + request: SyncRequest | list[str], + metadata: Optional[dict] = None, + ) -> bool: + """Full provider orchestration: archive → R1FS add → ChainStore hset → + history append → response.json → clear .invalid → delete .processing → + retire previous CID. + + Returns True on success, False on any failure (and writes + response.json/error + request.json.invalid for the app). + Always cleans up the archive tmp file. 
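+
+        On the success path the response.json written for the app carries the
+        fields assembled below; values here are purely illustrative::
+
+            {"status": "ok", "cid": "<r1fs-cid>", "version": 1715600000,
+             "published_timestamp": 1715600000.0,
+             "archive_paths": ["/app/data"], "archive_size_bytes": 1048576,
+             "chainstore_ack": true, "metadata": {"app_version": "1.2.3"}}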
+ """ + sync_request = self._coerce_sync_request(request, metadata) + archive_paths = sync_request.archive_paths + runtime_payload = runtime_policy_to_dict(sync_request.runtime) + request_body = { + "archive_paths": list(archive_paths), + "metadata": dict(sync_request.metadata), + "runtime": runtime_payload, + } + control_file = self._request_control_file() + vsd = volume_sync_dir(self.owner) + proc_path = control_file.processing_path + tar_path: Optional[str] = None + try: + # ---- Stage: archive_build + try: + tar_path, size_bytes = self.make_archive( + archive_paths, + provider_capture=sync_request.runtime.provider_capture, + ) + except Exception as exc: + self._fail_request(request_body, STAGE_ARCHIVE_BUILD, str(exc), proc_path) + return False + + # ---- Stage: r1fs_upload + try: + cid = self.owner.r1fs.add_file(tar_path) + except Exception as exc: + self._fail_request(request_body, STAGE_R1FS_UPLOAD, str(exc), proc_path) + return False + if not cid: + self._fail_request( + request_body, STAGE_R1FS_UPLOAD, + "r1fs.add_file returned no CID", proc_path, + ) + return False + + # Build the manifest + record + version = int(self.owner.time()) + ts = self.owner.time() + node_id = getattr(self.owner, "ee_id", None) or getattr(self.owner, "node_id", None) + manifest = { + "schema_version": MANIFEST_SCHEMA_VERSION, + "archive_paths": list(archive_paths), + "archive_format": ARCHIVE_FORMAT, + "archive_size_bytes": size_bytes, + "encryption": ARCHIVE_ENCRYPTION, + "runtime": runtime_payload, + } + record = { + "cid": cid, + "version": version, + "timestamp": ts, + "node_id": node_id, + "metadata": dict(sync_request.metadata), + "runtime": runtime_payload, + "manifest": manifest, + } + + # ---- Stage: chainstore_publish + try: + ack = self.owner.chainstore_hset( + hkey=CHAINSTORE_SYNC_HKEY, + key=getattr(self.owner, "cfg_sync_key", None), + value=record, + ) + except Exception as exc: + self._delete_uploaded_cid_best_effort(cid) + self._fail_request( + request_body, STAGE_CHAINSTORE_PUBLISH, str(exc), proc_path + ) + return False + if not ack: + self._delete_uploaded_cid_best_effort(cid) + self._fail_request( + request_body, + STAGE_CHAINSTORE_PUBLISH, + "chainstore_hset returned false acknowledgement", + proc_path, + ) + return False + + # Persist history entry (pre-retirement so deletion update finds it). + entry = { + "cid": cid, + "version": version, + "published_timestamp": ts, + "request": dict(request_body), + "manifest": manifest, + "archive_size_bytes": size_bytes, + "chainstore_ack": bool(ack), + "node_id": node_id, + } + history_error = None + history_appended = False + try: + self.append_sent(entry) + history_appended = True + except Exception as exc: + history_error = str(exc) + self.owner.P( + f"[sync] ChainStore publish succeeded but sent-history append failed: {exc}", + color="r", + ) + + # Write success response and clean up control-plane artifacts. We + # include the app-supplied metadata so the in-volume-sync state file + # is self-contained — UIs that surface response.json (without access + # to host-side sync_history/) can show the metadata that travelled + # with this snapshot. 
+ response_payload = { + "status": "ok", + "cid": cid, + "version": version, + "published_timestamp": ts, + "archive_paths": list(archive_paths), + "archive_size_bytes": size_bytes, + "chainstore_ack": bool(ack), + "metadata": dict(sync_request.metadata), + } + if history_error is not None: + response_payload["history_error"] = history_error + try: + control_file.write_json(SYNC_RESPONSE_FILE, response_payload) + except Exception as exc: + self.owner.P( + f"[sync] failed to write response.json: {exc}", color="r" + ) + + invalid_path = vsd / SYNC_INVALID_FILE + if os.path.lexists(str(invalid_path)): + try: + os.unlink(str(invalid_path)) + except OSError: + pass + if os.path.lexists(str(proc_path)): + try: + control_file.discard_processing() + except OSError as exc: + self.owner.P( + f"[sync] failed to delete .processing after success: {exc}", color="y" + ) + + # Retire prior CID only when the new sent-history entry exists. Without + # that entry there is no durable local record for deletion bookkeeping. + if history_appended: + self._retire_previous_cid(history_sent_dir(self.owner)) + return True + finally: + if tar_path: + try: + os.unlink(tar_path) + except OSError: + pass + + # ----- consumer -------------------------------------------------------- + def fetch_latest(self) -> Optional[dict]: + """Refresh the local CHAINSTORE_SYNC replica (gated by HSYNC_POLL_INTERVAL), + then read the configured KEY. + + The ``hsync`` is the expensive bit — a network round-trip to the chain + cluster with a timeout. It fires at most every + ``SYNC.HSYNC_POLL_INTERVAL`` seconds (default 60s, min 10s). The cheap + local-replica ``hget`` runs on every call regardless, so a consumer that + already has the record cached keeps reading it without paying the + network cost. + + On ``hsync`` failure we retry sooner than the full success interval + (default 30s) to avoid leaving consumers stale for a whole cadence while + still avoiding a network attempt on every sync tick. + """ + sync_key = getattr(self.owner, "cfg_sync_key", None) + if not sync_key: + return None + + interval = getattr( + self.owner, "cfg_sync_hsync_poll_interval", self._DEFAULT_HSYNC_POLL_INTERVAL, + ) + now = self.owner.time() + if now - self._last_hsync >= interval: + # Always log the hsync attempt result (success or failure) — this is + # the only sync mixin log that fires on the happy path, so it doubles + # as the heartbeat that confirms the consumer is actually ticking and + # the rate-limit gating is working. Quiet enough at one log per + # HSYNC_POLL_INTERVAL window (default once per minute) to stay on in + # prod logs. + hsync_start = _time.monotonic() + try: + self.owner.chainstore_hsync(hkey=CHAINSTORE_SYNC_HKEY) + self._last_hsync = now + elapsed = _time.monotonic() - hsync_start + self.owner.P(f"[sync] chainstore_hsync ok ({elapsed:.2f}s)", color="g") + except Exception as exc: + retry_after = min(self._DEFAULT_HSYNC_FAILURE_RETRY_INTERVAL, interval) + self._last_hsync = now - max(0.0, interval - retry_after) + elapsed = _time.monotonic() - hsync_start + self.owner.P( + f"[sync] chainstore_hsync error after {elapsed:.2f}s " + f"(retry in {retry_after:.0f}s): {exc}", + color="y", + ) + + try: + return self.owner.chainstore_hget( + hkey=CHAINSTORE_SYNC_HKEY, key=sync_key + ) + except Exception as exc: + self.owner.P(f"[sync] chainstore_hget error: {exc}", color="r") + return None + + def validate_manifest(self, record: dict) -> list[str]: + """Return list of human-readable rejection reasons for ``record``. 
+ + Empty list means the manifest is acceptable: schema_version and + archive_format are recognised AND the consumer's ``self.volumes`` covers + every container path with a (fixed-size) mount. A non-empty list means + apply must be skipped without touching the filesystem. + + Reasons are surfaced for: + - missing/wrong ``schema_version`` (must be an int <= MANIFEST_SCHEMA_VERSION) + - unexpected ``archive_format`` (must equal ARCHIVE_FORMAT) + - unexpected ``encryption`` (must equal ARCHIVE_ENCRYPTION) + - ``archive_paths`` entries that don't map to a mount on this consumer + + Format/schema checks come first so they short-circuit before we burn + cycles resolving paths against a manifest we can't read anyway. + """ + if not isinstance(record, dict): + return ["manifest record is not a dict"] + manifest = record.get("manifest") or {} + if not isinstance(manifest, dict): + return ["manifest must be a JSON object"] + reasons: list[str] = [] + + sv = manifest.get("schema_version") + if not isinstance(sv, int): + reasons.append( + f"unsupported schema_version: {sv!r} (expected int, max supported: {MANIFEST_SCHEMA_VERSION})" + ) + elif sv > MANIFEST_SCHEMA_VERSION: + reasons.append( + f"unsupported schema_version: {sv} (max supported by this CAR: {MANIFEST_SCHEMA_VERSION})" + ) + + fmt = manifest.get("archive_format") + if fmt != ARCHIVE_FORMAT: + reasons.append( + f"unsupported archive_format: {fmt!r} (expected: {ARCHIVE_FORMAT!r})" + ) + + enc = manifest.get("encryption") + if enc != ARCHIVE_ENCRYPTION: + reasons.append( + f"unsupported encryption: {enc!r} (expected: {ARCHIVE_ENCRYPTION!r})" + ) + + raw_paths = manifest.get("archive_paths") + paths: list[str] = [] + if not isinstance(raw_paths, list) or not raw_paths: + reasons.append( + "archive_paths must be a non-empty list of container-absolute paths" + ) + else: + invalid_paths = [ + entry for entry in raw_paths + if not isinstance(entry, str) or not entry + ] + if invalid_paths: + reasons.append(f"invalid archive_paths entries: {invalid_paths!r}") + paths = [entry for entry in raw_paths if isinstance(entry, str) and entry] + missing: list[str] = [] + for entry in paths: + try: + self.resolve_container_path(entry) + except ValueError: + missing.append(entry) + if missing: + reasons.append(f"unmapped archive_paths on this consumer: {missing}") + return reasons + + def validate_record_for_apply(self, record: dict) -> list[str]: + """Validate the full ChainStore record before disrupting a consumer. + + This covers the record envelope plus the manifest. ``validate_manifest`` is + kept as the manifest-focused helper used by older tests and callers. + """ + if not isinstance(record, dict): + return ["sync record is not a dict"] + reasons: list[str] = [] + cid = record.get("cid") + if not isinstance(cid, str) or not cid: + reasons.append("record cid must be a non-empty string") + version = record.get("version") + if not isinstance(version, int): + reasons.append("record version must be an int") + reasons.extend(self.validate_manifest(record)) + return reasons + + @staticmethod + def _is_within_real_root(path: str, root: str) -> bool: + root_real = os.path.realpath(root) + path_real = os.path.realpath(path) + return path_real == root_real or path_real.startswith(root_real + os.sep) + + def _validate_extract_target_within_root( + self, + host_path: str, + host_root: str, + container_name: str, + ) -> None: + """Reject extraction targets that would resolve outside their volume. 
+ + ``resolve_container_path`` already proves the normalized string path sits + under the selected host root. This second check follows symlinks in the + target and parent path so a pre-existing symlink inside the mounted volume + cannot redirect extraction outside that volume. + """ + candidates = [host_path] + if os.path.normpath(host_path) != os.path.normpath(host_root): + candidates.append(os.path.dirname(host_path) or host_root) + for candidate in candidates: + if not self._is_within_real_root(candidate, host_root): + raise ValueError( + f"tar member target escapes volume root: {container_name!r} -> {host_path!r}" + ) + + @staticmethod + def _volume_owner(host_root: str) -> tuple[int, int]: + st = os.stat(host_root) + return st.st_uid, st.st_gid + + @staticmethod + def _chown_if_needed(path: str, uid: int, gid: int) -> None: + st = os.lstat(path) + if st.st_uid != uid or st.st_gid != gid: + os.chown(path, uid, gid) + + @staticmethod + def _directory_metadata(path: str) -> DirectoryMetadata: + st = os.lstat(path) + if stat.S_ISLNK(st.st_mode) or not stat.S_ISDIR(st.st_mode): + raise ValueError(f"directory path is not a real directory: {path!r}") + return DirectoryMetadata( + uid=st.st_uid, + gid=st.st_gid, + mode=stat.S_IMODE(st.st_mode), + ) + + def _record_directory_metadata( + self, + path: str, + ops: Optional[list[ApplyRollbackOp]], + tracked_dirs: Optional[set[str]], + ) -> None: + if ops is None: + return + path_n = os.path.normpath(path) + if tracked_dirs is not None and path_n in tracked_dirs: + return + metadata = self._directory_metadata(path_n) + ops.append(ApplyRollbackOp("restore_dir_meta", path_n, metadata=metadata)) + if tracked_dirs is not None: + tracked_dirs.add(path_n) + + def _ensure_directory_tree_owner( + self, + path: str, + host_root: str, + uid: int, + gid: int, + ops: Optional[list[ApplyRollbackOp]] = None, + tracked_dirs: Optional[set[str]] = None, + ) -> None: + host_root_n = os.path.normpath(host_root) + path_n = os.path.normpath(path) + if not self._is_within_root(path_n, host_root_n): + raise ValueError(f"directory path escapes volume root: {path_n!r}") + if path_n == host_root_n: + return + + rel = os.path.relpath(path_n, host_root_n) + current = host_root_n + for part in rel.split(os.sep): + if not part or part == ".": + continue + if part == "..": + raise ValueError(f"directory path escapes volume root: {path_n!r}") + current = os.path.join(current, part) + try: + st = os.lstat(current) + except FileNotFoundError: + os.mkdir(current) + if ops is not None: + ops.append(ApplyRollbackOp("remove_dir", current)) + if tracked_dirs is not None: + tracked_dirs.add(os.path.normpath(current)) + st = os.lstat(current) + + if stat.S_ISLNK(st.st_mode) or not stat.S_ISDIR(st.st_mode): + raise ValueError(f"directory path is not a real directory: {current!r}") + + self._record_directory_metadata(current, ops, tracked_dirs) + self._chown_if_needed(current, uid, gid) + os.chmod(current, 0o755) + + @staticmethod + def _container_path_in_declared_archive_paths( + container_name: str, + archive_paths: list[str], + ) -> bool: + candidate = os.path.normpath(container_name) + if not candidate.startswith("/"): + candidate = "/" + candidate + for entry in archive_paths: + if not isinstance(entry, str) or not entry: + continue + declared = os.path.normpath(entry) + if not declared.startswith("/"): + declared = "/" + declared + if candidate == declared or candidate.startswith(declared.rstrip("/") + "/"): + return True + return False + + def extract_archive( + self, + tar_path: 
str, + allowed_archive_paths: Optional[list[str]] = None, + ) -> list[str]: + """Reverse-map tar member container paths to host paths and extract. + + Two-pass: first pass validates every member by feeding its name through + ``resolve_container_path`` (so the entire extract aborts before any + write if the consumer's volume layout doesn't cover all members). + Symlinks/hardlinks are skipped with a warning — never extracted, since + a malicious tar could otherwise create a link that subsequent regular + members would write through. Each regular file is written via tmp + + ``os.replace`` so a mid-flight crash never leaves a half-written file. + If ``allowed_archive_paths`` is provided, every extracted member must also + sit under at least one manifest-declared archive path. Returns the list of + container paths that were applied (regular files + directories created). + """ + return self._extract_archive(tar_path, allowed_archive_paths) + + def _extract_archive( + self, + tar_path: str, + allowed_archive_paths: Optional[list[str]] = None, + ) -> list[str]: + extracted: list[str] = [] + with tarfile.open(str(tar_path), "r:gz") as tar: + members = tar.getmembers() + + # Pass 1: validate every member, build (member, host_path) pairs. + # Python's tarfile.add() strips leading '/' from arcnames as a POSIX + # safety default, so member names look like "app/data/foo.bin" even + # when we put them in as "/app/data/foo.bin". Normalize back to the + # container-absolute form before running through the resolver. + planned: list[tuple[tarfile.TarInfo, str, str, str]] = [] + for member in members: + if member.issym() or member.islnk(): + self.owner.P( + f"[sync] skipping link member in tar (security): {member.name}", + color="y", + ) + continue + if any(part == ".." for part in member.name.split("/")): + raise ValueError(f"tar member name contains '..': {member.name!r}") + container_name = member.name + if not container_name.startswith("/"): + container_name = "/" + container_name + if ( + allowed_archive_paths is not None + and not self._container_path_in_declared_archive_paths( + container_name, allowed_archive_paths + ) + ): + raise ValueError( + f"tar member outside manifest archive_paths: {container_name!r}" + ) + host_path, _bind, host_root = self.resolve_container_path(container_name) + self._validate_extract_target_within_root(host_path, host_root, container_name) + planned.append((member, host_path, container_name, host_root)) + + # Pass 2: actually extract. + for member, host_path, container_name, host_root in planned: + owner_uid, owner_gid = self._volume_owner(host_root) + if member.isdir(): + self._ensure_directory_tree_owner( + host_path, host_root, owner_uid, owner_gid + ) + self._validate_extract_target_within_root(host_path, host_root, container_name) + os.chmod(host_path, self._safe_extract_mode(member.mode, is_dir=True)) + extracted.append(container_name) + continue + if not member.isfile(): + continue + self._ensure_directory_tree_owner( + os.path.dirname(host_path), host_root, owner_uid, owner_gid + ) + self._validate_extract_target_within_root(host_path, host_root, container_name) + fobj = tar.extractfile(member) + if fobj is None: + continue + # Atomic per-file write: tmp in same directory, then os.replace. 
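+            # (The tmp file lives in the destination directory so os.replace
+            # stays within one filesystem and the swap remains atomic.)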
+ fd, tmp_name = tempfile.mkstemp( + dir=os.path.dirname(host_path), + prefix=f".{os.path.basename(host_path)}.", + suffix=".tmp", + ) + try: + with os.fdopen(fd, "wb") as out: + while True: + chunk = fobj.read(1024 * 1024) + if not chunk: + break + out.write(chunk) + self._chown_if_needed(tmp_name, owner_uid, owner_gid) + os.chmod(tmp_name, self._safe_extract_mode(member.mode, is_dir=False)) + os.replace(tmp_name, host_path) + except Exception: + try: + os.unlink(tmp_name) + except OSError: + pass + raise + extracted.append(container_name) + return extracted + + def _new_apply_staging_dir(self) -> Path: + root = sync_state_dir(self.owner) / "staging" + root.mkdir(parents=True, exist_ok=True) + return Path(tempfile.mkdtemp(prefix="apply.", dir=str(root))) + + @staticmethod + def _cleanup_tree(path: Optional[Path]) -> None: + if path is None: + return + try: + if path.is_dir(): + for child in sorted(path.rglob("*"), reverse=True): + if child.is_dir(): + child.rmdir() + else: + child.unlink() + path.rmdir() + elif path.exists(): + path.unlink() + except OSError: + pass + + def _stage_tar_member(self, fobj, staging_dir: Path, index: int) -> Path: + staging_path = staging_dir / f"{index:06d}.blob" + with staging_path.open("wb") as out: + while True: + chunk = fobj.read(1024 * 1024) + if not chunk: + break + out.write(chunk) + out.flush() + os.fsync(out.fileno()) + return staging_path + + def prepare_apply(self, record: dict) -> Optional[PreparedApply]: + """Validate and stage a consumer snapshot before stopping the app.""" + rejection_reasons = self.validate_record_for_apply(record) + if rejection_reasons: + cid = record.get("cid") if isinstance(record, dict) else None + version = record.get("version") if isinstance(record, dict) else None + error = "; ".join(rejection_reasons) + self.owner.P(f"[sync] cannot prepare v{version} (cid={cid}): {error}", color="r") + if isinstance(record, dict) and record.get("cid"): + self._record_preflight_failure(record, STAGE_VALIDATION, error) + return None + + cid = record["cid"] + version = record["version"] + try: + self._write_apply_state("preparing", record) + except Exception as exc: + self.owner.P(f"[sync] failed to write apply preparing state: {exc}", color="r") + + try: + local_path = self.owner.r1fs.get_file(cid) + except Exception as exc: + self.owner.P(f"[sync] r1fs.get_file({cid}) failed: {exc}", color="r") + self._record_preflight_failure(record, "r1fs_download", str(exc)) + return None + if not local_path: + error = f"r1fs.get_file({cid}) returned no path" + self.owner.P(f"[sync] {error}", color="r") + self._record_preflight_failure(record, "r1fs_download", error) + return None + + staging_dir: Optional[Path] = None + try: + staging_dir = self._new_apply_staging_dir() + manifest = record.get("manifest") or {} + allowed_archive_paths = manifest.get("archive_paths") or [] + planned: list[PlannedApplyMember] = [] + with tarfile.open(str(local_path), "r:gz") as tar: + for index, member in enumerate(tar.getmembers()): + if member.issym() or member.islnk(): + self.owner.P( + f"[sync] skipping link member in tar (security): {member.name}", + color="y", + ) + continue + if any(part == ".." 
for part in member.name.split("/")): + raise ValueError(f"tar member name contains '..': {member.name!r}") + container_name = member.name + if not container_name.startswith("/"): + container_name = "/" + container_name + if not self._container_path_in_declared_archive_paths( + container_name, allowed_archive_paths + ): + raise ValueError( + f"tar member outside manifest archive_paths: {container_name!r}" + ) + host_path, _bind, host_root = self.resolve_container_path(container_name) + self._validate_extract_target_within_root(host_path, host_root, container_name) + if member.isdir(): + planned.append(PlannedApplyMember( + container_name=container_name, + host_path=host_path, + host_root=host_root, + staging_path=None, + mode=member.mode, + is_dir=True, + )) + continue + if not member.isfile(): + continue + fobj = tar.extractfile(member) + if fobj is None: + continue + staging_path = self._stage_tar_member(fobj, staging_dir, index) + planned.append(PlannedApplyMember( + container_name=container_name, + host_path=host_path, + host_root=host_root, + staging_path=staging_path, + mode=member.mode, + is_dir=False, + )) + return PreparedApply( + record=dict(record), + cid=cid, + version=version, + local_path=str(local_path), + staging_dir=staging_dir, + members=planned, + manifest=dict(manifest), + ) + except Exception as exc: + self.owner.P(f"[sync] prepare_apply failed for cid={cid}: {exc}", color="r") + self._cleanup_tree(staging_dir) + self._record_preflight_failure(record, STAGE_EXTRACT, str(exc)) + return None + + @staticmethod + def _new_backup_path(host_path: str) -> str: + directory = os.path.dirname(host_path) + fd, backup_path = tempfile.mkstemp( + dir=directory, + prefix=f".{os.path.basename(host_path)}.syncbak.", + suffix=".bak", + ) + os.close(fd) + os.unlink(backup_path) + return backup_path + + @staticmethod + def _unlink_path(path: str) -> None: + try: + os.unlink(path) + except FileNotFoundError: + pass + + def _rollback_apply_ops(self, ops: list[ApplyRollbackOp]) -> bool: + ok = True + for op in reversed(ops): + try: + if op.op == "restore" and op.backup: + self._unlink_path(op.path) + os.replace(op.backup, op.path) + elif op.op == "remove_file": + self._unlink_path(op.path) + elif op.op == "remove_dir": + os.rmdir(op.path) + elif op.op == "restore_dir_meta" and op.metadata: + self._chown_if_needed(op.path, op.metadata.uid, op.metadata.gid) + os.chmod(op.path, op.metadata.mode) + except OSError as exc: + ok = False + self.owner.P(f"[sync] rollback operation failed for {op.path}: {exc}", color="r") + return ok + + def _cleanup_backups(self, ops: list[ApplyRollbackOp]) -> None: + for op in ops: + if op.op == "restore" and op.backup: + try: + os.unlink(op.backup) + except OSError: + pass + + def commit_prepared_apply(self, prepared: PreparedApply) -> ApplyResult: + """Apply a prepared snapshot while the app container is stopped.""" + try: + self._write_apply_state("applying", prepared.record) + except Exception as exc: + error = f"could not write applying state: {exc}" + self.owner.P(f"[sync] {error}", color="r") + self._cleanup_tree(prepared.staging_dir) + return ApplyResult(False, True, "failed_preflight", [], error) + + ops: list[ApplyRollbackOp] = [] + tracked_dirs: set[str] = set() + extracted: list[str] = [] + try: + for planned in prepared.members: + owner_uid, owner_gid = self._volume_owner(planned.host_root) + self._validate_extract_target_within_root( + planned.host_path, planned.host_root, planned.container_name + ) + if planned.is_dir: + existed = 
os.path.isdir(planned.host_path) + self._ensure_directory_tree_owner( + planned.host_path, planned.host_root, owner_uid, owner_gid, + ops, tracked_dirs + ) + self._validate_extract_target_within_root( + planned.host_path, planned.host_root, planned.container_name + ) + self._record_directory_metadata(planned.host_path, ops, tracked_dirs) + os.chmod(planned.host_path, self._safe_extract_mode(planned.mode, is_dir=True)) + if not existed: + created_path = os.path.normpath(planned.host_path) + if not any( + op.op == "remove_dir" and os.path.normpath(op.path) == created_path + for op in ops + ): + ops.append(ApplyRollbackOp("remove_dir", planned.host_path)) + extracted.append(planned.container_name) + continue + + if planned.staging_path is None: + continue + parent = os.path.dirname(planned.host_path) + self._validate_extract_target_within_root( + parent, planned.host_root, planned.container_name + ) + self._ensure_directory_tree_owner( + parent, planned.host_root, owner_uid, owner_gid, ops, tracked_dirs + ) + self._validate_extract_target_within_root( + planned.host_path, planned.host_root, planned.container_name + ) + if os.path.isdir(planned.host_path): + raise ValueError(f"cannot replace directory with file: {planned.container_name!r}") + + backup_path = None + if os.path.lexists(planned.host_path): + backup_path = self._new_backup_path(planned.host_path) + os.replace(planned.host_path, backup_path) + ops.append(ApplyRollbackOp("restore", planned.host_path, backup_path)) + else: + ops.append(ApplyRollbackOp("remove_file", planned.host_path)) + + fd, tmp_name = tempfile.mkstemp( + dir=parent, + prefix=f".{os.path.basename(planned.host_path)}.", + suffix=".tmp", + ) + try: + with os.fdopen(fd, "wb") as out, planned.staging_path.open("rb") as src: + while True: + chunk = src.read(1024 * 1024) + if not chunk: + break + out.write(chunk) + out.flush() + os.fsync(out.fileno()) + self._chown_if_needed(tmp_name, owner_uid, owner_gid) + os.chmod(tmp_name, self._safe_extract_mode(planned.mode, is_dir=False)) + os.replace(tmp_name, planned.host_path) + except Exception: + try: + os.unlink(tmp_name) + except OSError: + pass + raise + extracted.append(planned.container_name) + + try: + self._write_apply_state( + "applied", + prepared.record, + extracted_paths=list(extracted), + ) + except Exception as exc: + raise RuntimeError(f"could not write applied state: {exc}") from exc + + self._cleanup_backups(ops) + self._cleanup_tree(prepared.staging_dir) + return ApplyResult(True, True, "applied", extracted) + except Exception as exc: + rollback_ok = self._rollback_apply_ops(ops) + state = "failed_rolled_back" if rollback_ok else "uncertain" + try: + self._write_apply_state( + state, + prepared.record, + error=str(exc), + extracted_paths=list(extracted), + ) + except Exception as state_exc: + self.owner.P(f"[sync] failed to write apply failure state: {state_exc}", color="r") + self._cleanup_tree(prepared.staging_dir) + self.owner.P(f"[sync] commit_prepared_apply failed: {exc}", color="r") + return ApplyResult(False, rollback_ok, state, extracted, str(exc)) + + def _finalize_apply_success( + self, + record: dict, + extracted: list[str], + ) -> None: + cid = record["cid"] + version = record["version"] + applied_ts = self.owner.time() + entry = { + "cid": cid, + "version": version, + "source_timestamp": record.get("timestamp"), + "applied_timestamp": applied_ts, + "node_id": record.get("node_id"), + "metadata": record.get("metadata") or {}, + "manifest": record.get("manifest") or {}, + "extracted_paths": 
extracted, + } + history_appended = False + try: + self.append_received(entry) + history_appended = True + except Exception as exc: + self.owner.P( + f"[sync] apply succeeded but received-history append failed: {exc}", + color="r", + ) + + last_apply = { + "cid": cid, + "version": version, + "source_timestamp": record.get("timestamp"), + "applied_timestamp": applied_ts, + "node_id": record.get("node_id"), + "metadata": record.get("metadata") or {}, + } + try: + self._write_json_atomic( + volume_sync_dir(self.owner) / SYNC_LAST_APPLY_FILE, last_apply + ) + except Exception as exc: + self.owner.P(f"[sync] failed to write last_apply.json: {exc}", color="r") + + if history_appended: + self._retire_previous_cid( + history_received_dir(self.owner), + cleanup_local_files=True, + unpin_remote=False, + ) + + def apply_snapshot(self, record: dict) -> bool: + """Full consumer orchestration for callers that already stopped the app. + + ``_SyncMixin`` uses ``prepare_apply`` before stopping the container and + ``commit_prepared_apply`` after stopping it. This wrapper keeps older tests + and direct callers on the same transaction/state path. + """ + prepared = self.prepare_apply(record) + if prepared is None: + return False + result = self.commit_prepared_apply(prepared) + if not result.success: + return False + self._finalize_apply_success(record, result.extracted_paths) + return True + + # ----- retirement ------------------------------------------------------ + def _retire_previous_cid( + self, + history_dir: Path, + cleanup_local_files: bool = False, + unpin_remote: bool = True, + ) -> None: + """Delete the prior R1FS CID after a successful new operation. + + Only the immediately-prior un-retired entry is touched per call. Updates + that entry's ``deletion`` sub-record. Never raises — deletion failures + must not roll back the new publish/apply. + """ + # Sort by append-time marker, not filename. Filenames embed the version + # prefix for chronological browsability under monotonic clocks, but the + # question "what did we just publish/apply?" is answered by insert order. + # Sorting by name here would retire the highest-*version* entry instead + # of the most-recently-appended one. 
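+    # Each tuple yielded by _read_history_entries carries the parsed entry
+    # dict at index 1 and the append-order marker at index 2, which is why
+    # the sort key below is item[2] rather than the filename.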
+ entries = sorted( + self._read_history_entries(history_dir), + key=lambda item: item[2], + ) + if len(entries) < 2: + return # nothing to retire yet + latest = entries[-1][1] + latest_cid = latest.get("cid") + target_entry: Optional[dict] = None + for _, entry, _ in reversed(entries[:-1]): + if entry.get("cid") == latest_cid: + continue # same content -- nothing to retire + if (entry.get("deletion") or {}).get("deleted_at") is not None: + continue # already retired + target_entry = entry + break + + if target_entry is None: + return + target_cid = target_entry.get("cid") + if not target_cid: + return + + succeeded = False + error: Optional[str] = None + try: + self.owner.r1fs.delete_file( + cid=target_cid, + unpin_remote=unpin_remote, + cleanup_local_files=cleanup_local_files, + ) + succeeded = True + except Exception as exc: # noqa: BLE001 — never raise + error = str(exc) + self.owner.P( + f"[sync] failed to retire CID {target_cid}: {exc}", color="y" + ) + + self.update_history_deletion(history_dir, target_entry, succeeded, error) diff --git a/extensions/business/container_apps/sync/mixin.py b/extensions/business/container_apps/sync/mixin.py new file mode 100644 index 00000000..e034ed75 --- /dev/null +++ b/extensions/business/container_apps/sync/mixin.py @@ -0,0 +1,595 @@ +"""Mixin: volume-sync provider/consumer integration for CAR. + +Bridges :class:`SyncManager` into ``ContainerAppRunnerPlugin``'s lifecycle: + + * always-on: provisions the 10M ``/r1en_system`` system volume (a fixed-size + loopback identical in machinery to ``FIXED_SIZE_VOLUMES``) and exports + ``R1_*`` env vars to the container + * provider role: per ``cfg_sync_poll_interval`` polls for a pending + ``request.json``, then drives runtime stop → publish_snapshot → + start_container inline (must NOT route through ``_restart_container``, + which calls ``_cleanup_fixed_size_volumes`` and unmounts the loopback + before we can read from it) + * consumer role: same cadence polls ChainStore for a different ``cid``, then + drives runtime stop → apply_snapshot → start_container inline. + First boot starts on an empty volume; the next tick picks up whatever + snapshot is in ChainStore. Apps that strictly require state at startup + must implement their own poll-and-retry in their entrypoint. + * recovery: any orphan ``request.json.processing`` left behind by a prior + crash is renamed back to ``request.json`` on plugin init so the next + provider tick retries cleanly + +See ``extensions/business/container_apps/README.md`` for the public +operator/app contract. 
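+
+A minimal instance config enabling the provider role (keys are the ``SYNC``
+block this mixin reads via ``cfg_sync``; values are illustrative only)::
+
+    "SYNC": {
+        "ENABLED": true,
+        "TYPE": "provider",
+        "KEY": "11111111-1111-1111-1111-111111111111",
+        "POLL_INTERVAL": 10
+    }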
+""" +from __future__ import annotations + +import os +import stat +from pathlib import Path +from typing import Optional + +from extensions.business.container_apps import fixed_volume + +from .control_files import JsonControlFile, JsonControlFileUnsafeError +from .constants import ( + SYNC_PROCESSING_FILE, + SYNC_REQUEST_FILE, + STAGE_RUNTIME_STOP, + SYSTEM_VOLUME_FS, + SYSTEM_VOLUME_MOUNT, + SYSTEM_VOLUME_NAME, + SYSTEM_VOLUME_SIZE, + VOLUME_SYNC_SUBDIR, +) +from .manager import ( + CONSUMER_APPLY_OFFLINE_RESTART, + CONSUMER_APPLY_ONLINE_NO_RESTART, + CONSUMER_APPLY_ONLINE_RESTART, + SyncManager, + history_received_dir, + runtime_policy_to_dict, + system_volume_host_root, + volume_sync_dir, +) + + +class _SyncMixin: + """ + Required attributes on the composing plugin: + - self.P, self.time, self.cfg_* (BasePlugin) + - self.get_data_folder, _get_instance_data_subfolder + - self.volumes (dict, populated by CAR) + - self._fixed_volumes (list, populated by _FixedSizeVolumesMixin) + - self.env (dict, container env) + - self.r1fs, self.chainstore_hset/hget/hsync (BasePlugin API) + - self._stop_container_runtime_for_restart(), + self.start_container() (CAR lifecycle) + - self.cfg_sync (CAR config block) + - self.ee_id (BasePlugin identity) + """ + + # ----- system volume provisioning -------------------------------------- + + def _configure_system_volume(self): + """Provision the always-on /r1en_system fixed-size loopback. + + Idempotent across plugin restarts: ``fixed_volume.provision`` reuses an + existing image/loop/mount when available. Adds the bind spec to + ``self.volumes`` so ``start_container`` mounts ``/r1en_system`` at the + correct host path. The ``volume-sync/`` subdir is created post-mount so + SyncManager always has a place to write request/response files. + """ + try: + fixed_volume._require_tools(logger=self.P) + except RuntimeError as exc: + # Without the host tools we cannot provision /r1en_system, which means + # there is no shared filesystem for the app to drop request.json into + # and no host root for CAR to poll. Mark sync as unavailable so + # _sync_enabled() returns False (skipping all provider/consumer ticks) + # and _inject_sync_env_vars() refuses to advertise R1_SYSTEM_VOLUME + # to the container — otherwise the app would write to a non-existent + # in-container mount while CAR polled a host root that was never + # provisioned. Codex review finding 5 on PR #399. + self.P( + f"[sync] system volume unavailable: {exc}. " + f"SYNC will be disabled and R1_SYSTEM_VOLUME env vars will not be " + f"exported until host tools are installed.", + color="r", + ) + self._sync_unavailable = True + return + + root = ( + Path(self.get_data_folder()) + / self._get_instance_data_subfolder() + / "fixed_volumes" + ) + + # NOTE: deliberately do NOT call fixed_volume.cleanup_stale_mounts here. + # _FixedSizeVolumesMixin._configure_fixed_size_volumes() runs BEFORE us + # in on_init / _restart_container and already scans meta/ for the whole + # root. Calling it again from here would unmount any FIXED_SIZE_VOLUMES + # entries that the previous step just provisioned (because their meta/ + # files exist) and then we'd never re-mount them — the data volume + # would land empty in the container. 
+ + vol = fixed_volume.FixedVolume( + name=SYSTEM_VOLUME_NAME, + size=SYSTEM_VOLUME_SIZE, + root=root, + fs_type=SYSTEM_VOLUME_FS, + owner_uid=None, + owner_gid=None, + ) + try: + fixed_volume.provision(vol, force_recreate=False, logger=self.P) + except Exception as exc: + # Tool presence alone is not enough: hosts can still lack usable loop + # devices, mount privileges, or filesystem support. Container execution + # should continue without advertising the sync volume in that case. + self.P( + f"[sync] system volume unavailable: could not provision " + f"{SYSTEM_VOLUME_NAME}: {exc}. SYNC will be disabled and " + f"R1_SYSTEM_VOLUME env vars will not be exported.", + color="r", + ) + self._sync_unavailable = True + return + + self._sync_unavailable = False + + # Track for shared cleanup (parity with FIXED_SIZE_VOLUMES). + if not hasattr(self, "_fixed_volumes"): + self._fixed_volumes = [] + self._fixed_volumes.append(vol) + + # Ensure the mount root itself stays root-owned/non-app-writable so the + # app cannot replace volume-sync/ with a symlink after startup. This system + # volume is CAR/app control plane, not app data, so it deliberately ignores + # the image USER ownership used for FIXED_SIZE_VOLUMES. + try: + os.chown(str(vol.mount_path), 0, 0) + os.chmod(str(vol.mount_path), 0o755) + except OSError as exc: + self.P( + f"[sync] could not enforce root-owned {vol.mount_path} mode 0o755: " + f"{exc}. SYNC will be disabled and R1_SYSTEM_VOLUME env vars will not " + f"be exported.", + color="r", + ) + self._sync_unavailable = True + return + + # Ensure volume-sync subdir exists before container start so the app + # can drop a request.json on its first tick. If a previous run left a + # symlink or non-directory here while /r1en_system was writable, remove it + # and recreate a real directory before exposing env vars to the container. + # Mode 1777 keeps it app-writable while preventing non-owners from + # deleting CAR-owned response/last_apply temp files. + vsd = volume_sync_dir(self) + try: + try: + st = os.lstat(str(vsd)) + except FileNotFoundError: + st = None + if st is not None and ( + stat.S_ISLNK(st.st_mode) or not stat.S_ISDIR(st.st_mode) + ): + os.unlink(str(vsd)) + os.makedirs(str(vsd), exist_ok=True) + st = os.lstat(str(vsd)) + if stat.S_ISLNK(st.st_mode) or not stat.S_ISDIR(st.st_mode): + raise RuntimeError(f"{vsd} is not a real directory") + except Exception as exc: + self.P( + f"[sync] volume-sync directory unsafe/unavailable: {exc}. " + f"SYNC will be disabled and R1_SYSTEM_VOLUME env vars will not be exported.", + color="r", + ) + self._sync_unavailable = True + return + try: + os.chown(str(vsd), 0, 0) + os.chmod(str(vsd), 0o1777) + except OSError as exc: + self.P( + f"[sync] could not enforce root-owned {vsd} mode 0o1777: {exc}. " + f"SYNC will be disabled and R1_SYSTEM_VOLUME env vars will not be exported.", + color="r", + ) + self._sync_unavailable = True + return + self.volumes.update(fixed_volume.docker_bind_spec(vol, SYSTEM_VOLUME_MOUNT)) + self.P( + f"[sync] system volume ready: {vol.mount_path} -> {SYSTEM_VOLUME_MOUNT} " + f"(volume-sync at {vsd})", + color="g", + ) + + # ----- env-var injection ----------------------------------------------- + + def _inject_sync_env_vars(self): + """Add the ``R1_*`` env vars to the container's environment. + + ``R1_SYSTEM_VOLUME`` / ``R1_VOLUME_SYNC_DIR`` / ``R1_SYNC_REQUEST_FILE`` + are always set so apps can write the request unconditionally; CAR just + won't act on it without ``SYNC.ENABLED``. 
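+
+    For a sync-enabled provider instance the container would typically see
+    something like (paths assume the default ``SYSTEM_VOLUME_MOUNT`` and
+    ``VOLUME_SYNC_SUBDIR`` constants; the key is illustrative)::
+
+        R1_SYSTEM_VOLUME=/r1en_system
+        R1_VOLUME_SYNC_DIR=/r1en_system/volume-sync
+        R1_SYNC_REQUEST_FILE=/r1en_system/volume-sync/request.json
+        R1_SYNC_TYPE=provider
+        R1_SYNC_KEY=11111111-1111-1111-1111-111111111111
+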
``R1_SYNC_TYPE`` and + ``R1_SYNC_KEY`` are only set when SYNC is enabled so apps that want to + branch on role can. + + If ``_sync_unavailable`` was set during ``_configure_system_volume`` + (host tools missing), inject nothing — advertising a mount that was + never provisioned would route the app's writes into a phantom path. + """ + if not isinstance(getattr(self, "env", None), dict): + return + if getattr(self, "_sync_unavailable", False): + return + self.env["R1_SYSTEM_VOLUME"] = SYSTEM_VOLUME_MOUNT + self.env["R1_VOLUME_SYNC_DIR"] = f"{SYSTEM_VOLUME_MOUNT}/{VOLUME_SYNC_SUBDIR}" + self.env["R1_SYNC_REQUEST_FILE"] = ( + f"{SYSTEM_VOLUME_MOUNT}/{VOLUME_SYNC_SUBDIR}/{SYNC_REQUEST_FILE}" + ) + if self._sync_enabled(): + sync_type = self.cfg_sync.get("TYPE") + sync_key = self.cfg_sync.get("KEY") + if sync_type: + self.env["R1_SYNC_TYPE"] = str(sync_type) + if sync_key: + self.env["R1_SYNC_KEY"] = str(sync_key) + + # ----- config helpers -------------------------------------------------- + + def _sync_cfg(self) -> dict: + cfg = getattr(self, "cfg_sync", None) or {} + return cfg if isinstance(cfg, dict) else {} + + def _sync_enabled(self) -> bool: + if getattr(self, "_sync_unavailable", False): + return False + return bool(self._sync_cfg().get("ENABLED")) + + def _sync_role(self) -> Optional[str]: + role = self._sync_cfg().get("TYPE") + if role in ("provider", "consumer"): + return role + return None + + def _sync_poll_interval(self) -> float: + raw = self._sync_cfg().get("POLL_INTERVAL", 10) + try: + return max(1.0, float(raw)) + except (TypeError, ValueError): + return 10.0 + + # ----- hsync interval (consumer only) ---------------------------------- + # Decoupled from POLL_INTERVAL: every consumer tick still does the cheap + # chainstore_hget against the local replica, but the expensive network + # hsync is gated by this interval. Provider does not call hsync. + _HSYNC_POLL_INTERVAL_MIN = 10.0 + _HSYNC_POLL_INTERVAL_DEFAULT = 60.0 + + def _hsync_poll_interval(self) -> float: + """Seconds between chainstore_hsync refreshes on the consumer side. + + Min 10s, default 60s. Non-numeric values fall back to the default; + values below the min are clamped up. ``fetch_latest`` still reads the + cheap local replica every tick; this only gates network hsync. + """ + raw = self._sync_cfg().get("HSYNC_POLL_INTERVAL", self._HSYNC_POLL_INTERVAL_DEFAULT) + try: + v = float(raw) + except (TypeError, ValueError): + return self._HSYNC_POLL_INTERVAL_DEFAULT + return max(self._HSYNC_POLL_INTERVAL_MIN, v) + + # convenience for SyncManager (it reads owner.cfg_sync_key) + @property + def cfg_sync_key(self): + return self._sync_cfg().get("KEY") + + @property + def cfg_sync_type(self): + return self._sync_cfg().get("TYPE") + + @property + def cfg_sync_hsync_poll_interval(self) -> float: + """Mirror of ``_hsync_poll_interval()`` accessible by ``SyncManager`` + via ``owner.cfg_sync_hsync_poll_interval`` (same convention as + ``cfg_sync_key`` / ``cfg_sync_type``). 
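+
+    Worked example of the clamping in ``_hsync_poll_interval``:
+    ``"HSYNC_POLL_INTERVAL": 30`` yields 30.0, ``3`` is clamped up to 10.0,
+    and a non-numeric value such as ``"soon"`` falls back to 60.0.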
+ """ + return self._hsync_poll_interval() + + @property + def cfg_sync_allow_online_provider_capture(self) -> bool: + """Provider-local opt-in for Docker archive capture from live containers.""" + raw = self._sync_cfg().get("ALLOW_ONLINE_PROVIDER_CAPTURE", False) + if isinstance(raw, bool): + return raw + if isinstance(raw, str): + value = raw.strip().lower() + if value in ("1", "true", "yes", "on"): + return True + if value in ("0", "false", "no", "off", ""): + return False + if isinstance(raw, int) and raw in (0, 1): + return bool(raw) + self.P( + f"[sync] invalid ALLOW_ONLINE_PROVIDER_CAPTURE value {raw!r}; using False", + color="y", + ) + return False + + # ----- manager handle --------------------------------------------------- + + def _ensure_sync_manager(self) -> Optional[SyncManager]: + """Lazy-init the SyncManager. Returns None if SYNC is not enabled.""" + if not self._sync_enabled(): + return None + sm = getattr(self, "_sync_manager", None) + if sm is None: + sm = SyncManager(self) + self._sync_manager = sm + return sm + + # ----- recovery on plugin init ----------------------------------------- + + def _recover_stale_processing(self): + """Rename any orphan request.json.processing back to request.json. + + Called from the plugin's on_init so a crash mid-publish doesn't leave + a request stuck. The next provider tick will then re-claim it. + """ + control_file = JsonControlFile( + volume_sync_dir(self), SYNC_REQUEST_FILE, SYNC_PROCESSING_FILE + ) + proc = control_file.processing_path + req = control_file.pending_path + try: + recovered = control_file.recover_stale_processing() + except (OSError, JsonControlFileUnsafeError) as exc: + self.P( + f"[sync] failed to recover orphan .processing: {exc}", color="r" + ) + return + if recovered: + self.P( + f"[sync] recovered orphan {proc.name} -> {req.name} for retry", + color="y", + ) + + # ----- provider tick --------------------------------------------------- + + def _sync_provider_tick(self, current_time: float) -> None: + """If a pending request.json exists, run the full publish flow. + + Drives runtime stop → publish_snapshot → start_container inline. + Always returns ``None`` — must NOT use a StopReason because that would + route through ``_restart_container``, which unmounts the system volume + before we can read from it (see plan Step 1 verification). 
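+
+    A minimal ``request.json`` an app might drop to trigger a publish
+    (shape mirrors what ``claim_request`` parses; values are illustrative)::
+
+        {"archive_paths": ["/app/data/"], "metadata": {"note": "nightly"}}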
+ """ + sm = self._ensure_sync_manager() + if sm is None or self._sync_role() != "provider": + return + if not self._sync_should_tick(current_time): + return + + control_file = JsonControlFile( + volume_sync_dir(self), SYNC_REQUEST_FILE, SYNC_PROCESSING_FILE + ) + claimed = sm.claim_request() + if claimed is None: + # claim_request already wrote .invalid + response.json + return + self.P( + f"[sync] provider tick: claimed {control_file.processing_name} for publish", + color="b", + ) + + stopped_for_sync = claimed.runtime.provider_capture == "offline" + if stopped_for_sync: + stopped = self._stop_container_runtime_for_restart() + if not stopped: + request_body = { + "archive_paths": list(claimed.archive_paths), + "metadata": dict(claimed.metadata), + "runtime": runtime_policy_to_dict(claimed.runtime), + } + sm._fail_request( + request_body, + STAGE_RUNTIME_STOP, + "could not stop/remove container for offline provider capture", + control_file.processing_path, + ) + return + + try: + sm.publish_snapshot(claimed) + except Exception as exc: + # SyncManager.publish_snapshot has internal try/except for every + # stage, but we still wrap to guarantee we always restart the + # container even if something truly unexpected escapes. + self.P(f"[sync] publish_snapshot raised unexpectedly: {exc}", color="r") + + if stopped_for_sync: + self._sync_safe_start_container() + + # ----- consumer tick --------------------------------------------------- + + def _sync_consumer_tick(self, current_time: float) -> None: + """If the ChainStore record points at a different CID than what we last + applied, fetch+extract+restart inline. Identity is the CID, not the + version: the CID is content-addressed and uniquely identifies the + bundle, while ``version`` is informational metadata only (kept for + filename ordering + human-readable logs). Comparing CIDs eliminates + a class of clock-skew failure modes (a provider's wonky timestamp + can never make a consumer permanently ignore a corrected snapshot) + and makes multi-provider sync sets coherent without ordering + assumptions. 
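+
+    Example: if ``last_apply.json`` records cid ``QmAAA`` and ChainStore now
+    advertises ``QmBBB``, the new bundle is applied even if its ``version``
+    is lower; if ChainStore still advertises ``QmAAA``, the tick is a no-op
+    regardless of version (CIDs here are illustrative).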
+ """ + sm = self._ensure_sync_manager() + if sm is None or self._sync_role() != "consumer": + return + if not self._sync_should_tick(current_time): + return + + record = sm.fetch_latest() + if not isinstance(record, dict): + return + record_cid = record.get("cid") + if not record_cid: + return + + latest_local = sm.latest_applied() + last_cid = (latest_local or {}).get("cid") if latest_local else None + if last_cid and record_cid == last_cid: + return # same bundle as the last apply — nothing to do + + quarantined = sm.quarantined_record(record) + if quarantined is not None: + self.P( + f"[sync] skipping quarantined consumer cid={record_cid} until " + f"{quarantined.get('next_retry_after')}: {quarantined.get('error')}", + color="y", + ) + return + + prepared = sm.prepare_apply(record) + if prepared is None: + return + + self.P( + f"[sync] consumer tick: applying cid={record_cid} " + f"(v{record.get('version')})", + color="b", + ) + + apply_mode = self._sync_consumer_apply_mode(record) + if apply_mode == CONSUMER_APPLY_OFFLINE_RESTART: + stopped = self._stop_container_runtime_for_restart() + if not stopped: + self.P( + f"[sync] aborting consumer apply for cid={record_cid}: " + "could not stop/remove container for offline apply", + color="r", + ) + sm._cleanup_tree(prepared.staging_dir) + return + + applied = False + restart_safe = True + try: + result = sm.commit_prepared_apply(prepared) + applied = bool(result.success) + restart_safe = bool(result.restart_safe) + if applied: + sm._finalize_apply_success(record, result.extracted_paths) + except Exception as exc: + restart_safe = False + self.P(f"[sync] commit_prepared_apply raised unexpectedly: {exc}", color="r") + + if apply_mode == CONSUMER_APPLY_OFFLINE_RESTART and restart_safe: + self._sync_safe_start_container() + elif apply_mode == CONSUMER_APPLY_OFFLINE_RESTART: + self.P( + f"[sync] leaving container stopped after uncertain apply for cid={record_cid}", + color="r", + ) + elif apply_mode == CONSUMER_APPLY_ONLINE_RESTART and applied: + stopped = self._stop_container_runtime_for_restart() + if stopped: + self._sync_safe_start_container() + else: + self.P( + f"[sync] post-apply restart failed to stop container for cid={record_cid}", + color="r", + ) + + # ----- internal helpers ------------------------------------------------ + + def _sync_consumer_apply_mode(self, record: Optional[dict] = None) -> str: + """Return the consumer-local lifecycle policy for snapshot apply. + + Provider-published records may carry the requester's desired + ``runtime.consumer_apply`` for audit/UI purposes, but lifecycle safety is + decided by the consumer node. A provider must not be able to force a + running consumer to hot-apply files. Online consumer modes are accepted for + compatibility but normalized to offline restart until extraction is + descriptor-safe against app-side path races. 
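+
+    For example, a local config of ``CONSUMER_APPLY_MODE =
+    CONSUMER_APPLY_ONLINE_RESTART`` is normalized to
+    ``CONSUMER_APPLY_OFFLINE_RESTART`` and the decision is recorded in
+    ``_sync_last_apply_mode_resolution`` with reason ``"online_apply_disabled"``.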
+ """ + mode = self._sync_cfg().get("CONSUMER_APPLY_MODE", CONSUMER_APPLY_OFFLINE_RESTART) + allowed = { + CONSUMER_APPLY_OFFLINE_RESTART, + CONSUMER_APPLY_ONLINE_NO_RESTART, + CONSUMER_APPLY_ONLINE_RESTART, + } + if mode not in allowed: + self.P( + f"[sync] unknown local CONSUMER_APPLY_MODE {mode!r}; using " + f"{CONSUMER_APPLY_OFFLINE_RESTART!r}", + color="y", + ) + self._sync_last_apply_mode_resolution = { + "requested_mode": mode, + "effective_mode": CONSUMER_APPLY_OFFLINE_RESTART, + "reason": "unknown_mode", + } + return CONSUMER_APPLY_OFFLINE_RESTART + if mode in {CONSUMER_APPLY_ONLINE_NO_RESTART, CONSUMER_APPLY_ONLINE_RESTART}: + self.P( + f"[sync] local CONSUMER_APPLY_MODE {mode!r} is currently disabled for " + f"filesystem safety; using {CONSUMER_APPLY_OFFLINE_RESTART!r}", + color="y", + ) + self._sync_last_apply_mode_resolution = { + "requested_mode": mode, + "effective_mode": CONSUMER_APPLY_OFFLINE_RESTART, + "reason": "online_apply_disabled", + } + return CONSUMER_APPLY_OFFLINE_RESTART + self._sync_last_apply_mode_resolution = { + "requested_mode": mode, + "effective_mode": mode, + "reason": None, + } + return mode + + def _sync_record_consumer_apply_mode(self, record: dict) -> str: + """Backward-compatible wrapper for tests/older call sites.""" + return self._sync_consumer_apply_mode(record) + + def _sync_should_tick(self, current_time: float) -> bool: + last = getattr(self, "_last_sync_check", 0.0) or 0.0 + if current_time - last < self._sync_poll_interval(): + return False + self._last_sync_check = current_time + return True + + def _sync_safe_start_container(self) -> None: + """Restart the container after a sync slice. Failures are logged, not + raised, because the periodic loop will retry and ``_check_container_status`` + will pick up a still-stopped container on the next pass. + + Calls ``_reset_runtime_state_post_start`` after the start so that + readiness gates, health-probe timers, log capture, and + BUILD_AND_RUN_COMMANDS all re-engage against the freshly-started + container — same contract ``_restart_container`` follows. Without this, + tunnels stay marked ready, health checks are skipped, log streams are + stale, and image-defined startup commands don't rerun. + + The reset is guarded by its own try/except so a failed reset does not + roll back a successful start — the next periodic tick can re-evaluate + readiness. 
+ """ + try: + self.start_container() + except Exception as exc: + self.P(f"[sync] start_container after sync slice failed: {exc}", color="r") + return + try: + self._reset_runtime_state_post_start() + except Exception as exc: + self.P( + f"[sync] runtime-state reset after sync slice failed: {exc}", color="r" + ) diff --git a/extensions/business/container_apps/tests/support.py b/extensions/business/container_apps/tests/support.py index fce7f5a8..dc09bf67 100644 --- a/extensions/business/container_apps/tests/support.py +++ b/extensions/business/container_apps/tests/support.py @@ -8,6 +8,41 @@ import numpy as _np +def install_docker_stub_if_needed(): + """Provide the tiny docker-py surface these unit tests need.""" + if "docker" in sys.modules and "docker.errors" in sys.modules and "docker.types" in sys.modules: + return + + docker_mod = types.ModuleType("docker") + errors_mod = types.ModuleType("docker.errors") + types_mod = types.ModuleType("docker.types") + + class DockerException(Exception): + pass + + class NotFound(DockerException): + pass + + class DeviceRequest: + def __init__(self, **kwargs): + self.kwargs = kwargs + + errors_mod.DockerException = DockerException + errors_mod.NotFound = NotFound + types_mod.DeviceRequest = DeviceRequest + docker_mod.errors = errors_mod + docker_mod.types = types_mod + docker_mod.from_env = MagicMock() + + sys.modules.setdefault("docker", docker_mod) + sys.modules.setdefault("docker.errors", errors_mod) + sys.modules.setdefault("docker.types", types_mod) + return + + +install_docker_stub_if_needed() + + class _DummyBasePlugin: CONFIG = {'VALIDATION_RULES': {}} @@ -210,6 +245,7 @@ def _log(*args, **kwargs): plugin.cfg_extra_tunnels_ping_interval = 30 plugin.cfg_health_check = {} plugin.cfg_restart_policy = "always" + plugin.cfg_plugin_stop_timeout = 45 plugin.volumes = {} plugin.extra_ports_mapping = {} plugin.inverted_ports_mapping = {} @@ -225,6 +261,8 @@ def _log(*args, **kwargs): plugin._health_probing_disabled = False plugin._normalized_exposed_ports = {} plugin._normalized_main_exposed_port = None + plugin._cleanup_failed = False + plugin._manual_stop_pending = False plugin.container = object() plugin.container_name = "car_instance" plugin.log = types.SimpleNamespace(get_localhost_ip=lambda: "127.0.0.1") @@ -328,6 +366,8 @@ def make_lifecycle_runner(docker_client=None, mock_container=None, **cfg_overrid # State machine plugin.container_state = ContainerState.UNINITIALIZED plugin.stop_reason = StopReason.UNKNOWN + plugin._cleanup_failed = False + plugin._manual_stop_pending = False # Restart/backoff plugin._consecutive_failures = 0 @@ -371,6 +411,7 @@ def make_lifecycle_runner(docker_client=None, mock_container=None, **cfg_overrid plugin._last_extra_tunnels_ping = 0 plugin._last_paused_log = 0 plugin.cfg_paused_state_log_interval = 60 + plugin.cfg_plugin_stop_timeout = 45 plugin.cfg_show_log_each = 60 plugin.cfg_show_log_last_lines = 5 plugin.cfg_semaphore_log_interval = 10 diff --git a/extensions/business/container_apps/tests/test_container_lifecycle.py b/extensions/business/container_apps/tests/test_container_lifecycle.py index 317b827b..dc99c698 100644 --- a/extensions/business/container_apps/tests/test_container_lifecycle.py +++ b/extensions/business/container_apps/tests/test_container_lifecycle.py @@ -10,9 +10,14 @@ """ import unittest +import subprocess from pathlib import Path from unittest.mock import patch, MagicMock +from extensions.business.container_apps.tests.support import install_docker_stub_if_needed + 
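+# Register the docker stub before the `import docker.errors` below so these
+# tests import cleanly even when the real docker-py package is not installed.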
+install_docker_stub_if_needed() + import docker.errors import docker.types @@ -468,6 +473,107 @@ def test_process_respects_max_retries(self): errors = [m for m in plugin.logged_messages if "abandoned" in m.lower()] self.assertTrue(len(errors) > 0) + def test_process_retries_failed_cleanup_then_restarts(self): + """A transient cleanup failure must not permanently block process().""" + clock = {"now": 100} + plugin, client, _ = make_lifecycle_runner(cfg_restart_backoff_initial=0) + plugin.time = lambda: clock["now"] + plugin._cleanup_failed = True + plugin.container_state = ContainerState.FAILED + + attempts = {"count": 0} + + def retry_cleanup(): + attempts["count"] += 1 + plugin._cleanup_failed = attempts["count"] == 1 + return not plugin._cleanup_failed + + plugin._stop_container_and_save_logs_to_disk = retry_cleanup + + plugin.process() + self.assertTrue(plugin._cleanup_failed) + client.containers.run.assert_not_called() + + with _patch_docker_module(client): + plugin.process() + + self.assertFalse(plugin._cleanup_failed) + client.containers.run.assert_called_once() + self.assertEqual(plugin.container_state, ContainerState.RUNNING) + + def test_manual_stop_persists_only_after_cleanup_success(self): + plugin, _, _ = make_lifecycle_runner(cfg_restart_backoff_initial=0) + plugin._save_persistent_state = MagicMock() + plugin._clear_manual_stop_state = MagicMock() + plugin._stop_container_and_save_logs_to_disk = MagicMock(return_value=False) + + plugin.on_command("STOP") + + plugin._save_persistent_state.assert_not_called() + plugin._clear_manual_stop_state.assert_called_once() + self.assertTrue(plugin._manual_stop_pending) + self.assertEqual(plugin.container_state, ContainerState.FAILED) + + def test_pending_manual_stop_pauses_after_cleanup_retry_success(self): + plugin, client, _ = make_lifecycle_runner(cfg_restart_backoff_initial=0) + plugin._cleanup_failed = True + plugin._manual_stop_pending = True + plugin.container_state = ContainerState.FAILED + plugin._save_persistent_state = MagicMock() + plugin._stop_container_and_save_logs_to_disk = MagicMock(return_value=True) + + plugin.process() + + plugin._save_persistent_state.assert_called_once_with(manually_stopped=True) + client.containers.run.assert_not_called() + self.assertFalse(plugin._manual_stop_pending) + self.assertEqual(plugin.container_state, ContainerState.PAUSED) + + def test_restart_clears_pending_manual_stop_before_cleanup_retry(self): + plugin, client, _ = make_lifecycle_runner(cfg_restart_backoff_initial=0) + plugin._manual_stop_pending = True + plugin._cleanup_failed = True + plugin._save_persistent_state = MagicMock() + plugin._clear_manual_stop_state = MagicMock() + attempts = {"count": 0} + + def cleanup(): + attempts["count"] += 1 + plugin._cleanup_failed = attempts["count"] == 1 + return not plugin._cleanup_failed + + plugin._stop_container_and_save_logs_to_disk = cleanup + + plugin.on_command("RESTART") + + plugin._clear_manual_stop_state.assert_called_once() + plugin._save_persistent_state.assert_not_called() + self.assertFalse(plugin._manual_stop_pending) + self.assertTrue(plugin._cleanup_failed) + + with _patch_docker_module(client): + plugin.process() + + plugin._save_persistent_state.assert_not_called() + self.assertFalse(plugin._cleanup_failed) + self.assertEqual(plugin.container_state, ContainerState.RUNNING) + + def test_config_restart_respects_pending_manual_stop_cleanup(self): + plugin, _, _ = make_lifecycle_runner(cfg_restart_backoff_initial=0) + plugin._manual_stop_pending = True + plugin._cleanup_failed 
= True + plugin._save_persistent_state = MagicMock() + plugin._stop_container_and_save_logs_to_disk = MagicMock(return_value=True) + restart_callable = MagicMock() + + plugin._handle_config_restart(restart_callable) + + restart_callable.assert_not_called() + plugin._save_persistent_state.assert_called_once_with(manually_stopped=True) + self.assertFalse(plugin._manual_stop_pending) + self.assertFalse(plugin._cleanup_failed) + self.assertEqual(plugin.container_state, ContainerState.PAUSED) + def test_process_multiple_iterations_running(self): """Multiple process() calls with a healthy container should all succeed.""" plugin, _, container = make_lifecycle_runner() @@ -480,6 +586,45 @@ def test_process_multiple_iterations_running(self): self.assertEqual(plugin.container_state, ContainerState.RUNNING) +class _FakeProcess: + def __init__(self): + self.terminated = False + self.killed = False + self.wait_calls = 0 + + def poll(self): + return 0 if self.killed else None + + def terminate(self): + self.terminated = True + return + + def kill(self): + self.killed = True + return + + def wait(self, timeout=None): + self.wait_calls += 1 + if self.wait_calls == 1: + raise subprocess.TimeoutExpired(cmd="fake", timeout=timeout) + self.killed = True + return 0 + + +class TestTunnelCompatibilityFallbacks(unittest.TestCase): + """The edge PR must work even before the matching core PR is deployed.""" + + def test_local_subprocess_termination_fallback_without_core_helper(self): + plugin, _, _ = make_lifecycle_runner() + process = _FakeProcess() + + with patch("extensions.business.container_apps.container_app_runner.os.name", "nt"): + self.assertTrue(plugin._terminate_subprocess_tree(process, terminate_timeout=0, kill_timeout=0)) + + self.assertTrue(process.terminated) + self.assertTrue(process.killed) + + # =========================================================================== # Fixed-Size Volume Integration # =========================================================================== diff --git a/extensions/business/container_apps/tests/test_fixed_volume.py b/extensions/business/container_apps/tests/test_fixed_volume.py index cdc37bf7..20845054 100644 --- a/extensions/business/container_apps/tests/test_fixed_volume.py +++ b/extensions/business/container_apps/tests/test_fixed_volume.py @@ -250,6 +250,67 @@ def test_handles_missing_metadata(self, mock_run): # Should not raise even if meta_path doesn't exist cleanup(vol) + @patch("extensions.business.container_apps.fixed_volume._run") + def test_missing_metadata_with_mounted_path_reports_failure(self, mock_run): + vol = FixedVolume(name="data", size="100M", root=Path("/r")) + proc = f"/dev/sdb1 {vol.mount_path} ext4 rw 0 0\n" + with patch.object(Path, "exists", return_value=False), \ + patch("builtins.open", mock_open(read_data=proc)): + result = cleanup(vol) + self.assertFalse(result) + mock_run.assert_called_once_with(["umount", str(vol.mount_path)], logger=None) + + @patch("extensions.business.container_apps.fixed_volume._run") + def test_missing_metadata_with_loop_mount_recovers_and_detaches(self, mock_run): + vol = FixedVolume(name="data", size="100M", root=Path("/r")) + with patch.object(Path, "exists", return_value=False), \ + patch("extensions.business.container_apps.fixed_volume._get_mount_source", return_value="/dev/loop7"), \ + patch("extensions.business.container_apps.fixed_volume._is_path_mounted", return_value=False): + result = cleanup(vol) + self.assertTrue(result) + self.assertEqual( + [call_args.args[0] for call_args in 
mock_run.call_args_list], + [["umount", str(vol.mount_path)], ["losetup", "-d", "/dev/loop7"]], + ) + + @patch("extensions.business.container_apps.fixed_volume._run") + def test_malformed_metadata_reports_failure(self, mock_run): + vol = FixedVolume(name="data", size="100M", root=Path("/r")) + with patch.object(Path, "exists", return_value=True), \ + patch.object(Path, "read_text", return_value="{not-json"), \ + patch("builtins.open", mock_open(read_data="")): + result = cleanup(vol) + self.assertFalse(result) + + @patch("extensions.business.container_apps.fixed_volume._run") + def test_malformed_metadata_with_loop_mount_recovers_and_detaches(self, mock_run): + vol = FixedVolume(name="data", size="100M", root=Path("/r")) + with patch.object(Path, "exists", return_value=True), \ + patch.object(Path, "read_text", return_value="{not-json"), \ + patch("extensions.business.container_apps.fixed_volume._get_mount_source", return_value="/dev/loop9"), \ + patch("extensions.business.container_apps.fixed_volume._is_path_mounted", return_value=False): + result = cleanup(vol) + self.assertTrue(result) + self.assertEqual( + [call_args.args[0] for call_args in mock_run.call_args_list], + [["umount", str(vol.mount_path)], ["losetup", "-d", "/dev/loop9"]], + ) + + @patch("extensions.business.container_apps.fixed_volume._run") + def test_mounted_loop_source_overrides_stale_metadata_loop(self, mock_run): + vol = FixedVolume(name="data", size="100M", root=Path("/r")) + meta = {"loop_dev": "/dev/loop3"} + with patch.object(Path, "exists", return_value=True), \ + patch.object(Path, "read_text", return_value=json.dumps(meta)), \ + patch("extensions.business.container_apps.fixed_volume._get_mount_source", return_value="/dev/loop7"), \ + patch("extensions.business.container_apps.fixed_volume._is_path_mounted", return_value=False): + result = cleanup(vol) + self.assertTrue(result) + self.assertEqual( + [call_args.args[0] for call_args in mock_run.call_args_list], + [["umount", str(vol.mount_path)], ["losetup", "-d", "/dev/loop7"]], + ) + @patch("extensions.business.container_apps.fixed_volume._run") def test_handles_umount_failure(self, mock_run): vol = FixedVolume(name="data", size="100M", root=Path("/tmp/fv")) @@ -432,7 +493,18 @@ def test_calls_cleanup_for_each_volume(self, mock_cleanup): self.assertEqual(plugin._fixed_volumes, []) @patch("extensions.business.container_apps.fixed_volume.cleanup", - side_effect=[Exception("fail"), None]) + side_effect=[False, True]) + def test_retains_volume_when_cleanup_returns_false(self, mock_cleanup): + plugin = make_container_app_runner() + vol1 = FixedVolume(name="a", size="50M", root=Path("/r")) + vol2 = FixedVolume(name="b", size="50M", root=Path("/r")) + plugin._fixed_volumes = [vol1, vol2] + self.assertFalse(plugin._cleanup_fixed_size_volumes()) + self.assertEqual(mock_cleanup.call_count, 2) + self.assertEqual(plugin._fixed_volumes, [vol1]) + + @patch("extensions.business.container_apps.fixed_volume.cleanup", + side_effect=[Exception("fail"), True]) def test_continues_on_failure(self, mock_cleanup): plugin = make_container_app_runner() vol1 = FixedVolume(name="a", size="50M", root=Path("/r")) @@ -440,7 +512,7 @@ def test_continues_on_failure(self, mock_cleanup): plugin._fixed_volumes = [vol1, vol2] plugin._cleanup_fixed_size_volumes() # should not raise self.assertEqual(mock_cleanup.call_count, 2) - self.assertEqual(plugin._fixed_volumes, []) + self.assertEqual(plugin._fixed_volumes, [vol1]) if __name__ == "__main__": diff --git 
a/extensions/business/container_apps/tests/test_sync_control_files.py b/extensions/business/container_apps/tests/test_sync_control_files.py new file mode 100644 index 00000000..d1530db5 --- /dev/null +++ b/extensions/business/container_apps/tests/test_sync_control_files.py @@ -0,0 +1,228 @@ +"""Unit tests for sync JSON control-file mechanics.""" + +import json +import os +import tempfile +import unittest +from pathlib import Path +from unittest.mock import patch + +from extensions.business.container_apps.sync.control_files import ( + JsonControlFile, + JsonControlFileDecodeError, + JsonControlFileObjectError, + JsonControlFileUnsafeError, + write_json_atomic, +) + + +class TestWriteJsonAtomic(unittest.TestCase): + def setUp(self): + self._tmp = tempfile.TemporaryDirectory() + self.root = Path(self._tmp.name) + + def tearDown(self): + self._tmp.cleanup() + + def test_writes_json_atomically_with_app_readable_mode(self): + target = self.root / "nested" / "response.json" + + write_json_atomic(target, {"status": "ok", "version": 2}) + + self.assertEqual(json.loads(target.read_text()), {"status": "ok", "version": 2}) + self.assertEqual(os.stat(target).st_mode & 0o777, 0o644) + self.assertEqual(list(target.parent.glob(".response.json.*.tmp")), []) + + def test_cleans_tmp_file_on_write_failure(self): + target = self.root / "state.json" + + with patch( + "extensions.business.container_apps.sync.control_files.json.dump", + side_effect=RuntimeError("boom"), + ): + with self.assertRaises(RuntimeError): + write_json_atomic(target, {"status": "ok"}) + + self.assertFalse(target.exists()) + self.assertEqual(list(self.root.glob(".state.json.*.tmp")), []) + + def test_rejects_symlink_parent_directory(self): + outside = self.root / "outside" + outside.mkdir() + control_root = self.root / "volume-sync" + os.symlink(str(outside), str(control_root)) + + with self.assertRaises(JsonControlFileUnsafeError): + write_json_atomic(control_root / "response.json", {"status": "ok"}) + + self.assertFalse((outside / "response.json").exists()) + + +class TestJsonControlFile(unittest.TestCase): + def setUp(self): + self._tmp = tempfile.TemporaryDirectory() + self.root = Path(self._tmp.name) + self.control = JsonControlFile( + self.root, "request.json", "request.json.processing" + ) + + def tearDown(self): + self._tmp.cleanup() + + def test_claim_object_returns_none_when_absent(self): + self.assertIsNone(self.control.claim_object()) + + def test_claim_object_renames_and_parses_pending_file(self): + (self.root / "request.json").write_text( + '{"archive_paths":["/app/data/"],"metadata":{"k":1}}' + ) + + claimed = self.control.claim_object() + + self.assertIsNotNone(claimed) + self.assertEqual(claimed.body["archive_paths"], ["/app/data/"]) + self.assertEqual(claimed.body["metadata"], {"k": 1}) + self.assertFalse((self.root / "request.json").exists()) + self.assertTrue((self.root / "request.json.processing").is_file()) + self.assertEqual(claimed.processing_path, self.root / "request.json.processing") + + def test_claim_object_reports_malformed_json_with_raw_body(self): + (self.root / "request.json").write_text("not-json{") + + with self.assertRaises(JsonControlFileDecodeError) as ctx: + self.control.claim_object() + + self.assertEqual(ctx.exception.raw_body, "not-json{") + self.assertTrue((self.root / "request.json.processing").is_file()) + + def test_claim_object_reports_invalid_utf8_without_raw_body(self): + (self.root / "request.json").write_bytes(b'{"archive_paths": ["\xff"]}') + + with 
self.assertRaises(JsonControlFileDecodeError) as ctx: + self.control.claim_object() + + self.assertIsNone(ctx.exception.raw_body) + self.assertIn("invalid UTF-8", str(ctx.exception)) + self.assertTrue((self.root / "request.json.processing").is_file()) + + def test_claim_object_reports_non_object_json_with_raw_body(self): + (self.root / "request.json").write_text('["just","a","list"]') + + with self.assertRaises(JsonControlFileObjectError) as ctx: + self.control.claim_object() + + self.assertEqual(ctx.exception.raw_body, '["just","a","list"]') + self.assertIn("request.json must be a JSON object", str(ctx.exception)) + + def test_claim_object_rejects_symlink_without_reading_target(self): + secret = self.root / "secret.txt" + secret.write_text("host-secret") + os.symlink(str(secret), str(self.root / "request.json")) + + with self.assertRaises(JsonControlFileUnsafeError) as ctx: + self.control.claim_object() + + self.assertNotIn("host-secret", str(ctx.exception)) + self.assertIsNone(ctx.exception.raw_body) + self.assertFalse((self.root / "request.json").exists()) + self.assertFalse(os.path.lexists(str(self.root / "request.json.processing"))) + + def test_claim_object_rejects_fifo_without_blocking(self): + fifo = self.root / "request.json" + os.mkfifo(str(fifo)) + + with self.assertRaises(JsonControlFileUnsafeError): + self.control.claim_object() + + self.assertFalse(os.path.lexists(str(fifo))) + self.assertFalse(os.path.lexists(str(self.root / "request.json.processing"))) + + def test_claim_object_quarantines_non_empty_directory_request(self): + request_dir = self.root / "request.json" + request_dir.mkdir() + (request_dir / "payload").write_text("keep me") + + with self.assertRaises(JsonControlFileUnsafeError): + self.control.claim_object() + + self.assertFalse(os.path.lexists(str(request_dir))) + quarantined = list(self.root.glob("request.json.unsafe.*")) + self.assertEqual(len(quarantined), 1) + self.assertEqual((quarantined[0] / "payload").read_text(), "keep me") + + def test_discard_processing_removes_processing_file(self): + (self.root / "request.json.processing").write_text("{}") + + self.control.discard_processing() + + self.assertFalse((self.root / "request.json.processing").exists()) + + def test_discard_processing_removes_broken_symlink(self): + os.symlink(str(self.root / "missing.json"), str(self.root / "request.json.processing")) + + self.control.discard_processing() + + self.assertFalse(os.path.lexists(str(self.root / "request.json.processing"))) + + def test_discard_processing_quarantines_non_empty_directory(self): + proc_dir = self.root / "request.json.processing" + proc_dir.mkdir() + (proc_dir / "payload").write_text("keep me") + + self.control.discard_processing() + + self.assertFalse(os.path.lexists(str(proc_dir))) + quarantined = list(self.root.glob("request.json.processing.unsafe.*")) + self.assertEqual(len(quarantined), 1) + self.assertEqual((quarantined[0] / "payload").read_text(), "keep me") + + def test_recover_stale_processing_removes_symlink(self): + os.symlink(str(self.root / "missing.json"), str(self.root / "request.json.processing")) + + recovered = self.control.recover_stale_processing() + + self.assertFalse(recovered) + self.assertFalse(os.path.lexists(str(self.root / "request.json.processing"))) + + def test_recover_stale_processing_renames_only_orphan(self): + (self.root / "request.json.processing").write_text('{"old":true}') + + recovered = self.control.recover_stale_processing() + + self.assertTrue(recovered) + self.assertTrue((self.root / 
"request.json").is_file()) + self.assertFalse((self.root / "request.json.processing").exists()) + + def test_recover_stale_processing_does_not_overwrite_pending(self): + (self.root / "request.json").write_text('{"new":true}') + (self.root / "request.json.processing").write_text('{"old":true}') + + recovered = self.control.recover_stale_processing() + + self.assertFalse(recovered) + self.assertEqual(json.loads((self.root / "request.json").read_text()), {"new": True}) + self.assertTrue((self.root / "request.json.processing").exists()) + + def test_recover_stale_processing_quarantines_non_regular_directory(self): + proc_dir = self.root / "request.json.processing" + proc_dir.mkdir() + (proc_dir / "payload").write_text("keep me") + + recovered = self.control.recover_stale_processing() + + self.assertFalse(recovered) + self.assertFalse(os.path.lexists(str(proc_dir))) + quarantined = list(self.root.glob("request.json.processing.unsafe.*")) + self.assertEqual(len(quarantined), 1) + self.assertEqual((quarantined[0] / "payload").read_text(), "keep me") + + def test_write_json_writes_relative_to_control_root(self): + self.control.write_json("response.json", {"status": "ok"}) + + self.assertEqual( + json.loads((self.root / "response.json").read_text()), {"status": "ok"} + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/extensions/business/container_apps/tests/test_sync_manager.py b/extensions/business/container_apps/tests/test_sync_manager.py new file mode 100644 index 00000000..ebea0f93 --- /dev/null +++ b/extensions/business/container_apps/tests/test_sync_manager.py @@ -0,0 +1,1869 @@ +"""Unit tests for sync_manager.SyncManager pure helpers. + +Covers the path-validation chokepoint (resolve_container_path), atomic JSON +writes, and history append/latest/update operations using a temporary +plugin-data directory and a stub owner that mimics the BasePlugin surface +the manager depends on. 
+""" + +import json +import os +import io +import stat +import tarfile +import tempfile +import time +import unittest +from pathlib import Path +from types import SimpleNamespace +from unittest.mock import patch + +from extensions.business.container_apps.sync import ( + SYNC_PROCESSING_FILE, + SYSTEM_VOLUME_NAME, + SYSTEM_VOLUME_MOUNT, + SyncManager, + history_received_dir, + history_sent_dir, + sync_state_dir, + system_volume_host_root, + volume_sync_dir, +) +from extensions.business.container_apps.sync.manager import ( + PROVIDER_CAPTURE_ONLINE, + SyncRequest, + SyncRuntimePolicy, +) + + +def _tar_bytes(name: str, content: bytes) -> bytes: + buff = io.BytesIO() + with tarfile.open(fileobj=buff, mode="w") as tar: + info = tarfile.TarInfo(name=name) + info.size = len(content) + info.mode = 0o644 + tar.addfile(info, io.BytesIO(content)) + return buff.getvalue() + + +class _FakeDockerArchiveContainer: + def __init__(self, archives: dict[str, bytes]): + self.archives = dict(archives) + self.get_archive_calls: list[str] = [] + + def get_archive(self, path): + self.get_archive_calls.append(path) + archive = self.archives[path] + name = os.path.basename(path.rstrip("/")) or "/" + return iter([archive]), {"name": name} + + +class _FakeR1FS: + """Minimal r1fs stub for orchestrator tests.""" + + def __init__(self): + self.added: dict[str, bytes] = {} + self.deleted: list[tuple[str, bool, bool]] = [] + self.add_should_raise: Exception | None = None + self.add_should_return_empty = False + self.get_should_raise: Exception | None = None + self.delete_should_raise: Exception | None = None + self._counter = 0 + + def add_file(self, file_path: str) -> str: + if self.add_should_raise: + raise self.add_should_raise + if self.add_should_return_empty: + return "" + self._counter += 1 + cid = f"QmFAKE{self._counter:08d}" + with open(file_path, "rb") as handle: + self.added[cid] = handle.read() + return cid + + def get_file(self, cid: str) -> str: + if self.get_should_raise: + raise self.get_should_raise + if cid not in self.added: + return "" + fd, path = tempfile.mkstemp(suffix=".tar.gz") + with os.fdopen(fd, "wb") as out: + out.write(self.added[cid]) + return path + + def delete_file( + self, + cid: str, + unpin_remote: bool = False, + cleanup_local_files: bool = False, + **_kwargs, + ) -> dict: + if self.delete_should_raise: + raise self.delete_should_raise + self.added.pop(cid, None) + self.deleted.append((cid, unpin_remote, cleanup_local_files)) + return {"ok": True} + + +class _FakeChainStore: + """Minimal chainstore stub: a process-local hkey/key dict.""" + + def __init__(self): + self.store: dict[tuple[str, str], object] = {} + self.hset_calls: list[tuple[str, str, object]] = [] + self.hsync_calls: list[str] = [] + self.hset_should_raise: Exception | None = None + self.hsync_should_raise: Exception | None = None + self.hset_returns: bool = True + + def hset(self, hkey, key, value, **_kwargs): + if self.hset_should_raise: + raise self.hset_should_raise + self.hset_calls.append((hkey, key, value)) + self.store[(hkey, key)] = value + return self.hset_returns + + def hget(self, hkey, key, **_kwargs): + return self.store.get((hkey, key)) + + def hsync(self, hkey, **_kwargs): + self.hsync_calls.append(hkey) + if self.hsync_should_raise: + raise self.hsync_should_raise + return None + + +def _make_owner(tmpdir: Path) -> SimpleNamespace: + """Build a minimal owner stub for SyncManager tests.""" + data_folder = tmpdir / "_local_cache" / "_data" + data_folder.mkdir(parents=True) + instance_subfolder = 
"pipelines_data/test_pipe/test_inst" + + fixed_root = data_folder / instance_subfolder / "fixed_volumes" / "mounts" + fixed_root.mkdir(parents=True) + (fixed_root / SYSTEM_VOLUME_NAME).mkdir() + (fixed_root / "appdata").mkdir() + (fixed_root / "legacy_bind").mkdir() # pretend FILE_VOLUMES path + + volumes = { + str(fixed_root / SYSTEM_VOLUME_NAME): {"bind": SYSTEM_VOLUME_MOUNT, "mode": "rw"}, + str(fixed_root / "appdata"): {"bind": "/app/data", "mode": "rw"}, + # A path that looks like a fixed-size volume but isn't (no fixed_volumes/mounts/ root) + str(tmpdir / "tmpfs_legacy"): {"bind": "/app/legacy", "mode": "rw"}, + } + (tmpdir / "tmpfs_legacy").mkdir() + + output_folder = tmpdir / "output" + output_folder.mkdir() + + msgs: list[str] = [] + r1fs = _FakeR1FS() + cs = _FakeChainStore() + # Track time so each call to time() returns a slightly larger float, which + # lets us emit successive snapshots with distinct version timestamps in + # the same test without sleeping. + clock = [1714742400.0] + def _time(): + clock[0] += 1.0 + return clock[0] + return SimpleNamespace( + get_data_folder=lambda: str(data_folder), + _get_instance_data_subfolder=lambda: instance_subfolder, + get_output_folder=lambda: str(output_folder), + volumes=volumes, + time=_time, + ee_id="ee_test_provider", + cfg_sync_key="11111111-1111-1111-1111-111111111111", + cfg_sync_type="provider", + r1fs=r1fs, + chainstore_hset=cs.hset, + chainstore_hget=cs.hget, + chainstore_hsync=cs.hsync, + P=lambda msg, color=None: msgs.append(f"[{color or ''}] {msg}"), + _msgs=msgs, + _fixed_root=fixed_root, + _output_folder=output_folder, + _r1fs=r1fs, + _cs=cs, + ) + + +# --------------------------------------------------------------------------- +# resolve_container_path +# --------------------------------------------------------------------------- + +class TestResolveContainerPath(unittest.TestCase): + def setUp(self): + self._tmp = tempfile.TemporaryDirectory() + self.tmpdir = Path(self._tmp.name) + self.owner = _make_owner(self.tmpdir) + self.sm = SyncManager(self.owner) + + def tearDown(self): + self._tmp.cleanup() + + def test_happy_path_directory(self): + host, bind, host_root = self.sm.resolve_container_path("/app/data/") + self.assertTrue(host.endswith("fixed_volumes/mounts/appdata")) + self.assertEqual(bind, "/app/data") + self.assertTrue(host_root.endswith("fixed_volumes/mounts/appdata")) + + def test_happy_path_subfile(self): + host, _, _ = self.sm.resolve_container_path("/app/data/foo.bin") + self.assertTrue(host.endswith("fixed_volumes/mounts/appdata/foo.bin")) + + def test_rejects_relative(self): + with self.assertRaisesRegex(ValueError, "must be absolute"): + self.sm.resolve_container_path("app/data/") + + def test_rejects_dotdot(self): + with self.assertRaisesRegex(ValueError, r"must not contain"): + self.sm.resolve_container_path("/app/data/../../etc/passwd") + + def test_rejects_unmounted(self): + with self.assertRaisesRegex(ValueError, "no mounted volume covers"): + self.sm.resolve_container_path("/nope/") + + def test_rejects_anonymous_mount(self): + """Rule 3 admits FIXED_SIZE_VOLUMES and legacy VOLUMES (both per-instance + host directories under known roots). Mounts that aren't under either — + anonymous Docker mounts, FILE_VOLUMES content files, ephemeral container + fs — are still rejected. The fixture's ``/app/legacy`` mount is bound at + ``tmpdir/tmpfs_legacy`` (outside both allow-listed roots) so it stands in + for the "anonymous mount" case here. 
+ """ + with self.assertRaisesRegex(ValueError, "non-volume-backed mount"): + self.sm.resolve_container_path("/app/legacy/x") + + def test_rejects_system_volume(self): + with self.assertRaisesRegex(ValueError, "anti-recursion"): + self.sm.resolve_container_path("/r1en_system/foo") + + def test_rejects_system_volume_root(self): + with self.assertRaisesRegex(ValueError, "anti-recursion"): + self.sm.resolve_container_path("/r1en_system") + + def test_rejects_empty(self): + with self.assertRaisesRegex(ValueError, "non-empty"): + self.sm.resolve_container_path("") + + def test_longest_prefix_wins_for_nested_mounts(self): + """Nested fixed-size mounts (/app and /app/data) must resolve by the most + specific bind, not by dict insertion order. Docker overlays the deeper + mount on top of the broader one inside the container, so a path under + /app/data must resolve to the /app/data mount's host root even when /app + was added to self.volumes first. The previous first-match-wins iteration + silently mapped to the wrong host root (codex review finding 3 on PR #399). + """ + fixed_root = self.owner._fixed_root + (fixed_root / "outer_app").mkdir(exist_ok=True) + (fixed_root / "inner_data").mkdir(exist_ok=True) + # Order matters: insert the broader mount FIRST so first-match-wins would + # pick the wrong one. + self.owner.volumes = { + str(fixed_root / "outer_app"): {"bind": "/app", "mode": "rw"}, + str(fixed_root / "inner_data"): {"bind": "/app/data", "mode": "rw"}, + } + host, bind, host_root = self.sm.resolve_container_path("/app/data/foo.bin") + self.assertTrue(host.endswith("fixed_volumes/mounts/inner_data/foo.bin")) + self.assertEqual(bind, "/app/data") + self.assertTrue(host_root.endswith("fixed_volumes/mounts/inner_data")) + + def test_longest_prefix_wins_regardless_of_insertion_order(self): + """Same as above but with the dict items in the opposite order. The result + must be identical — specificity, not insertion order, decides the winner. + """ + fixed_root = self.owner._fixed_root + (fixed_root / "outer_app").mkdir(exist_ok=True) + (fixed_root / "inner_data").mkdir(exist_ok=True) + self.owner.volumes = { + str(fixed_root / "inner_data"): {"bind": "/app/data", "mode": "rw"}, + str(fixed_root / "outer_app"): {"bind": "/app", "mode": "rw"}, + } + host, bind, _ = self.sm.resolve_container_path("/app/data/foo.bin") + self.assertTrue(host.endswith("fixed_volumes/mounts/inner_data/foo.bin")) + self.assertEqual(bind, "/app/data") + + def test_outer_bind_still_resolves_for_paths_only_it_covers(self): + """Paths that fall under the broader mount but NOT the nested one must + still resolve to the broader mount — longest-prefix-match must not break + legitimate routes through the outer bind. + """ + fixed_root = self.owner._fixed_root + (fixed_root / "outer_app").mkdir(exist_ok=True) + (fixed_root / "inner_data").mkdir(exist_ok=True) + self.owner.volumes = { + str(fixed_root / "outer_app"): {"bind": "/app", "mode": "rw"}, + str(fixed_root / "inner_data"): {"bind": "/app/data", "mode": "rw"}, + } + host, bind, _ = self.sm.resolve_container_path("/app/other.bin") + self.assertTrue(host.endswith("fixed_volumes/mounts/outer_app/other.bin")) + self.assertEqual(bind, "/app") + + def test_legacy_volumes_resolves_to_host_root(self): + """Rule 3 admits legacy VOLUMES. Their host roots live under + CONTAINER_VOLUMES_PATH (/edge_node/_local_cache/_data/container_volumes/), + which is per-instance and bounded — functionally equivalent to + fixed-size for sync purposes. Plan: extend-sync-to-legacy-VOLUMES. 
+ """ + from extensions.business.container_apps.container_utils import ( + CONTAINER_VOLUMES_PATH, + ) + # Place a fake legacy host root and bind it into the volumes dict. + # We can't use the real CONTAINER_VOLUMES_PATH on a CI host without root, + # so monkeypatch it (constants_in_path comparison normalizes the value). + legacy_root = Path(self.tmpdir) / "edge_node" / "_local_cache" / "_data" / "container_volumes" + instance_dir = legacy_root / "test_instance_appdata" + instance_dir.mkdir(parents=True) + + self.owner.volumes = { + str(instance_dir): {"bind": "/app/data", "mode": "rw"}, + } + + # Patch CONTAINER_VOLUMES_PATH on the manager module so the resolver + # accepts our temp legacy root for the duration of the test. + import extensions.business.container_apps.sync.manager as manager_mod + original = manager_mod.CONTAINER_VOLUMES_PATH + manager_mod.CONTAINER_VOLUMES_PATH = str(legacy_root) + try: + host, bind, host_root = self.sm.resolve_container_path("/app/data/foo.bin") + finally: + manager_mod.CONTAINER_VOLUMES_PATH = original + + self.assertTrue(host.endswith("test_instance_appdata/foo.bin")) + self.assertEqual(bind, "/app/data") + self.assertEqual(host_root, str(instance_dir)) + + +# --------------------------------------------------------------------------- +# _write_json_atomic +# --------------------------------------------------------------------------- + +class TestAtomicJsonWrite(unittest.TestCase): + def setUp(self): + self._tmp = tempfile.TemporaryDirectory() + self.tmpdir = Path(self._tmp.name) + self.owner = _make_owner(self.tmpdir) + self.sm = SyncManager(self.owner) + + def tearDown(self): + self._tmp.cleanup() + + def test_writes_json_and_creates_parent(self): + target = self.tmpdir / "deep" / "nested" / "out.json" + self.sm._write_json_atomic(target, {"hello": "world", "n": 7}) + self.assertTrue(target.is_file()) + data = json.loads(target.read_text()) + self.assertEqual(data, {"hello": "world", "n": 7}) + + def test_no_orphan_tmp_on_success(self): + target = self.tmpdir / "out.json" + self.sm._write_json_atomic(target, {"x": 1}) + leftovers = [p for p in self.tmpdir.iterdir() if p.name.startswith(".out.json")] + self.assertEqual(leftovers, [], f"leftover tmps: {leftovers}") + + def test_overwrites_existing(self): + target = self.tmpdir / "out.json" + target.write_text('{"old": true}') + self.sm._write_json_atomic(target, {"new": True}) + self.assertEqual(json.loads(target.read_text()), {"new": True}) + + +# --------------------------------------------------------------------------- +# History readers / writers +# --------------------------------------------------------------------------- + +class TestHistory(unittest.TestCase): + def setUp(self): + self._tmp = tempfile.TemporaryDirectory() + self.tmpdir = Path(self._tmp.name) + self.owner = _make_owner(self.tmpdir) + self.sm = SyncManager(self.owner) + + def tearDown(self): + self._tmp.cleanup() + + def test_filename_pads_version_and_truncates_cid(self): + fname = SyncManager._history_filename(1714742400, "QmHash1234567890ABCDEF") + self.assertEqual(fname, "1714742400__QmHash123456.json") + + def test_filename_handles_short_cid(self): + fname = SyncManager._history_filename(7, "Qm") + self.assertEqual(fname, "0000000007__Qm.json") + + def test_filename_handles_missing_cid(self): + fname = SyncManager._history_filename(7, "") + self.assertEqual(fname, "0000000007__no_cid.json") + + def test_append_sent_writes_under_history_sent(self): + entry = {"cid": "QmAA1", "version": 100, "node_id": "ee_x"} + path = 
self.sm.append_sent(entry) + self.assertEqual(path.parent, history_sent_dir(self.owner)) + self.assertEqual(path.name, "0000000100__QmAA1.json") + data = json.loads(path.read_text()) + self.assertEqual(data["cid"], "QmAA1") + self.assertEqual(data["deletion"], { + "deleted_at": None, "deletion_succeeded": None, "deletion_error": None + }) + + def test_append_received_uses_received_dir(self): + entry = {"cid": "QmBB", "version": 50, "node_id": "ee_y"} + path = self.sm.append_received(entry) + self.assertEqual(path.parent, history_received_dir(self.owner)) + + def test_latest_picks_most_recently_written(self): + """latest_sent / latest_received use mtime, not filename ordering, so a + back-dated version (e.g. clock-skewed provider) doesn't permanently + 'win' over an entry written after it.""" + self.sm.append_sent({"cid": "Qm1", "version": 100}) + # Tiny sleep to guarantee distinct mtimes on filesystems with low + # mtime resolution. + import time as _t; _t.sleep(0.01) + self.sm.append_sent({"cid": "Qm3", "version": 300}) + _t.sleep(0.01) + # Entry written LAST has version=200 — lex-smaller filename than + # Qm3's, but the most recent on disk. mtime sort returns it. + self.sm.append_sent({"cid": "Qm2", "version": 200}) + latest = self.sm.latest_sent() + self.assertIsNotNone(latest) + self.assertEqual(latest["cid"], "Qm2") + self.assertEqual(latest["version"], 200) + + def test_latest_returns_none_when_empty(self): + self.assertIsNone(self.sm.latest_sent()) + self.assertIsNone(self.sm.latest_received()) + + def test_update_history_deletion_modifies_in_place(self): + entry = {"cid": "Qm9", "version": 999} + path = self.sm.append_sent(entry) + + self.sm.update_history_deletion( + history_sent_dir(self.owner), entry, succeeded=True, error=None + ) + data = json.loads(path.read_text()) + self.assertTrue(data["deletion"]["deletion_succeeded"]) + self.assertEqual(data["deletion"]["deletion_error"], None) + self.assertGreater(data["deletion"]["deleted_at"], 1714742400.0) + self.assertEqual(data["cid"], "Qm9") # rest of payload preserved + + def test_update_history_deletion_records_failure(self): + entry = {"cid": "Qm9", "version": 999} + self.sm.append_sent(entry) + self.sm.update_history_deletion( + history_sent_dir(self.owner), entry, succeeded=False, error="daemon down" + ) + path = history_sent_dir(self.owner) / "0000000999__Qm9.json" + data = json.loads(path.read_text()) + self.assertFalse(data["deletion"]["deletion_succeeded"]) + self.assertEqual(data["deletion"]["deletion_error"], "daemon down") + + def test_update_history_deletion_missing_file_logs(self): + entry = {"cid": "QmMissing", "version": 1} + # Don't append; just call update — should log warning, not raise. 
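+    # (Per the _history_filename convention it would look for
+    # "0000000001__QmMissing.json", which was never written.)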
+ self.sm.update_history_deletion( + history_sent_dir(self.owner), entry, succeeded=True, error=None + ) + self.assertTrue(any("history file missing" in m for m in self.owner._msgs)) + + +# --------------------------------------------------------------------------- +# claim_request +# --------------------------------------------------------------------------- + +class TestClaimRequest(unittest.TestCase): + def setUp(self): + self._tmp = tempfile.TemporaryDirectory() + self.tmpdir = Path(self._tmp.name) + self.owner = _make_owner(self.tmpdir) + self.sm = SyncManager(self.owner) + # Provision the volume-sync subdir on the host (mimics _configure_system_volume) + self.vsd = volume_sync_dir(self.owner) + self.vsd.mkdir(parents=True, exist_ok=True) + + def tearDown(self): + self._tmp.cleanup() + + def _write_request(self, body): + (self.vsd / "request.json").write_text(json.dumps(body)) + + def _read_invalid(self): + p = self.vsd / "request.json.invalid" + if not p.exists(): + return None + return json.loads(p.read_text()) + + def _read_response(self): + p = self.vsd / "response.json" + if not p.exists(): + return None + return json.loads(p.read_text()) + + def test_no_pending_returns_none(self): + self.assertIsNone(self.sm.claim_request()) + + def test_happy_path(self): + self._write_request({"archive_paths": ["/app/data/"], "metadata": {"k": 1}}) + result = self.sm.claim_request() + self.assertIsNotNone(result) + self.assertEqual(result.archive_paths, ["/app/data/"]) + self.assertEqual(result.metadata, {"k": 1}) + self.assertEqual(result.runtime.provider_capture, "offline") + self.assertEqual(result.runtime.consumer_apply, "offline_restart") + # request.json gone, .processing present, no .invalid + self.assertFalse((self.vsd / "request.json").exists()) + self.assertTrue((self.vsd / "request.json.processing").exists()) + self.assertIsNone(self._read_invalid()) + + def test_runtime_policy_parsed(self): + self.owner.cfg_sync_allow_online_provider_capture = True + self._write_request({ + "archive_paths": ["/app/data/"], + "runtime": { + "provider_capture": "online", + "consumer_apply": "online_no_restart", + }, + }) + + result = self.sm.claim_request() + + self.assertIsNotNone(result) + self.assertEqual(result.runtime.provider_capture, "online") + self.assertEqual(result.runtime.consumer_apply, "online_no_restart") + + def test_runtime_policy_must_be_object(self): + self._write_request({"archive_paths": ["/app/data/"], "runtime": "online"}) + + self.assertIsNone(self.sm.claim_request()) + + self.assertIn("runtime must be a JSON object", self._read_invalid()["_error"]["error"]) + + def test_invalid_provider_capture_rejected(self): + self._write_request({ + "archive_paths": ["/app/data/"], + "runtime": {"provider_capture": "maybe"}, + }) + + self.assertIsNone(self.sm.claim_request()) + + err = self._read_invalid()["_error"]["error"] + self.assertIn("provider_capture", err) + self.assertIn("maybe", err) + + def test_invalid_consumer_apply_rejected(self): + self._write_request({ + "archive_paths": ["/app/data/"], + "runtime": {"consumer_apply": "sometimes"}, + }) + + self.assertIsNone(self.sm.claim_request()) + + err = self._read_invalid()["_error"]["error"] + self.assertIn("consumer_apply", err) + self.assertIn("sometimes", err) + + def test_online_provider_capture_allows_unmounted_path(self): + self.owner.cfg_sync_allow_online_provider_capture = True + self._write_request({ + "archive_paths": ["/tmp/generated.txt"], + "runtime": {"provider_capture": "online"}, + }) + + result = 
self.sm.claim_request() + + self.assertIsNotNone(result) + self.assertEqual(result.archive_paths, ["/tmp/generated.txt"]) + self.assertEqual(result.runtime.provider_capture, "online") + + def test_online_provider_capture_rejected_without_local_opt_in(self): + self._write_request({ + "archive_paths": ["/tmp/generated.txt"], + "runtime": {"provider_capture": "online"}, + }) + + self.assertIsNone(self.sm.claim_request()) + + err = self._read_invalid()["_error"]["error"] + self.assertIn("ALLOW_ONLINE_PROVIDER_CAPTURE", err) + + def test_malformed_json(self): + (self.vsd / "request.json").write_text("not-json{") + self.assertIsNone(self.sm.claim_request()) + invalid = self._read_invalid() + self.assertIsNotNone(invalid) + self.assertIsNone(invalid["request"]) + self.assertEqual(invalid["_error"]["stage"], "validation") + self.assertIn("malformed JSON", invalid["_error"]["error"]) + self.assertEqual(invalid["_error"]["raw_body"], "not-json{") + response = self._read_response() + self.assertEqual(response["status"], "error") + self.assertEqual(response["stage"], "validation") + self.assertFalse((self.vsd / "request.json.processing").exists()) + + def test_request_symlink_rejected_without_leaking_target_body(self): + secret = self.tmpdir / "host-secret.txt" + secret.write_text("not-json-secret-token") + os.symlink(str(secret), str(self.vsd / "request.json")) + + self.assertIsNone(self.sm.claim_request()) + + invalid = self._read_invalid() + self.assertIsNotNone(invalid) + self.assertIsNone(invalid["request"]) + self.assertEqual(invalid["_error"]["stage"], "validation") + self.assertIn("symlink control file", invalid["_error"]["error"]) + self.assertNotIn("raw_body", invalid["_error"]) + self.assertNotIn("not-json-secret-token", json.dumps(invalid)) + self.assertFalse((self.vsd / "request.json.processing").exists()) + + def test_not_an_object(self): + self._write_request(["just", "a", "list"]) + self.assertIsNone(self.sm.claim_request()) + self.assertEqual(self._read_invalid()["_error"]["error"], + "request.json must be a JSON object") + + def test_missing_archive_paths(self): + self._write_request({"metadata": {}}) + self.assertIsNone(self.sm.claim_request()) + self.assertIn("archive_paths must be a non-empty list", + self._read_invalid()["_error"]["error"]) + + def test_empty_archive_paths(self): + self._write_request({"archive_paths": []}) + self.assertIsNone(self.sm.claim_request()) + self.assertIn("archive_paths must be a non-empty list", + self._read_invalid()["_error"]["error"]) + + def test_metadata_must_be_object(self): + self._write_request({"archive_paths": ["/app/data/"], "metadata": "nope"}) + self.assertIsNone(self.sm.claim_request()) + self.assertIn("metadata must be a JSON object", + self._read_invalid()["_error"]["error"]) + + def test_path_traversal_rejected(self): + self._write_request({"archive_paths": ["/app/../../etc/passwd"]}) + self.assertIsNone(self.sm.claim_request()) + invalid = self._read_invalid() + self.assertEqual(invalid["_error"]["stage"], "validation") + self.assertIn("..", invalid["_error"]["error"]) + self.assertEqual(invalid["request"]["archive_paths"], ["/app/../../etc/passwd"]) + + def test_unmounted_path_rejected(self): + self._write_request({"archive_paths": ["/nope/"]}) + self.assertIsNone(self.sm.claim_request()) + self.assertIn("no mounted volume covers", + self._read_invalid()["_error"]["error"]) + + def test_anonymous_mount_rejected(self): + """The fixture's ``/app/legacy`` mount is bound at ``tmpdir/tmpfs_legacy`` + (outside both allow-listed roots), 
standing in for an anonymous Docker + mount or ephemeral fs. claim_request must surface a clear error so the + app sees ``request.json.invalid`` instead of a silent stall. + """ + self._write_request({"archive_paths": ["/app/legacy/x"]}) + self.assertIsNone(self.sm.claim_request()) + self.assertIn("non-volume-backed mount", + self._read_invalid()["_error"]["error"]) + + def test_system_volume_rejected(self): + self._write_request({"archive_paths": ["/r1en_system/x"]}) + self.assertIsNone(self.sm.claim_request()) + self.assertIn("anti-recursion", + self._read_invalid()["_error"]["error"]) + + def test_invalid_response_carries_archive_paths(self): + self._write_request({"archive_paths": ["/nope/"], "metadata": {"v": 1}}) + self.sm.claim_request() + response = self._read_response() + self.assertEqual(response["archive_paths"], ["/nope/"]) + + def test_failure_clears_processing(self): + self._write_request({"archive_paths": ["/nope/"]}) + self.sm.claim_request() + self.assertFalse((self.vsd / "request.json.processing").exists()) + + +# --------------------------------------------------------------------------- +# make_archive + extract_archive +# --------------------------------------------------------------------------- + +class TestArchiveRoundtrip(unittest.TestCase): + """Build a tar from a fake provider mount, extract it into a fake consumer + mount with the same container path layout, and confirm bytes round-trip.""" + + def setUp(self): + self._tmp = tempfile.TemporaryDirectory() + self.tmpdir = Path(self._tmp.name) + self.provider = _make_owner(self.tmpdir / "provider") + self.consumer = _make_owner(self.tmpdir / "consumer") + self.sm_p = SyncManager(self.provider) + self.sm_c = SyncManager(self.consumer) + # Seed provider's /app/data with content + self.appdata_p = self.provider._fixed_root / "appdata" + (self.appdata_p / "foo.bin").write_bytes(b"hello world\x00\xff") + (self.appdata_p / "subdir").mkdir() + (self.appdata_p / "subdir" / "nested.txt").write_text("nested!") + + def tearDown(self): + self._tmp.cleanup() + + def test_round_trip_directory(self): + tar_path, size = self.sm_p.make_archive(["/app/data/"]) + self.assertTrue(os.path.isfile(tar_path)) + self.assertGreater(size, 0) + + extracted = self.sm_c.extract_archive(tar_path) + self.assertTrue(any(e == "/app/data/" or e.startswith("/app/data/") for e in extracted)) + + appdata_c = self.consumer._fixed_root / "appdata" + self.assertEqual((appdata_c / "foo.bin").read_bytes(), b"hello world\x00\xff") + self.assertEqual((appdata_c / "subdir" / "nested.txt").read_text(), "nested!") + + def test_round_trip_file_only(self): + tar_path, _ = self.sm_p.make_archive(["/app/data/foo.bin"]) + self.sm_c.extract_archive(tar_path) + self.assertEqual( + (self.consumer._fixed_root / "appdata" / "foo.bin").read_bytes(), + b"hello world\x00\xff", + ) + + def test_make_archive_rejects_non_existent_host_path(self): + # Container path passes resolve_container_path but host file missing. 
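+    # (Concretely: /app/data/missing.bin maps onto the appdata mount, where
+    # setUp only seeded foo.bin and subdir/nested.txt.)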
+ with self.assertRaisesRegex(FileNotFoundError, "does not exist"): + self.sm_p.make_archive(["/app/data/missing.bin"]) + + def test_make_archive_propagates_validation(self): + with self.assertRaisesRegex(ValueError, "no mounted volume covers"): + self.sm_p.make_archive(["/nope/"]) + + def test_make_archive_rejects_symlink_path(self): + outside = self.tmpdir / "outside-provider" + outside.mkdir() + (outside / "secret.txt").write_text("secret") + (self.appdata_p / "escape").symlink_to(outside, target_is_directory=True) + + with self.assertRaisesRegex(ValueError, "symlink"): + self.sm_p.make_archive(["/app/data/escape/secret.txt"]) + + def test_make_archive_rejects_symlink_descendant(self): + outside = self.tmpdir / "outside-provider-desc" + outside.mkdir() + (outside / "secret.txt").write_text("secret") + (self.appdata_p / "subdir" / "escape").symlink_to( + outside, target_is_directory=True + ) + + with self.assertRaisesRegex(ValueError, "symlink"): + self.sm_p.make_archive(["/app/data/"]) + + # ---- legacy VOLUMES round-trip tests ------------------------------------ + # + # Rule 3 admits legacy VOLUMES in addition to FIXED_SIZE_VOLUMES; these + # tests prove the round-trip works regardless of which root backs each + # side's mount. The fake legacy root lives under tmpdir, and we + # monkeypatch ``manager_mod.CONTAINER_VOLUMES_PATH`` to point at it for + # the duration of each test so Rule 3 accepts the synthetic location. + # + # The cross-type cases (legacy ↔ fixed-size) confirm the soft-migration + # path: a snapshot can flow from a legacy provider into a fixed-size + # consumer (and vice versa) because resolve_container_path keys off the + # container path, not the host layout. + + def _patch_legacy_root(self): + """Return a legacy root path under tmpdir and patch CONTAINER_VOLUMES_PATH + to match. Caller must call self._unpatch_legacy_root() to restore.""" + import extensions.business.container_apps.sync.manager as manager_mod + legacy = self.tmpdir / "edge_node" / "_local_cache" / "_data" / "container_volumes" + legacy.mkdir(parents=True, exist_ok=True) + self._manager_mod = manager_mod + self._legacy_orig = manager_mod.CONTAINER_VOLUMES_PATH + manager_mod.CONTAINER_VOLUMES_PATH = str(legacy) + return legacy + + def _unpatch_legacy_root(self): + self._manager_mod.CONTAINER_VOLUMES_PATH = self._legacy_orig + + def test_round_trip_legacy_volumes_only(self): + """Provider + consumer both use legacy VOLUMES at the same container + path. Snapshots round-trip byte-for-byte across the legacy root.""" + legacy = self._patch_legacy_root() + try: + prov_host = legacy / "provider_inst_appdata" + cons_host = legacy / "consumer_inst_appdata" + prov_host.mkdir() + cons_host.mkdir() + (prov_host / "weights.bin").write_bytes(b"legacy-only-payload") + (prov_host / "sub").mkdir() + (prov_host / "sub" / "n.txt").write_text("nested") + self.provider.volumes = {str(prov_host): {"bind": "/app/data", "mode": "rw"}} + self.consumer.volumes = {str(cons_host): {"bind": "/app/data", "mode": "rw"}} + + tar_path, _ = self.sm_p.make_archive(["/app/data/"]) + self.sm_c.extract_archive(tar_path) + + self.assertEqual((cons_host / "weights.bin").read_bytes(), b"legacy-only-payload") + self.assertEqual((cons_host / "sub" / "n.txt").read_text(), "nested") + finally: + self._unpatch_legacy_root() + + def test_round_trip_legacy_to_fixed_size(self): + """Provider legacy, consumer fixed-size at the same container path. 
+ Proves the soft-migration scenario: a new fixed-size node can absorb + state from a legacy node without rebuilding the data on the operator + side. Container path is the routing key — host layout differences + are invisible to the archive.""" + legacy = self._patch_legacy_root() + try: + prov_host = legacy / "provider_inst_appdata" + prov_host.mkdir() + (prov_host / "weights.bin").write_bytes(b"legacy-to-fixed") + self.provider.volumes = {str(prov_host): {"bind": "/app/data", "mode": "rw"}} + # Consumer keeps its default fixed-size mount at /app/data + # (set up by _make_owner — host root under fixed_volumes/mounts/). + + tar_path, _ = self.sm_p.make_archive(["/app/data/"]) + self.sm_c.extract_archive(tar_path) + + cons_host = self.consumer._fixed_root / "appdata" + self.assertEqual((cons_host / "weights.bin").read_bytes(), b"legacy-to-fixed") + finally: + self._unpatch_legacy_root() + + def test_round_trip_fixed_size_to_legacy(self): + """Symmetric of the above: provider fixed-size, consumer legacy. Same + archive, opposite host-layout pairing. Result must be identical — + container path drives the routing on both ends.""" + legacy = self._patch_legacy_root() + try: + cons_host = legacy / "consumer_inst_appdata" + cons_host.mkdir() + # Provider's default fixed-size mount at /app/data is already seeded + # by setUp (foo.bin = b"hello world\x00\xff"). + self.consumer.volumes = {str(cons_host): {"bind": "/app/data", "mode": "rw"}} + + tar_path, _ = self.sm_p.make_archive(["/app/data/"]) + self.sm_c.extract_archive(tar_path) + + self.assertEqual((cons_host / "foo.bin").read_bytes(), b"hello world\x00\xff") + self.assertEqual((cons_host / "subdir" / "nested.txt").read_text(), "nested!") + finally: + self._unpatch_legacy_root() + + def test_extract_aborts_on_member_with_no_consumer_mount(self): + # Build a bespoke tar with a member at /app/missing/ that consumer + # doesn't have a mount for. 
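+    # (The consumer fixture from _make_owner only binds the system volume,
+    # /app/data and /app/legacy, so /app/missing/ has nothing to resolve against.)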
+ import tarfile as _tarfile + bad_tar = self.tmpdir / "bad.tar.gz" + src = self.tmpdir / "src" + src.mkdir() + (src / "x.bin").write_text("x") + with _tarfile.open(str(bad_tar), "w:gz") as tar: + tar.add(str(src / "x.bin"), arcname="/app/missing/x.bin") + + with self.assertRaisesRegex(ValueError, "no mounted volume covers"): + self.sm_c.extract_archive(str(bad_tar)) + # No file was created + self.assertFalse((self.consumer._fixed_root / "appdata" / "x.bin").exists()) + + def test_extract_rejects_member_outside_manifest_archive_paths(self): + bad_tar = self.tmpdir / "outside-manifest.tar.gz" + src = self.tmpdir / "outside-manifest.txt" + src.write_text("outside") + with tarfile.open(str(bad_tar), "w:gz") as tar: + tar.add(str(src), arcname="/app/data/other.txt") + + with self.assertRaisesRegex(ValueError, "outside manifest archive_paths"): + self.sm_c.extract_archive( + str(bad_tar), allowed_archive_paths=["/app/data/declared/"] + ) + + self.assertFalse((self.consumer._fixed_root / "appdata" / "other.txt").exists()) + + def test_extract_skips_symlink_members(self): + import tarfile as _tarfile + sym_tar = self.tmpdir / "sym.tar.gz" + src = self.tmpdir / "sym_src" + src.mkdir() + (src / "real.txt").write_text("real") + link_path = src / "link" + os.symlink("real.txt", str(link_path)) + with _tarfile.open(str(sym_tar), "w:gz") as tar: + tar.add(str(src / "real.txt"), arcname="/app/data/real.txt") + info = tar.gettarinfo(str(link_path), arcname="/app/data/link") + tar.addfile(info) + self.sm_c.extract_archive(str(sym_tar)) + self.assertEqual( + (self.consumer._fixed_root / "appdata" / "real.txt").read_text(), "real" + ) + self.assertFalse((self.consumer._fixed_root / "appdata" / "link").exists()) + + def test_extract_rejects_member_through_symlink_directory(self): + outside = self.tmpdir / "outside" + outside.mkdir() + symlink_dir = self.consumer._fixed_root / "appdata" / "escape" + symlink_dir.symlink_to(outside, target_is_directory=True) + + bad_tar = self.tmpdir / "symlink-dir-escape.tar.gz" + src = self.tmpdir / "escape-src.txt" + src.write_text("escaped") + with tarfile.open(str(bad_tar), "w:gz") as tar: + tar.add(str(src), arcname="/app/data/escape/pwn.txt") + + with self.assertRaisesRegex(ValueError, "escapes volume root"): + self.sm_c.extract_archive(str(bad_tar)) + + self.assertFalse((outside / "pwn.txt").exists()) + + def test_extract_rejects_member_over_symlink_file(self): + outside = self.tmpdir / "outside-file.txt" + outside.write_text("outside") + symlink_file = self.consumer._fixed_root / "appdata" / "link.txt" + symlink_file.symlink_to(outside) + + bad_tar = self.tmpdir / "symlink-file-escape.tar.gz" + src = self.tmpdir / "replacement.txt" + src.write_text("replacement") + with tarfile.open(str(bad_tar), "w:gz") as tar: + tar.add(str(src), arcname="/app/data/link.txt") + + with self.assertRaisesRegex(ValueError, "escapes volume root"): + self.sm_c.extract_archive(str(bad_tar)) + + self.assertTrue(symlink_file.is_symlink()) + self.assertEqual(outside.read_text(), "outside") + + def test_extract_strips_special_mode_bits(self): + mode_tar = self.tmpdir / "special-modes.tar.gz" + with tarfile.open(str(mode_tar), "w:gz") as tar: + dir_info = tarfile.TarInfo(name="/app/data/special") + dir_info.type = tarfile.DIRTYPE + dir_info.mode = 0o7777 + tar.addfile(dir_info) + + content = b"payload" + file_info = tarfile.TarInfo(name="/app/data/special/run.sh") + file_info.size = len(content) + file_info.mode = 0o6755 + tar.addfile(file_info, io.BytesIO(content)) + + 
self.sm_c.extract_archive(str(mode_tar)) + + target_dir = self.consumer._fixed_root / "appdata" / "special" + target_file = target_dir / "run.sh" + self.assertEqual(target_file.read_bytes(), b"payload") + self.assertEqual(os.stat(target_dir).st_mode & 0o7000, 0) + self.assertEqual(os.stat(target_file).st_mode & 0o7000, 0) + + def test_extract_chowns_restored_entries_to_volume_owner(self): + owner_tar = self.tmpdir / "owner.tar.gz" + with tarfile.open(str(owner_tar), "w:gz") as tar: + dir_info = tarfile.TarInfo(name="/app/data/owned") + dir_info.type = tarfile.DIRTYPE + dir_info.mode = 0o755 + tar.addfile(dir_info) + + content = b"payload" + file_info = tarfile.TarInfo(name="/app/data/owned/file.txt") + file_info.size = len(content) + file_info.mode = 0o644 + tar.addfile(file_info, io.BytesIO(content)) + + calls = [] + + def _fake_chown(path, uid, gid): + calls.append((os.path.basename(path), uid, gid)) + + with patch.object(self.sm_c, "_volume_owner", return_value=(1234, 2345)), patch( + "extensions.business.container_apps.sync.manager.os.chown", + side_effect=_fake_chown, + ): + self.sm_c.extract_archive(str(owner_tar)) + + self.assertIn(("owned", 1234, 2345), calls) + self.assertTrue(any(call[1:] == (1234, 2345) for call in calls)) + self.assertEqual( + (self.consumer._fixed_root / "appdata" / "owned" / "file.txt").read_bytes(), + b"payload", + ) + + +# --------------------------------------------------------------------------- +# publish_snapshot +# --------------------------------------------------------------------------- + +class TestPublishSnapshot(unittest.TestCase): + def setUp(self): + self._tmp = tempfile.TemporaryDirectory() + self.tmpdir = Path(self._tmp.name) + self.owner = _make_owner(self.tmpdir) + self.sm = SyncManager(self.owner) + self.vsd = volume_sync_dir(self.owner) + self.vsd.mkdir(parents=True, exist_ok=True) + # Seed the data volume so make_archive can find content + appdata = self.owner._fixed_root / "appdata" + (appdata / "weights.bin").write_bytes(b"weights-content") + # Simulate having claimed a request — leave a .processing file so + # publish_snapshot's clean-up paths can be exercised. 
+ (self.vsd / SYNC_PROCESSING_FILE).write_text( + json.dumps({"archive_paths": ["/app/data/"], "metadata": {}}) + ) + + def tearDown(self): + self._tmp.cleanup() + + def test_happy_path_writes_response_history_and_chainstore(self): + ok = self.sm.publish_snapshot(["/app/data/"], {"epoch": 1}) + self.assertTrue(ok) + + # Response.json + resp = json.loads((self.vsd / "response.json").read_text()) + self.assertEqual(resp["status"], "ok") + self.assertTrue(resp["cid"].startswith("QmFAKE")) + self.assertGreater(resp["archive_size_bytes"], 0) + self.assertTrue(resp["chainstore_ack"]) + + # ChainStore record + self.assertEqual(len(self.owner._cs.hset_calls), 1) + hkey, key, value = self.owner._cs.hset_calls[0] + self.assertEqual(hkey, "CHAINSTORE_SYNC") + self.assertEqual(key, "11111111-1111-1111-1111-111111111111") + self.assertEqual(value["cid"], resp["cid"]) + self.assertEqual(value["manifest"]["archive_paths"], ["/app/data/"]) + self.assertEqual(value["manifest"]["schema_version"], 1) + self.assertEqual(value["manifest"]["archive_format"], "tar.gz") + self.assertEqual(value["manifest"]["runtime"]["provider_capture"], "offline") + self.assertEqual(value["manifest"]["runtime"]["consumer_apply"], "offline_restart") + self.assertEqual(value["metadata"], {"epoch": 1}) + + # History + sent_dir = history_sent_dir(self.owner) + files = list(sent_dir.glob("*.json")) + self.assertEqual(len(files), 1) + entry = json.loads(files[0].read_text()) + self.assertEqual(entry["cid"], resp["cid"]) + self.assertEqual(entry["chainstore_ack"], True) + self.assertEqual(entry["request"]["archive_paths"], ["/app/data/"]) + self.assertIsNone(entry["deletion"]["deleted_at"]) + + # .processing cleaned up + self.assertFalse((self.vsd / "request.json.processing").exists()) + # No .invalid because success + self.assertFalse((self.vsd / "request.json.invalid").exists()) + + def test_online_provider_capture_uses_docker_archive_for_unmounted_path(self): + self.owner.cfg_sync_allow_online_provider_capture = True + self.owner.container = _FakeDockerArchiveContainer({ + "/tmp/generated.txt": _tar_bytes("generated.txt", b"from-container"), + }) + request = SyncRequest( + archive_paths=["/tmp/generated.txt"], + metadata={"epoch": 2}, + runtime=SyncRuntimePolicy(provider_capture=PROVIDER_CAPTURE_ONLINE), + ) + + ok = self.sm.publish_snapshot(request) + + self.assertTrue(ok) + self.assertEqual(self.owner.container.get_archive_calls, ["/tmp/generated.txt"]) + record = self.owner._cs.hset_calls[0][2] + self.assertEqual(record["manifest"]["archive_paths"], ["/tmp/generated.txt"]) + self.assertEqual(record["manifest"]["runtime"]["provider_capture"], "online") + + stored_tar = self.owner._r1fs.added[record["cid"]] + tar_path = self.tmpdir / "online.tar.gz" + tar_path.write_bytes(stored_tar) + with tarfile.open(tar_path, "r:gz") as tar: + member = tar.getmember("tmp/generated.txt") + self.assertEqual(tar.extractfile(member).read(), b"from-container") + + def test_clears_existing_invalid_on_success(self): + (self.vsd / "request.json.invalid").write_text('{"old": true}') + self.sm.publish_snapshot(["/app/data/"], {}) + self.assertFalse((self.vsd / "request.json.invalid").exists()) + + def test_archive_build_failure(self): + self.owner._fixed_root.joinpath("appdata", "weights.bin").unlink() + ok = self.sm.publish_snapshot(["/app/data/missing.bin"], {}) + self.assertFalse(ok) + invalid = json.loads((self.vsd / "request.json.invalid").read_text()) + self.assertEqual(invalid["_error"]["stage"], "archive_build") + resp = json.loads((self.vsd / 
"response.json").read_text()) + self.assertEqual(resp["status"], "error") + self.assertEqual(resp["stage"], "archive_build") + # No history entry written + self.assertEqual(len(list(history_sent_dir(self.owner).glob("*.json"))), 0) + + def test_r1fs_upload_failure(self): + self.owner._r1fs.add_should_raise = RuntimeError("ipfs offline") + ok = self.sm.publish_snapshot(["/app/data/"], {}) + self.assertFalse(ok) + invalid = json.loads((self.vsd / "request.json.invalid").read_text()) + self.assertEqual(invalid["_error"]["stage"], "r1fs_upload") + self.assertIn("ipfs offline", invalid["_error"]["error"]) + self.assertEqual(self.owner._cs.hset_calls, []) + + def test_chainstore_publish_failure(self): + self.owner._cs.hset_should_raise = RuntimeError("peers unreachable") + ok = self.sm.publish_snapshot(["/app/data/"], {}) + self.assertFalse(ok) + invalid = json.loads((self.vsd / "request.json.invalid").read_text()) + self.assertEqual(invalid["_error"]["stage"], "chainstore_publish") + # No history because we failed before append + self.assertEqual(len(list(history_sent_dir(self.owner).glob("*.json"))), 0) + # CID landed in r1fs but was cleaned up before returning failure. + self.assertEqual(len(self.owner._r1fs.added), 0) + self.assertEqual(len(self.owner._r1fs.deleted), 1) + + def test_chainstore_no_ack_fails_and_cleans_uploaded_cid(self): + self.owner._cs.hset_returns = False + + ok = self.sm.publish_snapshot(["/app/data/"], {}) + + self.assertFalse(ok) + invalid = json.loads((self.vsd / "request.json.invalid").read_text()) + self.assertEqual(invalid["_error"]["stage"], "chainstore_publish") + self.assertIn("ack", invalid["_error"]["error"]) + self.assertEqual(len(list(history_sent_dir(self.owner).glob("*.json"))), 0) + self.assertEqual(self.owner._r1fs.added, {}) + self.assertEqual(len(self.owner._r1fs.deleted), 1) + + def test_sent_history_failure_after_chainstore_ack_still_completes_request(self): + with patch.object(self.sm, "append_sent", side_effect=RuntimeError("disk full")): + ok = self.sm.publish_snapshot(["/app/data/"], {"epoch": 7}) + + self.assertTrue(ok) + self.assertEqual(len(self.owner._cs.hset_calls), 1) + resp = json.loads((self.vsd / "response.json").read_text()) + self.assertEqual(resp["status"], "ok") + self.assertIn("disk full", resp["history_error"]) + self.assertFalse((self.vsd / "request.json.processing").exists()) + self.assertFalse((self.vsd / "request.json.invalid").exists()) + + def test_sent_history_failure_skips_prior_cid_retirement(self): + self.sm.publish_snapshot(["/app/data/"], {"epoch": 1}) + first = json.loads(next(history_sent_dir(self.owner).glob("*.json")).read_text()) + + (self.owner._fixed_root / "appdata" / "weights.bin").write_bytes(b"v2") + (self.vsd / SYNC_PROCESSING_FILE).write_text("{}") + with patch.object(self.sm, "append_sent", side_effect=RuntimeError("disk full")): + ok = self.sm.publish_snapshot(["/app/data/"], {"epoch": 2}) + + self.assertTrue(ok) + files = list(history_sent_dir(self.owner).glob("*.json")) + self.assertEqual(len(files), 1) + still_first = json.loads(files[0].read_text()) + self.assertEqual(still_first["cid"], first["cid"]) + self.assertIsNone(still_first["deletion"]["deleted_at"]) + self.assertFalse(self.owner._r1fs.deleted) + + def test_two_snapshots_retire_first_cid(self): + self.sm.publish_snapshot(["/app/data/"], {"epoch": 1}) + # Update content for the second snapshot + (self.owner._fixed_root / "appdata" / "weights.bin").write_bytes(b"v2") + # Re-create .processing because publish_snapshot deleted it + (self.vsd / 
SYNC_PROCESSING_FILE).write_text("{}")
+    self.sm.publish_snapshot(["/app/data/"], {"epoch": 2})
+
+    files = sorted(history_sent_dir(self.owner).glob("*.json"))
+    self.assertEqual(len(files), 2)
+    older = json.loads(files[0].read_text())
+    newer = json.loads(files[1].read_text())
+
+    self.assertTrue(older["deletion"]["deletion_succeeded"])
+    self.assertIsNotNone(older["deletion"]["deleted_at"])
+    self.assertIsNone(older["deletion"]["deletion_error"])
+
+    self.assertIsNone(newer["deletion"]["deleted_at"])
+
+    deleted_cids = [d[0] for d in self.owner._r1fs.deleted]
+    self.assertEqual(deleted_cids, [older["cid"]])
+
+  def test_retire_records_failure(self):
+    self.sm.publish_snapshot(["/app/data/"], {"epoch": 1})
+    (self.owner._fixed_root / "appdata" / "weights.bin").write_bytes(b"v2")
+    (self.vsd / SYNC_PROCESSING_FILE).write_text("{}")
+    self.owner._r1fs.delete_should_raise = RuntimeError("daemon paused")
+
+    self.sm.publish_snapshot(["/app/data/"], {"epoch": 2})
+
+    files = sorted(history_sent_dir(self.owner).glob("*.json"))
+    older = json.loads(files[0].read_text())
+    self.assertIsNone(older["deletion"]["deleted_at"])
+    self.assertFalse(older["deletion"]["deletion_succeeded"])
+    self.assertIn("daemon paused", older["deletion"]["deletion_error"])
+
+  def test_retire_retries_after_failure(self):
+    self.sm.publish_snapshot(["/app/data/"], {"epoch": 1})
+    (self.owner._fixed_root / "appdata" / "weights.bin").write_bytes(b"v2")
+    (self.vsd / SYNC_PROCESSING_FILE).write_text("{}")
+    self.owner._r1fs.delete_should_raise = RuntimeError("daemon paused")
+
+    self.sm.publish_snapshot(["/app/data/"], {"epoch": 2})
+    self.owner._r1fs.delete_should_raise = None
+    self.sm._retire_previous_cid(history_sent_dir(self.owner))
+
+    files = sorted(history_sent_dir(self.owner).glob("*.json"))
+    older = json.loads(files[0].read_text())
+    self.assertIsNotNone(older["deletion"]["deleted_at"])
+    self.assertTrue(older["deletion"]["deletion_succeeded"])
+
+  def test_retire_uses_mtime_not_version(self):
+    """A higher-version entry that was written BEFORE a lower-version entry
+    must be retired when the lower-version one is "latest". Mirrors the
+    contract from ``_latest_in``: the answer to "what did we just do?" is
+    insert-order (mtime), not whatever ``version`` happens to be in the
+    entry. Without this guarantee a clock-skewed provider or multi-provider
+    sync set can cause the just-published CID to be retired on the next
+    publish.
+    """
+    sent_dir = history_sent_dir(self.owner)
+    sent_dir.mkdir(parents=True, exist_ok=True)
+
+    # Older-by-mtime but higher version (would sort last by filename).
+    # Use the canonical filename helper so update_history_deletion can find
+    # the file via its __.json convention.
+ older_path = sent_dir / self.sm._history_filename(100, "QmCID_A") + older_path.write_text(json.dumps({ + "cid": "QmCID_A", "version": 100, + "deletion": {"deleted_at": None, "deletion_succeeded": None, "deletion_error": None}, + })) + os.utime(older_path, (1000, 1000)) + + # Newer-by-mtime but lower version (would sort first by filename) + newer_path = sent_dir / self.sm._history_filename(50, "QmCID_B") + newer_path.write_text(json.dumps({ + "cid": "QmCID_B", "version": 50, + "deletion": {"deleted_at": None, "deletion_succeeded": None, "deletion_error": None}, + })) + os.utime(newer_path, (2000, 2000)) + + self.sm._retire_previous_cid(sent_dir) + + older_after = json.loads(older_path.read_text()) + newer_after = json.loads(newer_path.read_text()) + + # The just-written (newer-by-mtime) entry must be left alone. + self.assertIsNone(newer_after["deletion"]["deleted_at"]) + # The older-by-mtime entry should be retired, even though it has the + # higher version number. + self.assertIsNotNone(older_after["deletion"]["deleted_at"]) + self.assertTrue(older_after["deletion"]["deletion_succeeded"]) + + deleted_cids = [d[0] for d in self.owner._r1fs.deleted] + self.assertEqual(deleted_cids, ["QmCID_A"]) + + def test_archive_tmp_cleaned_up_on_success(self): + self.sm.publish_snapshot(["/app/data/"], {}) + leftovers = list(self.owner._output_folder.glob("sync_archive_*.tar.gz")) + self.assertEqual(leftovers, []) + + def test_archive_tmp_cleaned_up_on_failure(self): + self.owner._cs.hset_should_raise = RuntimeError("boom") + self.sm.publish_snapshot(["/app/data/"], {}) + leftovers = list(self.owner._output_folder.glob("sync_archive_*.tar.gz")) + self.assertEqual(leftovers, []) + + +# --------------------------------------------------------------------------- +# fetch_latest + validate_manifest + apply_snapshot +# --------------------------------------------------------------------------- + +class TestConsumerFlow(unittest.TestCase): + def setUp(self): + self._tmp = tempfile.TemporaryDirectory() + self.tmpdir = Path(self._tmp.name) + # Build provider AND consumer owners that share an r1fs+chainstore so + # we can do a true end-to-end publish→apply round-trip. + shared_r1fs = _FakeR1FS() + shared_cs = _FakeChainStore() + + self.provider = _make_owner(self.tmpdir / "p") + self.consumer = _make_owner(self.tmpdir / "c") + for o in (self.provider, self.consumer): + o.r1fs = shared_r1fs + o._r1fs = shared_r1fs + o.chainstore_hset = shared_cs.hset + o.chainstore_hget = shared_cs.hget + o.chainstore_hsync = shared_cs.hsync + o._cs = shared_cs + self.consumer.cfg_sync_type = "consumer" + + self.sm_p = SyncManager(self.provider) + self.sm_c = SyncManager(self.consumer) + + # Provision provider's volume-sync subdir + seed data + volume_sync_dir(self.provider).mkdir(parents=True, exist_ok=True) + volume_sync_dir(self.consumer).mkdir(parents=True, exist_ok=True) + (self.provider._fixed_root / "appdata" / "weights.bin").write_bytes(b"hello") + + def tearDown(self): + self._tmp.cleanup() + + # ----- validate_manifest -------------------------------------------------- + + def _ok_manifest(self, **overrides): + """Return a minimally-valid manifest dict. 
Tests override fields they + care about; the rest stay sane defaults so we don't have to copy the + schema boilerplate everywhere.""" + manifest = { + "schema_version": 1, + "archive_format": "tar.gz", + "encryption": "r1fs-default", + "archive_paths": ["/app/data/"], + } + manifest.update(overrides) + return {"manifest": manifest} + + def test_validate_manifest_empty_when_aligned(self): + self.assertEqual(self.sm_c.validate_manifest(self._ok_manifest()), []) + + def test_validate_manifest_returns_missing_paths(self): + record = self._ok_manifest(archive_paths=["/app/data/", "/somewhere/else/"]) + reasons = self.sm_c.validate_manifest(record) + self.assertEqual(len(reasons), 1) + self.assertIn("/somewhere/else/", reasons[0]) + self.assertIn("unmapped archive_paths", reasons[0]) + + def test_validate_manifest_rejects_unsupported_schema_version(self): + """A manifest from a future CAR that bumped MANIFEST_SCHEMA_VERSION must + be refused rather than silently applied — schema bumps signal breaking + format changes the current consumer can't safely interpret. Codex + review finding 4 on PR #399.""" + record = self._ok_manifest(schema_version=999) + reasons = self.sm_c.validate_manifest(record) + self.assertEqual(len(reasons), 1) + self.assertIn("schema_version", reasons[0]) + self.assertIn("999", reasons[0]) + + def test_validate_manifest_rejects_missing_schema_version(self): + record = {"manifest": {"archive_format": "tar.gz", "archive_paths": ["/app/data/"]}} + reasons = self.sm_c.validate_manifest(record) + self.assertTrue(any("schema_version" in r for r in reasons)) + + def test_validate_manifest_rejects_non_int_schema_version(self): + record = self._ok_manifest(schema_version="1") + reasons = self.sm_c.validate_manifest(record) + self.assertTrue(any("schema_version" in r for r in reasons)) + + def test_validate_manifest_rejects_unsupported_archive_format(self): + record = self._ok_manifest(archive_format="zip") + reasons = self.sm_c.validate_manifest(record) + self.assertEqual(len(reasons), 1) + self.assertIn("archive_format", reasons[0]) + self.assertIn("zip", reasons[0]) + self.assertIn("tar.gz", reasons[0]) + + def test_validate_manifest_rejects_unsupported_encryption(self): + record = self._ok_manifest(encryption="plaintext") + reasons = self.sm_c.validate_manifest(record) + self.assertEqual(len(reasons), 1) + self.assertIn("encryption", reasons[0]) + self.assertIn("plaintext", reasons[0]) + self.assertIn("r1fs-default", reasons[0]) + + def test_validate_manifest_collects_multiple_violations(self): + """Schema + format + path violations all surface in one pass so the + operator sees the full picture in a single log line.""" + record = self._ok_manifest( + schema_version=999, archive_format="zip", + archive_paths=["/app/data/", "/nope/"], + ) + reasons = self.sm_c.validate_manifest(record) + self.assertEqual(len(reasons), 3) + joined = "; ".join(reasons) + self.assertIn("schema_version", joined) + self.assertIn("archive_format", joined) + self.assertIn("/nope/", joined) + + def test_validate_manifest_handles_no_manifest(self): + # An empty record / empty manifest is non-conformant (missing required + # schema_version + archive_format), so it must be rejected. 
+ self.assertNotEqual(self.sm_c.validate_manifest({}), []) + self.assertNotEqual(self.sm_c.validate_manifest({"manifest": {}}), []) + + def test_validate_manifest_rejects_non_dict_manifest(self): + reasons = self.sm_c.validate_manifest({"manifest": "not-an-object"}) + + self.assertEqual(reasons, ["manifest must be a JSON object"]) + + def test_validate_manifest_rejects_missing_archive_paths(self): + record = self._ok_manifest() + del record["manifest"]["archive_paths"] + reasons = self.sm_c.validate_manifest(record) + self.assertTrue(any("archive_paths" in r for r in reasons)) + + def test_validate_manifest_rejects_empty_archive_paths(self): + reasons = self.sm_c.validate_manifest(self._ok_manifest(archive_paths=[])) + self.assertTrue(any("non-empty list" in r for r in reasons)) + + def test_validate_manifest_rejects_non_list_archive_paths(self): + reasons = self.sm_c.validate_manifest(self._ok_manifest(archive_paths="/app/data/")) + self.assertTrue(any("non-empty list" in r for r in reasons)) + + def test_validate_manifest_rejects_non_string_archive_path_entries(self): + reasons = self.sm_c.validate_manifest(self._ok_manifest(archive_paths=["/app/data/", 7])) + self.assertTrue(any("invalid archive_paths" in r for r in reasons)) + + def test_validate_manifest_rejects_non_dict(self): + self.assertEqual(self.sm_c.validate_manifest(None), ["manifest record is not a dict"]) + self.assertEqual(self.sm_c.validate_manifest("string"), ["manifest record is not a dict"]) + + def test_validate_record_rejects_missing_envelope_fields(self): + reasons = self.sm_c.validate_record_for_apply({ + "cid": "", + "version": "1", + "manifest": self._ok_manifest()["manifest"], + }) + + joined = "; ".join(reasons) + self.assertIn("cid", joined) + self.assertIn("version", joined) + + # ----- fetch_latest ------------------------------------------------------- + + def test_fetch_latest_empty_returns_none(self): + self.assertIsNone(self.sm_c.fetch_latest()) + # hsync was still called + self.assertEqual(self.consumer._cs.hsync_calls, ["CHAINSTORE_SYNC"]) + + def test_fetch_latest_after_publish_returns_record(self): + (self.provider.__dict__["_fixed_root"] / "appdata" / "weights.bin").write_bytes(b"x") + (volume_sync_dir(self.provider) / SYNC_PROCESSING_FILE).write_text("{}") + self.sm_p.publish_snapshot(["/app/data/"], {"epoch": 5}) + record = self.sm_c.fetch_latest() + self.assertIsNotNone(record) + self.assertEqual(record["metadata"], {"epoch": 5}) + + def test_fetch_latest_no_sync_key_returns_none(self): + self.consumer.cfg_sync_key = None + self.assertIsNone(self.sm_c.fetch_latest()) + + def test_hsync_gated_by_interval_skips_within_window(self): + """The expensive chainstore_hsync is rate-limited; a second fetch_latest + inside the configured HSYNC_POLL_INTERVAL window only does the cheap + local hget, leaving hsync_calls at one entry.""" + self.consumer.cfg_sync_hsync_poll_interval = 60.0 + self.sm_c.fetch_latest() + self.sm_c.fetch_latest() # ~1s later (mock clock increments per time() call) + self.assertEqual(self.consumer._cs.hsync_calls, ["CHAINSTORE_SYNC"]) + + def test_hsync_fires_again_after_interval_elapses(self): + """Once HSYNC_POLL_INTERVAL has elapsed since the last hsync, the next + fetch_latest does a fresh network round-trip.""" + self.consumer.cfg_sync_hsync_poll_interval = 60.0 + self.sm_c.fetch_latest() + # Back-date the last-hsync stamp so the next call falls outside the + # window without having to actually wait 60s. 
+ self.sm_c._last_hsync = self.sm_c._last_hsync - 70.0 + self.sm_c.fetch_latest() + self.assertEqual(self.consumer._cs.hsync_calls, ["CHAINSTORE_SYNC", "CHAINSTORE_SYNC"]) + + def test_hsync_failure_retries_before_full_success_interval(self): + """A timing-out / failing hsync should not suppress retries for the full + success interval. It still avoids retrying on the immediate next tick, but + becomes eligible again after the shorter failure retry window.""" + self.consumer.cfg_sync_hsync_poll_interval = 60.0 + self.consumer._cs.hsync_should_raise = RuntimeError("offline") + self.sm_c.fetch_latest() # hsync raises (caught), retry after 30s + self.sm_c.fetch_latest() # immediate next tick -> still skipped + self.assertEqual(self.consumer._cs.hsync_calls, ["CHAINSTORE_SYNC"]) + self.sm_c._last_hsync = self.sm_c._last_hsync - 31.0 + self.sm_c.fetch_latest() + self.assertEqual(self.consumer._cs.hsync_calls, ["CHAINSTORE_SYNC", "CHAINSTORE_SYNC"]) + + # ----- apply_snapshot ----------------------------------------------------- + + def test_apply_round_trip(self): + (volume_sync_dir(self.provider) / SYNC_PROCESSING_FILE).write_text("{}") + self.sm_p.publish_snapshot(["/app/data/"], {"epoch": 9}) + + record = self.sm_c.fetch_latest() + ok = self.sm_c.apply_snapshot(record) + self.assertTrue(ok) + + # File extracted + target = self.consumer._fixed_root / "appdata" / "weights.bin" + self.assertEqual(target.read_bytes(), b"hello") + + # last_apply.json written + la = json.loads((volume_sync_dir(self.consumer) / "last_apply.json").read_text()) + self.assertEqual(la["cid"], record["cid"]) + self.assertEqual(la["version"], record["version"]) + self.assertIn("applied_timestamp", la) + + # Host-private apply state is the durable dedupe source. + state = json.loads((sync_state_dir(self.consumer) / "current_apply.json").read_text()) + self.assertEqual(state["state"], "applied") + self.assertEqual(state["cid"], record["cid"]) + + # History entry + files = list(history_received_dir(self.consumer).glob("*.json")) + self.assertEqual(len(files), 1) + entry = json.loads(files[0].read_text()) + self.assertEqual(entry["cid"], record["cid"]) + # tarfile strips trailing slashes on directory members; the consumer + # re-prepends the leading slash on extract, so directory entries land + # without their trailing slash. + self.assertEqual(entry["extracted_paths"], ["/app/data", "/app/data/weights.bin"]) + self.assertIsNone(entry["deletion"]["deleted_at"]) + + def test_apply_skips_when_misaligned(self): + # Provider includes a path consumer doesn't have a mount for. + # We can't legitimately publish such a record (provider would also reject + # it), so build it manually and stuff into chainstore. + self.consumer._cs.store[("CHAINSTORE_SYNC", self.consumer.cfg_sync_key)] = { + "cid": "QmFAKE99999999", + "version": 9999999999, + "timestamp": 1234.0, + "node_id": "ee_someone", + "metadata": {}, + "manifest": { + "schema_version": 1, + "archive_paths": ["/app/data/", "/foo/bar/"], + "archive_format": "tar.gz", + "encryption": "r1fs-default", + "archive_size_bytes": 100, + }, + } + record = self.sm_c.fetch_latest() + ok = self.sm_c.apply_snapshot(record) + self.assertFalse(ok) + # No last_apply, no history advance + self.assertFalse((volume_sync_dir(self.consumer) / "last_apply.json").exists()) + self.assertEqual(len(list(history_received_dir(self.consumer).glob("*.json"))), 0) + # Useful error message — should name the path that couldn't be mapped. 
+ self.assertTrue(any("unmapped archive_paths" in m for m in self.consumer._msgs)) + self.assertTrue(any("/foo/bar/" in m for m in self.consumer._msgs)) + + def test_apply_rejects_non_dict_manifest_without_raising(self): + record = { + "cid": "QmFAKE_BAD_MANIFEST", + "version": 123, + "timestamp": 1234.0, + "node_id": "ee_someone", + "metadata": {}, + "manifest": "not-an-object", + } + + ok = self.sm_c.apply_snapshot(record) + + self.assertFalse(ok) + self.assertFalse((volume_sync_dir(self.consumer) / "last_apply.json").exists()) + self.assertTrue(any("manifest must be a JSON object" in m for m in self.consumer._msgs)) + + def test_apply_rejects_tar_member_outside_manifest_archive_paths(self): + cid = "QmOUTSIDE_MANIFEST" + bad_tar = self.tmpdir / "outside-manifest-apply.tar.gz" + src = self.tmpdir / "outside-apply.txt" + src.write_text("outside") + with tarfile.open(str(bad_tar), "w:gz") as tar: + tar.add(str(src), arcname="/app/data/other.txt") + self.consumer._r1fs.added[cid] = bad_tar.read_bytes() + + record = { + "cid": cid, + "version": 123, + "timestamp": 1.0, + "node_id": "ee_provider", + "metadata": {}, + "manifest": { + "schema_version": 1, + "archive_paths": ["/app/data/declared/"], + "archive_format": "tar.gz", + "encryption": "r1fs-default", + }, + } + + ok = self.sm_c.apply_snapshot(record) + + self.assertFalse(ok) + self.assertFalse((volume_sync_dir(self.consumer) / "last_apply.json").exists()) + self.assertFalse((self.consumer._fixed_root / "appdata" / "other.txt").exists()) + self.assertTrue( + any("outside manifest archive_paths" in m for m in self.consumer._msgs) + ) + + def test_apply_aborts_on_r1fs_get_failure(self): + (volume_sync_dir(self.provider) / SYNC_PROCESSING_FILE).write_text("{}") + self.sm_p.publish_snapshot(["/app/data/"], {}) + record = self.sm_c.fetch_latest() + self.consumer._r1fs.get_should_raise = RuntimeError("network down") + ok = self.sm_c.apply_snapshot(record) + self.assertFalse(ok) + self.assertFalse((volume_sync_dir(self.consumer) / "last_apply.json").exists()) + self.assertIsNotNone(self.sm_c.quarantined_record(record)) + + def test_apply_success_dedupes_from_state_when_history_append_fails(self): + (volume_sync_dir(self.provider) / SYNC_PROCESSING_FILE).write_text("{}") + self.sm_p.publish_snapshot(["/app/data/"], {}) + record = self.sm_c.fetch_latest() + + with patch.object(self.sm_c, "append_received", side_effect=RuntimeError("disk full")): + ok = self.sm_c.apply_snapshot(record) + + self.assertTrue(ok) + self.assertEqual(len(list(history_received_dir(self.consumer).glob("*.json"))), 0) + latest = self.sm_c.latest_applied() + self.assertEqual(latest["state"], "applied") + self.assertEqual(latest["cid"], record["cid"]) + + def test_commit_prepared_apply_rolls_back_touched_files_on_failure(self): + target = self.consumer._fixed_root / "appdata" / "weights.bin" + target.write_bytes(b"old") + second = self.consumer._fixed_root / "appdata" / "second.bin" + + tar_path = self.tmpdir / "rollback.tar.gz" + src1 = self.tmpdir / "new-weights.bin" + src2 = self.tmpdir / "second.bin" + src1.write_bytes(b"new") + src2.write_bytes(b"second") + with tarfile.open(str(tar_path), "w:gz") as tar: + tar.add(str(src1), arcname="/app/data/weights.bin") + tar.add(str(src2), arcname="/app/data/second.bin") + + cid = "QmROLLBACK" + self.consumer._r1fs.added[cid] = tar_path.read_bytes() + record = { + "cid": cid, + "version": 123, + "timestamp": 1.0, + "node_id": "ee_provider", + "metadata": {}, + "manifest": { + "schema_version": 1, + "archive_paths": 
["/app/data/"], + "archive_format": "tar.gz", + "encryption": "r1fs-default", + }, + } + prepared = self.sm_c.prepare_apply(record) + self.assertIsNotNone(prepared) + + with patch.object( + self.sm_c, + "_safe_extract_mode", + side_effect=[0o644, RuntimeError("chmod failed")], + ): + result = self.sm_c.commit_prepared_apply(prepared) + + self.assertFalse(result.success) + self.assertTrue(result.restart_safe) + self.assertEqual(result.state, "failed_rolled_back") + self.assertEqual(target.read_bytes(), b"old") + self.assertFalse(second.exists()) + state = json.loads((sync_state_dir(self.consumer) / "current_apply.json").read_text()) + self.assertEqual(state["state"], "failed_rolled_back") + + def test_commit_prepared_apply_restores_directory_metadata_on_failure(self): + existing = self.consumer._fixed_root / "appdata" / "existing" + existing.mkdir() + os.chmod(existing, 0o700) + before_mode = stat.S_IMODE(os.stat(existing).st_mode) + + tar_path = self.tmpdir / "rollback-dir-metadata.tar.gz" + src = self.tmpdir / "new.bin" + src.write_bytes(b"new") + with tarfile.open(str(tar_path), "w:gz") as tar: + info = tarfile.TarInfo(name="/app/data/existing") + info.type = tarfile.DIRTYPE + info.mode = 0o755 + tar.addfile(info) + tar.add(str(src), arcname="/app/data/new.bin") + + cid = "QmDIRMETAROLLBACK" + self.consumer._r1fs.added[cid] = tar_path.read_bytes() + record = { + "cid": cid, + "version": 123, + "timestamp": 1.0, + "node_id": "ee_provider", + "metadata": {}, + "manifest": { + "schema_version": 1, + "archive_paths": ["/app/data/"], + "archive_format": "tar.gz", + "encryption": "r1fs-default", + }, + } + prepared = self.sm_c.prepare_apply(record) + self.assertIsNotNone(prepared) + + with patch.object( + self.sm_c, + "_safe_extract_mode", + side_effect=[0o755, RuntimeError("forced later failure")], + ): + result = self.sm_c.commit_prepared_apply(prepared) + + self.assertFalse(result.success) + self.assertTrue(result.restart_safe) + self.assertEqual(result.state, "failed_rolled_back") + self.assertEqual(stat.S_IMODE(os.stat(existing).st_mode), before_mode) + self.assertFalse((self.consumer._fixed_root / "appdata" / "new.bin").exists()) + state = json.loads((sync_state_dir(self.consumer) / "current_apply.json").read_text()) + self.assertEqual(state["state"], "failed_rolled_back") + + def test_commit_prepared_apply_removes_created_parent_dirs_on_failure(self): + new_root = self.consumer._fixed_root / "appdata" / "new" + child = new_root / "child" + target = child / "file.bin" + + tar_path = self.tmpdir / "rollback-created-parents.tar.gz" + src = self.tmpdir / "file.bin" + src.write_bytes(b"new") + with tarfile.open(str(tar_path), "w:gz") as tar: + tar.add(str(src), arcname="/app/data/new/child/file.bin") + + cid = "QmCREATEDPARENTS" + self.consumer._r1fs.added[cid] = tar_path.read_bytes() + record = { + "cid": cid, + "version": 123, + "timestamp": 1.0, + "node_id": "ee_provider", + "metadata": {}, + "manifest": { + "schema_version": 1, + "archive_paths": ["/app/data/"], + "archive_format": "tar.gz", + "encryption": "r1fs-default", + }, + } + prepared = self.sm_c.prepare_apply(record) + self.assertIsNotNone(prepared) + + with patch.object( + self.sm_c, + "_safe_extract_mode", + side_effect=RuntimeError("forced file failure"), + ): + result = self.sm_c.commit_prepared_apply(prepared) + + self.assertFalse(result.success) + self.assertTrue(result.restart_safe) + self.assertEqual(result.state, "failed_rolled_back") + self.assertFalse(target.exists()) + self.assertFalse(child.exists()) + 
self.assertFalse(new_root.exists()) + + def test_commit_prepared_apply_reports_uncertain_when_dir_metadata_rollback_fails(self): + existing = self.consumer._fixed_root / "appdata" / "existing" + existing.mkdir() + os.chmod(existing, 0o700) + + tar_path = self.tmpdir / "rollback-dir-metadata-fails.tar.gz" + src = self.tmpdir / "new.bin" + src.write_bytes(b"new") + with tarfile.open(str(tar_path), "w:gz") as tar: + info = tarfile.TarInfo(name="/app/data/existing") + info.type = tarfile.DIRTYPE + info.mode = 0o755 + tar.addfile(info) + tar.add(str(src), arcname="/app/data/new.bin") + + cid = "QmDIRMETAUNCERTAIN" + self.consumer._r1fs.added[cid] = tar_path.read_bytes() + record = { + "cid": cid, + "version": 123, + "timestamp": 1.0, + "node_id": "ee_provider", + "metadata": {}, + "manifest": { + "schema_version": 1, + "archive_paths": ["/app/data/"], + "archive_format": "tar.gz", + "encryption": "r1fs-default", + }, + } + prepared = self.sm_c.prepare_apply(record) + self.assertIsNotNone(prepared) + + import extensions.business.container_apps.sync.manager as manager_mod + original_chmod = manager_mod.os.chmod + + def _fail_restoring_existing_dir(path, mode): + if os.path.normpath(path) == os.path.normpath(existing) and mode == 0o700: + raise OSError("restore chmod failed") + return original_chmod(path, mode) + + with patch.object( + self.sm_c, + "_safe_extract_mode", + side_effect=[0o755, RuntimeError("forced later failure")], + ), patch( + "extensions.business.container_apps.sync.manager.os.chmod", + side_effect=_fail_restoring_existing_dir, + ): + result = self.sm_c.commit_prepared_apply(prepared) + + self.assertFalse(result.success) + self.assertFalse(result.restart_safe) + self.assertEqual(result.state, "uncertain") + state = json.loads((sync_state_dir(self.consumer) / "current_apply.json").read_text()) + self.assertEqual(state["state"], "uncertain") + + def test_apply_rejects_symlink_escape_without_advancing_state(self): + outside = self.tmpdir / "outside" + outside.mkdir() + symlink_dir = self.consumer._fixed_root / "appdata" / "escape" + symlink_dir.symlink_to(outside, target_is_directory=True) + + bad_tar = self.tmpdir / "bad-apply.tar.gz" + src = self.tmpdir / "bad-apply-src.txt" + src.write_text("escaped") + with tarfile.open(str(bad_tar), "w:gz") as tar: + tar.add(str(src), arcname="/app/data/escape/pwn.txt") + + cid = "QmBADSYMLINKESCAPE" + self.consumer._r1fs.added[cid] = bad_tar.read_bytes() + record = { + "cid": cid, + "version": 123, + "timestamp": 456.0, + "node_id": "ee_bad", + "metadata": {}, + "manifest": { + "schema_version": 1, + "archive_paths": ["/app/data/"], + "archive_format": "tar.gz", + "encryption": "r1fs-default", + "archive_size_bytes": bad_tar.stat().st_size, + }, + } + + ok = self.sm_c.apply_snapshot(record) + + self.assertFalse(ok) + self.assertFalse((outside / "pwn.txt").exists()) + self.assertFalse((volume_sync_dir(self.consumer) / "last_apply.json").exists()) + self.assertEqual(len(list(history_received_dir(self.consumer).glob("*.json"))), 0) + + def test_apply_two_snapshots_retires_first(self): + # First publish + apply + (volume_sync_dir(self.provider) / SYNC_PROCESSING_FILE).write_text("{}") + self.sm_p.publish_snapshot(["/app/data/"], {"v": 1}) + rec1 = self.sm_c.fetch_latest() + self.sm_c.apply_snapshot(rec1) + # Second publish + apply + (self.provider._fixed_root / "appdata" / "weights.bin").write_bytes(b"v2") + (volume_sync_dir(self.provider) / SYNC_PROCESSING_FILE).write_text("{}") + self.sm_p.publish_snapshot(["/app/data/"], {"v": 2}) + rec2 = 
self.sm_c.fetch_latest() + self.sm_c.apply_snapshot(rec2) + + files = sorted(history_received_dir(self.consumer).glob("*.json")) + self.assertEqual(len(files), 2) + older = json.loads(files[0].read_text()) + newer = json.loads(files[1].read_text()) + self.assertTrue(older["deletion"]["deletion_succeeded"]) + self.assertIsNone(newer["deletion"]["deleted_at"]) + # Consumer-side delete used cleanup_local_files=True + deleted = self.consumer._r1fs.deleted + self.assertTrue(any(cid == older["cid"] and cleanup + for (cid, _, cleanup) in deleted)) + self.assertTrue(any(cid == older["cid"] and not unpin_remote + for (cid, unpin_remote, _) in deleted)) + + +if __name__ == "__main__": + unittest.main() diff --git a/extensions/business/container_apps/tests/test_sync_mixin.py b/extensions/business/container_apps/tests/test_sync_mixin.py new file mode 100644 index 00000000..ed911c3c --- /dev/null +++ b/extensions/business/container_apps/tests/test_sync_mixin.py @@ -0,0 +1,744 @@ +"""Unit tests for ``_SyncMixin`` stand-alone methods. + +Covers env-var injection, config helpers, stale .processing recovery, and +the provider/consumer ticks driven against a fake plugin that records +stop_container/start_container call ordering. The mixin's +_configure_system_volume() is intentionally NOT tested here because it +shells out to losetup/mount which require root + a real loopback environment; +that path is exercised by the e2e scenarios (volume_sync/02_mount_persistence_sanity +and the rest). +""" + +import json +import os +import tempfile +import unittest +from pathlib import Path +from unittest.mock import patch + +from extensions.business.container_apps.sync import ( + SYSTEM_VOLUME_MOUNT, + SYSTEM_VOLUME_NAME, + SyncManager, + _SyncMixin, + history_received_dir, + volume_sync_dir, +) +from extensions.business.container_apps.sync.manager import ApplyResult +from extensions.business.container_apps.tests.test_sync_manager import ( + _FakeDockerArchiveContainer, + _FakeChainStore, + _FakeR1FS, + _make_owner, + _tar_bytes, +) + + +class _FakePlugin(_SyncMixin): + """A minimal fake plugin that mixes in _SyncMixin and records lifecycle calls.""" + + def __init__(self, owner_ns): + self._delegate = owner_ns + self.stop_calls = 0 + self.start_calls = 0 + self.runtime_stop_calls = 0 + self.fixed_volume_cleanup_calls = 0 + self.stop_result = True + self.lifecycle_log: list[str] = [] + # Mirror SyncManager-required attributes onto self by attribute lookup. + # We simply use __getattr__ to forward. + + def __getattr__(self, name): + return getattr(self._delegate, name) + + # Plugin lifecycle stubs (logged + counted) + def stop_container(self): + self.stop_calls += 1 + self.lifecycle_log.append("stop") + return self.stop_result + + def _stop_container_runtime_for_restart(self): + self.runtime_stop_calls += 1 + return self.stop_container() + + def _cleanup_fixed_size_volumes(self): + self.fixed_volume_cleanup_calls += 1 + + def start_container(self): + self.start_calls += 1 + self.lifecycle_log.append("start") + + def _reset_runtime_state_post_start(self): + """Mirror the real plugin's helper so sync-tick tests can observe both + the call order and the resulting state-marker resets. + """ + self.lifecycle_log.append("reset") + # Same resets the real container_app_runner._reset_runtime_state_post_start + # performs. Log stream / build-and-run hooks are no-ops in this fake. 
+ self.container_start_time = self.time() + self._app_ready = False + self._health_probe_start = None + self._tunnel_start_allowed = False + self._commands_started = False + + # Mark-as-mutable env so the mixin's _inject_sync_env_vars can write. + @property + def env(self): + return self._delegate.__dict__.setdefault("env", {}) + + +def _make_plugin(tmpdir, *, role="provider", enabled=True, key="SYNC-KEY-1"): + owner = _make_owner(tmpdir) + owner.cfg_sync = { + "ENABLED": enabled, + "KEY": key, + "TYPE": role, + "POLL_INTERVAL": 1, + } + owner.cfg_sync_type = role + owner.cfg_sync_key = key + plugin = _FakePlugin(owner) + # Make sure the volume-sync directory exists for tests that don't go through + # _configure_system_volume. + volume_sync_dir(plugin).mkdir(parents=True, exist_ok=True) + return plugin, owner + + +# --------------------------------------------------------------------------- +# Config helpers +# --------------------------------------------------------------------------- + +class TestConfigHelpers(unittest.TestCase): + def setUp(self): + self._tmp = tempfile.TemporaryDirectory() + self.tmpdir = Path(self._tmp.name) + + def tearDown(self): + self._tmp.cleanup() + + def test_disabled(self): + plugin, _ = _make_plugin(self.tmpdir, enabled=False) + self.assertFalse(plugin._sync_enabled()) + self.assertIsNone(plugin._ensure_sync_manager()) + + def test_enabled_provider(self): + plugin, _ = _make_plugin(self.tmpdir, role="provider") + self.assertTrue(plugin._sync_enabled()) + self.assertEqual(plugin._sync_role(), "provider") + self.assertIsInstance(plugin._ensure_sync_manager(), SyncManager) + # Lazy-init returns the same instance. + self.assertIs(plugin._ensure_sync_manager(), plugin._sync_manager) + + def test_invalid_role(self): + plugin, _ = _make_plugin(self.tmpdir, role="bogus") + self.assertIsNone(plugin._sync_role()) + + def test_poll_interval_floor(self): + plugin, owner = _make_plugin(self.tmpdir) + owner.cfg_sync["POLL_INTERVAL"] = 0 + self.assertEqual(plugin._sync_poll_interval(), 1.0) + + def test_poll_interval_invalid_falls_back(self): + plugin, owner = _make_plugin(self.tmpdir) + owner.cfg_sync["POLL_INTERVAL"] = "nope" + self.assertEqual(plugin._sync_poll_interval(), 10.0) + + def test_hsync_poll_interval_default(self): + """When SYNC.HSYNC_POLL_INTERVAL is unset, ``_hsync_poll_interval`` + returns the 60s default so consumers don't go to the network for fresh + chain replicas more than once per default window. + """ + plugin, owner = _make_plugin(self.tmpdir) + # Make sure the field really is absent on the test fixture. + owner.cfg_sync.pop("HSYNC_POLL_INTERVAL", None) + self.assertEqual(plugin._hsync_poll_interval(), 60.0) + # Same value surfaces via the SyncManager-facing property. 
+ self.assertEqual(plugin.cfg_sync_hsync_poll_interval, 60.0) + + def test_hsync_poll_interval_floor(self): + """Values below the 10s minimum are clamped up — the floor protects + the cluster from operators who set the knob aggressively low without + realising the network cost.""" + plugin, owner = _make_plugin(self.tmpdir) + owner.cfg_sync["HSYNC_POLL_INTERVAL"] = 1 + self.assertEqual(plugin._hsync_poll_interval(), 10.0) + + def test_hsync_poll_interval_invalid_falls_back(self): + """Non-numeric values fall back to the default (not the floor) — same + pattern as ``_sync_poll_interval`` so misconfiguration is forgiving + but conservative.""" + plugin, owner = _make_plugin(self.tmpdir) + owner.cfg_sync["HSYNC_POLL_INTERVAL"] = "nope" + self.assertEqual(plugin._hsync_poll_interval(), 60.0) + + def test_online_provider_capture_string_false_is_false(self): + plugin, owner = _make_plugin(self.tmpdir) + owner.cfg_sync["ALLOW_ONLINE_PROVIDER_CAPTURE"] = "false" + self.assertFalse(plugin.cfg_sync_allow_online_provider_capture) + + def test_online_provider_capture_string_true_is_true(self): + plugin, owner = _make_plugin(self.tmpdir) + owner.cfg_sync["ALLOW_ONLINE_PROVIDER_CAPTURE"] = "true" + self.assertTrue(plugin.cfg_sync_allow_online_provider_capture) + + +# --------------------------------------------------------------------------- +# Env-var injection +# --------------------------------------------------------------------------- + +class TestEnvInjection(unittest.TestCase): + def setUp(self): + self._tmp = tempfile.TemporaryDirectory() + self.tmpdir = Path(self._tmp.name) + + def tearDown(self): + self._tmp.cleanup() + + def test_always_on_keys_present(self): + plugin, _ = _make_plugin(self.tmpdir, enabled=False) + plugin._inject_sync_env_vars() + self.assertEqual(plugin.env["R1_SYSTEM_VOLUME"], "/r1en_system") + self.assertEqual(plugin.env["R1_VOLUME_SYNC_DIR"], "/r1en_system/volume-sync") + self.assertEqual( + plugin.env["R1_SYNC_REQUEST_FILE"], "/r1en_system/volume-sync/request.json" + ) + # Role/key keys not set when SYNC disabled. + self.assertNotIn("R1_SYNC_TYPE", plugin.env) + self.assertNotIn("R1_SYNC_KEY", plugin.env) + + def test_role_and_key_set_when_enabled(self): + plugin, _ = _make_plugin(self.tmpdir, role="consumer", key="abc-123") + plugin._inject_sync_env_vars() + self.assertEqual(plugin.env["R1_SYNC_TYPE"], "consumer") + self.assertEqual(plugin.env["R1_SYNC_KEY"], "abc-123") + + def test_no_env_when_sync_unavailable(self): + """If _configure_system_volume set _sync_unavailable (host tools missing), + _inject_sync_env_vars must not advertise R1_SYSTEM_VOLUME or any other + R1_* key — the mount doesn't exist on the host, so the app would write + into a phantom path while CAR polled a host root that was never + provisioned. 
Codex review finding 5 on PR #399.""" + plugin, _ = _make_plugin(self.tmpdir, role="provider", key="abc-123") + plugin._sync_unavailable = True + plugin._inject_sync_env_vars() + for k in ("R1_SYSTEM_VOLUME", "R1_VOLUME_SYNC_DIR", "R1_SYNC_REQUEST_FILE", + "R1_SYNC_TYPE", "R1_SYNC_KEY"): + self.assertNotIn(k, plugin.env) + + def test_sync_disabled_when_unavailable(self): + """_sync_enabled() must return False when _sync_unavailable is set, even + with SYNC.ENABLED=True in config — provider/consumer ticks would + otherwise poll a host root that doesn't exist.""" + plugin, _ = _make_plugin(self.tmpdir, role="provider", enabled=True) + self.assertTrue(plugin._sync_enabled()) # baseline + plugin._sync_unavailable = True + self.assertFalse(plugin._sync_enabled()) + + def test_successful_system_volume_config_clears_sync_unavailable(self): + plugin, _ = _make_plugin(self.tmpdir, role="provider", enabled=True) + plugin._sync_unavailable = True + + with patch( + "extensions.business.container_apps.sync.mixin.fixed_volume._require_tools" + ), patch( + "extensions.business.container_apps.sync.mixin.fixed_volume.provision", + side_effect=lambda vol, **_kwargs: vol, + ), patch( + "extensions.business.container_apps.sync.mixin.os.chown", + ): + plugin._configure_system_volume() + + self.assertFalse(plugin._sync_unavailable) + self.assertIn(SYSTEM_VOLUME_MOUNT, [spec["bind"] for spec in plugin.volumes.values()]) + self.assertEqual(os.stat(volume_sync_dir(plugin).parent).st_mode & 0o777, 0o755) + self.assertEqual(os.stat(volume_sync_dir(plugin)).st_mode & 0o777, 0o777) + self.assertEqual(os.stat(volume_sync_dir(plugin)).st_mode & 0o1000, 0o1000) + + def test_system_volume_config_recreates_symlinked_volume_sync_dir(self): + plugin, _ = _make_plugin(self.tmpdir, role="provider", enabled=True) + vsd = volume_sync_dir(plugin) + vsd.rmdir() + outside = self.tmpdir / "outside-control" + outside.mkdir() + os.symlink(str(outside), str(vsd)) + + with patch( + "extensions.business.container_apps.sync.mixin.fixed_volume._require_tools" + ), patch( + "extensions.business.container_apps.sync.mixin.fixed_volume.provision", + side_effect=lambda vol, **_kwargs: vol, + ), patch( + "extensions.business.container_apps.sync.mixin.os.chown", + ): + plugin._configure_system_volume() + + self.assertFalse(plugin._sync_unavailable) + self.assertTrue(vsd.is_dir()) + self.assertFalse(vsd.is_symlink()) + self.assertEqual(os.stat(vsd.parent).st_mode & 0o777, 0o755) + self.assertEqual(os.stat(vsd).st_mode & 0o1000, 0o1000) + + def test_system_volume_ignores_image_owner_and_enforces_root_ownership(self): + plugin, _ = _make_plugin(self.tmpdir, role="provider", enabled=True) + plugin._resolve_image_owner = lambda: (1000, 1000) + seen = {} + + def _provision(vol, **_kwargs): + seen["vol"] = vol + return vol + + with patch( + "extensions.business.container_apps.sync.mixin.fixed_volume._require_tools" + ), patch( + "extensions.business.container_apps.sync.mixin.fixed_volume.provision", + side_effect=_provision, + ), patch( + "extensions.business.container_apps.sync.mixin.os.chown" + ) as chown: + plugin._configure_system_volume() + + self.assertIsNone(seen["vol"].owner_uid) + self.assertIsNone(seen["vol"].owner_gid) + chown.assert_any_call(str(volume_sync_dir(plugin).parent), 0, 0) + chown.assert_any_call(str(volume_sync_dir(plugin)), 0, 0) + self.assertFalse(plugin._sync_unavailable) + + +# --------------------------------------------------------------------------- +# Stale .processing recovery +# 
--------------------------------------------------------------------------- + +class TestRecoverStaleProcessing(unittest.TestCase): + def setUp(self): + self._tmp = tempfile.TemporaryDirectory() + self.tmpdir = Path(self._tmp.name) + self.plugin, self.owner = _make_plugin(self.tmpdir) + self.vsd = volume_sync_dir(self.plugin) + + def tearDown(self): + self._tmp.cleanup() + + def test_no_op_when_no_processing(self): + self.plugin._recover_stale_processing() # should not raise + self.assertFalse((self.vsd / "request.json").exists()) + + def test_renames_processing_back(self): + (self.vsd / "request.json.processing").write_text('{"archive_paths":["/app/data/"]}') + self.plugin._recover_stale_processing() + self.assertFalse((self.vsd / "request.json.processing").exists()) + self.assertTrue((self.vsd / "request.json").is_file()) + + def test_keeps_existing_request_intact(self): + # If both exist (rare crash race), don't overwrite the in-flight request. + (self.vsd / "request.json").write_text('{"archive_paths":["/app/data/"]}') + (self.vsd / "request.json.processing").write_text('{"archive_paths":["/old/"]}') + self.plugin._recover_stale_processing() + # .processing untouched, request.json preserved. + self.assertTrue((self.vsd / "request.json.processing").exists()) + self.assertEqual( + json.loads((self.vsd / "request.json").read_text())["archive_paths"], + ["/app/data/"], + ) + + +# --------------------------------------------------------------------------- +# Provider tick +# --------------------------------------------------------------------------- + +class TestProviderTick(unittest.TestCase): + def setUp(self): + self._tmp = tempfile.TemporaryDirectory() + self.tmpdir = Path(self._tmp.name) + self.plugin, self.owner = _make_plugin(self.tmpdir, role="provider") + self.vsd = volume_sync_dir(self.plugin) + # Seed data volume + (self.owner._fixed_root / "appdata" / "weights.bin").write_bytes(b"abc") + + def tearDown(self): + self._tmp.cleanup() + + def _write_request(self, body): + (self.vsd / "request.json").write_text(json.dumps(body)) + + def test_no_request_no_action(self): + self.plugin._sync_provider_tick(current_time=100.0) + self.assertEqual(self.plugin.stop_calls, 0) + self.assertEqual(self.plugin.start_calls, 0) + + def test_disabled_no_action(self): + self.owner.cfg_sync["ENABLED"] = False + self._write_request({"archive_paths": ["/app/data/"]}) + self.plugin._sync_provider_tick(current_time=100.0) + self.assertEqual(self.plugin.stop_calls, 0) + self.assertTrue((self.vsd / "request.json").exists()) + + def test_consumer_role_no_provider_action(self): + self.owner.cfg_sync["TYPE"] = "consumer" + self._write_request({"archive_paths": ["/app/data/"]}) + self.plugin._sync_provider_tick(current_time=100.0) + self.assertEqual(self.plugin.stop_calls, 0) + + def test_throttle_skips_within_poll_interval(self): + self.owner.cfg_sync["POLL_INTERVAL"] = 100 + self._write_request({"archive_paths": ["/app/data/"]}) + self.plugin._last_sync_check = 90.0 + self.plugin._sync_provider_tick(current_time=100.0) # only 10s since last + self.assertEqual(self.plugin.stop_calls, 0) + + def test_full_provider_flow(self): + self._write_request({"archive_paths": ["/app/data/"], "metadata": {"v": 1}}) + self.plugin._sync_provider_tick(current_time=1000.0) + + # stop -> work -> start in that order + self.assertEqual(self.plugin.lifecycle_log, ["stop", "start", "reset"]) + # response.json + chainstore + history all produced + response = json.loads((self.vsd / "response.json").read_text()) + 
self.assertEqual(response["status"], "ok") + self.assertEqual(len(self.owner._cs.hset_calls), 1) + + def test_provider_sync_uses_runtime_stop_without_fixed_volume_cleanup(self): + self._write_request({"archive_paths": ["/app/data/"]}) + + self.plugin._sync_provider_tick(current_time=1000.0) + + self.assertEqual(self.plugin.runtime_stop_calls, 1) + self.assertEqual(self.plugin.fixed_volume_cleanup_calls, 0) + self.assertEqual(self.plugin.lifecycle_log, ["stop", "start", "reset"]) + + def test_online_provider_capture_skips_runtime_stop(self): + self.owner.cfg_sync["ALLOW_ONLINE_PROVIDER_CAPTURE"] = True + self.plugin.container = _FakeDockerArchiveContainer({ + "/tmp/generated.txt": _tar_bytes("generated.txt", b"from-container"), + }) + self._write_request({ + "archive_paths": ["/tmp/generated.txt"], + "runtime": {"provider_capture": "online"}, + }) + + self.plugin._sync_provider_tick(current_time=1000.0) + + self.assertEqual(self.plugin.runtime_stop_calls, 0) + self.assertEqual(self.plugin.start_calls, 0) + self.assertEqual(self.plugin.lifecycle_log, []) + response = json.loads((self.vsd / "response.json").read_text()) + self.assertEqual(response["status"], "ok") + + def test_validation_failure_does_not_stop_container(self): + # claim_request fails fast; no need to disturb the container. + self._write_request({"archive_paths": ["/nope/"]}) + self.plugin._sync_provider_tick(current_time=1000.0) + self.assertEqual(self.plugin.stop_calls, 0) + self.assertEqual(self.plugin.start_calls, 0) + invalid = json.loads((self.vsd / "request.json.invalid").read_text()) + self.assertEqual(invalid["_error"]["stage"], "validation") + + def test_publish_failure_still_restarts_container(self): + self._write_request({"archive_paths": ["/app/data/"]}) + self.owner._r1fs.add_should_raise = RuntimeError("ipfs gone") + self.plugin._sync_provider_tick(current_time=1000.0) + # We did stop because claim succeeded; the failure was at r1fs stage. + self.assertEqual(self.plugin.lifecycle_log, ["stop", "start", "reset"]) + response = json.loads((self.vsd / "response.json").read_text()) + self.assertEqual(response["stage"], "r1fs_upload") + + def test_offline_provider_stop_failure_aborts_before_publish(self): + self.plugin.stop_result = False + self._write_request({"archive_paths": ["/app/data/"]}) + + self.plugin._sync_provider_tick(current_time=1000.0) + + self.assertEqual(self.plugin.lifecycle_log, ["stop"]) + self.assertEqual(self.owner._cs.hset_calls, []) + self.assertEqual(self.owner._r1fs.added, {}) + response = json.loads((self.vsd / "response.json").read_text()) + self.assertEqual(response["status"], "error") + self.assertEqual(response["stage"], "runtime_stop") + + +# --------------------------------------------------------------------------- +# Consumer tick +# --------------------------------------------------------------------------- + +class TestConsumerTick(unittest.TestCase): + def setUp(self): + self._tmp = tempfile.TemporaryDirectory() + self.tmpdir = Path(self._tmp.name) + # Set up provider+consumer plugins sharing one r1fs/chainstore. + self.provider_plugin, self.provider_owner = _make_plugin( + self.tmpdir / "p", role="provider" + ) + self.consumer_plugin, self.consumer_owner = _make_plugin( + self.tmpdir / "c", role="consumer" + ) + # Share state by using the provider's r1fs/chainstore on the consumer. 
+ shared_r1fs = self.provider_owner._r1fs + shared_cs = self.provider_owner._cs + self.consumer_owner.r1fs = shared_r1fs + self.consumer_owner._r1fs = shared_r1fs + self.consumer_owner.chainstore_hset = shared_cs.hset + self.consumer_owner.chainstore_hget = shared_cs.hget + self.consumer_owner.chainstore_hsync = shared_cs.hsync + self.consumer_owner._cs = shared_cs + # Same SYNC.KEY across both + self.consumer_owner.cfg_sync["KEY"] = "SYNC-KEY-1" + self.consumer_owner.cfg_sync_key = "SYNC-KEY-1" + + (self.provider_owner._fixed_root / "appdata" / "weights.bin").write_bytes(b"data1") + + def tearDown(self): + self._tmp.cleanup() + + def _publish(self, content=b"data1", runtime=None): + (self.provider_owner._fixed_root / "appdata" / "weights.bin").write_bytes(content) + p_vsd = volume_sync_dir(self.provider_plugin) + p_vsd.mkdir(parents=True, exist_ok=True) + request = {"archive_paths": ["/app/data/"]} + if runtime is not None: + request["runtime"] = runtime + (p_vsd / "request.json").write_text(json.dumps(request)) + self.provider_plugin._last_sync_check = 0 + self.provider_plugin._sync_provider_tick(current_time=1000.0) + + def test_no_record_no_action(self): + self.consumer_plugin._sync_consumer_tick(current_time=1000.0) + self.assertEqual(self.consumer_plugin.stop_calls, 0) + + def test_full_consumer_flow(self): + self._publish() + self.consumer_plugin._sync_consumer_tick(current_time=2000.0) + self.assertEqual(self.consumer_plugin.lifecycle_log, ["stop", "start", "reset"]) + target = self.consumer_owner._fixed_root / "appdata" / "weights.bin" + self.assertEqual(target.read_bytes(), b"data1") + self.assertTrue((volume_sync_dir(self.consumer_plugin) / "last_apply.json").exists()) + + def test_consumer_explicit_offline_restart_stops_applies_and_restarts(self): + self._publish(runtime={"consumer_apply": "offline_restart"}) + + self.consumer_plugin._sync_consumer_tick(current_time=2000.0) + + self.assertEqual(self.consumer_plugin.runtime_stop_calls, 1) + self.assertEqual(self.consumer_plugin.lifecycle_log, ["stop", "start", "reset"]) + target = self.consumer_owner._fixed_root / "appdata" / "weights.bin" + self.assertEqual(target.read_bytes(), b"data1") + + def test_consumer_online_no_restart_falls_back_to_offline_restart(self): + self.consumer_owner.cfg_sync["CONSUMER_APPLY_MODE"] = "online_no_restart" + self._publish(runtime={"consumer_apply": "online_no_restart"}) + + self.consumer_plugin._sync_consumer_tick(current_time=2000.0) + + self.assertEqual(self.consumer_plugin.runtime_stop_calls, 1) + self.assertEqual(self.consumer_plugin.lifecycle_log, ["stop", "start", "reset"]) + self.assertEqual( + self.consumer_plugin._sync_last_apply_mode_resolution, + { + "requested_mode": "online_no_restart", + "effective_mode": "offline_restart", + "reason": "online_apply_disabled", + }, + ) + target = self.consumer_owner._fixed_root / "appdata" / "weights.bin" + self.assertEqual(target.read_bytes(), b"data1") + self.assertTrue((volume_sync_dir(self.consumer_plugin) / "last_apply.json").exists()) + + def test_consumer_online_restart_falls_back_to_offline_restart(self): + self.consumer_owner.cfg_sync["CONSUMER_APPLY_MODE"] = "online_restart" + target = self.consumer_owner._fixed_root / "appdata" / "weights.bin" + target.write_bytes(b"old") + self._publish(content=b"new", runtime={"consumer_apply": "online_restart"}) + + self.consumer_plugin._sync_consumer_tick(current_time=2000.0) + + self.assertEqual(self.consumer_plugin.runtime_stop_calls, 1) + self.assertEqual(self.consumer_plugin.lifecycle_log, 
["stop", "start", "reset"]) + self.assertEqual(target.read_bytes(), b"new") + self.assertEqual( + self.consumer_plugin._sync_last_apply_mode_resolution, + { + "requested_mode": "online_restart", + "effective_mode": "offline_restart", + "reason": "online_apply_disabled", + }, + ) + + def test_provider_record_cannot_force_consumer_online_apply(self): + self._publish(runtime={"consumer_apply": "online_no_restart"}) + + self.consumer_plugin._sync_consumer_tick(current_time=2000.0) + + self.assertEqual(self.consumer_plugin.runtime_stop_calls, 1) + self.assertEqual(self.consumer_plugin.lifecycle_log, ["stop", "start", "reset"]) + + def test_consumer_resets_runtime_state_after_apply(self): + """After a sync slice, per-restart runtime markers must be reset so + readiness gates, health-probe timers, and BUILD_AND_RUN_COMMANDS re-engage + against the freshly-started container. Otherwise tunnels stay marked + ready, health checks are skipped, and image-defined startup commands + don't rerun — the codex review's HIGH-severity finding 2 on PR #399. + """ + # Seed the plugin with "previous container is running" markers. + self.consumer_plugin.container_start_time = 999.0 + self.consumer_plugin._app_ready = True + self.consumer_plugin._health_probe_start = 999.0 + self.consumer_plugin._tunnel_start_allowed = True + self.consumer_plugin._commands_started = True + + self._publish() + self.consumer_plugin._sync_consumer_tick(current_time=2000.0) + + # Order: stop, start, then reset (reset MUST come after start so the + # markers reflect the new container, not the prior one). + self.assertEqual( + self.consumer_plugin.lifecycle_log, ["stop", "start", "reset"] + ) + # All readiness / probe / command-rerun markers reset. The fake's + # ``time()`` is a monotonic counter that increments on each read, so + # compare against the seeded sentinel (999.0) rather than chasing the + # exact post-reset value. + self.assertNotEqual(self.consumer_plugin.container_start_time, 999.0) + self.assertIsNotNone(self.consumer_plugin.container_start_time) + self.assertFalse(self.consumer_plugin._app_ready) + self.assertIsNone(self.consumer_plugin._health_probe_start) + self.assertFalse(self.consumer_plugin._tunnel_start_allowed) + self.assertFalse(self.consumer_plugin._commands_started) + + def test_skips_when_record_cid_matches_last_apply(self): + """The consumer's 'is this new?' check is by CID, not version. A second + tick that sees the same ChainStore record (same cid) is a no-op even + if version metadata changed.""" + self._publish() + self.consumer_plugin._sync_consumer_tick(current_time=2000.0) + self.consumer_plugin.lifecycle_log.clear() + self.consumer_plugin._last_sync_check = 0 # reset throttle + # Tick again without a new publish — should be a no-op (same cid). + self.consumer_plugin._sync_consumer_tick(current_time=3000.0) + self.assertEqual(self.consumer_plugin.lifecycle_log, []) + + def test_applies_when_cid_differs_even_if_version_lower(self): + """A consumer should apply any record whose cid differs from the last + applied entry, regardless of version ordering. This guards against + clock-skew failure modes where a provider's wonky timestamp could + otherwise make a corrected snapshot look 'older'.""" + # First publish + apply (creates a baseline received entry). 
+ self._publish(content=b"initial") + self.consumer_plugin._sync_consumer_tick(current_time=2000.0) + initial_received = self.consumer_plugin._sync_manager.latest_received() + self.assertIsNotNone(initial_received) + initial_version = initial_received["version"] + + # Hand-craft a chainstore record with a *lower* version but a fresh CID. + # Under the old version-comparison logic this would be skipped; under + # CID comparison it must be applied. + spoofed_cid = "QmSPOOF_LOWER_VERSION_FRESH_CONTENT" + fake_tar = self.consumer_owner._r1fs.added.get(initial_received["cid"], b"") + self.consumer_owner._r1fs.added[spoofed_cid] = fake_tar + self.consumer_owner._cs.store[("CHAINSTORE_SYNC", "SYNC-KEY-1")] = { + "cid": spoofed_cid, + "version": initial_version - 100, # explicitly older + "timestamp": 0.5, + "node_id": "ee_other", + "metadata": {"who": "wonky-clock"}, + "manifest": initial_received["manifest"], + } + + self.consumer_plugin.lifecycle_log.clear() + self.consumer_plugin._last_sync_check = 0 + self.consumer_plugin._sync_consumer_tick(current_time=3000.0) + # The new (lower-versioned but different-cid) record was applied. + self.assertEqual(self.consumer_plugin.lifecycle_log, ["stop", "start", "reset"]) + latest = self.consumer_plugin._sync_manager.latest_received() + self.assertEqual(latest["cid"], spoofed_cid) + + def test_misalignment_skips_apply(self): + # Store a record in chainstore that references a path consumer can't map. + self.consumer_owner._cs.store[("CHAINSTORE_SYNC", "SYNC-KEY-1")] = { + "cid": "QmFAKE_BAD", + "version": 9999999999, + "timestamp": 1.0, + "node_id": "ee_other", + "metadata": {}, + "manifest": { + "schema_version": 1, + "archive_paths": ["/app/data/", "/nope/"], + "archive_format": "tar.gz", + "encryption": "r1fs-default", + "archive_size_bytes": 123, + }, + } + self.consumer_plugin._sync_consumer_tick(current_time=2000.0) + self.assertEqual(self.consumer_plugin.lifecycle_log, []) + # No last_apply written + self.assertFalse( + (volume_sync_dir(self.consumer_plugin) / "last_apply.json").exists() + ) + # No history advance + self.assertEqual( + len(list(history_received_dir(self.consumer_plugin).glob("*.json"))), 0 + ) + + def test_r1fs_failure_is_quarantined_before_container_stop(self): + self._publish(content=b"new-data") + self.consumer_owner._r1fs.get_should_raise = RuntimeError("network down") + + self.consumer_plugin._sync_consumer_tick(current_time=2000.0) + + self.assertEqual(self.consumer_plugin.lifecycle_log, []) + record = self.consumer_owner._cs.store[("CHAINSTORE_SYNC", "SYNC-KEY-1")] + self.assertIsNotNone(self.consumer_plugin._sync_manager.quarantined_record(record)) + + self.consumer_plugin._last_sync_check = 0 + self.consumer_plugin._sync_consumer_tick(current_time=3000.0) + + self.assertEqual(self.consumer_plugin.lifecycle_log, []) + + def test_uncertain_apply_does_not_restart_container(self): + self._publish(content=b"new-data") + + def uncertain(_prepared): + return ApplyResult(False, False, "uncertain", [], "rollback failed") + + sm = self.consumer_plugin._ensure_sync_manager() + with patch.object( + sm, + "commit_prepared_apply", + side_effect=uncertain, + ): + self.consumer_plugin._sync_consumer_tick(current_time=2000.0) + + self.assertEqual(self.consumer_plugin.lifecycle_log, ["stop"]) + self.assertEqual(self.consumer_plugin.start_calls, 0) + + def test_non_dict_manifest_skips_without_restart(self): + self.consumer_owner._cs.store[("CHAINSTORE_SYNC", "SYNC-KEY-1")] = { + "cid": "QmFAKE_BAD_MANIFEST", + "version": 9999999999, + 
"timestamp": 1.0, + "node_id": "ee_other", + "metadata": {}, + "manifest": "not-an-object", + } + + self.consumer_plugin._sync_consumer_tick(current_time=2000.0) + + self.assertEqual(self.consumer_plugin.lifecycle_log, []) + self.assertFalse( + (volume_sync_dir(self.consumer_plugin) / "last_apply.json").exists() + ) + + def test_offline_consumer_stop_failure_aborts_before_apply(self): + self._publish(content=b"new-data") + self.consumer_plugin.stop_result = False + + self.consumer_plugin._sync_consumer_tick(current_time=2000.0) + + self.assertEqual(self.consumer_plugin.lifecycle_log, ["stop"]) + self.assertFalse( + (volume_sync_dir(self.consumer_plugin) / "last_apply.json").exists() + ) + self.assertEqual( + len(list(history_received_dir(self.consumer_plugin).glob("*.json"))), 0 + ) + + +if __name__ == "__main__": + unittest.main() From 31da87cfcb3131efa40e939ff3caaf8607cc108b Mon Sep 17 00:00:00 2001 From: Cristi Bleotiu Date: Fri, 15 May 2026 12:06:08 +0300 Subject: [PATCH 3/4] fix: report extra tunnel cleanup failures What changed: - log extra tunnel cleanup success only when every tunnel stopped - add coverage for failed extra tunnel cleanup logging Why: - avoid misleading success logs in hardened cleanup paths --- .../container_apps/container_app_runner.py | 5 ++++- .../tests/test_container_lifecycle.py | 22 +++++++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/extensions/business/container_apps/container_app_runner.py b/extensions/business/container_apps/container_app_runner.py index 52601f64..87d46b77 100644 --- a/extensions/business/container_apps/container_app_runner.py +++ b/extensions/business/container_apps/container_app_runner.py @@ -2030,7 +2030,10 @@ def stop_extra_tunnels(self): for container_port in list(self.extra_tunnel_processes.keys()): result = self._stop_extra_tunnel(container_port) and result - self.P("All extra tunnels stopped") + if result: + self.P("All extra tunnels stopped", color='g') + else: + self.P("One or more extra tunnels failed to stop; preserving live handles for retry.", color='r') return result diff --git a/extensions/business/container_apps/tests/test_container_lifecycle.py b/extensions/business/container_apps/tests/test_container_lifecycle.py index ecd6a285..20141f97 100644 --- a/extensions/business/container_apps/tests/test_container_lifecycle.py +++ b/extensions/business/container_apps/tests/test_container_lifecycle.py @@ -22,6 +22,7 @@ import docker.types from extensions.business.container_apps.tests.support import ( + make_container_app_runner, make_lifecycle_runner, make_mock_container, make_mock_docker_client, @@ -225,6 +226,27 @@ def test_container_none_returns_false(self): self.assertFalse(plugin._check_container_status()) +class TestExtraTunnelCleanup(unittest.TestCase): + + def test_stop_extra_tunnels_logs_failure_when_any_tunnel_fails(self): + plugin = make_container_app_runner() + plugin.extra_tunnel_processes = { + 8001: object(), + 8002: object(), + } + plugin._stop_extra_tunnel = MagicMock(side_effect=[False, True]) + + result = plugin.stop_extra_tunnels() + + self.assertFalse(result) + self.assertEqual(plugin._stop_extra_tunnel.call_count, 2) + self.assertIn( + "One or more extra tunnels failed to stop", + plugin.logged_messages[-1], + ) + self.assertNotIn("All extra tunnels stopped", plugin.logged_messages[-1]) + + # =========================================================================== # Restart # =========================================================================== From 
9cdc15a0c55e2a1377445a6398d08329e3b99936 Mon Sep 17 00:00:00 2001 From: Cristi Bleotiu Date: Fri, 15 May 2026 12:37:10 +0300 Subject: [PATCH 4/4] chore: inc ver --- ver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ver.py b/ver.py index c4106a1d..a8fb3a53 100644 --- a/ver.py +++ b/ver.py @@ -1 +1 @@ -__VER__ = '2.10.219' +__VER__ = '2.10.221'