From 232454655d062de9fdcc73dcee56d25c9dd99e67 Mon Sep 17 00:00:00 2001 From: Cristi Bleotiu Date: Wed, 13 May 2026 12:02:53 +0300 Subject: [PATCH 1/4] fix: harden container shutdown cleanup What changed: abort container restarts when runtime cleanup fails; preserve process, thread, and fixed-volume handles for retry; stop Docker log readers after container stop to avoid false cleanup failures. Why: prevent stuck or duplicate runtimes during restart and shutdown paths. --- .../container_apps/container_app_runner.py | 254 +++++++++++++----- .../business/container_apps/fixed_volume.py | 26 +- .../mixins/fixed_size_volumes.py | 15 +- 3 files changed, 217 insertions(+), 78 deletions(-) diff --git a/extensions/business/container_apps/container_app_runner.py b/extensions/business/container_apps/container_app_runner.py index dddb8aca..c262b2c9 100644 --- a/extensions/business/container_apps/container_app_runner.py +++ b/extensions/business/container_apps/container_app_runner.py @@ -578,6 +578,7 @@ def __reset_vars(self): # Container state machine self.container_state = ContainerState.UNINITIALIZED self.stop_reason = StopReason.UNKNOWN + self._cleanup_failed = False # Restart policy and retry logic self._consecutive_failures = 0 @@ -1193,15 +1194,17 @@ def on_command(self, data, **kwargs): self.P("Restarting container...") self._clear_manual_stop_state() # Clear persistent stop state self._set_container_state(ContainerState.RESTARTING, StopReason.CONFIG_UPDATE) - self._stop_container_and_save_logs_to_disk() self._restart_container(StopReason.CONFIG_UPDATE) return elif data == "STOP": self.P("Stopping container (manual stop - restart policy will not trigger)...") self._save_persistent_state(manually_stopped=True) # Save persistent stop state - self._stop_container_and_save_logs_to_disk() - self._set_container_state(ContainerState.PAUSED, StopReason.MANUAL_STOP) + cleanup_ok = self._stop_container_and_save_logs_to_disk() + if cleanup_ok: + self._set_container_state(ContainerState.PAUSED, StopReason.MANUAL_STOP) + else: + self._set_container_state(ContainerState.FAILED, StopReason.UNKNOWN) return else: self.P(f"Unknown plugin command: {data}") @@ -1249,7 +1252,12 @@ def _handle_config_restart(self, restart_callable): ) return - self._stop_container_and_save_logs_to_disk() + cleanup_ok = self._stop_container_and_save_logs_to_disk() + if not cleanup_ok: + self.P("Config restart aborted because previous runtime cleanup failed.", color='r') + self._set_container_state(ContainerState.FAILED, StopReason.UNKNOWN) + self._record_restart_failure() + return restart_callable() return @@ -1271,7 +1279,7 @@ def on_config(self, *args, **kwargs): ------- None """ - return self._handle_config_restart(lambda: self._restart_container(StopReason.CONFIG_UPDATE)) + return self._handle_config_restart(lambda: self._restart_container(StopReason.CONFIG_UPDATE, cleanup_first=False)) def on_post_container_start(self): @@ -1367,16 +1375,28 @@ def stop_tunnel_engine(self): Returns ------- - None + bool + True when the tunnel process and log readers stopped, False otherwise. 
""" if self.tunnel_process: engine_name = "Cloudflare" if self.use_cloudflare() else "ngrok" self.P(f"Stopping {engine_name} tunnel...") - self.stop_tunnel_command(self.tunnel_process) - self.tunnel_process = None - self.P(f"{engine_name} tunnel stopped") + process = self.tunnel_process + result = True + try: + result = self.stop_tunnel_command(process) + except Exception as exc: + result = False + self.P(f"Error stopping {engine_name} tunnel: {exc}", color='r') + finally: + if result: + self.tunnel_process = None + self.P(f"{engine_name} tunnel stopped") + else: + self.P(f"{engine_name} tunnel did not fully stop; preserving process handle for retry.", color='r') + return result # end if - return + return True def get_tunnel_engine_ping_data(self): @@ -1637,12 +1657,16 @@ def _start_extra_tunnel(self, container_port, tunnel_config): self.Pd(f" Command: {' '.join(command)}") # Use list-based subprocess to prevent shell injection - process = subprocess.Popen( - command, + popen_kwargs = dict( + args=command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, - bufsize=0 + bufsize=0, ) + if os.name != "nt": + popen_kwargs["start_new_session"] = True + process = subprocess.Popen(**popen_kwargs) + self._remember_process_group(process) # Create log readers for this tunnel logs_reader = self.LogReader(process.stdout, size=100, daemon=None) @@ -1711,12 +1735,16 @@ def _stop_extra_tunnel(self, container_port): Returns ------- - None + bool + True when the tunnel process and log readers stopped, False otherwise. """ process = self.extra_tunnel_processes.get(container_port) if not process: - return + return True + result = True + process_stopped = process.poll() is not None + readers_stopped = True try: self.P(f"Stopping extra tunnel for port {container_port}...") @@ -1725,12 +1753,11 @@ def _stop_extra_tunnel(self, container_port): # Stop process if process.poll() is None: # Still running - process.terminate() - try: - process.wait(timeout=5) - except Exception: - process.kill() - process.wait() + process_stopped = self._terminate_subprocess_tree( + process, + label=f"Extra tunnel for port {container_port}", + ) + result = process_stopped and result # Clean up log readers (following base class pattern) log_readers = self.extra_tunnel_log_readers.get(container_port, {}) @@ -1739,36 +1766,49 @@ def _stop_extra_tunnel(self, container_port): stdout_reader = log_readers.get("stdout") if stdout_reader: try: - stdout_reader.stop() + reader_stopped = stdout_reader.stop() + readers_stopped = reader_stopped and readers_stopped + result = reader_stopped and result # Read any remaining logs before cleanup remaining_logs = stdout_reader.get_next_characters() if remaining_logs: self._process_extra_tunnel_log(container_port, remaining_logs, is_error=False) except Exception as e: + readers_stopped = False + result = False self.Pd(f"Error stopping stdout reader: {e}") # Stop stderr reader and read remaining logs stderr_reader = log_readers.get("stderr") if stderr_reader: try: - stderr_reader.stop() + reader_stopped = stderr_reader.stop() + readers_stopped = reader_stopped and readers_stopped + result = reader_stopped and result # Read any remaining error logs before cleanup remaining_err_logs = stderr_reader.get_next_characters() if remaining_err_logs: self._process_extra_tunnel_log(container_port, remaining_err_logs, is_error=True) except Exception as e: + readers_stopped = False + result = False self.Pd(f"Error stopping stderr reader: {e}") - # Clean up references - self.extra_tunnel_processes.pop(container_port, 
None) - self.extra_tunnel_log_readers.pop(container_port, None) - self.extra_tunnel_urls.pop(container_port, None) - self.extra_tunnel_start_times.pop(container_port, None) + if result: + self.extra_tunnel_processes.pop(container_port, None) + self.extra_tunnel_log_readers.pop(container_port, None) + self.extra_tunnel_urls.pop(container_port, None) + self.extra_tunnel_start_times.pop(container_port, None) - self.P(f"Extra tunnel for port {container_port} stopped") + if result: + self.P(f"Extra tunnel for port {container_port} stopped") + else: + self.P(f"Extra tunnel for port {container_port} did not fully stop; preserving live handles for retry.", color='r') except Exception as e: + result = False self.P(f"Error stopping extra tunnel for port {container_port}: {e}", color='r') + return result def stop_extra_tunnels(self): @@ -1780,17 +1820,20 @@ def stop_extra_tunnels(self): Returns ------- - None + bool + True when all extra tunnels stopped, False otherwise. """ if not self.extra_tunnel_processes: - return + return True self.P(f"Stopping {len(self.extra_tunnel_processes)} extra tunnel(s)...") + result = True for container_port in list(self.extra_tunnel_processes.keys()): - self._stop_extra_tunnel(container_port) + result = self._stop_extra_tunnel(container_port) and result self.P("All extra tunnels stopped") + return result def _read_extra_tunnel_logs(self, container_port): @@ -2037,40 +2080,49 @@ def stop_container(self): Returns ------- - None + bool + True when the container was stopped and removed, False otherwise. Notes ----- If no container exists, logs a warning and returns. - Clears container and container_id attributes after removal. + Clears container and container_id attributes after successful removal. """ if not self.container: self.P("No container to stop", color='r') - return + return True + result = True + removed = False try: # Stop the container (gracefully) self.P(f"Stopping container {self.container.short_id}...") self.container.stop(timeout=5) self.P(f"Container {self.container.short_id} stopped successfully") except Exception as e: + result = False self.P(f"Error stopping container: {e}", color='r') # end try try: self.P(f"Removing container {self.container.short_id}...") self.container.remove() + removed = True self.P(f"Container {self.container.short_id} removed successfully") except Exception as e: + result = False self.P(f"Error removing container: {e}", color='r') finally: - self.container = None - self.container_id = None + if removed: + self.container = None + self.container_id = None + else: + self.P("Preserving container handle after failed stop/remove for retry.", color='r') # end try - return + return removed - def _stream_logs(self, log_stream): + def _stream_logs(self, log_stream, stop_event=None): """ Consume a log iterator from container logs and print its output. 
@@ -2087,6 +2139,9 @@ def _stream_logs(self, log_stream): self.P("No log stream provided", color='r') return + if stop_event is None: + stop_event = self._stop_event + try: for log_bytes in log_stream: if log_bytes is None: @@ -2100,7 +2155,7 @@ def _stream_logs(self, log_stream): self.P(f"[CONTAINER] {log_str}", end='') self.container_logs.append(log_str) - if self._stop_event.is_set(): + if stop_event.is_set(): self.P("Log streaming stopped by stop event") break except Exception as e: @@ -2127,7 +2182,7 @@ def _start_container_log_stream(self): log_stream = self.container.logs(stream=True, follow=True) self.log_thread = threading.Thread( target=self._stream_logs, - args=(log_stream,), + args=(log_stream, self._stop_event), daemon=True, ) self.log_thread.start() @@ -2214,7 +2269,7 @@ def _run_container_exec(self, shell_cmd): ) thread = threading.Thread( target=self._stream_logs, - args=(exec_result.output,), + args=(exec_result.output, self._stop_event), daemon=True, ) thread.start() @@ -2672,39 +2727,78 @@ def _stop_container_and_save_logs_to_disk(self): Returns ------- - None + bool + True when cleanup completed without required-step failures, False otherwise. """ self.P(f"Stopping container app '{self.container_id}' ...") + cleanup_errors = [] + + def safe_cleanup_step(step_name, callback): + try: + result = callback() + if result is False: + cleanup_errors.append(step_name) + self.P(f"Container cleanup step '{step_name}' reported failure.", color='r') + except Exception as exc: + cleanup_errors.append(step_name) + self.P(f"Container cleanup step '{step_name}' failed: {exc}", color='r') + # Clear semaphore and reset signaling state for potential restart - self._semaphore_reset_signal() + safe_cleanup_step("semaphore reset", self._semaphore_reset_signal) - # Stop log streaming - self._stop_event.set() - if self.log_thread: - self.log_thread.join(timeout=5) - self.log_thread = None + def signal_runtime_threads(): + self._stop_event.set() + self._commands_started = False + return True - if getattr(self, 'exec_threads', None): - for thread in self.exec_threads: - if thread and thread.is_alive(): - thread.join(timeout=5) - self.exec_threads = [] + def join_runtime_threads(): + result = True + stop_deadline = time.monotonic() + 5 - self._stop_event = threading.Event() - self._commands_started = False + if self.log_thread: + self.log_thread.join(timeout=max(0, stop_deadline - time.monotonic())) + if self.log_thread.is_alive(): + result = False + self.P("Container log thread is still alive after stop timeout.", color='r') + else: + self.log_thread = None + + if getattr(self, 'exec_threads', None): + alive_threads = [] + for thread in self.exec_threads: + remaining = max(0, stop_deadline - time.monotonic()) + if thread and thread.is_alive() and remaining > 0: + thread.join(timeout=remaining) + if thread and thread.is_alive(): + result = False + alive_threads.append(thread) + self.exec_threads = alive_threads + + if result: + self._stop_event = threading.Event() + self._commands_started = False + return result + + # Signal log/exec readers early, but join after Docker stop; quiet Docker + # streams usually unblock only when the container stops. 
+ safe_cleanup_step("runtime thread signal", signal_runtime_threads) # Stop tunnel engine if needed - self.stop_tunnel_engine() + safe_cleanup_step("main tunnel", self.stop_tunnel_engine) # Stop extra tunnels - self.stop_extra_tunnels() + safe_cleanup_step("extra tunnels", self.stop_extra_tunnels) # Stop the container if it's running - self.stop_container() + safe_cleanup_step("docker container", self.stop_container) + + # Stop log streaming threads after Docker stop has had a chance to unblock + # the log/exec streams. + safe_cleanup_step("runtime threads", join_runtime_threads) # Cleanup fixed-size volumes (unmount + detach loop devices) - self._cleanup_fixed_size_volumes() + safe_cleanup_step("fixed-size volumes", self._cleanup_fixed_size_volumes) # Save logs to disk under the instance's `logs/` sibling folder # (resolves to pipelines_data/{sid}/{iid}/logs/container_logs.pkl) @@ -2717,7 +2811,12 @@ def _stop_container_and_save_logs_to_disk(self): self.P("Container logs saved to disk.") except Exception as exc: self.P(f"Failed to save logs: {exc}", color='r') - return + if cleanup_errors: + self._cleanup_failed = True + self.P("Container cleanup completed with failed step(s): {}.".format(", ".join(cleanup_errors)), color='r') + return False + self._cleanup_failed = False + return True def on_close(self): @@ -3014,7 +3113,7 @@ def _check_image_updates(self, current_time=None): return - def _restart_container(self, stop_reason=None): + def _restart_container(self, stop_reason=None, cleanup_first=True): """ Restart the container from scratch. @@ -3022,10 +3121,15 @@ def _restart_container(self, stop_reason=None): ---------- stop_reason : StopReason, optional Optional StopReason enum indicating why restart was triggered + cleanup_first : bool, optional + If True, stop the existing runtime before resetting state. Set to False + when the caller already performed cleanup and checked its result. Returns ------- - None + bool + True when restart setup succeeded or was deferred waiting for + semaphores, False when cleanup or start failed. 
""" self.P("Restarting container from scratch...") @@ -3035,7 +3139,14 @@ def _restart_container(self, stop_reason=None): preserved_last_image_check = self._last_image_check preserved_current_hash = self.current_image_hash - self._stop_container_and_save_logs_to_disk() + if cleanup_first: + cleanup_ok = self._stop_container_and_save_logs_to_disk() + if not cleanup_ok: + self.P("Restart aborted because previous runtime cleanup failed.", color='r') + self._set_container_state(ContainerState.FAILED, stop_reason or StopReason.UNKNOWN) + self._record_restart_failure() + return False + self.__reset_vars() # Reset chainstore response for restart cycle @@ -3071,7 +3182,7 @@ def _restart_container(self, stop_reason=None): self._validate_extra_tunnels_config() self._validate_runner_config() self.P("Consumer container with semaphore dependencies: deferring start until providers are ready") - return + return True # Non-semaphored containers (providers): configure env and start immediately self._configure_dynamic_env() @@ -3087,17 +3198,17 @@ def _restart_container(self, stop_reason=None): self.P("Failed to ensure image availability during restart, cannot start container", color='r') self._set_container_state(ContainerState.FAILED, StopReason.CRASH) self._record_restart_failure() - return + return False self.container = self.start_container() if not self.container: # start_container already recorded the failure - return + return False self.container_start_time = self.time() self._start_container_log_stream() self._maybe_execute_build_and_run() - return + return True def _ensure_image_always_pull(self): @@ -3385,6 +3496,13 @@ def process(self): self._last_paused_log = current_time return + if self._cleanup_failed: + current_time = self.time() + if current_time - self._last_paused_log >= self.cfg_paused_state_log_interval: + self.P("Container cleanup previously failed; periodic launch/restart is blocked until cleanup succeeds.") + self._last_paused_log = current_time + return + if not self.container: # Check if we're in backoff period if self._is_restart_backoff_active(): diff --git a/extensions/business/container_apps/fixed_volume.py b/extensions/business/container_apps/fixed_volume.py index f584f08f..dd994e6f 100644 --- a/extensions/business/container_apps/fixed_volume.py +++ b/extensions/business/container_apps/fixed_volume.py @@ -387,15 +387,18 @@ def provision( def cleanup( vol: FixedVolume, logger: Optional[Callable] = None, -) -> None: +) -> bool: """Unmount and detach the loop device for a volume. Graceful -- never raises. All errors are caught and logged as warnings. + Returns False when unmount/detach could not be confirmed so callers can + preserve cleanup handles and retry later. 
""" _log( logger, "STEP", f"Cleaning up volume={vol.name} mount_path={vol.mount_path}", ) + result = True loop_dev = None if vol.meta_path.exists(): try: @@ -403,23 +406,34 @@ def cleanup( loop_dev = meta.get("loop_dev") _log(logger, "INFO", f"Loaded metadata loop_dev={loop_dev}") except Exception as exc: + result = False _log(logger, "WARN", f"Failed to read metadata error={exc}") - try: - _run(["umount", str(vol.mount_path)], logger=logger) - except Exception as exc: - _log(logger, "WARN", f"Unmount failed mount_path={vol.mount_path} error={exc}") + if _is_path_mounted(vol.mount_path): + try: + _run(["umount", str(vol.mount_path)], logger=logger) + except Exception as exc: + result = False + _log(logger, "WARN", f"Unmount failed mount_path={vol.mount_path} error={exc}") + else: + _log(logger, "INFO", f"Mount path is not mounted mount_path={vol.mount_path}") if loop_dev: try: _run(["losetup", "-d", loop_dev], logger=logger) except Exception as exc: + result = False _log(logger, "WARN", f"Detach loop failed loop_dev={loop_dev} error={exc}") + if _is_path_mounted(vol.mount_path): + result = False + _log(logger, "WARN", f"Mount path is still mounted mount_path={vol.mount_path}") + _log( logger, "INFO", - f"Cleanup complete mount_path={vol.mount_path} loop_dev={loop_dev}", + f"Cleanup complete mount_path={vol.mount_path} loop_dev={loop_dev} ok={result}", ) + return result def docker_bind_spec(vol: FixedVolume, container_target: str) -> Dict[str, Dict[str, str]]: diff --git a/extensions/business/container_apps/mixins/fixed_size_volumes.py b/extensions/business/container_apps/mixins/fixed_size_volumes.py index ef9bf868..b077aacd 100644 --- a/extensions/business/container_apps/mixins/fixed_size_volumes.py +++ b/extensions/business/container_apps/mixins/fixed_size_volumes.py @@ -226,12 +226,19 @@ def _cleanup_fixed_size_volumes(self): Called during container stop/close to free loop device resources. 
""" if not hasattr(self, '_fixed_volumes') or not self._fixed_volumes: - return + return True + result = True + remaining_volumes = [] for vol in self._fixed_volumes: try: - fixed_volume.cleanup(vol, logger=self.P) + cleaned = fixed_volume.cleanup(vol, logger=self.P) + if not cleaned: + result = False + remaining_volumes.append(vol) except Exception as exc: + result = False + remaining_volumes.append(vol) self.P(f"Failed to cleanup fixed volume '{vol.name}': {exc}", color='r') - self._fixed_volumes = [] - return + self._fixed_volumes = remaining_volumes + return result From fedd9255b4c4be2cce19b1a010b5a19051c4f81b Mon Sep 17 00:00:00 2001 From: Cristi Bleotiu Date: Fri, 15 May 2026 10:41:23 +0300 Subject: [PATCH 2/4] fix: harden container app shutdown cleanup What changed: - keep failed container cleanup candidates retryable instead of dropping handles - make manual STOP/RESTART/config handling preserve cleanup state safely - restore sync support files from develop and add lifecycle/fixed-volume/sync regression coverage Why: - avoid leaked container subprocesses and preserve existing sync behavior while resolving the PR branch against develop --- .../container_apps/container_app_runner.py | 397 +++- .../business/container_apps/fixed_volume.py | 74 +- .../business/container_apps/sync/__init__.py | 90 + .../business/container_apps/sync/constants.py | 53 + .../container_apps/sync/control_files.py | 327 +++ .../business/container_apps/sync/manager.py | 1959 +++++++++++++++++ .../business/container_apps/sync/mixin.py | 595 +++++ .../business/container_apps/tests/support.py | 41 + .../tests/test_container_lifecycle.py | 145 ++ .../container_apps/tests/test_fixed_volume.py | 76 +- .../tests/test_sync_control_files.py | 228 ++ .../container_apps/tests/test_sync_manager.py | 1869 ++++++++++++++++ .../container_apps/tests/test_sync_mixin.py | 744 +++++++ 13 files changed, 6556 insertions(+), 42 deletions(-) create mode 100644 extensions/business/container_apps/sync/__init__.py create mode 100644 extensions/business/container_apps/sync/constants.py create mode 100644 extensions/business/container_apps/sync/control_files.py create mode 100644 extensions/business/container_apps/sync/manager.py create mode 100644 extensions/business/container_apps/sync/mixin.py create mode 100644 extensions/business/container_apps/tests/test_sync_control_files.py create mode 100644 extensions/business/container_apps/tests/test_sync_manager.py create mode 100644 extensions/business/container_apps/tests/test_sync_mixin.py diff --git a/extensions/business/container_apps/container_app_runner.py b/extensions/business/container_apps/container_app_runner.py index c262b2c9..52601f64 100644 --- a/extensions/business/container_apps/container_app_runner.py +++ b/extensions/business/container_apps/container_app_runner.py @@ -66,6 +66,7 @@ import os import requests import shutil +import signal import threading import time import socket @@ -86,6 +87,7 @@ _RestartBackoffMixin, _TunnelBackoffMixin, ) +from .sync import _SyncMixin __VER__ = "0.7.1" @@ -313,6 +315,25 @@ def from_dict(cls, config_dict: dict) -> "HealthCheckConfig": # {"vol_name": {"SIZE": "100M", "MOUNTING_POINT": "/app/data", "FS_TYPE": "ext4", # "OWNER_UID": None, "OWNER_GID": None, "FORCE_RECREATE": False}} + # Volume-sync (cross-node state replication). Always-on /r1en_system system + # volume is provisioned regardless; SYNC.ENABLED only controls the + # provider/consumer orchestration on top of it. 
See the sync/ subpackage + # (constants.py + manager.py + mixin.py) for the full contract. + "SYNC": { + "ENABLED": False, # master switch + "KEY": None, # shared UUID across the sync set (provider+consumer) + "TYPE": None, # "provider" | "consumer" + "POLL_INTERVAL": 10, # seconds between sync ticks + "ALLOW_ONLINE_PROVIDER_CAPTURE": False, # provider-local opt-in for live container fs capture + "CONSUMER_APPLY_MODE": "offline_restart", # consumer-local apply lifecycle policy + "HSYNC_POLL_INTERVAL": 60, # seconds between chainstore_hsync refreshes + # (consumer only; provider only calls hset, never hsync). + # Clamped to min 10s. The cheap local-replica hget + # still runs on every tick; only the network round-trip + # is rate-limited here. Failed hsync attempts retry + # sooner than the full interval. + }, + # Health check configuration (consolidated) # Controls how app readiness is determined before starting tunnels # @@ -345,6 +366,9 @@ def from_dict(cls, config_dict: dict) -> "HealthCheckConfig": "MAX_LOG_LINES" : 10_000, # max lines to keep in memory # When container is STOPPED_MANUALLY (PAUSED state), this will define how often we log its existance "PAUSED_STATE_LOG_INTERVAL": 60, + # Container apps can need more than the core plugin default to stop Docker, + # tunnel processes, runtime readers, and loop-backed fixed volumes safely. + "PLUGIN_STOP_TIMEOUT": 45, # Semaphore synchronization for paired plugins # List of semaphore keys to wait for before starting container @@ -373,6 +397,7 @@ class ContainerAppRunnerPlugin( _ImagePullBackoffMixin, _TunnelBackoffMixin, _FixedSizeVolumesMixin, + _SyncMixin, _ContainerUtilsMixin, BasePlugin, ): @@ -579,6 +604,7 @@ def __reset_vars(self): self.container_state = ContainerState.UNINITIALIZED self.stop_reason = StopReason.UNKNOWN self._cleanup_failed = False + self._manual_stop_pending = False # Restart policy and retry logic self._consecutive_failures = 0 @@ -615,6 +641,13 @@ def __reset_vars(self): self._last_image_check = 0 self._last_extra_tunnels_ping = 0 self._last_paused_log = 0 # Track when we last logged the paused message + self._last_sync_check = 0 # _SyncMixin throttle + + # Volume-sync state. SyncManager is lazy-init'd by _ensure_sync_manager + # the first time a tick fires (or on_init for early provisioning). + self._sync_manager = None + self._sync_unavailable = False + self._runtime_stop_degraded = False # Image update tracking self.current_image_hash = None @@ -1049,6 +1082,39 @@ def _validate_runner_config(self): return + def _validate_sync_config(self): + """ + Validate the SYNC config block when ENABLED. Disables SYNC with a + warning rather than raising; the system volume itself is independent + and the rest of the plugin must keep running. + """ + if not self._sync_enabled(): + return + sync = self._sync_cfg() + key = sync.get("KEY") + role = sync.get("TYPE") + if not key or not isinstance(key, str): + self.P( + "[sync] SYNC.ENABLED but SYNC.KEY missing/empty; disabling SYNC.", + color="r", + ) + sync["ENABLED"] = False + return + if role not in ("provider", "consumer"): + self.P( + f"[sync] SYNC.TYPE must be 'provider' or 'consumer' (got {role!r}); disabling SYNC.", + color="r", + ) + sync["ENABLED"] = False + return + self.P( + f"[sync] SYNC enabled: role={role}, key={key}, " + f"poll={self._sync_poll_interval()}s", + color="g", + ) + return + + def _validate_subclass_config(self): """ Hook for subclasses to enforce additional validation. 
@@ -1119,11 +1185,18 @@ def on_init(self): self._configure_volumes() # setup container volumes (deprecated) self._configure_file_volumes() # setup file volumes with dynamic content self._configure_fixed_size_volumes() # setup fixed-size file-backed volumes + self._configure_system_volume() # always-on /r1en_system control-plane volume + + # If a prior plugin run crashed mid-publish, request.json.processing may + # be left over inside volume-sync/. Rename it back so the next tick retries. + self._recover_stale_processing() + self._validate_sync_config() # If we have semaphored keys, defer _setup_env_and_ports() until semaphores are ready # This ensures we get the env vars from provider plugins before starting the container if not self._semaphore_get_keys(): self._setup_env_and_ports() + self._inject_sync_env_vars() else: self.Pd("Deferring _setup_env_and_ports() until semaphores are ready") @@ -1193,17 +1266,26 @@ def on_command(self, data, **kwargs): if data == "RESTART": self.P("Restarting container...") self._clear_manual_stop_state() # Clear persistent stop state + # RESTART is an explicit operator override for a previously failed STOP. + # Clear the in-memory pause intent too, otherwise a later cleanup retry + # could incorrectly persist PAUSED instead of relaunching the container. + self._manual_stop_pending = False self._set_container_state(ContainerState.RESTARTING, StopReason.CONFIG_UPDATE) self._restart_container(StopReason.CONFIG_UPDATE) return elif data == "STOP": self.P("Stopping container (manual stop - restart policy will not trigger)...") - self._save_persistent_state(manually_stopped=True) # Save persistent stop state + self._manual_stop_pending = True cleanup_ok = self._stop_container_and_save_logs_to_disk() if cleanup_ok: + self._save_persistent_state(manually_stopped=True) # Persist only after cleanup succeeds. + self._manual_stop_pending = False self._set_container_state(ContainerState.PAUSED, StopReason.MANUAL_STOP) else: + # Keep the failed cleanup retryable without persisting a paused state + # that would later make config restarts look intentionally ignored. + self._clear_manual_stop_state() self._set_container_state(ContainerState.FAILED, StopReason.UNKNOWN) return else: @@ -1243,6 +1325,16 @@ def _handle_config_restart(self, restart_callable): ) return + if self._manual_stop_pending: + self.P( + "Manual STOP cleanup is still pending. Ignoring config restart; " + "send RESTART to override the pending stop intent.", + color='y', + ) + if self._cleanup_failed: + self._retry_failed_cleanup() + return + # Check persistent state as fallback (in case container_state not yet set) if self._load_manual_stop_state(): self.P( @@ -1367,6 +1459,112 @@ def get_cloudflare_protocol(self): return super(ContainerAppRunnerPlugin, self).get_cloudflare_protocol() + def _remember_process_group(self, process): + """ + Record tunnel process-group ids even when deployed with an older core. + + Newer cores provide this on ``BaseTunnelEnginePlugin``. Keeping a local + fallback lets the edge PR roll out before the matching core PR without + breaking extra tunnel startup after ``subprocess.Popen`` succeeds. 
+ """ + base_method = getattr(super(ContainerAppRunnerPlugin, self), "_remember_process_group", None) + if callable(base_method): + return base_method(process) + if process is not None and os.name != "nt": + try: + process._r1_process_group_id = os.getpgid(process.pid) + except Exception as exc: + self.P(f"Could not record tunnel process group: {exc}", color='r') + return process + + + def _terminate_subprocess_tree(self, process, label="subprocess", terminate_timeout=5, kill_timeout=5): + """ + Terminate a tunnel subprocess tree with a compatibility fallback. + + Prefer the core implementation when present; otherwise use the same bounded + POSIX process-group shutdown strategy locally so mixed-version deployments + do not leak extra tunnel children. + """ + base_method = getattr(super(ContainerAppRunnerPlugin, self), "_terminate_subprocess_tree", None) + if callable(base_method): + return base_method( + process, + label=label, + terminate_timeout=terminate_timeout, + kill_timeout=kill_timeout, + ) + if process is None: + return True + + pgid = getattr(process, "_r1_process_group_id", None) + + def is_process_group_alive(): + if os.name == "nt" or pgid is None: + return False + try: + os.killpg(pgid, 0) + return True + except ProcessLookupError: + return False + except Exception as exc: + self.P(f"Could not probe {label} process group {pgid}: {exc}", color='r') + return True + + def wait_process_tree(timeout): + deadline = time.monotonic() + timeout + process_stopped = process.poll() is not None + if not process_stopped: + try: + process.wait(timeout=timeout) + process_stopped = True + except subprocess.TimeoutExpired: + process_stopped = False + except Exception as exc: + self.P(f"Error waiting for {label}: {exc}", color='r') + process_stopped = process.poll() is not None + if os.name == "nt" or pgid is None: + return process_stopped + while time.monotonic() < deadline: + if not is_process_group_alive(): + return process_stopped + time.sleep(0.05) + return process_stopped and not is_process_group_alive() + + def send_signal(sig, fallback): + if os.name != "nt" and pgid is not None and sig is not None: + try: + os.killpg(pgid, sig) + return True + except ProcessLookupError: + return True + except Exception as exc: + self.P(f"Error signaling {label} process group {pgid}: {exc}", color='r') + if process.poll() is None: + try: + fallback() + return True + except Exception as exc: + self.P(f"Error signaling {label}: {exc}", color='r') + return False + return True + + if process.poll() is None or is_process_group_alive(): + if not send_signal(signal.SIGTERM, process.terminate): + return False + if wait_process_tree(terminate_timeout): + return True + + self.P(f"{label} did not stop after terminate; killing it.", color='r') + kill_signal = getattr(signal, "SIGKILL", None) + if not send_signal(kill_signal, process.kill): + return False + if wait_process_tree(kill_timeout): + return True + self.P(f"{label} did not exit after kill; continuing shutdown.", color='r') + return False + + def stop_tunnel_engine(self): """ Stop the main tunnel engine. @@ -2081,7 +2279,9 @@ def stop_container(self): Returns ------- bool - True when the container was stopped and removed, False otherwise. + True when there is no container or the container was removed from Docker. + False when Docker reported a remove failure and the container may still + exist/running. 
Notes ----- @@ -2092,34 +2292,40 @@ def stop_container(self): self.P("No container to stop", color='r') return True - result = True - removed = False + stopped_ok = True + removed_ok = True try: # Stop the container (gracefully) self.P(f"Stopping container {self.container.short_id}...") self.container.stop(timeout=5) self.P(f"Container {self.container.short_id} stopped successfully") except Exception as e: - result = False + stopped_ok = False self.P(f"Error stopping container: {e}", color='r') # end try try: self.P(f"Removing container {self.container.short_id}...") self.container.remove() - removed = True self.P(f"Container {self.container.short_id} removed successfully") except Exception as e: - result = False + removed_ok = False self.P(f"Error removing container: {e}", color='r') - finally: - if removed: - self.container = None - self.container_id = None - else: - self.P("Preserving container handle after failed stop/remove for retry.", color='r') + if removed_ok: + if not stopped_ok: + self.P( + "Container stop reported an error, but remove succeeded; treating " + "container as stopped for restart/cleanup purposes.", + color='y', + ) + self.container = None + self.container_id = None + else: + # Keep the handle so a later cleanup retry can remove the same Docker + # object instead of losing track of a possibly still-running container. + self.P("Preserving container handle after failed stop/remove for retry.", color='r') # end try - return removed + return removed_ok def _stream_logs(self, log_stream, stop_event=None): @@ -2713,22 +2919,22 @@ def _check_extra_tunnel_health(self): self._maybe_reset_tunnel_retry_counter(container_port) - def _stop_container_and_save_logs_to_disk(self): + def _stop_container_runtime_for_restart(self): """ - Stop the container and all tunnels, then save logs to disk. + Stop runtime sidecars and remove the Docker container. - Performs full shutdown sequence: + Performs the shared pre-restart shutdown sequence: - Clears semaphore (signals dependent plugins container is stopping) - Stops log streaming threads - Stops main tunnel engine - Stops all extra tunnels - Stops and removes container - - Saves logs to disk Returns ------- bool - True when cleanup completed without required-step failures, False otherwise. + True when the Docker container is stopped/removed or absent, False when + Docker reported a failure. """ self.P(f"Stopping container app '{self.container_id}' ...") @@ -2790,18 +2996,62 @@ def join_runtime_threads(): # Stop extra tunnels safe_cleanup_step("extra tunnels", self.stop_extra_tunnels) - # Stop the container if it's running - safe_cleanup_step("docker container", self.stop_container) + def stop_runtime_container(): + stopped = self.stop_container() + self._runtime_stop_degraded = not stopped + if not stopped: + self.P( + "Container runtime stop failed after sidecars were stopped; container " + "may still be running and volume mutation/cleanup must be skipped.", + color='r', + ) + return stopped + + # Stop the container if it's running. A false result preserves the Docker + # handle for retry and prevents volume mutation against a possibly live app. + safe_cleanup_step("docker container", stop_runtime_container) # Stop log streaming threads after Docker stop has had a chance to unblock # the log/exec streams. 
safe_cleanup_step("runtime threads", join_runtime_threads) - # Cleanup fixed-size volumes (unmount + detach loop devices) - safe_cleanup_step("fixed-size volumes", self._cleanup_fixed_size_volumes) + if cleanup_errors: + self._cleanup_failed = True + self.P("Container runtime cleanup completed with failed step(s): {}.".format(", ".join(cleanup_errors)), color='r') + return False + self._runtime_stop_degraded = False + self._cleanup_failed = False + return True + + + def _stop_container_and_save_logs_to_disk(self): + """ + Stop the container, all tunnels, fixed volumes, then save logs to disk. - # Save logs to disk under the instance's `logs/` sibling folder - # (resolves to pipelines_data/{sid}/{iid}/logs/container_logs.pkl) + Returns + ------- + bool + True when cleanup completed without required-step failures, False otherwise. + """ + runtime_ok = self._stop_container_runtime_for_restart() + cleanup_errors = [] + + if runtime_ok: + try: + if self._cleanup_fixed_size_volumes() is False: + cleanup_errors.append("fixed-size volumes") + except Exception as exc: + cleanup_errors.append("fixed-size volumes") + self.P(f"Container cleanup step 'fixed-size volumes' failed: {exc}", color='r') + else: + cleanup_errors.append("runtime") + self.P( + "Skipping fixed-size volume cleanup because container stop/remove failed.", + color='r', + ) + + # Save logs to disk even when cleanup is degraded; logs are diagnostic data + # and should not be lost just because Docker/tunnel teardown needs a retry. try: self.diskapi_save_pickle_to_data( obj=list(self.container_logs), @@ -2811,6 +3061,7 @@ def join_runtime_threads(): self.P("Container logs saved to disk.") except Exception as exc: self.P(f"Failed to save logs: {exc}", color='r') + if cleanup_errors: self._cleanup_failed = True self.P("Container cleanup completed with failed step(s): {}.".format(", ".join(cleanup_errors)), color='r') @@ -3113,6 +3364,36 @@ def _check_image_updates(self, current_time=None): return + def _reset_runtime_state_post_start(self): + """Bring per-restart runtime markers back to a fresh-boot baseline. + + Called after a successful ``start_container()`` to: + - stamp ``container_start_time`` so health-probe elapsed timers measure + from this new boot + - clear readiness gates (``_app_ready``, ``_health_probe_start``, + ``_tunnel_start_allowed``) so health checks re-run against the new + container's state and tunnels gate on the new readiness probe + - clear the command-rerun gate (``_commands_started``) so + BUILD_AND_RUN_COMMANDS rerun against the new container + - re-attach log capture (the prior log thread was stopped at + ``stop_container`` time) + - run image-defined build/run commands + + Shared between ``_restart_container`` and the volume-sync ticks + (``_SyncMixin._sync_safe_start_container``) so they stay in lockstep; + sync slices stop+start the container inline (to keep the system volume + mounted), and without this helper the readiness/probe state would still + point at the previous container instance. + """ + self.container_start_time = self.time() + self._app_ready = False + self._health_probe_start = None + self._tunnel_start_allowed = False + self._commands_started = False + self._start_container_log_stream() + self._maybe_execute_build_and_run() + + def _restart_container(self, stop_reason=None, cleanup_first=True): """ Restart the container from scratch. 
@@ -3169,6 +3450,9 @@ def _restart_container(self, stop_reason=None, cleanup_first=True): self._configure_volumes() self._configure_file_volumes() self._configure_fixed_size_volumes() + self._configure_system_volume() + self._recover_stale_processing() + self._validate_sync_config() # For semaphored containers (consumers), defer env setup and container start # to _handle_initial_launch() which properly waits for provider semaphores. @@ -3187,6 +3471,7 @@ def _restart_container(self, stop_reason=None, cleanup_first=True): # Non-semaphored containers (providers): configure env and start immediately self._configure_dynamic_env() self._setup_env_and_ports() + self._inject_sync_env_vars() # Revalidate extra tunnels self._validate_extra_tunnels_config() @@ -3205,9 +3490,7 @@ def _restart_container(self, stop_reason=None, cleanup_first=True): # start_container already recorded the failure return False - self.container_start_time = self.time() - self._start_container_log_stream() - self._maybe_execute_build_and_run() + self._reset_runtime_state_post_start() return True @@ -3370,6 +3653,7 @@ def _handle_initial_launch(self): # Semaphores ready - dynamic env to resolve shmem values, then setup env self._configure_dynamic_env() self._setup_env_and_ports() + self._inject_sync_env_vars() # end if try: @@ -3462,9 +3746,60 @@ def _perform_additional_checks(self, current_time): return StopReason.EXTERNAL_UPDATE return None """ + # Volume-sync drives stop_container/start_container INLINE so the loopback + # mount survives the archive/extract window. We must not return a + # StopReason from here because that would route through _restart_container, + # which calls _cleanup_fixed_size_volumes() and unmounts before our work + # can run. + if self._sync_enabled(): + role = self._sync_role() + if role == "provider": + self._sync_provider_tick(current_time) + elif role == "consumer": + self._sync_consumer_tick(current_time) return None + def _retry_failed_cleanup(self): + """ + Retry a previously failed cleanup cycle from the normal process loop. + + Cleanup failure is a backoff/retry state, not a permanent latch. This keeps + transient Docker, tunnel, log-reader, or fixed-volume failures visible while + still giving the plugin an automatic recovery path. + """ + if not self._cleanup_failed: + return True + + if self._has_exceeded_max_retries(): + self.P( + "Container cleanup retry abandoned after {} consecutive failure(s).".format( + self._consecutive_failures + ), + color='r', + ) + return False + + if self._is_restart_backoff_active(): + return False + + self.P("Retrying previously failed container cleanup...", color='y') + cleanup_ok = self._stop_container_and_save_logs_to_disk() + if cleanup_ok: + self._cleanup_failed = False + self.P("Previously failed container cleanup succeeded.", color='g') + if self._manual_stop_pending: + self._save_persistent_state(manually_stopped=True) + self._manual_stop_pending = False + self._set_container_state(ContainerState.PAUSED, StopReason.MANUAL_STOP) + return False + return True + + self._record_restart_failure() + self._set_container_state(ContainerState.FAILED, self.stop_reason or StopReason.UNKNOWN) + return False + + def process(self): """ Main process loop for the plugin. 
@@ -3496,11 +3831,7 @@ def process(self): self._last_paused_log = current_time return - if self._cleanup_failed: - current_time = self.time() - if current_time - self._last_paused_log >= self.cfg_paused_state_log_interval: - self.P("Container cleanup previously failed; periodic launch/restart is blocked until cleanup succeeds.") - self._last_paused_log = current_time + if self._cleanup_failed and not self._retry_failed_cleanup(): return if not self.container: diff --git a/extensions/business/container_apps/fixed_volume.py b/extensions/business/container_apps/fixed_volume.py index dd994e6f..6e38a5bc 100644 --- a/extensions/business/container_apps/fixed_volume.py +++ b/extensions/business/container_apps/fixed_volume.py @@ -48,6 +48,33 @@ def _log(logger: Optional[Callable], level: str, message: str) -> None: print(f"[FixedVolume] [{level}] {message}", flush=True) +def _decode_proc_mount_field(value: str) -> str: + """Decode the octal escapes used by /proc/mounts fields.""" + return (value.replace("\\040", " ") + .replace("\\011", "\t") + .replace("\\012", "\n") + .replace("\\134", "\\")) + + +def _get_mount_source(mount_path) -> Optional[str]: + """Return the exact source device mounted at ``mount_path``, if any.""" + try: + with open("/proc/mounts", "r", encoding="utf-8") as f: + lines = f.readlines() + except OSError: + return None + target = str(mount_path).rstrip("/") + for line in lines: + parts = line.split() + if len(parts) < 2: + continue + source = _decode_proc_mount_field(parts[0]) + mp = _decode_proc_mount_field(parts[1]) + if mp.rstrip("/") == target: + return source + return None + + def _is_path_mounted(mount_path) -> bool: """Return True iff `mount_path` is an exact mountpoint in /proc/mounts. @@ -72,11 +99,7 @@ def _is_path_mounted(mount_path) -> bool: parts = line.split() if len(parts) < 2: continue - mp = parts[1] - mp = (mp.replace("\\040", " ") - .replace("\\011", "\t") - .replace("\\012", "\n") - .replace("\\134", "\\")) + mp = _decode_proc_mount_field(parts[1]) if mp.rstrip("/") == target: return True return False @@ -400,16 +423,53 @@ def cleanup( ) result = True loop_dev = None + metadata_error = False if vol.meta_path.exists(): try: meta = json.loads(vol.meta_path.read_text(encoding="utf-8")) loop_dev = meta.get("loop_dev") _log(logger, "INFO", f"Loaded metadata loop_dev={loop_dev}") except Exception as exc: - result = False + metadata_error = True _log(logger, "WARN", f"Failed to read metadata error={exc}") - if _is_path_mounted(vol.mount_path): + mount_source = _get_mount_source(vol.mount_path) + mount_source_is_loop = mount_source and str(mount_source).startswith("/dev/loop") + if mount_source_is_loop and loop_dev is None: + # A mounted loop source is a stronger identity than the sidecar metadata: + # it lets us unmount and detach safely even when metadata was lost/corrupt. + loop_dev = mount_source + metadata_error = False + _log(logger, "WARN", f"Recovered loop device from /proc/mounts loop_dev={loop_dev}") + elif mount_source_is_loop and loop_dev != mount_source: + # Metadata can be stale after interrupted cleanup/restart. The mounted + # source is the device that must be detached after unmount, so prefer it. 
+ _log( + logger, "WARN", + f"Metadata loop_dev={loop_dev} differs from mounted source={mount_source}; using mounted source.", + ) + loop_dev = mount_source + elif mount_source and loop_dev is None: + # A mounted path without a positive loop-device identity must not be + # reported as a clean fixed-volume teardown; callers need to retain it for + # operator inspection/retry instead of dropping cleanup tracking. + result = False + _log(logger, "WARN", f"Mounted path has no loop metadata source={mount_source}") + elif mount_source: + # A fixed-size volume should be mounted from a loop device. If /proc/mounts + # says otherwise, fail closed instead of detaching a possibly unrelated + # metadata loop device and reporting success. + result = False + _log( + logger, "WARN", + f"Mounted path source is not a loop device source={mount_source}; refusing metadata loop detach.", + ) + loop_dev = None + + if metadata_error: + result = False + + if mount_source is not None: try: _run(["umount", str(vol.mount_path)], logger=logger) except Exception as exc: diff --git a/extensions/business/container_apps/sync/__init__.py b/extensions/business/container_apps/sync/__init__.py new file mode 100644 index 00000000..c9f06457 --- /dev/null +++ b/extensions/business/container_apps/sync/__init__.py @@ -0,0 +1,90 @@ +"""Volume-sync subpackage for the Container App Runner. + +The whole feature lives here: + * ``constants.py`` — file names, namespace strings, schema versions, + failure-stage labels. No code, just data. + * ``manager.py`` — ``SyncManager`` class plus host-side path helpers. + Pure I/O orchestration; takes the plugin as ``owner`` and delegates + network/storage to ``owner.r1fs`` / ``owner.chainstore_*``. + * ``mixin.py`` — ``_SyncMixin`` class. Plugin-class integration: + knows when sync work should happen (on_init, _restart_container, + _perform_additional_checks, _handle_initial_launch) and frames each + invocation around a ``stop_container → SyncManager.work → + start_container`` window. + +Re-exports below let callers import from the package root rather than +reaching into individual modules. 
+""" + +from .constants import ( + ARCHIVE_ENCRYPTION, + ARCHIVE_FORMAT, + CHAINSTORE_SYNC_HKEY, + MANIFEST_SCHEMA_VERSION, + STAGE_ARCHIVE_BUILD, + STAGE_CHAINSTORE_PUBLISH, + STAGE_EXTRACT, + STAGE_R1FS_UPLOAD, + STAGE_RUNTIME_STOP, + STAGE_VALIDATION, + SYNC_HISTORY_DIR, + SYNC_HISTORY_RECEIVED, + SYNC_HISTORY_SENT, + SYNC_INVALID_FILE, + SYNC_LAST_APPLY_FILE, + SYNC_PROCESSING_FILE, + SYNC_REQUEST_FILE, + SYNC_RESPONSE_FILE, + SYSTEM_VOLUME_FS, + SYSTEM_VOLUME_MOUNT, + SYSTEM_VOLUME_NAME, + SYSTEM_VOLUME_SIZE, + VOLUME_SYNC_SUBDIR, +) +from .manager import ( + SyncManager, + history_received_dir, + history_root, + history_sent_dir, + sync_state_dir, + system_volume_host_root, + volume_sync_dir, +) +from .mixin import _SyncMixin + +__all__ = [ + # constants + "ARCHIVE_ENCRYPTION", + "ARCHIVE_FORMAT", + "CHAINSTORE_SYNC_HKEY", + "MANIFEST_SCHEMA_VERSION", + "STAGE_ARCHIVE_BUILD", + "STAGE_CHAINSTORE_PUBLISH", + "STAGE_EXTRACT", + "STAGE_R1FS_UPLOAD", + "STAGE_RUNTIME_STOP", + "STAGE_VALIDATION", + "SYNC_HISTORY_DIR", + "SYNC_HISTORY_RECEIVED", + "SYNC_HISTORY_SENT", + "SYNC_INVALID_FILE", + "SYNC_LAST_APPLY_FILE", + "SYNC_PROCESSING_FILE", + "SYNC_REQUEST_FILE", + "SYNC_RESPONSE_FILE", + "SYSTEM_VOLUME_FS", + "SYSTEM_VOLUME_MOUNT", + "SYSTEM_VOLUME_NAME", + "SYSTEM_VOLUME_SIZE", + "VOLUME_SYNC_SUBDIR", + # path helpers + "history_received_dir", + "history_root", + "history_sent_dir", + "sync_state_dir", + "system_volume_host_root", + "volume_sync_dir", + # classes + "SyncManager", + "_SyncMixin", +] diff --git a/extensions/business/container_apps/sync/constants.py b/extensions/business/container_apps/sync/constants.py new file mode 100644 index 00000000..255e36c1 --- /dev/null +++ b/extensions/business/container_apps/sync/constants.py @@ -0,0 +1,53 @@ +"""Volume-sync constants and namespace conventions. + +Hard-coded values (no config knobs) shared by ``SyncManager``, +``_SyncMixin``, and the unit tests. Lives in its own module so a reader +can `cat sync/constants.py` to see the full data-plane vocabulary in one +place — file names, the ChainStore hkey, the stage labels, the schema +version. Anything tunable belongs in the plugin's ``SYNC`` config block, +not here. +""" + +# --------------------------------------------------------------------------- +# System volume — non-configurable defaults +# --------------------------------------------------------------------------- + +SYSTEM_VOLUME_NAME = "r1en_system" # logical name (host paths) +SYSTEM_VOLUME_MOUNT = "/r1en_system" # mount point inside container +SYSTEM_VOLUME_SIZE = "10M" # fixed-size ext4 image — control-plane only +SYSTEM_VOLUME_FS = "ext4" + +# Per-feature subdirectory under the system volume root, so future CAR ↔ app +# control-plane features (not just sync) can coexist without colliding. 
+VOLUME_SYNC_SUBDIR = "volume-sync" + +# Filenames inside // +SYNC_REQUEST_FILE = "request.json" +SYNC_PROCESSING_FILE = "request.json.processing" +SYNC_INVALID_FILE = "request.json.invalid" +SYNC_RESPONSE_FILE = "response.json" +SYNC_LAST_APPLY_FILE = "last_apply.json" + +# Persistent audit folders under /sync_history/ +SYNC_HISTORY_DIR = "sync_history" +SYNC_HISTORY_SENT = "sent" # provider — writes to R1FS +SYNC_HISTORY_RECEIVED = "received" # consumer — reads from R1FS + +# ChainStore namespace +CHAINSTORE_SYNC_HKEY = "CHAINSTORE_SYNC" + +# Manifest schema versioning so consumers can refuse newer-than-known formats +MANIFEST_SCHEMA_VERSION = 1 +ARCHIVE_FORMAT = "tar.gz" +ARCHIVE_ENCRYPTION = "r1fs-default" + +# Stages reported on failure (used in response.json + request.json.invalid) +STAGE_VALIDATION = "validation" +STAGE_ARCHIVE_BUILD = "archive_build" +STAGE_R1FS_UPLOAD = "r1fs_upload" +STAGE_CHAINSTORE_PUBLISH = "chainstore_publish" +STAGE_EXTRACT = "extract" +STAGE_RUNTIME_STOP = "runtime_stop" + +# History entry deletion sub-record default (filled in when superseded). +_UNDELETED = {"deleted_at": None, "deletion_succeeded": None, "deletion_error": None} diff --git a/extensions/business/container_apps/sync/control_files.py b/extensions/business/container_apps/sync/control_files.py new file mode 100644 index 00000000..b51d3e3b --- /dev/null +++ b/extensions/business/container_apps/sync/control_files.py @@ -0,0 +1,327 @@ +"""Helpers for CAR volume-sync JSON control files. + +The sync data plane uses small JSON files in the always-mounted system +volume as a control protocol between the app and CAR. This module owns the +file mechanics: atomic JSON writes, pending-to-processing claims, stale +processing recovery, and processing cleanup. SyncManager keeps the domain +validation and response payload shapes. +""" + +from __future__ import annotations + +import errno +import json +import os +import stat +import tempfile +import time as _time +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Optional + + +@dataclass(frozen=True) +class ClaimedJsonObject: + """A JSON object claimed from a pending control file.""" + + body: dict + raw_body: str + processing_path: Path + + +class JsonControlFileError(Exception): + """Base class for control-file mechanics errors.""" + + def __init__( + self, + message: str, + *, + raw_body: Optional[str] = None, + processing_path: Optional[Path] = None, + ) -> None: + super().__init__(message) + self.raw_body = raw_body + self.processing_path = processing_path + + +class JsonControlFileClaimError(JsonControlFileError): + """The pending file could not be renamed to its processing name.""" + + +class JsonControlFileReadError(JsonControlFileError): + """The processing file could not be read.""" + + +class JsonControlFileDecodeError(JsonControlFileError): + """The processing file was not valid JSON.""" + + +class JsonControlFileObjectError(JsonControlFileError): + """The processing file was JSON, but not a JSON object.""" + + +class JsonControlFileUnsafeError(JsonControlFileError): + """The processing file is not a regular no-follow-readable file.""" + + +def _ensure_real_directory(path: Path, *, create: bool) -> bool: + """Ensure ``path`` is a real directory, not a symlink. + + Returns False only when ``create`` is False and the path is absent. 
+ """ + path = Path(path) + if create: + try: + path.mkdir(parents=True, exist_ok=True) + except FileExistsError: + pass + try: + st = os.lstat(str(path)) + except FileNotFoundError: + if create: + raise + return False + if stat.S_ISLNK(st.st_mode): + raise JsonControlFileUnsafeError( + f"refusing symlink control directory: {path}" + ) + if not stat.S_ISDIR(st.st_mode): + raise JsonControlFileUnsafeError( + f"refusing non-directory control directory: {path}" + ) + return True + + +def write_json_atomic(path: Path, payload: Any) -> None: + """Write JSON to ``path`` atomically and make it app-readable. + + Creates the parent directory if missing. Uses a temporary file in the same + directory so ``os.replace`` is atomic within the filesystem. The final file + is chmod'd to 0o644 so apps can read CAR-owned status/control results + without being able to rewrite them. + """ + path = Path(path) + _ensure_real_directory(path.parent, create=True) + fd, tmp_name = tempfile.mkstemp( + dir=str(path.parent), prefix=f".{path.name}.", suffix=".tmp" + ) + try: + with os.fdopen(fd, "w", encoding="utf-8") as handle: + json.dump(payload, handle, indent=2, sort_keys=True) + handle.flush() + os.fsync(handle.fileno()) + os.chmod(tmp_name, 0o644) + os.replace(tmp_name, str(path)) + except Exception: + try: + os.unlink(tmp_name) + except OSError: + pass + raise + + +class JsonControlFile: + """File-mechanics helper for a single pending/processing JSON control file.""" + + def __init__(self, root: Path, pending_name: str, processing_name: str): + self.root = Path(root) + self.pending_name = pending_name + self.processing_name = processing_name + + @property + def pending_path(self) -> Path: + return self.root / self.pending_name + + @property + def processing_path(self) -> Path: + return self.root / self.processing_name + + def _root_is_safe(self) -> bool: + return _ensure_real_directory(self.root, create=False) + + def has_pending(self) -> bool: + if not self._root_is_safe(): + return False + try: + os.lstat(str(self.pending_path)) + return True + except FileNotFoundError: + return False + + @staticmethod + def _quarantine_directory(path: Path) -> None: + parent = path.parent + base_name = path.name + for _ in range(5): + target = parent / f"{base_name}.unsafe.{_time.time_ns()}" + try: + os.replace(str(path), str(target)) + return + except FileExistsError: + continue + raise JsonControlFileUnsafeError( + f"could not quarantine unsafe control directory: {base_name}", + processing_path=path, + ) + + @classmethod + def _remove_unsafe_entry(cls, path: Path) -> None: + st = os.lstat(str(path)) + if stat.S_ISDIR(st.st_mode): + try: + os.rmdir(str(path)) + except OSError as exc: + if getattr(exc, "errno", None) in (errno.ENOTEMPTY, errno.EEXIST): + cls._quarantine_directory(path) + return + raise + return + os.unlink(str(path)) + + @classmethod + def _reject_non_regular_control_file(cls, path: Path) -> None: + try: + st = os.lstat(str(path)) + except FileNotFoundError: + raise + if stat.S_ISREG(st.st_mode): + return + if stat.S_ISLNK(st.st_mode): + message = f"refusing symlink control file: {path.name}" + else: + message = f"refusing non-regular control file: {path.name}" + try: + cls._remove_unsafe_entry(path) + finally: + raise JsonControlFileUnsafeError( + message, + processing_path=path, + ) + + @staticmethod + def _read_text_no_follow(path: Path) -> str: + flags = os.O_RDONLY + if hasattr(os, "O_NOFOLLOW"): + flags |= os.O_NOFOLLOW + if hasattr(os, "O_NONBLOCK"): + flags |= os.O_NONBLOCK + + fd: Optional[int] = 
None + try: + fd = os.open(str(path), flags) + st = os.fstat(fd) + if not stat.S_ISREG(st.st_mode): + raise JsonControlFileUnsafeError( + f"refusing to read non-regular control file: {path.name}", + processing_path=path, + ) + with os.fdopen(fd, "r", encoding="utf-8") as handle: + fd = None + return handle.read() + except OSError as exc: + if getattr(exc, "errno", None) == errno.ELOOP: + raise JsonControlFileUnsafeError( + f"refusing to read symlink control file: {path.name}", + processing_path=path, + ) from exc + raise + finally: + if fd is not None: + try: + os.close(fd) + except OSError: + pass + + def claim_processing(self) -> Optional[Path]: + """Atomically rename pending -> processing, returning the processing path.""" + if not self.has_pending(): + return None + self._reject_non_regular_control_file(self.pending_path) + try: + os.replace(str(self.pending_path), str(self.processing_path)) + except OSError as exc: + raise JsonControlFileClaimError( + str(exc), processing_path=self.processing_path, + ) from exc + return self.processing_path + + def claim_object(self) -> Optional[ClaimedJsonObject]: + """Claim a pending JSON object control file. + + Returns None when no pending file exists. Raises a JsonControlFileError + subclass for mechanics, JSON decode, or JSON-shape failures. On decode or + shape failure, the processing file remains in place so callers can write + their own failure artifacts and then discard it. + """ + processing_path = self.claim_processing() + if processing_path is None: + return None + + try: + raw_body = self._read_text_no_follow(processing_path) + except UnicodeDecodeError as exc: + raise JsonControlFileDecodeError( + f"invalid UTF-8 in {self.pending_name}: {exc}", + processing_path=processing_path, + ) from exc + except OSError as exc: + raise JsonControlFileReadError( + str(exc), processing_path=processing_path, + ) from exc + + try: + body = json.loads(raw_body) + except json.JSONDecodeError as exc: + raise JsonControlFileDecodeError( + str(exc), raw_body=raw_body, processing_path=processing_path, + ) from exc + + if not isinstance(body, dict): + raise JsonControlFileObjectError( + f"{self.pending_name} must be a JSON object", + raw_body=raw_body, + processing_path=processing_path, + ) + + return ClaimedJsonObject( + body=body, + raw_body=raw_body, + processing_path=processing_path, + ) + + def discard_processing(self) -> None: + if os.path.lexists(str(self.processing_path)): + self._remove_unsafe_entry(self.processing_path) + + def recover_stale_processing(self) -> bool: + """Rename orphan processing -> pending without overwriting a pending file.""" + if not self._root_is_safe(): + return False + if not os.path.lexists(str(self.processing_path)): + return False + st = os.lstat(str(self.processing_path)) + if stat.S_ISLNK(st.st_mode): + os.unlink(str(self.processing_path)) + return False + if stat.S_ISREG(st.st_mode) and not os.path.lexists(str(self.pending_path)): + os.replace(str(self.processing_path), str(self.pending_path)) + return True + if not stat.S_ISREG(st.st_mode): + self._remove_unsafe_entry(self.processing_path) + return False + + def write_json(self, file_name: str, payload: Any) -> None: + write_json_atomic(self.root / file_name, payload) + + +__all__ = [ + "ClaimedJsonObject", + "JsonControlFile", + "JsonControlFileClaimError", + "JsonControlFileDecodeError", + "JsonControlFileError", + "JsonControlFileObjectError", + "JsonControlFileReadError", + "JsonControlFileUnsafeError", + "write_json_atomic", +] diff --git 
a/extensions/business/container_apps/sync/manager.py b/extensions/business/container_apps/sync/manager.py new file mode 100644 index 00000000..aacc2117 --- /dev/null +++ b/extensions/business/container_apps/sync/manager.py @@ -0,0 +1,1959 @@ +"""Volume-sync manager for the Container App Runner. + +Coordinates publishing app-state snapshots to R1FS+ChainStore (provider) and +applying them on remote nodes (consumer). The contract with the app inside +the container is file-based, mediated through the always-on system volume +mounted at ``/r1en_system``: + + app writes /r1en_system/volume-sync/request.json (one-shot) + CAR writes /r1en_system/volume-sync/response.json (provider, paired) + CAR writes /r1en_system/volume-sync/last_apply.json (consumer) + CAR writes /r1en_system/volume-sync/request.json.invalid (failed request body + diagnostics) + +Persistent per-plugin audit trail lives under +``/sync_history/{sent,received}/__.json`` +so both sides can be inspected with ``ls`` / ``cat`` / ``jq`` after the fact. + +See ``extensions/business/container_apps/README.md`` for the public +operator/app contract. +""" + +from __future__ import annotations + +import json +import os +import copy +import hashlib +import stat +import tarfile +import tempfile +import time as _time +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Optional + +from extensions.business.container_apps.container_utils import ( + CONTAINER_VOLUMES_PATH, +) + +from .control_files import ( + JsonControlFile, + JsonControlFileClaimError, + JsonControlFileDecodeError, + JsonControlFileObjectError, + JsonControlFileReadError, + JsonControlFileUnsafeError, + write_json_atomic, +) + +_HISTORY_WRITTEN_AT_NS = "history_written_at_ns" +_SYNC_STATE_DIR = "state" +_SYNC_APPLY_STATE_FILE = "current_apply.json" +_SYNC_QUARANTINE_DIR = "quarantine" +_BAD_CID_RETRY_BASE_SECONDS = 60.0 +_BAD_CID_RETRY_MAX_SECONDS = 3600.0 +PROVIDER_CAPTURE_OFFLINE = "offline" +PROVIDER_CAPTURE_ONLINE = "online" +CONSUMER_APPLY_OFFLINE_RESTART = "offline_restart" +CONSUMER_APPLY_ONLINE_NO_RESTART = "online_no_restart" +CONSUMER_APPLY_ONLINE_RESTART = "online_restart" +_PROVIDER_CAPTURE_MODES = {PROVIDER_CAPTURE_OFFLINE, PROVIDER_CAPTURE_ONLINE} +_CONSUMER_APPLY_MODES = { + CONSUMER_APPLY_OFFLINE_RESTART, + CONSUMER_APPLY_ONLINE_NO_RESTART, + CONSUMER_APPLY_ONLINE_RESTART, +} + +from .constants import ( + ARCHIVE_ENCRYPTION, + ARCHIVE_FORMAT, + CHAINSTORE_SYNC_HKEY, + MANIFEST_SCHEMA_VERSION, + STAGE_ARCHIVE_BUILD, + STAGE_CHAINSTORE_PUBLISH, + STAGE_EXTRACT, + STAGE_R1FS_UPLOAD, + STAGE_VALIDATION, + SYNC_HISTORY_DIR, + SYNC_HISTORY_RECEIVED, + SYNC_HISTORY_SENT, + SYNC_INVALID_FILE, + SYNC_LAST_APPLY_FILE, + SYNC_PROCESSING_FILE, + SYNC_REQUEST_FILE, + SYNC_RESPONSE_FILE, + SYSTEM_VOLUME_FS, + SYSTEM_VOLUME_MOUNT, + SYSTEM_VOLUME_NAME, + SYSTEM_VOLUME_SIZE, + VOLUME_SYNC_SUBDIR, + _UNDELETED, +) + + +@dataclass(frozen=True) +class SyncRuntimePolicy: + provider_capture: str = PROVIDER_CAPTURE_OFFLINE + consumer_apply: str = CONSUMER_APPLY_OFFLINE_RESTART + + +@dataclass(frozen=True) +class SyncRequest: + archive_paths: list[str] + metadata: dict + runtime: SyncRuntimePolicy + + +@dataclass(frozen=True) +class PlannedApplyMember: + container_name: str + host_path: str + host_root: str + staging_path: Optional[Path] + mode: int + is_dir: bool + + +@dataclass(frozen=True) +class PreparedApply: + record: dict + cid: str + version: int + local_path: str + staging_dir: Path + members: list[PlannedApplyMember] + manifest: dict + + 
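+# Illustrative request.json that an app might drop into the volume-sync
+# directory and that ``claim_request`` (below) validates into a ``SyncRequest``.
+# A sketch only: the field names come from ``SyncRequest`` and
+# ``_parse_runtime_policy``; the paths and metadata values are hypothetical.
+#
+#   {
+#     "archive_paths": ["/app/data"],
+#     "metadata": {"app_version": "1.2.3"},
+#     "runtime": {
+#       "provider_capture": "offline",
+#       "consumer_apply": "offline_restart"
+#     }
+#   }
+
+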
+@dataclass(frozen=True) +class ApplyResult: + success: bool + restart_safe: bool + state: str + extracted_paths: list[str] + error: Optional[str] = None + + +@dataclass(frozen=True) +class DirectoryMetadata: + uid: int + gid: int + mode: int + + +@dataclass(frozen=True) +class ApplyRollbackOp: + op: str + path: str + backup: Optional[str] = None + metadata: Optional[DirectoryMetadata] = None + + +def runtime_policy_to_dict(runtime: SyncRuntimePolicy) -> dict: + return { + "provider_capture": runtime.provider_capture, + "consumer_apply": runtime.consumer_apply, + } + + +# --------------------------------------------------------------------------- +# Path helpers (host-side) +# --------------------------------------------------------------------------- + +def system_volume_host_root(owner) -> Path: + """Host-side root of the system volume's loopback mount. + + The system volume is provisioned via the same machinery as + FIXED_SIZE_VOLUMES, so its mount lives at: + /fixed_volumes/mounts// + """ + return ( + Path(owner.get_data_folder()) + / owner._get_instance_data_subfolder() + / "fixed_volumes" / "mounts" / SYSTEM_VOLUME_NAME + ) + + +def volume_sync_dir(owner) -> Path: + """Host-side path of the volume-sync control-plane subdir.""" + return system_volume_host_root(owner) / VOLUME_SYNC_SUBDIR + + +def history_root(owner) -> Path: + """Host-side root of the per-plugin sync history folders.""" + return ( + Path(owner.get_data_folder()) + / owner._get_instance_data_subfolder() + / SYNC_HISTORY_DIR + ) + + +def history_sent_dir(owner) -> Path: + return history_root(owner) / SYNC_HISTORY_SENT + + +def history_received_dir(owner) -> Path: + return history_root(owner) / SYNC_HISTORY_RECEIVED + + +def sync_state_dir(owner) -> Path: + """Host-private sync state root; never mounted into the app container.""" + return history_root(owner) / _SYNC_STATE_DIR + + +def apply_state_path(owner) -> Path: + return sync_state_dir(owner) / _SYNC_APPLY_STATE_FILE + + +def quarantine_dir(owner) -> Path: + return sync_state_dir(owner) / _SYNC_QUARANTINE_DIR + + +# --------------------------------------------------------------------------- +# SyncManager +# --------------------------------------------------------------------------- + +class SyncManager: + """Pure orchestration layer driven by ``_SyncMixin`` ticks. + + All file I/O is rooted at host-side paths derived from the plugin's per- + instance data folder. Network/storage operations are delegated to the + plugin's ``self.r1fs`` and ``self.chainstore_*`` APIs. + + Required attributes on ``owner``: + - P, time (BasePlugin) + - get_data_folder, _get_instance_data_subfolder (BasePlugin) + - volumes (dict, populated by CAR) + - r1fs (R1FSEngine) + - chainstore_hset, chainstore_hget, chainstore_hsync (BasePlugin API) + - cfg_sync_key, cfg_sync_type (CAR config — propagated by mixin) + - ee_id (BasePlugin — node identity) + """ + + # Fallback used by fetch_latest when the owner doesn't expose + # cfg_sync_hsync_poll_interval (e.g. test fixtures or older configs). + # Mirrors _SyncMixin._HSYNC_POLL_INTERVAL_DEFAULT. + _DEFAULT_HSYNC_POLL_INTERVAL = 60.0 + _DEFAULT_HSYNC_FAILURE_RETRY_INTERVAL = 30.0 + + def __init__(self, owner): + self.owner = owner + # Timestamp (owner.time() units) of the last hsync attempt. Initial 0 + # guarantees the first ``fetch_latest`` call still hsyncs. 
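+        # (``fetch_latest`` re-runs ``chainstore_hsync`` only when
+        # now - self._last_hsync reaches the configured SYNC.HSYNC_POLL_INTERVAL.)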
+ self._last_hsync = 0.0 + + def _request_control_file(self) -> JsonControlFile: + return JsonControlFile( + volume_sync_dir(self.owner), SYNC_REQUEST_FILE, SYNC_PROCESSING_FILE + ) + + @staticmethod + def _validate_container_path_shape(container_path: str) -> None: + if not isinstance(container_path, str) or not container_path: + raise ValueError(f"archive_paths entry must be a non-empty string: {container_path!r}") + + parts = container_path.split("/") + if any(p == ".." for p in parts): + raise ValueError(f"archive_paths entries must not contain '..': {container_path!r}") + + cp = os.path.normpath(container_path) + if not cp.startswith("/"): + raise ValueError(f"archive_paths entries must be absolute: {container_path!r}") + + if cp == SYSTEM_VOLUME_MOUNT or cp.startswith(SYSTEM_VOLUME_MOUNT + "/"): + raise ValueError( + f"refusing to archive system volume content (anti-recursion): {container_path!r}" + ) + return + + # ----- path resolution ------------------------------------------------- + def resolve_container_path(self, container_path: str) -> tuple[str, str, str]: + """Map an app-perspective absolute path to a host path via owner.volumes. + + Enforces the six-rule check from the plan: + 1. absolute, 2. covered by a mount, 3. backed by a volume-managed + mount (fixed-size OR legacy VOLUMES — both are per-instance host + directories under known roots; anonymous Docker mounts and ephemeral + container fs are still rejected), 4. not inside the system volume, + 5. no ``..`` after normalization, 6. resolved host path stays within + its host_root. + + Returns ``(host_path, bind_root, host_root)`` on success, raises + ``ValueError`` on any rule violation. + """ + self._validate_container_path_shape(container_path) + cp = os.path.normpath(container_path) + + # Rule 3 allow-list — both eligible roots are bounded, per-instance, and + # inside the edge node's data root: + # - fixed_volumes/mounts/ : FIXED_SIZE_VOLUMES (ext4 loopbacks) + # - CONTAINER_VOLUMES_PATH : legacy VOLUMES (raw bind dirs, deprecated + # but still in use by some pipelines). These are functionally + # equivalent for sync purposes: a per-instance host directory + # identified by a known parent root. + # Anonymous Docker mounts, FILE_VOLUMES (content-injected single files), + # and ephemeral container fs all sit outside both roots and are rejected. + fixed_root_marker = os.sep + os.path.join("fixed_volumes", "mounts") + os.sep + legacy_root_marker = os.path.normpath(CONTAINER_VOLUMES_PATH) + os.sep + + # Collect every mount whose bind prefix covers cp, then pick the longest. + # Docker overlays the more specific mount on top of the broader one inside + # the container (e.g. /app/data is shadowed onto /app), so the longest- + # prefix match is the one that actually serves reads/writes for cp. The + # previous first-match-wins iteration used dict insertion order, which has + # no relationship to overlay specificity and could resolve to the wrong + # host root for nested mounts. + volumes = getattr(self.owner, "volumes", {}) or {} + matches: list[tuple[str, str]] = [] + for host_root, spec in volumes.items(): + if not isinstance(spec, dict): + continue + bind = str(spec.get("bind", "")).rstrip("/") + if not bind: + continue + # Rule 2: container path must fall under this mount's bind point. 
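+            # (Nested mounts can both cover cp, e.g. binds at /app and
+            # /app/data for /app/data/x; every covering mount is collected
+            # here and the longest bind wins in the max() below.)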
+ if cp != bind and not cp.startswith(bind + "/"): + continue + matches.append((str(host_root), bind)) + + if not matches: + raise ValueError(f"no mounted volume covers {container_path!r}") + + host_root, bind = max(matches, key=lambda hb: len(hb[1])) + host_root_n = os.path.normpath(host_root) + # Rule 3: the winning mount's host root must fall under a known + # volume-managed root (fixed-size or legacy VOLUMES). See the allow-list + # construction above for the rationale and the list of rejected cases. + host_root_with_sep = host_root_n + os.sep + if not ( + fixed_root_marker in host_root_with_sep + or host_root_with_sep.startswith(legacy_root_marker) + ): + raise ValueError( + f"refusing non-volume-backed mount for {container_path!r}: " + f"host_root={host_root_n!r} (only FIXED_SIZE_VOLUMES or legacy " + f"VOLUMES paths allowed; expected host root under " + f"{fixed_root_marker.strip(os.sep)!r} or " + f"{CONTAINER_VOLUMES_PATH!r})" + ) + + rel = "" if cp == bind else os.path.relpath(cp, bind) + host_path = os.path.normpath(os.path.join(host_root_n, rel)) + # Rule 6: resolved path must stay within host_root. + if not (host_path == host_root_n or host_path.startswith(host_root_n + os.sep)): + raise ValueError( + f"resolved host path escapes mount root: {container_path!r} -> {host_path!r}" + ) + return host_path, bind, host_root_n + + @staticmethod + def _is_within_root(path: str, root: str) -> bool: + path_n = os.path.normpath(path) + root_n = os.path.normpath(root) + return path_n == root_n or path_n.startswith(root_n + os.sep) + + @staticmethod + def _archive_arcname(container_root: str, rel_path: str) -> str: + root = os.path.normpath(container_root) + if rel_path in ("", "."): + return root + return os.path.normpath(os.path.join(root, rel_path)) + + @staticmethod + def _safe_extract_mode(member_mode: int, *, is_dir: bool) -> int: + normal_bits = member_mode & 0o777 + minimum = 0o755 if is_dir else 0o644 + return normal_bits | minimum + + def _validate_archive_source_path( + self, + host_path: str, + host_root: str, + container_path: str, + ) -> int: + """Validate an offline archive source without following symlinks.""" + host_path_n = os.path.normpath(host_path) + host_root_n = os.path.normpath(host_root) + if not self._is_within_root(host_path_n, host_root_n): + raise ValueError( + f"archive source escapes volume root: {container_path!r} -> {host_path_n!r}" + ) + rel = os.path.relpath(host_path_n, host_root_n) + current = host_root_n + for part in [] if rel == "." 
else rel.split(os.sep): + current = os.path.join(current, part) + try: + st = os.lstat(current) + except FileNotFoundError as exc: + raise FileNotFoundError( + f"archive_paths target does not exist on host: " + f"{container_path!r} -> {host_path_n!r}" + ) from exc + if stat.S_ISLNK(st.st_mode): + raise ValueError( + f"archive source contains symlink: {container_path!r} -> {current!r}" + ) + root_real = os.path.realpath(host_root_n) + path_real = os.path.realpath(host_path_n) + if not self._is_within_root(path_real, root_real): + raise ValueError( + f"archive source escapes volume root: {container_path!r} -> {host_path_n!r}" + ) + return os.lstat(host_path_n).st_mode + + def _add_offline_archive_path( + self, + tar: tarfile.TarFile, + container_path: str, + host_path: str, + host_root: str, + ) -> None: + mode = self._validate_archive_source_path( + host_path, host_root, container_path + ) + if stat.S_ISREG(mode): + tar.add(host_path, arcname=os.path.normpath(container_path), recursive=False) + return + if not stat.S_ISDIR(mode): + raise ValueError( + f"archive source is not a regular file or directory: {container_path!r}" + ) + + for current_root, dirnames, filenames in os.walk( + host_path, topdown=True, followlinks=False + ): + rel_root = os.path.relpath(current_root, host_path) + current_container = self._archive_arcname(container_path, rel_root) + current_mode = self._validate_archive_source_path( + current_root, host_root, current_container + ) + if stat.S_ISLNK(current_mode): + raise ValueError( + f"archive source contains symlink: {current_container!r}" + ) + tar.add(current_root, arcname=current_container, recursive=False) + + kept_dirs: list[str] = [] + for name in dirnames: + child = os.path.join(current_root, name) + child_container = self._archive_arcname( + container_path, os.path.relpath(child, host_path) + ) + child_mode = self._validate_archive_source_path( + child, host_root, child_container + ) + if stat.S_ISLNK(child_mode): + raise ValueError( + f"archive source contains symlink: {child_container!r}" + ) + if not stat.S_ISDIR(child_mode): + raise ValueError( + f"archive source is not a directory: {child_container!r}" + ) + kept_dirs.append(name) + dirnames[:] = kept_dirs + + for name in filenames: + child = os.path.join(current_root, name) + child_container = self._archive_arcname( + container_path, os.path.relpath(child, host_path) + ) + child_mode = self._validate_archive_source_path( + child, host_root, child_container + ) + if not stat.S_ISREG(child_mode): + raise ValueError( + f"archive source is not a regular file: {child_container!r}" + ) + tar.add(child, arcname=child_container, recursive=False) + + # ----- atomic I/O ------------------------------------------------------- + def _write_json_atomic(self, path: Path, payload: Any) -> None: + """Write JSON to ``path`` atomically (tmp + ``os.replace``). + + Creates the parent directory if missing. Uses a NamedTemporaryFile in + the same directory so ``os.replace`` is an atomic rename within one + filesystem. The final file is chmod'd to 0o644 because CAR runs as + root inside the edge node but the app inside the container typically + runs as a non-root user. Apps can read response.json / last_apply.json / + request.json.invalid, but cannot rewrite CAR-owned outputs. + """ + write_json_atomic(path, payload) + + # ----- history --------------------------------------------------------- + @staticmethod + def _history_filename(version: int, cid: str) -> str: + """Build the canonical filename for a history entry. 
+ + ``<10-digit-version>__<12-char-cid>.json`` so lexical sort matches + chronological order (version is a Unix timestamp). + """ + short_cid = (cid or "")[:12] or "no_cid" + # safe_path_component-like sanitisation kept simple — CIDs are base58. + safe_short = "".join(ch if ch.isalnum() else "_" for ch in short_cid) + return f"{int(version):010d}__{safe_short}.json" + + def _ensure_history_dirs(self) -> None: + history_sent_dir(self.owner).mkdir(parents=True, exist_ok=True) + history_received_dir(self.owner).mkdir(parents=True, exist_ok=True) + + def _append_history(self, history_dir: Path, entry: dict) -> Path: + self._ensure_history_dirs() + fname = self._history_filename(entry.get("version", 0), entry.get("cid", "")) + path = history_dir / fname + payload = dict(entry) + payload.setdefault(_HISTORY_WRITTEN_AT_NS, _time.time_ns()) + payload.setdefault("deletion", dict(_UNDELETED)) + self._write_json_atomic(path, payload) + return path + + def _read_history_entries(self, history_dir: Path) -> list[tuple[Path, dict, int]]: + """Read history JSON files with stable insertion-order metadata. + + ``history_written_at_ns`` is set when an entry is first appended and is + preserved by deletion updates. Older history files fall back to mtime. + """ + entries = [] + if not history_dir.is_dir(): + return entries + for path in history_dir.iterdir(): + if path.suffix != ".json": + continue + try: + with path.open("r", encoding="utf-8") as handle: + entry = json.load(handle) + except (OSError, json.JSONDecodeError) as exc: + self.owner.P(f"[sync] failed to read history file {path}: {exc}", color="r") + continue + written_at = entry.get(_HISTORY_WRITTEN_AT_NS) + if not isinstance(written_at, int): + written_at = path.stat().st_mtime_ns + entries.append((path, entry, written_at)) + return entries + + def append_sent(self, entry: dict) -> Path: + """Write a provider history entry to sync_history/sent/.""" + return self._append_history(history_sent_dir(self.owner), entry) + + def append_received(self, entry: dict) -> Path: + """Write a consumer history entry to sync_history/received/.""" + return self._append_history(history_received_dir(self.owner), entry) + + def _latest_in(self, history_dir: Path) -> Optional[dict]: + """Return the most recently *written* history entry. + + Sorts by the append-time marker, not by filename. Filenames are + version-prefixed for chronological browsability under normal operation, + but the consumer's "what did I last apply?" question is about insert + order, not about whatever ``version`` happens to be in the entry. + Older files without that marker fall back to mtime. 
+ """ + entries = self._read_history_entries(history_dir) + if not entries: + return None + _, latest, _ = max(entries, key=lambda item: item[2]) + return latest + + def latest_sent(self) -> Optional[dict]: + """Return the most recent provider history entry, or None if empty.""" + return self._latest_in(history_sent_dir(self.owner)) + + def latest_received(self) -> Optional[dict]: + """Return the most recent consumer history entry, or None if empty.""" + return self._latest_in(history_received_dir(self.owner)) + + def _write_apply_state( + self, + state: str, + record: dict, + **extra: Any, + ) -> dict: + payload = { + "state": state, + "cid": record.get("cid") if isinstance(record, dict) else None, + "version": record.get("version") if isinstance(record, dict) else None, + "timestamp": self.owner.time(), + } + payload.update(extra) + self._write_json_atomic(apply_state_path(self.owner), payload) + return payload + + def read_apply_state(self) -> Optional[dict]: + path = apply_state_path(self.owner) + try: + with path.open("r", encoding="utf-8") as handle: + state = json.load(handle) + except (OSError, json.JSONDecodeError): + return None + return state if isinstance(state, dict) else None + + def latest_applied(self) -> Optional[dict]: + """Return the durable last-applied state, falling back to old history.""" + state = self.read_apply_state() + if state and state.get("state") == "applied" and state.get("cid"): + return state + return self.latest_received() + + @staticmethod + def _record_digest(record: dict) -> str: + payload = { + "cid": record.get("cid"), + "manifest": record.get("manifest") if isinstance(record, dict) else None, + } + encoded = json.dumps(payload, sort_keys=True, separators=(",", ":"), default=str) + return hashlib.sha256(encoded.encode("utf-8")).hexdigest() + + def _quarantine_path(self, record: dict) -> Path: + cid = str(record.get("cid") or "no_cid") + safe_cid = "".join(ch if ch.isalnum() else "_" for ch in cid)[:32] or "no_cid" + return quarantine_dir(self.owner) / f"{safe_cid}__{self._record_digest(record)[:16]}.json" + + def _record_preflight_failure(self, record: dict, stage: str, error: str) -> None: + now = self.owner.time() + path = self._quarantine_path(record) + previous = {} + try: + with path.open("r", encoding="utf-8") as handle: + loaded = json.load(handle) + if isinstance(loaded, dict): + previous = loaded + except (OSError, json.JSONDecodeError): + previous = {} + + failure_count = int(previous.get("failure_count") or 0) + 1 + retry_after = min( + _BAD_CID_RETRY_MAX_SECONDS, + _BAD_CID_RETRY_BASE_SECONDS * (2 ** min(failure_count - 1, 5)), + ) + payload = { + "cid": record.get("cid"), + "version": record.get("version"), + "manifest_digest": self._record_digest(record), + "stage": stage, + "error": error, + "failure_count": failure_count, + "first_seen": previous.get("first_seen", now), + "last_failed": now, + "next_retry_after": now + retry_after, + } + try: + self._write_json_atomic(path, payload) + except Exception as exc: + self.owner.P(f"[sync] failed to write quarantine state: {exc}", color="r") + return + try: + self._write_apply_state("failed_preflight", record, stage=stage, error=error) + except Exception as exc: + self.owner.P(f"[sync] failed to write apply preflight state: {exc}", color="r") + + def quarantined_record(self, record: dict) -> Optional[dict]: + path = self._quarantine_path(record) + try: + with path.open("r", encoding="utf-8") as handle: + payload = json.load(handle) + except (OSError, json.JSONDecodeError): + return None + 
if not isinstance(payload, dict): + return None + next_retry_after = payload.get("next_retry_after") + if isinstance(next_retry_after, (int, float)) and self.owner.time() < next_retry_after: + return payload + return None + + def update_history_deletion( + self, history_dir: Path, entry: dict, succeeded: bool, error: Optional[str] + ) -> None: + """Update the deletion sub-record on an existing history entry. + + Atomic via tmp+rename. Identifies the file by its filename convention + (``__.json``) derived from the entry's fields. + Silently logs and returns if the file isn't found. + """ + fname = self._history_filename(entry.get("version", 0), entry.get("cid", "")) + path = Path(history_dir) / fname + if not path.is_file(): + self.owner.P( + f"[sync] history file missing for deletion update: {path}", color="y" + ) + return + try: + with path.open("r", encoding="utf-8") as handle: + data = json.load(handle) + except (OSError, json.JSONDecodeError) as exc: + self.owner.P( + f"[sync] failed to read history file for deletion update {path}: {exc}", + color="r", + ) + return + data["deletion"] = { + "deleted_at": self.owner.time() if succeeded else None, + "deletion_succeeded": bool(succeeded), + "deletion_error": error, + } + self._write_json_atomic(path, data) + + # ----- provider -------------------------------------------------------- + def _fail_request( + self, + request_body: Optional[dict], + stage: str, + error: str, + processing_path: Optional[Path], + raw_body: Optional[str] = None, + ) -> None: + """Write request.json.invalid + response.json (error), discard .processing. + + Used by both claim_request validation failures and publish_snapshot + execution failures so the artifact pair is consistent across stages. + """ + failed_ts = self.owner.time() + node_id = getattr(self.owner, "ee_id", None) or getattr(self.owner, "node_id", None) + invalid_payload: dict[str, Any] = { + "request": request_body, # may be None for malformed JSON + "_error": { + "stage": stage, + "error": error, + "failed_timestamp": failed_ts, + "node_id": node_id, + }, + } + if raw_body is not None and request_body is None: + invalid_payload["_error"]["raw_body"] = raw_body[:1024] + + control_file = self._request_control_file() + try: + control_file.write_json(SYNC_INVALID_FILE, invalid_payload) + except Exception as exc: + self.owner.P(f"[sync] failed to write request.json.invalid: {exc}", color="r") + + archive_paths: list[Any] = [] + if isinstance(request_body, dict): + ap = request_body.get("archive_paths") + if isinstance(ap, list): + archive_paths = ap + response_payload = { + "status": "error", + "stage": stage, + "error": error, + "failed_timestamp": failed_ts, + "archive_paths": archive_paths, + } + try: + control_file.write_json(SYNC_RESPONSE_FILE, response_payload) + except Exception as exc: + self.owner.P(f"[sync] failed to write response.json: {exc}", color="r") + + if processing_path is not None and os.path.lexists(str(processing_path)): + try: + control_file.discard_processing() + except OSError as exc: + self.owner.P( + f"[sync] failed to delete .processing after error: {exc}", color="r" + ) + + def _parse_runtime_policy(self, body: dict) -> SyncRuntimePolicy: + runtime = body.get("runtime") or {} + if not isinstance(runtime, dict): + raise ValueError("runtime must be a JSON object") + + provider_capture = runtime.get("provider_capture", PROVIDER_CAPTURE_OFFLINE) + consumer_apply = runtime.get("consumer_apply", CONSUMER_APPLY_OFFLINE_RESTART) + + if provider_capture not in _PROVIDER_CAPTURE_MODES: + 
allowed = ", ".join(sorted(_PROVIDER_CAPTURE_MODES)) + raise ValueError( + f"runtime.provider_capture must be one of [{allowed}], got {provider_capture!r}" + ) + if consumer_apply not in _CONSUMER_APPLY_MODES: + allowed = ", ".join(sorted(_CONSUMER_APPLY_MODES)) + raise ValueError( + f"runtime.consumer_apply must be one of [{allowed}], got {consumer_apply!r}" + ) + + return SyncRuntimePolicy( + provider_capture=provider_capture, + consumer_apply=consumer_apply, + ) + + def claim_request(self) -> Optional[SyncRequest]: + """Atomically claim the pending request.json, validate, return its payload. + + On success: renames ``request.json`` → ``request.json.processing``, + returns a ``SyncRequest``. + On any failure (no file, malformed JSON, validation): writes + ``request.json.invalid`` (request body + ``_error`` diagnostics) and + ``response.json`` (error shape), discards the ``.processing`` file, and + returns ``None``. + """ + control_file = self._request_control_file() + try: + claimed = control_file.claim_object() + except JsonControlFileClaimError as exc: + self.owner.P( + f"[sync] could not rename request.json -> .processing: {exc}", color="r" + ) + return None + except JsonControlFileReadError as exc: + self._fail_request( + None, STAGE_VALIDATION, + f"could not read .processing: {exc}", control_file.processing_path, + ) + return None + except JsonControlFileUnsafeError as exc: + self._fail_request( + None, STAGE_VALIDATION, + str(exc), control_file.processing_path, + ) + return None + except JsonControlFileDecodeError as exc: + self._fail_request( + None, STAGE_VALIDATION, + f"malformed JSON: {exc}", control_file.processing_path, + raw_body=exc.raw_body, + ) + return None + except JsonControlFileObjectError as exc: + self._fail_request( + None, STAGE_VALIDATION, + str(exc), control_file.processing_path, raw_body=exc.raw_body, + ) + return None + + if claimed is None: + return None # nothing pending + + body = claimed.body + proc_path = claimed.processing_path + + archive_paths = body.get("archive_paths") + metadata = body.get("metadata", {}) or {} + if not isinstance(metadata, dict): + self._fail_request( + body, STAGE_VALIDATION, "metadata must be a JSON object", proc_path + ) + return None + + try: + runtime = self._parse_runtime_policy(body) + except ValueError as exc: + self._fail_request(body, STAGE_VALIDATION, str(exc), proc_path) + return None + + if not isinstance(archive_paths, list) or not archive_paths: + self._fail_request( + body, STAGE_VALIDATION, + "archive_paths must be a non-empty list of container-absolute paths", + proc_path, + ) + return None + + for entry in archive_paths: + try: + if runtime.provider_capture == PROVIDER_CAPTURE_ONLINE: + if not bool( + getattr(self.owner, "cfg_sync_allow_online_provider_capture", False) + ): + raise ValueError( + "runtime.provider_capture='online' requires local " + "SYNC.ALLOW_ONLINE_PROVIDER_CAPTURE=True" + ) + self._validate_container_path_shape(entry) + else: + self.resolve_container_path(entry) + except ValueError as exc: + self._fail_request(body, STAGE_VALIDATION, str(exc), proc_path) + return None + + return SyncRequest( + archive_paths=list(archive_paths), + metadata=dict(metadata), + runtime=runtime, + ) + + @staticmethod + def _docker_member_arcname(container_path: str, docker_name: str, member_name: str) -> str: + target = os.path.normpath(container_path).rstrip("/") + base = (docker_name or os.path.basename(target)).strip("/") + raw = member_name.strip("/") + + if base and raw == base: + return target.lstrip("/") + if 
base and raw.startswith(base + "/"): + suffix = raw[len(base) + 1:] + return f"{target}/{suffix}".lstrip("/") + return f"{target}/{raw}".lstrip("/") + + def _append_docker_archive_path(self, tar: tarfile.TarFile, container_path: str) -> None: + container = getattr(self.owner, "container", None) + if container is None: + raise RuntimeError("online provider capture requires a running container") + + self._validate_container_path_shape(container_path) + bits, stat = container.get_archive(container_path) + + output_dir = Path(tempfile.gettempdir()) + get_output = getattr(self.owner, "get_output_folder", None) + if callable(get_output): + output_dir = Path(get_output()) + output_dir.mkdir(parents=True, exist_ok=True) + + fd, tmp_name = tempfile.mkstemp( + dir=str(output_dir), + prefix="sync_docker_archive_", + suffix=".tar", + ) + try: + with os.fdopen(fd, "wb") as handle: + if isinstance(bits, (bytes, bytearray)): + handle.write(bits) + else: + for chunk in bits: + handle.write(chunk) + + docker_name = (stat or {}).get("name") or os.path.basename( + os.path.normpath(container_path) + ) + with tarfile.open(tmp_name, "r:*") as src: + for member in src.getmembers(): + if any(part == ".." for part in member.name.split("/")): + raise ValueError(f"docker archive member name contains '..': {member.name!r}") + new_member = copy.copy(member) + new_member.name = self._docker_member_arcname( + container_path, docker_name, member.name + ) + fileobj = src.extractfile(member) if member.isfile() else None + tar.addfile(new_member, fileobj) + finally: + try: + os.unlink(tmp_name) + except OSError: + pass + return + + def make_archive( + self, + archive_paths: list[str], + provider_capture: str = PROVIDER_CAPTURE_OFFLINE, + ) -> tuple[str, int]: + """Build the snapshot tar.gz under the plugin output folder. + + Tar member names are the **container paths** (so consumers can reverse- + resolve via their own self.volumes). Returns ``(tar_path, size_bytes)``. + Offline capture re-runs ``resolve_container_path`` for each entry as + defence in depth. Online capture uses Docker's archive API against the + running container, allowing non-mounted provider paths. 
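+
+        Illustrative member naming (hypothetical paths): archiving
+        ``/app/data`` containing ``model.bin`` yields an arcname of
+        ``/app/data/model.bin``; ``tarfile`` stores it without the leading
+        ``/`` (``app/data/model.bin``), and the consumer re-prefixes ``/``
+        before reverse-resolving it against its own mounts.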
+ """ + output_dir: Path + get_output = getattr(self.owner, "get_output_folder", None) + if callable(get_output): + output_dir = Path(get_output()) + else: + output_dir = Path(tempfile.gettempdir()) + output_dir.mkdir(parents=True, exist_ok=True) + + ts = int(self.owner.time()) + tar_path = output_dir / f"sync_archive_{ts}_{os.getpid()}.tar.gz" + + with tarfile.open(str(tar_path), "w:gz") as tar: + for container_path in archive_paths: + if provider_capture == PROVIDER_CAPTURE_ONLINE: + if not bool( + getattr(self.owner, "cfg_sync_allow_online_provider_capture", False) + ): + raise ValueError( + "provider_capture='online' requires local " + "SYNC.ALLOW_ONLINE_PROVIDER_CAPTURE=True" + ) + self._append_docker_archive_path(tar, container_path) + else: + host_path, _bind, host_root = self.resolve_container_path(container_path) + self._add_offline_archive_path(tar, container_path, host_path, host_root) + + return str(tar_path), os.path.getsize(str(tar_path)) + + def _coerce_sync_request( + self, + request: SyncRequest | list[str], + metadata: Optional[dict] = None, + ) -> SyncRequest: + if isinstance(request, SyncRequest): + return request + return SyncRequest( + archive_paths=list(request), + metadata=dict(metadata or {}), + runtime=SyncRuntimePolicy(), + ) + + def _delete_uploaded_cid_best_effort( + self, + cid: str, + *, + cleanup_local_files: bool = False, + ) -> None: + try: + self.owner.r1fs.delete_file( + cid=cid, + unpin_remote=True, + cleanup_local_files=cleanup_local_files, + ) + except Exception as exc: # noqa: BLE001 - cleanup must not mask root failure + self.owner.P( + f"[sync] failed to clean up uploaded CID {cid}: {exc}", color="y" + ) + + def publish_snapshot( + self, + request: SyncRequest | list[str], + metadata: Optional[dict] = None, + ) -> bool: + """Full provider orchestration: archive → R1FS add → ChainStore hset → + history append → response.json → clear .invalid → delete .processing → + retire previous CID. + + Returns True on success, False on any failure (and writes + response.json/error + request.json.invalid for the app). + Always cleans up the archive tmp file. 
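+
+        On the success path the response.json written for the app carries the
+        fields assembled below; values here are purely illustrative::
+
+            {"status": "ok", "cid": "<r1fs-cid>", "version": 1715600000,
+             "published_timestamp": 1715600000.0,
+             "archive_paths": ["/app/data"], "archive_size_bytes": 1048576,
+             "chainstore_ack": true, "metadata": {"app_version": "1.2.3"}}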
+ """ + sync_request = self._coerce_sync_request(request, metadata) + archive_paths = sync_request.archive_paths + runtime_payload = runtime_policy_to_dict(sync_request.runtime) + request_body = { + "archive_paths": list(archive_paths), + "metadata": dict(sync_request.metadata), + "runtime": runtime_payload, + } + control_file = self._request_control_file() + vsd = volume_sync_dir(self.owner) + proc_path = control_file.processing_path + tar_path: Optional[str] = None + try: + # ---- Stage: archive_build + try: + tar_path, size_bytes = self.make_archive( + archive_paths, + provider_capture=sync_request.runtime.provider_capture, + ) + except Exception as exc: + self._fail_request(request_body, STAGE_ARCHIVE_BUILD, str(exc), proc_path) + return False + + # ---- Stage: r1fs_upload + try: + cid = self.owner.r1fs.add_file(tar_path) + except Exception as exc: + self._fail_request(request_body, STAGE_R1FS_UPLOAD, str(exc), proc_path) + return False + if not cid: + self._fail_request( + request_body, STAGE_R1FS_UPLOAD, + "r1fs.add_file returned no CID", proc_path, + ) + return False + + # Build the manifest + record + version = int(self.owner.time()) + ts = self.owner.time() + node_id = getattr(self.owner, "ee_id", None) or getattr(self.owner, "node_id", None) + manifest = { + "schema_version": MANIFEST_SCHEMA_VERSION, + "archive_paths": list(archive_paths), + "archive_format": ARCHIVE_FORMAT, + "archive_size_bytes": size_bytes, + "encryption": ARCHIVE_ENCRYPTION, + "runtime": runtime_payload, + } + record = { + "cid": cid, + "version": version, + "timestamp": ts, + "node_id": node_id, + "metadata": dict(sync_request.metadata), + "runtime": runtime_payload, + "manifest": manifest, + } + + # ---- Stage: chainstore_publish + try: + ack = self.owner.chainstore_hset( + hkey=CHAINSTORE_SYNC_HKEY, + key=getattr(self.owner, "cfg_sync_key", None), + value=record, + ) + except Exception as exc: + self._delete_uploaded_cid_best_effort(cid) + self._fail_request( + request_body, STAGE_CHAINSTORE_PUBLISH, str(exc), proc_path + ) + return False + if not ack: + self._delete_uploaded_cid_best_effort(cid) + self._fail_request( + request_body, + STAGE_CHAINSTORE_PUBLISH, + "chainstore_hset returned false acknowledgement", + proc_path, + ) + return False + + # Persist history entry (pre-retirement so deletion update finds it). + entry = { + "cid": cid, + "version": version, + "published_timestamp": ts, + "request": dict(request_body), + "manifest": manifest, + "archive_size_bytes": size_bytes, + "chainstore_ack": bool(ack), + "node_id": node_id, + } + history_error = None + history_appended = False + try: + self.append_sent(entry) + history_appended = True + except Exception as exc: + history_error = str(exc) + self.owner.P( + f"[sync] ChainStore publish succeeded but sent-history append failed: {exc}", + color="r", + ) + + # Write success response and clean up control-plane artifacts. We + # include the app-supplied metadata so the in-volume-sync state file + # is self-contained — UIs that surface response.json (without access + # to host-side sync_history/) can show the metadata that travelled + # with this snapshot. 
+ response_payload = { + "status": "ok", + "cid": cid, + "version": version, + "published_timestamp": ts, + "archive_paths": list(archive_paths), + "archive_size_bytes": size_bytes, + "chainstore_ack": bool(ack), + "metadata": dict(sync_request.metadata), + } + if history_error is not None: + response_payload["history_error"] = history_error + try: + control_file.write_json(SYNC_RESPONSE_FILE, response_payload) + except Exception as exc: + self.owner.P( + f"[sync] failed to write response.json: {exc}", color="r" + ) + + invalid_path = vsd / SYNC_INVALID_FILE + if os.path.lexists(str(invalid_path)): + try: + os.unlink(str(invalid_path)) + except OSError: + pass + if os.path.lexists(str(proc_path)): + try: + control_file.discard_processing() + except OSError as exc: + self.owner.P( + f"[sync] failed to delete .processing after success: {exc}", color="y" + ) + + # Retire prior CID only when the new sent-history entry exists. Without + # that entry there is no durable local record for deletion bookkeeping. + if history_appended: + self._retire_previous_cid(history_sent_dir(self.owner)) + return True + finally: + if tar_path: + try: + os.unlink(tar_path) + except OSError: + pass + + # ----- consumer -------------------------------------------------------- + def fetch_latest(self) -> Optional[dict]: + """Refresh the local CHAINSTORE_SYNC replica (gated by HSYNC_POLL_INTERVAL), + then read the configured KEY. + + The ``hsync`` is the expensive bit — a network round-trip to the chain + cluster with a timeout. It fires at most every + ``SYNC.HSYNC_POLL_INTERVAL`` seconds (default 60s, min 10s). The cheap + local-replica ``hget`` runs on every call regardless, so a consumer that + already has the record cached keeps reading it without paying the + network cost. + + On ``hsync`` failure we retry sooner than the full success interval + (default 30s) to avoid leaving consumers stale for a whole cadence while + still avoiding a network attempt on every sync tick. + """ + sync_key = getattr(self.owner, "cfg_sync_key", None) + if not sync_key: + return None + + interval = getattr( + self.owner, "cfg_sync_hsync_poll_interval", self._DEFAULT_HSYNC_POLL_INTERVAL, + ) + now = self.owner.time() + if now - self._last_hsync >= interval: + # Always log the hsync attempt result (success or failure) — this is + # the only sync mixin log that fires on the happy path, so it doubles + # as the heartbeat that confirms the consumer is actually ticking and + # the rate-limit gating is working. Quiet enough at one log per + # HSYNC_POLL_INTERVAL window (default once per minute) to stay on in + # prod logs. + hsync_start = _time.monotonic() + try: + self.owner.chainstore_hsync(hkey=CHAINSTORE_SYNC_HKEY) + self._last_hsync = now + elapsed = _time.monotonic() - hsync_start + self.owner.P(f"[sync] chainstore_hsync ok ({elapsed:.2f}s)", color="g") + except Exception as exc: + retry_after = min(self._DEFAULT_HSYNC_FAILURE_RETRY_INTERVAL, interval) + self._last_hsync = now - max(0.0, interval - retry_after) + elapsed = _time.monotonic() - hsync_start + self.owner.P( + f"[sync] chainstore_hsync error after {elapsed:.2f}s " + f"(retry in {retry_after:.0f}s): {exc}", + color="y", + ) + + try: + return self.owner.chainstore_hget( + hkey=CHAINSTORE_SYNC_HKEY, key=sync_key + ) + except Exception as exc: + self.owner.P(f"[sync] chainstore_hget error: {exc}", color="r") + return None + + def validate_manifest(self, record: dict) -> list[str]: + """Return list of human-readable rejection reasons for ``record``. 
+ + Empty list means the manifest is acceptable: schema_version and + archive_format are recognised AND the consumer's ``self.volumes`` covers + every container path with a (fixed-size) mount. A non-empty list means + apply must be skipped without touching the filesystem. + + Reasons are surfaced for: + - missing/wrong ``schema_version`` (must be an int <= MANIFEST_SCHEMA_VERSION) + - unexpected ``archive_format`` (must equal ARCHIVE_FORMAT) + - unexpected ``encryption`` (must equal ARCHIVE_ENCRYPTION) + - ``archive_paths`` entries that don't map to a mount on this consumer + + Format/schema checks come first so they short-circuit before we burn + cycles resolving paths against a manifest we can't read anyway. + """ + if not isinstance(record, dict): + return ["manifest record is not a dict"] + manifest = record.get("manifest") or {} + if not isinstance(manifest, dict): + return ["manifest must be a JSON object"] + reasons: list[str] = [] + + sv = manifest.get("schema_version") + if not isinstance(sv, int): + reasons.append( + f"unsupported schema_version: {sv!r} (expected int, max supported: {MANIFEST_SCHEMA_VERSION})" + ) + elif sv > MANIFEST_SCHEMA_VERSION: + reasons.append( + f"unsupported schema_version: {sv} (max supported by this CAR: {MANIFEST_SCHEMA_VERSION})" + ) + + fmt = manifest.get("archive_format") + if fmt != ARCHIVE_FORMAT: + reasons.append( + f"unsupported archive_format: {fmt!r} (expected: {ARCHIVE_FORMAT!r})" + ) + + enc = manifest.get("encryption") + if enc != ARCHIVE_ENCRYPTION: + reasons.append( + f"unsupported encryption: {enc!r} (expected: {ARCHIVE_ENCRYPTION!r})" + ) + + raw_paths = manifest.get("archive_paths") + paths: list[str] = [] + if not isinstance(raw_paths, list) or not raw_paths: + reasons.append( + "archive_paths must be a non-empty list of container-absolute paths" + ) + else: + invalid_paths = [ + entry for entry in raw_paths + if not isinstance(entry, str) or not entry + ] + if invalid_paths: + reasons.append(f"invalid archive_paths entries: {invalid_paths!r}") + paths = [entry for entry in raw_paths if isinstance(entry, str) and entry] + missing: list[str] = [] + for entry in paths: + try: + self.resolve_container_path(entry) + except ValueError: + missing.append(entry) + if missing: + reasons.append(f"unmapped archive_paths on this consumer: {missing}") + return reasons + + def validate_record_for_apply(self, record: dict) -> list[str]: + """Validate the full ChainStore record before disrupting a consumer. + + This covers the record envelope plus the manifest. ``validate_manifest`` is + kept as the manifest-focused helper used by older tests and callers. + """ + if not isinstance(record, dict): + return ["sync record is not a dict"] + reasons: list[str] = [] + cid = record.get("cid") + if not isinstance(cid, str) or not cid: + reasons.append("record cid must be a non-empty string") + version = record.get("version") + if not isinstance(version, int): + reasons.append("record version must be an int") + reasons.extend(self.validate_manifest(record)) + return reasons + + @staticmethod + def _is_within_real_root(path: str, root: str) -> bool: + root_real = os.path.realpath(root) + path_real = os.path.realpath(path) + return path_real == root_real or path_real.startswith(root_real + os.sep) + + def _validate_extract_target_within_root( + self, + host_path: str, + host_root: str, + container_name: str, + ) -> None: + """Reject extraction targets that would resolve outside their volume. 
+ + ``resolve_container_path`` already proves the normalized string path sits + under the selected host root. This second check follows symlinks in the + target and parent path so a pre-existing symlink inside the mounted volume + cannot redirect extraction outside that volume. + """ + candidates = [host_path] + if os.path.normpath(host_path) != os.path.normpath(host_root): + candidates.append(os.path.dirname(host_path) or host_root) + for candidate in candidates: + if not self._is_within_real_root(candidate, host_root): + raise ValueError( + f"tar member target escapes volume root: {container_name!r} -> {host_path!r}" + ) + + @staticmethod + def _volume_owner(host_root: str) -> tuple[int, int]: + st = os.stat(host_root) + return st.st_uid, st.st_gid + + @staticmethod + def _chown_if_needed(path: str, uid: int, gid: int) -> None: + st = os.lstat(path) + if st.st_uid != uid or st.st_gid != gid: + os.chown(path, uid, gid) + + @staticmethod + def _directory_metadata(path: str) -> DirectoryMetadata: + st = os.lstat(path) + if stat.S_ISLNK(st.st_mode) or not stat.S_ISDIR(st.st_mode): + raise ValueError(f"directory path is not a real directory: {path!r}") + return DirectoryMetadata( + uid=st.st_uid, + gid=st.st_gid, + mode=stat.S_IMODE(st.st_mode), + ) + + def _record_directory_metadata( + self, + path: str, + ops: Optional[list[ApplyRollbackOp]], + tracked_dirs: Optional[set[str]], + ) -> None: + if ops is None: + return + path_n = os.path.normpath(path) + if tracked_dirs is not None and path_n in tracked_dirs: + return + metadata = self._directory_metadata(path_n) + ops.append(ApplyRollbackOp("restore_dir_meta", path_n, metadata=metadata)) + if tracked_dirs is not None: + tracked_dirs.add(path_n) + + def _ensure_directory_tree_owner( + self, + path: str, + host_root: str, + uid: int, + gid: int, + ops: Optional[list[ApplyRollbackOp]] = None, + tracked_dirs: Optional[set[str]] = None, + ) -> None: + host_root_n = os.path.normpath(host_root) + path_n = os.path.normpath(path) + if not self._is_within_root(path_n, host_root_n): + raise ValueError(f"directory path escapes volume root: {path_n!r}") + if path_n == host_root_n: + return + + rel = os.path.relpath(path_n, host_root_n) + current = host_root_n + for part in rel.split(os.sep): + if not part or part == ".": + continue + if part == "..": + raise ValueError(f"directory path escapes volume root: {path_n!r}") + current = os.path.join(current, part) + try: + st = os.lstat(current) + except FileNotFoundError: + os.mkdir(current) + if ops is not None: + ops.append(ApplyRollbackOp("remove_dir", current)) + if tracked_dirs is not None: + tracked_dirs.add(os.path.normpath(current)) + st = os.lstat(current) + + if stat.S_ISLNK(st.st_mode) or not stat.S_ISDIR(st.st_mode): + raise ValueError(f"directory path is not a real directory: {current!r}") + + self._record_directory_metadata(current, ops, tracked_dirs) + self._chown_if_needed(current, uid, gid) + os.chmod(current, 0o755) + + @staticmethod + def _container_path_in_declared_archive_paths( + container_name: str, + archive_paths: list[str], + ) -> bool: + candidate = os.path.normpath(container_name) + if not candidate.startswith("/"): + candidate = "/" + candidate + for entry in archive_paths: + if not isinstance(entry, str) or not entry: + continue + declared = os.path.normpath(entry) + if not declared.startswith("/"): + declared = "/" + declared + if candidate == declared or candidate.startswith(declared.rstrip("/") + "/"): + return True + return False + + def extract_archive( + self, + tar_path: 
str, + allowed_archive_paths: Optional[list[str]] = None, + ) -> list[str]: + """Reverse-map tar member container paths to host paths and extract. + + Two-pass: first pass validates every member by feeding its name through + ``resolve_container_path`` (so the entire extract aborts before any + write if the consumer's volume layout doesn't cover all members). + Symlinks/hardlinks are skipped with a warning — never extracted, since + a malicious tar could otherwise create a link that subsequent regular + members would write through. Each regular file is written via tmp + + ``os.replace`` so a mid-flight crash never leaves a half-written file. + If ``allowed_archive_paths`` is provided, every extracted member must also + sit under at least one manifest-declared archive path. Returns the list of + container paths that were applied (regular files + directories created). + """ + return self._extract_archive(tar_path, allowed_archive_paths) + + def _extract_archive( + self, + tar_path: str, + allowed_archive_paths: Optional[list[str]] = None, + ) -> list[str]: + extracted: list[str] = [] + with tarfile.open(str(tar_path), "r:gz") as tar: + members = tar.getmembers() + + # Pass 1: validate every member, build (member, host_path) pairs. + # Python's tarfile.add() strips leading '/' from arcnames as a POSIX + # safety default, so member names look like "app/data/foo.bin" even + # when we put them in as "/app/data/foo.bin". Normalize back to the + # container-absolute form before running through the resolver. + planned: list[tuple[tarfile.TarInfo, str, str, str]] = [] + for member in members: + if member.issym() or member.islnk(): + self.owner.P( + f"[sync] skipping link member in tar (security): {member.name}", + color="y", + ) + continue + if any(part == ".." for part in member.name.split("/")): + raise ValueError(f"tar member name contains '..': {member.name!r}") + container_name = member.name + if not container_name.startswith("/"): + container_name = "/" + container_name + if ( + allowed_archive_paths is not None + and not self._container_path_in_declared_archive_paths( + container_name, allowed_archive_paths + ) + ): + raise ValueError( + f"tar member outside manifest archive_paths: {container_name!r}" + ) + host_path, _bind, host_root = self.resolve_container_path(container_name) + self._validate_extract_target_within_root(host_path, host_root, container_name) + planned.append((member, host_path, container_name, host_root)) + + # Pass 2: actually extract. + for member, host_path, container_name, host_root in planned: + owner_uid, owner_gid = self._volume_owner(host_root) + if member.isdir(): + self._ensure_directory_tree_owner( + host_path, host_root, owner_uid, owner_gid + ) + self._validate_extract_target_within_root(host_path, host_root, container_name) + os.chmod(host_path, self._safe_extract_mode(member.mode, is_dir=True)) + extracted.append(container_name) + continue + if not member.isfile(): + continue + self._ensure_directory_tree_owner( + os.path.dirname(host_path), host_root, owner_uid, owner_gid + ) + self._validate_extract_target_within_root(host_path, host_root, container_name) + fobj = tar.extractfile(member) + if fobj is None: + continue + # Atomic per-file write: tmp in same directory, then os.replace. 
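+            # (The tmp file lives in the destination directory so os.replace
+            # stays within one filesystem and the swap remains atomic.)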
+ fd, tmp_name = tempfile.mkstemp( + dir=os.path.dirname(host_path), + prefix=f".{os.path.basename(host_path)}.", + suffix=".tmp", + ) + try: + with os.fdopen(fd, "wb") as out: + while True: + chunk = fobj.read(1024 * 1024) + if not chunk: + break + out.write(chunk) + self._chown_if_needed(tmp_name, owner_uid, owner_gid) + os.chmod(tmp_name, self._safe_extract_mode(member.mode, is_dir=False)) + os.replace(tmp_name, host_path) + except Exception: + try: + os.unlink(tmp_name) + except OSError: + pass + raise + extracted.append(container_name) + return extracted + + def _new_apply_staging_dir(self) -> Path: + root = sync_state_dir(self.owner) / "staging" + root.mkdir(parents=True, exist_ok=True) + return Path(tempfile.mkdtemp(prefix="apply.", dir=str(root))) + + @staticmethod + def _cleanup_tree(path: Optional[Path]) -> None: + if path is None: + return + try: + if path.is_dir(): + for child in sorted(path.rglob("*"), reverse=True): + if child.is_dir(): + child.rmdir() + else: + child.unlink() + path.rmdir() + elif path.exists(): + path.unlink() + except OSError: + pass + + def _stage_tar_member(self, fobj, staging_dir: Path, index: int) -> Path: + staging_path = staging_dir / f"{index:06d}.blob" + with staging_path.open("wb") as out: + while True: + chunk = fobj.read(1024 * 1024) + if not chunk: + break + out.write(chunk) + out.flush() + os.fsync(out.fileno()) + return staging_path + + def prepare_apply(self, record: dict) -> Optional[PreparedApply]: + """Validate and stage a consumer snapshot before stopping the app.""" + rejection_reasons = self.validate_record_for_apply(record) + if rejection_reasons: + cid = record.get("cid") if isinstance(record, dict) else None + version = record.get("version") if isinstance(record, dict) else None + error = "; ".join(rejection_reasons) + self.owner.P(f"[sync] cannot prepare v{version} (cid={cid}): {error}", color="r") + if isinstance(record, dict) and record.get("cid"): + self._record_preflight_failure(record, STAGE_VALIDATION, error) + return None + + cid = record["cid"] + version = record["version"] + try: + self._write_apply_state("preparing", record) + except Exception as exc: + self.owner.P(f"[sync] failed to write apply preparing state: {exc}", color="r") + + try: + local_path = self.owner.r1fs.get_file(cid) + except Exception as exc: + self.owner.P(f"[sync] r1fs.get_file({cid}) failed: {exc}", color="r") + self._record_preflight_failure(record, "r1fs_download", str(exc)) + return None + if not local_path: + error = f"r1fs.get_file({cid}) returned no path" + self.owner.P(f"[sync] {error}", color="r") + self._record_preflight_failure(record, "r1fs_download", error) + return None + + staging_dir: Optional[Path] = None + try: + staging_dir = self._new_apply_staging_dir() + manifest = record.get("manifest") or {} + allowed_archive_paths = manifest.get("archive_paths") or [] + planned: list[PlannedApplyMember] = [] + with tarfile.open(str(local_path), "r:gz") as tar: + for index, member in enumerate(tar.getmembers()): + if member.issym() or member.islnk(): + self.owner.P( + f"[sync] skipping link member in tar (security): {member.name}", + color="y", + ) + continue + if any(part == ".." 
for part in member.name.split("/")): + raise ValueError(f"tar member name contains '..': {member.name!r}") + container_name = member.name + if not container_name.startswith("/"): + container_name = "/" + container_name + if not self._container_path_in_declared_archive_paths( + container_name, allowed_archive_paths + ): + raise ValueError( + f"tar member outside manifest archive_paths: {container_name!r}" + ) + host_path, _bind, host_root = self.resolve_container_path(container_name) + self._validate_extract_target_within_root(host_path, host_root, container_name) + if member.isdir(): + planned.append(PlannedApplyMember( + container_name=container_name, + host_path=host_path, + host_root=host_root, + staging_path=None, + mode=member.mode, + is_dir=True, + )) + continue + if not member.isfile(): + continue + fobj = tar.extractfile(member) + if fobj is None: + continue + staging_path = self._stage_tar_member(fobj, staging_dir, index) + planned.append(PlannedApplyMember( + container_name=container_name, + host_path=host_path, + host_root=host_root, + staging_path=staging_path, + mode=member.mode, + is_dir=False, + )) + return PreparedApply( + record=dict(record), + cid=cid, + version=version, + local_path=str(local_path), + staging_dir=staging_dir, + members=planned, + manifest=dict(manifest), + ) + except Exception as exc: + self.owner.P(f"[sync] prepare_apply failed for cid={cid}: {exc}", color="r") + self._cleanup_tree(staging_dir) + self._record_preflight_failure(record, STAGE_EXTRACT, str(exc)) + return None + + @staticmethod + def _new_backup_path(host_path: str) -> str: + directory = os.path.dirname(host_path) + fd, backup_path = tempfile.mkstemp( + dir=directory, + prefix=f".{os.path.basename(host_path)}.syncbak.", + suffix=".bak", + ) + os.close(fd) + os.unlink(backup_path) + return backup_path + + @staticmethod + def _unlink_path(path: str) -> None: + try: + os.unlink(path) + except FileNotFoundError: + pass + + def _rollback_apply_ops(self, ops: list[ApplyRollbackOp]) -> bool: + ok = True + for op in reversed(ops): + try: + if op.op == "restore" and op.backup: + self._unlink_path(op.path) + os.replace(op.backup, op.path) + elif op.op == "remove_file": + self._unlink_path(op.path) + elif op.op == "remove_dir": + os.rmdir(op.path) + elif op.op == "restore_dir_meta" and op.metadata: + self._chown_if_needed(op.path, op.metadata.uid, op.metadata.gid) + os.chmod(op.path, op.metadata.mode) + except OSError as exc: + ok = False + self.owner.P(f"[sync] rollback operation failed for {op.path}: {exc}", color="r") + return ok + + def _cleanup_backups(self, ops: list[ApplyRollbackOp]) -> None: + for op in ops: + if op.op == "restore" and op.backup: + try: + os.unlink(op.backup) + except OSError: + pass + + def commit_prepared_apply(self, prepared: PreparedApply) -> ApplyResult: + """Apply a prepared snapshot while the app container is stopped.""" + try: + self._write_apply_state("applying", prepared.record) + except Exception as exc: + error = f"could not write applying state: {exc}" + self.owner.P(f"[sync] {error}", color="r") + self._cleanup_tree(prepared.staging_dir) + return ApplyResult(False, True, "failed_preflight", [], error) + + ops: list[ApplyRollbackOp] = [] + tracked_dirs: set[str] = set() + extracted: list[str] = [] + try: + for planned in prepared.members: + owner_uid, owner_gid = self._volume_owner(planned.host_root) + self._validate_extract_target_within_root( + planned.host_path, planned.host_root, planned.container_name + ) + if planned.is_dir: + existed = 
os.path.isdir(planned.host_path) + self._ensure_directory_tree_owner( + planned.host_path, planned.host_root, owner_uid, owner_gid, + ops, tracked_dirs + ) + self._validate_extract_target_within_root( + planned.host_path, planned.host_root, planned.container_name + ) + self._record_directory_metadata(planned.host_path, ops, tracked_dirs) + os.chmod(planned.host_path, self._safe_extract_mode(planned.mode, is_dir=True)) + if not existed: + created_path = os.path.normpath(planned.host_path) + if not any( + op.op == "remove_dir" and os.path.normpath(op.path) == created_path + for op in ops + ): + ops.append(ApplyRollbackOp("remove_dir", planned.host_path)) + extracted.append(planned.container_name) + continue + + if planned.staging_path is None: + continue + parent = os.path.dirname(planned.host_path) + self._validate_extract_target_within_root( + parent, planned.host_root, planned.container_name + ) + self._ensure_directory_tree_owner( + parent, planned.host_root, owner_uid, owner_gid, ops, tracked_dirs + ) + self._validate_extract_target_within_root( + planned.host_path, planned.host_root, planned.container_name + ) + if os.path.isdir(planned.host_path): + raise ValueError(f"cannot replace directory with file: {planned.container_name!r}") + + backup_path = None + if os.path.lexists(planned.host_path): + backup_path = self._new_backup_path(planned.host_path) + os.replace(planned.host_path, backup_path) + ops.append(ApplyRollbackOp("restore", planned.host_path, backup_path)) + else: + ops.append(ApplyRollbackOp("remove_file", planned.host_path)) + + fd, tmp_name = tempfile.mkstemp( + dir=parent, + prefix=f".{os.path.basename(planned.host_path)}.", + suffix=".tmp", + ) + try: + with os.fdopen(fd, "wb") as out, planned.staging_path.open("rb") as src: + while True: + chunk = src.read(1024 * 1024) + if not chunk: + break + out.write(chunk) + out.flush() + os.fsync(out.fileno()) + self._chown_if_needed(tmp_name, owner_uid, owner_gid) + os.chmod(tmp_name, self._safe_extract_mode(planned.mode, is_dir=False)) + os.replace(tmp_name, planned.host_path) + except Exception: + try: + os.unlink(tmp_name) + except OSError: + pass + raise + extracted.append(planned.container_name) + + try: + self._write_apply_state( + "applied", + prepared.record, + extracted_paths=list(extracted), + ) + except Exception as exc: + raise RuntimeError(f"could not write applied state: {exc}") from exc + + self._cleanup_backups(ops) + self._cleanup_tree(prepared.staging_dir) + return ApplyResult(True, True, "applied", extracted) + except Exception as exc: + rollback_ok = self._rollback_apply_ops(ops) + state = "failed_rolled_back" if rollback_ok else "uncertain" + try: + self._write_apply_state( + state, + prepared.record, + error=str(exc), + extracted_paths=list(extracted), + ) + except Exception as state_exc: + self.owner.P(f"[sync] failed to write apply failure state: {state_exc}", color="r") + self._cleanup_tree(prepared.staging_dir) + self.owner.P(f"[sync] commit_prepared_apply failed: {exc}", color="r") + return ApplyResult(False, rollback_ok, state, extracted, str(exc)) + + def _finalize_apply_success( + self, + record: dict, + extracted: list[str], + ) -> None: + cid = record["cid"] + version = record["version"] + applied_ts = self.owner.time() + entry = { + "cid": cid, + "version": version, + "source_timestamp": record.get("timestamp"), + "applied_timestamp": applied_ts, + "node_id": record.get("node_id"), + "metadata": record.get("metadata") or {}, + "manifest": record.get("manifest") or {}, + "extracted_paths": 
extracted, + } + history_appended = False + try: + self.append_received(entry) + history_appended = True + except Exception as exc: + self.owner.P( + f"[sync] apply succeeded but received-history append failed: {exc}", + color="r", + ) + + last_apply = { + "cid": cid, + "version": version, + "source_timestamp": record.get("timestamp"), + "applied_timestamp": applied_ts, + "node_id": record.get("node_id"), + "metadata": record.get("metadata") or {}, + } + try: + self._write_json_atomic( + volume_sync_dir(self.owner) / SYNC_LAST_APPLY_FILE, last_apply + ) + except Exception as exc: + self.owner.P(f"[sync] failed to write last_apply.json: {exc}", color="r") + + if history_appended: + self._retire_previous_cid( + history_received_dir(self.owner), + cleanup_local_files=True, + unpin_remote=False, + ) + + def apply_snapshot(self, record: dict) -> bool: + """Full consumer orchestration for callers that already stopped the app. + + ``_SyncMixin`` uses ``prepare_apply`` before stopping the container and + ``commit_prepared_apply`` after stopping it. This wrapper keeps older tests + and direct callers on the same transaction/state path. + """ + prepared = self.prepare_apply(record) + if prepared is None: + return False + result = self.commit_prepared_apply(prepared) + if not result.success: + return False + self._finalize_apply_success(record, result.extracted_paths) + return True + + # ----- retirement ------------------------------------------------------ + def _retire_previous_cid( + self, + history_dir: Path, + cleanup_local_files: bool = False, + unpin_remote: bool = True, + ) -> None: + """Delete the prior R1FS CID after a successful new operation. + + Only the immediately-prior un-retired entry is touched per call. Updates + that entry's ``deletion`` sub-record. Never raises — deletion failures + must not roll back the new publish/apply. + """ + # Sort by append-time marker, not filename. Filenames embed the version + # prefix for chronological browsability under monotonic clocks, but the + # question "what did we just publish/apply?" is answered by insert order. + # Sorting by name here would retire the highest-*version* entry instead + # of the most-recently-appended one. 
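+    # Each tuple yielded by _read_history_entries carries the parsed entry
+    # dict at index 1 and the append-order marker at index 2, which is why
+    # the sort key below is item[2] rather than the filename.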
+ entries = sorted( + self._read_history_entries(history_dir), + key=lambda item: item[2], + ) + if len(entries) < 2: + return # nothing to retire yet + latest = entries[-1][1] + latest_cid = latest.get("cid") + target_entry: Optional[dict] = None + for _, entry, _ in reversed(entries[:-1]): + if entry.get("cid") == latest_cid: + continue # same content -- nothing to retire + if (entry.get("deletion") or {}).get("deleted_at") is not None: + continue # already retired + target_entry = entry + break + + if target_entry is None: + return + target_cid = target_entry.get("cid") + if not target_cid: + return + + succeeded = False + error: Optional[str] = None + try: + self.owner.r1fs.delete_file( + cid=target_cid, + unpin_remote=unpin_remote, + cleanup_local_files=cleanup_local_files, + ) + succeeded = True + except Exception as exc: # noqa: BLE001 — never raise + error = str(exc) + self.owner.P( + f"[sync] failed to retire CID {target_cid}: {exc}", color="y" + ) + + self.update_history_deletion(history_dir, target_entry, succeeded, error) diff --git a/extensions/business/container_apps/sync/mixin.py b/extensions/business/container_apps/sync/mixin.py new file mode 100644 index 00000000..e034ed75 --- /dev/null +++ b/extensions/business/container_apps/sync/mixin.py @@ -0,0 +1,595 @@ +"""Mixin: volume-sync provider/consumer integration for CAR. + +Bridges :class:`SyncManager` into ``ContainerAppRunnerPlugin``'s lifecycle: + + * always-on: provisions the 10M ``/r1en_system`` system volume (a fixed-size + loopback identical in machinery to ``FIXED_SIZE_VOLUMES``) and exports + ``R1_*`` env vars to the container + * provider role: per ``cfg_sync_poll_interval`` polls for a pending + ``request.json``, then drives runtime stop → publish_snapshot → + start_container inline (must NOT route through ``_restart_container``, + which calls ``_cleanup_fixed_size_volumes`` and unmounts the loopback + before we can read from it) + * consumer role: same cadence polls ChainStore for a different ``cid``, then + drives runtime stop → apply_snapshot → start_container inline. + First boot starts on an empty volume; the next tick picks up whatever + snapshot is in ChainStore. Apps that strictly require state at startup + must implement their own poll-and-retry in their entrypoint. + * recovery: any orphan ``request.json.processing`` left behind by a prior + crash is renamed back to ``request.json`` on plugin init so the next + provider tick retries cleanly + +See ``extensions/business/container_apps/README.md`` for the public +operator/app contract. 
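+
+A minimal instance config enabling the provider role (keys are the ``SYNC``
+block this mixin reads via ``cfg_sync``; values are illustrative only)::
+
+    "SYNC": {
+        "ENABLED": true,
+        "TYPE": "provider",
+        "KEY": "11111111-1111-1111-1111-111111111111",
+        "POLL_INTERVAL": 10
+    }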
+""" +from __future__ import annotations + +import os +import stat +from pathlib import Path +from typing import Optional + +from extensions.business.container_apps import fixed_volume + +from .control_files import JsonControlFile, JsonControlFileUnsafeError +from .constants import ( + SYNC_PROCESSING_FILE, + SYNC_REQUEST_FILE, + STAGE_RUNTIME_STOP, + SYSTEM_VOLUME_FS, + SYSTEM_VOLUME_MOUNT, + SYSTEM_VOLUME_NAME, + SYSTEM_VOLUME_SIZE, + VOLUME_SYNC_SUBDIR, +) +from .manager import ( + CONSUMER_APPLY_OFFLINE_RESTART, + CONSUMER_APPLY_ONLINE_NO_RESTART, + CONSUMER_APPLY_ONLINE_RESTART, + SyncManager, + history_received_dir, + runtime_policy_to_dict, + system_volume_host_root, + volume_sync_dir, +) + + +class _SyncMixin: + """ + Required attributes on the composing plugin: + - self.P, self.time, self.cfg_* (BasePlugin) + - self.get_data_folder, _get_instance_data_subfolder + - self.volumes (dict, populated by CAR) + - self._fixed_volumes (list, populated by _FixedSizeVolumesMixin) + - self.env (dict, container env) + - self.r1fs, self.chainstore_hset/hget/hsync (BasePlugin API) + - self._stop_container_runtime_for_restart(), + self.start_container() (CAR lifecycle) + - self.cfg_sync (CAR config block) + - self.ee_id (BasePlugin identity) + """ + + # ----- system volume provisioning -------------------------------------- + + def _configure_system_volume(self): + """Provision the always-on /r1en_system fixed-size loopback. + + Idempotent across plugin restarts: ``fixed_volume.provision`` reuses an + existing image/loop/mount when available. Adds the bind spec to + ``self.volumes`` so ``start_container`` mounts ``/r1en_system`` at the + correct host path. The ``volume-sync/`` subdir is created post-mount so + SyncManager always has a place to write request/response files. + """ + try: + fixed_volume._require_tools(logger=self.P) + except RuntimeError as exc: + # Without the host tools we cannot provision /r1en_system, which means + # there is no shared filesystem for the app to drop request.json into + # and no host root for CAR to poll. Mark sync as unavailable so + # _sync_enabled() returns False (skipping all provider/consumer ticks) + # and _inject_sync_env_vars() refuses to advertise R1_SYSTEM_VOLUME + # to the container — otherwise the app would write to a non-existent + # in-container mount while CAR polled a host root that was never + # provisioned. Codex review finding 5 on PR #399. + self.P( + f"[sync] system volume unavailable: {exc}. " + f"SYNC will be disabled and R1_SYSTEM_VOLUME env vars will not be " + f"exported until host tools are installed.", + color="r", + ) + self._sync_unavailable = True + return + + root = ( + Path(self.get_data_folder()) + / self._get_instance_data_subfolder() + / "fixed_volumes" + ) + + # NOTE: deliberately do NOT call fixed_volume.cleanup_stale_mounts here. + # _FixedSizeVolumesMixin._configure_fixed_size_volumes() runs BEFORE us + # in on_init / _restart_container and already scans meta/ for the whole + # root. Calling it again from here would unmount any FIXED_SIZE_VOLUMES + # entries that the previous step just provisioned (because their meta/ + # files exist) and then we'd never re-mount them — the data volume + # would land empty in the container. 
+ + vol = fixed_volume.FixedVolume( + name=SYSTEM_VOLUME_NAME, + size=SYSTEM_VOLUME_SIZE, + root=root, + fs_type=SYSTEM_VOLUME_FS, + owner_uid=None, + owner_gid=None, + ) + try: + fixed_volume.provision(vol, force_recreate=False, logger=self.P) + except Exception as exc: + # Tool presence alone is not enough: hosts can still lack usable loop + # devices, mount privileges, or filesystem support. Container execution + # should continue without advertising the sync volume in that case. + self.P( + f"[sync] system volume unavailable: could not provision " + f"{SYSTEM_VOLUME_NAME}: {exc}. SYNC will be disabled and " + f"R1_SYSTEM_VOLUME env vars will not be exported.", + color="r", + ) + self._sync_unavailable = True + return + + self._sync_unavailable = False + + # Track for shared cleanup (parity with FIXED_SIZE_VOLUMES). + if not hasattr(self, "_fixed_volumes"): + self._fixed_volumes = [] + self._fixed_volumes.append(vol) + + # Ensure the mount root itself stays root-owned/non-app-writable so the + # app cannot replace volume-sync/ with a symlink after startup. This system + # volume is CAR/app control plane, not app data, so it deliberately ignores + # the image USER ownership used for FIXED_SIZE_VOLUMES. + try: + os.chown(str(vol.mount_path), 0, 0) + os.chmod(str(vol.mount_path), 0o755) + except OSError as exc: + self.P( + f"[sync] could not enforce root-owned {vol.mount_path} mode 0o755: " + f"{exc}. SYNC will be disabled and R1_SYSTEM_VOLUME env vars will not " + f"be exported.", + color="r", + ) + self._sync_unavailable = True + return + + # Ensure volume-sync subdir exists before container start so the app + # can drop a request.json on its first tick. If a previous run left a + # symlink or non-directory here while /r1en_system was writable, remove it + # and recreate a real directory before exposing env vars to the container. + # Mode 1777 keeps it app-writable while preventing non-owners from + # deleting CAR-owned response/last_apply temp files. + vsd = volume_sync_dir(self) + try: + try: + st = os.lstat(str(vsd)) + except FileNotFoundError: + st = None + if st is not None and ( + stat.S_ISLNK(st.st_mode) or not stat.S_ISDIR(st.st_mode) + ): + os.unlink(str(vsd)) + os.makedirs(str(vsd), exist_ok=True) + st = os.lstat(str(vsd)) + if stat.S_ISLNK(st.st_mode) or not stat.S_ISDIR(st.st_mode): + raise RuntimeError(f"{vsd} is not a real directory") + except Exception as exc: + self.P( + f"[sync] volume-sync directory unsafe/unavailable: {exc}. " + f"SYNC will be disabled and R1_SYSTEM_VOLUME env vars will not be exported.", + color="r", + ) + self._sync_unavailable = True + return + try: + os.chown(str(vsd), 0, 0) + os.chmod(str(vsd), 0o1777) + except OSError as exc: + self.P( + f"[sync] could not enforce root-owned {vsd} mode 0o1777: {exc}. " + f"SYNC will be disabled and R1_SYSTEM_VOLUME env vars will not be exported.", + color="r", + ) + self._sync_unavailable = True + return + self.volumes.update(fixed_volume.docker_bind_spec(vol, SYSTEM_VOLUME_MOUNT)) + self.P( + f"[sync] system volume ready: {vol.mount_path} -> {SYSTEM_VOLUME_MOUNT} " + f"(volume-sync at {vsd})", + color="g", + ) + + # ----- env-var injection ----------------------------------------------- + + def _inject_sync_env_vars(self): + """Add the ``R1_*`` env vars to the container's environment. + + ``R1_SYSTEM_VOLUME`` / ``R1_VOLUME_SYNC_DIR`` / ``R1_SYNC_REQUEST_FILE`` + are always set so apps can write the request unconditionally; CAR just + won't act on it without ``SYNC.ENABLED``. 
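+
+    For a sync-enabled provider instance the container would typically see
+    something like (paths assume the default ``SYSTEM_VOLUME_MOUNT`` and
+    ``VOLUME_SYNC_SUBDIR`` constants; the key is illustrative)::
+
+        R1_SYSTEM_VOLUME=/r1en_system
+        R1_VOLUME_SYNC_DIR=/r1en_system/volume-sync
+        R1_SYNC_REQUEST_FILE=/r1en_system/volume-sync/request.json
+        R1_SYNC_TYPE=provider
+        R1_SYNC_KEY=11111111-1111-1111-1111-111111111111
+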
``R1_SYNC_TYPE`` and + ``R1_SYNC_KEY`` are only set when SYNC is enabled so apps that want to + branch on role can. + + If ``_sync_unavailable`` was set during ``_configure_system_volume`` + (host tools missing), inject nothing — advertising a mount that was + never provisioned would route the app's writes into a phantom path. + """ + if not isinstance(getattr(self, "env", None), dict): + return + if getattr(self, "_sync_unavailable", False): + return + self.env["R1_SYSTEM_VOLUME"] = SYSTEM_VOLUME_MOUNT + self.env["R1_VOLUME_SYNC_DIR"] = f"{SYSTEM_VOLUME_MOUNT}/{VOLUME_SYNC_SUBDIR}" + self.env["R1_SYNC_REQUEST_FILE"] = ( + f"{SYSTEM_VOLUME_MOUNT}/{VOLUME_SYNC_SUBDIR}/{SYNC_REQUEST_FILE}" + ) + if self._sync_enabled(): + sync_type = self.cfg_sync.get("TYPE") + sync_key = self.cfg_sync.get("KEY") + if sync_type: + self.env["R1_SYNC_TYPE"] = str(sync_type) + if sync_key: + self.env["R1_SYNC_KEY"] = str(sync_key) + + # ----- config helpers -------------------------------------------------- + + def _sync_cfg(self) -> dict: + cfg = getattr(self, "cfg_sync", None) or {} + return cfg if isinstance(cfg, dict) else {} + + def _sync_enabled(self) -> bool: + if getattr(self, "_sync_unavailable", False): + return False + return bool(self._sync_cfg().get("ENABLED")) + + def _sync_role(self) -> Optional[str]: + role = self._sync_cfg().get("TYPE") + if role in ("provider", "consumer"): + return role + return None + + def _sync_poll_interval(self) -> float: + raw = self._sync_cfg().get("POLL_INTERVAL", 10) + try: + return max(1.0, float(raw)) + except (TypeError, ValueError): + return 10.0 + + # ----- hsync interval (consumer only) ---------------------------------- + # Decoupled from POLL_INTERVAL: every consumer tick still does the cheap + # chainstore_hget against the local replica, but the expensive network + # hsync is gated by this interval. Provider does not call hsync. + _HSYNC_POLL_INTERVAL_MIN = 10.0 + _HSYNC_POLL_INTERVAL_DEFAULT = 60.0 + + def _hsync_poll_interval(self) -> float: + """Seconds between chainstore_hsync refreshes on the consumer side. + + Min 10s, default 60s. Non-numeric values fall back to the default; + values below the min are clamped up. ``fetch_latest`` still reads the + cheap local replica every tick; this only gates network hsync. + """ + raw = self._sync_cfg().get("HSYNC_POLL_INTERVAL", self._HSYNC_POLL_INTERVAL_DEFAULT) + try: + v = float(raw) + except (TypeError, ValueError): + return self._HSYNC_POLL_INTERVAL_DEFAULT + return max(self._HSYNC_POLL_INTERVAL_MIN, v) + + # convenience for SyncManager (it reads owner.cfg_sync_key) + @property + def cfg_sync_key(self): + return self._sync_cfg().get("KEY") + + @property + def cfg_sync_type(self): + return self._sync_cfg().get("TYPE") + + @property + def cfg_sync_hsync_poll_interval(self) -> float: + """Mirror of ``_hsync_poll_interval()`` accessible by ``SyncManager`` + via ``owner.cfg_sync_hsync_poll_interval`` (same convention as + ``cfg_sync_key`` / ``cfg_sync_type``). 
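+
+    Worked example of the clamping in ``_hsync_poll_interval``:
+    ``"HSYNC_POLL_INTERVAL": 30`` yields 30.0, ``3`` is clamped up to 10.0,
+    and a non-numeric value such as ``"soon"`` falls back to 60.0.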
+ """ + return self._hsync_poll_interval() + + @property + def cfg_sync_allow_online_provider_capture(self) -> bool: + """Provider-local opt-in for Docker archive capture from live containers.""" + raw = self._sync_cfg().get("ALLOW_ONLINE_PROVIDER_CAPTURE", False) + if isinstance(raw, bool): + return raw + if isinstance(raw, str): + value = raw.strip().lower() + if value in ("1", "true", "yes", "on"): + return True + if value in ("0", "false", "no", "off", ""): + return False + if isinstance(raw, int) and raw in (0, 1): + return bool(raw) + self.P( + f"[sync] invalid ALLOW_ONLINE_PROVIDER_CAPTURE value {raw!r}; using False", + color="y", + ) + return False + + # ----- manager handle --------------------------------------------------- + + def _ensure_sync_manager(self) -> Optional[SyncManager]: + """Lazy-init the SyncManager. Returns None if SYNC is not enabled.""" + if not self._sync_enabled(): + return None + sm = getattr(self, "_sync_manager", None) + if sm is None: + sm = SyncManager(self) + self._sync_manager = sm + return sm + + # ----- recovery on plugin init ----------------------------------------- + + def _recover_stale_processing(self): + """Rename any orphan request.json.processing back to request.json. + + Called from the plugin's on_init so a crash mid-publish doesn't leave + a request stuck. The next provider tick will then re-claim it. + """ + control_file = JsonControlFile( + volume_sync_dir(self), SYNC_REQUEST_FILE, SYNC_PROCESSING_FILE + ) + proc = control_file.processing_path + req = control_file.pending_path + try: + recovered = control_file.recover_stale_processing() + except (OSError, JsonControlFileUnsafeError) as exc: + self.P( + f"[sync] failed to recover orphan .processing: {exc}", color="r" + ) + return + if recovered: + self.P( + f"[sync] recovered orphan {proc.name} -> {req.name} for retry", + color="y", + ) + + # ----- provider tick --------------------------------------------------- + + def _sync_provider_tick(self, current_time: float) -> None: + """If a pending request.json exists, run the full publish flow. + + Drives runtime stop → publish_snapshot → start_container inline. + Always returns ``None`` — must NOT use a StopReason because that would + route through ``_restart_container``, which unmounts the system volume + before we can read from it (see plan Step 1 verification). 
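+
+    A minimal ``request.json`` an app might drop to trigger a publish
+    (shape mirrors what ``claim_request`` parses; values are illustrative)::
+
+        {"archive_paths": ["/app/data/"], "metadata": {"note": "nightly"}}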
+ """ + sm = self._ensure_sync_manager() + if sm is None or self._sync_role() != "provider": + return + if not self._sync_should_tick(current_time): + return + + control_file = JsonControlFile( + volume_sync_dir(self), SYNC_REQUEST_FILE, SYNC_PROCESSING_FILE + ) + claimed = sm.claim_request() + if claimed is None: + # claim_request already wrote .invalid + response.json + return + self.P( + f"[sync] provider tick: claimed {control_file.processing_name} for publish", + color="b", + ) + + stopped_for_sync = claimed.runtime.provider_capture == "offline" + if stopped_for_sync: + stopped = self._stop_container_runtime_for_restart() + if not stopped: + request_body = { + "archive_paths": list(claimed.archive_paths), + "metadata": dict(claimed.metadata), + "runtime": runtime_policy_to_dict(claimed.runtime), + } + sm._fail_request( + request_body, + STAGE_RUNTIME_STOP, + "could not stop/remove container for offline provider capture", + control_file.processing_path, + ) + return + + try: + sm.publish_snapshot(claimed) + except Exception as exc: + # SyncManager.publish_snapshot has internal try/except for every + # stage, but we still wrap to guarantee we always restart the + # container even if something truly unexpected escapes. + self.P(f"[sync] publish_snapshot raised unexpectedly: {exc}", color="r") + + if stopped_for_sync: + self._sync_safe_start_container() + + # ----- consumer tick --------------------------------------------------- + + def _sync_consumer_tick(self, current_time: float) -> None: + """If the ChainStore record points at a different CID than what we last + applied, fetch+extract+restart inline. Identity is the CID, not the + version: the CID is content-addressed and uniquely identifies the + bundle, while ``version`` is informational metadata only (kept for + filename ordering + human-readable logs). Comparing CIDs eliminates + a class of clock-skew failure modes (a provider's wonky timestamp + can never make a consumer permanently ignore a corrected snapshot) + and makes multi-provider sync sets coherent without ordering + assumptions. 
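+
+    Example: if ``last_apply.json`` records cid ``QmAAA`` and ChainStore now
+    advertises ``QmBBB``, the new bundle is applied even if its ``version``
+    is lower; if ChainStore still advertises ``QmAAA``, the tick is a no-op
+    regardless of version (CIDs here are illustrative).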
+ """ + sm = self._ensure_sync_manager() + if sm is None or self._sync_role() != "consumer": + return + if not self._sync_should_tick(current_time): + return + + record = sm.fetch_latest() + if not isinstance(record, dict): + return + record_cid = record.get("cid") + if not record_cid: + return + + latest_local = sm.latest_applied() + last_cid = (latest_local or {}).get("cid") if latest_local else None + if last_cid and record_cid == last_cid: + return # same bundle as the last apply — nothing to do + + quarantined = sm.quarantined_record(record) + if quarantined is not None: + self.P( + f"[sync] skipping quarantined consumer cid={record_cid} until " + f"{quarantined.get('next_retry_after')}: {quarantined.get('error')}", + color="y", + ) + return + + prepared = sm.prepare_apply(record) + if prepared is None: + return + + self.P( + f"[sync] consumer tick: applying cid={record_cid} " + f"(v{record.get('version')})", + color="b", + ) + + apply_mode = self._sync_consumer_apply_mode(record) + if apply_mode == CONSUMER_APPLY_OFFLINE_RESTART: + stopped = self._stop_container_runtime_for_restart() + if not stopped: + self.P( + f"[sync] aborting consumer apply for cid={record_cid}: " + "could not stop/remove container for offline apply", + color="r", + ) + sm._cleanup_tree(prepared.staging_dir) + return + + applied = False + restart_safe = True + try: + result = sm.commit_prepared_apply(prepared) + applied = bool(result.success) + restart_safe = bool(result.restart_safe) + if applied: + sm._finalize_apply_success(record, result.extracted_paths) + except Exception as exc: + restart_safe = False + self.P(f"[sync] commit_prepared_apply raised unexpectedly: {exc}", color="r") + + if apply_mode == CONSUMER_APPLY_OFFLINE_RESTART and restart_safe: + self._sync_safe_start_container() + elif apply_mode == CONSUMER_APPLY_OFFLINE_RESTART: + self.P( + f"[sync] leaving container stopped after uncertain apply for cid={record_cid}", + color="r", + ) + elif apply_mode == CONSUMER_APPLY_ONLINE_RESTART and applied: + stopped = self._stop_container_runtime_for_restart() + if stopped: + self._sync_safe_start_container() + else: + self.P( + f"[sync] post-apply restart failed to stop container for cid={record_cid}", + color="r", + ) + + # ----- internal helpers ------------------------------------------------ + + def _sync_consumer_apply_mode(self, record: Optional[dict] = None) -> str: + """Return the consumer-local lifecycle policy for snapshot apply. + + Provider-published records may carry the requester's desired + ``runtime.consumer_apply`` for audit/UI purposes, but lifecycle safety is + decided by the consumer node. A provider must not be able to force a + running consumer to hot-apply files. Online consumer modes are accepted for + compatibility but normalized to offline restart until extraction is + descriptor-safe against app-side path races. 
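+
+    For example, a local config of ``CONSUMER_APPLY_MODE =
+    CONSUMER_APPLY_ONLINE_RESTART`` is normalized to
+    ``CONSUMER_APPLY_OFFLINE_RESTART`` and the decision is recorded in
+    ``_sync_last_apply_mode_resolution`` with reason ``"online_apply_disabled"``.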
+ """ + mode = self._sync_cfg().get("CONSUMER_APPLY_MODE", CONSUMER_APPLY_OFFLINE_RESTART) + allowed = { + CONSUMER_APPLY_OFFLINE_RESTART, + CONSUMER_APPLY_ONLINE_NO_RESTART, + CONSUMER_APPLY_ONLINE_RESTART, + } + if mode not in allowed: + self.P( + f"[sync] unknown local CONSUMER_APPLY_MODE {mode!r}; using " + f"{CONSUMER_APPLY_OFFLINE_RESTART!r}", + color="y", + ) + self._sync_last_apply_mode_resolution = { + "requested_mode": mode, + "effective_mode": CONSUMER_APPLY_OFFLINE_RESTART, + "reason": "unknown_mode", + } + return CONSUMER_APPLY_OFFLINE_RESTART + if mode in {CONSUMER_APPLY_ONLINE_NO_RESTART, CONSUMER_APPLY_ONLINE_RESTART}: + self.P( + f"[sync] local CONSUMER_APPLY_MODE {mode!r} is currently disabled for " + f"filesystem safety; using {CONSUMER_APPLY_OFFLINE_RESTART!r}", + color="y", + ) + self._sync_last_apply_mode_resolution = { + "requested_mode": mode, + "effective_mode": CONSUMER_APPLY_OFFLINE_RESTART, + "reason": "online_apply_disabled", + } + return CONSUMER_APPLY_OFFLINE_RESTART + self._sync_last_apply_mode_resolution = { + "requested_mode": mode, + "effective_mode": mode, + "reason": None, + } + return mode + + def _sync_record_consumer_apply_mode(self, record: dict) -> str: + """Backward-compatible wrapper for tests/older call sites.""" + return self._sync_consumer_apply_mode(record) + + def _sync_should_tick(self, current_time: float) -> bool: + last = getattr(self, "_last_sync_check", 0.0) or 0.0 + if current_time - last < self._sync_poll_interval(): + return False + self._last_sync_check = current_time + return True + + def _sync_safe_start_container(self) -> None: + """Restart the container after a sync slice. Failures are logged, not + raised, because the periodic loop will retry and ``_check_container_status`` + will pick up a still-stopped container on the next pass. + + Calls ``_reset_runtime_state_post_start`` after the start so that + readiness gates, health-probe timers, log capture, and + BUILD_AND_RUN_COMMANDS all re-engage against the freshly-started + container — same contract ``_restart_container`` follows. Without this, + tunnels stay marked ready, health checks are skipped, log streams are + stale, and image-defined startup commands don't rerun. + + The reset is guarded by its own try/except so a failed reset does not + roll back a successful start — the next periodic tick can re-evaluate + readiness. 
+ """ + try: + self.start_container() + except Exception as exc: + self.P(f"[sync] start_container after sync slice failed: {exc}", color="r") + return + try: + self._reset_runtime_state_post_start() + except Exception as exc: + self.P( + f"[sync] runtime-state reset after sync slice failed: {exc}", color="r" + ) diff --git a/extensions/business/container_apps/tests/support.py b/extensions/business/container_apps/tests/support.py index fce7f5a8..dc09bf67 100644 --- a/extensions/business/container_apps/tests/support.py +++ b/extensions/business/container_apps/tests/support.py @@ -8,6 +8,41 @@ import numpy as _np +def install_docker_stub_if_needed(): + """Provide the tiny docker-py surface these unit tests need.""" + if "docker" in sys.modules and "docker.errors" in sys.modules and "docker.types" in sys.modules: + return + + docker_mod = types.ModuleType("docker") + errors_mod = types.ModuleType("docker.errors") + types_mod = types.ModuleType("docker.types") + + class DockerException(Exception): + pass + + class NotFound(DockerException): + pass + + class DeviceRequest: + def __init__(self, **kwargs): + self.kwargs = kwargs + + errors_mod.DockerException = DockerException + errors_mod.NotFound = NotFound + types_mod.DeviceRequest = DeviceRequest + docker_mod.errors = errors_mod + docker_mod.types = types_mod + docker_mod.from_env = MagicMock() + + sys.modules.setdefault("docker", docker_mod) + sys.modules.setdefault("docker.errors", errors_mod) + sys.modules.setdefault("docker.types", types_mod) + return + + +install_docker_stub_if_needed() + + class _DummyBasePlugin: CONFIG = {'VALIDATION_RULES': {}} @@ -210,6 +245,7 @@ def _log(*args, **kwargs): plugin.cfg_extra_tunnels_ping_interval = 30 plugin.cfg_health_check = {} plugin.cfg_restart_policy = "always" + plugin.cfg_plugin_stop_timeout = 45 plugin.volumes = {} plugin.extra_ports_mapping = {} plugin.inverted_ports_mapping = {} @@ -225,6 +261,8 @@ def _log(*args, **kwargs): plugin._health_probing_disabled = False plugin._normalized_exposed_ports = {} plugin._normalized_main_exposed_port = None + plugin._cleanup_failed = False + plugin._manual_stop_pending = False plugin.container = object() plugin.container_name = "car_instance" plugin.log = types.SimpleNamespace(get_localhost_ip=lambda: "127.0.0.1") @@ -328,6 +366,8 @@ def make_lifecycle_runner(docker_client=None, mock_container=None, **cfg_overrid # State machine plugin.container_state = ContainerState.UNINITIALIZED plugin.stop_reason = StopReason.UNKNOWN + plugin._cleanup_failed = False + plugin._manual_stop_pending = False # Restart/backoff plugin._consecutive_failures = 0 @@ -371,6 +411,7 @@ def make_lifecycle_runner(docker_client=None, mock_container=None, **cfg_overrid plugin._last_extra_tunnels_ping = 0 plugin._last_paused_log = 0 plugin.cfg_paused_state_log_interval = 60 + plugin.cfg_plugin_stop_timeout = 45 plugin.cfg_show_log_each = 60 plugin.cfg_show_log_last_lines = 5 plugin.cfg_semaphore_log_interval = 10 diff --git a/extensions/business/container_apps/tests/test_container_lifecycle.py b/extensions/business/container_apps/tests/test_container_lifecycle.py index 317b827b..dc99c698 100644 --- a/extensions/business/container_apps/tests/test_container_lifecycle.py +++ b/extensions/business/container_apps/tests/test_container_lifecycle.py @@ -10,9 +10,14 @@ """ import unittest +import subprocess from pathlib import Path from unittest.mock import patch, MagicMock +from extensions.business.container_apps.tests.support import install_docker_stub_if_needed + 
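+# Register the docker stub before the `import docker.errors` below so these
+# tests import cleanly even when the real docker-py package is not installed.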
+install_docker_stub_if_needed() + import docker.errors import docker.types @@ -468,6 +473,107 @@ def test_process_respects_max_retries(self): errors = [m for m in plugin.logged_messages if "abandoned" in m.lower()] self.assertTrue(len(errors) > 0) + def test_process_retries_failed_cleanup_then_restarts(self): + """A transient cleanup failure must not permanently block process().""" + clock = {"now": 100} + plugin, client, _ = make_lifecycle_runner(cfg_restart_backoff_initial=0) + plugin.time = lambda: clock["now"] + plugin._cleanup_failed = True + plugin.container_state = ContainerState.FAILED + + attempts = {"count": 0} + + def retry_cleanup(): + attempts["count"] += 1 + plugin._cleanup_failed = attempts["count"] == 1 + return not plugin._cleanup_failed + + plugin._stop_container_and_save_logs_to_disk = retry_cleanup + + plugin.process() + self.assertTrue(plugin._cleanup_failed) + client.containers.run.assert_not_called() + + with _patch_docker_module(client): + plugin.process() + + self.assertFalse(plugin._cleanup_failed) + client.containers.run.assert_called_once() + self.assertEqual(plugin.container_state, ContainerState.RUNNING) + + def test_manual_stop_persists_only_after_cleanup_success(self): + plugin, _, _ = make_lifecycle_runner(cfg_restart_backoff_initial=0) + plugin._save_persistent_state = MagicMock() + plugin._clear_manual_stop_state = MagicMock() + plugin._stop_container_and_save_logs_to_disk = MagicMock(return_value=False) + + plugin.on_command("STOP") + + plugin._save_persistent_state.assert_not_called() + plugin._clear_manual_stop_state.assert_called_once() + self.assertTrue(plugin._manual_stop_pending) + self.assertEqual(plugin.container_state, ContainerState.FAILED) + + def test_pending_manual_stop_pauses_after_cleanup_retry_success(self): + plugin, client, _ = make_lifecycle_runner(cfg_restart_backoff_initial=0) + plugin._cleanup_failed = True + plugin._manual_stop_pending = True + plugin.container_state = ContainerState.FAILED + plugin._save_persistent_state = MagicMock() + plugin._stop_container_and_save_logs_to_disk = MagicMock(return_value=True) + + plugin.process() + + plugin._save_persistent_state.assert_called_once_with(manually_stopped=True) + client.containers.run.assert_not_called() + self.assertFalse(plugin._manual_stop_pending) + self.assertEqual(plugin.container_state, ContainerState.PAUSED) + + def test_restart_clears_pending_manual_stop_before_cleanup_retry(self): + plugin, client, _ = make_lifecycle_runner(cfg_restart_backoff_initial=0) + plugin._manual_stop_pending = True + plugin._cleanup_failed = True + plugin._save_persistent_state = MagicMock() + plugin._clear_manual_stop_state = MagicMock() + attempts = {"count": 0} + + def cleanup(): + attempts["count"] += 1 + plugin._cleanup_failed = attempts["count"] == 1 + return not plugin._cleanup_failed + + plugin._stop_container_and_save_logs_to_disk = cleanup + + plugin.on_command("RESTART") + + plugin._clear_manual_stop_state.assert_called_once() + plugin._save_persistent_state.assert_not_called() + self.assertFalse(plugin._manual_stop_pending) + self.assertTrue(plugin._cleanup_failed) + + with _patch_docker_module(client): + plugin.process() + + plugin._save_persistent_state.assert_not_called() + self.assertFalse(plugin._cleanup_failed) + self.assertEqual(plugin.container_state, ContainerState.RUNNING) + + def test_config_restart_respects_pending_manual_stop_cleanup(self): + plugin, _, _ = make_lifecycle_runner(cfg_restart_backoff_initial=0) + plugin._manual_stop_pending = True + plugin._cleanup_failed 
= True + plugin._save_persistent_state = MagicMock() + plugin._stop_container_and_save_logs_to_disk = MagicMock(return_value=True) + restart_callable = MagicMock() + + plugin._handle_config_restart(restart_callable) + + restart_callable.assert_not_called() + plugin._save_persistent_state.assert_called_once_with(manually_stopped=True) + self.assertFalse(plugin._manual_stop_pending) + self.assertFalse(plugin._cleanup_failed) + self.assertEqual(plugin.container_state, ContainerState.PAUSED) + def test_process_multiple_iterations_running(self): """Multiple process() calls with a healthy container should all succeed.""" plugin, _, container = make_lifecycle_runner() @@ -480,6 +586,45 @@ def test_process_multiple_iterations_running(self): self.assertEqual(plugin.container_state, ContainerState.RUNNING) +class _FakeProcess: + def __init__(self): + self.terminated = False + self.killed = False + self.wait_calls = 0 + + def poll(self): + return 0 if self.killed else None + + def terminate(self): + self.terminated = True + return + + def kill(self): + self.killed = True + return + + def wait(self, timeout=None): + self.wait_calls += 1 + if self.wait_calls == 1: + raise subprocess.TimeoutExpired(cmd="fake", timeout=timeout) + self.killed = True + return 0 + + +class TestTunnelCompatibilityFallbacks(unittest.TestCase): + """The edge PR must work even before the matching core PR is deployed.""" + + def test_local_subprocess_termination_fallback_without_core_helper(self): + plugin, _, _ = make_lifecycle_runner() + process = _FakeProcess() + + with patch("extensions.business.container_apps.container_app_runner.os.name", "nt"): + self.assertTrue(plugin._terminate_subprocess_tree(process, terminate_timeout=0, kill_timeout=0)) + + self.assertTrue(process.terminated) + self.assertTrue(process.killed) + + # =========================================================================== # Fixed-Size Volume Integration # =========================================================================== diff --git a/extensions/business/container_apps/tests/test_fixed_volume.py b/extensions/business/container_apps/tests/test_fixed_volume.py index cdc37bf7..20845054 100644 --- a/extensions/business/container_apps/tests/test_fixed_volume.py +++ b/extensions/business/container_apps/tests/test_fixed_volume.py @@ -250,6 +250,67 @@ def test_handles_missing_metadata(self, mock_run): # Should not raise even if meta_path doesn't exist cleanup(vol) + @patch("extensions.business.container_apps.fixed_volume._run") + def test_missing_metadata_with_mounted_path_reports_failure(self, mock_run): + vol = FixedVolume(name="data", size="100M", root=Path("/r")) + proc = f"/dev/sdb1 {vol.mount_path} ext4 rw 0 0\n" + with patch.object(Path, "exists", return_value=False), \ + patch("builtins.open", mock_open(read_data=proc)): + result = cleanup(vol) + self.assertFalse(result) + mock_run.assert_called_once_with(["umount", str(vol.mount_path)], logger=None) + + @patch("extensions.business.container_apps.fixed_volume._run") + def test_missing_metadata_with_loop_mount_recovers_and_detaches(self, mock_run): + vol = FixedVolume(name="data", size="100M", root=Path("/r")) + with patch.object(Path, "exists", return_value=False), \ + patch("extensions.business.container_apps.fixed_volume._get_mount_source", return_value="/dev/loop7"), \ + patch("extensions.business.container_apps.fixed_volume._is_path_mounted", return_value=False): + result = cleanup(vol) + self.assertTrue(result) + self.assertEqual( + [call_args.args[0] for call_args in 
mock_run.call_args_list], + [["umount", str(vol.mount_path)], ["losetup", "-d", "/dev/loop7"]], + ) + + @patch("extensions.business.container_apps.fixed_volume._run") + def test_malformed_metadata_reports_failure(self, mock_run): + vol = FixedVolume(name="data", size="100M", root=Path("/r")) + with patch.object(Path, "exists", return_value=True), \ + patch.object(Path, "read_text", return_value="{not-json"), \ + patch("builtins.open", mock_open(read_data="")): + result = cleanup(vol) + self.assertFalse(result) + + @patch("extensions.business.container_apps.fixed_volume._run") + def test_malformed_metadata_with_loop_mount_recovers_and_detaches(self, mock_run): + vol = FixedVolume(name="data", size="100M", root=Path("/r")) + with patch.object(Path, "exists", return_value=True), \ + patch.object(Path, "read_text", return_value="{not-json"), \ + patch("extensions.business.container_apps.fixed_volume._get_mount_source", return_value="/dev/loop9"), \ + patch("extensions.business.container_apps.fixed_volume._is_path_mounted", return_value=False): + result = cleanup(vol) + self.assertTrue(result) + self.assertEqual( + [call_args.args[0] for call_args in mock_run.call_args_list], + [["umount", str(vol.mount_path)], ["losetup", "-d", "/dev/loop9"]], + ) + + @patch("extensions.business.container_apps.fixed_volume._run") + def test_mounted_loop_source_overrides_stale_metadata_loop(self, mock_run): + vol = FixedVolume(name="data", size="100M", root=Path("/r")) + meta = {"loop_dev": "/dev/loop3"} + with patch.object(Path, "exists", return_value=True), \ + patch.object(Path, "read_text", return_value=json.dumps(meta)), \ + patch("extensions.business.container_apps.fixed_volume._get_mount_source", return_value="/dev/loop7"), \ + patch("extensions.business.container_apps.fixed_volume._is_path_mounted", return_value=False): + result = cleanup(vol) + self.assertTrue(result) + self.assertEqual( + [call_args.args[0] for call_args in mock_run.call_args_list], + [["umount", str(vol.mount_path)], ["losetup", "-d", "/dev/loop7"]], + ) + @patch("extensions.business.container_apps.fixed_volume._run") def test_handles_umount_failure(self, mock_run): vol = FixedVolume(name="data", size="100M", root=Path("/tmp/fv")) @@ -432,7 +493,18 @@ def test_calls_cleanup_for_each_volume(self, mock_cleanup): self.assertEqual(plugin._fixed_volumes, []) @patch("extensions.business.container_apps.fixed_volume.cleanup", - side_effect=[Exception("fail"), None]) + side_effect=[False, True]) + def test_retains_volume_when_cleanup_returns_false(self, mock_cleanup): + plugin = make_container_app_runner() + vol1 = FixedVolume(name="a", size="50M", root=Path("/r")) + vol2 = FixedVolume(name="b", size="50M", root=Path("/r")) + plugin._fixed_volumes = [vol1, vol2] + self.assertFalse(plugin._cleanup_fixed_size_volumes()) + self.assertEqual(mock_cleanup.call_count, 2) + self.assertEqual(plugin._fixed_volumes, [vol1]) + + @patch("extensions.business.container_apps.fixed_volume.cleanup", + side_effect=[Exception("fail"), True]) def test_continues_on_failure(self, mock_cleanup): plugin = make_container_app_runner() vol1 = FixedVolume(name="a", size="50M", root=Path("/r")) @@ -440,7 +512,7 @@ def test_continues_on_failure(self, mock_cleanup): plugin._fixed_volumes = [vol1, vol2] plugin._cleanup_fixed_size_volumes() # should not raise self.assertEqual(mock_cleanup.call_count, 2) - self.assertEqual(plugin._fixed_volumes, []) + self.assertEqual(plugin._fixed_volumes, [vol1]) if __name__ == "__main__": diff --git 
a/extensions/business/container_apps/tests/test_sync_control_files.py b/extensions/business/container_apps/tests/test_sync_control_files.py new file mode 100644 index 00000000..d1530db5 --- /dev/null +++ b/extensions/business/container_apps/tests/test_sync_control_files.py @@ -0,0 +1,228 @@ +"""Unit tests for sync JSON control-file mechanics.""" + +import json +import os +import tempfile +import unittest +from pathlib import Path +from unittest.mock import patch + +from extensions.business.container_apps.sync.control_files import ( + JsonControlFile, + JsonControlFileDecodeError, + JsonControlFileObjectError, + JsonControlFileUnsafeError, + write_json_atomic, +) + + +class TestWriteJsonAtomic(unittest.TestCase): + def setUp(self): + self._tmp = tempfile.TemporaryDirectory() + self.root = Path(self._tmp.name) + + def tearDown(self): + self._tmp.cleanup() + + def test_writes_json_atomically_with_app_readable_mode(self): + target = self.root / "nested" / "response.json" + + write_json_atomic(target, {"status": "ok", "version": 2}) + + self.assertEqual(json.loads(target.read_text()), {"status": "ok", "version": 2}) + self.assertEqual(os.stat(target).st_mode & 0o777, 0o644) + self.assertEqual(list(target.parent.glob(".response.json.*.tmp")), []) + + def test_cleans_tmp_file_on_write_failure(self): + target = self.root / "state.json" + + with patch( + "extensions.business.container_apps.sync.control_files.json.dump", + side_effect=RuntimeError("boom"), + ): + with self.assertRaises(RuntimeError): + write_json_atomic(target, {"status": "ok"}) + + self.assertFalse(target.exists()) + self.assertEqual(list(self.root.glob(".state.json.*.tmp")), []) + + def test_rejects_symlink_parent_directory(self): + outside = self.root / "outside" + outside.mkdir() + control_root = self.root / "volume-sync" + os.symlink(str(outside), str(control_root)) + + with self.assertRaises(JsonControlFileUnsafeError): + write_json_atomic(control_root / "response.json", {"status": "ok"}) + + self.assertFalse((outside / "response.json").exists()) + + +class TestJsonControlFile(unittest.TestCase): + def setUp(self): + self._tmp = tempfile.TemporaryDirectory() + self.root = Path(self._tmp.name) + self.control = JsonControlFile( + self.root, "request.json", "request.json.processing" + ) + + def tearDown(self): + self._tmp.cleanup() + + def test_claim_object_returns_none_when_absent(self): + self.assertIsNone(self.control.claim_object()) + + def test_claim_object_renames_and_parses_pending_file(self): + (self.root / "request.json").write_text( + '{"archive_paths":["/app/data/"],"metadata":{"k":1}}' + ) + + claimed = self.control.claim_object() + + self.assertIsNotNone(claimed) + self.assertEqual(claimed.body["archive_paths"], ["/app/data/"]) + self.assertEqual(claimed.body["metadata"], {"k": 1}) + self.assertFalse((self.root / "request.json").exists()) + self.assertTrue((self.root / "request.json.processing").is_file()) + self.assertEqual(claimed.processing_path, self.root / "request.json.processing") + + def test_claim_object_reports_malformed_json_with_raw_body(self): + (self.root / "request.json").write_text("not-json{") + + with self.assertRaises(JsonControlFileDecodeError) as ctx: + self.control.claim_object() + + self.assertEqual(ctx.exception.raw_body, "not-json{") + self.assertTrue((self.root / "request.json.processing").is_file()) + + def test_claim_object_reports_invalid_utf8_without_raw_body(self): + (self.root / "request.json").write_bytes(b'{"archive_paths": ["\xff"]}') + + with 
self.assertRaises(JsonControlFileDecodeError) as ctx: + self.control.claim_object() + + self.assertIsNone(ctx.exception.raw_body) + self.assertIn("invalid UTF-8", str(ctx.exception)) + self.assertTrue((self.root / "request.json.processing").is_file()) + + def test_claim_object_reports_non_object_json_with_raw_body(self): + (self.root / "request.json").write_text('["just","a","list"]') + + with self.assertRaises(JsonControlFileObjectError) as ctx: + self.control.claim_object() + + self.assertEqual(ctx.exception.raw_body, '["just","a","list"]') + self.assertIn("request.json must be a JSON object", str(ctx.exception)) + + def test_claim_object_rejects_symlink_without_reading_target(self): + secret = self.root / "secret.txt" + secret.write_text("host-secret") + os.symlink(str(secret), str(self.root / "request.json")) + + with self.assertRaises(JsonControlFileUnsafeError) as ctx: + self.control.claim_object() + + self.assertNotIn("host-secret", str(ctx.exception)) + self.assertIsNone(ctx.exception.raw_body) + self.assertFalse((self.root / "request.json").exists()) + self.assertFalse(os.path.lexists(str(self.root / "request.json.processing"))) + + def test_claim_object_rejects_fifo_without_blocking(self): + fifo = self.root / "request.json" + os.mkfifo(str(fifo)) + + with self.assertRaises(JsonControlFileUnsafeError): + self.control.claim_object() + + self.assertFalse(os.path.lexists(str(fifo))) + self.assertFalse(os.path.lexists(str(self.root / "request.json.processing"))) + + def test_claim_object_quarantines_non_empty_directory_request(self): + request_dir = self.root / "request.json" + request_dir.mkdir() + (request_dir / "payload").write_text("keep me") + + with self.assertRaises(JsonControlFileUnsafeError): + self.control.claim_object() + + self.assertFalse(os.path.lexists(str(request_dir))) + quarantined = list(self.root.glob("request.json.unsafe.*")) + self.assertEqual(len(quarantined), 1) + self.assertEqual((quarantined[0] / "payload").read_text(), "keep me") + + def test_discard_processing_removes_processing_file(self): + (self.root / "request.json.processing").write_text("{}") + + self.control.discard_processing() + + self.assertFalse((self.root / "request.json.processing").exists()) + + def test_discard_processing_removes_broken_symlink(self): + os.symlink(str(self.root / "missing.json"), str(self.root / "request.json.processing")) + + self.control.discard_processing() + + self.assertFalse(os.path.lexists(str(self.root / "request.json.processing"))) + + def test_discard_processing_quarantines_non_empty_directory(self): + proc_dir = self.root / "request.json.processing" + proc_dir.mkdir() + (proc_dir / "payload").write_text("keep me") + + self.control.discard_processing() + + self.assertFalse(os.path.lexists(str(proc_dir))) + quarantined = list(self.root.glob("request.json.processing.unsafe.*")) + self.assertEqual(len(quarantined), 1) + self.assertEqual((quarantined[0] / "payload").read_text(), "keep me") + + def test_recover_stale_processing_removes_symlink(self): + os.symlink(str(self.root / "missing.json"), str(self.root / "request.json.processing")) + + recovered = self.control.recover_stale_processing() + + self.assertFalse(recovered) + self.assertFalse(os.path.lexists(str(self.root / "request.json.processing"))) + + def test_recover_stale_processing_renames_only_orphan(self): + (self.root / "request.json.processing").write_text('{"old":true}') + + recovered = self.control.recover_stale_processing() + + self.assertTrue(recovered) + self.assertTrue((self.root / 
"request.json").is_file()) + self.assertFalse((self.root / "request.json.processing").exists()) + + def test_recover_stale_processing_does_not_overwrite_pending(self): + (self.root / "request.json").write_text('{"new":true}') + (self.root / "request.json.processing").write_text('{"old":true}') + + recovered = self.control.recover_stale_processing() + + self.assertFalse(recovered) + self.assertEqual(json.loads((self.root / "request.json").read_text()), {"new": True}) + self.assertTrue((self.root / "request.json.processing").exists()) + + def test_recover_stale_processing_quarantines_non_regular_directory(self): + proc_dir = self.root / "request.json.processing" + proc_dir.mkdir() + (proc_dir / "payload").write_text("keep me") + + recovered = self.control.recover_stale_processing() + + self.assertFalse(recovered) + self.assertFalse(os.path.lexists(str(proc_dir))) + quarantined = list(self.root.glob("request.json.processing.unsafe.*")) + self.assertEqual(len(quarantined), 1) + self.assertEqual((quarantined[0] / "payload").read_text(), "keep me") + + def test_write_json_writes_relative_to_control_root(self): + self.control.write_json("response.json", {"status": "ok"}) + + self.assertEqual( + json.loads((self.root / "response.json").read_text()), {"status": "ok"} + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/extensions/business/container_apps/tests/test_sync_manager.py b/extensions/business/container_apps/tests/test_sync_manager.py new file mode 100644 index 00000000..ebea0f93 --- /dev/null +++ b/extensions/business/container_apps/tests/test_sync_manager.py @@ -0,0 +1,1869 @@ +"""Unit tests for sync_manager.SyncManager pure helpers. + +Covers the path-validation chokepoint (resolve_container_path), atomic JSON +writes, and history append/latest/update operations using a temporary +plugin-data directory and a stub owner that mimics the BasePlugin surface +the manager depends on. 
+""" + +import json +import os +import io +import stat +import tarfile +import tempfile +import time +import unittest +from pathlib import Path +from types import SimpleNamespace +from unittest.mock import patch + +from extensions.business.container_apps.sync import ( + SYNC_PROCESSING_FILE, + SYSTEM_VOLUME_NAME, + SYSTEM_VOLUME_MOUNT, + SyncManager, + history_received_dir, + history_sent_dir, + sync_state_dir, + system_volume_host_root, + volume_sync_dir, +) +from extensions.business.container_apps.sync.manager import ( + PROVIDER_CAPTURE_ONLINE, + SyncRequest, + SyncRuntimePolicy, +) + + +def _tar_bytes(name: str, content: bytes) -> bytes: + buff = io.BytesIO() + with tarfile.open(fileobj=buff, mode="w") as tar: + info = tarfile.TarInfo(name=name) + info.size = len(content) + info.mode = 0o644 + tar.addfile(info, io.BytesIO(content)) + return buff.getvalue() + + +class _FakeDockerArchiveContainer: + def __init__(self, archives: dict[str, bytes]): + self.archives = dict(archives) + self.get_archive_calls: list[str] = [] + + def get_archive(self, path): + self.get_archive_calls.append(path) + archive = self.archives[path] + name = os.path.basename(path.rstrip("/")) or "/" + return iter([archive]), {"name": name} + + +class _FakeR1FS: + """Minimal r1fs stub for orchestrator tests.""" + + def __init__(self): + self.added: dict[str, bytes] = {} + self.deleted: list[tuple[str, bool, bool]] = [] + self.add_should_raise: Exception | None = None + self.add_should_return_empty = False + self.get_should_raise: Exception | None = None + self.delete_should_raise: Exception | None = None + self._counter = 0 + + def add_file(self, file_path: str) -> str: + if self.add_should_raise: + raise self.add_should_raise + if self.add_should_return_empty: + return "" + self._counter += 1 + cid = f"QmFAKE{self._counter:08d}" + with open(file_path, "rb") as handle: + self.added[cid] = handle.read() + return cid + + def get_file(self, cid: str) -> str: + if self.get_should_raise: + raise self.get_should_raise + if cid not in self.added: + return "" + fd, path = tempfile.mkstemp(suffix=".tar.gz") + with os.fdopen(fd, "wb") as out: + out.write(self.added[cid]) + return path + + def delete_file( + self, + cid: str, + unpin_remote: bool = False, + cleanup_local_files: bool = False, + **_kwargs, + ) -> dict: + if self.delete_should_raise: + raise self.delete_should_raise + self.added.pop(cid, None) + self.deleted.append((cid, unpin_remote, cleanup_local_files)) + return {"ok": True} + + +class _FakeChainStore: + """Minimal chainstore stub: a process-local hkey/key dict.""" + + def __init__(self): + self.store: dict[tuple[str, str], object] = {} + self.hset_calls: list[tuple[str, str, object]] = [] + self.hsync_calls: list[str] = [] + self.hset_should_raise: Exception | None = None + self.hsync_should_raise: Exception | None = None + self.hset_returns: bool = True + + def hset(self, hkey, key, value, **_kwargs): + if self.hset_should_raise: + raise self.hset_should_raise + self.hset_calls.append((hkey, key, value)) + self.store[(hkey, key)] = value + return self.hset_returns + + def hget(self, hkey, key, **_kwargs): + return self.store.get((hkey, key)) + + def hsync(self, hkey, **_kwargs): + self.hsync_calls.append(hkey) + if self.hsync_should_raise: + raise self.hsync_should_raise + return None + + +def _make_owner(tmpdir: Path) -> SimpleNamespace: + """Build a minimal owner stub for SyncManager tests.""" + data_folder = tmpdir / "_local_cache" / "_data" + data_folder.mkdir(parents=True) + instance_subfolder = 
"pipelines_data/test_pipe/test_inst" + + fixed_root = data_folder / instance_subfolder / "fixed_volumes" / "mounts" + fixed_root.mkdir(parents=True) + (fixed_root / SYSTEM_VOLUME_NAME).mkdir() + (fixed_root / "appdata").mkdir() + (fixed_root / "legacy_bind").mkdir() # pretend FILE_VOLUMES path + + volumes = { + str(fixed_root / SYSTEM_VOLUME_NAME): {"bind": SYSTEM_VOLUME_MOUNT, "mode": "rw"}, + str(fixed_root / "appdata"): {"bind": "/app/data", "mode": "rw"}, + # A path that looks like a fixed-size volume but isn't (no fixed_volumes/mounts/ root) + str(tmpdir / "tmpfs_legacy"): {"bind": "/app/legacy", "mode": "rw"}, + } + (tmpdir / "tmpfs_legacy").mkdir() + + output_folder = tmpdir / "output" + output_folder.mkdir() + + msgs: list[str] = [] + r1fs = _FakeR1FS() + cs = _FakeChainStore() + # Track time so each call to time() returns a slightly larger float, which + # lets us emit successive snapshots with distinct version timestamps in + # the same test without sleeping. + clock = [1714742400.0] + def _time(): + clock[0] += 1.0 + return clock[0] + return SimpleNamespace( + get_data_folder=lambda: str(data_folder), + _get_instance_data_subfolder=lambda: instance_subfolder, + get_output_folder=lambda: str(output_folder), + volumes=volumes, + time=_time, + ee_id="ee_test_provider", + cfg_sync_key="11111111-1111-1111-1111-111111111111", + cfg_sync_type="provider", + r1fs=r1fs, + chainstore_hset=cs.hset, + chainstore_hget=cs.hget, + chainstore_hsync=cs.hsync, + P=lambda msg, color=None: msgs.append(f"[{color or ''}] {msg}"), + _msgs=msgs, + _fixed_root=fixed_root, + _output_folder=output_folder, + _r1fs=r1fs, + _cs=cs, + ) + + +# --------------------------------------------------------------------------- +# resolve_container_path +# --------------------------------------------------------------------------- + +class TestResolveContainerPath(unittest.TestCase): + def setUp(self): + self._tmp = tempfile.TemporaryDirectory() + self.tmpdir = Path(self._tmp.name) + self.owner = _make_owner(self.tmpdir) + self.sm = SyncManager(self.owner) + + def tearDown(self): + self._tmp.cleanup() + + def test_happy_path_directory(self): + host, bind, host_root = self.sm.resolve_container_path("/app/data/") + self.assertTrue(host.endswith("fixed_volumes/mounts/appdata")) + self.assertEqual(bind, "/app/data") + self.assertTrue(host_root.endswith("fixed_volumes/mounts/appdata")) + + def test_happy_path_subfile(self): + host, _, _ = self.sm.resolve_container_path("/app/data/foo.bin") + self.assertTrue(host.endswith("fixed_volumes/mounts/appdata/foo.bin")) + + def test_rejects_relative(self): + with self.assertRaisesRegex(ValueError, "must be absolute"): + self.sm.resolve_container_path("app/data/") + + def test_rejects_dotdot(self): + with self.assertRaisesRegex(ValueError, r"must not contain"): + self.sm.resolve_container_path("/app/data/../../etc/passwd") + + def test_rejects_unmounted(self): + with self.assertRaisesRegex(ValueError, "no mounted volume covers"): + self.sm.resolve_container_path("/nope/") + + def test_rejects_anonymous_mount(self): + """Rule 3 admits FIXED_SIZE_VOLUMES and legacy VOLUMES (both per-instance + host directories under known roots). Mounts that aren't under either — + anonymous Docker mounts, FILE_VOLUMES content files, ephemeral container + fs — are still rejected. The fixture's ``/app/legacy`` mount is bound at + ``tmpdir/tmpfs_legacy`` (outside both allow-listed roots) so it stands in + for the "anonymous mount" case here. 
+ """ + with self.assertRaisesRegex(ValueError, "non-volume-backed mount"): + self.sm.resolve_container_path("/app/legacy/x") + + def test_rejects_system_volume(self): + with self.assertRaisesRegex(ValueError, "anti-recursion"): + self.sm.resolve_container_path("/r1en_system/foo") + + def test_rejects_system_volume_root(self): + with self.assertRaisesRegex(ValueError, "anti-recursion"): + self.sm.resolve_container_path("/r1en_system") + + def test_rejects_empty(self): + with self.assertRaisesRegex(ValueError, "non-empty"): + self.sm.resolve_container_path("") + + def test_longest_prefix_wins_for_nested_mounts(self): + """Nested fixed-size mounts (/app and /app/data) must resolve by the most + specific bind, not by dict insertion order. Docker overlays the deeper + mount on top of the broader one inside the container, so a path under + /app/data must resolve to the /app/data mount's host root even when /app + was added to self.volumes first. The previous first-match-wins iteration + silently mapped to the wrong host root (codex review finding 3 on PR #399). + """ + fixed_root = self.owner._fixed_root + (fixed_root / "outer_app").mkdir(exist_ok=True) + (fixed_root / "inner_data").mkdir(exist_ok=True) + # Order matters: insert the broader mount FIRST so first-match-wins would + # pick the wrong one. + self.owner.volumes = { + str(fixed_root / "outer_app"): {"bind": "/app", "mode": "rw"}, + str(fixed_root / "inner_data"): {"bind": "/app/data", "mode": "rw"}, + } + host, bind, host_root = self.sm.resolve_container_path("/app/data/foo.bin") + self.assertTrue(host.endswith("fixed_volumes/mounts/inner_data/foo.bin")) + self.assertEqual(bind, "/app/data") + self.assertTrue(host_root.endswith("fixed_volumes/mounts/inner_data")) + + def test_longest_prefix_wins_regardless_of_insertion_order(self): + """Same as above but with the dict items in the opposite order. The result + must be identical — specificity, not insertion order, decides the winner. + """ + fixed_root = self.owner._fixed_root + (fixed_root / "outer_app").mkdir(exist_ok=True) + (fixed_root / "inner_data").mkdir(exist_ok=True) + self.owner.volumes = { + str(fixed_root / "inner_data"): {"bind": "/app/data", "mode": "rw"}, + str(fixed_root / "outer_app"): {"bind": "/app", "mode": "rw"}, + } + host, bind, _ = self.sm.resolve_container_path("/app/data/foo.bin") + self.assertTrue(host.endswith("fixed_volumes/mounts/inner_data/foo.bin")) + self.assertEqual(bind, "/app/data") + + def test_outer_bind_still_resolves_for_paths_only_it_covers(self): + """Paths that fall under the broader mount but NOT the nested one must + still resolve to the broader mount — longest-prefix-match must not break + legitimate routes through the outer bind. + """ + fixed_root = self.owner._fixed_root + (fixed_root / "outer_app").mkdir(exist_ok=True) + (fixed_root / "inner_data").mkdir(exist_ok=True) + self.owner.volumes = { + str(fixed_root / "outer_app"): {"bind": "/app", "mode": "rw"}, + str(fixed_root / "inner_data"): {"bind": "/app/data", "mode": "rw"}, + } + host, bind, _ = self.sm.resolve_container_path("/app/other.bin") + self.assertTrue(host.endswith("fixed_volumes/mounts/outer_app/other.bin")) + self.assertEqual(bind, "/app") + + def test_legacy_volumes_resolves_to_host_root(self): + """Rule 3 admits legacy VOLUMES. Their host roots live under + CONTAINER_VOLUMES_PATH (/edge_node/_local_cache/_data/container_volumes/), + which is per-instance and bounded — functionally equivalent to + fixed-size for sync purposes. Plan: extend-sync-to-legacy-VOLUMES. 
+ """ + from extensions.business.container_apps.container_utils import ( + CONTAINER_VOLUMES_PATH, + ) + # Place a fake legacy host root and bind it into the volumes dict. + # We can't use the real CONTAINER_VOLUMES_PATH on a CI host without root, + # so monkeypatch it (constants_in_path comparison normalizes the value). + legacy_root = Path(self.tmpdir) / "edge_node" / "_local_cache" / "_data" / "container_volumes" + instance_dir = legacy_root / "test_instance_appdata" + instance_dir.mkdir(parents=True) + + self.owner.volumes = { + str(instance_dir): {"bind": "/app/data", "mode": "rw"}, + } + + # Patch CONTAINER_VOLUMES_PATH on the manager module so the resolver + # accepts our temp legacy root for the duration of the test. + import extensions.business.container_apps.sync.manager as manager_mod + original = manager_mod.CONTAINER_VOLUMES_PATH + manager_mod.CONTAINER_VOLUMES_PATH = str(legacy_root) + try: + host, bind, host_root = self.sm.resolve_container_path("/app/data/foo.bin") + finally: + manager_mod.CONTAINER_VOLUMES_PATH = original + + self.assertTrue(host.endswith("test_instance_appdata/foo.bin")) + self.assertEqual(bind, "/app/data") + self.assertEqual(host_root, str(instance_dir)) + + +# --------------------------------------------------------------------------- +# _write_json_atomic +# --------------------------------------------------------------------------- + +class TestAtomicJsonWrite(unittest.TestCase): + def setUp(self): + self._tmp = tempfile.TemporaryDirectory() + self.tmpdir = Path(self._tmp.name) + self.owner = _make_owner(self.tmpdir) + self.sm = SyncManager(self.owner) + + def tearDown(self): + self._tmp.cleanup() + + def test_writes_json_and_creates_parent(self): + target = self.tmpdir / "deep" / "nested" / "out.json" + self.sm._write_json_atomic(target, {"hello": "world", "n": 7}) + self.assertTrue(target.is_file()) + data = json.loads(target.read_text()) + self.assertEqual(data, {"hello": "world", "n": 7}) + + def test_no_orphan_tmp_on_success(self): + target = self.tmpdir / "out.json" + self.sm._write_json_atomic(target, {"x": 1}) + leftovers = [p for p in self.tmpdir.iterdir() if p.name.startswith(".out.json")] + self.assertEqual(leftovers, [], f"leftover tmps: {leftovers}") + + def test_overwrites_existing(self): + target = self.tmpdir / "out.json" + target.write_text('{"old": true}') + self.sm._write_json_atomic(target, {"new": True}) + self.assertEqual(json.loads(target.read_text()), {"new": True}) + + +# --------------------------------------------------------------------------- +# History readers / writers +# --------------------------------------------------------------------------- + +class TestHistory(unittest.TestCase): + def setUp(self): + self._tmp = tempfile.TemporaryDirectory() + self.tmpdir = Path(self._tmp.name) + self.owner = _make_owner(self.tmpdir) + self.sm = SyncManager(self.owner) + + def tearDown(self): + self._tmp.cleanup() + + def test_filename_pads_version_and_truncates_cid(self): + fname = SyncManager._history_filename(1714742400, "QmHash1234567890ABCDEF") + self.assertEqual(fname, "1714742400__QmHash123456.json") + + def test_filename_handles_short_cid(self): + fname = SyncManager._history_filename(7, "Qm") + self.assertEqual(fname, "0000000007__Qm.json") + + def test_filename_handles_missing_cid(self): + fname = SyncManager._history_filename(7, "") + self.assertEqual(fname, "0000000007__no_cid.json") + + def test_append_sent_writes_under_history_sent(self): + entry = {"cid": "QmAA1", "version": 100, "node_id": "ee_x"} + path = 
self.sm.append_sent(entry) + self.assertEqual(path.parent, history_sent_dir(self.owner)) + self.assertEqual(path.name, "0000000100__QmAA1.json") + data = json.loads(path.read_text()) + self.assertEqual(data["cid"], "QmAA1") + self.assertEqual(data["deletion"], { + "deleted_at": None, "deletion_succeeded": None, "deletion_error": None + }) + + def test_append_received_uses_received_dir(self): + entry = {"cid": "QmBB", "version": 50, "node_id": "ee_y"} + path = self.sm.append_received(entry) + self.assertEqual(path.parent, history_received_dir(self.owner)) + + def test_latest_picks_most_recently_written(self): + """latest_sent / latest_received use mtime, not filename ordering, so a + back-dated version (e.g. clock-skewed provider) doesn't permanently + 'win' over an entry written after it.""" + self.sm.append_sent({"cid": "Qm1", "version": 100}) + # Tiny sleep to guarantee distinct mtimes on filesystems with low + # mtime resolution. + import time as _t; _t.sleep(0.01) + self.sm.append_sent({"cid": "Qm3", "version": 300}) + _t.sleep(0.01) + # Entry written LAST has version=200 — lex-smaller filename than + # Qm3's, but the most recent on disk. mtime sort returns it. + self.sm.append_sent({"cid": "Qm2", "version": 200}) + latest = self.sm.latest_sent() + self.assertIsNotNone(latest) + self.assertEqual(latest["cid"], "Qm2") + self.assertEqual(latest["version"], 200) + + def test_latest_returns_none_when_empty(self): + self.assertIsNone(self.sm.latest_sent()) + self.assertIsNone(self.sm.latest_received()) + + def test_update_history_deletion_modifies_in_place(self): + entry = {"cid": "Qm9", "version": 999} + path = self.sm.append_sent(entry) + + self.sm.update_history_deletion( + history_sent_dir(self.owner), entry, succeeded=True, error=None + ) + data = json.loads(path.read_text()) + self.assertTrue(data["deletion"]["deletion_succeeded"]) + self.assertEqual(data["deletion"]["deletion_error"], None) + self.assertGreater(data["deletion"]["deleted_at"], 1714742400.0) + self.assertEqual(data["cid"], "Qm9") # rest of payload preserved + + def test_update_history_deletion_records_failure(self): + entry = {"cid": "Qm9", "version": 999} + self.sm.append_sent(entry) + self.sm.update_history_deletion( + history_sent_dir(self.owner), entry, succeeded=False, error="daemon down" + ) + path = history_sent_dir(self.owner) / "0000000999__Qm9.json" + data = json.loads(path.read_text()) + self.assertFalse(data["deletion"]["deletion_succeeded"]) + self.assertEqual(data["deletion"]["deletion_error"], "daemon down") + + def test_update_history_deletion_missing_file_logs(self): + entry = {"cid": "QmMissing", "version": 1} + # Don't append; just call update — should log warning, not raise. 
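+    # (Per the _history_filename convention it would look for
+    # "0000000001__QmMissing.json", which was never written.)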
+ self.sm.update_history_deletion( + history_sent_dir(self.owner), entry, succeeded=True, error=None + ) + self.assertTrue(any("history file missing" in m for m in self.owner._msgs)) + + +# --------------------------------------------------------------------------- +# claim_request +# --------------------------------------------------------------------------- + +class TestClaimRequest(unittest.TestCase): + def setUp(self): + self._tmp = tempfile.TemporaryDirectory() + self.tmpdir = Path(self._tmp.name) + self.owner = _make_owner(self.tmpdir) + self.sm = SyncManager(self.owner) + # Provision the volume-sync subdir on the host (mimics _configure_system_volume) + self.vsd = volume_sync_dir(self.owner) + self.vsd.mkdir(parents=True, exist_ok=True) + + def tearDown(self): + self._tmp.cleanup() + + def _write_request(self, body): + (self.vsd / "request.json").write_text(json.dumps(body)) + + def _read_invalid(self): + p = self.vsd / "request.json.invalid" + if not p.exists(): + return None + return json.loads(p.read_text()) + + def _read_response(self): + p = self.vsd / "response.json" + if not p.exists(): + return None + return json.loads(p.read_text()) + + def test_no_pending_returns_none(self): + self.assertIsNone(self.sm.claim_request()) + + def test_happy_path(self): + self._write_request({"archive_paths": ["/app/data/"], "metadata": {"k": 1}}) + result = self.sm.claim_request() + self.assertIsNotNone(result) + self.assertEqual(result.archive_paths, ["/app/data/"]) + self.assertEqual(result.metadata, {"k": 1}) + self.assertEqual(result.runtime.provider_capture, "offline") + self.assertEqual(result.runtime.consumer_apply, "offline_restart") + # request.json gone, .processing present, no .invalid + self.assertFalse((self.vsd / "request.json").exists()) + self.assertTrue((self.vsd / "request.json.processing").exists()) + self.assertIsNone(self._read_invalid()) + + def test_runtime_policy_parsed(self): + self.owner.cfg_sync_allow_online_provider_capture = True + self._write_request({ + "archive_paths": ["/app/data/"], + "runtime": { + "provider_capture": "online", + "consumer_apply": "online_no_restart", + }, + }) + + result = self.sm.claim_request() + + self.assertIsNotNone(result) + self.assertEqual(result.runtime.provider_capture, "online") + self.assertEqual(result.runtime.consumer_apply, "online_no_restart") + + def test_runtime_policy_must_be_object(self): + self._write_request({"archive_paths": ["/app/data/"], "runtime": "online"}) + + self.assertIsNone(self.sm.claim_request()) + + self.assertIn("runtime must be a JSON object", self._read_invalid()["_error"]["error"]) + + def test_invalid_provider_capture_rejected(self): + self._write_request({ + "archive_paths": ["/app/data/"], + "runtime": {"provider_capture": "maybe"}, + }) + + self.assertIsNone(self.sm.claim_request()) + + err = self._read_invalid()["_error"]["error"] + self.assertIn("provider_capture", err) + self.assertIn("maybe", err) + + def test_invalid_consumer_apply_rejected(self): + self._write_request({ + "archive_paths": ["/app/data/"], + "runtime": {"consumer_apply": "sometimes"}, + }) + + self.assertIsNone(self.sm.claim_request()) + + err = self._read_invalid()["_error"]["error"] + self.assertIn("consumer_apply", err) + self.assertIn("sometimes", err) + + def test_online_provider_capture_allows_unmounted_path(self): + self.owner.cfg_sync_allow_online_provider_capture = True + self._write_request({ + "archive_paths": ["/tmp/generated.txt"], + "runtime": {"provider_capture": "online"}, + }) + + result = 
self.sm.claim_request() + + self.assertIsNotNone(result) + self.assertEqual(result.archive_paths, ["/tmp/generated.txt"]) + self.assertEqual(result.runtime.provider_capture, "online") + + def test_online_provider_capture_rejected_without_local_opt_in(self): + self._write_request({ + "archive_paths": ["/tmp/generated.txt"], + "runtime": {"provider_capture": "online"}, + }) + + self.assertIsNone(self.sm.claim_request()) + + err = self._read_invalid()["_error"]["error"] + self.assertIn("ALLOW_ONLINE_PROVIDER_CAPTURE", err) + + def test_malformed_json(self): + (self.vsd / "request.json").write_text("not-json{") + self.assertIsNone(self.sm.claim_request()) + invalid = self._read_invalid() + self.assertIsNotNone(invalid) + self.assertIsNone(invalid["request"]) + self.assertEqual(invalid["_error"]["stage"], "validation") + self.assertIn("malformed JSON", invalid["_error"]["error"]) + self.assertEqual(invalid["_error"]["raw_body"], "not-json{") + response = self._read_response() + self.assertEqual(response["status"], "error") + self.assertEqual(response["stage"], "validation") + self.assertFalse((self.vsd / "request.json.processing").exists()) + + def test_request_symlink_rejected_without_leaking_target_body(self): + secret = self.tmpdir / "host-secret.txt" + secret.write_text("not-json-secret-token") + os.symlink(str(secret), str(self.vsd / "request.json")) + + self.assertIsNone(self.sm.claim_request()) + + invalid = self._read_invalid() + self.assertIsNotNone(invalid) + self.assertIsNone(invalid["request"]) + self.assertEqual(invalid["_error"]["stage"], "validation") + self.assertIn("symlink control file", invalid["_error"]["error"]) + self.assertNotIn("raw_body", invalid["_error"]) + self.assertNotIn("not-json-secret-token", json.dumps(invalid)) + self.assertFalse((self.vsd / "request.json.processing").exists()) + + def test_not_an_object(self): + self._write_request(["just", "a", "list"]) + self.assertIsNone(self.sm.claim_request()) + self.assertEqual(self._read_invalid()["_error"]["error"], + "request.json must be a JSON object") + + def test_missing_archive_paths(self): + self._write_request({"metadata": {}}) + self.assertIsNone(self.sm.claim_request()) + self.assertIn("archive_paths must be a non-empty list", + self._read_invalid()["_error"]["error"]) + + def test_empty_archive_paths(self): + self._write_request({"archive_paths": []}) + self.assertIsNone(self.sm.claim_request()) + self.assertIn("archive_paths must be a non-empty list", + self._read_invalid()["_error"]["error"]) + + def test_metadata_must_be_object(self): + self._write_request({"archive_paths": ["/app/data/"], "metadata": "nope"}) + self.assertIsNone(self.sm.claim_request()) + self.assertIn("metadata must be a JSON object", + self._read_invalid()["_error"]["error"]) + + def test_path_traversal_rejected(self): + self._write_request({"archive_paths": ["/app/../../etc/passwd"]}) + self.assertIsNone(self.sm.claim_request()) + invalid = self._read_invalid() + self.assertEqual(invalid["_error"]["stage"], "validation") + self.assertIn("..", invalid["_error"]["error"]) + self.assertEqual(invalid["request"]["archive_paths"], ["/app/../../etc/passwd"]) + + def test_unmounted_path_rejected(self): + self._write_request({"archive_paths": ["/nope/"]}) + self.assertIsNone(self.sm.claim_request()) + self.assertIn("no mounted volume covers", + self._read_invalid()["_error"]["error"]) + + def test_anonymous_mount_rejected(self): + """The fixture's ``/app/legacy`` mount is bound at ``tmpdir/tmpfs_legacy`` + (outside both allow-listed roots), 
standing in for an anonymous Docker + mount or ephemeral fs. claim_request must surface a clear error so the + app sees ``request.json.invalid`` instead of a silent stall. + """ + self._write_request({"archive_paths": ["/app/legacy/x"]}) + self.assertIsNone(self.sm.claim_request()) + self.assertIn("non-volume-backed mount", + self._read_invalid()["_error"]["error"]) + + def test_system_volume_rejected(self): + self._write_request({"archive_paths": ["/r1en_system/x"]}) + self.assertIsNone(self.sm.claim_request()) + self.assertIn("anti-recursion", + self._read_invalid()["_error"]["error"]) + + def test_invalid_response_carries_archive_paths(self): + self._write_request({"archive_paths": ["/nope/"], "metadata": {"v": 1}}) + self.sm.claim_request() + response = self._read_response() + self.assertEqual(response["archive_paths"], ["/nope/"]) + + def test_failure_clears_processing(self): + self._write_request({"archive_paths": ["/nope/"]}) + self.sm.claim_request() + self.assertFalse((self.vsd / "request.json.processing").exists()) + + +# --------------------------------------------------------------------------- +# make_archive + extract_archive +# --------------------------------------------------------------------------- + +class TestArchiveRoundtrip(unittest.TestCase): + """Build a tar from a fake provider mount, extract it into a fake consumer + mount with the same container path layout, and confirm bytes round-trip.""" + + def setUp(self): + self._tmp = tempfile.TemporaryDirectory() + self.tmpdir = Path(self._tmp.name) + self.provider = _make_owner(self.tmpdir / "provider") + self.consumer = _make_owner(self.tmpdir / "consumer") + self.sm_p = SyncManager(self.provider) + self.sm_c = SyncManager(self.consumer) + # Seed provider's /app/data with content + self.appdata_p = self.provider._fixed_root / "appdata" + (self.appdata_p / "foo.bin").write_bytes(b"hello world\x00\xff") + (self.appdata_p / "subdir").mkdir() + (self.appdata_p / "subdir" / "nested.txt").write_text("nested!") + + def tearDown(self): + self._tmp.cleanup() + + def test_round_trip_directory(self): + tar_path, size = self.sm_p.make_archive(["/app/data/"]) + self.assertTrue(os.path.isfile(tar_path)) + self.assertGreater(size, 0) + + extracted = self.sm_c.extract_archive(tar_path) + self.assertTrue(any(e == "/app/data/" or e.startswith("/app/data/") for e in extracted)) + + appdata_c = self.consumer._fixed_root / "appdata" + self.assertEqual((appdata_c / "foo.bin").read_bytes(), b"hello world\x00\xff") + self.assertEqual((appdata_c / "subdir" / "nested.txt").read_text(), "nested!") + + def test_round_trip_file_only(self): + tar_path, _ = self.sm_p.make_archive(["/app/data/foo.bin"]) + self.sm_c.extract_archive(tar_path) + self.assertEqual( + (self.consumer._fixed_root / "appdata" / "foo.bin").read_bytes(), + b"hello world\x00\xff", + ) + + def test_make_archive_rejects_non_existent_host_path(self): + # Container path passes resolve_container_path but host file missing. 
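+    # (Concretely: /app/data/missing.bin maps onto the appdata mount, where
+    # setUp only seeded foo.bin and subdir/nested.txt.)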
+ with self.assertRaisesRegex(FileNotFoundError, "does not exist"): + self.sm_p.make_archive(["/app/data/missing.bin"]) + + def test_make_archive_propagates_validation(self): + with self.assertRaisesRegex(ValueError, "no mounted volume covers"): + self.sm_p.make_archive(["/nope/"]) + + def test_make_archive_rejects_symlink_path(self): + outside = self.tmpdir / "outside-provider" + outside.mkdir() + (outside / "secret.txt").write_text("secret") + (self.appdata_p / "escape").symlink_to(outside, target_is_directory=True) + + with self.assertRaisesRegex(ValueError, "symlink"): + self.sm_p.make_archive(["/app/data/escape/secret.txt"]) + + def test_make_archive_rejects_symlink_descendant(self): + outside = self.tmpdir / "outside-provider-desc" + outside.mkdir() + (outside / "secret.txt").write_text("secret") + (self.appdata_p / "subdir" / "escape").symlink_to( + outside, target_is_directory=True + ) + + with self.assertRaisesRegex(ValueError, "symlink"): + self.sm_p.make_archive(["/app/data/"]) + + # ---- legacy VOLUMES round-trip tests ------------------------------------ + # + # Rule 3 admits legacy VOLUMES in addition to FIXED_SIZE_VOLUMES; these + # tests prove the round-trip works regardless of which root backs each + # side's mount. The fake legacy root lives under tmpdir, and we + # monkeypatch ``manager_mod.CONTAINER_VOLUMES_PATH`` to point at it for + # the duration of each test so Rule 3 accepts the synthetic location. + # + # The cross-type cases (legacy ↔ fixed-size) confirm the soft-migration + # path: a snapshot can flow from a legacy provider into a fixed-size + # consumer (and vice versa) because resolve_container_path keys off the + # container path, not the host layout. + + def _patch_legacy_root(self): + """Return a legacy root path under tmpdir and patch CONTAINER_VOLUMES_PATH + to match. Caller must call self._unpatch_legacy_root() to restore.""" + import extensions.business.container_apps.sync.manager as manager_mod + legacy = self.tmpdir / "edge_node" / "_local_cache" / "_data" / "container_volumes" + legacy.mkdir(parents=True, exist_ok=True) + self._manager_mod = manager_mod + self._legacy_orig = manager_mod.CONTAINER_VOLUMES_PATH + manager_mod.CONTAINER_VOLUMES_PATH = str(legacy) + return legacy + + def _unpatch_legacy_root(self): + self._manager_mod.CONTAINER_VOLUMES_PATH = self._legacy_orig + + def test_round_trip_legacy_volumes_only(self): + """Provider + consumer both use legacy VOLUMES at the same container + path. Snapshots round-trip byte-for-byte across the legacy root.""" + legacy = self._patch_legacy_root() + try: + prov_host = legacy / "provider_inst_appdata" + cons_host = legacy / "consumer_inst_appdata" + prov_host.mkdir() + cons_host.mkdir() + (prov_host / "weights.bin").write_bytes(b"legacy-only-payload") + (prov_host / "sub").mkdir() + (prov_host / "sub" / "n.txt").write_text("nested") + self.provider.volumes = {str(prov_host): {"bind": "/app/data", "mode": "rw"}} + self.consumer.volumes = {str(cons_host): {"bind": "/app/data", "mode": "rw"}} + + tar_path, _ = self.sm_p.make_archive(["/app/data/"]) + self.sm_c.extract_archive(tar_path) + + self.assertEqual((cons_host / "weights.bin").read_bytes(), b"legacy-only-payload") + self.assertEqual((cons_host / "sub" / "n.txt").read_text(), "nested") + finally: + self._unpatch_legacy_root() + + def test_round_trip_legacy_to_fixed_size(self): + """Provider legacy, consumer fixed-size at the same container path. 
+ Proves the soft-migration scenario: a new fixed-size node can absorb + state from a legacy node without rebuilding the data on the operator + side. Container path is the routing key — host layout differences + are invisible to the archive.""" + legacy = self._patch_legacy_root() + try: + prov_host = legacy / "provider_inst_appdata" + prov_host.mkdir() + (prov_host / "weights.bin").write_bytes(b"legacy-to-fixed") + self.provider.volumes = {str(prov_host): {"bind": "/app/data", "mode": "rw"}} + # Consumer keeps its default fixed-size mount at /app/data + # (set up by _make_owner — host root under fixed_volumes/mounts/). + + tar_path, _ = self.sm_p.make_archive(["/app/data/"]) + self.sm_c.extract_archive(tar_path) + + cons_host = self.consumer._fixed_root / "appdata" + self.assertEqual((cons_host / "weights.bin").read_bytes(), b"legacy-to-fixed") + finally: + self._unpatch_legacy_root() + + def test_round_trip_fixed_size_to_legacy(self): + """Symmetric of the above: provider fixed-size, consumer legacy. Same + archive, opposite host-layout pairing. Result must be identical — + container path drives the routing on both ends.""" + legacy = self._patch_legacy_root() + try: + cons_host = legacy / "consumer_inst_appdata" + cons_host.mkdir() + # Provider's default fixed-size mount at /app/data is already seeded + # by setUp (foo.bin = b"hello world\x00\xff"). + self.consumer.volumes = {str(cons_host): {"bind": "/app/data", "mode": "rw"}} + + tar_path, _ = self.sm_p.make_archive(["/app/data/"]) + self.sm_c.extract_archive(tar_path) + + self.assertEqual((cons_host / "foo.bin").read_bytes(), b"hello world\x00\xff") + self.assertEqual((cons_host / "subdir" / "nested.txt").read_text(), "nested!") + finally: + self._unpatch_legacy_root() + + def test_extract_aborts_on_member_with_no_consumer_mount(self): + # Build a bespoke tar with a member at /app/missing/ that consumer + # doesn't have a mount for. 
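+    # (The consumer fixture from _make_owner only binds the system volume,
+    # /app/data and /app/legacy, so /app/missing/ has nothing to resolve against.)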
+ import tarfile as _tarfile + bad_tar = self.tmpdir / "bad.tar.gz" + src = self.tmpdir / "src" + src.mkdir() + (src / "x.bin").write_text("x") + with _tarfile.open(str(bad_tar), "w:gz") as tar: + tar.add(str(src / "x.bin"), arcname="/app/missing/x.bin") + + with self.assertRaisesRegex(ValueError, "no mounted volume covers"): + self.sm_c.extract_archive(str(bad_tar)) + # No file was created + self.assertFalse((self.consumer._fixed_root / "appdata" / "x.bin").exists()) + + def test_extract_rejects_member_outside_manifest_archive_paths(self): + bad_tar = self.tmpdir / "outside-manifest.tar.gz" + src = self.tmpdir / "outside-manifest.txt" + src.write_text("outside") + with tarfile.open(str(bad_tar), "w:gz") as tar: + tar.add(str(src), arcname="/app/data/other.txt") + + with self.assertRaisesRegex(ValueError, "outside manifest archive_paths"): + self.sm_c.extract_archive( + str(bad_tar), allowed_archive_paths=["/app/data/declared/"] + ) + + self.assertFalse((self.consumer._fixed_root / "appdata" / "other.txt").exists()) + + def test_extract_skips_symlink_members(self): + import tarfile as _tarfile + sym_tar = self.tmpdir / "sym.tar.gz" + src = self.tmpdir / "sym_src" + src.mkdir() + (src / "real.txt").write_text("real") + link_path = src / "link" + os.symlink("real.txt", str(link_path)) + with _tarfile.open(str(sym_tar), "w:gz") as tar: + tar.add(str(src / "real.txt"), arcname="/app/data/real.txt") + info = tar.gettarinfo(str(link_path), arcname="/app/data/link") + tar.addfile(info) + self.sm_c.extract_archive(str(sym_tar)) + self.assertEqual( + (self.consumer._fixed_root / "appdata" / "real.txt").read_text(), "real" + ) + self.assertFalse((self.consumer._fixed_root / "appdata" / "link").exists()) + + def test_extract_rejects_member_through_symlink_directory(self): + outside = self.tmpdir / "outside" + outside.mkdir() + symlink_dir = self.consumer._fixed_root / "appdata" / "escape" + symlink_dir.symlink_to(outside, target_is_directory=True) + + bad_tar = self.tmpdir / "symlink-dir-escape.tar.gz" + src = self.tmpdir / "escape-src.txt" + src.write_text("escaped") + with tarfile.open(str(bad_tar), "w:gz") as tar: + tar.add(str(src), arcname="/app/data/escape/pwn.txt") + + with self.assertRaisesRegex(ValueError, "escapes volume root"): + self.sm_c.extract_archive(str(bad_tar)) + + self.assertFalse((outside / "pwn.txt").exists()) + + def test_extract_rejects_member_over_symlink_file(self): + outside = self.tmpdir / "outside-file.txt" + outside.write_text("outside") + symlink_file = self.consumer._fixed_root / "appdata" / "link.txt" + symlink_file.symlink_to(outside) + + bad_tar = self.tmpdir / "symlink-file-escape.tar.gz" + src = self.tmpdir / "replacement.txt" + src.write_text("replacement") + with tarfile.open(str(bad_tar), "w:gz") as tar: + tar.add(str(src), arcname="/app/data/link.txt") + + with self.assertRaisesRegex(ValueError, "escapes volume root"): + self.sm_c.extract_archive(str(bad_tar)) + + self.assertTrue(symlink_file.is_symlink()) + self.assertEqual(outside.read_text(), "outside") + + def test_extract_strips_special_mode_bits(self): + mode_tar = self.tmpdir / "special-modes.tar.gz" + with tarfile.open(str(mode_tar), "w:gz") as tar: + dir_info = tarfile.TarInfo(name="/app/data/special") + dir_info.type = tarfile.DIRTYPE + dir_info.mode = 0o7777 + tar.addfile(dir_info) + + content = b"payload" + file_info = tarfile.TarInfo(name="/app/data/special/run.sh") + file_info.size = len(content) + file_info.mode = 0o6755 + tar.addfile(file_info, io.BytesIO(content)) + + 
self.sm_c.extract_archive(str(mode_tar)) + + target_dir = self.consumer._fixed_root / "appdata" / "special" + target_file = target_dir / "run.sh" + self.assertEqual(target_file.read_bytes(), b"payload") + self.assertEqual(os.stat(target_dir).st_mode & 0o7000, 0) + self.assertEqual(os.stat(target_file).st_mode & 0o7000, 0) + + def test_extract_chowns_restored_entries_to_volume_owner(self): + owner_tar = self.tmpdir / "owner.tar.gz" + with tarfile.open(str(owner_tar), "w:gz") as tar: + dir_info = tarfile.TarInfo(name="/app/data/owned") + dir_info.type = tarfile.DIRTYPE + dir_info.mode = 0o755 + tar.addfile(dir_info) + + content = b"payload" + file_info = tarfile.TarInfo(name="/app/data/owned/file.txt") + file_info.size = len(content) + file_info.mode = 0o644 + tar.addfile(file_info, io.BytesIO(content)) + + calls = [] + + def _fake_chown(path, uid, gid): + calls.append((os.path.basename(path), uid, gid)) + + with patch.object(self.sm_c, "_volume_owner", return_value=(1234, 2345)), patch( + "extensions.business.container_apps.sync.manager.os.chown", + side_effect=_fake_chown, + ): + self.sm_c.extract_archive(str(owner_tar)) + + self.assertIn(("owned", 1234, 2345), calls) + self.assertTrue(any(call[1:] == (1234, 2345) for call in calls)) + self.assertEqual( + (self.consumer._fixed_root / "appdata" / "owned" / "file.txt").read_bytes(), + b"payload", + ) + + +# --------------------------------------------------------------------------- +# publish_snapshot +# --------------------------------------------------------------------------- + +class TestPublishSnapshot(unittest.TestCase): + def setUp(self): + self._tmp = tempfile.TemporaryDirectory() + self.tmpdir = Path(self._tmp.name) + self.owner = _make_owner(self.tmpdir) + self.sm = SyncManager(self.owner) + self.vsd = volume_sync_dir(self.owner) + self.vsd.mkdir(parents=True, exist_ok=True) + # Seed the data volume so make_archive can find content + appdata = self.owner._fixed_root / "appdata" + (appdata / "weights.bin").write_bytes(b"weights-content") + # Simulate having claimed a request — leave a .processing file so + # publish_snapshot's clean-up paths can be exercised. 
+ (self.vsd / SYNC_PROCESSING_FILE).write_text( + json.dumps({"archive_paths": ["/app/data/"], "metadata": {}}) + ) + + def tearDown(self): + self._tmp.cleanup() + + def test_happy_path_writes_response_history_and_chainstore(self): + ok = self.sm.publish_snapshot(["/app/data/"], {"epoch": 1}) + self.assertTrue(ok) + + # Response.json + resp = json.loads((self.vsd / "response.json").read_text()) + self.assertEqual(resp["status"], "ok") + self.assertTrue(resp["cid"].startswith("QmFAKE")) + self.assertGreater(resp["archive_size_bytes"], 0) + self.assertTrue(resp["chainstore_ack"]) + + # ChainStore record + self.assertEqual(len(self.owner._cs.hset_calls), 1) + hkey, key, value = self.owner._cs.hset_calls[0] + self.assertEqual(hkey, "CHAINSTORE_SYNC") + self.assertEqual(key, "11111111-1111-1111-1111-111111111111") + self.assertEqual(value["cid"], resp["cid"]) + self.assertEqual(value["manifest"]["archive_paths"], ["/app/data/"]) + self.assertEqual(value["manifest"]["schema_version"], 1) + self.assertEqual(value["manifest"]["archive_format"], "tar.gz") + self.assertEqual(value["manifest"]["runtime"]["provider_capture"], "offline") + self.assertEqual(value["manifest"]["runtime"]["consumer_apply"], "offline_restart") + self.assertEqual(value["metadata"], {"epoch": 1}) + + # History + sent_dir = history_sent_dir(self.owner) + files = list(sent_dir.glob("*.json")) + self.assertEqual(len(files), 1) + entry = json.loads(files[0].read_text()) + self.assertEqual(entry["cid"], resp["cid"]) + self.assertEqual(entry["chainstore_ack"], True) + self.assertEqual(entry["request"]["archive_paths"], ["/app/data/"]) + self.assertIsNone(entry["deletion"]["deleted_at"]) + + # .processing cleaned up + self.assertFalse((self.vsd / "request.json.processing").exists()) + # No .invalid because success + self.assertFalse((self.vsd / "request.json.invalid").exists()) + + def test_online_provider_capture_uses_docker_archive_for_unmounted_path(self): + self.owner.cfg_sync_allow_online_provider_capture = True + self.owner.container = _FakeDockerArchiveContainer({ + "/tmp/generated.txt": _tar_bytes("generated.txt", b"from-container"), + }) + request = SyncRequest( + archive_paths=["/tmp/generated.txt"], + metadata={"epoch": 2}, + runtime=SyncRuntimePolicy(provider_capture=PROVIDER_CAPTURE_ONLINE), + ) + + ok = self.sm.publish_snapshot(request) + + self.assertTrue(ok) + self.assertEqual(self.owner.container.get_archive_calls, ["/tmp/generated.txt"]) + record = self.owner._cs.hset_calls[0][2] + self.assertEqual(record["manifest"]["archive_paths"], ["/tmp/generated.txt"]) + self.assertEqual(record["manifest"]["runtime"]["provider_capture"], "online") + + stored_tar = self.owner._r1fs.added[record["cid"]] + tar_path = self.tmpdir / "online.tar.gz" + tar_path.write_bytes(stored_tar) + with tarfile.open(tar_path, "r:gz") as tar: + member = tar.getmember("tmp/generated.txt") + self.assertEqual(tar.extractfile(member).read(), b"from-container") + + def test_clears_existing_invalid_on_success(self): + (self.vsd / "request.json.invalid").write_text('{"old": true}') + self.sm.publish_snapshot(["/app/data/"], {}) + self.assertFalse((self.vsd / "request.json.invalid").exists()) + + def test_archive_build_failure(self): + self.owner._fixed_root.joinpath("appdata", "weights.bin").unlink() + ok = self.sm.publish_snapshot(["/app/data/missing.bin"], {}) + self.assertFalse(ok) + invalid = json.loads((self.vsd / "request.json.invalid").read_text()) + self.assertEqual(invalid["_error"]["stage"], "archive_build") + resp = json.loads((self.vsd / 
"response.json").read_text()) + self.assertEqual(resp["status"], "error") + self.assertEqual(resp["stage"], "archive_build") + # No history entry written + self.assertEqual(len(list(history_sent_dir(self.owner).glob("*.json"))), 0) + + def test_r1fs_upload_failure(self): + self.owner._r1fs.add_should_raise = RuntimeError("ipfs offline") + ok = self.sm.publish_snapshot(["/app/data/"], {}) + self.assertFalse(ok) + invalid = json.loads((self.vsd / "request.json.invalid").read_text()) + self.assertEqual(invalid["_error"]["stage"], "r1fs_upload") + self.assertIn("ipfs offline", invalid["_error"]["error"]) + self.assertEqual(self.owner._cs.hset_calls, []) + + def test_chainstore_publish_failure(self): + self.owner._cs.hset_should_raise = RuntimeError("peers unreachable") + ok = self.sm.publish_snapshot(["/app/data/"], {}) + self.assertFalse(ok) + invalid = json.loads((self.vsd / "request.json.invalid").read_text()) + self.assertEqual(invalid["_error"]["stage"], "chainstore_publish") + # No history because we failed before append + self.assertEqual(len(list(history_sent_dir(self.owner).glob("*.json"))), 0) + # CID landed in r1fs but was cleaned up before returning failure. + self.assertEqual(len(self.owner._r1fs.added), 0) + self.assertEqual(len(self.owner._r1fs.deleted), 1) + + def test_chainstore_no_ack_fails_and_cleans_uploaded_cid(self): + self.owner._cs.hset_returns = False + + ok = self.sm.publish_snapshot(["/app/data/"], {}) + + self.assertFalse(ok) + invalid = json.loads((self.vsd / "request.json.invalid").read_text()) + self.assertEqual(invalid["_error"]["stage"], "chainstore_publish") + self.assertIn("ack", invalid["_error"]["error"]) + self.assertEqual(len(list(history_sent_dir(self.owner).glob("*.json"))), 0) + self.assertEqual(self.owner._r1fs.added, {}) + self.assertEqual(len(self.owner._r1fs.deleted), 1) + + def test_sent_history_failure_after_chainstore_ack_still_completes_request(self): + with patch.object(self.sm, "append_sent", side_effect=RuntimeError("disk full")): + ok = self.sm.publish_snapshot(["/app/data/"], {"epoch": 7}) + + self.assertTrue(ok) + self.assertEqual(len(self.owner._cs.hset_calls), 1) + resp = json.loads((self.vsd / "response.json").read_text()) + self.assertEqual(resp["status"], "ok") + self.assertIn("disk full", resp["history_error"]) + self.assertFalse((self.vsd / "request.json.processing").exists()) + self.assertFalse((self.vsd / "request.json.invalid").exists()) + + def test_sent_history_failure_skips_prior_cid_retirement(self): + self.sm.publish_snapshot(["/app/data/"], {"epoch": 1}) + first = json.loads(next(history_sent_dir(self.owner).glob("*.json")).read_text()) + + (self.owner._fixed_root / "appdata" / "weights.bin").write_bytes(b"v2") + (self.vsd / SYNC_PROCESSING_FILE).write_text("{}") + with patch.object(self.sm, "append_sent", side_effect=RuntimeError("disk full")): + ok = self.sm.publish_snapshot(["/app/data/"], {"epoch": 2}) + + self.assertTrue(ok) + files = list(history_sent_dir(self.owner).glob("*.json")) + self.assertEqual(len(files), 1) + still_first = json.loads(files[0].read_text()) + self.assertEqual(still_first["cid"], first["cid"]) + self.assertIsNone(still_first["deletion"]["deleted_at"]) + self.assertFalse(self.owner._r1fs.deleted) + + def test_two_snapshots_retire_first_cid(self): + self.sm.publish_snapshot(["/app/data/"], {"epoch": 1}) + # Update content for the second snapshot + (self.owner._fixed_root / "appdata" / "weights.bin").write_bytes(b"v2") + # Re-create .processing because publish_snapshot deleted it + (self.vsd / 
SYNC_PROCESSING_FILE).write_text("{}")
+    self.sm.publish_snapshot(["/app/data/"], {"epoch": 2})
+
+    files = sorted(history_sent_dir(self.owner).glob("*.json"))
+    self.assertEqual(len(files), 2)
+    older = json.loads(files[0].read_text())
+    newer = json.loads(files[1].read_text())
+
+    self.assertTrue(older["deletion"]["deletion_succeeded"])
+    self.assertIsNotNone(older["deletion"]["deleted_at"])
+    self.assertIsNone(older["deletion"]["deletion_error"])
+
+    self.assertIsNone(newer["deletion"]["deleted_at"])
+
+    deleted_cids = [d[0] for d in self.owner._r1fs.deleted]
+    self.assertEqual(deleted_cids, [older["cid"]])
+
+  def test_retire_records_failure(self):
+    self.sm.publish_snapshot(["/app/data/"], {"epoch": 1})
+    (self.owner._fixed_root / "appdata" / "weights.bin").write_bytes(b"v2")
+    (self.vsd / SYNC_PROCESSING_FILE).write_text("{}")
+    self.owner._r1fs.delete_should_raise = RuntimeError("daemon paused")
+
+    self.sm.publish_snapshot(["/app/data/"], {"epoch": 2})
+
+    files = sorted(history_sent_dir(self.owner).glob("*.json"))
+    older = json.loads(files[0].read_text())
+    self.assertIsNone(older["deletion"]["deleted_at"])
+    self.assertFalse(older["deletion"]["deletion_succeeded"])
+    self.assertIn("daemon paused", older["deletion"]["deletion_error"])
+
+  def test_retire_retries_after_failure(self):
+    self.sm.publish_snapshot(["/app/data/"], {"epoch": 1})
+    (self.owner._fixed_root / "appdata" / "weights.bin").write_bytes(b"v2")
+    (self.vsd / SYNC_PROCESSING_FILE).write_text("{}")
+    self.owner._r1fs.delete_should_raise = RuntimeError("daemon paused")
+
+    self.sm.publish_snapshot(["/app/data/"], {"epoch": 2})
+    self.owner._r1fs.delete_should_raise = None
+    self.sm._retire_previous_cid(history_sent_dir(self.owner))
+
+    files = sorted(history_sent_dir(self.owner).glob("*.json"))
+    older = json.loads(files[0].read_text())
+    self.assertIsNotNone(older["deletion"]["deleted_at"])
+    self.assertTrue(older["deletion"]["deletion_succeeded"])
+
+  def test_retire_uses_mtime_not_version(self):
+    """A higher-version entry that was written BEFORE a lower-version entry
+    must be retired when the lower-version one is "latest". Mirrors the
+    contract from ``_latest_in``: the answer to "what did we just do?" is
+    insert-order (mtime), not whatever ``version`` happens to be in the
+    entry. Without this guarantee a clock-skewed provider or multi-provider
+    sync set can cause the just-published CID to be retired on the next
+    publish.
+    """
+    sent_dir = history_sent_dir(self.owner)
+    sent_dir.mkdir(parents=True, exist_ok=True)
+
+    # Older-by-mtime but higher version (would sort last by filename).
+    # Use the canonical filename helper so update_history_deletion can find
+    # the file via its __.json convention.
+ older_path = sent_dir / self.sm._history_filename(100, "QmCID_A") + older_path.write_text(json.dumps({ + "cid": "QmCID_A", "version": 100, + "deletion": {"deleted_at": None, "deletion_succeeded": None, "deletion_error": None}, + })) + os.utime(older_path, (1000, 1000)) + + # Newer-by-mtime but lower version (would sort first by filename) + newer_path = sent_dir / self.sm._history_filename(50, "QmCID_B") + newer_path.write_text(json.dumps({ + "cid": "QmCID_B", "version": 50, + "deletion": {"deleted_at": None, "deletion_succeeded": None, "deletion_error": None}, + })) + os.utime(newer_path, (2000, 2000)) + + self.sm._retire_previous_cid(sent_dir) + + older_after = json.loads(older_path.read_text()) + newer_after = json.loads(newer_path.read_text()) + + # The just-written (newer-by-mtime) entry must be left alone. + self.assertIsNone(newer_after["deletion"]["deleted_at"]) + # The older-by-mtime entry should be retired, even though it has the + # higher version number. + self.assertIsNotNone(older_after["deletion"]["deleted_at"]) + self.assertTrue(older_after["deletion"]["deletion_succeeded"]) + + deleted_cids = [d[0] for d in self.owner._r1fs.deleted] + self.assertEqual(deleted_cids, ["QmCID_A"]) + + def test_archive_tmp_cleaned_up_on_success(self): + self.sm.publish_snapshot(["/app/data/"], {}) + leftovers = list(self.owner._output_folder.glob("sync_archive_*.tar.gz")) + self.assertEqual(leftovers, []) + + def test_archive_tmp_cleaned_up_on_failure(self): + self.owner._cs.hset_should_raise = RuntimeError("boom") + self.sm.publish_snapshot(["/app/data/"], {}) + leftovers = list(self.owner._output_folder.glob("sync_archive_*.tar.gz")) + self.assertEqual(leftovers, []) + + +# --------------------------------------------------------------------------- +# fetch_latest + validate_manifest + apply_snapshot +# --------------------------------------------------------------------------- + +class TestConsumerFlow(unittest.TestCase): + def setUp(self): + self._tmp = tempfile.TemporaryDirectory() + self.tmpdir = Path(self._tmp.name) + # Build provider AND consumer owners that share an r1fs+chainstore so + # we can do a true end-to-end publish→apply round-trip. + shared_r1fs = _FakeR1FS() + shared_cs = _FakeChainStore() + + self.provider = _make_owner(self.tmpdir / "p") + self.consumer = _make_owner(self.tmpdir / "c") + for o in (self.provider, self.consumer): + o.r1fs = shared_r1fs + o._r1fs = shared_r1fs + o.chainstore_hset = shared_cs.hset + o.chainstore_hget = shared_cs.hget + o.chainstore_hsync = shared_cs.hsync + o._cs = shared_cs + self.consumer.cfg_sync_type = "consumer" + + self.sm_p = SyncManager(self.provider) + self.sm_c = SyncManager(self.consumer) + + # Provision provider's volume-sync subdir + seed data + volume_sync_dir(self.provider).mkdir(parents=True, exist_ok=True) + volume_sync_dir(self.consumer).mkdir(parents=True, exist_ok=True) + (self.provider._fixed_root / "appdata" / "weights.bin").write_bytes(b"hello") + + def tearDown(self): + self._tmp.cleanup() + + # ----- validate_manifest -------------------------------------------------- + + def _ok_manifest(self, **overrides): + """Return a minimally-valid manifest dict. 
Tests override fields they + care about; the rest stay sane defaults so we don't have to copy the + schema boilerplate everywhere.""" + manifest = { + "schema_version": 1, + "archive_format": "tar.gz", + "encryption": "r1fs-default", + "archive_paths": ["/app/data/"], + } + manifest.update(overrides) + return {"manifest": manifest} + + def test_validate_manifest_empty_when_aligned(self): + self.assertEqual(self.sm_c.validate_manifest(self._ok_manifest()), []) + + def test_validate_manifest_returns_missing_paths(self): + record = self._ok_manifest(archive_paths=["/app/data/", "/somewhere/else/"]) + reasons = self.sm_c.validate_manifest(record) + self.assertEqual(len(reasons), 1) + self.assertIn("/somewhere/else/", reasons[0]) + self.assertIn("unmapped archive_paths", reasons[0]) + + def test_validate_manifest_rejects_unsupported_schema_version(self): + """A manifest from a future CAR that bumped MANIFEST_SCHEMA_VERSION must + be refused rather than silently applied — schema bumps signal breaking + format changes the current consumer can't safely interpret. Codex + review finding 4 on PR #399.""" + record = self._ok_manifest(schema_version=999) + reasons = self.sm_c.validate_manifest(record) + self.assertEqual(len(reasons), 1) + self.assertIn("schema_version", reasons[0]) + self.assertIn("999", reasons[0]) + + def test_validate_manifest_rejects_missing_schema_version(self): + record = {"manifest": {"archive_format": "tar.gz", "archive_paths": ["/app/data/"]}} + reasons = self.sm_c.validate_manifest(record) + self.assertTrue(any("schema_version" in r for r in reasons)) + + def test_validate_manifest_rejects_non_int_schema_version(self): + record = self._ok_manifest(schema_version="1") + reasons = self.sm_c.validate_manifest(record) + self.assertTrue(any("schema_version" in r for r in reasons)) + + def test_validate_manifest_rejects_unsupported_archive_format(self): + record = self._ok_manifest(archive_format="zip") + reasons = self.sm_c.validate_manifest(record) + self.assertEqual(len(reasons), 1) + self.assertIn("archive_format", reasons[0]) + self.assertIn("zip", reasons[0]) + self.assertIn("tar.gz", reasons[0]) + + def test_validate_manifest_rejects_unsupported_encryption(self): + record = self._ok_manifest(encryption="plaintext") + reasons = self.sm_c.validate_manifest(record) + self.assertEqual(len(reasons), 1) + self.assertIn("encryption", reasons[0]) + self.assertIn("plaintext", reasons[0]) + self.assertIn("r1fs-default", reasons[0]) + + def test_validate_manifest_collects_multiple_violations(self): + """Schema + format + path violations all surface in one pass so the + operator sees the full picture in a single log line.""" + record = self._ok_manifest( + schema_version=999, archive_format="zip", + archive_paths=["/app/data/", "/nope/"], + ) + reasons = self.sm_c.validate_manifest(record) + self.assertEqual(len(reasons), 3) + joined = "; ".join(reasons) + self.assertIn("schema_version", joined) + self.assertIn("archive_format", joined) + self.assertIn("/nope/", joined) + + def test_validate_manifest_handles_no_manifest(self): + # An empty record / empty manifest is non-conformant (missing required + # schema_version + archive_format), so it must be rejected. 
+ self.assertNotEqual(self.sm_c.validate_manifest({}), []) + self.assertNotEqual(self.sm_c.validate_manifest({"manifest": {}}), []) + + def test_validate_manifest_rejects_non_dict_manifest(self): + reasons = self.sm_c.validate_manifest({"manifest": "not-an-object"}) + + self.assertEqual(reasons, ["manifest must be a JSON object"]) + + def test_validate_manifest_rejects_missing_archive_paths(self): + record = self._ok_manifest() + del record["manifest"]["archive_paths"] + reasons = self.sm_c.validate_manifest(record) + self.assertTrue(any("archive_paths" in r for r in reasons)) + + def test_validate_manifest_rejects_empty_archive_paths(self): + reasons = self.sm_c.validate_manifest(self._ok_manifest(archive_paths=[])) + self.assertTrue(any("non-empty list" in r for r in reasons)) + + def test_validate_manifest_rejects_non_list_archive_paths(self): + reasons = self.sm_c.validate_manifest(self._ok_manifest(archive_paths="/app/data/")) + self.assertTrue(any("non-empty list" in r for r in reasons)) + + def test_validate_manifest_rejects_non_string_archive_path_entries(self): + reasons = self.sm_c.validate_manifest(self._ok_manifest(archive_paths=["/app/data/", 7])) + self.assertTrue(any("invalid archive_paths" in r for r in reasons)) + + def test_validate_manifest_rejects_non_dict(self): + self.assertEqual(self.sm_c.validate_manifest(None), ["manifest record is not a dict"]) + self.assertEqual(self.sm_c.validate_manifest("string"), ["manifest record is not a dict"]) + + def test_validate_record_rejects_missing_envelope_fields(self): + reasons = self.sm_c.validate_record_for_apply({ + "cid": "", + "version": "1", + "manifest": self._ok_manifest()["manifest"], + }) + + joined = "; ".join(reasons) + self.assertIn("cid", joined) + self.assertIn("version", joined) + + # ----- fetch_latest ------------------------------------------------------- + + def test_fetch_latest_empty_returns_none(self): + self.assertIsNone(self.sm_c.fetch_latest()) + # hsync was still called + self.assertEqual(self.consumer._cs.hsync_calls, ["CHAINSTORE_SYNC"]) + + def test_fetch_latest_after_publish_returns_record(self): + (self.provider.__dict__["_fixed_root"] / "appdata" / "weights.bin").write_bytes(b"x") + (volume_sync_dir(self.provider) / SYNC_PROCESSING_FILE).write_text("{}") + self.sm_p.publish_snapshot(["/app/data/"], {"epoch": 5}) + record = self.sm_c.fetch_latest() + self.assertIsNotNone(record) + self.assertEqual(record["metadata"], {"epoch": 5}) + + def test_fetch_latest_no_sync_key_returns_none(self): + self.consumer.cfg_sync_key = None + self.assertIsNone(self.sm_c.fetch_latest()) + + def test_hsync_gated_by_interval_skips_within_window(self): + """The expensive chainstore_hsync is rate-limited; a second fetch_latest + inside the configured HSYNC_POLL_INTERVAL window only does the cheap + local hget, leaving hsync_calls at one entry.""" + self.consumer.cfg_sync_hsync_poll_interval = 60.0 + self.sm_c.fetch_latest() + self.sm_c.fetch_latest() # ~1s later (mock clock increments per time() call) + self.assertEqual(self.consumer._cs.hsync_calls, ["CHAINSTORE_SYNC"]) + + def test_hsync_fires_again_after_interval_elapses(self): + """Once HSYNC_POLL_INTERVAL has elapsed since the last hsync, the next + fetch_latest does a fresh network round-trip.""" + self.consumer.cfg_sync_hsync_poll_interval = 60.0 + self.sm_c.fetch_latest() + # Back-date the last-hsync stamp so the next call falls outside the + # window without having to actually wait 60s. 
+ self.sm_c._last_hsync = self.sm_c._last_hsync - 70.0 + self.sm_c.fetch_latest() + self.assertEqual(self.consumer._cs.hsync_calls, ["CHAINSTORE_SYNC", "CHAINSTORE_SYNC"]) + + def test_hsync_failure_retries_before_full_success_interval(self): + """A timing-out / failing hsync should not suppress retries for the full + success interval. It still avoids retrying on the immediate next tick, but + becomes eligible again after the shorter failure retry window.""" + self.consumer.cfg_sync_hsync_poll_interval = 60.0 + self.consumer._cs.hsync_should_raise = RuntimeError("offline") + self.sm_c.fetch_latest() # hsync raises (caught), retry after 30s + self.sm_c.fetch_latest() # immediate next tick -> still skipped + self.assertEqual(self.consumer._cs.hsync_calls, ["CHAINSTORE_SYNC"]) + self.sm_c._last_hsync = self.sm_c._last_hsync - 31.0 + self.sm_c.fetch_latest() + self.assertEqual(self.consumer._cs.hsync_calls, ["CHAINSTORE_SYNC", "CHAINSTORE_SYNC"]) + + # ----- apply_snapshot ----------------------------------------------------- + + def test_apply_round_trip(self): + (volume_sync_dir(self.provider) / SYNC_PROCESSING_FILE).write_text("{}") + self.sm_p.publish_snapshot(["/app/data/"], {"epoch": 9}) + + record = self.sm_c.fetch_latest() + ok = self.sm_c.apply_snapshot(record) + self.assertTrue(ok) + + # File extracted + target = self.consumer._fixed_root / "appdata" / "weights.bin" + self.assertEqual(target.read_bytes(), b"hello") + + # last_apply.json written + la = json.loads((volume_sync_dir(self.consumer) / "last_apply.json").read_text()) + self.assertEqual(la["cid"], record["cid"]) + self.assertEqual(la["version"], record["version"]) + self.assertIn("applied_timestamp", la) + + # Host-private apply state is the durable dedupe source. + state = json.loads((sync_state_dir(self.consumer) / "current_apply.json").read_text()) + self.assertEqual(state["state"], "applied") + self.assertEqual(state["cid"], record["cid"]) + + # History entry + files = list(history_received_dir(self.consumer).glob("*.json")) + self.assertEqual(len(files), 1) + entry = json.loads(files[0].read_text()) + self.assertEqual(entry["cid"], record["cid"]) + # tarfile strips trailing slashes on directory members; the consumer + # re-prepends the leading slash on extract, so directory entries land + # without their trailing slash. + self.assertEqual(entry["extracted_paths"], ["/app/data", "/app/data/weights.bin"]) + self.assertIsNone(entry["deletion"]["deleted_at"]) + + def test_apply_skips_when_misaligned(self): + # Provider includes a path consumer doesn't have a mount for. + # We can't legitimately publish such a record (provider would also reject + # it), so build it manually and stuff into chainstore. + self.consumer._cs.store[("CHAINSTORE_SYNC", self.consumer.cfg_sync_key)] = { + "cid": "QmFAKE99999999", + "version": 9999999999, + "timestamp": 1234.0, + "node_id": "ee_someone", + "metadata": {}, + "manifest": { + "schema_version": 1, + "archive_paths": ["/app/data/", "/foo/bar/"], + "archive_format": "tar.gz", + "encryption": "r1fs-default", + "archive_size_bytes": 100, + }, + } + record = self.sm_c.fetch_latest() + ok = self.sm_c.apply_snapshot(record) + self.assertFalse(ok) + # No last_apply, no history advance + self.assertFalse((volume_sync_dir(self.consumer) / "last_apply.json").exists()) + self.assertEqual(len(list(history_received_dir(self.consumer).glob("*.json"))), 0) + # Useful error message — should name the path that couldn't be mapped. 
+ self.assertTrue(any("unmapped archive_paths" in m for m in self.consumer._msgs)) + self.assertTrue(any("/foo/bar/" in m for m in self.consumer._msgs)) + + def test_apply_rejects_non_dict_manifest_without_raising(self): + record = { + "cid": "QmFAKE_BAD_MANIFEST", + "version": 123, + "timestamp": 1234.0, + "node_id": "ee_someone", + "metadata": {}, + "manifest": "not-an-object", + } + + ok = self.sm_c.apply_snapshot(record) + + self.assertFalse(ok) + self.assertFalse((volume_sync_dir(self.consumer) / "last_apply.json").exists()) + self.assertTrue(any("manifest must be a JSON object" in m for m in self.consumer._msgs)) + + def test_apply_rejects_tar_member_outside_manifest_archive_paths(self): + cid = "QmOUTSIDE_MANIFEST" + bad_tar = self.tmpdir / "outside-manifest-apply.tar.gz" + src = self.tmpdir / "outside-apply.txt" + src.write_text("outside") + with tarfile.open(str(bad_tar), "w:gz") as tar: + tar.add(str(src), arcname="/app/data/other.txt") + self.consumer._r1fs.added[cid] = bad_tar.read_bytes() + + record = { + "cid": cid, + "version": 123, + "timestamp": 1.0, + "node_id": "ee_provider", + "metadata": {}, + "manifest": { + "schema_version": 1, + "archive_paths": ["/app/data/declared/"], + "archive_format": "tar.gz", + "encryption": "r1fs-default", + }, + } + + ok = self.sm_c.apply_snapshot(record) + + self.assertFalse(ok) + self.assertFalse((volume_sync_dir(self.consumer) / "last_apply.json").exists()) + self.assertFalse((self.consumer._fixed_root / "appdata" / "other.txt").exists()) + self.assertTrue( + any("outside manifest archive_paths" in m for m in self.consumer._msgs) + ) + + def test_apply_aborts_on_r1fs_get_failure(self): + (volume_sync_dir(self.provider) / SYNC_PROCESSING_FILE).write_text("{}") + self.sm_p.publish_snapshot(["/app/data/"], {}) + record = self.sm_c.fetch_latest() + self.consumer._r1fs.get_should_raise = RuntimeError("network down") + ok = self.sm_c.apply_snapshot(record) + self.assertFalse(ok) + self.assertFalse((volume_sync_dir(self.consumer) / "last_apply.json").exists()) + self.assertIsNotNone(self.sm_c.quarantined_record(record)) + + def test_apply_success_dedupes_from_state_when_history_append_fails(self): + (volume_sync_dir(self.provider) / SYNC_PROCESSING_FILE).write_text("{}") + self.sm_p.publish_snapshot(["/app/data/"], {}) + record = self.sm_c.fetch_latest() + + with patch.object(self.sm_c, "append_received", side_effect=RuntimeError("disk full")): + ok = self.sm_c.apply_snapshot(record) + + self.assertTrue(ok) + self.assertEqual(len(list(history_received_dir(self.consumer).glob("*.json"))), 0) + latest = self.sm_c.latest_applied() + self.assertEqual(latest["state"], "applied") + self.assertEqual(latest["cid"], record["cid"]) + + def test_commit_prepared_apply_rolls_back_touched_files_on_failure(self): + target = self.consumer._fixed_root / "appdata" / "weights.bin" + target.write_bytes(b"old") + second = self.consumer._fixed_root / "appdata" / "second.bin" + + tar_path = self.tmpdir / "rollback.tar.gz" + src1 = self.tmpdir / "new-weights.bin" + src2 = self.tmpdir / "second.bin" + src1.write_bytes(b"new") + src2.write_bytes(b"second") + with tarfile.open(str(tar_path), "w:gz") as tar: + tar.add(str(src1), arcname="/app/data/weights.bin") + tar.add(str(src2), arcname="/app/data/second.bin") + + cid = "QmROLLBACK" + self.consumer._r1fs.added[cid] = tar_path.read_bytes() + record = { + "cid": cid, + "version": 123, + "timestamp": 1.0, + "node_id": "ee_provider", + "metadata": {}, + "manifest": { + "schema_version": 1, + "archive_paths": 
["/app/data/"], + "archive_format": "tar.gz", + "encryption": "r1fs-default", + }, + } + prepared = self.sm_c.prepare_apply(record) + self.assertIsNotNone(prepared) + + with patch.object( + self.sm_c, + "_safe_extract_mode", + side_effect=[0o644, RuntimeError("chmod failed")], + ): + result = self.sm_c.commit_prepared_apply(prepared) + + self.assertFalse(result.success) + self.assertTrue(result.restart_safe) + self.assertEqual(result.state, "failed_rolled_back") + self.assertEqual(target.read_bytes(), b"old") + self.assertFalse(second.exists()) + state = json.loads((sync_state_dir(self.consumer) / "current_apply.json").read_text()) + self.assertEqual(state["state"], "failed_rolled_back") + + def test_commit_prepared_apply_restores_directory_metadata_on_failure(self): + existing = self.consumer._fixed_root / "appdata" / "existing" + existing.mkdir() + os.chmod(existing, 0o700) + before_mode = stat.S_IMODE(os.stat(existing).st_mode) + + tar_path = self.tmpdir / "rollback-dir-metadata.tar.gz" + src = self.tmpdir / "new.bin" + src.write_bytes(b"new") + with tarfile.open(str(tar_path), "w:gz") as tar: + info = tarfile.TarInfo(name="/app/data/existing") + info.type = tarfile.DIRTYPE + info.mode = 0o755 + tar.addfile(info) + tar.add(str(src), arcname="/app/data/new.bin") + + cid = "QmDIRMETAROLLBACK" + self.consumer._r1fs.added[cid] = tar_path.read_bytes() + record = { + "cid": cid, + "version": 123, + "timestamp": 1.0, + "node_id": "ee_provider", + "metadata": {}, + "manifest": { + "schema_version": 1, + "archive_paths": ["/app/data/"], + "archive_format": "tar.gz", + "encryption": "r1fs-default", + }, + } + prepared = self.sm_c.prepare_apply(record) + self.assertIsNotNone(prepared) + + with patch.object( + self.sm_c, + "_safe_extract_mode", + side_effect=[0o755, RuntimeError("forced later failure")], + ): + result = self.sm_c.commit_prepared_apply(prepared) + + self.assertFalse(result.success) + self.assertTrue(result.restart_safe) + self.assertEqual(result.state, "failed_rolled_back") + self.assertEqual(stat.S_IMODE(os.stat(existing).st_mode), before_mode) + self.assertFalse((self.consumer._fixed_root / "appdata" / "new.bin").exists()) + state = json.loads((sync_state_dir(self.consumer) / "current_apply.json").read_text()) + self.assertEqual(state["state"], "failed_rolled_back") + + def test_commit_prepared_apply_removes_created_parent_dirs_on_failure(self): + new_root = self.consumer._fixed_root / "appdata" / "new" + child = new_root / "child" + target = child / "file.bin" + + tar_path = self.tmpdir / "rollback-created-parents.tar.gz" + src = self.tmpdir / "file.bin" + src.write_bytes(b"new") + with tarfile.open(str(tar_path), "w:gz") as tar: + tar.add(str(src), arcname="/app/data/new/child/file.bin") + + cid = "QmCREATEDPARENTS" + self.consumer._r1fs.added[cid] = tar_path.read_bytes() + record = { + "cid": cid, + "version": 123, + "timestamp": 1.0, + "node_id": "ee_provider", + "metadata": {}, + "manifest": { + "schema_version": 1, + "archive_paths": ["/app/data/"], + "archive_format": "tar.gz", + "encryption": "r1fs-default", + }, + } + prepared = self.sm_c.prepare_apply(record) + self.assertIsNotNone(prepared) + + with patch.object( + self.sm_c, + "_safe_extract_mode", + side_effect=RuntimeError("forced file failure"), + ): + result = self.sm_c.commit_prepared_apply(prepared) + + self.assertFalse(result.success) + self.assertTrue(result.restart_safe) + self.assertEqual(result.state, "failed_rolled_back") + self.assertFalse(target.exists()) + self.assertFalse(child.exists()) + 
self.assertFalse(new_root.exists()) + + def test_commit_prepared_apply_reports_uncertain_when_dir_metadata_rollback_fails(self): + existing = self.consumer._fixed_root / "appdata" / "existing" + existing.mkdir() + os.chmod(existing, 0o700) + + tar_path = self.tmpdir / "rollback-dir-metadata-fails.tar.gz" + src = self.tmpdir / "new.bin" + src.write_bytes(b"new") + with tarfile.open(str(tar_path), "w:gz") as tar: + info = tarfile.TarInfo(name="/app/data/existing") + info.type = tarfile.DIRTYPE + info.mode = 0o755 + tar.addfile(info) + tar.add(str(src), arcname="/app/data/new.bin") + + cid = "QmDIRMETAUNCERTAIN" + self.consumer._r1fs.added[cid] = tar_path.read_bytes() + record = { + "cid": cid, + "version": 123, + "timestamp": 1.0, + "node_id": "ee_provider", + "metadata": {}, + "manifest": { + "schema_version": 1, + "archive_paths": ["/app/data/"], + "archive_format": "tar.gz", + "encryption": "r1fs-default", + }, + } + prepared = self.sm_c.prepare_apply(record) + self.assertIsNotNone(prepared) + + import extensions.business.container_apps.sync.manager as manager_mod + original_chmod = manager_mod.os.chmod + + def _fail_restoring_existing_dir(path, mode): + if os.path.normpath(path) == os.path.normpath(existing) and mode == 0o700: + raise OSError("restore chmod failed") + return original_chmod(path, mode) + + with patch.object( + self.sm_c, + "_safe_extract_mode", + side_effect=[0o755, RuntimeError("forced later failure")], + ), patch( + "extensions.business.container_apps.sync.manager.os.chmod", + side_effect=_fail_restoring_existing_dir, + ): + result = self.sm_c.commit_prepared_apply(prepared) + + self.assertFalse(result.success) + self.assertFalse(result.restart_safe) + self.assertEqual(result.state, "uncertain") + state = json.loads((sync_state_dir(self.consumer) / "current_apply.json").read_text()) + self.assertEqual(state["state"], "uncertain") + + def test_apply_rejects_symlink_escape_without_advancing_state(self): + outside = self.tmpdir / "outside" + outside.mkdir() + symlink_dir = self.consumer._fixed_root / "appdata" / "escape" + symlink_dir.symlink_to(outside, target_is_directory=True) + + bad_tar = self.tmpdir / "bad-apply.tar.gz" + src = self.tmpdir / "bad-apply-src.txt" + src.write_text("escaped") + with tarfile.open(str(bad_tar), "w:gz") as tar: + tar.add(str(src), arcname="/app/data/escape/pwn.txt") + + cid = "QmBADSYMLINKESCAPE" + self.consumer._r1fs.added[cid] = bad_tar.read_bytes() + record = { + "cid": cid, + "version": 123, + "timestamp": 456.0, + "node_id": "ee_bad", + "metadata": {}, + "manifest": { + "schema_version": 1, + "archive_paths": ["/app/data/"], + "archive_format": "tar.gz", + "encryption": "r1fs-default", + "archive_size_bytes": bad_tar.stat().st_size, + }, + } + + ok = self.sm_c.apply_snapshot(record) + + self.assertFalse(ok) + self.assertFalse((outside / "pwn.txt").exists()) + self.assertFalse((volume_sync_dir(self.consumer) / "last_apply.json").exists()) + self.assertEqual(len(list(history_received_dir(self.consumer).glob("*.json"))), 0) + + def test_apply_two_snapshots_retires_first(self): + # First publish + apply + (volume_sync_dir(self.provider) / SYNC_PROCESSING_FILE).write_text("{}") + self.sm_p.publish_snapshot(["/app/data/"], {"v": 1}) + rec1 = self.sm_c.fetch_latest() + self.sm_c.apply_snapshot(rec1) + # Second publish + apply + (self.provider._fixed_root / "appdata" / "weights.bin").write_bytes(b"v2") + (volume_sync_dir(self.provider) / SYNC_PROCESSING_FILE).write_text("{}") + self.sm_p.publish_snapshot(["/app/data/"], {"v": 2}) + rec2 = 
self.sm_c.fetch_latest() + self.sm_c.apply_snapshot(rec2) + + files = sorted(history_received_dir(self.consumer).glob("*.json")) + self.assertEqual(len(files), 2) + older = json.loads(files[0].read_text()) + newer = json.loads(files[1].read_text()) + self.assertTrue(older["deletion"]["deletion_succeeded"]) + self.assertIsNone(newer["deletion"]["deleted_at"]) + # Consumer-side delete used cleanup_local_files=True + deleted = self.consumer._r1fs.deleted + self.assertTrue(any(cid == older["cid"] and cleanup + for (cid, _, cleanup) in deleted)) + self.assertTrue(any(cid == older["cid"] and not unpin_remote + for (cid, unpin_remote, _) in deleted)) + + +if __name__ == "__main__": + unittest.main() diff --git a/extensions/business/container_apps/tests/test_sync_mixin.py b/extensions/business/container_apps/tests/test_sync_mixin.py new file mode 100644 index 00000000..ed911c3c --- /dev/null +++ b/extensions/business/container_apps/tests/test_sync_mixin.py @@ -0,0 +1,744 @@ +"""Unit tests for ``_SyncMixin`` stand-alone methods. + +Covers env-var injection, config helpers, stale .processing recovery, and +the provider/consumer ticks driven against a fake plugin that records +stop_container/start_container call ordering. The mixin's +_configure_system_volume() is intentionally NOT tested here because it +shells out to losetup/mount which require root + a real loopback environment; +that path is exercised by the e2e scenarios (volume_sync/02_mount_persistence_sanity +and the rest). +""" + +import json +import os +import tempfile +import unittest +from pathlib import Path +from unittest.mock import patch + +from extensions.business.container_apps.sync import ( + SYSTEM_VOLUME_MOUNT, + SYSTEM_VOLUME_NAME, + SyncManager, + _SyncMixin, + history_received_dir, + volume_sync_dir, +) +from extensions.business.container_apps.sync.manager import ApplyResult +from extensions.business.container_apps.tests.test_sync_manager import ( + _FakeDockerArchiveContainer, + _FakeChainStore, + _FakeR1FS, + _make_owner, + _tar_bytes, +) + + +class _FakePlugin(_SyncMixin): + """A minimal fake plugin that mixes in _SyncMixin and records lifecycle calls.""" + + def __init__(self, owner_ns): + self._delegate = owner_ns + self.stop_calls = 0 + self.start_calls = 0 + self.runtime_stop_calls = 0 + self.fixed_volume_cleanup_calls = 0 + self.stop_result = True + self.lifecycle_log: list[str] = [] + # Mirror SyncManager-required attributes onto self by attribute lookup. + # We simply use __getattr__ to forward. + + def __getattr__(self, name): + return getattr(self._delegate, name) + + # Plugin lifecycle stubs (logged + counted) + def stop_container(self): + self.stop_calls += 1 + self.lifecycle_log.append("stop") + return self.stop_result + + def _stop_container_runtime_for_restart(self): + self.runtime_stop_calls += 1 + return self.stop_container() + + def _cleanup_fixed_size_volumes(self): + self.fixed_volume_cleanup_calls += 1 + + def start_container(self): + self.start_calls += 1 + self.lifecycle_log.append("start") + + def _reset_runtime_state_post_start(self): + """Mirror the real plugin's helper so sync-tick tests can observe both + the call order and the resulting state-marker resets. + """ + self.lifecycle_log.append("reset") + # Same resets the real container_app_runner._reset_runtime_state_post_start + # performs. Log stream / build-and-run hooks are no-ops in this fake. 
+ self.container_start_time = self.time() + self._app_ready = False + self._health_probe_start = None + self._tunnel_start_allowed = False + self._commands_started = False + + # Mark-as-mutable env so the mixin's _inject_sync_env_vars can write. + @property + def env(self): + return self._delegate.__dict__.setdefault("env", {}) + + +def _make_plugin(tmpdir, *, role="provider", enabled=True, key="SYNC-KEY-1"): + owner = _make_owner(tmpdir) + owner.cfg_sync = { + "ENABLED": enabled, + "KEY": key, + "TYPE": role, + "POLL_INTERVAL": 1, + } + owner.cfg_sync_type = role + owner.cfg_sync_key = key + plugin = _FakePlugin(owner) + # Make sure the volume-sync directory exists for tests that don't go through + # _configure_system_volume. + volume_sync_dir(plugin).mkdir(parents=True, exist_ok=True) + return plugin, owner + + +# --------------------------------------------------------------------------- +# Config helpers +# --------------------------------------------------------------------------- + +class TestConfigHelpers(unittest.TestCase): + def setUp(self): + self._tmp = tempfile.TemporaryDirectory() + self.tmpdir = Path(self._tmp.name) + + def tearDown(self): + self._tmp.cleanup() + + def test_disabled(self): + plugin, _ = _make_plugin(self.tmpdir, enabled=False) + self.assertFalse(plugin._sync_enabled()) + self.assertIsNone(plugin._ensure_sync_manager()) + + def test_enabled_provider(self): + plugin, _ = _make_plugin(self.tmpdir, role="provider") + self.assertTrue(plugin._sync_enabled()) + self.assertEqual(plugin._sync_role(), "provider") + self.assertIsInstance(plugin._ensure_sync_manager(), SyncManager) + # Lazy-init returns the same instance. + self.assertIs(plugin._ensure_sync_manager(), plugin._sync_manager) + + def test_invalid_role(self): + plugin, _ = _make_plugin(self.tmpdir, role="bogus") + self.assertIsNone(plugin._sync_role()) + + def test_poll_interval_floor(self): + plugin, owner = _make_plugin(self.tmpdir) + owner.cfg_sync["POLL_INTERVAL"] = 0 + self.assertEqual(plugin._sync_poll_interval(), 1.0) + + def test_poll_interval_invalid_falls_back(self): + plugin, owner = _make_plugin(self.tmpdir) + owner.cfg_sync["POLL_INTERVAL"] = "nope" + self.assertEqual(plugin._sync_poll_interval(), 10.0) + + def test_hsync_poll_interval_default(self): + """When SYNC.HSYNC_POLL_INTERVAL is unset, ``_hsync_poll_interval`` + returns the 60s default so consumers don't go to the network for fresh + chain replicas more than once per default window. + """ + plugin, owner = _make_plugin(self.tmpdir) + # Make sure the field really is absent on the test fixture. + owner.cfg_sync.pop("HSYNC_POLL_INTERVAL", None) + self.assertEqual(plugin._hsync_poll_interval(), 60.0) + # Same value surfaces via the SyncManager-facing property. 
+ self.assertEqual(plugin.cfg_sync_hsync_poll_interval, 60.0) + + def test_hsync_poll_interval_floor(self): + """Values below the 10s minimum are clamped up — the floor protects + the cluster from operators who set the knob aggressively low without + realising the network cost.""" + plugin, owner = _make_plugin(self.tmpdir) + owner.cfg_sync["HSYNC_POLL_INTERVAL"] = 1 + self.assertEqual(plugin._hsync_poll_interval(), 10.0) + + def test_hsync_poll_interval_invalid_falls_back(self): + """Non-numeric values fall back to the default (not the floor) — same + pattern as ``_sync_poll_interval`` so misconfiguration is forgiving + but conservative.""" + plugin, owner = _make_plugin(self.tmpdir) + owner.cfg_sync["HSYNC_POLL_INTERVAL"] = "nope" + self.assertEqual(plugin._hsync_poll_interval(), 60.0) + + def test_online_provider_capture_string_false_is_false(self): + plugin, owner = _make_plugin(self.tmpdir) + owner.cfg_sync["ALLOW_ONLINE_PROVIDER_CAPTURE"] = "false" + self.assertFalse(plugin.cfg_sync_allow_online_provider_capture) + + def test_online_provider_capture_string_true_is_true(self): + plugin, owner = _make_plugin(self.tmpdir) + owner.cfg_sync["ALLOW_ONLINE_PROVIDER_CAPTURE"] = "true" + self.assertTrue(plugin.cfg_sync_allow_online_provider_capture) + + +# --------------------------------------------------------------------------- +# Env-var injection +# --------------------------------------------------------------------------- + +class TestEnvInjection(unittest.TestCase): + def setUp(self): + self._tmp = tempfile.TemporaryDirectory() + self.tmpdir = Path(self._tmp.name) + + def tearDown(self): + self._tmp.cleanup() + + def test_always_on_keys_present(self): + plugin, _ = _make_plugin(self.tmpdir, enabled=False) + plugin._inject_sync_env_vars() + self.assertEqual(plugin.env["R1_SYSTEM_VOLUME"], "/r1en_system") + self.assertEqual(plugin.env["R1_VOLUME_SYNC_DIR"], "/r1en_system/volume-sync") + self.assertEqual( + plugin.env["R1_SYNC_REQUEST_FILE"], "/r1en_system/volume-sync/request.json" + ) + # Role/key keys not set when SYNC disabled. + self.assertNotIn("R1_SYNC_TYPE", plugin.env) + self.assertNotIn("R1_SYNC_KEY", plugin.env) + + def test_role_and_key_set_when_enabled(self): + plugin, _ = _make_plugin(self.tmpdir, role="consumer", key="abc-123") + plugin._inject_sync_env_vars() + self.assertEqual(plugin.env["R1_SYNC_TYPE"], "consumer") + self.assertEqual(plugin.env["R1_SYNC_KEY"], "abc-123") + + def test_no_env_when_sync_unavailable(self): + """If _configure_system_volume set _sync_unavailable (host tools missing), + _inject_sync_env_vars must not advertise R1_SYSTEM_VOLUME or any other + R1_* key — the mount doesn't exist on the host, so the app would write + into a phantom path while CAR polled a host root that was never + provisioned. 
Codex review finding 5 on PR #399.""" + plugin, _ = _make_plugin(self.tmpdir, role="provider", key="abc-123") + plugin._sync_unavailable = True + plugin._inject_sync_env_vars() + for k in ("R1_SYSTEM_VOLUME", "R1_VOLUME_SYNC_DIR", "R1_SYNC_REQUEST_FILE", + "R1_SYNC_TYPE", "R1_SYNC_KEY"): + self.assertNotIn(k, plugin.env) + + def test_sync_disabled_when_unavailable(self): + """_sync_enabled() must return False when _sync_unavailable is set, even + with SYNC.ENABLED=True in config — provider/consumer ticks would + otherwise poll a host root that doesn't exist.""" + plugin, _ = _make_plugin(self.tmpdir, role="provider", enabled=True) + self.assertTrue(plugin._sync_enabled()) # baseline + plugin._sync_unavailable = True + self.assertFalse(plugin._sync_enabled()) + + def test_successful_system_volume_config_clears_sync_unavailable(self): + plugin, _ = _make_plugin(self.tmpdir, role="provider", enabled=True) + plugin._sync_unavailable = True + + with patch( + "extensions.business.container_apps.sync.mixin.fixed_volume._require_tools" + ), patch( + "extensions.business.container_apps.sync.mixin.fixed_volume.provision", + side_effect=lambda vol, **_kwargs: vol, + ), patch( + "extensions.business.container_apps.sync.mixin.os.chown", + ): + plugin._configure_system_volume() + + self.assertFalse(plugin._sync_unavailable) + self.assertIn(SYSTEM_VOLUME_MOUNT, [spec["bind"] for spec in plugin.volumes.values()]) + self.assertEqual(os.stat(volume_sync_dir(plugin).parent).st_mode & 0o777, 0o755) + self.assertEqual(os.stat(volume_sync_dir(plugin)).st_mode & 0o777, 0o777) + self.assertEqual(os.stat(volume_sync_dir(plugin)).st_mode & 0o1000, 0o1000) + + def test_system_volume_config_recreates_symlinked_volume_sync_dir(self): + plugin, _ = _make_plugin(self.tmpdir, role="provider", enabled=True) + vsd = volume_sync_dir(plugin) + vsd.rmdir() + outside = self.tmpdir / "outside-control" + outside.mkdir() + os.symlink(str(outside), str(vsd)) + + with patch( + "extensions.business.container_apps.sync.mixin.fixed_volume._require_tools" + ), patch( + "extensions.business.container_apps.sync.mixin.fixed_volume.provision", + side_effect=lambda vol, **_kwargs: vol, + ), patch( + "extensions.business.container_apps.sync.mixin.os.chown", + ): + plugin._configure_system_volume() + + self.assertFalse(plugin._sync_unavailable) + self.assertTrue(vsd.is_dir()) + self.assertFalse(vsd.is_symlink()) + self.assertEqual(os.stat(vsd.parent).st_mode & 0o777, 0o755) + self.assertEqual(os.stat(vsd).st_mode & 0o1000, 0o1000) + + def test_system_volume_ignores_image_owner_and_enforces_root_ownership(self): + plugin, _ = _make_plugin(self.tmpdir, role="provider", enabled=True) + plugin._resolve_image_owner = lambda: (1000, 1000) + seen = {} + + def _provision(vol, **_kwargs): + seen["vol"] = vol + return vol + + with patch( + "extensions.business.container_apps.sync.mixin.fixed_volume._require_tools" + ), patch( + "extensions.business.container_apps.sync.mixin.fixed_volume.provision", + side_effect=_provision, + ), patch( + "extensions.business.container_apps.sync.mixin.os.chown" + ) as chown: + plugin._configure_system_volume() + + self.assertIsNone(seen["vol"].owner_uid) + self.assertIsNone(seen["vol"].owner_gid) + chown.assert_any_call(str(volume_sync_dir(plugin).parent), 0, 0) + chown.assert_any_call(str(volume_sync_dir(plugin)), 0, 0) + self.assertFalse(plugin._sync_unavailable) + + +# --------------------------------------------------------------------------- +# Stale .processing recovery +# 
--------------------------------------------------------------------------- + +class TestRecoverStaleProcessing(unittest.TestCase): + def setUp(self): + self._tmp = tempfile.TemporaryDirectory() + self.tmpdir = Path(self._tmp.name) + self.plugin, self.owner = _make_plugin(self.tmpdir) + self.vsd = volume_sync_dir(self.plugin) + + def tearDown(self): + self._tmp.cleanup() + + def test_no_op_when_no_processing(self): + self.plugin._recover_stale_processing() # should not raise + self.assertFalse((self.vsd / "request.json").exists()) + + def test_renames_processing_back(self): + (self.vsd / "request.json.processing").write_text('{"archive_paths":["/app/data/"]}') + self.plugin._recover_stale_processing() + self.assertFalse((self.vsd / "request.json.processing").exists()) + self.assertTrue((self.vsd / "request.json").is_file()) + + def test_keeps_existing_request_intact(self): + # If both exist (rare crash race), don't overwrite the in-flight request. + (self.vsd / "request.json").write_text('{"archive_paths":["/app/data/"]}') + (self.vsd / "request.json.processing").write_text('{"archive_paths":["/old/"]}') + self.plugin._recover_stale_processing() + # .processing untouched, request.json preserved. + self.assertTrue((self.vsd / "request.json.processing").exists()) + self.assertEqual( + json.loads((self.vsd / "request.json").read_text())["archive_paths"], + ["/app/data/"], + ) + + +# --------------------------------------------------------------------------- +# Provider tick +# --------------------------------------------------------------------------- + +class TestProviderTick(unittest.TestCase): + def setUp(self): + self._tmp = tempfile.TemporaryDirectory() + self.tmpdir = Path(self._tmp.name) + self.plugin, self.owner = _make_plugin(self.tmpdir, role="provider") + self.vsd = volume_sync_dir(self.plugin) + # Seed data volume + (self.owner._fixed_root / "appdata" / "weights.bin").write_bytes(b"abc") + + def tearDown(self): + self._tmp.cleanup() + + def _write_request(self, body): + (self.vsd / "request.json").write_text(json.dumps(body)) + + def test_no_request_no_action(self): + self.plugin._sync_provider_tick(current_time=100.0) + self.assertEqual(self.plugin.stop_calls, 0) + self.assertEqual(self.plugin.start_calls, 0) + + def test_disabled_no_action(self): + self.owner.cfg_sync["ENABLED"] = False + self._write_request({"archive_paths": ["/app/data/"]}) + self.plugin._sync_provider_tick(current_time=100.0) + self.assertEqual(self.plugin.stop_calls, 0) + self.assertTrue((self.vsd / "request.json").exists()) + + def test_consumer_role_no_provider_action(self): + self.owner.cfg_sync["TYPE"] = "consumer" + self._write_request({"archive_paths": ["/app/data/"]}) + self.plugin._sync_provider_tick(current_time=100.0) + self.assertEqual(self.plugin.stop_calls, 0) + + def test_throttle_skips_within_poll_interval(self): + self.owner.cfg_sync["POLL_INTERVAL"] = 100 + self._write_request({"archive_paths": ["/app/data/"]}) + self.plugin._last_sync_check = 90.0 + self.plugin._sync_provider_tick(current_time=100.0) # only 10s since last + self.assertEqual(self.plugin.stop_calls, 0) + + def test_full_provider_flow(self): + self._write_request({"archive_paths": ["/app/data/"], "metadata": {"v": 1}}) + self.plugin._sync_provider_tick(current_time=1000.0) + + # stop -> work -> start in that order + self.assertEqual(self.plugin.lifecycle_log, ["stop", "start", "reset"]) + # response.json + chainstore + history all produced + response = json.loads((self.vsd / "response.json").read_text()) + 
self.assertEqual(response["status"], "ok") + self.assertEqual(len(self.owner._cs.hset_calls), 1) + + def test_provider_sync_uses_runtime_stop_without_fixed_volume_cleanup(self): + self._write_request({"archive_paths": ["/app/data/"]}) + + self.plugin._sync_provider_tick(current_time=1000.0) + + self.assertEqual(self.plugin.runtime_stop_calls, 1) + self.assertEqual(self.plugin.fixed_volume_cleanup_calls, 0) + self.assertEqual(self.plugin.lifecycle_log, ["stop", "start", "reset"]) + + def test_online_provider_capture_skips_runtime_stop(self): + self.owner.cfg_sync["ALLOW_ONLINE_PROVIDER_CAPTURE"] = True + self.plugin.container = _FakeDockerArchiveContainer({ + "/tmp/generated.txt": _tar_bytes("generated.txt", b"from-container"), + }) + self._write_request({ + "archive_paths": ["/tmp/generated.txt"], + "runtime": {"provider_capture": "online"}, + }) + + self.plugin._sync_provider_tick(current_time=1000.0) + + self.assertEqual(self.plugin.runtime_stop_calls, 0) + self.assertEqual(self.plugin.start_calls, 0) + self.assertEqual(self.plugin.lifecycle_log, []) + response = json.loads((self.vsd / "response.json").read_text()) + self.assertEqual(response["status"], "ok") + + def test_validation_failure_does_not_stop_container(self): + # claim_request fails fast; no need to disturb the container. + self._write_request({"archive_paths": ["/nope/"]}) + self.plugin._sync_provider_tick(current_time=1000.0) + self.assertEqual(self.plugin.stop_calls, 0) + self.assertEqual(self.plugin.start_calls, 0) + invalid = json.loads((self.vsd / "request.json.invalid").read_text()) + self.assertEqual(invalid["_error"]["stage"], "validation") + + def test_publish_failure_still_restarts_container(self): + self._write_request({"archive_paths": ["/app/data/"]}) + self.owner._r1fs.add_should_raise = RuntimeError("ipfs gone") + self.plugin._sync_provider_tick(current_time=1000.0) + # We did stop because claim succeeded; the failure was at r1fs stage. + self.assertEqual(self.plugin.lifecycle_log, ["stop", "start", "reset"]) + response = json.loads((self.vsd / "response.json").read_text()) + self.assertEqual(response["stage"], "r1fs_upload") + + def test_offline_provider_stop_failure_aborts_before_publish(self): + self.plugin.stop_result = False + self._write_request({"archive_paths": ["/app/data/"]}) + + self.plugin._sync_provider_tick(current_time=1000.0) + + self.assertEqual(self.plugin.lifecycle_log, ["stop"]) + self.assertEqual(self.owner._cs.hset_calls, []) + self.assertEqual(self.owner._r1fs.added, {}) + response = json.loads((self.vsd / "response.json").read_text()) + self.assertEqual(response["status"], "error") + self.assertEqual(response["stage"], "runtime_stop") + + +# --------------------------------------------------------------------------- +# Consumer tick +# --------------------------------------------------------------------------- + +class TestConsumerTick(unittest.TestCase): + def setUp(self): + self._tmp = tempfile.TemporaryDirectory() + self.tmpdir = Path(self._tmp.name) + # Set up provider+consumer plugins sharing one r1fs/chainstore. + self.provider_plugin, self.provider_owner = _make_plugin( + self.tmpdir / "p", role="provider" + ) + self.consumer_plugin, self.consumer_owner = _make_plugin( + self.tmpdir / "c", role="consumer" + ) + # Share state by using the provider's r1fs/chainstore on the consumer. 
+ shared_r1fs = self.provider_owner._r1fs + shared_cs = self.provider_owner._cs + self.consumer_owner.r1fs = shared_r1fs + self.consumer_owner._r1fs = shared_r1fs + self.consumer_owner.chainstore_hset = shared_cs.hset + self.consumer_owner.chainstore_hget = shared_cs.hget + self.consumer_owner.chainstore_hsync = shared_cs.hsync + self.consumer_owner._cs = shared_cs + # Same SYNC.KEY across both + self.consumer_owner.cfg_sync["KEY"] = "SYNC-KEY-1" + self.consumer_owner.cfg_sync_key = "SYNC-KEY-1" + + (self.provider_owner._fixed_root / "appdata" / "weights.bin").write_bytes(b"data1") + + def tearDown(self): + self._tmp.cleanup() + + def _publish(self, content=b"data1", runtime=None): + (self.provider_owner._fixed_root / "appdata" / "weights.bin").write_bytes(content) + p_vsd = volume_sync_dir(self.provider_plugin) + p_vsd.mkdir(parents=True, exist_ok=True) + request = {"archive_paths": ["/app/data/"]} + if runtime is not None: + request["runtime"] = runtime + (p_vsd / "request.json").write_text(json.dumps(request)) + self.provider_plugin._last_sync_check = 0 + self.provider_plugin._sync_provider_tick(current_time=1000.0) + + def test_no_record_no_action(self): + self.consumer_plugin._sync_consumer_tick(current_time=1000.0) + self.assertEqual(self.consumer_plugin.stop_calls, 0) + + def test_full_consumer_flow(self): + self._publish() + self.consumer_plugin._sync_consumer_tick(current_time=2000.0) + self.assertEqual(self.consumer_plugin.lifecycle_log, ["stop", "start", "reset"]) + target = self.consumer_owner._fixed_root / "appdata" / "weights.bin" + self.assertEqual(target.read_bytes(), b"data1") + self.assertTrue((volume_sync_dir(self.consumer_plugin) / "last_apply.json").exists()) + + def test_consumer_explicit_offline_restart_stops_applies_and_restarts(self): + self._publish(runtime={"consumer_apply": "offline_restart"}) + + self.consumer_plugin._sync_consumer_tick(current_time=2000.0) + + self.assertEqual(self.consumer_plugin.runtime_stop_calls, 1) + self.assertEqual(self.consumer_plugin.lifecycle_log, ["stop", "start", "reset"]) + target = self.consumer_owner._fixed_root / "appdata" / "weights.bin" + self.assertEqual(target.read_bytes(), b"data1") + + def test_consumer_online_no_restart_falls_back_to_offline_restart(self): + self.consumer_owner.cfg_sync["CONSUMER_APPLY_MODE"] = "online_no_restart" + self._publish(runtime={"consumer_apply": "online_no_restart"}) + + self.consumer_plugin._sync_consumer_tick(current_time=2000.0) + + self.assertEqual(self.consumer_plugin.runtime_stop_calls, 1) + self.assertEqual(self.consumer_plugin.lifecycle_log, ["stop", "start", "reset"]) + self.assertEqual( + self.consumer_plugin._sync_last_apply_mode_resolution, + { + "requested_mode": "online_no_restart", + "effective_mode": "offline_restart", + "reason": "online_apply_disabled", + }, + ) + target = self.consumer_owner._fixed_root / "appdata" / "weights.bin" + self.assertEqual(target.read_bytes(), b"data1") + self.assertTrue((volume_sync_dir(self.consumer_plugin) / "last_apply.json").exists()) + + def test_consumer_online_restart_falls_back_to_offline_restart(self): + self.consumer_owner.cfg_sync["CONSUMER_APPLY_MODE"] = "online_restart" + target = self.consumer_owner._fixed_root / "appdata" / "weights.bin" + target.write_bytes(b"old") + self._publish(content=b"new", runtime={"consumer_apply": "online_restart"}) + + self.consumer_plugin._sync_consumer_tick(current_time=2000.0) + + self.assertEqual(self.consumer_plugin.runtime_stop_calls, 1) + self.assertEqual(self.consumer_plugin.lifecycle_log, 
["stop", "start", "reset"]) + self.assertEqual(target.read_bytes(), b"new") + self.assertEqual( + self.consumer_plugin._sync_last_apply_mode_resolution, + { + "requested_mode": "online_restart", + "effective_mode": "offline_restart", + "reason": "online_apply_disabled", + }, + ) + + def test_provider_record_cannot_force_consumer_online_apply(self): + self._publish(runtime={"consumer_apply": "online_no_restart"}) + + self.consumer_plugin._sync_consumer_tick(current_time=2000.0) + + self.assertEqual(self.consumer_plugin.runtime_stop_calls, 1) + self.assertEqual(self.consumer_plugin.lifecycle_log, ["stop", "start", "reset"]) + + def test_consumer_resets_runtime_state_after_apply(self): + """After a sync slice, per-restart runtime markers must be reset so + readiness gates, health-probe timers, and BUILD_AND_RUN_COMMANDS re-engage + against the freshly-started container. Otherwise tunnels stay marked + ready, health checks are skipped, and image-defined startup commands + don't rerun — the codex review's HIGH-severity finding 2 on PR #399. + """ + # Seed the plugin with "previous container is running" markers. + self.consumer_plugin.container_start_time = 999.0 + self.consumer_plugin._app_ready = True + self.consumer_plugin._health_probe_start = 999.0 + self.consumer_plugin._tunnel_start_allowed = True + self.consumer_plugin._commands_started = True + + self._publish() + self.consumer_plugin._sync_consumer_tick(current_time=2000.0) + + # Order: stop, start, then reset (reset MUST come after start so the + # markers reflect the new container, not the prior one). + self.assertEqual( + self.consumer_plugin.lifecycle_log, ["stop", "start", "reset"] + ) + # All readiness / probe / command-rerun markers reset. The fake's + # ``time()`` is a monotonic counter that increments on each read, so + # compare against the seeded sentinel (999.0) rather than chasing the + # exact post-reset value. + self.assertNotEqual(self.consumer_plugin.container_start_time, 999.0) + self.assertIsNotNone(self.consumer_plugin.container_start_time) + self.assertFalse(self.consumer_plugin._app_ready) + self.assertIsNone(self.consumer_plugin._health_probe_start) + self.assertFalse(self.consumer_plugin._tunnel_start_allowed) + self.assertFalse(self.consumer_plugin._commands_started) + + def test_skips_when_record_cid_matches_last_apply(self): + """The consumer's 'is this new?' check is by CID, not version. A second + tick that sees the same ChainStore record (same cid) is a no-op even + if version metadata changed.""" + self._publish() + self.consumer_plugin._sync_consumer_tick(current_time=2000.0) + self.consumer_plugin.lifecycle_log.clear() + self.consumer_plugin._last_sync_check = 0 # reset throttle + # Tick again without a new publish — should be a no-op (same cid). + self.consumer_plugin._sync_consumer_tick(current_time=3000.0) + self.assertEqual(self.consumer_plugin.lifecycle_log, []) + + def test_applies_when_cid_differs_even_if_version_lower(self): + """A consumer should apply any record whose cid differs from the last + applied entry, regardless of version ordering. This guards against + clock-skew failure modes where a provider's wonky timestamp could + otherwise make a corrected snapshot look 'older'.""" + # First publish + apply (creates a baseline received entry). 
+ self._publish(content=b"initial") + self.consumer_plugin._sync_consumer_tick(current_time=2000.0) + initial_received = self.consumer_plugin._sync_manager.latest_received() + self.assertIsNotNone(initial_received) + initial_version = initial_received["version"] + + # Hand-craft a chainstore record with a *lower* version but a fresh CID. + # Under the old version-comparison logic this would be skipped; under + # CID comparison it must be applied. + spoofed_cid = "QmSPOOF_LOWER_VERSION_FRESH_CONTENT" + fake_tar = self.consumer_owner._r1fs.added.get(initial_received["cid"], b"") + self.consumer_owner._r1fs.added[spoofed_cid] = fake_tar + self.consumer_owner._cs.store[("CHAINSTORE_SYNC", "SYNC-KEY-1")] = { + "cid": spoofed_cid, + "version": initial_version - 100, # explicitly older + "timestamp": 0.5, + "node_id": "ee_other", + "metadata": {"who": "wonky-clock"}, + "manifest": initial_received["manifest"], + } + + self.consumer_plugin.lifecycle_log.clear() + self.consumer_plugin._last_sync_check = 0 + self.consumer_plugin._sync_consumer_tick(current_time=3000.0) + # The new (lower-versioned but different-cid) record was applied. + self.assertEqual(self.consumer_plugin.lifecycle_log, ["stop", "start", "reset"]) + latest = self.consumer_plugin._sync_manager.latest_received() + self.assertEqual(latest["cid"], spoofed_cid) + + def test_misalignment_skips_apply(self): + # Store a record in chainstore that references a path consumer can't map. + self.consumer_owner._cs.store[("CHAINSTORE_SYNC", "SYNC-KEY-1")] = { + "cid": "QmFAKE_BAD", + "version": 9999999999, + "timestamp": 1.0, + "node_id": "ee_other", + "metadata": {}, + "manifest": { + "schema_version": 1, + "archive_paths": ["/app/data/", "/nope/"], + "archive_format": "tar.gz", + "encryption": "r1fs-default", + "archive_size_bytes": 123, + }, + } + self.consumer_plugin._sync_consumer_tick(current_time=2000.0) + self.assertEqual(self.consumer_plugin.lifecycle_log, []) + # No last_apply written + self.assertFalse( + (volume_sync_dir(self.consumer_plugin) / "last_apply.json").exists() + ) + # No history advance + self.assertEqual( + len(list(history_received_dir(self.consumer_plugin).glob("*.json"))), 0 + ) + + def test_r1fs_failure_is_quarantined_before_container_stop(self): + self._publish(content=b"new-data") + self.consumer_owner._r1fs.get_should_raise = RuntimeError("network down") + + self.consumer_plugin._sync_consumer_tick(current_time=2000.0) + + self.assertEqual(self.consumer_plugin.lifecycle_log, []) + record = self.consumer_owner._cs.store[("CHAINSTORE_SYNC", "SYNC-KEY-1")] + self.assertIsNotNone(self.consumer_plugin._sync_manager.quarantined_record(record)) + + self.consumer_plugin._last_sync_check = 0 + self.consumer_plugin._sync_consumer_tick(current_time=3000.0) + + self.assertEqual(self.consumer_plugin.lifecycle_log, []) + + def test_uncertain_apply_does_not_restart_container(self): + self._publish(content=b"new-data") + + def uncertain(_prepared): + return ApplyResult(False, False, "uncertain", [], "rollback failed") + + sm = self.consumer_plugin._ensure_sync_manager() + with patch.object( + sm, + "commit_prepared_apply", + side_effect=uncertain, + ): + self.consumer_plugin._sync_consumer_tick(current_time=2000.0) + + self.assertEqual(self.consumer_plugin.lifecycle_log, ["stop"]) + self.assertEqual(self.consumer_plugin.start_calls, 0) + + def test_non_dict_manifest_skips_without_restart(self): + self.consumer_owner._cs.store[("CHAINSTORE_SYNC", "SYNC-KEY-1")] = { + "cid": "QmFAKE_BAD_MANIFEST", + "version": 9999999999, + 
"timestamp": 1.0, + "node_id": "ee_other", + "metadata": {}, + "manifest": "not-an-object", + } + + self.consumer_plugin._sync_consumer_tick(current_time=2000.0) + + self.assertEqual(self.consumer_plugin.lifecycle_log, []) + self.assertFalse( + (volume_sync_dir(self.consumer_plugin) / "last_apply.json").exists() + ) + + def test_offline_consumer_stop_failure_aborts_before_apply(self): + self._publish(content=b"new-data") + self.consumer_plugin.stop_result = False + + self.consumer_plugin._sync_consumer_tick(current_time=2000.0) + + self.assertEqual(self.consumer_plugin.lifecycle_log, ["stop"]) + self.assertFalse( + (volume_sync_dir(self.consumer_plugin) / "last_apply.json").exists() + ) + self.assertEqual( + len(list(history_received_dir(self.consumer_plugin).glob("*.json"))), 0 + ) + + +if __name__ == "__main__": + unittest.main() From 31da87cfcb3131efa40e939ff3caaf8607cc108b Mon Sep 17 00:00:00 2001 From: Cristi Bleotiu Date: Fri, 15 May 2026 12:06:08 +0300 Subject: [PATCH 3/4] fix: report extra tunnel cleanup failures What changed: - log extra tunnel cleanup success only when every tunnel stopped - add coverage for failed extra tunnel cleanup logging Why: - avoid misleading success logs in hardened cleanup paths --- .../container_apps/container_app_runner.py | 5 ++++- .../tests/test_container_lifecycle.py | 22 +++++++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/extensions/business/container_apps/container_app_runner.py b/extensions/business/container_apps/container_app_runner.py index 52601f64..87d46b77 100644 --- a/extensions/business/container_apps/container_app_runner.py +++ b/extensions/business/container_apps/container_app_runner.py @@ -2030,7 +2030,10 @@ def stop_extra_tunnels(self): for container_port in list(self.extra_tunnel_processes.keys()): result = self._stop_extra_tunnel(container_port) and result - self.P("All extra tunnels stopped") + if result: + self.P("All extra tunnels stopped", color='g') + else: + self.P("One or more extra tunnels failed to stop; preserving live handles for retry.", color='r') return result diff --git a/extensions/business/container_apps/tests/test_container_lifecycle.py b/extensions/business/container_apps/tests/test_container_lifecycle.py index ecd6a285..20141f97 100644 --- a/extensions/business/container_apps/tests/test_container_lifecycle.py +++ b/extensions/business/container_apps/tests/test_container_lifecycle.py @@ -22,6 +22,7 @@ import docker.types from extensions.business.container_apps.tests.support import ( + make_container_app_runner, make_lifecycle_runner, make_mock_container, make_mock_docker_client, @@ -225,6 +226,27 @@ def test_container_none_returns_false(self): self.assertFalse(plugin._check_container_status()) +class TestExtraTunnelCleanup(unittest.TestCase): + + def test_stop_extra_tunnels_logs_failure_when_any_tunnel_fails(self): + plugin = make_container_app_runner() + plugin.extra_tunnel_processes = { + 8001: object(), + 8002: object(), + } + plugin._stop_extra_tunnel = MagicMock(side_effect=[False, True]) + + result = plugin.stop_extra_tunnels() + + self.assertFalse(result) + self.assertEqual(plugin._stop_extra_tunnel.call_count, 2) + self.assertIn( + "One or more extra tunnels failed to stop", + plugin.logged_messages[-1], + ) + self.assertNotIn("All extra tunnels stopped", plugin.logged_messages[-1]) + + # =========================================================================== # Restart # =========================================================================== From 
9cdc15a0c55e2a1377445a6398d08329e3b99936 Mon Sep 17 00:00:00 2001 From: Cristi Bleotiu Date: Fri, 15 May 2026 12:37:10 +0300 Subject: [PATCH 4/4] chore: inc ver --- ver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ver.py b/ver.py index c4106a1d..a8fb3a53 100644 --- a/ver.py +++ b/ver.py @@ -1 +1 @@ -__VER__ = '2.10.219' +__VER__ = '2.10.221'