From 17b4e47396f5e1e874c50b8b56967b9d0c463395 Mon Sep 17 00:00:00 2001 From: Adrian Curtin <48138055+AdrianCurtin@users.noreply.github.com> Date: Tue, 9 Jun 2026 16:56:54 -0400 Subject: [PATCH] Release 5.5.0: multimodal embeddings & tooling Bumps the SDK to 5.5.0 and adds a major set of embedding, image, caching, migration, and hardening features. Key changes: - Multimodal image bytes path: SDK-side image download via Parse::Embeddings::ImageFetch with magic-byte MIME sniffing, URL-extension cross-check, and configurable allowed_image_types; EXIF/XMP stripping is on by default; embed_image now supports source: :bytes and FetchedImage objects to avoid provider-side fetches. - Bulk embedding & resilience: Parse::Embeddings::BatchEmbedder adds batch slicing, inter-batch pacing, exponential backoff with jitter, and a BatchFailed error for resumable jobs. - Query-embed cache: Parse::Embeddings::Cache (opt-in, LRU+TTL) with MonetaStore adapter for persistent L2 sharing and a hashed keyspace to avoid plaintext queries landing in stores; cache hits emit existing embed notifications with cached: true. - Spend-cap improvements: SpendCap now covers all query-embed paths (direct callers included), supports warn_at soft-cap notifications, and provides tooling to avoid double-billing for agent tools. - Embedding provenance & migrations: auto-declared _meta object with {provider,model,dimensions,modality,embedded_at}; Class.reembed! for resumable bulk re-embeds; guidance for same-shape vs changed-width migrations and dual-field workflow. - Vector index drift detection: first-query verification of Atlas vectorSearch index numDimensions/similarity and tenant-scope coverage with configurable Parse::VectorSearch.index_drift_policy (:warn/:raise/:ignore). 
- Retrieval & filter hardening: pointer-value translation into MongoDB storage form for pointer-valued filters; various ACL/aggregation fixes and a new strict: option for permission constraints; aggregation terminals now route via mongo-direct when necessary and fail closed when the query is scoped and mongo-direct is unavailable. - Hybrid search & ACL fixes: rankFusion score recomputation for scoped callers, probe error-class narrowing, and webhook after_save callback hardening (single-run semantics and swallowed callback errors where appropriate). - Client ergonomics & docs: README, changelog and Atlas vector search guide updated with new features, examples, and operator notes; numerous tests added/updated for embeddings, image fetch, cache, batch embedder, vector drift, retrieval filters and webhook behavior. Overall this changeset hardens embedding/image handling (PII protections and MIME-laundering prevention), adds operational tooling for bulk re-embedding and caching, and tightens vector-search / ACL correctness and safety. 
--- CHANGELOG.md | 344 ++++++++++++ Gemfile.lock | 2 +- README.md | 51 +- docs/atlas_vector_search_guide.md | 333 ++++++++++- lib/parse/api/users.rb | 10 + lib/parse/client.rb | 20 +- lib/parse/embeddings.rb | 43 +- lib/parse/embeddings/batch_embedder.rb | 188 +++++++ lib/parse/embeddings/cache.rb | 322 +++++++++++ lib/parse/embeddings/cohere.rb | 49 +- lib/parse/embeddings/image_fetch.rb | 347 ++++++++++++ lib/parse/embeddings/provider.rb | 28 +- lib/parse/embeddings/spend_cap.rb | 120 +++- lib/parse/embeddings/voyage.rb | 59 +- lib/parse/model/acl.rb | 26 +- lib/parse/model/core/embed_managed.rb | 257 ++++++++- lib/parse/model/core/vector_searchable.rb | 165 +++++- lib/parse/query.rb | 275 +++++++-- lib/parse/query/constraint.rb | 22 + lib/parse/query/constraints.rb | 521 +++++++++--------- lib/parse/retrieval/agent_tool.rb | 35 +- lib/parse/retrieval/retriever.rb | 84 +++ lib/parse/schema/search_index_migrator.rb | 49 +- lib/parse/stack/version.rb | 2 +- lib/parse/vector_search.rb | 34 ++ lib/parse/vector_search/hybrid.rb | 40 +- lib/parse/webhooks.rb | 128 ++++- lib/parse/webhooks/payload.rb | 8 +- test/lib/parse/acl_constraints_unit_test.rb | 224 +++++++- .../agent/mcp_resource_subscriptions_test.rb | 178 ++++++ .../parse/aggregation_auto_promotion_test.rb | 81 ++- test/lib/parse/cloud_functions_module_test.rb | 59 ++ test/lib/parse/cloud_result_decode_test.rb | 38 ++ .../parse/embed_managed_meta_reembed_test.rb | 232 ++++++++ .../parse/embeddings_batch_embedder_test.rb | 171 ++++++ test/lib/parse/embeddings_cache_test.rb | 266 +++++++++ .../lib/parse/embeddings_cohere_image_test.rb | 4 +- test/lib/parse/embeddings_image_fetch_test.rb | 340 ++++++++++++ .../parse/embeddings_spend_cap_query_test.rb | 150 +++++ .../lib/parse/embeddings_voyage_image_test.rb | 5 +- test/lib/parse/find_similar_test.rb | 46 +- .../constraints/acl_query_constraints_test.rb | 251 ++++----- .../hint_mongo_direct_integration_test.rb | 109 ++++ .../query/regex_unicode_integration_test.rb 
| 114 ++++ .../parse/regex_unicode_option_unit_test.rb | 99 ++++ .../parse/retrieval_pointer_filter_test.rb | 101 ++++ ...earch_index_migrator_tenant_filter_test.rb | 101 ++++ test/lib/parse/vector_index_drift_test.rb | 257 +++++++++ .../vector_search_hybrid_security_test.rb | 96 ++++ .../parse/verify_password_rate_limit_test.rb | 128 +++++ ...webhook_aftersave_payload_fidelity_test.rb | 9 +- test/lib/parse/webhook_callbacks_test.rb | 389 +++++++++++++ test/lib/parse/webhook_triggers_test.rb | 5 + 53 files changed, 6403 insertions(+), 612 deletions(-) create mode 100644 lib/parse/embeddings/batch_embedder.rb create mode 100644 lib/parse/embeddings/cache.rb create mode 100644 lib/parse/embeddings/image_fetch.rb create mode 100644 test/lib/parse/embed_managed_meta_reembed_test.rb create mode 100644 test/lib/parse/embeddings_batch_embedder_test.rb create mode 100644 test/lib/parse/embeddings_cache_test.rb create mode 100644 test/lib/parse/embeddings_image_fetch_test.rb create mode 100644 test/lib/parse/embeddings_spend_cap_query_test.rb create mode 100644 test/lib/parse/query/hint_mongo_direct_integration_test.rb create mode 100644 test/lib/parse/query/regex_unicode_integration_test.rb create mode 100644 test/lib/parse/regex_unicode_option_unit_test.rb create mode 100644 test/lib/parse/retrieval_pointer_filter_test.rb create mode 100644 test/lib/parse/search_index_migrator_tenant_filter_test.rb create mode 100644 test/lib/parse/vector_index_drift_test.rb create mode 100644 test/lib/parse/vector_search_hybrid_security_test.rb create mode 100644 test/lib/parse/verify_password_rate_limit_test.rb diff --git a/CHANGELOG.md b/CHANGELOG.md index 7e18ace..41ce196 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,349 @@ ## parse-stack-next Changelog +### 5.5.0 + +#### Multimodal bytes-fetch path with magic-byte MIME verification + +- **NEW**: `Parse::Embeddings::ImageFetch` — the SDK-side image download + layer for image embeddings. 
Downloads through the existing + `Parse::File.safe_open_url` SSRF primitive (CIDR blocks, port allowlist, + DNS-rebinding re-check, size caps, timeouts — no parallel fetch mechanism), + determines the MIME type **exclusively by magic-byte sniffing** of the + leading bytes (JPEG / PNG / GIF / WebP), cross-checks the URL extension + against the sniffed type, and enforces a configurable + `Parse::Embeddings.allowed_image_types` allowlist. The HTTP `Content-Type` + header is never consulted, closing the file MIME-laundering gap: a `.jpg` + URL serving HTML (or PNG bytes behind a JPEG extension) is refused outright. +- **NEW**: `embed_image ..., source: :bytes` declaration mode. Where the + default `source: :url` forwards a validated URL for the provider to fetch + itself (and therefore requires the `trust_provider_url_fetch` sentinel), + `:bytes` mode has the SDK download, verify, and metadata-strip the image, + then forward it to the provider as a base64 data URI. No third-party URL + egress occurs, so the sentinel is not required — but the file's host must + still be in `Parse::Embeddings.allowed_image_hosts` (deny-all when empty). + + ```ruby + class Post < Parse::Object + property :cover_image, :file + property :cover_embedding, :vector, dimensions: 1024, provider: :voyage + embed_image :cover_image, into: :cover_embedding, source: :bytes + end + ``` +- **NEW**: EXIF/XMP metadata stripping, **default ON** for the bytes path. + User-uploaded photos commonly carry GPS coordinates and device serial + numbers; forwarding them to an embedding provider is a PII egress. JPEG + APP1 segments (Exif and XMP), PNG `eXIf` chunks, and WebP `EXIF`/`XMP ` + RIFF chunks (with the VP8X flag bits cleared) are removed before the bytes + leave the process. Opt out per declaration with `exif_strip: false` when + orientation metadata must be preserved. 
+- **NEW**: `Voyage#embed_image` and `Cohere#embed_image` accept + `Parse::Embeddings::ImageFetch::FetchedImage` sources alongside URL + Strings (forms may be mixed in one batch). Fetched bytes ride Voyage's + `image_base64` content row and Cohere's `image_url` data-URI form. +- **NEW**: `Parse::Embeddings.allowed_image_types=` — MIME allowlist for the + bytes path (default JPEG/PNG/GIF/WebP; SVG deliberately excluded as + script-capable active content). +- **ENHANCED**: `Parse::Embeddings.validate_image_url!` accepts + `mode: :fetch` for SDK-side downloads — same host allowlist, + obfuscated-IP screen, port and CIDR checks as the default `:forward` + mode, minus the provider-egress sentinel that doesn't apply when no URL + is forwarded. + +#### Embedding-model migration tooling + +- **NEW**: `Class.reembed!(field:, batch_size:, limit:, where:, only_stale:, + save_opts:)` — bulk re-embed for provider/model migrations. Unlike + `embed_pending!` (which only fills null vectors), `reembed!` walks every + row with objectId-cursor pagination, clears the digest sibling so the + save-path recompute cannot elide the provider call, and saves. With + `only_stale: true` the walk skips rows whose recorded provenance already + matches the current provider, model, and dimensions — making a partially + failed migration resumable. +- **NEW**: `embed` / `embed_image` auto-declare a `<field>_meta` `:object` + sibling property recording `{ provider, model, dimensions, modality, + embedded_at }` on every recompute (cleared when the source clears). + This is the provenance record `reembed!(only_stale: true)` reads, and it + tells operational tooling which model produced any stored vector. + Override the name with `meta_field:`. + +#### Bulk embedding and query-embed caching + +- **NEW**: `Parse::Embeddings::BatchEmbedder` — batch-level orchestration + for bulk embedding jobs. 
Wraps any registered provider with batch slicing + (defaulting to the provider's own batch-size hint), requests-per-minute + pacing between calls, and batch-level exponential backoff with jitter on + rate-limit / transient errors (previously backoff lived only inside each + provider's single HTTP call). A batch that exhausts its attempts raises + `BatchEmbedder::BatchFailed` carrying `batch_index` and `completed_count` + so a resumable job knows where to pick up. Supports `retry_on:` exception + overrides and an `on_progress:` callback. +- **NEW**: `Parse::Embeddings::Cache` — process-local embedding cache keyed + by `(provider, model, dimensions, input_type, SHA-256(input))`, disabled by + default. Dimensions participate in the key so two registrations of the + same Matryoshka-capable model at different output widths never serve each + other's vectors. + `Parse::Embeddings::Cache.enable!(max_entries:, ttl:)` activates an LRU + + TTL store (or pass `store:` for a custom backend); repeated identical + query embeds through `find_similar(text:)`, `hybrid_search(text:)`, and + `Parse::Retrieval.retrieve` then skip the provider round-trip. Cache hits + emit the standard `parse.embeddings.embed` notification with + `cached: true`, so existing spend subscribers see hits and misses on one + stream. The input text is hashed before keying — plaintext queries never + land in a shared store. + +#### Vector index drift detection + +- **NEW**: first-query verification of deployed Atlas vectorSearch indexes. + When `find_similar` / `hybrid_search` auto-discovers an index, the SDK now + compares the index's `numDimensions` and `similarity` against the + `:vector` property declaration, and — when the class registers an + `agent_tenant_scope` — confirms the scope field is declared as a + `type: "filter"` path (without it, every tenant-scoped + `$vectorSearch.filter` fails Atlas-side). 
Findings are computed once per + (class, field, index) per process and governed by + `Parse::VectorSearch.index_drift_policy`: `:warn` (default) emits a + `[Parse::VectorSearch:DRIFT]` warning on the first check; `:raise` raises + `Parse::Core::VectorSearchable::IndexDriftError` on **every** query + against the drifted index, so strict deployments never serve degraded + results after the first failure; `:ignore` skips verification. An + explicit `index:` kwarg is verified best-effort when the catalog's + covering index carries the same name (lookup failures never fail the + query). + +#### Hybrid search hardening + +- **FIXED**: on the opt-in native `$rankFusion` path, a scoped (non-master) + caller's `_hybrid_score` is now recomputed from the post-ACL visible + ordering instead of surfacing the raw fused score. The raw score is + materialized before the ACL `$match`, so it encoded a surviving row's + rank among rows the caller cannot read — a cross-tenant/cross-ACL + inference channel for callers probing with crafted queries. The + recomputed score is monotone with the true fused order but is a function + of visible rows only. Master-key results and the default client-side RRF + path (which ranks from already-filtered rows) are unchanged. +- **FIXED**: the `$rankFusion` support probe no longer classifies MongoDB + authorization errors as "stage unsupported". The probe's + unrecognized-stage matching included the broad phrase "is not allowed", + which also appears in auth failures ("not allowed to execute command + aggregate") and could cache the wrong verdict for the probe TTL. Matching + is narrowed to unambiguous unknown-stage phrases; any other failure is + treated as supported and the real query surfaces the real error, with + the client-side path as the standing fallback. + +#### Retrieval spend-cap and filter hardening + +- **NEW**: `Parse::Embeddings::SpendCap.configure(..., warn_at: 0.8)` — + soft-cap alerting. 
When a charge pushes a tenant's in-window usage across + the given fraction of its hard limit, a + `parse.embeddings.spend_cap_warning` ActiveSupport::Notifications event + is emitted (`tenant_id`, `used`, `limit`, `window`, `warn_at`, + `threshold`), once per crossing and re-arming as the window rolls off — + an operator alerting hook that fires BEFORE the hard refuse trips. + Disabled unless configured. Note the cap deliberately charges before the + query-embed cache lookup, so cache hits bill at full price: it bounds + query volume (an abuse control), not just provider spend. +- **NEW**: `Parse::Embeddings::Cache::MonetaStore` — persistent-L2 adapter + for the embedding cache. Wraps any Moneta-compatible store (`[]`/`[]=`, + optional `store(key, value, expires:)`) behind the cache's `get`/`set` + duck, with key namespacing and TTL forwarding, so + `Cache.enable!(store: MonetaStore.new(moneta, ttl: 30 * 24 * 3600))` + shares query-embed entries across processes and restarts. Fail-open: a + backend error degrades to a cache miss / dropped write, never a failed + embed. Cache keys are input hashes — plaintext queries never land in the + shared store. +- **NEW**: embedding spend-cap coverage on every query-embed path. The + per-tenant `Parse::Embeddings::SpendCap` was previously charged only at + the `semantic_search` agent-tool boundary; direct `find_similar(text:)`, + `hybrid_search(text:)`, and `Parse::Retrieval.retrieve` callers bypassed + it. The shared query-embed path now charges via + `SpendCap.charge_query!` — tenant identity resolves to the ambient + `Parse.with_cache_tenant` scope when set, else the shared default bucket. + The agent tool wraps its retrieval in the new `SpendCap.with_precharged` + block so a query it already charged with per-tenant identity is not + double-billed (and admin-exempt queries are not billed to the shared + bucket). As before, the cap is a no-op until configured. 
+- **NEW**: pointer-value translation for caller-supplied retrieval filters. + `Parse::Retrieval.retrieve` (and through it the `semantic_search` agent + tool) now rewrites Parse pointer values — `Parse::Pointer` / + `Parse::Object` instances and wire-form `{"__type": "Pointer"}` hashes, + including inside `$in` / `$eq` / `$ne` operator hashes — into their + MongoDB storage form, so `{ owner: some_user }` becomes + `{ "_p_owner" => "_User$abc123" }` and actually matches rows. Previously + a pointer-valued filter silently matched nothing. Translation runs after + the underscore-key gate and filter-field allowlist (callers still cannot + name `_p_*` columns directly) and before the tenant-scope fold. The + standalone helper is `Parse::Retrieval.translate_pointer_filter_values`. +- **IMPROVED**: `Parse::Schema::SearchIndexMigrator` auto-includes the + model's registered `agent_tenant_scope` field as a `type: "filter"` path + when planning or applying `vectorSearch` index declarations. Newly created + indexes support tenant-scoped pre-filtering out of the box; existing + indexes missing the path surface as `drifted:` in the plan instead of + failing at query time. + +#### Opt-in Unicode regex matching for text constraints + +- **NEW**: `starts_with`, `contains`, `ends_with`, and `like`/`regex` now accept + an opt-in `{ value:, unicode: true }` form that appends the `u` (Unicode) flag + to the compiled `$options`, enabling correct multibyte case-insensitive + matching for accented and non-Latin text (for example `café` matching + `CAFÉ`, or CJK characters). 
+ + ```ruby + Post.where(:title.starts_with => { value: "café", unicode: true }) + # => "title": { "$regex": "^café", "$options": "iu" } + + Post.where(:title.like => { value: /café/i, unicode: true }) + # => "title": { "$regex": "café", "$options": "iu" } + ``` + + The flag is strictly opt-in: the bare-value forms + (`:title.starts_with => "café"`) compile exactly as before with `$options: "i"`, + so existing queries are unchanged. The `u` flag is honored by Parse Server + 8.3.0+ over the REST query interface and by MongoDB 6.1+ on the mongo-direct + query path; older Parse Servers reject it, which is why it is never emitted + unless requested. + +#### ACL permission query hardening + +- **FIXED**: `readable_by`, `writable_by`, `readable_by_role`, + `writable_by_role`, `publicly_readable`, and `publicly_writable` no longer + raise a pipeline-security error when they auto-route through the direct + MongoDB path. These constraints compile to an aggregation `$match` on the + internal `_rperm` / `_wperm` permission columns, and the internal-fields + denylist that protects user-supplied pipelines from referencing + server-internal columns was also rejecting these SDK-generated references. + The aggregation runner now forwards the `allow_internal_fields` sanction for + pipelines built entirely from SDK constraint translation — matching the + parity already held by the `results_direct` / `count_direct` / + `distinct_direct` helpers — so public-read detection (`publicly_readable`, + `readable_by("*")`) and role/user permission filtering work again. The + sanction is scoped to SDK-built ACL pipelines only; caller-supplied + aggregation pipelines remain subject to the full denylist, so they still + cannot reference password hashes, session tokens, or other internal columns. +- **FIXED**: `Query#count` now routes ACL permission filters + (`publicly_readable.count`, `readable_by(...).count`, and friends) through + the direct MongoDB path, mirroring `Query#results`. 
Previously `count` only + switched to the direct path for subquery `$lookup` stages, so an ACL count + was sent to Parse Server's REST aggregate endpoint, which cannot express a + `$match` on `_rperm` / `_wperm`. +- **FIXED**: the scalar aggregation terminals — `Query#sum`, `#average`, + `#min`, `#max`, `#distinct`, and `#count_distinct` — now honor ACL + permission filters and scoped queries. They funnel through `Query#aggregate`, + which previously only switched to the direct MongoDB path for subquery + `$lookup` stages. An ACL filter (`readable_by(...).sum(:plays)`) was sent to + Parse Server's REST aggregate endpoint, which cannot express a `$match` on + `_rperm` / `_wperm`. More seriously, a **scoped** terminal + (`scope_to_user(u).sum(:plays)`, `scope_to_role`, or a `session_token`) + reached the same REST endpoint, which is master-key-only and enforces + neither ACL nor CLP — so the aggregate ran unscoped as the master key, + computing the result over rows the caller cannot read. `Query#aggregate` now + routes to mongo-direct whenever the query is scoped or the pipeline + references the ACL columns, and **fails closed** (raises + `Parse::Query::MongoDirectRequired`) for a scoped terminal when mongo-direct + is unavailable, rather than silently bypassing enforcement. The same + contract covers the inline-pipeline terminals: a scoped `Query#count` or + `Query#results` whose constraints compile to an aggregation pipeline + (e.g. `:field.size`) promotes to mongo-direct and fails closed identically + instead of falling back to REST `/aggregate`. +- **FIXED**: `not_publicly_readable` / `not_publicly_writable` (and the + `:ACL.not_readable_by` / `:ACL.not_writable_by` constraints) no longer return + the rows they are meant to exclude. They compiled to `{ _rperm: { $nin: + [...] } }`, and MongoDB's `$nin` matches documents where the field is + **absent** — and a missing `_rperm` is treated by Parse Server as public. 
+ A security audit using `not_publicly_writable` to find safe objects therefore silently included write-exposed (public-by-absence) objects. The constraints now carry + an `$exists: true` guard. "Not readable by X" additionally expands the + principal's roles and excludes publicly-readable rows (a public row is + readable by everyone, so it cannot be "not readable by X"). +- **FIXED**: `readable_by([])` / `writable_by([])` and the `:none` / `nil` + forms no longer raise `ArgumentError`; they now compile to the documented + "no permissions" match (an explicit empty `_rperm` / `_wperm`). Symbol + principals (`:public`, `:everyone`, `:world`) are accepted and map to the + public wildcard, matching the String forms. +- **FIXED**: `PrivateAclConstraint` (`:ACL.private_acl` / `master_key_only`) + no longer classifies public-by-absence rows as private. A truly master-key- + only object has an explicit empty `_rperm` **and** `_wperm`; a missing + column is public, the opposite of private, so the missing-field branch was + removed. `private_acl => false` is now the exact complement. +- **FIXED**: role expansion for `readable_by` / `writable_by` / + `readable_by_role` / `writable_by_role` now always includes the role's own + name in the permission set. The upward-inheritance walk yields nothing for + an unpersisted role (objectId still nil), which previously dropped the role + entirely and raised "no valid permissions"; the role's own `role:<name>` + entry is now appended idempotently, so persisted roles compile unchanged. +- **CHANGED**: a mistyped ACL permission no longer vanishes silently. An + unrecognized element in a `readable_by` / `writable_by` array (or an + unsupported Symbol) now raises `ArgumentError` instead of being dropped from + the permission set, which would silently weaken the intended filter. 
+- **NEW**: `strict:` option on `readable_by` / `writable_by` / + `readable_by_role` / `writable_by_role` (and the `:ACL.readable_by_exact` / + `writable_by_exact` / `*_by_role_exact` operators) for an **exact** match — + only rows whose `_rperm` / `_wperm` literally contains one of the resolved + permissions, with no implicit public `"*"` and no missing-field rows. The + default remains inclusive (access-simulation) semantics; `strict: true` is + the right choice for ownership and security audits. +- **NEW**: `Query#not_readable_by` / `#not_writable_by` chained methods, the + fluent counterparts to the existing `:ACL.not_readable_by` symbol operators. +- **BREAKING**: the British-spelled `:ACL.writeable_by` operator now resolves + to the same public-inclusive, role-expanding implementation as + `:ACL.writable_by`. Previously the one-letter spelling difference selected a + separate, strict, non-role-expanding constraint, so `writeable_by` and + `writable_by` silently produced different result sets. Code that relied on + the old strict behavior of `writeable_by` should pass `strict: true` (or use + the `:writable_by_exact` operator). + +#### Webhook after_save callback hardening + +- **FIXED**: the model's chained `after_save` / `after_create` callbacks now + fire exactly once per `afterSave` delivery, even when an app registers both a + class-specific handler (`webhook :after_save, MyClass`) and a catch-all + handler (`webhook :after_save, "*"`). The webhook endpoint dispatches every + trigger to both the class route and the `"*"` route, and the callback chain + previously ran inside each route — so an app with both handlers fired its + model `after_save` twice (e.g. two emails per save). The chain now runs once, + after both routes are dispatched. 
The existing behavior is otherwise + preserved: an `afterSave` for a class with no registered handler never fires + model callbacks, and trusted Ruby-initiated saves still skip the webhook-side + callbacks so the local `run_callbacks :save` is the single fire. +- **FIXED**: a chained `after_save` or `after_create` callback that raises + during an `afterSave` webhook no longer crashes the webhook endpoint or + suppresses the other phase's side effects. Because `afterSave` fires after the + object is already persisted and Parse Server discards the response body, the + `after_create` and `after_save` phases now run independently and any + `StandardError` they raise is logged and swallowed (mirroring Parse Server's + own afterSave semantics). A raising `after_create :send_welcome_email` no + longer silently skips an unrelated `after_save :reindex`, and an uncaught + callback error can no longer return a 500 to Parse Server. +- **FIXED**: `Parse::Webhooks::Payload#ruby_initiated?` now memoizes a `false` + result stably instead of re-deriving it on every call. The prior `||=` + memoization recomputed whenever the cached value was `false`, so a stamped + `false` could be re-derived inconsistently; the detection result is now cached + exactly once. + +#### `verify_password` client-side rate-limit parity + +- **CHANGED**: `verify_password` now participates in the same client-side login + rate-limit as `login`. It calls the rate-limit guard before issuing the + request and records the result afterward, keyed on the bare username so + failures share a bucket with `login` — an attacker cannot sidestep a `login` + lockout by pivoting to the `verify_password` credential oracle. Because the + bucket is shared, a run of failed step-up / re-authentication calls counts + toward (and can trigger) the primary login lockout for that username. As with + `login`, this is a convenience guard, not a security boundary — server-side + rate limiting remains the real control. 
+ +#### Cloud function results are server-authoritative + +- **IMPROVED**: Documented that decoded cloud function results are treated as + server-authoritative. A cloud function that returns a Parse object decodes + through the same trusted path as every query and `fetch` result, so + server-set fields on the returned object (including `sessionToken` on a + returned user) are preserved rather than stripped — consistent with how the + rest of the SDK hydrates server responses. If a cloud function is expected to + echo back third-party-influenced data that you want to sanitize yourself, + call it with `raw: true` (`Parse.call_function(name, body, raw: true)`) to + receive the undecoded response before any object is built. + ### 5.4.1 #### Webhook after_save callback fix diff --git a/Gemfile.lock b/Gemfile.lock index 10b34f1..abc7365 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,7 +1,7 @@ PATH remote: . specs: - parse-stack-next (5.4.1) + parse-stack-next (5.5.0) activemodel (>= 6.1, < 9) activesupport (>= 6.1, < 9) connection_pool (>= 2.2, < 4) diff --git a/README.md b/README.md index e6f9993..7013e9f 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,13 @@ A full-featured Ruby client SDK for [Parse Server](http://parseplatform.org/). [parse-stack-next](https://github.com/neurosynq/parse-stack-next) is a Ruby client SDK, REST client, and Active Model ORM for [Parse Server](http://parseplatform.org/), combining a low-level API client, a query engine, an object-relational mapper (ORM), and a Cloud Code Webhooks rack application in a single gem. 
+### What's new in 5.5 + +- **5.5.0 — Multimodal bytes-fetch with magic-byte MIME verification** — `embed_image ..., source: :bytes` has the SDK download an image itself through the `Parse::File.safe_open_url` SSRF primitive, verify the content by **magic-byte sniff** (the `Content-Type` header is never consulted — a `.jpg` URL serving HTML is refused), cross-check the URL extension, enforce a `Parse::Embeddings.allowed_image_types` allowlist, strip EXIF/XMP metadata **by default** (JPEG APP1, PNG `eXIf`, WebP `EXIF`/`XMP ` chunks; opt out with `exif_strip: false`), and forward the verified bytes to Voyage/Cohere as a base64 data URI. No provider-side URL fetch occurs, so the `trust_provider_url_fetch` sentinel is not required — the host allowlist still applies. See [CHANGELOG.md](./CHANGELOG.md) +- **5.5.0 — Embedding-model migration tooling** — `Class.reembed!(only_stale: true)` bulk re-embeds rows through the current provider/model (resumable; skips rows already current), driven by the new auto-declared `<field>_meta` provenance sibling (`{provider, model, dimensions, modality, embedded_at}`, stamped on every recompute). `Parse::Embeddings::BatchEmbedder` adds batch-level requests-per-minute pacing and exponential backoff for bulk jobs; `Parse::Embeddings::Cache.enable!` adds an opt-in query-embed cache keyed by `(provider, model, dimensions, input_type, input-hash)` so repeated identical queries skip the provider round-trip. See [CHANGELOG.md](./CHANGELOG.md) +- **5.5.0 — Vector index drift detection** — on first auto-discovered use of an Atlas vectorSearch index, the SDK verifies the deployed index's `numDimensions`/`similarity` against the `:vector` property declaration and confirms a registered `agent_tenant_scope` field is covered as a `type: "filter"` path. Policy via `Parse::VectorSearch.index_drift_policy` (`:warn` default / `:raise` / `:ignore`). 
`Parse::Schema::SearchIndexMigrator` now auto-includes the tenant-scope field in `vectorSearch` declarations, so newly created indexes support tenant-scoped pre-filtering out of the box. See [CHANGELOG.md](./CHANGELOG.md) +- **5.5.0 — Retrieval spend-cap and filter hardening** — the per-tenant embedding spend cap now covers every query-embed path (`find_similar(text:)`, `hybrid_search(text:)`, `Parse::Retrieval.retrieve`), not just the `semantic_search` agent tool; tenant identity resolves through the ambient `Parse.with_cache_tenant` scope. Caller-supplied retrieval filters now translate Parse pointer values to storage form (`{ owner: user }` → `{ "_p_owner" => "_User$id" }`), so pointer filters match rows instead of silently matching nothing. See [CHANGELOG.md](./CHANGELOG.md) + ### What's new in 5.4 - **5.4.0 — Hybrid search + reranking for RAG** — `Class.hybrid_search(text:, lexical:, vector:, k:, fusion:)` fuses a lexical Atlas Search branch with a `$vectorSearch` branch using reciprocal-rank fusion (RRF): lexical search nails exact tokens (codes, proper nouns), vector search nails paraphrase, and fusing the two beats either alone. Each branch enforces ACL/CLP independently before fusion (no separate hydration fetch to secure); results carry `#hybrid_score` / `#hybrid_ranks`. `Parse::VectorSearch::Hybrid.rank_fusion_supported?` detects Atlas 8.0+ native `$rankFusion` by a cached behavioural probe (native execution is opt-in; client-side RRF is the always-enforced default). `Parse::Retrieval::Reranker` adds cross-encoder reranking (`Reranker::Cohere` over `/v2/rerank`, plus a deterministic `Reranker::Fixture`), wired into `Parse::Retrieval.retrieve(hybrid:, rerank:)`. `Parse::Embeddings::SpendCap` adds an opt-in per-tenant embedding token cap (hard-refuse) at the `semantic_search` agent-tool boundary. 
See [CHANGELOG.md](./CHANGELOG.md) and [`docs/atlas_vector_search_guide.md`](./docs/atlas_vector_search_guide.md) @@ -38,7 +45,7 @@ See [CHANGELOG.md](./CHANGELOG.md) for the full 5.2 entry. - **`Parse::File` URL normalization + presigned-URL stash** — `Parse::File#url=` and `attributes=` now strip signed-URL query parameters (`X-Amz-Signature`, `AWSAccessKeyId`, `Key-Pair-Id`, etc.) before storage; the bare canonical URL lands in `@url`, and the original signed URL is stashed in `file.presigned_url` with a data-driven expiry in `file.presigned_url_expires_at`. New `file.presigned_url_valid?(buffer: 60)` predicate, configurable `Parse::File.signed_url_policy = :strip | :raise`, and `Parse::File.log_filter` / `log_filter_strict` regexes for `lograge` / Sentry / Honeybadger scrubbers. `Parse::File#inspect` no longer emits the URL — see CHANGELOG for the error-reporter payload migration callout - **`Parse::Lock` — public TTL-bounded mutual-exclusion primitive** — `Parse::Lock.acquire(key, ttl:, wait:) { … }` exposes the Redis-backed lock previously hidden inside `first_or_create!` as a first-class API. 
In-process `Mutex` fallback for memory-backed caches, fails closed on backend errors, HMAC-keyed via `PARSE_STACK_LOCK_SECRET`, namespace-separated from `first_or_create!` so the two cannot collide - **LiveQuery ergonomics** — autoloaded (no explicit `require 'parse/live_query'`); connections are **ACL-scoped by default** (build an admin, ACL-bypassing connection explicitly with `Parse::LiveQuery::Client.new(use_master_key: true)` — master-key authorization is per-connection, not per-subscription); `Query#subscribe` / `Klass.subscribe` accept a block yielded the `Subscription` *before* the subscribe frame is sent so `sub.on(:create) { … }` callbacks are wired before any server event can arrive; `Parse::LiveQuery.run_until_signal!(client:) { … }` is a signal-safe shutdown helper for long-running consumers -- **Image embeddings** — new `embed_image` class macro for `:file`-typed source properties plus `Voyage#embed_image` (`voyage-multimodal-3`, 1024-dim) and `Cohere#embed_image` (`embed-v4.0`, 1536-dim). URL-only routing in v5.1 (bytes-fetch with MIME-sniff lands later); operator-gated via the `Parse::Embeddings.trust_provider_url_fetch = "PROVIDER_EGRESS_VERIFIED"` sentinel plus a `Parse::Embeddings.allowed_image_hosts` CDN allowlist +- **Image embeddings** — new `embed_image` class macro for `:file`-typed source properties plus `Voyage#embed_image` (`voyage-multimodal-3`, 1024-dim) and `Cohere#embed_image` (`embed-v4.0`, 1536-dim). 
URL-only routing in v5.1 (the bytes-fetch path with MIME-sniff shipped in v5.5 as `source: :bytes`); operator-gated via the `Parse::Embeddings.trust_provider_url_fetch = "PROVIDER_EGRESS_VERIFIED"` sentinel plus a `Parse::Embeddings.allowed_image_hosts` CDN allowlist - **Tenant-aware cache namespacing** — `Parse.with_cache_tenant(scope) { … }` composes the tenant into the response-cache key as `:T::…` so a multi-tenant app sharing one Redis gets per-tenant key isolation and per-tenant SCAN-delete eviction without per-tenant `Parse::Client.new` plumbing. Fiber-local, restored on block exit, AS::N payloads carry `:cache_tenant` - **`_User` field-visibility DSL** — `Parse::User.master_only_fields(*fields)` and `Parse::User.self_visible_fields(*fields, via: :self)` declare admin-only and owner-only field protections on `_User`. Requires Parse Server's `protectedFieldsOwnerExempt: false` server option (the SDK emits a one-time advisory at class declaration so the dependency is surfaced before deploy). Parse Server's default for this option is changing to `false` in a future version; until your server adopts that default, set it explicitly - **`Parse::Installation` `belongs_to :user`** — read `installation.user` to find which user a device is currently signed in as. 
Symmetric `Parse::User#has_many :installations` for targeted-push grouping (master-key-only by Parse Server design; see the YARD for the owner-identity caveat) @@ -64,6 +71,16 @@ See [CHANGELOG.md](./CHANGELOG.md) for the full 5.0 entry, including security-ha ### Core capabilities +> **Vector search requires MongoDB Atlas (or Atlas Local).** The `:vector` +> property, `find_similar`, `hybrid_search`, and `Parse::Retrieval` all +> execute Atlas `$vectorSearch` / `$search` aggregation stages, which exist +> only on Atlas clusters and the Atlas Local container — community/self-hosted +> MongoDB is not supported and there is no in-process fallback (a pure-Ruby +> cosine scan over a real collection is a silent performance cliff, so the +> SDK refuses rather than degrades). This is a closed design decision. +> Everything else in this list works against any MongoDB that Parse Server +> supports. + - MongoDB Aggregation Framework support - **MongoDB Atlas Search** — full-text search, autocomplete, faceted search with direct MongoDB access - **Direct MongoDB Queries** — bypass Parse Server's REST surface for high-performance reads, with SDK-side ACL/CLP/`protectedFields` enforcement for scoped agents @@ -5533,13 +5550,24 @@ pipeline = [ Filter objects by ACL permissions using MongoDB's `_rperm` and `_wperm` fields: -**`readable_by` / `writable_by`** - Exact permission strings: +**`readable_by` / `writable_by`** - filter by principal: ```ruby Song.query.readable_by("user123").results(mongo_direct: true) # User ID Song.query.readable_by("role:Admin").results(mongo_direct: true) # Role (explicit prefix) -Song.query.readable_by(current_user).results(mongo_direct: true) # User object -Song.query.readable_by("public").results(mongo_direct: true) # Public access (alias for "*") -Song.query.readable_by("none").results(mongo_direct: true) # Empty _rperm (master key only) +Song.query.readable_by(current_user).results(mongo_direct: true) # User object (roles expanded) 
+Song.query.readable_by(:public).results(mongo_direct: true) # Public access (maps to "*") +Song.query.readable_by([]).results(mongo_direct: true) # No read perms (empty _rperm) +``` + +By default the match is **inclusive** — it ALSO returns publicly-readable rows +(`_rperm` contains `"*"`) and rows with a missing `_rperm` (public by absence), +because those are genuinely readable by the principal (access-simulation +semantics). For an **exact** match — only rows whose `_rperm` literally grants +the principal, with no public/missing rows — pass `strict: true`. This is what +an ownership or security audit wants: + +```ruby +Song.query.readable_by("role:Admin", strict: true).results # ONLY rows that explicitly grant Admin ``` **`readable_by_role` / `writable_by_role`** - Adds "role:" prefix automatically: @@ -5549,7 +5577,18 @@ Song.query.readable_by_role(admin_role).results(mongo_direct: true) # Song.query.writable_by_role(["Admin", "Editor"]).results(mongo_direct: true) # Multiple roles ``` -**Note:** Requires the `mongo` gem. Add `gem 'mongo'` to your Gemfile. +**Convenience and negation:** `publicly_readable` / `publicly_writable`, +`privately_readable` / `private_acl` (master-key-only), `not_readable_by` / +`not_writable_by`, and `not_publicly_readable` / `not_publicly_writable`. +"Not readable by X" excludes rows readable by X directly, via any role X +inherits, or publicly. + +**Note:** These constraints compile to an aggregation `$match` on the internal +`_rperm` / `_wperm` columns, so they auto-route to the direct-MongoDB path +(requires the `mongo` gem and `Parse::MongoDB.configure(...)`). For a scoped +query (`scope_to_user` / `scope_to_role` / `session_token`) the SDK enforces +ACL/CLP on that path; a scoped aggregate fails closed if mongo-direct is not +configured rather than running unscoped. 
### ACL Dirty Tracking diff --git a/docs/atlas_vector_search_guide.md b/docs/atlas_vector_search_guide.md index 415bc3a..a9a3b92 100644 --- a/docs/atlas_vector_search_guide.md +++ b/docs/atlas_vector_search_guide.md @@ -288,6 +288,85 @@ declared `dimensions:` before sending the pipeline. A mismatch raises it — callers get "expected 1536, got 768" instead of a server-side error after a round-trip. +### Index drift verification (v5.5) + +On the first auto-discovered use of a vectorSearch index per +(class, field, index) per process, the SDK compares the deployed +index's `latestDefinition` against the model declaration: + +* `numDimensions` vs the property's declared `dimensions:` — a + mismatch means every query will be rejected or return nonsense + (usually an index that predates a model change). +* `similarity` vs the property's declared `similarity:` (checked only + when both sides declare one). +* When the class registers an `agent_tenant_scope`, the scope field + must appear among the index's `type: "filter"` paths — without it, + every tenant-scoped `$vectorSearch.filter` fails Atlas-side at + query time. + +Findings are computed once per (class, field, index) per process and +governed by `Parse::VectorSearch.index_drift_policy`: + +```ruby +Parse::VectorSearch.index_drift_policy = :warn # default — [Parse::VectorSearch:DRIFT] warning on first check +Parse::VectorSearch.index_drift_policy = :raise # IndexDriftError on EVERY query against a drifted index +Parse::VectorSearch.index_drift_policy = :ignore # skip verification +``` + +Under `:raise` the cached findings keep raising — strict mode means a +drifted index never serves results, not "fails once, then passes". +Auto-discovery verification costs no extra round-trip (the definition +is already in hand from index discovery). 
An explicit `index:` kwarg +is verified best-effort: when the catalog's covering index for the +field carries the same name, its definition is checked too; catalog +lookup failures never fail the query. + +### Query-embed caching and spend caps (v5.5) + +Every `text:`-overload query funnels through one embed path +(`find_similar(text:)`, `hybrid_search(text:)`, +`Parse::Retrieval.retrieve` all share it), which gives two controls: + +```ruby +# Opt-in query-embed cache: repeated identical queries skip the +# provider round-trip. Keyed by (provider, model, dimensions, +# input_type, SHA-256(input)) — plaintext never lands in the store. +Parse::Embeddings::Cache.enable!(max_entries: 2048, ttl: 600) +Parse::Embeddings::Cache.stats # => { enabled:, hits:, misses:, size: } + +# Per-tenant spend cap now covers DIRECT callers too, not just the +# semantic_search agent tool. Tenant identity resolves to the ambient +# Parse.with_cache_tenant scope when set, else a shared default bucket. +# warn_at: adds a soft cap — crossing 80% of the limit emits a +# parse.embeddings.spend_cap_warning AS::N event (alert, never refuse). +Parse::Embeddings::SpendCap.configure(limit_tokens: 1_000_000, window: 3600, + warn_at: 0.8) +Parse.with_cache_tenant("tenant_abc") do + Document.find_similar(text: query) # charged against tenant_abc +end +``` + +Cache hits emit the standard `parse.embeddings.embed` notification +with `cached: true`, so existing spend subscribers see hits and misses +on one stream. The cache is in-process by default; for a persistent +layer shared across processes, wrap any Moneta-compatible backend in +the bundled adapter: + +```ruby +moneta = Moneta.new(:Redis, url: ENV["REDIS_URL"]) +Parse::Embeddings::Cache.enable!( + store: Parse::Embeddings::Cache::MonetaStore.new(moneta, ttl: 30 * 24 * 3600), +) +``` + +`MonetaStore` namespaces keys, forwards TTL via Moneta's `expires:`, +and fails open (a backend error is a cache miss, never a failed +embed). 
Keys are input hashes — plaintext queries never land in the +shared store; the VALUES are embeddings, so give the store the same +access controls as the database. A query the agent tool already +charged per-tenant is not double-billed (`SpendCap.with_precharged` +wraps the tool's retrieval). + ### ACL/CLP inheritance Vector search routes through `Parse::MongoDB.aggregate`. Every layer @@ -405,6 +484,18 @@ branch — see [Hybrid search](#hybrid-search-vector--lexical) below) and chunking — see [Reranking](#reranking)). Both were reserved in earlier releases and now ship in 5.4.0. +**Pointer values in filters translate automatically (v5.5).** A filter +like `{ owner: some_user }` (a `Parse::Pointer` / `Parse::Object`, or a +wire-form `{"__type" => "Pointer", ...}` hash — including inside `$in` +/ `$eq` / `$ne` operator hashes) is rewritten to its MongoDB storage +form `{ "_p_owner" => "_User$abc123" }` before the `$match` / +`$vectorSearch.filter` is built, so pointer filters match rows instead +of silently matching nothing. Translation runs after the +underscore-key gate (callers still cannot name `_p_*` columns +directly) and before the tenant-scope fold; the `semantic_search` +agent tool inherits it. For `vector_filter:` use, the pointer column +(`_p_owner`) must be declared `type: "filter"` in the index. + ### Hybrid search (vector + lexical) `Class.hybrid_search` runs a lexical Atlas Search (`$search`) branch and a @@ -556,13 +647,26 @@ envelope. See the [MCP guide's Token Economy section](./mcp_guide.md#token-econo --- -## Image embedding: `embed_image` macro (v5.1) +## Image embedding: `embed_image` macro (v5.1 URL mode, v5.5 bytes mode) `embed_image` is the image-source counterpart to `embed`. The source property must be `:file`-typed; the target must be a `:vector` property whose declared `provider:` supports multimodal input (currently `:voyage` with `voyage-multimodal-3`, or `:cohere` with `embed-v4.0`). 
+Two fetch modes, selected per declaration with `source:`: + +* **`source: :url`** (default) — the SDK validates the file's URL and + forwards it; the **provider** performs the fetch from its own + network. Requires the `trust_provider_url_fetch` sentinel (see + operator setup below). +* **`source: :bytes`** (v5.5) — the **SDK** downloads the image + through `Parse::File.safe_open_url`, verifies the content by + magic-byte sniff, strips EXIF/XMP metadata, and forwards the bytes + to the provider as a base64 data URI. No provider-side URL fetch + occurs, so the sentinel is NOT required — the + `allowed_image_hosts` allowlist still is. + ```ruby class Post < Parse::Object property :cover_image, :file @@ -621,6 +725,57 @@ with `Parse::File`, not parallelized). Failures raise (`:scheme`, `:port`, `:userinfo`, `:host_blocked`, `:host_not_allowlisted`, `:parse`). +### Bytes mode (`source: :bytes`, v5.5) + +```ruby +# Operator setup — only the host allowlist is required (the sentinel +# applies to URL forwarding, not SDK-side fetches): +Parse::Embeddings.allowed_image_hosts = [".cloudfront.net"] + +class Post < Parse::Object + property :cover_image, :file + property :cover_image_embedding, :vector, + dimensions: 1024, provider: :voyage, model: "voyage-multimodal-3" + + embed_image :cover_image, into: :cover_image_embedding, + source: :bytes # exif_strip: true is the default +end +``` + +What happens on each (digest-miss) save: + +1. The file URL is validated through + `Parse::Embeddings.validate_image_url!(url, mode: :fetch)` — the + same host allowlist (deny-all when empty), obfuscated-IP screen, + port allowlist, and CIDR resolution check as URL mode, minus the + provider-egress sentinel. +2. `Parse::File.safe_open_url` downloads the bytes — CIDR blocks, + DNS-rebinding re-check, port allowlist, `max_remote_size` cap, + timeouts. No parallel fetch mechanism exists. +3. 
**Magic-byte verification** (`Parse::Embeddings::ImageFetch`): + the MIME type is determined exclusively from the leading bytes + (JPEG / PNG / GIF / WebP). The HTTP `Content-Type` header is never + consulted. The sniffed type must be in + `Parse::Embeddings.allowed_image_types` (default those four; SVG is + deliberately excluded as script-capable active content), and when + the URL carries a recognized image extension, the extension must + AGREE with the magic bytes — a `.jpg` URL serving PNG bytes (or + HTML) is refused as MIME laundering + (`ImageFetch::InvalidImageType`, with a `:reason` tag). +4. **EXIF/XMP stripping, default ON.** JPEG APP1 segments (Exif and + XMP), PNG `eXIf` chunks, and WebP `EXIF`/`XMP ` RIFF chunks (with + the VP8X flag bits cleared) are removed before the bytes leave the + process — user photos commonly carry GPS coordinates and device + serials. Opt out per declaration with `exif_strip: false` when + orientation metadata must survive. +5. The verified bytes ride to the provider as a base64 data URI + (Voyage `image_base64` content row; Cohere `image_url` data-URI + form). + +Direct provider calls accept the same shape: +`provider.embed_image([Parse::Embeddings::ImageFetch.fetch!(url)])` — +`FetchedImage` sources and URL Strings may be mixed in one batch. + ### Save-side semantics * Digest is the **SHA-256 of the URL String**, not the file bytes. @@ -641,24 +796,102 @@ with `Parse::File`, not parallelized). Failures raise ## Re-embedding existing rows -Changing `model:`, `dimensions:`, or `provider:` on an existing -`:vector` property is a migration regardless of whether the source is -text or images. 
Workflow: +### Provenance: the `_meta`-suffixed sibling (v5.5) + +Every `embed` / `embed_image` declaration auto-declares a +`_meta`-suffixed `:object` sibling of the target vector field (e.g. `body_embedding_meta`; override with `meta_field:`), stamped +on each recompute and cleared with the vector: + +```ruby +doc.body_embedding_meta +# => { "provider" => "openai", +# "model" => "text-embedding-3-small", +# "dimensions" => 1536, +# "modality" => "text", +# "embedded_at" => "2026-06-09T17:32:11Z" } +``` + +This is the record migration tooling reads to know which model +produced any stored vector. + +### Same-shape migrations: `Class.reembed!` (v5.5) + +When the new model has the **same dimensions** (e.g. swapping +`text-embedding-3-small` for a same-width replacement, or a provider +change at equal width), re-embed in place: + +```ruby +# Re-embed every row through the CURRENT provider/model declaration. +Document.reembed!(batch_size: 100) + +# Resumable: skip rows whose provenance meta already matches the current +# provider + model + dimensions (rows with no meta count as stale). +Document.reembed!(only_stale: true) + +# Scope it +Document.reembed!(field: :body_embedding, where: { published: true }, limit: 10_000) +``` + +`reembed!` walks the class with objectId-cursor pagination, clears +each row's digest sibling (so the save-path recompute cannot elide the +provider call), and saves. Unlike `embed_pending!` — which only fills +NULL vectors — `reembed!` recomputes populated rows too. Run it with a +master-key client (or pass `save_opts:` with a session token that can +write every row). Each row's save makes one provider call; pace bulk +runs against provider rate limits (see `BatchEmbedder` below for the +pattern, or just throttle the loop). + +### Changed-width migrations: dual-field workflow + +Changing `dimensions:` is a different beast — the existing +vectorSearch index can't serve the new width. Use the shadow-field +workflow: 1. 
Add the new property alongside the old one (`property :body_embedding_v2, :vector, ...`) and an `embed` or `embed_image` block targeting it. -2. Backfill: iterate existing rows, force a save (or null+save) to - trigger the new directive. The old field stays valid for reads. -3. Once backfill completes, deploy a new vectorSearch index covering - the new field and migrate `find_similar` callers. -4. Drop the old property. - -Do NOT mutate the model in place — the digest mechanism will see -unchanged source text / unchanged source URL and skip recompute, -leaving stale vectors. For `embed_image`, also remember the digest is -over the URL String: if you replace bytes at the same URL (PUT-replace -on S3 without renaming), null the digest field to force re-embed. +2. Backfill with `embed_pending!(field: :body_embedding_v2)` — the new + field is null everywhere, so the null-filling walk is exactly right. +3. Deploy a new vectorSearch index covering the new field and migrate + `find_similar` callers. +4. Drop the old property and index. + +Do NOT mutate a model's `dimensions:` in place — the digest mechanism +will see unchanged source text and skip recompute, leaving stale +vectors, and the drift verifier will flag every query against the old +index (`index numDimensions=1536 but property declares ...`). For +`embed_image`, also remember the digest is over the URL String: if you +replace bytes at the same URL (PUT-replace on S3 without renaming), +null the digest field — or run `reembed!` — to force re-embed. + +--- + +## Bulk embedding: `BatchEmbedder` (v5.5) + +`Provider#embed_text_batched` only slices input into provider-sized +chunks; retry lives inside each provider's single HTTP call. 
For bulk +jobs (ingest pipelines, chunk-corpus embedding) use +`Parse::Embeddings::BatchEmbedder`, which adds batch-level pacing and +backoff: + +```ruby +embedder = Parse::Embeddings::BatchEmbedder.new( + Parse::Embeddings.provider(:openai), + requests_per_minute: 60, # inter-batch pacing + max_attempts: 5, # per-batch tries (exponential backoff + jitter) + on_progress: ->(done:, total:, batch_index:, batch_count:) { + puts "#{done}/#{total}" + }, +) +vectors = embedder.embed_text(texts, input_type: :search_document) +``` + +Rate-limit and transient errors (any provider error class ending in +`RateLimitError` / `TransientError`; override with `retry_on:`) retry +with exponential backoff; other errors propagate immediately. A batch +that exhausts its attempts raises `BatchEmbedder::BatchFailed` +carrying `batch_index` and `completed_count`, so a resumable job knows +exactly where to pick up. --- @@ -728,6 +961,54 @@ floats out). Vectors only flow through the Parse↔Mongo path, where the body builder's `` compaction prevents them from landing in stdout / error trackers. +### When the embedded source is PII: deployment checklist + +An embedding of PII is PII-equivalent. Inversion attacks reconstruct +substantial source text from dense embeddings, and a vector's nearest +neighbors leak the source's meaning even without reconstruction. If +the fields you `embed` contain personal data (names, addresses, health +or financial details, free-text user messages), treat the vector +column with the same handling as the source column: + +1. **Provider contract.** You are sending the raw source text (and in + bytes mode, image content) to the embedding provider on every + recompute. Confirm the provider's data-retention and training-use + terms cover PII, and that a DPA is in place where required. + Self-hosting via `LocalHTTP` (Ollama / vLLM / TEI) keeps the text + in your network. +2. 
**Keep vectors off the wire.** Leave `vector_visibility` at its + `:owner_only` default so vectors are omitted from `as_json` and + webhook payloads. Do not flip a PII class to `:public`. +3. **Row ACL still governs.** Vector hits route mongo-direct with + `_rperm` enforcement — verify your rows carry real ACLs and that + callers use scoped credentials (`session_token:` / `acl_user:`), + not blanket master key. +4. **Tenant isolation.** Multi-tenant deployments must declare + `agent_tenant_scope` on searchable classes; the scope folds into + `$vectorSearch.filter` (and v5.5's drift verification confirms the + index covers it). Without it, similarity scores leak cross-tenant + document existence. +5. **Score exposure.** Keep score quantization on for non-admin agent + contexts (the default) — full-precision scores enable + membership-inference probing. +6. **EXIF stays stripped.** For image embedding, keep the bytes-mode + default `exif_strip: true`; user photos carry GPS coordinates and + device serials that would otherwise reach the provider. +7. **Log and cache hygiene.** Redact query text at the Faraday layer + (above); if you enable the persistent L2 cache, note that cache + KEYS are hashes (no plaintext) but cache VALUES are the embeddings + themselves — point `MonetaStore` at a store with the same access + controls as the database. +8. **Deletion propagation.** When a user exercises erasure rights, + the vector and its `_digest`- and `_meta`-suffixed sibling fields + live on the same row and delete with it — but check external + copies: provider-side logs (their retention policy), your L2 + embedding cache (TTL or explicit flush), and any analytics sink + subscribed to embedding events. +9. **Migration hygiene.** `reembed!` re-sends every row's source text + to the provider — schedule PII-class migrations under the same + approvals as a data export. + --- ## Troubleshooting @@ -775,10 +1056,20 @@ on every poll) rather than a `until index_ready?; sleep` loop. 
Key files: * `lib/parse/embeddings.rb` — registry, `Configuration`, `register`, - `provider`, `configure`, `validate_image_url!`, - `trust_provider_url_fetch=`, `allowed_image_hosts=`. + `provider`, `configure`, `validate_image_url!` (`mode: :forward | :fetch`), + `trust_provider_url_fetch=`, `allowed_image_hosts=`, + `allowed_image_types=`. * `lib/parse/embeddings/provider.rb` — abstract base, `validate_response!`, `instrument_embed`, AS::N payload contract. +* `lib/parse/embeddings/image_fetch.rb` — bytes-fetch path: + `ImageFetch.fetch!`, magic-byte `sniff_mime`/`verify!`, EXIF/XMP + stripping, `FetchedImage`. +* `lib/parse/embeddings/batch_embedder.rb` — `BatchEmbedder` bulk + orchestration (pacing, batch-level backoff, `BatchFailed`). +* `lib/parse/embeddings/cache.rb` — opt-in query-embed cache + (`Cache.enable!` / `fetch_vector` / `stats`). +* `lib/parse/embeddings/spend_cap.rb` — per-tenant token cap + (`charge!`, `charge_query!`, `with_precharged`). * `lib/parse/embeddings/openai.rb` — OpenAI provider. * `lib/parse/embeddings/cohere.rb` — Cohere v3 + v4.0 text-mode provider. * `lib/parse/embeddings/voyage.rb` — Voyage text + multimodal-3 @@ -788,9 +1079,13 @@ Key files: * `lib/parse/embeddings/local_http.rb` — generic OpenAI-compatible local-gateway client. * `lib/parse/embeddings/fixture.rb` — deterministic test provider. -* `lib/parse/model/core/vector_searchable.rb` — `find_similar`. +* `lib/parse/model/core/vector_searchable.rb` — `find_similar`, + `hybrid_search`, index drift verification + (`Parse::VectorSearch.index_drift_policy`). * `lib/parse/model/core/embed_managed.rb` — `embed` and `embed_image` - macros, `EmbedDirective` (carries `modality:`, `allow_insecure:`). + macros, `EmbedDirective` (carries `modality:`, `allow_insecure:`, + `source_mode:`, `exif_strip:`, `meta_field:`), `embed_pending!`, + `reembed!`. * `lib/parse/vector_search.rb` — low-level `Parse::VectorSearch.search`. 
* `lib/parse/atlas_search/index_manager.rb` — `IndexCatalog.create_index`, `find_vector_index`, `wait_for_ready`. diff --git a/lib/parse/api/users.rb b/lib/parse/api/users.rb index b4f9f3f..e2355b0 100644 --- a/lib/parse/api/users.rb +++ b/lib/parse/api/users.rb @@ -223,15 +223,25 @@ def login_with_mfa(username, password, mfa_token, headers: {}, **opts) # - code 205 (+ERROR_EMAIL_NOT_FOUND+) when +preventLoginWithUnverifiedEmail+ # is enabled and the account's email has not been verified. # + # Client-side rate limited per username using the SAME bucket as {#login} + # (bare username, no namespace) — failures across both credential oracles + # accumulate, so an attacker cannot bypass a +login+ lockout by pivoting to + # this endpoint. The trade-off: a run of failed step-up re-auth calls counts + # toward (and can trigger) the primary login lockout for that username. + # Client-side limiting is a convenience, not a boundary — the server is the + # real control. + # # @param username [String] the Parse user username. # @param password [String] the Parse user's associated password. # @param headers [Hash] additional HTTP headers to send with the request. # @param opts [Hash] additional options to pass to the {Parse::Client} request. # @return [Parse::Response] def verify_password(username, password, headers: {}, **opts) + check_login_rate_limit!(username) body = { username: username, password: password } response = request :post, VERIFY_PASSWORD_PATH, body: body, headers: headers, opts: opts response.parse_class = Parse::Model::CLASS_USER + track_login_attempt(username, response.success?) response end diff --git a/lib/parse/client.rb b/lib/parse/client.rb index 3c74cf8..2c0d0c1 100644 --- a/lib/parse/client.rb +++ b/lib/parse/client.rb @@ -1425,6 +1425,22 @@ def self.setup(opts = {}, &block) # Object/Pointer envelope is converted, and an Object of an UNregistered class # is left as a raw Hash (building it would degrade to a field-less Pointer). 
# Plain Hashes and arbitrary `__type` app data pass through untouched. + # + # SECURITY — cloud results are treated as server-authoritative. The + # `__type:"Object"` decode in {._decode_cloud_value} routes through + # +Parse::Object.build+, which hydrates with trusted-init — the SAME path + # used to decode every query / +.fetch+ result. Trusted-init skips the + # +PROTECTED_INITIALIZE_KEYS+ filter, so credential-shaped keys + # (+sessionToken+, +authData+, +_rperm+, +_wperm+, +roles+, …) present in a + # cloud function's return value populate the in-memory object, exactly as they + # do for any other server response. This is by design: the payload is authored + # by your Cloud Code and the request is caller-authenticated, and making cloud + # results filter these keys would make them inconsistent with (and stricter + # than) query/+.fetch+ hydration — e.g. a cloud function returning + # +request.user+ would come back missing its +sessionToken+. If a cloud + # function is expected to echo back third-party-influenced data, call it with + # +raw: true+ (+Parse.call_function(name, body, raw: true)+) to receive the + # undecoded response and sanitize it yourself before building objects. def self._extract_cloud_result(response) r = response.result value = r.is_a?(Hash) ? r["result"] : r @@ -1568,7 +1584,9 @@ def self.call_function(name, body = {}, **opts) # specific {Parse::Error} subclasses as the underlying client does. # @param name (see Parse.call_function) # @param body (see Parse.call_function) - # @param opts (see Parse.call_function) — :raw is ignored. + # @param opts (see Parse.call_function) — +:raw+ has no effect; this method + # always decodes the result. Use {Parse.call_function} with +raw: true+ if + # you need the undecoded response. # @raise [Parse::Error::CloudCodeError] when the response indicates a cloud-code error. # @return [Object] the result data of the response. 
def self.call_function!(name, body = {}, **opts) diff --git a/lib/parse/embeddings.rb b/lib/parse/embeddings.rb index a2ddb6a..c6d60c0 100644 --- a/lib/parse/embeddings.rb +++ b/lib/parse/embeddings.rb @@ -246,6 +246,7 @@ def reset! CONFIG_MUTEX.synchronize do @configuration = nil @allowed_image_hosts = nil + @allowed_image_types = nil @trust_provider_url_fetch = nil end end @@ -298,6 +299,30 @@ def allowed_image_hosts @allowed_image_hosts ||= [].freeze end + # Configure the MIME types the bytes-fetch path accepts after + # magic-byte sniffing (see {ImageFetch.verify!}). Defaults to + # {ImageFetch::DEFAULT_ALLOWED_IMAGE_TYPES} (JPEG / PNG / GIF / + # WebP). The sniffed type — never the `Content-Type` header — is + # checked against this list, so adding a type here only matters + # when {ImageFetch.sniff_mime} can recognize its magic bytes. + # + # @param types [Array] MIME type strings. + # @return [Array] + def allowed_image_types=(types) + unless types.is_a?(Array) && !types.empty? && + types.all? { |t| t.is_a?(String) && t.include?("/") } + raise ArgumentError, + "Parse::Embeddings.allowed_image_types= expects a non-empty Array of " \ + "MIME type Strings (got #{types.inspect})." + end + CONFIG_MUTEX.synchronize { @allowed_image_types = types.dup.freeze } + end + + # @return [Array] MIME allowlist for the bytes-fetch path (frozen). + def allowed_image_types + @allowed_image_types ||= ImageFetch::DEFAULT_ALLOWED_IMAGE_TYPES + end + # Sentinel-gated opt-in for forwarding image URLs to embedding # providers. Assign the exact {TRUST_PROVIDER_URL_FETCH_SENTINEL} # String to unlock; any other value (including `true`, `1`, @@ -357,11 +382,20 @@ def trust_provider_url_fetch? # @param allow_insecure [Boolean] permit `http://` (default # false). Only meaningful for local development / container- # internal CDN proxies. + # @param mode [Symbol] `:forward` (default) validates for + # URL-forwarding to a provider and requires the + # {.trust_provider_url_fetch=} sentinel. 
`:fetch` validates for + # the SDK's OWN download through {Parse::File.safe_open_url} + # (the v5.5 bytes path) and skips the sentinel — no URL is + # forwarded to a third party, so the provider-egress + # acknowledgment doesn't apply. Every other layer (host + # allowlist deny-by-default, obfuscated-IP screen, port + # allowlist, CIDR resolution check) is identical in both modes. # @return [String] canonicalized URL (`URI.parse(url).to_s`). - # @raise [ConfirmationRequired] when the sentinel is unset. + # @raise [ConfirmationRequired] when the sentinel is unset (`:forward` mode). # @raise [InvalidImageURL] on any other validation failure. - def validate_image_url!(url, allow_insecure: false) - unless trust_provider_url_fetch? + def validate_image_url!(url, allow_insecure: false, mode: :forward) + unless mode == :fetch || trust_provider_url_fetch? hint = if allowed_image_hosts.empty? " First populate Parse::Embeddings.allowed_image_hosts with the CDN " \ @@ -555,3 +589,6 @@ def ip_shaped_but_not_canonical?(host) require_relative "embeddings/qwen" require_relative "embeddings/local_http" require_relative "embeddings/spend_cap" +require_relative "embeddings/image_fetch" +require_relative "embeddings/cache" +require_relative "embeddings/batch_embedder" diff --git a/lib/parse/embeddings/batch_embedder.rb b/lib/parse/embeddings/batch_embedder.rb new file mode 100644 index 0000000..c24d7b6 --- /dev/null +++ b/lib/parse/embeddings/batch_embedder.rb @@ -0,0 +1,188 @@ +# encoding: UTF-8 +# frozen_string_literal: true + +module Parse + module Embeddings + # Batch-level orchestration for bulk embedding jobs. + # + # {Provider#embed_text_batched} only slices input into + # provider-sized chunks; any retry/backoff lives inside each + # provider's single HTTP call. 
That is the wrong layer for bulk + # work: a 50k-document backfill needs *batch-level* pacing (stay + # under the provider's requests-per-minute budget across calls) and + # *batch-level* backoff (a 429 after the provider's internal retries + # are exhausted should pause the whole job, not kill it). + # {BatchEmbedder} wraps any registered provider with both. + # + # @example Backfill with pacing and backoff + # embedder = Parse::Embeddings::BatchEmbedder.new( + # Parse::Embeddings.provider(:openai), + # requests_per_minute: 60, + # max_attempts: 5, + # ) + # vectors = embedder.embed_text(texts, input_type: :search_document) + # + # @example Progress reporting + # embedder = Parse::Embeddings::BatchEmbedder.new(provider, + # on_progress: ->(done:, total:, batch_index:, batch_count:) { + # puts "#{done}/#{total}" + # }) + # + # == Retry classification + # + # By default a batch is retried when the provider raises a + # {Parse::Embeddings::Error} subclass whose class name ends in + # `RateLimitError` or `TransientError` — the convention every + # bundled provider follows (`OpenAI::RateLimitError`, + # `Voyage::TransientError`, …). Pass `retry_on:` with explicit + # exception classes to override. Non-retryable errors (auth, + # bad-request, response-contract violations) propagate immediately. + # + # Vectors are returned aligned 1:1 with the input, identical to + # `embed_text` on the wrapped provider. + class BatchEmbedder + # Raised when a batch still fails after `max_attempts` retryable + # failures. Wraps the final provider error in `#cause` and carries + # the index of the failing batch so a resumable job knows where to + # pick up. + class BatchFailed < Parse::Embeddings::Error + # @return [Integer] zero-based index of the failing batch. + attr_reader :batch_index + # @return [Integer] number of inputs successfully embedded before the failure. 
+ attr_reader :completed_count + + def initialize(message, batch_index:, completed_count:) + @batch_index = batch_index + @completed_count = completed_count + super(message) + end + end + + RETRYABLE_NAME_SUFFIXES = %w[RateLimitError TransientError].freeze + + # @return [Provider] the wrapped provider. + attr_reader :provider + + # @param provider [Provider] any registered embedding provider. + # @param batch_size [Integer, nil] inputs per provider call. + # Defaults to the provider's own {Provider#embed_batch_size} + # hint, falling back to 64 when the provider has none. + # @param requests_per_minute [Numeric, nil] batch-level pacing + # budget. When set, consecutive provider calls are spaced at + # least `60.0 / requests_per_minute` seconds apart. nil disables + # pacing. + # @param max_attempts [Integer] attempts per batch (1 = no retry). + # @param base_delay [Numeric] first backoff delay in seconds; + # doubles per attempt. + # @param max_delay [Numeric] backoff ceiling in seconds. + # @param jitter [Numeric] random multiplier range added to each + # delay (`delay * (1 + rand * jitter)`); spreads thundering + # herds when several workers back off together. + # @param retry_on [Array, nil] explicit retryable exception + # classes; nil uses the name-suffix convention described above. + # @param on_progress [#call, nil] callable invoked after each + # successful batch with `done:, total:, batch_index:, batch_count:`. + def initialize(provider, batch_size: nil, requests_per_minute: nil, + max_attempts: 5, base_delay: 2.0, max_delay: 60.0, + jitter: 0.25, retry_on: nil, on_progress: nil) + unless provider.is_a?(Provider) + raise ArgumentError, + "Parse::Embeddings::BatchEmbedder expects a Parse::Embeddings::Provider " \ + "(got #{provider.class})." + end + @provider = provider + @batch_size = batch_size ? Integer(batch_size) : nil + raise ArgumentError, "batch_size must be positive" if @batch_size && @batch_size <= 0 + @min_interval = requests_per_minute ? 
(60.0 / Float(requests_per_minute)) : nil + @max_attempts = Integer(max_attempts) + raise ArgumentError, "max_attempts must be >= 1" if @max_attempts < 1 + @base_delay = Float(base_delay) + @max_delay = Float(max_delay) + @jitter = Float(jitter) + @retry_on = retry_on && Array(retry_on) + @on_progress = on_progress + @last_call_at = nil + end + + # Embed `strings` through the wrapped provider with pacing and + # batch-level backoff. + # + # @param strings [Array<String>] + # @param input_type [Symbol] + # @return [Array<Array<Float>>] aligned 1:1 with `strings`. + # @raise [BatchFailed] when a batch exhausts its attempts. + def embed_text(strings, input_type: :search_document) + unless strings.is_a?(Array) + raise ArgumentError, + "Parse::Embeddings::BatchEmbedder#embed_text expects Array " \ + "(got #{strings.class})." + end + return [] if strings.empty? + + size = @batch_size || @provider.embed_batch_size || 64 + batches = strings.each_slice(size).to_a + out = [] + batches.each_with_index do |batch, idx| + out.concat(run_batch(batch, input_type, idx, out.length)) + if @on_progress + @on_progress.call(done: out.length, total: strings.length, + batch_index: idx, batch_count: batches.length) + end + end + out + end + + private + + def run_batch(batch, input_type, batch_index, completed_count) + attempts = 0 + begin + attempts += 1 + pace! + @provider.embed_text(batch, input_type: input_type) + rescue StandardError => e + raise unless retryable?(e) + if attempts >= @max_attempts + raise BatchFailed.new( + "Parse::Embeddings::BatchEmbedder: batch #{batch_index} failed after " \ + "#{attempts} attempt(s) — #{e.class}: #{e.message}", + batch_index: batch_index, completed_count: completed_count, + ) + end + sleep(backoff_delay(attempts)) + retry + end + end + + def retryable?(error) + if @retry_on + return @retry_on.any? { |klass| error.is_a?(klass) } + end + return false unless error.is_a?(Parse::Embeddings::Error) + name = error.class.name.to_s + RETRYABLE_NAME_SUFFIXES.any? 
{ |suffix| name.end_with?(suffix) } + end + + def backoff_delay(attempt) + delay = [@base_delay * (2**(attempt - 1)), @max_delay].min + delay * (1.0 + rand * @jitter) + end + + # Enforce the inter-call interval. Measured from the START of the + # previous call so a slow provider response counts toward the + # interval rather than stacking on top of it. + def pace! + return if @min_interval.nil? + now = Process.clock_gettime(Process::CLOCK_MONOTONIC) + if @last_call_at + wait = (@last_call_at + @min_interval) - now + if wait > 0 + sleep(wait) + now = Process.clock_gettime(Process::CLOCK_MONOTONIC) + end + end + @last_call_at = now + end + end + end +end diff --git a/lib/parse/embeddings/cache.rb b/lib/parse/embeddings/cache.rb new file mode 100644 index 0000000..1bc893b --- /dev/null +++ b/lib/parse/embeddings/cache.rb @@ -0,0 +1,322 @@ +# encoding: UTF-8 +# frozen_string_literal: true + +require "digest" +require "monitor" + +module Parse + module Embeddings + # Process-local embedding cache keyed by + # `(provider, model, input_type, input_hash)`. + # + # Query-side embedding is the hot repeat path: the same natural- + # language query (an agent retrying a tool call, a user paging + # through results, a dashboard refreshing) re-embeds identical text + # on every call, paying provider latency and per-token cost each + # time. The cache short-circuits those repeats. Write-side managed + # embeds (`embed` / `embed_image` save callbacks) already have their + # own digest-tracked elision and do not use this cache. + # + # == Disabled by default + # + # With the cache disabled {.fetch_vector} is a pass-through. Opt in: + # + # Parse::Embeddings::Cache.enable!(max_entries: 2048, ttl: 600) + # + # The default store is an in-process LRU with per-entry TTL. A + # custom store (e.g. 
Redis-backed) can be supplied via + # `enable!(store: my_store)` — it must respond to `get(key)` + # (returning `Array` or nil) and `set(key, vector)`; TTL + # management is then the store's responsibility. + # + # == Key derivation + # + # `provider.class.name | model_name | input_type | SHA-256(input)`. + # The full input text never becomes part of the key, so a shared + # external store does not accumulate plaintext queries. + # + # == Observability + # + # A cache hit emits the same `parse.embeddings.embed` AS::N event a + # real provider call would, with `cached: true` — existing + # spend-tracking subscribers see hits and misses on one stream. + module Cache + # Internal LRU + TTL store. Access is synchronized by the module- + # level monitor in {Cache}; the store itself is not thread-safe. + # @!visibility private + class LRUStore + def initialize(max_entries:, ttl:) + @max_entries = max_entries + @ttl = ttl + @entries = {} # key => [vector, monotonic_expiry] + end + + def get(key) + entry = @entries[key] + return nil if entry.nil? + if @ttl && entry[1] && entry[1] < Cache.monotonic + @entries.delete(key) + return nil + end + # Refresh recency (Hash preserves insertion order). + @entries.delete(key) + @entries[key] = entry + entry[0] + end + + def set(key, vector) + @entries.delete(key) + expiry = @ttl ? Cache.monotonic + @ttl : nil + @entries[key] = [vector, expiry] + @entries.shift while @entries.length > @max_entries + vector + end + + def size + @entries.length + end + + def clear + @entries = {} + end + end + + # Adapter exposing any Moneta-compatible key/value store (`[]` / + # `[]=`, optionally `store(key, value, expires:)`) through the + # `get`/`set` duck {Cache.enable!} expects — the persistent-L2 + # option. 
Point it at the same Redis your `Parse.cache` uses and + # query-embed cache entries survive process restarts and are + # shared across processes: + # + # require "moneta" + # moneta = Moneta.new(:Redis, url: ENV["REDIS_URL"]) + # Parse::Embeddings::Cache.enable!( + # store: Parse::Embeddings::Cache::MonetaStore.new(moneta, ttl: 30 * 24 * 3600), + # ) + # + # Keys are namespaced (`emb:` by default) so the entries are + # recognizable next to other application keys; values are the + # raw vector Arrays (Moneta's own serializer handles encoding). + # TTL is forwarded via Moneta's `expires:` option when the + # backend supports it, ignored otherwise. + # + # Fail-open by design: a backend error (Redis down, serialization + # hiccup) degrades to a cache miss / dropped write — the embed + # path must never fail because the CACHE is unhealthy. + # + # The cross-process race the in-process LRU doesn't have applies + # here: two processes missing the same key concurrently both call + # the provider and both write. That is correct (embeddings are + # deterministic per key) and bounded — no locking is attempted. + class MonetaStore + # @param moneta [#[], #[]=] a Moneta store (or anything with the + # same indexing duck). + # @param ttl [Numeric, nil] per-entry lifetime in seconds, + # forwarded as `expires:` when the backend supports + # `store(key, value, expires:)`. nil = no expiry. + # @param namespace [String] key prefix. + def initialize(moneta, ttl: nil, namespace: "emb:") + unless moneta.respond_to?(:[]) && moneta.respond_to?(:[]=) + raise ArgumentError, + "Parse::Embeddings::Cache::MonetaStore expects a Moneta-compatible " \ + "store responding to #[] and #[]= (got #{moneta.class})." + end + @moneta = moneta + @ttl = ttl && Float(ttl) + @namespace = namespace.to_s + end + + # @return [Array, nil] + def get(key) + value = @moneta[@namespace + key] + value.is_a?(Array) ? value : nil + rescue StandardError + nil + end + + # @return [Array] the vector, unchanged. 
+ def set(key, vector) + k = @namespace + key + if @ttl && @moneta.respond_to?(:store) + begin + @moneta.store(k, vector, expires: @ttl) + rescue ArgumentError + # Hash-like backends define #store(key, value) with no + # options arg, so the expires: form raises ArgumentError. + # Fall back to a plain write (no expiry) rather than letting + # the fail-open rescue below silently drop every vector. + @moneta[k] = vector + end + else + @moneta[k] = vector + end + vector + rescue StandardError + vector + end + end + + MONITOR = Monitor.new + private_constant :MONITOR + + class << self + # Enable the cache. + # + # @param max_entries [Integer] LRU capacity (default store only). + # @param ttl [Numeric, nil] per-entry lifetime in seconds; nil + # disables expiry (default store only). Default 600. + # @param store [#get, #set, nil] custom backing store; overrides + # the built-in LRU when given. + # @return [void] + def enable!(max_entries: 2048, ttl: 600, store: nil) + if store && !(store.respond_to?(:get) && store.respond_to?(:set)) + raise ArgumentError, + "Parse::Embeddings::Cache.enable!: store must respond to #get and #set." + end + me = Integer(max_entries) + raise ArgumentError, "max_entries must be positive" if me <= 0 + MONITOR.synchronize do + @store = store || LRUStore.new(max_entries: me, ttl: ttl && Float(ttl)) + @enabled = true + @hits = 0 + @misses = 0 + end + nil + end + + # Disable and drop the store. + # @return [void] + def disable! + MONITOR.synchronize do + @enabled = false + @store = nil + end + nil + end + + # @return [Boolean] + def enabled? + MONITOR.synchronize { !!@enabled } + end + + # Clear cached entries (default store) and reset hit/miss counters. + # @return [void] + def clear! + MONITOR.synchronize do + @store.clear if @store.respond_to?(:clear) + @hits = 0 + @misses = 0 + end + nil + end + + # @return [Hash] `{ enabled:, hits:, misses:, size: }`. `size` is + # nil for custom stores that don't expose one. 
+ def stats + MONITOR.synchronize do + { + enabled: !!@enabled, + hits: @hits.to_i, + misses: @misses.to_i, + size: (@store.respond_to?(:size) ? @store.size : nil), + } + end + end + + # Embed a single input through `provider`, serving repeats from + # the cache. Pass-through (no caching, no instrumentation + # changes) when the cache is disabled. + # + # @param provider [Provider] the embedding provider. + # @param input [String] the text to embed. + # @param input_type [Symbol] forwarded to `embed_text`. + # @return [Array] the embedding vector. + def fetch_vector(provider, input, input_type: :search_query) + unless enabled? + return embed_single!(provider, input, input_type) + end + key = key_for(provider, input, input_type) + cached = MONITOR.synchronize { @store && @store.get(key) } + if cached + MONITOR.synchronize { @hits = @hits.to_i + 1 } + instrument_hit(provider, input_type) + return cached + end + vector = embed_single!(provider, input, input_type) + MONITOR.synchronize do + @misses = @misses.to_i + 1 + @store.set(key, vector) if @store + end + vector + end + + # @!visibility private + # Composite cache key. The input is hashed so plaintext never + # lands in a shared store; provider identity + model + dimensions + # + input_type namespace the hash (two models' vectors are never + # confused). Dimensions matter independently of the model name: + # Matryoshka-capable providers (OpenAI text-embedding-3-*, Cohere + # embed-v4, Voyage, Jina, Qwen) can register the same model at + # different output widths, and serving one width's cached vector + # to the other poisons the narrower/wider field. 
+ def key_for(provider, input, input_type) + model = begin + provider.model_name + rescue NotImplementedError + "unknown" + end + dims = begin + provider.dimensions + rescue NotImplementedError + "unknown" + end + "#{provider.class.name}|#{model}|#{dims}|#{input_type}|#{Digest::SHA256.hexdigest(input.to_s)}" + end + + # @!visibility private + def monotonic + Process.clock_gettime(Process::CLOCK_MONOTONIC) + end + + private + + def embed_single!(provider, input, input_type) + vectors = provider.embed_text([input], input_type: input_type) + unless vectors.is_a?(Array) && vectors.length == 1 && vectors.first.is_a?(Array) + raise InvalidResponseError, + "Parse::Embeddings::Cache: provider #{provider.class} did not return a " \ + "single vector (got #{vectors.inspect[0, 80]})." + end + vectors.first + end + + # Emit the standard embed event so spend subscribers see cache + # hits on the same stream as real calls. + def instrument_hit(provider, input_type) + return unless defined?(ActiveSupport::Notifications) + model = begin + provider.model_name + rescue NotImplementedError + nil + end + dims = begin + provider.dimensions + rescue NotImplementedError + nil + end + payload = { + provider: provider.class.name, + model: model, + dimensions: dims, + input_count: 1, + input_type: input_type, + total_tokens: nil, + cached: true, + error: nil, + } + ActiveSupport::Notifications.instrument(Provider::AS_NOTIFICATION_NAME, payload) {} + end + end + end + end +end diff --git a/lib/parse/embeddings/cohere.rb b/lib/parse/embeddings/cohere.rb index 80e12b6..7ace2c7 100644 --- a/lib/parse/embeddings/cohere.rb +++ b/lib/parse/embeddings/cohere.rb @@ -260,14 +260,23 @@ def modalities MULTIMODAL_MODELS.include?(@model) ? %i[text image] : [:text] end - # Embed a batch of image URLs through Cohere's `/v2/embed` - # multimodal endpoint. v5.1 ships URL-only — the provider - # receives a public URL and issues its own fetch. 
The SDK does - # NOT download the image; it validates the URL through - # {Parse::Embeddings.validate_image_url!} (sentinel-gated egress - # opt-in, CIDR / port / host allowlist) and forwards the - # canonicalized URL string in the `{ type: "image_url", - # image_url: { url: ... } }` content row. + # Embed a batch of images through Cohere's `/v2/embed` + # multimodal endpoint. Two source forms: + # + # * **String URL** (v5.1 path) — the provider receives a public + # URL and issues its own fetch. The SDK does NOT download the + # image; it validates the URL through + # {Parse::Embeddings.validate_image_url!} (sentinel-gated + # egress opt-in, CIDR / port / host allowlist) and forwards + # the canonicalized URL string in the `{ type: "image_url", + # image_url: { url: ... } }` content row. + # * **{Parse::Embeddings::ImageFetch::FetchedImage}** (v5.5 bytes + # path) — bytes the SDK already downloaded through + # {Parse::File.safe_open_url}, magic-byte-verified, and + # EXIF-stripped. Forwarded as a base64 data URI in the same + # `image_url` content row (Cohere v2 accepts data URIs). No + # URL validation runs and the `trust_provider_url_fetch` + # sentinel is NOT required. # # **Multimodal model required.** Cohere's v3 models do not accept # image inputs; calling `embed_image` on a v3-configured provider @@ -321,24 +330,28 @@ def embed_image(sources, input_type: :search_document, allow_insecure: false) # Validate every URL up-front so a malformed entry in slot N # does not slip through after slots 0..N-1 are already in the - # wire body. Forward the canonicalized URL the validator - # returned — not the caller's raw input. - canonical_urls = sources.each_with_index.map do |url, i| - unless url.is_a?(String) + # wire body. URL entries forward the validator's canonicalized + # URL — not the caller's raw input; fetched-bytes entries skip + # URL validation (already downloaded + verified by ImageFetch) + # and forward as a base64 data URI. 
+ content_rows = sources.each_with_index.map do |src, i| + if src.is_a?(Parse::Embeddings::ImageFetch::FetchedImage) + { content: [{ type: "image_url", image_url: { url: src.to_data_uri } }] } + elsif src.is_a?(String) + canonical = Parse::Embeddings.validate_image_url!(src, allow_insecure: allow_insecure) + { content: [{ type: "image_url", image_url: { url: canonical } }] } + else raise ArgumentError, - "Parse::Embeddings::Cohere#embed_image sources[#{i}] is not a String " \ - "(#{url.class}). v5.1 ships URL-only — bytes/IO support is v5.3." + "Parse::Embeddings::Cohere#embed_image sources[#{i}] must be a URL String " \ + "or Parse::Embeddings::ImageFetch::FetchedImage (got #{src.class})." end - Parse::Embeddings.validate_image_url!(url, allow_insecure: allow_insecure) end body = { model: @model, input_type: wire_input_type, embedding_types: ["float"], - inputs: canonical_urls.map { |u| - { content: [{ type: "image_url", image_url: { url: u } }] } - }, + inputs: content_rows, } instrument_embed(sources.length, input_type, modality: :image) do |emit_payload| diff --git a/lib/parse/embeddings/image_fetch.rb b/lib/parse/embeddings/image_fetch.rb new file mode 100644 index 0000000..5d72d4b --- /dev/null +++ b/lib/parse/embeddings/image_fetch.rb @@ -0,0 +1,347 @@ +# encoding: UTF-8 +# frozen_string_literal: true + +require "base64" +require "uri" + +module Parse + module Embeddings + # SDK-side image download for the bytes-fetch embedding path (v5.5). + # + # Where the URL-forwarding path (v5.1) hands a validated URL to the + # embedding provider and lets the provider issue its own fetch, the + # bytes path downloads the image through the SDK's own SSRF-hardened + # primitive ({Parse::File.safe_open_url} — CIDR blocks, port + # allowlist, DNS-rebinding re-check, size caps, timeouts; NO parallel + # SSRF mechanism is introduced here), verifies the content, and + # forwards the bytes to the provider as a base64 data URI. 
+ # + # == Content verification (closes NEW-NET-4, "File MIME laundering") + # + # The HTTP `Content-Type` header is **never trusted**. The MIME type + # is determined exclusively by magic-byte sniffing of the leading + # bytes ({.sniff_mime}), then: + # + # 1. The sniffed type must be in {Parse::Embeddings.allowed_image_types} + # (default: JPEG / PNG / GIF / WebP). + # 2. When the URL path carries a recognized image extension, the + # extension's implied type must AGREE with the sniffed type — + # a `.png` URL serving JPEG bytes (or an `.html` payload with an + # image extension) is refused as a laundering attempt. + # + # Unknown magic bytes are always refused: there is no fallthrough to + # header- or extension-derived typing. + # + # == EXIF stripping (default ON) + # + # User-uploaded photos commonly carry GPS coordinates and device + # serial numbers in EXIF. Forwarding those to a third-party embedding + # provider is a PII egress, so metadata is stripped by default: + # + # * JPEG — APP1 segments (Exif and XMP) are removed. + # * PNG — `eXIf` chunks are removed. + # * WebP — `EXIF` / `XMP ` RIFF chunks are removed and the VP8X + # EXIF/XMP flag bits cleared. + # * GIF — no EXIF container; pass-through. + # + # Callers that need orientation metadata preserved opt out per call + # with `exif_strip: false` (the `embed_image source: :bytes` + # directive forwards its own `exif_strip:` declaration). + module ImageFetch + # Raised when downloaded bytes fail content verification — unknown + # magic bytes, sniffed type outside the allowlist, or an + # extension / magic-byte disagreement. Carries a `:reason` tag + # (`:unknown_magic`, `:type_not_allowed`, `:extension_mismatch`, + # `:empty`) so callers can branch on the failure mode. + class InvalidImageType < Parse::Embeddings::Error + # @return [Symbol] failure-mode tag. 
+ attr_reader :reason + def initialize(reason, message) + @reason = reason + super(message) + end + end + + # Value object for a fetched-and-verified image. `mime_type` is the + # SNIFFED type (never the server-reported `Content-Type`). The + # provider adapters consume this via {#to_data_uri}. + FetchedImage = Struct.new(:bytes, :mime_type, :url, keyword_init: true) do + # @return [String] `data:<mime_type>;base64,<payload>` for provider wire bodies. + def to_data_uri + "data:#{mime_type};base64,#{Base64.strict_encode64(bytes)}" + end + + # Keep multi-MB image payloads out of exception messages and logs. + def inspect + "#" + end + alias_method :to_s, :inspect + end + + # MIME types the bytes path accepts by default. Operators extend + # via {Parse::Embeddings.allowed_image_types=}. SVG is deliberately + # absent — it is active content (script-capable), not a bitmap. + DEFAULT_ALLOWED_IMAGE_TYPES = %w[image/jpeg image/png image/gif image/webp].freeze + + # URL-path extensions whose implied MIME type is cross-checked + # against the sniffed type. Extensions not listed here are ignored + # (the magic bytes alone govern). + EXTENSION_MIME = { + ".jpg" => "image/jpeg", + ".jpeg" => "image/jpeg", + ".jpe" => "image/jpeg", + ".png" => "image/png", + ".gif" => "image/gif", + ".webp" => "image/webp", + }.freeze + + module_function + + # Determine an image's MIME type from its leading magic bytes. + # The first ~16 bytes are sufficient for every supported format. + # Returns nil for anything unrecognized — callers must treat nil + # as a refusal, never fall back to header/extension typing. + # + # @param bytes [String] raw image bytes (at least the first 12; only the first 16 are inspected). + # @return [String, nil] sniffed MIME type, or nil when unknown. 
+ def sniff_mime(bytes) + return nil unless bytes.is_a?(String) && bytes.bytesize >= 12 + b = bytes.byteslice(0, 16).force_encoding(Encoding::BINARY) + return "image/jpeg" if b.start_with?("\xFF\xD8\xFF".b) + return "image/png" if b.start_with?("\x89PNG\r\n\x1A\n".b) + return "image/gif" if b.start_with?("GIF87a".b) || b.start_with?("GIF89a".b) + if b.start_with?("RIFF".b) && b.byteslice(8, 4) == "WEBP".b + return "image/webp" + end + nil + end + + # Download, verify, and (by default) EXIF-strip an image. + # + # The URL is validated through + # {Parse::Embeddings.validate_image_url!} in `:fetch` mode — host + # allowlist ({Parse::Embeddings.allowed_image_hosts}, deny-all when + # empty), obfuscated-IP-literal screen, port allowlist, CIDR check + # — but WITHOUT the {Parse::Embeddings.trust_provider_url_fetch=} + # sentinel, because no URL is forwarded to a third party: the SDK + # itself performs the fetch through {Parse::File.safe_open_url}. + # + # @param url [String] image URL (host must be allowlisted). + # @param allow_insecure [Boolean] permit `http://` (local dev only). + # @param exif_strip [Boolean] strip EXIF/XMP metadata (default true). + # @param max_bytes [Integer, nil] additional size cap below + # {Parse::File.max_remote_size}; nil applies only the global cap. + # @return [FetchedImage] verified bytes + sniffed MIME type. + # @raise [Parse::Embeddings::InvalidImageURL] URL validation failure. + # @raise [InvalidImageType] content verification failure. + # @raise [ArgumentError] from {Parse::File.safe_open_url} (SSRF / + # size / timeout refusals). 
+ def fetch!(url, allow_insecure: false, exif_strip: true, max_bytes: nil) + canonical = Parse::Embeddings.validate_image_url!( + url, allow_insecure: allow_insecure, mode: :fetch, + ) + io = Parse::File.safe_open_url(canonical) + begin + bytes = io.read + ensure + io.close if io.respond_to?(:close) + end + bytes = bytes.to_s.dup.force_encoding(Encoding::BINARY) + + if max_bytes && bytes.bytesize > Integer(max_bytes) + raise ArgumentError, + "Parse::Embeddings::ImageFetch: image exceeds max_bytes " \ + "(#{bytes.bytesize} > #{Integer(max_bytes)})." + end + + mime = verify!(bytes, url: canonical) + bytes = strip_metadata(bytes, mime) if exif_strip + FetchedImage.new(bytes: bytes, mime_type: mime, url: canonical) + end + + # Verify raw bytes: sniff the magic, check the allowlist, and + # cross-check the URL extension. Public so the upload-side + # validation path can reuse the same check. + # + # @param bytes [String] raw image bytes. + # @param url [String, nil] source URL for the extension cross-check + # (nil skips it — e.g. caller-supplied byte payloads). + # @return [String] the sniffed MIME type. + # @raise [InvalidImageType] + def verify!(bytes, url: nil) + if bytes.nil? || bytes.empty? + raise InvalidImageType.new(:empty, + "Parse::Embeddings::ImageFetch: downloaded body is empty.") + end + mime = sniff_mime(bytes) + if mime.nil? + raise InvalidImageType.new(:unknown_magic, + "Parse::Embeddings::ImageFetch: leading bytes match no supported image " \ + "format (JPEG/PNG/GIF/WebP). 
The Content-Type header is not consulted — " \ + "unrecognized content is refused outright.") + end + allowed = Parse::Embeddings.allowed_image_types + unless allowed.include?(mime) + raise InvalidImageType.new(:type_not_allowed, + "Parse::Embeddings::ImageFetch: sniffed type #{mime.inspect} is not in " \ + "Parse::Embeddings.allowed_image_types (#{allowed.inspect}).") + end + ext_mime = extension_mime(url) + if ext_mime && ext_mime != mime + raise InvalidImageType.new(:extension_mismatch, + "Parse::Embeddings::ImageFetch: URL extension implies #{ext_mime.inspect} " \ + "but the magic bytes are #{mime.inspect} — refusing MIME-laundered content.") + end + mime + end + + # @!visibility private + # MIME type implied by the URL path's extension, or nil when the + # extension is absent / unrecognized. Only the URI *path* is + # consulted — a dot in the hostname (`https://cdn.v2.example.com/blob`) + # must not be mistaken for an extension. Unparseable URLs skip the + # cross-check (magic-byte verification still applies). + def extension_mime(url) + return nil unless url.is_a?(String) + path = begin + URI.parse(url).path.to_s + rescue URI::InvalidURIError + return nil + end + dot = path.rindex(".") + return nil if dot.nil? + EXTENSION_MIME[path[dot..].to_s.downcase] + end + + # Strip embedded metadata for the formats that carry it. Unknown / + # metadata-free formats pass through unchanged. Never raises on a + # malformed container — falls back to the original bytes (the + # provider will reject genuinely corrupt input) — but the fallback + # is no longer silent: a container the walker could not parse may + # still carry EXIF/XMP to a third-party provider, so the + # PII-egress protection not running is warned about. + # + # @param bytes [String] verified image bytes. + # @param mime [String] sniffed MIME type. + # @return [String] bytes with metadata removed. 
+ def strip_metadata(bytes, mime) + stripped = + case mime + when "image/jpeg" then strip_jpeg_app1(bytes) + when "image/png" then strip_png_exif(bytes) + when "image/webp" then strip_webp_metadata(bytes) + else return bytes + end + # The format walkers return the *original object* when they bail + # on a structure they cannot parse; a successful walk always + # returns a fresh copy (even when nothing was removed). + if stripped.equal?(bytes) + warn "[Parse::Embeddings::ImageFetch] could not parse the #{mime} " \ + "container for metadata stripping; passing bytes through with " \ + "embedded EXIF/XMP (if any) intact." + end + stripped + rescue StandardError + warn "[Parse::Embeddings::ImageFetch] metadata stripping raised while " \ + "parsing the #{mime} container; passing bytes through with " \ + "embedded EXIF/XMP (if any) intact." + bytes + end + + # @!visibility private + # Remove APP1 (0xFFE1) segments — Exif and XMP both ride in APP1 — + # by walking the JPEG marker stream up to SOS and copying every + # other segment verbatim. Entropy-coded data after SOS is appended + # untouched. + def strip_jpeg_app1(bytes) + b = bytes + return b unless b.byteslice(0, 2) == "\xFF\xD8".b + out = +"\xFF\xD8".b + pos = 2 + len = b.bytesize + while pos + 4 <= len + return bytes unless b.getbyte(pos) == 0xFF + marker = b.getbyte(pos + 1) + # Standalone markers (RST/SOI/EOI/TEM) carry no length, but none + # legally appear between SOI and SOS in the header stream. + break if marker == 0xD9 # EOI with no SOS — malformed; bail to copy + seg_len = (b.getbyte(pos + 2) << 8) | b.getbyte(pos + 3) + return bytes if seg_len < 2 + if marker == 0xDA # SOS — header walk ends; copy the rest verbatim + out << b.byteslice(pos, len - pos) + return out + end + out << b.byteslice(pos, 2 + seg_len) unless marker == 0xE1 + pos += 2 + seg_len + end + # No SOS found — structurally odd; return the original untouched. 
+ bytes + end + + # @!visibility private + # Remove `eXIf` chunks from a PNG chunk stream. Chunk layout: + # 4-byte length, 4-byte type, payload, 4-byte CRC. A truncated + # chunk bails to the original `bytes` object — an `eXIf` chunk + # past the abort point would otherwise slip through, and the + # identity check in strip_metadata only warns on the original. + def strip_png_exif(bytes) + sig_len = 8 + b = bytes + out = b.byteslice(0, sig_len).dup + pos = sig_len + len = b.bytesize + while pos + 8 <= len + chunk_len = (b.getbyte(pos) << 24) | (b.getbyte(pos + 1) << 16) | + (b.getbyte(pos + 2) << 8) | b.getbyte(pos + 3) + type = b.byteslice(pos + 4, 4) + total = 8 + chunk_len + 4 + return bytes if pos + total > len + out << b.byteslice(pos, total) unless type == "eXIf".b + pos += total + break if type == "IEND".b + end + # Trailing bytes after IEND (uncommon) are dropped with the copy; + # a sub-header tail (< 8 bytes) cannot hold another chunk. + out + end + + # @!visibility private + # Remove `EXIF` / `XMP ` chunks from a WebP RIFF container, patch + # the RIFF size field, and clear the VP8X EXIF/XMP flag bits so the + # header stays consistent with the chunk list. A truncated chunk + # bails to the original `bytes` object — an `EXIF` / `XMP ` chunk + # past the abort point would otherwise slip through, and the + # identity check in strip_metadata only warns on the original. + def strip_webp_metadata(bytes) + b = bytes + out_chunks = +"".b + pos = 12 # past "RIFF" + size + "WEBP" + len = b.bytesize + while pos + 8 <= len + type = b.byteslice(pos, 4) + chunk_len = b.getbyte(pos + 4) | (b.getbyte(pos + 5) << 8) | + (b.getbyte(pos + 6) << 16) | (b.getbyte(pos + 7) << 24) + padded = chunk_len + (chunk_len.odd? ? 
1 : 0) + total = 8 + padded + return bytes if pos + 8 + chunk_len > len + unless type == "EXIF".b || type == "XMP ".b + chunk = b.byteslice(pos, [total, len - pos].min).dup + if type == "VP8X".b && chunk.bytesize >= 9 + flags = chunk.getbyte(8) + chunk.setbyte(8, flags & ~0x0C) # clear EXIF (0x08) + XMP (0x04) + end + out_chunks << chunk + end + pos += total + end + riff_size = 4 + out_chunks.bytesize # "WEBP" + chunks + out = +"RIFF".b + out << [riff_size].pack("V") + out << "WEBP".b + out << out_chunks + out + end + end + end +end diff --git a/lib/parse/embeddings/provider.rb b/lib/parse/embeddings/provider.rb index e5b12aa..020055f 100644 --- a/lib/parse/embeddings/provider.rb +++ b/lib/parse/embeddings/provider.rb @@ -39,17 +39,23 @@ def embed_text(strings, input_type: :search_document) raise NotImplementedError, "#{self.class}#embed_text must be implemented" end - # @param sources [Array] image sources — URI for - # remote, IO for streamed bytes, String for base64. Concrete - # providers document which forms they accept. In v5.1 (URL-only - # path), every source is a raw `String` URL forwarded unchanged - # from the managed path: {Parse::Core::EmbedManaged} deliberately - # does NOT validate before calling the provider (validating there - # would double-resolve every URL). The concrete `embed_image` - # override is therefore responsible for calling - # {Parse::Embeddings.validate_image_url!} (passing `allow_insecure:` - # through) before egress — see the bundled Voyage/Cohere providers, - # which validate internally. + # @param sources [Array] + # image sources. Two supported forms (mixable within a batch): + # + # * `String` URL (v5.1 URL-forwarding path) — forwarded + # unchanged from the managed path: {Parse::Core::EmbedManaged} + # deliberately does NOT validate before calling the provider + # (validating there would double-resolve every URL). 
The + # concrete `embed_image` override is therefore responsible for + # calling {Parse::Embeddings.validate_image_url!} (passing + # `allow_insecure:` through) before egress — see the bundled + # Voyage/Cohere providers, which validate internally. + # * {Parse::Embeddings::ImageFetch::FetchedImage} (v5.5 bytes + # path) — bytes the SDK already downloaded via + # {Parse::File.safe_open_url}, magic-byte-verified, and + # EXIF-stripped. Concrete overrides forward + # `fetched.to_data_uri` in their wire body's base64 slot and + # skip URL validation (there is no provider-side fetch). # @param input_type [Symbol] `:search_query` or `:search_document`, # parallel to {#embed_text}. # @param allow_insecure [Boolean] **contract kwarg** — diff --git a/lib/parse/embeddings/spend_cap.rb b/lib/parse/embeddings/spend_cap.rb index eb374ee..e949653 100644 --- a/lib/parse/embeddings/spend_cap.rb +++ b/lib/parse/embeddings/spend_cap.rb @@ -64,9 +64,24 @@ def initialize(tenant_id:, limit:, used:, requested:, window:, retry_after:) # default limit applied to every tenant lacking an override. DEFAULT_KEY = :__default__ + # Fiber-local (`Thread.current[]`) key marking that the current call stack has already + # charged the spend cap (or deliberately exempted itself). Set by + # {.with_precharged}; read by {.charge_query!} so the inner + # query-embed paths (`find_similar(text:)`, `hybrid_search`, + # `Parse::Retrieval.retrieve`) don't double-bill a query the agent + # tool already charged with proper tenant identity. + PRECHARGED_KEY = :parse_embed_spend_precharged + # Default sliding window (seconds) when none is configured. DEFAULT_WINDOW = 3600 + # AS::N event emitted when a tenant's in-window usage crosses the + # configured `warn_at:` fraction of its hard limit. Payload: + # `{ tenant_id:, used:, limit:, window:, warn_at:, threshold: }`. + # Emitted once per window-crossing (re-arms as usage rolls off), + # never on the hard-refuse itself (that raises {Exceeded}). 
+ AS_NOTIFICATION_NAME = "parse.embeddings.spend_cap_warning" + class << self # Configure the cap. Two forms: # @@ -80,8 +95,15 @@ class << self # the global default. # @param limit_tokens [Integer, nil] token ceiling per window. # @param window [Integer] sliding window length in seconds. + # @param warn_at [Numeric, nil] soft-cap fraction of + # `limit_tokens` (exclusive 0...1). When a charge pushes a + # tenant's in-window usage across `limit * warn_at`, a + # {AS_NOTIFICATION_NAME} ActiveSupport::Notifications event is + # emitted (once per crossing — re-arms as the window rolls + # off). Gives operators an alerting hook BEFORE the hard + # refuse trips. nil (default) disables the soft cap. # @return [void] - def configure(tenant_id = nil, limit_tokens:, window: DEFAULT_WINDOW) + def configure(tenant_id = nil, limit_tokens:, window: DEFAULT_WINDOW, warn_at: nil) key = tenant_id.nil? ? DEFAULT_KEY : tenant_id unless limit_tokens.nil? li = Integer(limit_tokens) @@ -89,8 +111,21 @@ def configure(tenant_id = nil, limit_tokens:, window: DEFAULT_WINDOW) end w = Integer(window) raise ArgumentError, "SpendCap: window must be positive (got #{w})." if w <= 0 + unless warn_at.nil? + wa = Float(warn_at) + unless wa > 0.0 && wa < 1.0 + raise ArgumentError, "SpendCap: warn_at must be between 0 and 1 exclusive (got #{warn_at})." + end + end mutex.synchronize do - limits[key] = limit_tokens.nil? ? nil : { limit: Integer(limit_tokens), window: w } + limits[key] = + if limit_tokens.nil? + nil + else + cfg = { limit: Integer(limit_tokens), window: w } + cfg[:warn_at] = Float(warn_at) unless warn_at.nil? + cfg + end end nil end @@ -111,7 +146,8 @@ def charge!(tenant_id:, tokens:) raise ArgumentError, "SpendCap: tokens must be >= 0 (got #{t})." if t.negative? key = tenant_id.nil? ? DEFAULT_KEY : tenant_id - mutex.synchronize do + warn_payload = nil + total = mutex.synchronize do cfg = limit_for(key) return nil if cfg.nil? 
# uncapped @@ -128,8 +164,24 @@ def charge!(tenant_id:, tokens:) ) end entries << [now, t] if t.positive? + # Soft-cap crossing: fire only when THIS charge moves usage + # from below the threshold to at-or-above it, so a tenant + # hovering over the line doesn't spam an event per charge. + # Pruned entries re-arm the warning naturally as the window + # rolls off. + if (wa = cfg[:warn_at]) + threshold = limit * wa + if used < threshold && used + t >= threshold + warn_payload = { + tenant_id: key, used: used + t, limit: limit, + window: window, warn_at: wa, threshold: threshold, + } + end + end used + t end + emit_soft_cap_warning(warn_payload) if warn_payload + total end # Current in-window token usage for a tenant (0 when uncapped or @@ -146,6 +198,56 @@ def usage(tenant_id: nil) end end + # Run a block with the inner query-embed charge suppressed. + # Callers that have ALREADY charged the cap with better tenant + # identity (the `semantic_search` agent tool charges per-tenant + # before calling retrieve) — or that deliberately exempt the + # call (trusted admin agents) — wrap their downstream embed in + # this so {.charge_query!} inside `find_similar` / `retrieve` + # is a no-op. Restores the prior flag on exit (nesting-safe). + # + # @return [Object] the block's return value. + def with_precharged + prev = Thread.current[PRECHARGED_KEY] + Thread.current[PRECHARGED_KEY] = true + yield + ensure + Thread.current[PRECHARGED_KEY] = prev + end + + # @return [Boolean] whether the current call stack is inside + # {.with_precharged}. + def precharged? + !!Thread.current[PRECHARGED_KEY] + end + + # Charge a query-embed against the cap from a non-agent path. + # This is the v5.5 closure of "spend-cap coverage on all embed + # paths": `find_similar(text:)`, `hybrid_search(text:)`, and + # `Parse::Retrieval.retrieve` route their query text through + # here before embedding. 
+ # + # * No-op inside {.with_precharged} (the agent tool charged + # already, with per-tenant identity). + # * Tenant identity falls back to the ambient cache-tenant + # scope ({Parse.with_cache_tenant}) when set, else the shared + # {DEFAULT_KEY} bucket. + # * No-op (like {.charge!}) when no limit is configured. + # + # @param text [String] the query text about to be embedded. + # @param tenant_id [Object, nil] explicit tenant identity; + # nil resolves the ambient cache tenant, then DEFAULT_KEY. + # @return [Integer, nil] new in-window total, or nil when + # uncapped / precharged. + # @raise [Exceeded] + def charge_query!(text, tenant_id: nil) + return nil if precharged? + if tenant_id.nil? && defined?(Parse) && Parse.respond_to?(:current_cache_tenant) + tenant_id = Parse.current_cache_tenant + end + charge!(tenant_id: tenant_id, tokens: estimate_tokens(text)) + end + # Estimate token count from a String. # # The familiar "~4 characters per token" ratio only holds for @@ -249,6 +351,18 @@ def retry_after_for(entries, requested, limit, window, now) def monotonic Process.clock_gettime(Process::CLOCK_MONOTONIC) end + + # Emit the soft-cap AS::N event OUTSIDE the mutex (subscribers + # run synchronously on the calling thread; a slow subscriber + # must not serialize every other tenant's charge). + def emit_soft_cap_warning(payload) + return unless defined?(ActiveSupport::Notifications) + ActiveSupport::Notifications.instrument(AS_NOTIFICATION_NAME, payload) {} + rescue StandardError + # A raising subscriber must not turn a successful (admitted) + # charge into a caller-visible failure. + nil + end end end end diff --git a/lib/parse/embeddings/voyage.rb b/lib/parse/embeddings/voyage.rb index 85bc366..f95c5d4 100644 --- a/lib/parse/embeddings/voyage.rb +++ b/lib/parse/embeddings/voyage.rb @@ -278,27 +278,32 @@ def modalities MULTIMODAL_MODELS.include?(@model) ? 
%i[text image] : [:text] end - # Embed a batch of image URLs through Voyage's - # `/v1/multimodalembeddings` endpoint. v5.1 ships URL-only — the - # provider receives a public URL and issues its own fetch. The - # SDK does NOT download the image; it validates the URL through - # {Parse::Embeddings.validate_image_url!} (CIDR / port / host - # allowlist, sentinel-gated egress opt-in) and forwards the - # canonicalized URL string in the `{ type: "image_url", - # image_url: ... }` content row. + # Embed a batch of images through Voyage's + # `/v1/multimodalembeddings` endpoint. Two source forms: + # + # * **String URL** (v5.1 path) — the provider receives a public + # URL and issues its own fetch. The SDK does NOT download the + # image; it validates the URL through + # {Parse::Embeddings.validate_image_url!} (CIDR / port / host + # allowlist, sentinel-gated egress opt-in) and forwards the + # canonicalized URL string in a `{ type: "image_url", + # image_url: ... }` content row. + # * **{Parse::Embeddings::ImageFetch::FetchedImage}** (v5.5 bytes + # path) — bytes the SDK already downloaded through + # {Parse::File.safe_open_url}, magic-byte-verified, and + # EXIF-stripped. Forwarded as a `{ type: "image_base64", + # image_base64: "data:<mime>;base64,..." }` content row. No URL + # validation runs (there is no provider-side fetch) and the + # `trust_provider_url_fetch` sentinel is NOT required. # # **Multimodal model required.** Voyage's text-only models # (`voyage-3`, `voyage-4`, etc.) do not accept image inputs; # calling `embed_image` on a provider configured with one of # those raises {BadRequestError} before any network call. # - # **Bytes-fetch path is v5.3.** A future `bytes:` option will - # download via {Parse::File.safe_open_url}, MIME-sniff the - # leading bytes, optionally EXIF-strip, and forward as - # base64. URL-only ships first because it sidesteps EXIF / - # MIME-confusion class issues entirely. - # - # @param sources [Array] image URLs. 
Each must satisfy + # @param sources [Array] + # image URLs and/or fetched-bytes wrappers (forms may be + # mixed). Each URL must satisfy # {Parse::Embeddings.validate_image_url!} — failing entries # raise the corresponding {Parse::Embeddings::InvalidImageURL} # / {Parse::Embeddings::ConfirmationRequired} and ABORT the @@ -342,22 +347,26 @@ def embed_image(sources, input_type: :search_document, allow_insecure: false) # Validate every URL up-front so a malformed entry in slot N # does not get past validation while slots 0..N-1 are already - # in the wire body. The validator returns the canonicalized - # URL — we forward exactly that, not the caller's raw input. - canonical_urls = sources.each_with_index.map do |url, i| - unless url.is_a?(String) + # in the wire body. URL entries forward the validator's + # canonicalized URL (never the caller's raw input); fetched- + # bytes entries skip URL validation (the bytes were already + # downloaded + verified by ImageFetch) and forward as base64. + content_rows = sources.each_with_index.map do |src, i| + if src.is_a?(Parse::Embeddings::ImageFetch::FetchedImage) + { content: [{ type: "image_base64", image_base64: src.to_data_uri }] } + elsif src.is_a?(String) + canonical = Parse::Embeddings.validate_image_url!(src, allow_insecure: allow_insecure) + { content: [{ type: "image_url", image_url: canonical }] } + else raise ArgumentError, - "Parse::Embeddings::Voyage#embed_image sources[#{i}] is not a String " \ - "(#{url.class}). v5.1 ships URL-only — bytes/IO support is v5.3." + "Parse::Embeddings::Voyage#embed_image sources[#{i}] must be a URL String " \ + "or Parse::Embeddings::ImageFetch::FetchedImage (got #{src.class})." 
end - Parse::Embeddings.validate_image_url!(url, allow_insecure: allow_insecure) end wire_input_type = INPUT_TYPE_WIRE_VALUES[input_type] body = { - inputs: canonical_urls.map { |u| - { content: [{ type: "image_url", image_url: u }] } - }, + inputs: content_rows, model: @model, truncation: @truncation, } diff --git a/lib/parse/model/acl.rb b/lib/parse/model/acl.rb index 32acda5..9ad2add 100644 --- a/lib/parse/model/acl.rb +++ b/lib/parse/model/acl.rb @@ -211,8 +211,8 @@ def self.permission(read, write = nil) # @example # permissions = ["*", user.id] + user.acl_roles.to_a.map { |n| "role:#{n}" } # pipeline << { "$match" => Parse::ACL.read_predicate(permissions) } - def self.read_predicate(permissions, include_public: true) - permission_predicate("_rperm", permissions, include_public: include_public) + def self.read_predicate(permissions, include_public: true, include_missing: true) + permission_predicate("_rperm", permissions, include_public: include_public, include_missing: include_missing) end # Build a MongoDB +$match+-shaped predicate that matches documents @@ -222,8 +222,8 @@ def self.read_predicate(permissions, include_public: true) # @param permissions [Array] permission strings. # @param include_public [Boolean] whether to append +"*"+. # @return [Hash] a MongoDB +$or+ subexpression. - def self.write_predicate(permissions, include_public: true) - permission_predicate("_wperm", permissions, include_public: include_public) + def self.write_predicate(permissions, include_public: true, include_missing: true) + permission_predicate("_wperm", permissions, include_public: include_public, include_missing: include_missing) end # @!visibility private @@ -231,15 +231,19 @@ def self.write_predicate(permissions, include_public: true) # Normalizes the permissions array (string-coerced, deduplicated, # +"*"+ appended when +include_public+) and returns the +$or+ # subexpression. 
- def self.permission_predicate(field, permissions, include_public: true) + # @param include_missing [Boolean] when true (default), append the + # +{ field => { "$exists" => false } }+ branch so a missing + # +_rperm+/+_wperm+ (treated as public by Parse Server) also matches. + # Set false for an EXACT match that requires the column to be present + # and to contain one of +permissions+ (the strict/`readable_by_exact` + # surface). When false and only the +$in+ branch remains, the +$or+ + # wrapper is dropped for a cleaner +{ field => { "$in" => perms } }+. + def self.permission_predicate(field, permissions, include_public: true, include_missing: true) perms = Array(permissions).map(&:to_s).reject(&:empty?).uniq perms << "*" if include_public && !perms.include?("*") - { - "$or" => [ - { field => { "$in" => perms } }, - { field => { "$exists" => false } }, - ], - } + branches = [{ field => { "$in" => perms } }] + branches << { field => { "$exists" => false } } if include_missing + branches.length == 1 ? branches.first : { "$or" => branches } end # Determines whether two ACLs or a Parse-ACL hash is equivalent to this object. # @example diff --git a/lib/parse/model/core/embed_managed.rb b/lib/parse/model/core/embed_managed.rb index a6f56bd..87e7dd9 100644 --- a/lib/parse/model/core/embed_managed.rb +++ b/lib/parse/model/core/embed_managed.rb @@ -2,6 +2,7 @@ # frozen_string_literal: true require "digest" +require "time" require_relative "../../embeddings" require_relative "../vector" @@ -108,9 +109,21 @@ class InvalidEmbedDeclaration < ArgumentError; end # # `allow_insecure` is forwarded to {.validate_image_url!} for # image directives only; ignored for text. + # + # `source_mode` (image directives only) is `:url` (forward the + # validated URL to the provider — v5.1 behavior, requires the + # `trust_provider_url_fetch` sentinel) or `:bytes` (the SDK + # downloads via {Parse::File.safe_open_url}, magic-byte-verifies, + # EXIF-strips, and forwards base64 — v5.5). 
`exif_strip` + # (default true) applies to `:bytes` mode only. + # + # `meta_field` names the `:object` sibling that records + # provider/model/dimensions provenance on every recompute — the + # input {ClassMethods#reembed!} uses to find stale rows after a + # model migration. EmbedDirective = Struct.new( :sources, :into, :digest_field, :input_type, :provider_name, - :modality, :allow_insecure, + :modality, :allow_insecure, :source_mode, :exif_strip, :meta_field, keyword_init: true, ) do def freeze @@ -121,6 +134,10 @@ def freeze def image? modality == :image end + + def bytes_mode? + source_mode == :bytes + end end # @!visibility private @@ -184,9 +201,17 @@ def embed_directives # @param digest_field [Symbol, nil] override for the digest # sibling property. Defaults to `:"#{into}_digest"`. Auto- # declared as `:string` if not already declared. + # @param meta_field [Symbol, nil] override for the provenance + # sibling property. Defaults to `:"#{into}_meta"`. Auto- + # declared as `:object` if not already declared; populated + # with `{ provider:, model:, dimensions:, modality:, + # embedded_at: }` on every recompute. Read by + # {ClassMethods#reembed!} to skip rows already embedded by + # the current provider/model. # @return [Symbol] the target vector field name. # @raise [InvalidEmbedDeclaration] on declaration-time misuse. - def embed(*source_fields, into:, input_type: :search_document, digest_field: nil) + def embed(*source_fields, into:, input_type: :search_document, digest_field: nil, + meta_field: nil) if source_fields.empty? raise InvalidEmbedDeclaration, "#{self}.embed: at least one source field is required." 
@@ -215,6 +240,10 @@ def embed(*source_fields, into:, input_type: :search_document, digest_field: nil unless fields.key?(digest_field) property digest_field, :string end + meta_field = (meta_field || :"#{into}_meta").to_sym + unless fields.key?(meta_field) + property meta_field, :object + end directive = EmbedDirective.new( sources: sources, @@ -222,6 +251,7 @@ def embed(*source_fields, into:, input_type: :search_document, digest_field: nil digest_field: digest_field, input_type: input_type, provider_name: provider_name, + meta_field: meta_field, ).freeze embed_directives[into] = directive @@ -243,12 +273,25 @@ def embed(*source_fields, into:, input_type: :search_document, digest_field: nil # Declare a managed image embedding. Mirrors {.embed} but the # source field is a `:file` property (Parse::File) and the # provider call routes through {Parse::Embeddings::Provider#embed_image} - # rather than `#embed_text`. v5.1 ships URL-only: the SDK - # extracts the file's URL, validates it through - # {Parse::Embeddings.validate_image_url!} (sentinel-gated egress - # opt-in, CIDR / port / host allowlist), and forwards the - # canonicalized URL to the provider. The SDK does NOT download - # image bytes — bytes-fetch is the v5.3 path. + # rather than `#embed_text`. Two fetch modes (`source:`): + # + # * `:url` (default, v5.1 behavior) — the SDK extracts the + # file's URL, validates it through + # {Parse::Embeddings.validate_image_url!} (sentinel-gated + # egress opt-in, CIDR / port / host allowlist), and forwards + # the canonicalized URL to the provider, which performs its + # own fetch. The SDK does NOT download image bytes. 
+ # * `:bytes` (v5.5) — the SDK downloads the image itself via + # {Parse::File.safe_open_url} (through + # {Parse::Embeddings::ImageFetch.fetch!}), verifies the + # content by magic-byte sniff against + # {Parse::Embeddings.allowed_image_types} (the Content-Type + # header is never trusted), strips EXIF/XMP metadata by + # default, and forwards the bytes to the provider as a + # base64 data URI. Does NOT require the + # `trust_provider_url_fetch` sentinel (no third-party URL + # egress), but the file's host must still be in + # {Parse::Embeddings.allowed_image_hosts}. # # **Digest is the URL string, not the file contents.** Replacing # the Parse::File with one pointing to a different URL re-embeds; @@ -272,10 +315,21 @@ def embed(*source_fields, into:, input_type: :search_document, digest_field: nil # @param allow_insecure [Boolean] forwarded to # {Parse::Embeddings.validate_image_url!}; permit `http://` # for local-dev CDN proxies. Default false. + # @param source [Symbol] `:url` (provider fetches; default) or + # `:bytes` (SDK fetches, verifies, strips, forwards base64). + # @param exif_strip [Boolean] strip EXIF/XMP metadata before + # forwarding bytes (default true; `:bytes` mode only — + # ignored for `:url`, where the SDK never sees the bytes). + # @param meta_field [Symbol, nil] override for the provenance + # sibling property. Defaults to `:"#{into}_meta"`; see {.embed}. # @return [Symbol] the target vector field name. # @raise [InvalidEmbedDeclaration] on declaration-time misuse. def embed_image(source_field, into:, input_type: :search_document, - digest_field: nil, allow_insecure: false) + digest_field: nil, allow_insecure: false, + source: :url, exif_strip: true, meta_field: nil) + # Capture the fetch mode immediately — the legacy local + # `source = source_field.to_sym` below shadows the kwarg. 
+ source_mode = source_mode_for_embed_image!(source) into = into.to_sym unless vector_properties.key?(into) raise InvalidEmbedDeclaration, @@ -306,6 +360,10 @@ def embed_image(source_field, into:, input_type: :search_document, unless fields.key?(digest_field) property digest_field, :string end + meta_field = (meta_field || :"#{into}_meta").to_sym + unless fields.key?(meta_field) + property meta_field, :object + end directive = EmbedDirective.new( sources: [source], @@ -315,6 +373,9 @@ def embed_image(source_field, into:, input_type: :search_document, provider_name: provider_name, modality: :image, allow_insecure: allow_insecure, + source_mode: source_mode, + exif_strip: exif_strip ? true : false, + meta_field: meta_field, ).freeze embed_directives[into] = directive @@ -333,6 +394,126 @@ def embed_image(source_field, into:, input_type: :search_document, into end + # @!visibility private + # Validate the `source:` kwarg of {.embed_image}. + def source_mode_for_embed_image!(source) + mode = source.to_sym + unless %i[url bytes].include?(mode) + raise InvalidEmbedDeclaration, + "#{self}.embed_image: source: must be :url or :bytes (got #{source.inspect})." + end + mode + end + + # Re-embed records through the CURRENT provider/model — the bulk + # migration counterpart to {#embed_pending!} (which only fills + # null vectors). Use after changing a `:vector` property's + # `provider:` / `model:` / `dimensions:` declaration: walks the + # class with objectId-cursor pagination, clears each record's + # digest sibling so the `before_save` recompute cannot elide the + # provider call, and saves. + # + # With `only_stale: true`, rows whose `<into>_meta` provenance + # already matches the current provider name, model, and declared + # dimensions are skipped without a provider call — making the + # operation resumable: re-running after a partial failure only + # touches rows still carrying old-model vectors. Rows with no + # meta record (embedded before v5.5) always count as stale. 
+ # + # Intended as an admin / maintenance operation: run it with a + # master-key client (or pass `save_opts:` carrying a + # `session_token:` that can write every row). Combine with + # {Parse::Embeddings::BatchEmbedder}-style pacing externally if + # the provider rate-limits — each record's save makes one + # provider call. + # + # @param field [Symbol, nil] limit to one embed target; nil + # processes every declared directive. + # @param batch_size [Integer] rows fetched per round (default 100). + # @param limit [Integer, nil] stop after re-embedding at most + # this many records across all directives; nil = no cap. + # @param where [Hash, nil] extra query constraints (e.g. + # `{ published: true }`). + # @param only_stale [Boolean] skip rows whose meta provenance + # matches the current provider/model/dimensions (default false + # — re-embed everything). + # @param save_opts [Hash] options forwarded to each `record.save`. + # @return [Integer] number of records re-embedded (saved). + # @raise [ArgumentError] when `field:` names no embed target, or + # the class declares no `embed` directives. + def reembed!(field: nil, batch_size: 100, limit: nil, where: nil, + only_stale: false, save_opts: {}) + bs = Integer(batch_size) + raise ArgumentError, "#{self}.reembed!: batch_size must be positive." if bs <= 0 + directives = resolve_embed_directives_for_backfill(field, caller_label: "reembed!") + + processed = 0 + directives.each do |directive| + remaining = limit ? (limit - processed) : nil + break if remaining && remaining <= 0 + processed += reembed_directive!(directive, bs, where, remaining, only_stale, save_opts) + end + processed + end + + # @!visibility private + # objectId-cursor walk over ALL rows (subject to `where:`), + # clearing the digest so the save-path recompute re-embeds. + def reembed_directive!(directive, batch_size, where, remaining, only_stale, save_opts) + count = 0 + cursor = nil + current = only_stale ? 
current_embed_identity(directive) : nil + loop do + q = query + q = q.where(where) if where.is_a?(Hash) && !where.empty? + q = q.where(:objectId.gt => cursor) if cursor + q.order(:objectId.asc) + q.limit(batch_size) + batch = q.results + break if batch.nil? || batch.empty? + + batch.each do |record| + cursor = record.id + next if current && embed_meta_current?(record, directive, current) + record.public_send(:"#{directive.digest_field}=", nil) + record.save(**save_opts) + count += 1 + return count if remaining && count >= remaining + end + break if batch.length < batch_size + end + count + end + + # @!visibility private + # The provenance tuple a freshly-embedded row would carry today. + def current_embed_identity(directive) + model = begin + Parse::Embeddings.provider(directive.provider_name).model_name + rescue Parse::Embeddings::ProviderNotRegistered + raise + rescue NotImplementedError + nil + end + { + "provider" => directive.provider_name.to_s, + "model" => model, + "dimensions" => vector_properties.dig(directive.into, :dimensions), + } + end + + # @!visibility private + # True when the record's meta sibling matches `current` (so + # `only_stale: true` can skip it). Missing/foreign-shaped meta + # counts as stale. + def embed_meta_current?(record, directive, current) + meta = directive.meta_field && record.public_send(directive.meta_field) + return false unless meta.is_a?(Hash) + %w[provider model dimensions].all? do |key| + current[key].nil? || meta[key] == current[key] || meta[key.to_sym] == current[key] + end + end + # Backfill embeddings for records whose managed vector field is # still null — the bulk counterpart to the per-save embed path. 
# Walks the class with objectId-cursor pagination (robust to the @@ -372,18 +553,20 @@ def embed_pending!(field: nil, batch_size: 100, limit: nil, where: nil, save_opt end # @!visibility private - def resolve_embed_directives_for_backfill(field) + # `caller_label` names the public entry point in error messages so + # a reembed! misuse is not reported as an embed_pending! one. + def resolve_embed_directives_for_backfill(field, caller_label: "embed_pending!") if field d = embed_directives[field.to_sym] unless d raise ArgumentError, - "#{self}.embed_pending!: :#{field} is not an embed target " \ + "#{self}.#{caller_label}: :#{field} is not an embed target " \ "(have #{embed_directives.keys.inspect})." end [d] else ds = embed_directives.values - raise ArgumentError, "#{self}.embed_pending!: no `embed` directives declared." if ds.empty? + raise ArgumentError, "#{self}.#{caller_label}: no `embed` directives declared." if ds.empty? ds end end @@ -470,6 +653,7 @@ def self.recompute_embedding!(record, directive) record.public_send(:"#{directive.into}=", nil) end record.public_send(:"#{directive.digest_field}=", nil) + clear_embed_meta(record, directive) end return end @@ -498,6 +682,36 @@ def self.recompute_embedding!(record, directive) record.public_send(:"#{directive.into}=", vector) end record.public_send(:"#{directive.digest_field}=", digest) + stamp_embed_meta(record, directive, provider, vector) + end + + # @!visibility private + # Record provider/model provenance on the `_meta` sibling so + # migration tooling ({ClassMethods#reembed!} `only_stale:`) can + # tell which model produced the stored vector. String keys — + # `:object` properties round-trip through JSON. + def self.stamp_embed_meta(record, directive, provider, vector) + return if directive.meta_field.nil? 
+ return unless record.respond_to?(:"#{directive.meta_field}=") + model = begin + provider.model_name + rescue NotImplementedError + nil + end + record.public_send(:"#{directive.meta_field}=", { + "provider" => directive.provider_name.to_s, + "model" => model, + "dimensions" => vector.dimensions, + "modality" => directive.image? ? "image" : "text", + "embedded_at" => Time.now.utc.iso8601, + }) + end + + # @!visibility private + def self.clear_embed_meta(record, directive) + return if directive.meta_field.nil? + return unless record.respond_to?(:"#{directive.meta_field}=") + record.public_send(:"#{directive.meta_field}=", nil) end # @!visibility private @@ -529,10 +743,25 @@ def self.build_source_input(record, directive) end # @!visibility private - # Dispatch the provider call based on directive modality. + # Dispatch the provider call based on directive modality and (for + # images) fetch mode. `:bytes` mode downloads + verifies + strips + # through {Parse::Embeddings::ImageFetch.fetch!} and hands the + # provider a {Parse::Embeddings::ImageFetch::FetchedImage}; `:url` + # mode forwards the raw URL String (the provider validates and + # fetches it itself). def self.call_provider(provider, directive, input) if directive.image? - provider.embed_image([input], + source = + if directive.bytes_mode? + Parse::Embeddings::ImageFetch.fetch!( + input, + allow_insecure: directive.allow_insecure ? true : false, + exif_strip: directive.exif_strip != false, + ) + else + input + end + provider.embed_image([source], input_type: directive.input_type, allow_insecure: directive.allow_insecure ? true : false) else diff --git a/lib/parse/model/core/vector_searchable.rb b/lib/parse/model/core/vector_searchable.rb index d199085..64f7da6 100644 --- a/lib/parse/model/core/vector_searchable.rb +++ b/lib/parse/model/core/vector_searchable.rb @@ -55,6 +55,22 @@ class AmbiguousVectorField < ArgumentError; end # field and the caller didn't pass an explicit `index:` kwarg. 
class IndexNotResolved < ArgumentError; end + # Raised (under `Parse::VectorSearch.index_drift_policy = :raise`) + # when first-query verification finds the deployed vectorSearch + # index disagreeing with the model declaration — wrong + # `numDimensions`, wrong `similarity`, or a registered + # tenant-scope field missing from the index's `filter` paths. + # Under the default `:warn` policy the same findings emit a + # single `[Parse::VectorSearch:DRIFT]` warning instead. + class IndexDriftError < StandardError + # @return [Array] human-readable drift findings. + attr_reader :findings + def initialize(message, findings: []) + @findings = findings + super(message) + end + end + # Raised by the `find_similar(text:)` overload when the resolved # `:vector` property has no `provider:` (and therefore no way to # turn `text:` into a query vector). Distinct from @@ -367,13 +383,23 @@ def embed_query_text!(text, resolved_field) "on the property, or pass an explicit `vector:`." end provider = Parse::Embeddings.provider(provider_name) - vectors = provider.embed_text([text], input_type: :search_query) - unless vectors.is_a?(Array) && vectors.length == 1 && vectors.first.is_a?(Array) - raise Parse::Embeddings::InvalidResponseError, - "#{self}.find_similar: provider #{provider_name.inspect} did not return " \ - "a single vector for `text:` (got #{vectors.inspect[0, 80]})." - end - vectors.first + # Spend cap: every query-embed path (find_similar(text:), + # hybrid_search(text:), Retrieval.retrieve) funnels through this + # method, so charging here closes the "direct callers bypass the + # cap" gap. No-op when no limit is configured, or when an + # upstream caller (the semantic_search agent tool) has already + # charged with per-tenant identity (SpendCap.with_precharged). + # + # Deliberate: the charge runs BEFORE the cache lookup, so cache + # hits bill at full price. 
The cap bounds query *volume* (an + # abuse/probing control), not just provider spend — a caller + # replaying one cached query must not get unlimited throughput. + Parse::Embeddings::SpendCap.charge_query!(text) + # Query-embed cache: repeated identical queries skip the + # provider round-trip when Parse::Embeddings::Cache.enable! has + # been called; pass-through (with the provider's own response + # validation preserved) when disabled. + Parse::Embeddings::Cache.fetch_vector(provider, text, input_type: :search_query) end def coerce_query_vector(vector) @@ -387,7 +413,10 @@ def coerce_query_vector(vector) end def resolve_vector_index!(field, explicit_index) - return explicit_index if explicit_index && !explicit_index.to_s.empty? + if explicit_index && !explicit_index.to_s.empty? + verify_explicit_vector_index(field, explicit_index.to_s) + return explicit_index + end begin require_relative "../../atlas_search" rescue LoadError @@ -402,9 +431,129 @@ def resolve_vector_index!(field, explicit_index) "#{parse_class}.#{field}; pass index: explicitly or create one " \ "via Parse::AtlasSearch::IndexCatalog.create_index." end + verify_vector_index!(field, idx) (idx["name"] || idx[:name]).to_s end + # Best-effort drift verification for an explicitly named `index:`. + # The auto-discovery path verifies the index it resolves; an + # explicit kwarg would otherwise skip verification entirely. Look + # the field's covering index up in the catalog and verify it when + # its name matches the explicit one. Lookup failures (catalog + # unavailable, index not discoverable, name targeting a different + # index) skip verification rather than failing the query — the + # explicit kwarg is an override, not a discovery request. 
+ def verify_explicit_vector_index(field, index_name) + return if Parse::VectorSearch.index_drift_policy == :ignore + begin + require_relative "../../atlas_search" + idx = Parse::AtlasSearch::IndexCatalog.find_vector_index(parse_class, field: field) + rescue StandardError, LoadError + return + end + return if idx.nil? + return unless (idx["name"] || idx[:name]).to_s == index_name + verify_vector_index!(field, idx) + end + + # First-query drift verification: compare the deployed index's + # `latestDefinition` against the model declaration. The drift + # findings are computed once per (field, index name) per class per + # process and cached; the policy check runs on EVERY query, so + # under `:raise` a drifted index keeps failing instead of failing + # once and then silently serving results. Under `:warn` the + # warning is emitted only on the first check to avoid log spam. + # Honors {Parse::VectorSearch.index_drift_policy} (`:warn` default + # / `:raise` / `:ignore`). + # + # Checks: + # 1. `numDimensions` on the covering `type: "vector"` entry vs the + # property's declared `dimensions:`. + # 2. `similarity` vs the property's declared `similarity:` (only + # when both sides declare one). + # 3. When the class registers an `agent_tenant_scope`, the scope + # field must appear among the index's `type: "filter"` paths — + # otherwise the tenant pre-filter that + # {Parse::Retrieval.retrieve} folds into `$vectorSearch.filter` + # fails Atlas-side at query time. + def verify_vector_index!(field, idx) + return if Parse::VectorSearch.index_drift_policy == :ignore + index_name = (idx["name"] || idx[:name]).to_s + @_verified_vector_indexes ||= {} + cache_key = "#{field}|#{index_name}" + findings = @_verified_vector_indexes[cache_key] + first_check = findings.nil? + if first_check + findings = vector_index_drift_findings(field, idx).freeze + @_verified_vector_indexes[cache_key] = findings + end + return if findings.empty? 
+ + message = "#{self} vectorSearch index #{index_name.inspect} drifts from the " \ + "model declaration for :#{field}: #{findings.join("; ")}" + if Parse::VectorSearch.index_drift_policy == :raise + # Raise on every query, not just the first: strict mode means a + # drifted index must never serve results. + raise IndexDriftError.new(message, findings: findings) + end + warn "[Parse::VectorSearch:DRIFT] #{message}" if first_check + end + + # @!visibility private + # @return [Array] drift findings (empty when in sync). + def vector_index_drift_findings(field, idx) + defn = idx["latestDefinition"] || idx[:latestDefinition] || {} + entries = defn["fields"] || defn[:fields] || [] + field_str = field.to_s + vector_entry = entries.find do |f| + (f["type"] || f[:type]).to_s == "vector" && (f["path"] || f[:path]).to_s == field_str + end + findings = [] + return findings if vector_entry.nil? # find_vector_index matched on it; defensive + + declared_dims = vector_properties.dig(field.to_sym, :dimensions) + index_dims = vector_entry["numDimensions"] || vector_entry[:numDimensions] + if declared_dims && index_dims && Integer(index_dims) != Integer(declared_dims) + findings << "index numDimensions=#{index_dims} but property declares " \ + "dimensions: #{declared_dims} (every query will mismatch — " \ + "rebuild the index or run #{self}.reembed! 
after fixing the declaration)" + end + + declared_sim = vector_properties.dig(field.to_sym, :similarity) + index_sim = vector_entry["similarity"] || vector_entry[:similarity] + if declared_sim && index_sim && index_sim.to_s != declared_sim.to_s + findings << "index similarity=#{index_sim.inspect} but property declares " \ + "similarity: #{declared_sim.inspect}" + end + + scope_field = registered_tenant_scope_field + if scope_field + filter_paths = entries.select { |f| (f["type"] || f[:type]).to_s == "filter" } + .map { |f| (f["path"] || f[:path]).to_s } + unless filter_paths.include?(scope_field) + findings << "agent_tenant_scope field #{scope_field.inspect} is not declared " \ + "as a type: \"filter\" path in the index — tenant-scoped " \ + "$vectorSearch.filter will fail Atlas-side" + end + end + findings + end + + # @!visibility private + # Wire/storage name of the class's registered tenant-scope field, + # or nil. Mirrors the resolution Parse::Retrieval#wire_name uses + # when folding the scope into $vectorSearch.filter. + def registered_tenant_scope_field + return nil unless defined?(Parse::Agent::MetadataRegistry) + rule = Parse::Agent::MetadataRegistry.tenant_scope_rule(parse_class) + return nil unless rule + sym = rule[:field].to_sym + fmap = respond_to?(:field_map) ? field_map : {} + (fmap[sym] || sym.to_s.columnize).to_s + rescue StandardError + nil + end + def build_vector_hits(raw_hits) return [] if raw_hits.nil? || raw_hits.empty? converted = Parse::MongoDB.convert_documents_to_parse(raw_hits, parse_class) diff --git a/lib/parse/query.rb b/lib/parse/query.rb index dea13da..6929377 100644 --- a/lib/parse/query.rb +++ b/lib/parse/query.rb @@ -1252,14 +1252,32 @@ def count(mongo_direct: false) pipeline, has_lookup_stages = build_aggregation_pipeline pipeline << { "$count" => "count" } - # Auto-detect if MongoDB direct is needed + # Auto-detect if MongoDB direct is needed. 
Mirror the routing in + # #execute_aggregation_pipeline: a pipeline that references internal + # ACL columns (_rperm/_wperm via readable_by/publicly_readable and + # friends) MUST run mongo-direct — Parse Server's REST aggregate + # endpoint cannot express a $match on those columns — and the + # mongo-direct sink must be told the references are sanctioned so + # the PipelineSecurity internal-fields denylist lets them through. + uses_internal_fields = pipeline_uses_internal_fields?(pipeline) + scoped = distinct_query_is_scoped? use_mongo_direct = false - if has_lookup_stages && defined?(Parse::MongoDB) && Parse::MongoDB.enabled? + if defined?(@acl_query_mongo_direct) && !@acl_query_mongo_direct.nil? + use_mongo_direct = @acl_query_mongo_direct + elsif (scoped || has_lookup_stages || uses_internal_fields) && + defined?(Parse::MongoDB) && Parse::MongoDB.enabled? use_mongo_direct = true + elsif scoped + # Same fail-closed contract as #aggregate / #aggregate_from_query: + # a scoped count must not fall back to REST /aggregate, which + # would drop the scope and count rows the caller cannot read. + raise_scoped_aggregation_requires_mongo_direct! end # Execute aggregation - aggregation = Aggregation.new(self, pipeline, verbose: @verbose_aggregate, mongo_direct: use_mongo_direct) + aggregation = Aggregation.new(self, pipeline, verbose: @verbose_aggregate, + mongo_direct: use_mongo_direct, + allow_internal_fields: uses_internal_fields) response = aggregation.execute! # Extract count from aggregation result @@ -1803,6 +1821,25 @@ def distinct_query_is_scoped? false end + # Fail closed for a scoped aggregation that would otherwise fall back + # to REST /aggregate. That endpoint is master-key-only and enforces + # neither ACL nor CLP, so letting a scoped query through would silently + # run it unscoped as the master key. Every aggregation terminal that + # routes a scoped query (aggregate, aggregate_from_query, count, + # execute_aggregation_pipeline) raises through here. 
+ # @raise [MongoDirectRequired] + # @api private + def raise_scoped_aggregation_requires_mongo_direct! + raise MongoDirectRequired, + "[Parse::Query] This scoped aggregation (session_token / " \ + "scope_to_user / scope_to_role) requires mongo-direct so the " \ + "SDK can enforce ACL/CLP. Parse Server's REST /aggregate " \ + "endpoint is master-key-only and enforces neither, so routing " \ + "it there would silently run unscoped as the master key. " \ + "Enable mongo-direct via Parse::MongoDB.configure(...), or " \ + "rewrite without the aggregation terminal." + end + # Scope a query to a specific user's row-level ACL when it auto-routes # through mongo-direct. The SDK records the user, computes the # effective `_rperm` allow-set (user objectId + `"*"` + every role @@ -3464,15 +3501,51 @@ def aggregate(pipeline, verbose: nil, mongo_direct: nil, rewrite_lookups: nil, r complete_pipeline << { "$limit" => @limit } end - # Auto-detect if mongo_direct is needed (when $inQuery constraints are present and MongoDB is available) - use_mongo_direct = mongo_direct - if use_mongo_direct.nil? && lookup_stages && lookup_stages.any? && defined?(Parse::MongoDB) && Parse::MongoDB.enabled? - use_mongo_direct = true - end - # Optimize pipeline by merging consecutive $match stages complete_pipeline = deduplicate_consecutive_match_stages(complete_pipeline) + # Auto-detect whether this aggregation must run via the direct-MongoDB + # path instead of Parse Server's REST /aggregate endpoint. Three + # independent triggers, each of which REST /aggregate cannot serve: + # + # * $inQuery / $notInQuery → $lookup stages (the original trigger). + # * An SDK-injected ACL $match on the internal _rperm / _wperm columns + # (readable_by / publicly_readable / writable_by and friends). Parse + # Server's REST aggregate rejects a $match on those columns. + # * A scoped query (session_token / scope_to_user / scope_to_role). 
+ # REST /aggregate is master-key-only and enforces NEITHER ACL NOR + # CLP, so a scoped aggregate sent over REST silently runs unscoped + # as the master key — leaking sums/min/max/distinct over rows the + # caller cannot read. This is the same enforcement asymmetry the + # #distinct / #count / #results auto-routes already guard against; + # the scalar terminals (sum/average/min/max/count_distinct) all + # funnel through here, so routing them here fixes every one. + # + # `allow_internal_fields` is forwarded for internal-field pipelines: the + # caller-supplied `pipeline` arg was validated above (line ~3343) with + # the internal-fields denylist active, so any _rperm/_wperm reference in + # the merged pipeline is provably SDK-injected, never user input. + uses_internal_fields = pipeline_uses_internal_fields?(complete_pipeline) + scoped = distinct_query_is_scoped? + use_mongo_direct = mongo_direct + if use_mongo_direct.nil? + mongo_ready = defined?(Parse::MongoDB) && Parse::MongoDB.enabled? + if lookup_stages && lookup_stages.any? + use_mongo_direct = true if mongo_ready + elsif scoped || uses_internal_fields + if mongo_ready + use_mongo_direct = true + elsif scoped + # Fail closed: a scoped aggregation cannot fall back to REST + # /aggregate without silently bypassing ACL/CLP (master-key-only + # endpoint). Refuse rather than leak unscoped results. Unscoped + # internal-field pipelines keep the REST fallback (a master-key + # correctness edge, not an enforcement bypass). + raise_scoped_aggregation_requires_mongo_direct! 
+ end + end + end + # When the pipeline is bound for direct MongoDB, translate every stage # through the direct-MongoDB field rewriter so user-supplied stages # (which use logical Parse field names like `$author`) reach the @@ -3484,6 +3557,7 @@ def aggregate(pipeline, verbose: nil, mongo_direct: nil, rewrite_lookups: nil, r end Aggregation.new(self, complete_pipeline, verbose: verbose, mongo_direct: use_mongo_direct || false, + allow_internal_fields: uses_internal_fields, raw_values: raw_values, raw_field_names: raw_field_names) end @@ -3550,17 +3624,40 @@ def aggregate_from_query(additional_stages = [], verbose: nil, mongo_direct: nil # Build pipeline from current query constraints pipeline, has_lookup_stages = build_query_aggregate_pipeline + # `allow_internal_fields` is computed from the SDK-built portion ONLY + # (before appending caller stages): build_query_aggregate_pipeline emits + # the _rperm/_wperm $match for readable_by/etc., but `additional_stages` + # is caller-supplied and NOT validated here, so we must not sanction an + # internal-field reference the caller smuggled in. A scoped query still + # routes to mongo-direct regardless (so ACL/CLP enforcement runs). + uses_internal_fields = pipeline_uses_internal_fields?(pipeline) + # Append any additional stages pipeline.concat(additional_stages) if additional_stages.any? - # Auto-detect if mongo_direct is needed (when $inQuery constraints are present and MongoDB is available) + # Same routing contract as #aggregate: $lookup subqueries, an SDK ACL + # $match, or a scoped query each require the direct-MongoDB path (REST + # /aggregate cannot express _rperm/_wperm and is master-key-only/ + # unenforced). A scoped query fails closed when mongo-direct is + # unavailable rather than silently running unscoped as master. + scoped = distinct_query_is_scoped? use_mongo_direct = mongo_direct - if use_mongo_direct.nil? && has_lookup_stages && defined?(Parse::MongoDB) && Parse::MongoDB.enabled? 
- use_mongo_direct = true + if use_mongo_direct.nil? + mongo_ready = defined?(Parse::MongoDB) && Parse::MongoDB.enabled? + if has_lookup_stages + use_mongo_direct = true if mongo_ready + elsif scoped || uses_internal_fields + if mongo_ready + use_mongo_direct = true + elsif scoped + raise_scoped_aggregation_requires_mongo_direct! + end + end end # Create Aggregation directly to avoid double-applying constraints - Aggregation.new(self, pipeline, verbose: verbose, mongo_direct: use_mongo_direct || false) + Aggregation.new(self, pipeline, verbose: verbose, mongo_direct: use_mongo_direct || false, + allow_internal_fields: uses_internal_fields) end private @@ -3607,6 +3704,16 @@ def build_query_aggregate_pipeline end end + # Fold in SDK-built aggregation-pipeline marker stages (the _rperm/_wperm + # $match emitted by readable_by/publicly_readable/etc., plus set-equality + # and empty_or_nil markers). `compile_where` strips these markers, so + # without this extraction an ACL filter on `aggregate_from_query` would + # be silently dropped — the same omission that affected `Query#count`. + markers = compile_markers + if markers.key?("__aggregation_pipeline") + markers["__aggregation_pipeline"].each { |stage| pipeline << stage } + end + # Add $sort stage from order constraints unless @order.empty? sort_stage = {} @@ -3702,19 +3809,35 @@ def execute_aggregation_pipeline # Parse Server blocks these for security - must use MongoDB direct use_mongo_direct = false + # When the SDK-built pipeline references internal ACL columns + # (_rperm/_wperm via readable_by/writable_by/publicly_readable and + # friends, or _acl), the mongo-direct sink must be told these + # references are sanctioned so the PipelineSecurity internal-fields + # denylist lets them through. The pipeline here is built entirely + # from SDK constraint translation (no caller-supplied stages), so + # this is safe — same posture as results_direct/count_direct. 
+ uses_internal_fields = pipeline_uses_internal_fields?(pipeline) + scoped = distinct_query_is_scoped? + # Check for explicit mongo_direct preference first if defined?(@acl_query_mongo_direct) && !@acl_query_mongo_direct.nil? use_mongo_direct = @acl_query_mongo_direct elsif defined?(Parse::MongoDB) && Parse::MongoDB.enabled? - # Auto-detect based on pipeline contents - if has_lookup_stages || pipeline_uses_internal_fields?(pipeline) + # Auto-detect based on pipeline contents and query scope + if scoped || has_lookup_stages || uses_internal_fields use_mongo_direct = true end + elsif scoped + # Same fail-closed contract as #aggregate / #aggregate_from_query: + # a scoped pipeline must not fall back to REST /aggregate, which + # would drop the scope and return rows the caller cannot read. + raise_scoped_aggregation_requires_mongo_direct! end # Create Aggregation directly to avoid double-applying constraints # The aggregate() method would redundantly add where constraints again - Aggregation.new(self, pipeline, verbose: @verbose_aggregate, mongo_direct: use_mongo_direct) + Aggregation.new(self, pipeline, verbose: @verbose_aggregate, mongo_direct: use_mongo_direct, + allow_internal_fields: uses_internal_fields) end # Check if the pipeline references internal Parse fields that require MongoDB direct access @@ -5454,23 +5577,38 @@ def clone # Strings are used as-is (user IDs or "role:RoleName" format). # Use "public" for public access, "none" or [] for no read permissions. # - # @param permission [Parse::User, Parse::Role, String, Array] the permission to check + # @param permission [Parse::User, Parse::Role, Parse::Pointer, String, Symbol, Array] + # the permission to check. A `Parse::User` (or User pointer) expands to + # the user's objectId plus every role they inherit; a `Parse::Role` (or + # role name String / `:ACL.readable_by_role` form) expands up the role + # hierarchy. `"public"` / `:public` / `:everyone` / `:world` map to the + # `"*"` wildcard. 
`"none"` / `:none` / `[]` / `nil` match objects with no + # read permissions (explicit empty `_rperm`). # @param mongo_direct [Boolean] if true, forces MongoDB direct query. If nil (default), # auto-detects based on query complexity. Set to false to force Parse Server aggregation. + # @param strict [Boolean] when false (default), the match is **inclusive**: + # it ALSO returns publicly-readable rows (`_rperm` contains `"*"`) and + # rows with a missing `_rperm` (public by absence), because those are + # genuinely readable by the principal. This is access-simulation + # semantics ("what can this principal read"). Pass `strict: true` for an + # **exact** match — only rows whose `_rperm` literally contains one of + # the resolved permissions, with no public/missing rows — which is what + # an ownership or security audit wants ("which rows explicitly grant + # this principal"). Equivalent to the `:ACL.readable_by_exact` operator. # @return [Parse::Query] returns self for method chaining # @note This uses MongoDB aggregation pipeline because Parse Server restricts # direct queries on internal ACL fields (_rperm/_wperm). 
# @example - # Song.query.readable_by("user123") # Objects readable by user ID - # Song.query.readable_by("role:Admin") # Objects readable by Admin role - # Song.query.readable_by(current_user) # Objects readable by user object - # Song.query.readable_by("public") # Publicly readable objects - # Song.query.readable_by("none") # Objects with no read permissions - # Song.query.readable_by([]) # Objects with no read permissions (empty ACL) - # Song.query.readable_by([], mongo_direct: true) # Force MongoDB direct query - def readable_by(permission, mongo_direct: nil) + # Song.query.readable_by("user123") # readable by user ID (+ public) + # Song.query.readable_by("role:Admin") # readable by Admin role (+ public) + # Song.query.readable_by(current_user) # by user object, roles expanded (+ public) + # Song.query.readable_by(:public) # publicly readable objects + # Song.query.readable_by("none") # objects with no read permissions + # Song.query.readable_by([]) # objects with no read permissions (empty ACL) + # Song.query.readable_by("role:Admin", strict: true) # ONLY rows that explicitly grant Admin + def readable_by(permission, mongo_direct: nil, strict: false) @acl_query_mongo_direct = mongo_direct unless mongo_direct.nil? - where(:ACL.readable_by => permission) + where((strict ? :ACL.readable_by_exact : :ACL.readable_by) => permission) self end @@ -5478,14 +5616,16 @@ def readable_by(permission, mongo_direct: nil) # # @param role_name [Parse::Role, String, Array] the role name(s) to check # @param mongo_direct [Boolean] if true, forces MongoDB direct query. + # @param strict [Boolean] when true, exact match only — no implicit public + # `"*"` and no missing-`_rperm` rows. See {#readable_by}. 
# @return [Parse::Query] returns self for method chaining # @example # Song.query.readable_by_role("Admin") # Objects readable by Admin role # Song.query.readable_by_role(["Admin", "Editor"]) # Objects readable by Admin or Editor # Song.query.readable_by_role(admin_role) # Objects readable by Role object - def readable_by_role(role_name, mongo_direct: nil) + def readable_by_role(role_name, mongo_direct: nil, strict: false) @acl_query_mongo_direct = mongo_direct unless mongo_direct.nil? - where(:ACL.readable_by_role => role_name) + where((strict ? :ACL.readable_by_role_exact : :ACL.readable_by_role) => role_name) self end @@ -5493,23 +5633,27 @@ def readable_by_role(role_name, mongo_direct: nil) # Strings are used as-is (user IDs or "role:RoleName" format). # Use "public" for public access, "none" or [] for no write permissions. # - # @param permission [Parse::User, Parse::Role, String, Array] the permission to check + # @param permission [Parse::User, Parse::Role, Parse::Pointer, String, Symbol, Array] + # the permission to check. See {#readable_by} for value coercion and + # role expansion. # @param mongo_direct [Boolean] if true, forces MongoDB direct query. If nil (default), # auto-detects based on query complexity. Set to false to force Parse Server aggregation. + # @param strict [Boolean] when true, exact match only — no implicit public + # `"*"` and no missing-`_wperm` rows. See {#readable_by}. # @return [Parse::Query] returns self for method chaining # @note This uses MongoDB aggregation pipeline because Parse Server restricts # direct queries on internal ACL fields (_rperm/_wperm). 
# @example - # Song.query.writable_by("user123") # Objects writable by user ID - # Song.query.writable_by("role:Admin") # Objects writable by Admin role - # Song.query.writable_by(current_user) # Objects writable by user object - # Song.query.writable_by("public") # Publicly writable objects - # Song.query.writable_by("none") # Objects with no write permissions - # Song.query.writable_by([]) # Objects with no write permissions (empty ACL) - # Song.query.writable_by([], mongo_direct: true) # Force MongoDB direct query - def writable_by(permission, mongo_direct: nil) + # Song.query.writable_by("user123") # writable by user ID (+ public) + # Song.query.writable_by("role:Admin") # writable by Admin role (+ public) + # Song.query.writable_by(current_user) # by user object, roles expanded (+ public) + # Song.query.writable_by(:public) # Publicly writable objects + # Song.query.writable_by("none") # objects with no write permissions + # Song.query.writable_by([]) # objects with no write permissions (empty ACL) + # Song.query.writable_by("role:Admin", strict: true) # ONLY rows that explicitly grant Admin + def writable_by(permission, mongo_direct: nil, strict: false) @acl_query_mongo_direct = mongo_direct unless mongo_direct.nil? - where(:ACL.writable_by => permission) + where((strict ? :ACL.writable_by_exact : :ACL.writable_by) => permission) self end @@ -5517,14 +5661,16 @@ def writable_by(permission, mongo_direct: nil) # # @param role_name [Parse::Role, String, Array] the role name(s) to check # @param mongo_direct [Boolean] if true, forces MongoDB direct query. + # @param strict [Boolean] when true, exact match only — no implicit public + # `"*"` and no missing-`_wperm` rows. See {#readable_by}. 
# @return [Parse::Query] returns self for method chaining # @example # Song.query.writable_by_role("Admin") # Objects writable by Admin role # Song.query.writable_by_role(["Admin", "Editor"]) # Objects writable by Admin or Editor # Song.query.writable_by_role(admin_role) # Objects writable by Role object - def writable_by_role(role_name, mongo_direct: nil) + def writable_by_role(role_name, mongo_direct: nil, strict: false) @acl_query_mongo_direct = mongo_direct unless mongo_direct.nil? - where(:ACL.writable_by_role => role_name) + where((strict ? :ACL.writable_by_role_exact : :ACL.writable_by_role) => role_name) self end @@ -5599,6 +5745,38 @@ def private_acl(mongo_direct: nil) alias_method :master_key_only, :private_acl + # Find objects that are NOT readable by the given principal — i.e. hidden + # from them. Excludes rows readable by the principal directly, via any role + # they inherit, OR publicly (a public row is readable by everyone), and + # excludes rows with a missing `_rperm` (public by absence). + # + # @param permission [Parse::User, Parse::Role, Parse::Pointer, String, Symbol, Array] + # the principal to hide from. See {#readable_by} for value coercion. + # @param mongo_direct [Boolean] if true, forces MongoDB direct query. + # @return [Parse::Query] returns self for method chaining + # @example + # Song.query.not_readable_by(current_user).results # hidden from this user + def not_readable_by(permission, mongo_direct: nil) + @acl_query_mongo_direct = mongo_direct unless mongo_direct.nil? + where(:ACL.not_readable_by => permission) + self + end + + # Find objects that are NOT writable by the given principal. See + # {#not_readable_by} for the exclusion semantics (direct, role, public). + # + # @param permission [Parse::User, Parse::Role, Parse::Pointer, String, Symbol, Array] + # the principal to exclude. See {#readable_by} for value coercion. + # @param mongo_direct [Boolean] if true, forces MongoDB direct query. 
+ # @return [Parse::Query] returns self for method chaining + # @example + # Song.query.not_writable_by("role:Admin").results + def not_writable_by(permission, mongo_direct: nil) + @acl_query_mongo_direct = mongo_direct unless mongo_direct.nil? + where(:ACL.not_writable_by => permission) + self + end + # Find objects that are NOT publicly readable. # Matches objects where _rperm does NOT contain "*". # @@ -5728,8 +5906,19 @@ class Aggregation # aggregate endpoint (PS 9.9.0+). Has no effect on the mongo-direct path. # @param raw_field_names [Boolean] when true, passes +rawFieldNames: true+ to the Parse Server # REST aggregate endpoint (PS 9.9.0+). Has no effect on the mongo-direct path. + # @param allow_internal_fields [Boolean] when true, the mongo-direct path + # forwards +allow_internal_fields: true+ to {Parse::MongoDB.aggregate} so + # SDK-built ACL `$match` stages that legitimately reference +_rperm+ / + # +_wperm+ (emitted by {Parse::Query#readable_by}, +#publicly_readable+, + # and friends) pass the pipeline-security internal-fields denylist — + # matching the parity already held by +results_direct+ / +count_direct+ / + # +distinct_direct+. Set +true+ ONLY when this Aggregation's pipeline was + # built entirely from SDK constraint translation (no caller-supplied + # stages); the credential-field guard (`_hashed_password`, session tokens, + # auth data) is what +allow_internal_fields+ relaxes, so it must never be + # set on a pipeline that interpolates user input. Defaults to +false+. 
def initialize(query, pipeline, verbose: nil, mongo_direct: false, max_time_ms: nil, - raw_values: false, raw_field_names: false) + raw_values: false, raw_field_names: false, allow_internal_fields: false) @query = query @pipeline = pipeline @cached_response = nil @@ -5737,6 +5926,7 @@ def initialize(query, pipeline, verbose: nil, mongo_direct: false, max_time_ms: @max_time_ms = max_time_ms @raw_values = raw_values @raw_field_names = raw_field_names + @allow_internal_fields = allow_internal_fields # Use provided verbose setting, or fall back to query's verbose_aggregate setting @verbose = verbose.nil? ? @query.instance_variable_get(:@verbose_aggregate) : verbose end @@ -5789,7 +5979,8 @@ def execute_direct!(max_time_ms: @max_time_ms) # honors it on the mongo-direct path too (parity with results_direct / # count_direct / distinct_direct). hint = @query.instance_variable_get(:@hint) - Parse::MongoDB.aggregate(table, @pipeline, max_time_ms: max_time_ms, hint: hint, **auth_kwargs) + Parse::MongoDB.aggregate(table, @pipeline, max_time_ms: max_time_ms, hint: hint, + allow_internal_fields: @allow_internal_fields, **auth_kwargs) end # Returns processed results from the aggregation. diff --git a/lib/parse/query/constraint.rb b/lib/parse/query/constraint.rb index 72c710b..4d98109 100644 --- a/lib/parse/query/constraint.rb +++ b/lib/parse/query/constraint.rb @@ -191,6 +191,28 @@ def formatted_value self.class.formatted_value(@value) end + # Supports the opt-in `{ value:, unicode: true }` form accepted by the + # regex-based constraints ({RegularExpressionConstraint}, + # {StartsWithConstraint}, {ContainsConstraint}, {EndsWithConstraint}). + # When the `unicode` flag is set, the constraint adds the `u` flag to the + # compiled `$options`, asking the backend to treat the pattern and subject + # as UTF-8 for correct multibyte (e.g. accented or CJK) case-insensitive + # matching. 
+ # + # The `u` flag is only honored by Parse Server 8.3.0+ over REST (older + # servers reject it) and by MongoDB 6.1+ on the mongo-direct path; it is + # therefore strictly opt-in and never emitted for the bare-value form. + # + # @param raw [Object] the raw constraint value (`@value`). + # @return [Array(Object, Boolean)] the unwrapped value and the unicode flag. + # @api private + def regex_unicode_option(raw) + return [raw, false] unless raw.is_a?(Hash) + + opts = raw.symbolize_keys + [opts[:value], opts[:unicode] ? true : false] + end + # Registers the default constraint of equality register :eq, Constraint precedence 100 diff --git a/lib/parse/query/constraints.rb b/lib/parse/query/constraints.rb index 194034d..ea043d0 100644 --- a/lib/parse/query/constraints.rb +++ b/lib/parse/query/constraints.rb @@ -1638,6 +1638,12 @@ def build # q.where :field.like => /ruby_regex/i # :name.like => /Bob/i # + # # Opt into Unicode-aware matching (Parse Server 8.3.0+ over REST, + # # MongoDB 6.1+ mongo-direct). The hash form compiles to the explicit + # # $regex/$options shape and adds the `u` flag: + # q.where :name.like => { value: /café/i, unicode: true } + # # Generates: "name": { "$regex": "café", "$options": "iu" } + # class RegularExpressionConstraint < Constraint # Requires that a key's value match a regular expression. # Includes security validation to prevent ReDoS attacks. @@ -1659,6 +1665,24 @@ class RegularExpressionConstraint < Constraint # @raise [ArgumentError] if the pattern is potentially dangerous (ReDoS) # @return [Hash] the compiled constraint def build + # Opt-in `{ value:, unicode: true }` form. Unlike the bare form (which + # stringifies a Ruby Regexp to its inline-flag source, e.g. + # "(?i-mx:Bob)"), this compiles to the explicit $regex/$options shape so + # the `u` flag can be appended for Unicode-aware matching. + if @value.is_a?(Hash) + raw, unicode = regex_unicode_option(@value) + pattern_str = raw.is_a?(Regexp) ? 
raw.source : raw.to_s + options = +"" + options << "i" if raw.is_a?(Regexp) && raw.casefold? + options << "u" if unicode + + Parse::RegexSecurity.validate!(pattern_str) + + return options.empty? ? + { @operation.operand => { key => pattern_str } } : + { @operation.operand => { key => pattern_str, :$options => options } } + end + value = formatted_value pattern_str = value.is_a?(Regexp) ? value.source : value.to_s options = value.is_a?(Regexp) && value.casefold? ? "i" : nil @@ -2322,6 +2346,11 @@ def build # User.where(:name.starts_with => "John") # # Generates: "name": { "$regex": "^John", "$options": "i" } # + # # Opt into Unicode-aware case-insensitive matching (Parse Server 8.3.0+ + # # over REST, MongoDB 6.1+ mongo-direct): + # User.where(:name.starts_with => { value: "café", unicode: true }) + # # Generates: "name": { "$regex": "^café", "$options": "iu" } + # class StartsWithConstraint < Constraint # @!method starts_with # A registered method on a symbol to create the constraint. Maps to Parse operator "$regex". @@ -2333,7 +2362,8 @@ class StartsWithConstraint < Constraint # @return [Hash] the compiled constraint. def build - value = formatted_value + raw, unicode = regex_unicode_option(@value) + value = self.class.formatted_value(raw) unless value.is_a?(String) raise ArgumentError, "#{self.class}: Value must be a string for starts_with constraint" end @@ -2347,7 +2377,7 @@ def build escaped_value = Regexp.escape(value) regex_pattern = "^#{escaped_value}" - { @operation.operand => { :$regex => regex_pattern, :$options => "i" } } + { @operation.operand => { :$regex => regex_pattern, :$options => (unicode ? 
"iu" : "i") } } end end @@ -2358,6 +2388,11 @@ def build # Post.where(:title.contains => "parse") # # Generates: "title": { "$regex": ".*parse.*", "$options": "i" } # + # # Opt into Unicode-aware case-insensitive matching (Parse Server 8.3.0+ + # # over REST, MongoDB 6.1+ mongo-direct): + # Post.where(:title.contains => { value: "café", unicode: true }) + # # Generates: "title": { "$regex": ".*café.*", "$options": "iu" } + # class ContainsConstraint < Constraint # @!method contains # A registered method on a symbol to create the constraint. Maps to Parse operator "$regex". @@ -2369,7 +2404,8 @@ class ContainsConstraint < Constraint # @return [Hash] the compiled constraint. def build - value = formatted_value + raw, unicode = regex_unicode_option(@value) + value = self.class.formatted_value(raw) unless value.is_a?(String) raise ArgumentError, "#{self.class}: Value must be a string for contains constraint" end @@ -2383,7 +2419,7 @@ def build escaped_value = Regexp.escape(value) regex_pattern = ".*#{escaped_value}.*" - { @operation.operand => { :$regex => regex_pattern, :$options => "i" } } + { @operation.operand => { :$regex => regex_pattern, :$options => (unicode ? "iu" : "i") } } end end @@ -2394,6 +2430,11 @@ def build # File.where(:name.ends_with => ".pdf") # # Generates: "name": { "$regex": "\\.pdf$", "$options": "i" } # + # # Opt into Unicode-aware case-insensitive matching (Parse Server 8.3.0+ + # # over REST, MongoDB 6.1+ mongo-direct): + # Post.where(:title.ends_with => { value: "café", unicode: true }) + # # Generates: "title": { "$regex": "café$", "$options": "iu" } + # class EndsWithConstraint < Constraint # @!method ends_with # A registered method on a symbol to create the constraint. Maps to Parse operator "$regex". @@ -2405,7 +2446,8 @@ class EndsWithConstraint < Constraint # @return [Hash] the compiled constraint. 
def build - value = formatted_value + raw, unicode = regex_unicode_option(@value) + value = self.class.formatted_value(raw) unless value.is_a?(String) raise ArgumentError, "#{self.class}: Value must be a string for ends_with constraint" end @@ -2419,7 +2461,7 @@ def build escaped_value = Regexp.escape(value) regex_pattern = "#{escaped_value}$" - { @operation.operand => { :$regex => regex_pattern, :$options => "i" } } + { @operation.operand => { :$regex => regex_pattern, :$options => (unicode ? "iu" : "i") } } end end @@ -2535,15 +2577,41 @@ def collect(value) permissions_for_pointer(value) elsif value.is_a?(Array) value.flat_map { |item| collect_array_item(item) } + elsif value.is_a?(Symbol) + [symbol_permission(value)] elsif value.is_a?(String) - [value == "public" ? "*" : value] + [normalize_string_permission(value)] else raise ArgumentError, "ACL permission value must be a Parse::User, Parse::Role, " \ - "Parse::Pointer, String, or Array of these (got #{value.class})" + "Parse::Pointer, String, Symbol (:public/:everyone/:world), or " \ + "Array of these (got #{value.class})" + end + end + + # @!visibility private + # Map a Symbol permission (:public / :everyone / :world) to the "*" + # wildcard. Any other Symbol RAISES rather than silently mapping to a + # bogus key — a mistyped permission must not quietly weaken the filter. + def symbol_permission(sym) + case sym + when :public, :everyone, :world then "*" + else + raise ArgumentError, + "Unsupported ACL permission Symbol #{sym.inspect}. Use " \ + ":public / :everyone / :world for public access, or pass a " \ + "role name as a String or a Parse::Role." end end + # @!visibility private + # Normalize a String permission: the sentinel "public" maps to the + # "*" wildcard; every other String is an exact permission key + # (user objectId or "role:") used verbatim. + def normalize_string_permission(str) + str == "public" ? 
"*" : str + end + # Expand a +:ACL.readable_by_role+ / +:ACL.writable_by_role+ value # into a permission-string array. Differs from {.collect} by # auto-prefixing bare strings with +"role:"+ and refusing @@ -2573,10 +2641,11 @@ def collect_role_only(value) end # @!visibility private - # Array-element variant that silently skips unrecognized entries - # rather than raising, matching the pre-refactor behavior where - # the array branch tolerated a mixed bag of types and ignored - # anything it didn't understand. + # Array-element variant. An unrecognized element RAISES rather than + # being silently dropped: a mistyped permission that vanished from + # the key set would silently weaken the intended ACL filter (a + # security footgun). The Symbol :none contributes nothing (it is the + # array-element spelling of "no extra grant"). def collect_array_item(item) if item.is_a?(Parse::User) permissions_for_user(item) @@ -2584,10 +2653,15 @@ def collect_array_item(item) permissions_for_role(item) elsif item.is_a?(Parse::Pointer) permissions_for_pointer(item) + elsif item.is_a?(Symbol) + item == :none ? [] : [symbol_permission(item)] elsif item.is_a?(String) - [item == "public" ? "*" : item] + [normalize_string_permission(item)] else - [] + raise ArgumentError, + "Unsupported ACL permission element #{item.inspect} " \ + "(#{item.class}) in array. Expected a Parse::User / Parse::Role / " \ + "Parse::Pointer, a permission String, or :public/:everyone/:world." end end @@ -2619,19 +2693,61 @@ def collect_role_only_array_item(item) # @param field [String] +"_rperm"+ or +"_wperm"+. # @return [Hash] aggregation-pipeline wrapper compatible with # {Parse::Query}'s constraint-build contract. - def pipeline(permissions, field:) + # @param strict [Boolean] when true, build an EXACT match: suppress + # both the implicit public +"*"+ grant AND the missing-field + # (+$exists: false+) branch, so only rows whose +_rperm+/+_wperm+ + # literally contains one of +permissions+ match. 
Used by the + # +readable_by(..., strict: true)+ / +readable_by_exact+ surface. + def pipeline(permissions, field:, strict: false) deduped = permissions.compact.reject(&:empty?).uniq if deduped.empty? raise ArgumentError, "no valid permissions found in provided value" end predicate = if field == "_rperm" - Parse::ACL.read_predicate(deduped) + Parse::ACL.read_predicate(deduped, include_public: !strict, include_missing: !strict) else - Parse::ACL.write_predicate(deduped) + Parse::ACL.write_predicate(deduped, include_public: !strict, include_missing: !strict) end { "__aggregation_pipeline" => [{ "$match" => predicate }] } end + # @!visibility private + # Whether a +readable_by+ / +writable_by+ value expresses "no + # permissions" (master-key-only): +nil+, an empty Array, the String + # +"none"+, or the Symbol +:none+. These map to {.empty_pipeline}. + def empty_intent?(value) + return true if value.nil? + return true if value == "none" || value == :none + return true if value.is_a?(Array) && value.empty? + false + end + + # @!visibility private + # The match for "no permissions": an explicit empty array. A missing + # +_rperm+/+_wperm+ is treated by Parse Server as PUBLIC — the + # opposite of master-only — so it must NOT match here. +$eq: []+ + # already excludes a missing field (missing != []); the +$exists: + # true+ guard documents that intent. + # @param field [String] +"_rperm"+ or +"_wperm"+. + def empty_pipeline(field:) + { "__aggregation_pipeline" => [ + { "$match" => { field => { "$exists" => true, "$eq" => [] } } }, + ] } + end + + # @!visibility private + # Permission keys for a +not_readable_by+ / +not_writable_by+ value: + # the expanded grant set (user→roles, role→parent roles) PLUS the + # public +"*"+ wildcard. A public row is readable/writable by everyone, + # so it must be EXCLUDED from a "not readable/writable by X" result — + # hence +"*"+ is added to the +$nin+ set. Returns +[]+ for an + # empty-intent value (no negation constraint is applied). 
+ # @return [Array] + def collect_for_negation(value) + return [] if empty_intent?(value) + (collect(value) + ["*"]).compact.reject(&:empty?).uniq + end + # @!visibility private def permissions_for_user(user) return [] unless user.id.present? @@ -2652,7 +2768,12 @@ def permissions_for_user(user) def permissions_for_role(role) return [] unless role.respond_to?(:name) && role.name.present? begin - role.all_parent_role_names(max_depth: 5).map { |name| "role:#{name}" } + names = role.all_parent_role_names(max_depth: 5) + # The role's OWN name must always be present: an unpersisted role + # (id still nil) yields [] from the upward-inheritance walk, which + # would otherwise drop the role entirely and raise "no valid + # permissions". Self is included idempotently for persisted roles. + (Array(names) + [role.name]).uniq.map { |name| "role:#{name}" } rescue ["role:#{role.name}"] end @@ -2709,23 +2830,41 @@ class ACLReadableByConstraint < Constraint # @return [ACLReadableByConstraint] register :readable_by + # @return [Boolean] whether to compile an EXACT match (suppress the + # implicit public +"*"+ grant and the missing-field branch). + # Overridden by {ACLReadableByExactConstraint}. + def strict? + false + end + # @return [Hash] the compiled constraint using _rperm field. def build # Use @value directly to preserve type information before # formatted_value converts to pointers. value = @value - # Special case: "none" matches objects whose _rperm is an empty - # array — master-key-only documents. Parse Server writes [] - # when no read permission is set, and an absent _rperm is - # treated as public (handled by the default predicate path). - if value.is_a?(String) && value == "none" - pipeline = [{ "$match" => { "_rperm" => { "$eq" => [] } } }] - return { "__aggregation_pipeline" => pipeline } - end + # "No permissions" intent (nil / [] / "none" / :none) matches + # objects whose _rperm is an explicit empty array — master-key-only + # documents. 
A missing _rperm is public (the opposite of "none"), so + # {ACLPermissions.empty_pipeline} deliberately does NOT match it. + return ACLPermissions.empty_pipeline(field: "_rperm") if ACLPermissions.empty_intent?(value) permissions = ACLPermissions.collect(value) - ACLPermissions.pipeline(permissions, field: "_rperm") + ACLPermissions.pipeline(permissions, field: "_rperm", strict: strict?) + end + end + + # Strict variant of {ACLReadableByConstraint}: matches ONLY rows whose + # +_rperm+ literally contains one of the resolved permissions — no + # implicit public +"*"+ and no missing-+_rperm+ (public-by-absence) rows. + # Reached via +Query#readable_by(value, strict: true)+ or the + # +:ACL.readable_by_exact+ symbol operator. Use this for ownership / + # security audits ("which rows explicitly grant this principal") rather + # than access simulation ("what can this principal read"). + class ACLReadableByExactConstraint < ACLReadableByConstraint + register :readable_by_exact + def strict? + true end end @@ -2748,10 +2887,25 @@ class ACLReadableByRoleConstraint < Constraint # @return [ACLReadableByRoleConstraint] register :readable_by_role + # @return [Boolean] whether to compile an EXACT match. Overridden by + # {ACLReadableByRoleExactConstraint}. + def strict? + false + end + # @return [Hash] the compiled constraint using _rperm field. def build permissions = ACLPermissions.collect_role_only(@value) - ACLPermissions.pipeline(permissions, field: "_rperm") + ACLPermissions.pipeline(permissions, field: "_rperm", strict: strict?) + end + end + + # Strict variant of {ACLReadableByRoleConstraint}. See + # {ACLReadableByExactConstraint}. + class ACLReadableByRoleExactConstraint < ACLReadableByRoleConstraint + register :readable_by_role_exact + def strict? + true end end @@ -2777,21 +2931,33 @@ class ACLWritableByConstraint < Constraint # @return [ACLWritableByConstraint] register :writable_by + # @return [Boolean] whether to compile an EXACT match. 
Overridden by + # {ACLWritableByExactConstraint}. + def strict? + false + end + # @return [Hash] the compiled constraint using _wperm field. def build # Use @value directly to preserve type information before # formatted_value converts to pointers. value = @value - # Special case: "none" matches objects whose _wperm is an empty - # array — master-key-only documents. See {ACLReadableByConstraint#build}. - if value.is_a?(String) && value == "none" - pipeline = [{ "$match" => { "_wperm" => { "$eq" => [] } } }] - return { "__aggregation_pipeline" => pipeline } - end + # "No permissions" intent (nil / [] / "none" / :none) — see + # {ACLReadableByConstraint#build}. + return ACLPermissions.empty_pipeline(field: "_wperm") if ACLPermissions.empty_intent?(value) permissions = ACLPermissions.collect(value) - ACLPermissions.pipeline(permissions, field: "_wperm") + ACLPermissions.pipeline(permissions, field: "_wperm", strict: strict?) + end + end + + # Strict variant of {ACLWritableByConstraint}. See + # {ACLReadableByExactConstraint}. + class ACLWritableByExactConstraint < ACLWritableByConstraint + register :writable_by_exact + def strict? + true end end @@ -2814,10 +2980,25 @@ class ACLWritableByRoleConstraint < Constraint # @return [ACLWritableByRoleConstraint] register :writable_by_role + # @return [Boolean] whether to compile an EXACT match. Overridden by + # {ACLWritableByRoleExactConstraint}. + def strict? + false + end + # @return [Hash] the compiled constraint using _wperm field. def build permissions = ACLPermissions.collect_role_only(@value) - ACLPermissions.pipeline(permissions, field: "_wperm") + ACLPermissions.pipeline(permissions, field: "_wperm", strict: strict?) + end + end + + # Strict variant of {ACLWritableByRoleConstraint}. See + # {ACLReadableByExactConstraint}. + class ACLWritableByRoleExactConstraint < ACLWritableByRoleConstraint + register :writable_by_role_exact + def strict? 
+ true end end @@ -3004,179 +3185,29 @@ def format_field_name(field, is_pointer: true) end end - # Shared helper module for ACL constraint classes. - # Provides common normalization logic for converting various input types - # (User, Role, Pointer, symbols, strings) to ACL permission keys. - # @api private - module AclConstraintHelpers - private - - # Normalize various input types to ACL permission keys. - # @param value [Array, String, Symbol, Parse::User, Parse::Role, nil] - # @return [Array] normalized permission keys - # @note Returns empty array for nil, [], "none", or :none (indicating no permissions) - def normalize_acl_keys(value) - # Handle special "none" case for no permissions - return [] if value.nil? - return [] if value == "none" || value == :none - return [] if value.is_a?(Array) && value.empty? - - Array(value).map do |item| - case item - when Parse::User - item.id - when Parse::Role - "role:#{item.name}" - when Parse::Pointer - item.id - when :public, :everyone, :world - "*" - when "public", "*" - "*" - when "none", :none - nil # Will be compacted out, but array will be non-empty so won't match "no permissions" - when String - item - when Symbol - item == :public ? "*" : item.to_s - else - item.respond_to?(:id) ? item.id : item.to_s - end - end.compact.uniq - end + # @deprecated Thin alias of {ACLReadableByConstraint}. The +:readable_by+ + # operator is registered by {ACLReadableByConstraint}; this constant is + # retained only so any code referencing it keeps working. The previous + # standalone implementation (no role expansion, no implicit public + # +"*"+, divergent empty-ACL shape) has been removed — it never backed + # the +:readable_by+ operator and silently disagreed with it. + class ReadableByConstraint < ACLReadableByConstraint end - # ACL Read Permission Query Constraint - # Query objects based on read permissions using MongoDB's internal _rperm field. 
- # Parse Server restricts direct queries on _rperm, so this uses aggregation pipeline. - # - # @example Find objects with NO read permissions (master key only / private) - # Song.query.where(:acl.readable_by => []) - # - # @example Find objects readable by a specific user ID - # Song.query.where(:acl.readable_by => "userId123") - # Song.query.where(:acl.readable_by => current_user) - # - # @example Find objects readable by a role - # Song.query.where(:acl.readable_by => "role:Admin") - # - # @example Find objects with public read access - # Song.query.where(:acl.readable_by => "*") - # Song.query.where(:acl.readable_by => :public) - # - # @example Find objects readable by ANY of the specified users/roles - # Song.query.where(:acl.readable_by => [user1.id, "role:Admin", "*"]) - # - # @note This constraint uses aggregation pipeline because Parse Server - # restricts direct queries on the internal _rperm field. - class ReadableByConstraint < Constraint - include AclConstraintHelpers - - # @!method readable_by - # A registered method on a symbol to create the constraint. - # @example - # q.where :acl.readable_by => [] - # q.where :acl.readable_by => "userId" - # q.where :acl.readable_by => ["userId", "role:Admin"] - # @return [ReadableByConstraint] - # NOTE: :readable_by is already registered by ACLReadableByConstraint above. - # This class provides simplified empty ACL queries and is used internally. - - # @return [Hash] the compiled constraint using aggregation pipeline. - def build - keys = normalize_acl_keys(@value) - - if keys.empty? 
- # Empty array = no read permissions (master key only) - # Match documents where _rperm is an empty array - pipeline = [ - { - "$match" => { - "$or" => [ - { "_rperm" => { "$exists" => true, "$eq" => [] } }, - { "_rperm" => { "$exists" => false } }, - ], - }, - }, - ] - else - # Find objects readable by ANY of the specified keys - # Use $in to match if _rperm contains any of the keys - pipeline = [ - { - "$match" => { - "_rperm" => { "$in" => keys }, - }, - }, - ] - end - - { "__aggregation_pipeline" => pipeline } - end - end - - # ACL Write Permission Query Constraint - # Query objects based on write permissions using MongoDB's internal _wperm field. - # Parse Server restricts direct queries on _wperm, so this uses aggregation pipeline. - # - # @example Find objects with NO write permissions (master key only / read-only) - # Song.query.where(:acl.writeable_by => []) - # - # @example Find objects writable by a specific user ID - # Song.query.where(:acl.writeable_by => "userId123") - # Song.query.where(:acl.writeable_by => current_user) - # - # @example Find objects writable by a role - # Song.query.where(:acl.writeable_by => "role:Admin") - # - # @note This constraint uses aggregation pipeline because Parse Server - # restricts direct queries on the internal _wperm field. - class WriteableByConstraint < Constraint - include AclConstraintHelpers - - # @!method writeable_by - # A registered method on a symbol to create the constraint. - # @example - # q.where :acl.writeable_by => [] - # q.where :acl.writeable_by => "userId" - # @return [WriteableByConstraint] + # @deprecated Alias of {ACLWritableByConstraint}. The British-spelled + # +:writeable_by+ operator now resolves to the SAME public-inclusive, + # role-expanding implementation as +:writable_by+ — previously it was a + # separate, strict, non-expanding constraint, so the one-letter spelling + # difference silently changed query semantics. 
For the old exact-match + # behavior (no implicit public, no role expansion, no missing-field), + # use +readable_by(..., strict: true)+ / +writable_by(..., strict: true)+ + # or the +:writable_by_exact+ operator. + class WriteableByConstraint < ACLWritableByConstraint register :writeable_by - - # @return [Hash] the compiled constraint using aggregation pipeline. - def build - keys = normalize_acl_keys(@value) - - if keys.empty? - # Empty array = no write permissions (master key only) - pipeline = [ - { - "$match" => { - "$or" => [ - { "_wperm" => { "$exists" => true, "$eq" => [] } }, - { "_wperm" => { "$exists" => false } }, - ], - }, - }, - ] - else - # Find objects writable by ANY of the specified keys - pipeline = [ - { - "$match" => { - "_wperm" => { "$in" => keys }, - }, - }, - ] - end - - { "__aggregation_pipeline" => pipeline } - end end - # Alias for writeable_by (American spelling) - # NOTE: :writable_by is already registered by ACLWritableByConstraint above. - # This class provides simplified empty ACL queries and is used internally. - class WritableByConstraint < WriteableByConstraint + # @deprecated Alias of {ACLWritableByConstraint}; see {WriteableByConstraint}. + class WritableByConstraint < ACLWritableByConstraint end # ACL NOT Readable By Constraint @@ -3190,22 +3221,29 @@ class WritableByConstraint < WriteableByConstraint # Song.query.where(:acl.not_readable_by => "*") # Song.query.where(:acl.not_readable_by => :public) # + # @note "Not readable by X" excludes rows readable by X *directly*, *via + # any role X inherits*, AND *publicly* — so a User value expands its + # roles and the public +"*"+ is always added to the exclusion set. # @note This constraint uses aggregation pipeline because Parse Server # restricts direct queries on the internal _rperm field. 
class NotReadableByConstraint < Constraint - include AclConstraintHelpers - register :not_readable_by def build - keys = normalize_acl_keys(@value) + keys = ACLPermissions.collect_for_negation(@value) return { "__aggregation_pipeline" => [] } if keys.empty? - # Find objects where _rperm does NOT contain any of the keys + # Find objects whose _rperm EXISTS and does NOT contain any of the + # keys. The `$exists: true` guard is essential: Parse Server treats a + # missing `_rperm` as publicly readable, and MongoDB's `$nin` matches + # documents where the field is absent. Without the guard, + # `not_readable_by("*")` (i.e. #not_publicly_readable) would MATCH the + # public-by-absence rows it is meant to exclude — inverting the result + # and giving a security audit a false sense of safety. pipeline = [ { "$match" => { - "_rperm" => { "$nin" => keys }, + "_rperm" => { "$exists" => true, "$nin" => keys }, }, }, ] @@ -3223,18 +3261,20 @@ def build # @note This constraint uses aggregation pipeline because Parse Server # restricts direct queries on the internal _wperm field. class NotWriteableByConstraint < Constraint - include AclConstraintHelpers - register :not_writeable_by def build - keys = normalize_acl_keys(@value) + keys = ACLPermissions.collect_for_negation(@value) return { "__aggregation_pipeline" => [] } if keys.empty? + # See {NotReadableByConstraint#build}: the `$exists: true` guard + # prevents a missing `_wperm` (publicly writable per Parse Server) + # from matching `$nin`, which would otherwise make + # #not_publicly_writable report write-exposed objects as safe. pipeline = [ { "$match" => { - "_wperm" => { "$nin" => keys }, + "_wperm" => { "$exists" => true, "$nin" => keys }, }, }, ] @@ -3265,45 +3305,26 @@ class PrivateAclConstraint < Constraint register :master_key_only def build - is_private = @value == true + # A truly private (master-key-only) object has an EXPLICIT empty + # _rperm AND an explicit empty _wperm. 
A MISSING _rperm/_wperm is + # treated by Parse Server as PUBLIC — the opposite of private — so + # the `$exists: true` guards are required and the missing-field + # branch must NOT be matched (this is the bug-fixed shape: the + # previous version OR'd in `{$exists: false}`, wrongly classifying + # the most-public rows as private). + private_match = { + "$and" => [ + { "_rperm" => { "$exists" => true, "$eq" => [] } }, + { "_wperm" => { "$exists" => true, "$eq" => [] } }, + ], + } - if is_private - # Match objects with empty or missing _rperm AND _wperm - pipeline = [ - { - "$match" => { - "$and" => [ - { - "$or" => [ - { "_rperm" => { "$exists" => true, "$eq" => [] } }, - { "_rperm" => { "$exists" => false } }, - ], - }, - { - "$or" => [ - { "_wperm" => { "$exists" => true, "$eq" => [] } }, - { "_wperm" => { "$exists" => false } }, - ], - }, - ], - }, - }, - ] - else - # Match objects that have SOME permissions (either read or write) - pipeline = [ - { - "$match" => { - "$or" => [ - { "_rperm" => { "$exists" => true, "$ne" => [] } }, - { "_wperm" => { "$exists" => true, "$ne" => [] } }, - ], - }, - }, - ] - end + # `private_acl => false` is the exact complement: every object that + # is NOT fully master-key-only — those with any read/write grant AND + # those with a missing (public) _rperm/_wperm. + match = @value == true ? 
private_match : { "$nor" => [private_match] } - { "__aggregation_pipeline" => pipeline } + { "__aggregation_pipeline" => [{ "$match" => match }] } end end end diff --git a/lib/parse/retrieval/agent_tool.rb b/lib/parse/retrieval/agent_tool.rb index 59caf9d..aba5a7d 100644 --- a/lib/parse/retrieval/agent_tool.rb +++ b/lib/parse/retrieval/agent_tool.rb @@ -108,20 +108,27 @@ def semantic_search(agent, class_name: nil, query: nil, k: DEFAULT_K, score_quantize = (agent.permissions != :admin) vector_field = Parse::Agent::MetadataRegistry.searchable_field(cname) - chunks = Parse::Retrieval.retrieve( - query: query, - klass: klass, - field: vector_field, - text_field: resolved_text_field, - k: clamp_k(k), - filter: filter, - vector_filter: vector_filter, - chunker: build_chunker(chunk_size, chunk_overlap, chunk_by, max_chunks_per_document), - tenant_scope: scope, - score_quantize: score_quantize, - source_transform: source_projector(agent, cname, scope), - **agent.acl_scope_kwargs, - ) + # with_precharged: the cap was charged above with per-tenant + # identity (or deliberately skipped for trusted admin agents) — + # suppress the generic query-embed charge inside + # find_similar/embed_query_text! so the query isn't double-billed + # (or admin queries billed to the shared default bucket). 
+ chunks = Parse::Embeddings::SpendCap.with_precharged do + Parse::Retrieval.retrieve( + query: query, + klass: klass, + field: vector_field, + text_field: resolved_text_field, + k: clamp_k(k), + filter: filter, + vector_filter: vector_filter, + chunker: build_chunker(chunk_size, chunk_overlap, chunk_by, max_chunks_per_document), + tenant_scope: scope, + score_quantize: score_quantize, + source_transform: source_projector(agent, cname, scope), + **agent.acl_scope_kwargs, + ) + end # Token budget (B4): trim the score-ordered chunk list before # building the envelope so `documents` only carries parents whose diff --git a/lib/parse/retrieval/retriever.rb b/lib/parse/retrieval/retriever.rb index f5178b0..fa9bba6 100644 --- a/lib/parse/retrieval/retriever.rb +++ b/lib/parse/retrieval/retriever.rb @@ -70,6 +70,84 @@ def assert_no_underscore_keys!(obj, path = []) obj end + # Translate Parse pointer VALUES in a caller-supplied filter into + # their MongoDB storage form so they actually match raw documents. + # `{ owner: <Parse::Pointer to _User "abc"> }` becomes + # `{ "_p_owner" => "_User$abc" }` — pointer columns are stored under + # a `_p_` prefix with `"<className>$<objectId>"` string values, so a + # Parse-side pointer (a `{__type: "Pointer", ...}` hash on the wire, + # or a `Parse::Pointer` / `Parse::Object` instance from Ruby + # callers) in a `$match` / `$vectorSearch.filter` would otherwise + # never match anything. + # + # Recognized pointer values: + # * `Parse::Pointer` / `Parse::Object` instances, + # * `{ "__type" => "Pointer", "className" => ..., "objectId" => ... }` + # hashes (symbol or string keys). + # + # Translation applies to direct values, and to pointer values inside + # one level of operator hashes (`{ owner: { "$in" => [ptr, ptr] } }`, + # `$eq` / `$ne` / `$nin`). Non-pointer values and unrecognized keys + # pass through untouched, so the call is idempotent. 
+ # + # SECURITY ORDERING: run this AFTER {.assert_no_underscore_keys!} / + # the agent filter-field allowlist (callers may not name `_p_*` + # columns directly) and BEFORE the tenant-scope fold. + # + # @param klass [Class] the Parse::Object subclass (for field_map + # wire-name resolution). + # @param filter [Hash, nil] caller filter. + # @return [Hash, nil] translated copy (an equal copy when nothing + # needed translation); non-Hash input, including nil, is returned as-is. + def translate_pointer_filter_values(klass, filter) + return filter unless filter.is_a?(Hash) + out = {} + filter.each do |key, value| + if (storage = pointer_storage_value(value)) + out["_p_#{wire_name(klass, key)}"] = storage + elsif value.is_a?(Hash) && value.keys.any? { |op| op.to_s.start_with?("$") } + translated = value.transform_values do |opval| + if (s = pointer_storage_value(opval)) + s + elsif opval.is_a?(Array) + opval.map { |el| pointer_storage_value(el) || el } + else + opval + end + end + if translated == value + out[key] = value + else + out["_p_#{wire_name(klass, key)}"] = translated + end + else + out[key] = value + end + end + out + end + + # @!visibility private + # `"<className>$<objectId>"` storage string for a pointer-shaped + # value, or nil when the value is not a pointer. + def pointer_storage_value(value) + if defined?(Parse::Pointer) && value.is_a?(Parse::Pointer) + cname = value.parse_class + oid = value.id + return nil if cname.to_s.empty? || oid.to_s.empty? + return "#{cname}$#{oid}" + end + if value.is_a?(Hash) + type = value["__type"] || value[:__type] + return nil unless type.to_s == "Pointer" + cname = value["className"] || value[:className] + oid = value["objectId"] || value[:objectId] + return nil if cname.to_s.empty? || oid.to_s.empty? + return "#{cname}$#{oid}" + end + nil + end + # Retrieve and chunk documents semantically similar to `query`. # # @param query [String] natural-language query. 
@@ -136,6 +214,12 @@ def retrieve(query:, klass: nil, field: nil, text_field: nil, k: 10, end resolved_text_field = (text_field || infer_text_field!(klass)).to_sym + # Pointer-value translation runs BEFORE the tenant-scope fold (the + # fold's conflict check must see final storage-form keys) and after + # any caller-side underscore-key gate (the agent tool walks the raw + # filter before calling retrieve). + filter = translate_pointer_filter_values(klass, filter) + vector_filter = translate_pointer_filter_values(klass, vector_filter) merged_vector_filter = fold_tenant_scope(klass, vector_filter, tenant_scope) chunker ||= default_chunker text_wire = wire_name(klass, resolved_text_field) diff --git a/lib/parse/schema/search_index_migrator.rb b/lib/parse/schema/search_index_migrator.rb index 82513e2..47137a4 100644 --- a/lib/parse/schema/search_index_migrator.rb +++ b/lib/parse/schema/search_index_migrator.rb @@ -67,7 +67,7 @@ def collection_name def plan coll = collection_name existing, available = fetch_existing_indexes(coll) - declared = @model_class.mongo_search_index_declarations + declared = @model_class.mongo_search_index_declarations.map { |d| effective_declaration(d) } existing_by_name = existing.each_with_object({}) do |idx, h| name = (idx["name"] || idx[:name]).to_s @@ -189,6 +189,53 @@ def apply!(update: false, drop: false, wait: false, timeout: 600) private + # Augment a `vectorSearch` declaration with the model's registered + # `agent_tenant_scope` field as a `type: "filter"` path when the + # declaration doesn't already carry it. Tenant-scoped retrieval + # folds `{ scope_field => scope_value }` into `$vectorSearch.filter` + # (see Parse::Retrieval.retrieve) — Atlas rejects a pre-filter on + # any path not declared `type: "filter"` in the index, so an index + # created without the scope path fails every scoped query at + # runtime. 
Auto-including it here means `apply!` creates correct + # indexes by default, and pre-existing indexes lacking the path + # surface as `drifted:` in the plan instead of failing silently. + # + # Lexical (`type: "search"`) declarations pass through untouched. + def effective_declaration(decl) + return decl unless decl[:type] == "vectorSearch" + scope_path = tenant_scope_filter_path + return decl if scope_path.nil? + defn = decl[:definition] + return decl unless defn.is_a?(Hash) + fields_key = defn.key?("fields") ? "fields" : :fields + fields = defn[fields_key] + return decl unless fields.is_a?(Array) + covered = fields.any? do |f| + next false unless f.is_a?(Hash) + (f["type"] || f[:type]).to_s == "filter" && + (f["path"] || f[:path]).to_s == scope_path + end + return decl if covered + augmented = defn.dup + augmented[fields_key] = fields + [{ "type" => "filter", "path" => scope_path }] + decl.merge(definition: augmented) + end + + # Wire/storage path of the model's registered tenant-scope field, + # or nil when no `agent_tenant_scope` is declared (or the agent + # layer isn't loaded). Mirrors the wire-name resolution + # Parse::Retrieval uses when folding the scope into the filter. + def tenant_scope_filter_path + return nil unless defined?(Parse::Agent::MetadataRegistry) + rule = Parse::Agent::MetadataRegistry.tenant_scope_rule(collection_name) + return nil unless rule + sym = rule[:field].to_sym + fmap = @model_class.respond_to?(:field_map) ? @model_class.field_map : {} + (fmap[sym] || sym.to_s.columnize).to_s + rescue StandardError + nil + end + # Read existing search indexes via the IndexManager's cached path. # Returns `[indexes, available]`. `available` is false when Atlas # isn't reachable (e.g. 
running against a vanilla Mongo without diff --git a/lib/parse/stack/version.rb b/lib/parse/stack/version.rb index 1cb7056..f3c2cd1 100644 --- a/lib/parse/stack/version.rb +++ b/lib/parse/stack/version.rb @@ -6,6 +6,6 @@ module Parse # The Parse Server SDK for Ruby module Stack # The current version. - VERSION = "5.4.1" + VERSION = "5.5.0" end end diff --git a/lib/parse/vector_search.rb b/lib/parse/vector_search.rb index 15ba616..2e73ec5 100644 --- a/lib/parse/vector_search.rb +++ b/lib/parse/vector_search.rb @@ -95,7 +95,41 @@ class ConstraintNotSupported < ArgumentError; end # one. Atlas's guidance: numCandidates ≥ 10 × limit, ≤ 10_000. DEFAULT_NUM_CANDIDATES_MULTIPLIER = 20 + # Accepted {.index_drift_policy} values. + INDEX_DRIFT_POLICIES = %i[warn raise ignore].freeze + class << self + # Policy applied when first-query index verification (see + # {Parse::Core::VectorSearchable}) finds the deployed Atlas + # vectorSearch index disagreeing with the model declaration — + # wrong `numDimensions`, wrong `similarity`, or a tenant-scope + # field missing from the index's `filter` paths. + # + # * `:warn` (default) — emit a `[Parse::VectorSearch:DRIFT]` + # warning once per (class, field, index) and continue. Drift + # usually means the index predates a model change; queries + # still run but return degraded or wrongly-scoped results. + # * `:raise` — fail the query with + # {Parse::Core::VectorSearchable::IndexDriftError}. Strict mode + # for deployments that treat drift as a release blocker. + # * `:ignore` — skip verification entirely. + # + # @param value [Symbol] + # @return [Symbol] + def index_drift_policy=(value) + v = value.respond_to?(:to_sym) ? value.to_sym : nil + unless v && INDEX_DRIFT_POLICIES.include?(v) + raise ArgumentError, + "Parse::VectorSearch.index_drift_policy must be one of " \ + "#{INDEX_DRIFT_POLICIES.inspect} (got #{value.inspect})." + end + @index_drift_policy = v + end + + # @return [Symbol] current drift policy (default `:warn`). 
+ def index_drift_policy + @index_drift_policy ||= :warn + end # Low-level `$vectorSearch` entry point. # # @param collection_name [String] Parse class name / Mongo diff --git a/lib/parse/vector_search/hybrid.rb b/lib/parse/vector_search/hybrid.rb index 549791c..d11c167 100644 --- a/lib/parse/vector_search/hybrid.rb +++ b/lib/parse/vector_search/hybrid.rb @@ -372,6 +372,17 @@ def run_native(collection_name, lex, vec, oversample, k_constant:, weights:, sco if pointer_fields rows = Parse::CLPScope.filter_by_pointer_fields(rows, pointer_fields, resolution.user_id) end + # NEW-VEC-1: the `$rankFusion` meta score is materialized + # BEFORE the ACL `$match`, so a surviving row's raw + # `_hybrid_score` encodes its rank among rows the caller + # cannot read — a cross-ACL inference channel for scoped + # callers probing with crafted queries. Recompute the + # surfaced score from the POST-filter ordering (the rows are + # already sorted by the true fused score, so relative order + # is preserved); the new value is a function of visible rows + # only. The client-side RRF path is unaffected — it ranks + # from already-filtered branch results. + recompute_scores_from_visible_order!(rows, k_constant: k_constant, weights: weights) end rows.map! { |doc| Parse::PipelineSecurity.strip_internal_fields(doc) } rows @@ -443,10 +454,20 @@ def run_probe(collection_name) # recognized-but-misused `$rankFusion` (or an unrelated auth/parse # error) is treated as supported and surfaces its real error on the # actual query rather than silently disabling native fusion. + # + # Deliberately narrow (NEW-VEC-2): a broad phrase like + # "is not allowed" also appears in MongoDB authorization errors + # ("not allowed to execute command aggregate"), which combined + # with the stage name in the message would misclassify an + # auth-failing cluster and cache the wrong probe verdict for + # PROBE_CACHE_TTL. 
Only phrases that unambiguously mean + # "this stage name is unknown to the parser" belong here; any + # other failure falls through to "supported" and the real query + # surfaces the real error (with the client path as fallback). UNSUPPORTED_STAGE_FRAGMENTS = [ "unrecognized pipeline stage name", "unknown aggregation stage", - "is not allowed", + "unknown stage", ].freeze private_constant :UNSUPPORTED_STAGE_FRAGMENTS @@ -455,6 +476,23 @@ def unsupported_stage_error?(err) msg.include?("rankfusion") && UNSUPPORTED_STAGE_FRAGMENTS.any? { |f| msg.include?(f) } end + # @!visibility private + # Replace each visible row's `_hybrid_score` with an RRF-shaped + # score derived from its position AMONG VISIBLE ROWS: + # `Σ_b weight_b / (k_constant + visible_rank)`. Monotone with the + # original fused order (input is already score-sorted), but + # carries no information about how many hidden rows ranked above + # or between the visible ones. See NEW-VEC-1. + def recompute_scores_from_visible_order!(rows, k_constant:, weights:) + w = weights ? symbolize(weights) : nil + total_weight = weight_for(w, :lexical).to_f + weight_for(w, :vector).to_f + rows.each_with_index do |doc, i| + next unless doc.is_a?(Hash) + doc["_hybrid_score"] = total_weight / (k_constant + i + 1) + end + rows + end + # -- probe cache ------------------------------------------------- PROBE_MUTEX_INIT = Mutex.new diff --git a/lib/parse/webhooks.rb b/lib/parse/webhooks.rb index c3a75a5..14a6fb1 100644 --- a/lib/parse/webhooks.rb +++ b/lib/parse/webhooks.rb @@ -455,34 +455,114 @@ def call_route(type, className, payload = nil) end if type == :after_save && payload&.parse_object.present? && payload.parse_object.is_a?(Parse::Object) - # Handle after_save callbacks intelligently based on request origin. 
- # For trusted-Ruby-initiated saves (both `_RB_` header AND master - # key), Parse Stack's local `run_callbacks :save` will fire - # after_create and after_save callbacks after the REST response - # returns; firing them again here would double-fire any side - # effect (e.g. an `after_save :send_email` would send two emails - # per save). For everything else -- client-initiated saves, or a - # spoofed `_RB_` from a non-master client -- Parse Stack never had - # a chance to run callbacks, so we fire them here. + # The chained ActiveModel after_save/after_create callbacks are NOT + # fired here. `call!` dispatches every trigger twice -- once for the + # specific class route and once for the generic `"*"` route -- so + # firing the model callbacks inside this per-route block double-fired + # them for any app that registered BOTH a class route and a `"*"` + # route (e.g. an `after_save :send_email` would send two emails per + # save). The dispatch now lives in `run_after_save_chain`, which + # `call!` invokes exactly once per delivery after both route calls. # - # The decision depends ONLY on request origin, never on what the - # handler returned. Parse Server discards the afterSave response - # body entirely (it resolves {success} even if the handler throws), - # so a handler that returns the parse_object -- the recommended - # before_save pattern, easy to copy by mistake -- must NOT silently - # suppress these callbacks. We normalize the result to `true` below - # so a returned object never leaks into the response or the log. - is_new = payload.original.nil? - unless trusted_ruby_initiated - payload.parse_object.run_after_create_callbacks if is_new - payload.parse_object.run_after_save_callbacks - end + # We still normalize the result to `true` so a handler that returned + # the parse_object (the recommended before_save pattern, easy to copy + # by mistake) never leaks an object into the response or the log. 
result = true end result end + # Fires the chained ActiveModel after_save (and after_create, for a new + # object) callbacks for an afterSave delivery -- exactly once per request. + # + # This lives in `call!` rather than `call_route` because `call!` dispatches + # every trigger twice (the specific class route AND the generic `"*"` + # route). Firing the model callbacks per-route would double-fire any side + # effect for an app that registered both routes. Calling this once, after + # both route calls, fires the chain exactly once regardless of how many + # routes matched. + # + # The decision to fire depends ONLY on request origin, never on what a + # handler returned: Parse Server discards the afterSave response body + # entirely, so a handler returning the parse_object must not suppress the + # callbacks. For trusted-Ruby-initiated saves (both the `_RB_` request-id + # header AND the master key) Parse Stack's local `run_callbacks :save` + # already fires these after the REST response returns, so we skip them + # here to avoid the double-fire. The route-present guard preserves the + # "an unregistered afterSave trigger never fires model callbacks" contract + # that `call_route`'s early return used to provide. + # + # @param payload [Parse::Webhooks::Payload] the afterSave payload. + # @return [void] + def run_after_save_chain(payload) + return unless payload&.after_save? + return unless payload.parse_object.is_a?(Parse::Object) + + # Preserve the "no registered route => no model callbacks" behavior that + # call_route's `return unless routes[type][className].present?` enforced. + # Mirror that guard exactly: key on parse_class.to_s (as call_route does) + # and use `.present?` on the value -- registration stores an Array, and an + # empty/absent registration must NOT fire (matching the original). + after_save_routes = routes[:after_save] + return unless after_save_routes && + (after_save_routes[payload.parse_class.to_s].present? 
|| + after_save_routes["*"].present?) + + # Trusted-Ruby-initiated saves run their callbacks locally; firing again + # here would double them. This must match call_route's trusted_ruby_initiated + # EXACTLY. call_route runs (and stamps @ruby_initiated) before this for any + # matched route, and `ruby_initiated?` returns that stamped value instead of + # re-deriving it -- its memoization keeps a stamped `false` (a plain `||=` + # would recompute and could disagree with call_route's header lookup). + return if payload.ruby_initiated? && payload.master? == true + + # By the time afterSave fires the object is ALREADY persisted in Parse + # Server, and Parse Server discards the afterSave response body entirely + # (it resolves success even if the handler throws). So a chained callback + # that raises must not (a) 500 the webhook endpoint -- `call!`'s rescue + # only catches ResponseError / ValidationError, so a bare StandardError + # would escape -- nor (b) take out the OTHER phase's unrelated side + # effects. Run the after_create and after_save phases independently, each + # guarded, logging and swallowing any StandardError. This mirrors Parse's + # own afterSave semantics (log-and-continue on a post-persist failure): + # a raising `after_create :send_welcome_email` no longer silently skips + # an unrelated `after_save :reindex`, and neither can crash the endpoint. + obj = payload.parse_object + run_after_save_phase(obj, :after_create) if payload.original.nil? + run_after_save_phase(obj, :after_save) + nil + end + + # Runs one phase (:after_create or :after_save) of an afterSave object's + # chained ActiveModel callbacks, swallowing and logging any StandardError + # so a post-persist callback failure can't crash the webhook endpoint or + # suppress the sibling phase. ActiveModel still halts the rest of *this* + # phase's chain on a raise -- only the cross-phase / endpoint blast radius + # is contained here. 
Note this also swallows a ResponseError/ValidationError + # raised from inside an after_save callback: afterSave is post-persist and + # Parse Server discards the response body, so an `error!` there cannot deny + # the (already-committed) write -- it is logged, not propagated. + # @param obj [Parse::Object] the persisted afterSave object. + # @param phase [Symbol] :after_create or :after_save. + # @return [void] + def run_after_save_phase(obj, phase) + case phase + when :after_create then obj.run_after_create_callbacks + when :after_save then obj.run_after_save_callbacks + end + nil + rescue => e + # Redact the exception message before logging: a callback error can echo + # record contents/tokens, and the rest of this file routes log output + # through the same redactor. + warn "[Parse::Webhooks] afterSave #{phase} callback raised for " \ + "#{obj.class}##{obj.id} -- the object is already persisted; " \ + "logging and continuing: #{e.class}: " \ + "#{Parse::Middleware::BodyBuilder.redact(e.message)}" + nil + end + # Generates a success response for Parse Server. # @param data [Object] the data to send back with the success. # @return [Hash] a success data payload @@ -688,6 +768,12 @@ def call!(env) # call hooks subscribed to any class route generic_result = Parse::Webhooks.call_route(payload.trigger_name, "*", payload) result = generic_result if generic_result.present? && result.nil? + + # Fire the chained ActiveModel after_save/after_create callbacks + # exactly once per delivery -- after BOTH route calls above -- so an + # app that registers both a class route and a `"*"` route doesn't + # double-fire them. No-op for every non-afterSave trigger. + Parse::Webhooks.run_after_save_chain(payload) else if self.logging.present? 
puts "[Webhooks] --> Could not find mapping route for " \ diff --git a/lib/parse/webhooks/payload.rb b/lib/parse/webhooks/payload.rb index b0132fc..363ee04 100644 --- a/lib/parse/webhooks/payload.rb +++ b/lib/parse/webhooks/payload.rb @@ -740,7 +740,13 @@ def parse_query # callback handling based on the request origin. # @return [Boolean] true if the request originated from Ruby Parse Stack def ruby_initiated? - @ruby_initiated ||= begin + # Stable memoization: a plain `||=` re-derives whenever the stored value + # is `false`, so a previously-computed (or externally-stamped, e.g. by + # Parse::Webhooks.call_route) `false` would be recomputed on every call + # and could disagree with the stamping caller. Cache on `defined?` so a + # `false` result is memoized exactly once and never silently re-derived. + return @ruby_initiated if defined?(@ruby_initiated) && !@ruby_initiated.nil? + @ruby_initiated = begin request_id = nil if @raw.respond_to?(:[]) diff --git a/test/lib/parse/acl_constraints_unit_test.rb b/test/lib/parse/acl_constraints_unit_test.rb index 15904c9..330e952 100644 --- a/test/lib/parse/acl_constraints_unit_test.rb +++ b/test/lib/parse/acl_constraints_unit_test.rb @@ -287,7 +287,7 @@ def test_privately_readable_convenience_method expected_pipeline = [ { "$match" => { - "_rperm" => { "$eq" => [] }, + "_rperm" => { "$exists" => true, "$eq" => [] }, }, }, ] @@ -307,7 +307,7 @@ def test_privately_writable_convenience_method expected_pipeline = [ { "$match" => { - "_wperm" => { "$eq" => [] }, + "_wperm" => { "$exists" => true, "$eq" => [] }, }, }, ] @@ -326,7 +326,7 @@ def test_master_key_read_only_alias expected_pipeline = [ { "$match" => { - "_rperm" => { "$eq" => [] }, + "_rperm" => { "$exists" => true, "$eq" => [] }, }, }, ] @@ -345,7 +345,7 @@ def test_master_key_write_only_alias expected_pipeline = [ { "$match" => { - "_wperm" => { "$eq" => [] }, + "_wperm" => { "$exists" => true, "$eq" => [] }, }, }, ] @@ -395,15 +395,18 @@ def 
test_not_publicly_readable_convenience_method query.not_publicly_readable pipeline = query.pipeline + # The `$exists: true` guard is required: a missing `_rperm` is public per + # Parse Server, and `$nin` matches missing-field docs, so without the + # guard not_publicly_readable would return the public rows it must exclude. expected_pipeline = [ { "$match" => { - "_rperm" => { "$nin" => ["*"] }, + "_rperm" => { "$exists" => true, "$nin" => ["*"] }, }, }, ] - assert_equal expected_pipeline, pipeline, "not_publicly_readable should query for '*' NOT in _rperm" + assert_equal expected_pipeline, pipeline, "not_publicly_readable should query for '*' NOT in _rperm and exclude missing-_rperm (public) rows" puts "✅ not_publicly_readable generates correct pipeline" end @@ -417,12 +420,12 @@ def test_not_publicly_writable_convenience_method expected_pipeline = [ { "$match" => { - "_wperm" => { "$nin" => ["*"] }, + "_wperm" => { "$exists" => true, "$nin" => ["*"] }, }, }, ] - assert_equal expected_pipeline, pipeline, "not_publicly_writable should query for '*' NOT in _wperm" + assert_equal expected_pipeline, pipeline, "not_publicly_writable should query for '*' NOT in _wperm and exclude missing-_wperm (public) rows" puts "✅ not_publicly_writable generates correct pipeline" end @@ -550,7 +553,7 @@ def test_privately_readable_hash_key_in_where expected_pipeline = [ { "$match" => { - "_rperm" => { "$eq" => [] }, + "_rperm" => { "$exists" => true, "$eq" => [] }, }, }, ] @@ -662,4 +665,207 @@ def test_multiple_acl_convenience_methods puts "✅ Multiple ACL convenience methods work together" end + + # Regression guard for the mongo-direct ACL routing fix: the Aggregation + # built for an ACL filter (readable_by/publicly_readable/etc.) must carry + # +allow_internal_fields: true+ so the SDK-built `_rperm`/`_wperm` $match + # passes Parse::PipelineSecurity's internal-fields denylist. 
Without the + # flag, every readable_by/publicly_readable query that auto-routes through + # mongo-direct raises Parse::PipelineSecurity::Error on the _rperm reference. + def test_acl_aggregation_marks_internal_fields_allowed + puts "\n=== Testing ACL aggregation forwards allow_internal_fields ===" + require "parse/pipeline_security" + + %i[publicly_readable publicly_writable].each do |method| + query = Parse::Query.new("Post").public_send(method) + agg = query.send(:execute_aggregation_pipeline) + + assert agg.instance_variable_get(:@allow_internal_fields), + "#{method} aggregation must forward allow_internal_fields: true" + + # The pipeline must survive the exact security check the mongo-direct + # sink runs (allow_internal_fields equal to the forwarded flag). + Parse::PipelineSecurity.validate_filter!( + agg.pipeline, + allow_internal_fields: agg.instance_variable_get(:@allow_internal_fields), + ) + end + + # readable_by with an explicit permission string routes the same way. + agg = Parse::Query.new("Post").readable_by("role:Admin").send(:execute_aggregation_pipeline) + assert agg.instance_variable_get(:@allow_internal_fields), + "readable_by aggregation must forward allow_internal_fields: true" + + puts "✅ ACL aggregations forward allow_internal_fields and pass the security validator" + end + + # Guard the credential-field boundary: a plain (non-ACL) aggregation must + # NOT relax the internal-fields denylist, so user-supplied pipelines can't + # smuggle references to password hashes / session tokens through the + # mongo-direct sink. 
+ def test_non_acl_aggregation_keeps_internal_fields_guard + puts "\n=== Testing non-ACL aggregation keeps internal-fields guard ===" + + agg = Parse::Query.new("Post").where(title: "x") + .aggregate([{ "$group" => { "_id" => "$title" } }]) + + refute agg.instance_variable_get(:@allow_internal_fields), + "non-ACL aggregate must keep allow_internal_fields: false (credential guard intact)" + + puts "✅ Non-ACL aggregation keeps the internal-fields guard" + end + + # #1 regression: the scalar aggregation terminals (sum/average/min/max/ + # count_distinct/distinct) and the user-facing #aggregate all funnel through + # Query#aggregate. An ACL filter there must mark the pipeline so the + # mongo-direct sink allows the SDK-built _rperm/_wperm reference. + def test_aggregate_with_acl_filter_forwards_allow_internal_fields + puts "\n=== Testing #aggregate with ACL filter forwards allow_internal_fields ===" + + agg = Parse::Query.new("Post").publicly_readable + .aggregate([{ "$group" => { "_id" => "$genre" } }]) + + assert agg.instance_variable_get(:@allow_internal_fields), + "ACL-filtered aggregate must forward allow_internal_fields: true" + # The compiled pipeline must still carry the _rperm $match (it is not + # dropped) so the filter actually applies on whichever engine runs it. + assert agg.pipeline.to_json.include?("_rperm"), + "ACL $match must survive into the aggregate() pipeline" + + puts "✅ ACL-filtered aggregate forwards the flag and keeps the _rperm match" + end + + # #1 security regression: a scoped (scope_to_user / scope_to_role / + # session_token) aggregation terminal must NOT silently fall back to Parse + # Server's REST /aggregate endpoint, which is master-key-only and enforces + # neither ACL nor CLP. When mongo-direct is unavailable it must fail closed. 
+ def test_scoped_aggregation_terminal_fails_closed_without_mongo_direct + puts "\n=== Testing scoped aggregation fails closed without mongo-direct ===" + + skip "requires Parse::MongoDB NOT enabled for this assertion" if defined?(Parse::MongoDB) && Parse::MongoDB.enabled? + + user = Parse::User.new(objectId: "scopedUser1") + + # A scoped query with NO internal fields (pure scope bypass) must refuse + # to run a scalar aggregation over REST-as-master. + err = assert_raises(Parse::Query::MongoDirectRequired) do + Parse::Query.new("Post").where(genre: "rock").scope_to_user(user) + .aggregate([{ "$group" => { "_id" => nil, "t" => { "$sum" => "$plays" } } }]) + end + assert_match(/scoped aggregation/i, err.message) + + # An UNSCOPED ACL aggregate keeps the REST fallback (master-key correctness + # edge, not an enforcement bypass) — it must NOT raise. + Parse::Query.new("Post").publicly_readable + .aggregate([{ "$group" => { "_id" => "$genre" } }]) + + puts "✅ Scoped aggregation fails closed; unscoped keeps REST fallback" + end + + # #3/#4: empty intent ([] / nil / "none" / :none) and Symbol values must + # COMPILE at the Query level (they previously raised ArgumentError despite + # being documented), and map to the right shapes. 
+ def test_readable_by_accepts_empty_and_symbol_values + puts "\n=== Testing readable_by accepts [] / nil / :none / :public ===" + + empty = [{ "$match" => { "_rperm" => { "$exists" => true, "$eq" => [] } } }] + [[], nil, "none", :none].each do |v| + assert_equal empty, Parse::Query.new("Post").readable_by(v).pipeline, + "readable_by(#{v.inspect}) should compile to the explicit-empty match" + end + + public_shape = [{ "$match" => { "$or" => [ + { "_rperm" => { "$in" => ["*"] } }, + { "_rperm" => { "$exists" => false } }, + ] } }] + [:public, :everyone, :world, "public", "*"].each do |v| + assert_equal public_shape, Parse::Query.new("Post").readable_by(v).pipeline, + "readable_by(#{v.inspect}) should map to the public wildcard" + end + + puts "✅ readable_by accepts empty + symbol values" + end + + # #6: strict: true compiles an exact match — no implicit public, no + # missing-field branch. + def test_readable_by_strict_kwarg + puts "\n=== Testing readable_by(strict: true) ===" + + inclusive = Parse::Query.new("Post").readable_by("role:Admin").pipeline + assert inclusive.first["$match"].key?("$or"), "default is public-inclusive ($or)" + + strict = Parse::Query.new("Post").readable_by("role:Admin", strict: true).pipeline + assert_equal [{ "$match" => { "_rperm" => { "$in" => ["role:Admin"] } } }], strict, + "strict: true should be an exact $in with no public/missing branches" + + puts "✅ readable_by strict mode produces an exact match" + end + + # #5: the British :writeable_by spelling now resolves to the SAME + # public-inclusive, role-expanding implementation as :writable_by. 
+ def test_writeable_by_is_alias_of_writable_by + puts "\n=== Testing writeable_by == writable_by ===" + + american = Parse::Query.new("Post").where(:ACL.writable_by => "role:Admin").pipeline + british = Parse::Query.new("Post").where(:ACL.writeable_by => "role:Admin").pipeline + assert_equal american, british, "writeable_by must compile identically to writable_by" + assert american.first["$match"].key?("$or"), "both are public-inclusive" + + puts "✅ writeable_by is a true alias of writable_by" + end + + # #8/#9: the new chained negation methods exist, and a mistyped permission + # is NOT silently swallowed. + def test_negation_methods_and_no_silent_swallow + puts "\n=== Testing not_readable_by/not_writable_by + no silent swallow ===" + + q = Parse::Query.new("Post").not_readable_by("role:Admin") + match = q.pipeline.first["$match"]["_rperm"] + # not readable by Admin also excludes publicly-readable rows -> "*" added. + assert_equal({ "$exists" => true, "$nin" => ["role:Admin", "*"] }, match) + + assert_respond_to Parse::Query.new("Post"), :not_writable_by + + # An unrecognized array element must RAISE, not vanish from the filter. + assert_raises(ArgumentError) do + Parse::Query.new("Post").readable_by(["role:Admin", 12345]).pipeline + end + # An unsupported Symbol must RAISE too. + assert_raises(ArgumentError) do + Parse::Query.new("Post").readable_by(:bogus).pipeline + end + + puts "✅ negation methods present; bad permissions raise instead of vanishing" + end + + # #1 (second sink): aggregate_from_query is a separate public pipeline sink. + # It must (a) fold the SDK ACL $match into the pipeline rather than dropping + # it, and (b) fail closed for a scoped query when mongo-direct is disabled. 
+ def test_aggregate_from_query_applies_acl_and_fails_closed_when_scoped + puts "\n=== Testing aggregate_from_query ACL retention + scoped fail-closed ===" + + agg = Parse::Query.new("Post").publicly_readable + .aggregate_from_query([{ "$group" => { "_id" => "$genre" } }]) + assert agg.instance_variable_get(:@allow_internal_fields), + "aggregate_from_query must forward allow_internal_fields for an ACL filter" + assert agg.pipeline.to_json.include?("_rperm"), + "aggregate_from_query must fold the ACL $match into the pipeline (not drop it)" + + if !(defined?(Parse::MongoDB) && Parse::MongoDB.enabled?) + user = Parse::User.new(objectId: "scopedU") + assert_raises(Parse::Query::MongoDirectRequired) do + Parse::Query.new("Post").scope_to_user(user) + .aggregate_from_query([{ "$group" => { "_id" => nil } }]) + end + end + + # A caller-supplied stage that smuggles an internal field must NOT flip the + # sanction (only the SDK-built portion counts). + sneaky = Parse::Query.new("Post").where(title: "x") + .aggregate_from_query([{ "$match" => { "_rperm" => { "$in" => ["x"] } } }]) + refute sneaky.instance_variable_get(:@allow_internal_fields), + "additional_stages must not be able to sanction internal-field references" + + puts "✅ aggregate_from_query applies ACL filters and fails closed when scoped" + end end diff --git a/test/lib/parse/agent/mcp_resource_subscriptions_test.rb b/test/lib/parse/agent/mcp_resource_subscriptions_test.rb index 97e80dd..2366a11 100644 --- a/test/lib/parse/agent/mcp_resource_subscriptions_test.rb +++ b/test/lib/parse/agent/mcp_resource_subscriptions_test.rb @@ -120,6 +120,39 @@ def cancelled? end end +# Scoped variant of SubAgentStub that ALSO exposes the two surfaces the CLP +# and agent_hidden(except:) branches of Tools.assert_class_accessible! read: +# `acl_permission_strings` and `auth_context`. 
Kept as a SEPARATE class on +# purpose — the base SubAgentStub must NOT respond to `acl_permission_strings`, +# or every existing manager/dispatcher test would enter the CLP branch, hit an +# unseeded CLPScope cache (`:unresolvable` -> fail-closed), and turn red. Only +# the gate tests that seed the CLP cache use this variant. +class ScopedSubAgentStub < SubAgentStub + # @param permission_strings [Array, nil] the agent's ACL claim set + # ("*", a userObjectId, "role:Name", ...). nil models a master-key posture + # (CLPScope.permits? short-circuits to true before any lookup). + # @param using_master_key [Boolean] the value auth_context[:using_master_key] + # reports — the only axis the agent_hidden(except: :master_key) gate keys on. + def initialize(permission_strings: ["*"], using_master_key: false, **kwargs) + super(**kwargs) + @permission_strings = permission_strings + @using_master_key = using_master_key + end + + # Mirrors Parse::Agent#acl_permission_strings: nil for a master-key posture + # (the CLP gate's bypass), else the claim set checked against the class CLP. + def acl_permission_strings + @permission_strings + end + + # Mirrors Parse::Agent#auth_context. Only :using_master_key is consulted by + # the hidden gate; the rest of the hash is filler for shape parity. + def auth_context + { type: @using_master_key ? :master_key : :session_token, + using_master_key: @using_master_key, identity: nil } + end +end + # --------------------------------------------------------------------------- # MCPSubscriptions module-function tests (URI + credential derivation) # --------------------------------------------------------------------------- @@ -434,6 +467,140 @@ def test_debounce_coalesces_burst_into_single_update end end +# --------------------------------------------------------------------------- +# Authorization-gate parity tests: the subscribe path must run the SAME +# Tools.assert_class_accessible! 
gate as the read path — agent_hidden (via +# MetadataRegistry.hidden?), the per-agent `classes:` allowlist (covered by the +# Manager tests above), AND the class-level-permissions (CLP) branch. These +# tests drive the REAL gate (no stub on assert_class_accessible!) so the CLP +# branch and the agent_hidden(except: :master_key) axis actually execute, and +# assert the security invariant that NO LiveQuery socket opens on a denial. +# --------------------------------------------------------------------------- +class MCPSubscriptionsAuthorizationGateTest < Minitest::Test + M = Parse::Agent::MCPSubscriptions + + def setup + @lq = FakeLQClient.new + @mgr = M::Manager.new(supported: true, live_query_client: @lq, debounce_interval: 0) + # The CLP gate reads Parse::CLPScope's process-global cache. Reset it so a + # seeded fixture here never leaks into (or inherits from) another test. + Parse::CLPScope.reset_cache! + Parse::CLPScope.reset_warning_state! + end + + def teardown + Parse::CLPScope.reset_cache! + Parse::CLPScope.reset_warning_state! + end + + # --- CLP branch of assert_class_accessible! (scoped agent) -------------- + + def test_subscribe_refused_when_clp_denies_op_for_scope + # CLP grants `count` only to role:Admin. A Reader-scoped agent's claim set + # ("*", "role:Reader") doesn't satisfy it, so the gate raises AccessDenied + # (kind: :clp_denied) BEFORE any credential derivation or socket open. 
+ Parse::CLPScope.__cache_put("Post", clp: { "count" => { "role:Admin" => true } }) + agent = ScopedSubAgentStub.new(session_token: "r:tok", + permission_strings: ["*", "role:Reader"]) + err = assert_raises(Parse::Agent::AccessDenied) do + @mgr.subscribe(session_id: "s1", uri: "parse://Post/count", agent: agent) + end + assert_equal :clp_denied, err.kind + assert_equal 0, @lq.subscriptions.size, "no socket opens when CLP refuses the op" + assert_equal 0, @mgr.subscription_count + end + + def test_subscribe_allowed_when_clp_permits_op_for_scope + # Same denying CLP, but the agent's claim set now includes role:Admin, so + # CLP permits, the gate passes, and the socket opens with session creds. + Parse::CLPScope.__cache_put("Post", clp: { "count" => { "role:Admin" => true } }) + agent = ScopedSubAgentStub.new(session_token: "r:tok", + permission_strings: ["role:Admin"]) + assert @mgr.subscribe(session_id: "s1", uri: "parse://Post/count", agent: agent) + assert_equal 1, @lq.subscriptions.size + assert_equal({ session_token: "r:tok" }, @lq.last.creds) + end + + def test_subscribe_clp_op_is_derived_from_uri_verb + # The CLP op mirrors the read path: `count` gates on :count, `samples` on + # :find. A CLP that makes count public but find Admin-only proves the op + # comes from the URI verb — the same public-only scope is admitted for + # count and refused for samples. 
+ Parse::CLPScope.__cache_put("Post", + clp: { "count" => { "*" => true }, + "find" => { "role:Admin" => true } }) + public_agent = ScopedSubAgentStub.new(session_token: "r:tok", permission_strings: ["*"]) + assert @mgr.subscribe(session_id: "s1", uri: "parse://Post/count", agent: public_agent) + err = assert_raises(Parse::Agent::AccessDenied) do + @mgr.subscribe(session_id: "s1", uri: "parse://Post/samples", agent: public_agent) + end + assert_equal :clp_denied, err.kind + assert_equal 1, @lq.subscriptions.size, "only the permitted (count) subscribe opened a socket" + end + + def test_subscribe_master_posture_bypasses_clp + # A CLP locked to nobody-but-master (`count: {}`) is in cache, but a + # master-key posture reports acl_permission_strings => nil, which + # CLPScope.permits? short-circuits to true before any lookup — the same + # bypass contract as the read path. The socket opens with master creds. + Parse::CLPScope.__cache_put("Post", clp: { "count" => {} }) + master = ScopedSubAgentStub.new(permission_strings: nil, using_master_key: true) + assert @mgr.subscribe(session_id: "s1", uri: "parse://Post/count", agent: master) + assert_equal({ use_master_key: true }, @lq.last.creds) + end + + # --- agent_hidden gate via MetadataRegistry.hidden? --------------------- + + def test_subscribe_refused_for_globally_hidden_session_class + # The marquee scenario: parse://_Session/count. _Session is agent_hidden by + # default (the session-token store — PII). It must be refused through + # MetadataRegistry.hidden? before any socket opens, even for a master-key + # agent, since plain agent_hidden (no `except:`) admits no one — otherwise + # subscribe becomes a session change/timing oracle on a class the tool + # surface refuses to even list. 
+ assert Parse::Agent::MetadataRegistry.hidden?("_Session"), + "_Session must be registered agent_hidden for this test to be meaningful" + assert_raises(Parse::Agent::AccessDenied) do + @mgr.subscribe(session_id: "s1", uri: "parse://_Session/count", agent: SubAgentStub.new) + end + assert_equal 0, @lq.subscriptions.size, "no socket opens for a hidden class" + assert_equal 0, @mgr.subscription_count + end + + def test_subscribe_hidden_except_master_key_refuses_session_but_permits_master + # agent_hidden(except: :master_key) is the "user-facing MCP never sees it, + # dev-MCP can" axis — the ONLY place auth_context[:using_master_key] flips + # the gate's outcome. Register a throwaway hidden-except-master class and + # assert both sides of the axis. + hidden = Object.new + def hidden.parse_class; "VaultDoc"; end + def hidden.name; "VaultDoc"; end + Parse::Agent::MetadataRegistry.register_hidden_class(hidden, except: :master_key) + begin + assert Parse::Agent::MetadataRegistry.hidden?("VaultDoc") + + # Session-bound (non-master) agent: refused — using_master_key is false, + # so the master-key exception does not apply. + session_agent = ScopedSubAgentStub.new(session_token: "r:tok", + permission_strings: ["*"], + using_master_key: false) + assert_raises(Parse::Agent::AccessDenied) do + @mgr.subscribe(session_id: "s1", uri: "parse://VaultDoc/count", agent: session_agent) + end + assert_equal 0, @lq.subscriptions.size, "session agent opens no socket on a hidden class" + + # Master-key agent: the except: :master_key bypass admits it; the socket + # opens with master creds. 
+ master_agent = ScopedSubAgentStub.new(permission_strings: nil, using_master_key: true) + assert @mgr.subscribe(session_id: "s2", uri: "parse://VaultDoc/count", agent: master_agent) + assert_equal 1, @lq.subscriptions.size + assert_equal({ use_master_key: true }, @lq.last.creds) + ensure + Parse::Agent::MetadataRegistry.unregister_hidden_class(hidden) + end + end +end + # --------------------------------------------------------------------------- # Dispatcher integration: capability negotiation + subscribe/unsubscribe routing # --------------------------------------------------------------------------- @@ -551,6 +718,17 @@ def test_resources_subscribe_disallowed_class_is_invalid_params assert_equal 0, @lq.subscriptions.size end + def test_resources_subscribe_hidden_class_is_invalid_params + # parse://_Session/count routes through the REAL dispatcher; the + # agent_hidden gate (MetadataRegistry.hidden?) refuses it and the dispatcher + # maps AccessDenied -> JSON-RPC -32602, opening no socket. Locks the wire + # error code for a hidden class — the Manager-level test asserts the raise. 
+ res = call_method("resources/subscribe", { "uri" => "parse://_Session/count" }) + assert_equal(-32_602, res[:body]["error"]["code"]) + assert_equal 0, @manager.subscription_count + assert_equal 0, @lq.subscriptions.size + end + def test_resources_subscribe_without_manager_is_method_not_found res = call_method("resources/subscribe", { "uri" => "parse://Post/count" }, manager: nil) assert_equal(-32_601, res[:body]["error"]["code"]) diff --git a/test/lib/parse/aggregation_auto_promotion_test.rb b/test/lib/parse/aggregation_auto_promotion_test.rb index 379acf4..a5e4085 100644 --- a/test/lib/parse/aggregation_auto_promotion_test.rb +++ b/test/lib/parse/aggregation_auto_promotion_test.rb @@ -55,14 +55,15 @@ def test_group_by_stays_on_rest_when_no_scope assert_stays_on_rest(gb, :count) end - def test_group_by_stays_on_rest_when_mongodb_disabled + def test_group_by_fails_closed_when_scoped_and_mongodb_disabled stub_mongodb_enabled!(false) @query.session_token = "r:test-session" gb = @query.group_by(:artist) - # Fail-closed: stays on REST rather than raising NotEnabled at run - # time. The integrator gets the (unscoped, master-key-only) REST - # response Parse Server would normally serve. - assert_stays_on_rest(gb, :count) + # Security: a SCOPED aggregation must NOT silently fall back to Parse + # Server's REST /aggregate endpoint, which is master-key-only and + # enforces neither ACL nor CLP — that would run the query unscoped as + # the master key. With mongo-direct unavailable it fails closed. 
+ assert_raises(Parse::Query::MongoDirectRequired) { gb.count } end # ---- GroupByDate auto-promotion --------------------------------------- @@ -98,10 +99,76 @@ def test_distinct_stays_on_rest_when_no_scope assert_distinct_stays_on_rest end - def test_distinct_stays_on_rest_when_mongodb_disabled + def test_distinct_fails_closed_when_scoped_and_mongodb_disabled stub_mongodb_enabled!(false) @query.session_token = "r:test-session" - assert_distinct_stays_on_rest + # Security: see test_group_by_fails_closed_when_scoped_and_mongodb_disabled. + # A scoped distinct cannot fall back to REST /aggregate (unscoped master). + assert_raises(Parse::Query::MongoDirectRequired) { @query.distinct(:artist) } + end + + # ---- Query#count (aggregation-pipeline branch) ------------------------- + # `:field.size` compiles to an __aggregation_pipeline marker, forcing + # #count and #results through the inline-Aggregation terminals that the + # other tests above never reach. + + def test_count_aggregation_promotes_when_session_token_set + stub_mongodb_enabled!(true) + @query.session_token = "r:test-session" + @query.where :tags.size => 2 + direct_called = false + Parse::MongoDB.define_singleton_method(:aggregate) do |_class_name, _pipeline, **_kw| + direct_called = true + [] + end + @query.count + assert direct_called, "expected scoped #count (aggregation branch) to route through mongo-direct" + ensure + Parse::MongoDB.singleton_class.remove_method(:aggregate) if Parse::MongoDB.singleton_class.method_defined?(:aggregate) + end + + def test_count_aggregation_stays_on_rest_when_no_scope + stub_mongodb_enabled!(true) + @query.where :tags.size => 2 + response = stub_response([]) + @mock_client.expect :aggregate_pipeline, response do |_table, _pipeline, **_kw| + true + end + @query.count + @mock_client.verify + end + + def test_count_aggregation_fails_closed_when_scoped_and_mongodb_disabled + stub_mongodb_enabled!(false) + @query.session_token = "r:test-session" + @query.where :tags.size => 2 
+ # Security: same contract as #aggregate / #distinct — a scoped count + # must not fall back to REST /aggregate (master-key-only, unenforced). + assert_raises(Parse::Query::MongoDirectRequired) { @query.count } + end + + # ---- Query#results via execute_aggregation_pipeline -------------------- + + def test_results_pipeline_promotes_when_session_token_set + stub_mongodb_enabled!(true) + @query.session_token = "r:test-session" + @query.where :tags.size => 2 + direct_called = false + Parse::MongoDB.define_singleton_method(:aggregate) do |_class_name, _pipeline, **_kw| + direct_called = true + [] + end + @query.results + assert direct_called, "expected scoped #results (pipeline branch) to route through mongo-direct" + ensure + Parse::MongoDB.singleton_class.remove_method(:aggregate) if Parse::MongoDB.singleton_class.method_defined?(:aggregate) + end + + def test_results_pipeline_fails_closed_when_scoped_and_mongodb_disabled + stub_mongodb_enabled!(false) + @query.session_token = "r:test-session" + @query.where :tags.size => 2 + assert_raises(Parse::Query::MongoDirectRequired) { @query.results } end private diff --git a/test/lib/parse/cloud_functions_module_test.rb b/test/lib/parse/cloud_functions_module_test.rb index 651d2e1..6331344 100644 --- a/test/lib/parse/cloud_functions_module_test.rb +++ b/test/lib/parse/cloud_functions_module_test.rb @@ -294,6 +294,65 @@ def test_parse_call_function_bang_ignores_raw_option @mock_client.verify end + # The documented opt-out for sanitizing third-party-influenced cloud output: + # `raw: true` returns the undecoded Response BEFORE `_extract_cloud_result`, + # so a `__type:"Object"` envelope is NOT built into a Parse object and its + # server-set keys remain inspectable on the raw body. 
+ def test_parse_call_function_raw_returns_undecoded_response + enc = { "__type" => "Object", "className" => "_User", + "objectId" => "u1", "sessionToken" => "r:tok" } + raw_response = Parse::Response.new("result" => enc) + + @mock_client.expect :call_function, raw_response, ["fn", {}], opts: {} + + result = nil + Parse::Client.stub :client, @mock_client do + result = Parse.call_function("fn", {}, raw: true) + end + + assert_same raw_response, result, "raw: true must return the Response object itself" + refute_kind_of Parse::User, result, "raw: true must not decode the envelope into an object" + @mock_client.verify + end + + # The complement: without `raw`, the same envelope decodes through the trusted + # path into a Parse::User that retains its server-set sessionToken — this is + # the server-authoritative behavior the SECURITY doc describes. + def test_parse_call_function_without_raw_decodes_envelope_to_object + enc = { "__type" => "Object", "className" => "_User", + "objectId" => "u1", "username" => "alice", "sessionToken" => "r:tok" } + ok_response = Parse::Response.new("result" => enc) + + @mock_client.expect :call_function, ok_response, ["fn", {}], opts: {} + + result = nil + Parse::Client.stub :client, @mock_client do + result = Parse.call_function("fn", {}) + end + + assert_kind_of Parse::User, result + assert_equal "r:tok", result.session_token + @mock_client.verify + end + + # `call_function!` always decodes on success — `:raw` has no effect, so a + # caller cannot get the undecoded response through the bang variant. 
+ def test_parse_call_function_bang_decodes_on_success_even_with_raw_true + enc = { "__type" => "Object", "className" => "_User", + "objectId" => "u1", "sessionToken" => "r:tok" } + ok_response = Parse::Response.new("result" => enc) + + @mock_client.expect :call_function, ok_response, ["fn", {}], opts: {} + + result = nil + Parse::Client.stub :client, @mock_client do + result = Parse.call_function!("fn", {}, raw: true) + end + + assert_kind_of Parse::User, result, "the bang variant always decodes; :raw is a no-op" + @mock_client.verify + end + def test_parse_call_function_handles_non_hash_response_body # Guard against TypeError when Parse Server returns a non-Hash body for a # "successful" response. Should return the raw result rather than indexing diff --git a/test/lib/parse/cloud_result_decode_test.rb b/test/lib/parse/cloud_result_decode_test.rb index 2bccf4a..0a8b875 100644 --- a/test/lib/parse/cloud_result_decode_test.rb +++ b/test/lib/parse/cloud_result_decode_test.rb @@ -101,4 +101,42 @@ def test_extract_cloud_result_tolerates_non_hash_body resp = Resp.new("raw-string-body") assert_equal "raw-string-body", Parse._extract_cloud_result(resp) end + + # --------------------------------------------------------------------------- + # Server-authoritative decode: a cloud __type:"Object" envelope hydrates + # through the SAME trusted path as every query / fetch result, so server-set + # credential-shaped keys are PRESERVED rather than stripped. Filtering them + # here would make cloud results stricter than the rest of the SDK. + # --------------------------------------------------------------------------- + + def test_user_envelope_preserves_session_token + # Mirrors a cloud function that returns `request.user`: the resulting + # Parse::User must keep its server-set sessionToken (trusted-init does not + # filter PROTECTED_INITIALIZE_KEYS), exactly as a query/fetch would. 
+ enc = { "__type" => "Object", "className" => "_User", + "objectId" => "u1", "username" => "alice", "sessionToken" => "r:tok123" } + user = decode(enc) + assert_kind_of Parse::User, user + assert_equal "u1", user.id + assert_equal "alice", user.username + assert_equal "r:tok123", user.session_token, + "cloud-decoded user must retain its server-set sessionToken (trusted-init)" + end + + def test_untrusted_new_strips_session_token_unlike_cloud_decode + # The contrast that justifies leaving cloud decode on the trusted path: + # untrusted mass-assignment (Klass.new) DROPS the same protected key, so + # filtering cloud results would diverge from query/fetch hydration. + user = Parse::User.new("username" => "bob", "sessionToken" => "r:should_strip") + assert_nil user.session_token, + "untrusted Parse::User.new must NOT accept a mass-assigned sessionToken" + end + + def test_extract_cloud_result_preserves_session_token_through_unwrap + enc = { "__type" => "Object", "className" => "_User", + "objectId" => "u1", "username" => "alice", "sessionToken" => "r:tok123" } + user = Parse._extract_cloud_result(Resp.new({ "result" => enc })) + assert_kind_of Parse::User, user + assert_equal "r:tok123", user.session_token + end end diff --git a/test/lib/parse/embed_managed_meta_reembed_test.rb b/test/lib/parse/embed_managed_meta_reembed_test.rb new file mode 100644 index 0000000..b0076f8 --- /dev/null +++ b/test/lib/parse/embed_managed_meta_reembed_test.rb @@ -0,0 +1,232 @@ +# encoding: UTF-8 +# frozen_string_literal: true + +require_relative "../../test_helper" +require "parse/embeddings" +require "parse/model/file" + +# Unit tests for the v5.5 embedding-migration surface: +# - auto-declared `_meta` provenance sibling (stamped/cleared) +# - Class.reembed! 
(force re-embed; only_stale: skip-current rows) +# - embed_image source: :bytes dispatch through ImageFetch +class EmbedManagedMetaReembedTest < Minitest::Test + def self.register + Parse::Embeddings.register(:fx_meta, Parse::Embeddings::Fixture.new(dimensions: 4)) + end + register + + class MetaItem < Parse::Object + parse_class "MetaItem" + property :title, :string + property :embedding, :vector, dimensions: 4, provider: :fx_meta + embed :title, into: :embedding + end + + # ---------- _meta provenance ---------- + + def test_meta_property_is_auto_declared + assert MetaItem.fields.key?(:embedding_meta) + assert_equal :embedding_meta, MetaItem.embed_directives[:embedding].meta_field + end + + def test_meta_is_stamped_on_recompute + r = MetaItem.new(title: "hello world") + r.compute_embedding! + meta = r.embedding_meta + refute_nil meta + assert_equal "fx_meta", meta["provider"] + assert_equal Parse::Embeddings.provider(:fx_meta).model_name, meta["model"] + assert_equal 4, meta["dimensions"] + assert_equal "text", meta["modality"] + refute_nil meta["embedded_at"] + assert Time.parse(meta["embedded_at"]) <= Time.now.utc + 1 + end + + def test_meta_is_cleared_when_source_clears + r = MetaItem.new(title: "hello world") + r.compute_embedding! + refute_nil r.embedding_meta + r.title = nil + r.compute_embedding! + assert_nil r.embedding + assert_nil r.embedding_meta + end + + # ---------- reembed! 
(stubbed query chain) ---------- + + class FakeRecord + attr_reader :id, :saves + attr_accessor :embedding_digest, :embedding_meta + def initialize(id, digest: "old", meta: nil) + @id = id + @saves = 0 + @embedding_digest = digest + @embedding_meta = meta + end + def save(**_opts) = (@saves += 1) + end + + class FakeQuery + def initialize(batches) = (@batches = batches; @i = -1) + def where(*) = self + def order(*) = self + def limit(*) = self + def results + @i += 1 + @batches[@i] || [] + end + end + + def current_meta + { + "provider" => "fx_meta", + "model" => Parse::Embeddings.provider(:fx_meta).model_name, + "dimensions" => 4, + } + end + + def test_reembed_clears_digest_and_saves_every_row + rows = [FakeRecord.new("a"), FakeRecord.new("b")] + fq = FakeQuery.new([rows]) + MetaItem.stub(:query, ->(*_a) { fq }) do + assert_equal 2, MetaItem.reembed!(batch_size: 5) + end + rows.each do |r| + assert_equal 1, r.saves + assert_nil r.embedding_digest, "digest must be cleared so the save-path recompute runs" + end + end + + def test_reembed_only_stale_skips_current_rows + fresh = FakeRecord.new("a", meta: current_meta) + stale_meta = current_meta.merge("model" => "old-model-1") + stale = FakeRecord.new("b", meta: stale_meta) + never = FakeRecord.new("c", meta: nil) + fq = FakeQuery.new([[fresh, stale, never]]) + MetaItem.stub(:query, ->(*_a) { fq }) do + assert_equal 2, MetaItem.reembed!(batch_size: 5, only_stale: true) + end + assert_equal 0, fresh.saves + assert_equal 1, stale.saves + assert_equal 1, never.saves, "rows with no meta count as stale" + end + + def test_reembed_respects_limit + rows = [FakeRecord.new("a"), FakeRecord.new("b"), FakeRecord.new("c")] + fq = FakeQuery.new([rows]) + MetaItem.stub(:query, ->(*_a) { fq }) do + assert_equal 1, MetaItem.reembed!(batch_size: 5, limit: 1) + end + assert_equal [1, 0, 0], rows.map(&:saves) + end + + def test_reembed_unknown_field_raises + err = assert_raises(ArgumentError) { MetaItem.reembed!(field: :nope) } + # The 
shared backfill resolver must name the entry point the caller + # actually used, not its embed_pending! sibling. + assert_includes err.message, "reembed!" + refute_includes err.message, "embed_pending!" + end + + def test_reembed_validates_batch_size + assert_raises(ArgumentError) { MetaItem.reembed!(batch_size: 0) } + end + + # ---------- embed_image source: :bytes ---------- + + class StubBytesProvider < Parse::Embeddings::Provider + attr_reader :calls + def initialize = @calls = [] + def dimensions; 4; end + def model_name; "stub-bytes-1"; end + def modalities; %i[text image]; end + def embed_image(sources, input_type: :search_document, allow_insecure: false) + @calls << { sources: sources, input_type: input_type } + sources.map { [0.1, 0.2, 0.3, 0.4] } + end + end + + def self.register_bytes + Parse::Embeddings.register(:stub_bytes, StubBytesProvider.new) + end + register_bytes + + class BytesItem < Parse::Object + parse_class "BytesItem" + property :photo, :file + property :photo_embedding, :vector, dimensions: 4, provider: :stub_bytes + embed_image :photo, into: :photo_embedding, source: :bytes + end + + def test_bytes_mode_recorded_on_directive + d = BytesItem.embed_directives[:photo_embedding] + assert d.bytes_mode? 
+ assert_equal true, d.exif_strip + assert_equal :photo_embedding_meta, d.meta_field + end + + class BadBytesItem < Parse::Object + parse_class "BadBytesItem" + property :photo, :file + property :v, :vector, dimensions: 4, provider: :stub_bytes + end + + def test_invalid_source_mode_raises_at_declaration + err = assert_raises(Parse::Core::EmbedManaged::InvalidEmbedDeclaration) do + BadBytesItem.embed_image :photo, into: :v, source: :stream + end + assert_includes err.message, ":url or :bytes" + end + + def test_bytes_mode_fetches_and_forwards_fetched_image + fetched = Parse::Embeddings::ImageFetch::FetchedImage.new( + bytes: "\xFF\xD8\xFF".b, mime_type: "image/jpeg", + url: "https://1.1.1.1/p.jpg", + ) + fetch_args = [] + stub_fetch = lambda do |url, allow_insecure:, exif_strip:| + fetch_args << { url: url, allow_insecure: allow_insecure, exif_strip: exif_strip } + fetched + end + provider = Parse::Embeddings.provider(:stub_bytes) + provider.calls.clear + + r = BytesItem.new + file = Parse::File.new("name" => "p.jpg", "url" => "https://1.1.1.1/p.jpg") + r.photo = file + + Parse::Embeddings::ImageFetch.stub(:fetch!, stub_fetch) do + r.compute_embedding! + end + + assert_equal 1, fetch_args.length + assert_equal "https://1.1.1.1/p.jpg", fetch_args.first[:url] + assert_equal true, fetch_args.first[:exif_strip] + assert_equal false, fetch_args.first[:allow_insecure] + + assert_equal 1, provider.calls.length + assert_equal [fetched], provider.calls.first[:sources] + assert_equal 4, r.photo_embedding.dimensions + meta = r.photo_embedding_meta + assert_equal "image", meta["modality"] + assert_equal "stub-bytes-1", meta["model"] + end + + class UrlModeItem < Parse::Object + parse_class "UrlModeItem" + property :photo, :file + property :v, :vector, dimensions: 4, provider: :stub_bytes + embed_image :photo, into: :v # default source: :url + end + + def test_url_mode_still_forwards_raw_url + refute UrlModeItem.embed_directives[:v].bytes_mode? 
+ + provider = Parse::Embeddings.provider(:stub_bytes) + provider.calls.clear + r = UrlModeItem.new + r.photo = Parse::File.new("name" => "p.jpg", "url" => "https://1.1.1.1/p.jpg") + r.compute_embedding! + assert_equal ["https://1.1.1.1/p.jpg"], provider.calls.first[:sources] + end +end diff --git a/test/lib/parse/embeddings_batch_embedder_test.rb b/test/lib/parse/embeddings_batch_embedder_test.rb new file mode 100644 index 0000000..aff3183 --- /dev/null +++ b/test/lib/parse/embeddings_batch_embedder_test.rb @@ -0,0 +1,171 @@ +# encoding: UTF-8 +# frozen_string_literal: true + +require_relative "../../test_helper" +require "parse/embeddings" + +# Unit tests for Parse::Embeddings::BatchEmbedder — batch slicing, +# requests-per-minute pacing, batch-level exponential backoff on +# rate-limit / transient errors, and the BatchFailed terminal error. +class EmbeddingsBatchEmbedderTest < Minitest::Test + # Provider double: deterministic vectors, scripted failures, call log. + class ScriptedProvider < Parse::Embeddings::Provider + class RateLimitError < Parse::Embeddings::Error; end + class TransientError < Parse::Embeddings::Error; end + class FatalError < Parse::Embeddings::Error; end + + attr_reader :calls + + def initialize(batch_size: 2, failures: []) + @batch_size = batch_size + @failures = failures # queue of exceptions to raise before succeeding + @calls = [] + end + + def dimensions; 3; end + def model_name; "scripted-1"; end + def embed_batch_size; @batch_size; end + + def embed_text(strings, input_type: :search_document) + if (err = @failures.shift) + @calls << { batch: strings.dup, input_type: input_type, raised: err.class.name } + raise err + end + @calls << { batch: strings.dup, input_type: input_type, raised: nil } + strings.map { |s| [s.length.to_f, 1.0, 2.0] } + end + end + + def fast_embedder(provider, **opts) + # base_delay tiny so retry tests run instantly. 
+ Parse::Embeddings::BatchEmbedder.new(provider, base_delay: 0.001, jitter: 0.0, **opts) + end + + def test_rejects_non_provider + assert_raises(ArgumentError) { Parse::Embeddings::BatchEmbedder.new("nope") } + end + + def test_empty_input_returns_empty + provider = ScriptedProvider.new + assert_equal [], fast_embedder(provider).embed_text([]) + assert_empty provider.calls + end + + def test_rejects_non_array + provider = ScriptedProvider.new + assert_raises(ArgumentError) { fast_embedder(provider).embed_text("one") } + end + + def test_slices_by_provider_batch_size_and_preserves_order + provider = ScriptedProvider.new(batch_size: 2) + vectors = fast_embedder(provider).embed_text(%w[a bb ccc dddd e]) + assert_equal 5, vectors.length + assert_equal [1.0, 2.0, 3.0, 4.0, 1.0], vectors.map(&:first) + assert_equal [%w[a bb], %w[ccc dddd], %w[e]], provider.calls.map { |c| c[:batch] } + end + + def test_explicit_batch_size_overrides_provider_hint + provider = ScriptedProvider.new(batch_size: 2) + fast_embedder(provider, batch_size: 4).embed_text(%w[a b c d e]) + assert_equal [4, 1], provider.calls.map { |c| c[:batch].length } + end + + def test_retries_rate_limit_then_succeeds + provider = ScriptedProvider.new( + batch_size: 10, + failures: [ScriptedProvider::RateLimitError.new("429")], + ) + vectors = fast_embedder(provider).embed_text(%w[a b]) + assert_equal 2, vectors.length + assert_equal ["ScriptedProvider::RateLimitError", nil].map { |x| x&.split("::")&.last }, + provider.calls.map { |c| c[:raised]&.split("::")&.last } + end + + def test_retries_transient_error + provider = ScriptedProvider.new( + batch_size: 10, + failures: [ScriptedProvider::TransientError.new("503")], + ) + assert_equal 1, fast_embedder(provider).embed_text(%w[a]).length + end + + def test_batch_failed_after_max_attempts + provider = ScriptedProvider.new( + batch_size: 1, + failures: Array.new(3) { ScriptedProvider::RateLimitError.new("429") }, + ) + err = 
assert_raises(Parse::Embeddings::BatchEmbedder::BatchFailed) do + fast_embedder(provider, max_attempts: 3).embed_text(%w[a b]) + end + assert_equal 0, err.batch_index + assert_equal 0, err.completed_count + assert_includes err.message, "after 3 attempt(s)" + end + + def test_batch_failed_reports_progress_position + provider = ScriptedProvider.new( + batch_size: 1, + failures: [], + ) + # First batch succeeds, then two rate limits on the second exhaust + # max_attempts: 2. + def provider.embed_text(strings, input_type: :search_document) + @calls << { batch: strings.dup, input_type: input_type, raised: nil } + raise EmbeddingsBatchEmbedderTest::ScriptedProvider::RateLimitError, "429" if strings == ["b"] + strings.map { |s| [s.length.to_f, 1.0, 2.0] } + end + err = assert_raises(Parse::Embeddings::BatchEmbedder::BatchFailed) do + fast_embedder(provider, max_attempts: 2).embed_text(%w[a b c]) + end + assert_equal 1, err.batch_index + assert_equal 1, err.completed_count + end + + def test_non_retryable_error_propagates_immediately + provider = ScriptedProvider.new( + batch_size: 10, + failures: [ScriptedProvider::FatalError.new("401")], + ) + assert_raises(ScriptedProvider::FatalError) do + fast_embedder(provider).embed_text(%w[a]) + end + assert_equal 1, provider.calls.length + end + + def test_retry_on_override + provider = ScriptedProvider.new( + batch_size: 10, + failures: [ScriptedProvider::FatalError.new("flaky")], + ) + vectors = fast_embedder(provider, retry_on: [ScriptedProvider::FatalError]) + .embed_text(%w[a]) + assert_equal 1, vectors.length + end + + def test_on_progress_callback + provider = ScriptedProvider.new(batch_size: 2) + events = [] + embedder = fast_embedder(provider, on_progress: ->(**kw) { events << kw }) + embedder.embed_text(%w[a b c]) + assert_equal [ + { done: 2, total: 3, batch_index: 0, batch_count: 2 }, + { done: 3, total: 3, batch_index: 1, batch_count: 2 }, + ], events + end + + def test_pacing_spaces_calls + provider = 
ScriptedProvider.new(batch_size: 1) + # 1200 rpm => 50ms interval; two batches => one inter-batch wait. + embedder = fast_embedder(provider, requests_per_minute: 1200) + t0 = Process.clock_gettime(Process::CLOCK_MONOTONIC) + embedder.embed_text(%w[a b]) + elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - t0 + assert_operator elapsed, :>=, 0.045 + end + + def test_validation_kwargs + provider = ScriptedProvider.new + assert_raises(ArgumentError) { fast_embedder(provider, max_attempts: 0) } + assert_raises(ArgumentError) { fast_embedder(provider, batch_size: 0) } + end +end diff --git a/test/lib/parse/embeddings_cache_test.rb b/test/lib/parse/embeddings_cache_test.rb new file mode 100644 index 0000000..919012c --- /dev/null +++ b/test/lib/parse/embeddings_cache_test.rb @@ -0,0 +1,266 @@ +# encoding: UTF-8 +# frozen_string_literal: true + +require_relative "../../test_helper" +require "parse/embeddings" + +# Unit tests for Parse::Embeddings::Cache — the query-embed cache keyed +# by (provider, model, dimensions, input_type, input_hash). Covers +# disabled pass-through, hit/miss accounting, key separation, TTL expiry, +# LRU eviction, custom stores, and the cached:true instrumentation event. +class EmbeddingsCacheTest < Minitest::Test + CACHE = Parse::Embeddings::Cache + + # Counts embed_text invocations. + class CountingProvider < Parse::Embeddings::Provider + attr_reader :count + def initialize(model: "counting-1") + @model = model + @count = 0 + end + def dimensions; 3; end + def model_name; @model; end + def embed_text(strings, input_type: :search_document) + @count += 1 + strings.map { |s| [s.length.to_f, input_type == :search_query ? 1.0 : 0.0, 9.9] } + end + end + + def teardown + CACHE.disable! + end + + def test_disabled_is_passthrough + provider = CountingProvider.new + v1 = CACHE.fetch_vector(provider, "hello") + v2 = CACHE.fetch_vector(provider, "hello") + assert_equal v1, v2 + assert_equal 2, provider.count + refute CACHE.enabled? 
+ end + + def test_enabled_serves_repeats_from_cache + CACHE.enable! + provider = CountingProvider.new + v1 = CACHE.fetch_vector(provider, "hello") + v2 = CACHE.fetch_vector(provider, "hello") + assert_equal v1, v2 + assert_equal 1, provider.count + stats = CACHE.stats + assert_equal 1, stats[:hits] + assert_equal 1, stats[:misses] + assert_equal 1, stats[:size] + end + + def test_key_separates_input_type + CACHE.enable! + provider = CountingProvider.new + CACHE.fetch_vector(provider, "hello", input_type: :search_query) + CACHE.fetch_vector(provider, "hello", input_type: :search_document) + assert_equal 2, provider.count + end + + def test_key_separates_model + CACHE.enable! + a = CountingProvider.new(model: "model-a") + b = CountingProvider.new(model: "model-b") + CACHE.fetch_vector(a, "hello") + CACHE.fetch_vector(b, "hello") + assert_equal 1, a.count + assert_equal 1, b.count + assert_equal 2, CACHE.stats[:size] + end + + def test_key_separates_input + CACHE.enable! + provider = CountingProvider.new + CACHE.fetch_vector(provider, "hello") + CACHE.fetch_vector(provider, "world") + assert_equal 2, provider.count + end + + def test_key_separates_dimensions + # Matryoshka-capable providers can register the same model at two + # output widths; the narrower instance must never be served the wider + # instance's cached vector. + CACHE.enable! 
+ narrow = CountingProvider.new + wide = CountingProvider.new + wide.define_singleton_method(:dimensions) { 1024 } + CACHE.fetch_vector(narrow, "hello") + CACHE.fetch_vector(wide, "hello") + assert_equal 1, narrow.count + assert_equal 1, wide.count + assert_equal 2, CACHE.stats[:size] + end + + def test_lru_eviction + CACHE.enable!(max_entries: 2) + provider = CountingProvider.new + CACHE.fetch_vector(provider, "a") + CACHE.fetch_vector(provider, "b") + CACHE.fetch_vector(provider, "c") # evicts "a" + assert_equal 2, CACHE.stats[:size] + CACHE.fetch_vector(provider, "a") # miss again + assert_equal 4, provider.count + end + + def test_ttl_expiry + CACHE.enable!(ttl: 0.01) + provider = CountingProvider.new + CACHE.fetch_vector(provider, "hello") + sleep 0.02 + CACHE.fetch_vector(provider, "hello") + assert_equal 2, provider.count + end + + def test_clear_resets_entries_and_counters + CACHE.enable! + provider = CountingProvider.new + CACHE.fetch_vector(provider, "hello") + CACHE.clear! + stats = CACHE.stats + assert_equal 0, stats[:size] + assert_equal 0, stats[:hits] + assert_equal 0, stats[:misses] + end + + def test_custom_store + store = Class.new do + attr_reader :h + def initialize = @h = {} + def get(k) = @h[k] + def set(k, v) = @h[k] = v + end.new + CACHE.enable!(store: store) + provider = CountingProvider.new + CACHE.fetch_vector(provider, "hello") + CACHE.fetch_vector(provider, "hello") + assert_equal 1, provider.count + assert_equal 1, store.h.length + end + + def test_custom_store_must_quack + assert_raises(ArgumentError) { CACHE.enable!(store: Object.new) } + end + + def test_hit_emits_cached_instrumentation_event + CACHE.enable! 
+ provider = CountingProvider.new + events = [] + sub = ActiveSupport::Notifications.subscribe( + Parse::Embeddings::Provider::AS_NOTIFICATION_NAME, + ) { |*, payload| events << payload } + begin + CACHE.fetch_vector(provider, "hello") + CACHE.fetch_vector(provider, "hello") + ensure + ActiveSupport::Notifications.unsubscribe(sub) + end + cached_events = events.select { |p| p[:cached] } + assert_equal 1, cached_events.length + assert_equal "counting-1", cached_events.first[:model] + end + + # ---------- MonetaStore (persistent L2 adapter) ---------- + + class FakeMoneta + attr_reader :h, :expires_seen + def initialize + @h = {} + @expires_seen = [] + end + + def [](k) + @h[k] + end + + def []=(k, v) + @h[k] = v + end + + def store(k, v, expires: nil) + @expires_seen << expires + @h[k] = v + end + end + + class BrokenMoneta + def [](_k) + raise "redis down" + end + + def []=(_k, _v) + raise "redis down" + end + end + + def test_moneta_store_round_trips_with_namespace_and_ttl + moneta = FakeMoneta.new + store = CACHE::MonetaStore.new(moneta, ttl: 3600) + CACHE.enable!(store: store) + provider = CountingProvider.new + v1 = CACHE.fetch_vector(provider, "hello") + v2 = CACHE.fetch_vector(provider, "hello") + assert_equal v1, v2 + assert_equal 1, provider.count + assert_equal 1, moneta.h.length + assert moneta.h.keys.first.start_with?("emb:") + assert_equal [3600.0], moneta.expires_seen + end + + def test_moneta_store_without_ttl_uses_plain_assignment + moneta = FakeMoneta.new + store = CACHE::MonetaStore.new(moneta) + store.set("k", [1.0]) + assert_empty moneta.expires_seen + assert_equal [1.0], store.get("k") + end + + def test_moneta_store_with_ttl_and_hash_backend_falls_back_to_plain_write + # Hash#store(key, value) rejects the expires: kwarg with ArgumentError; + # the adapter must fall back to a plain (no-expiry) write instead of + # letting the fail-open rescue silently drop every vector. 
+ moneta = {} + store = CACHE::MonetaStore.new(moneta, ttl: 60) + store.set("k", [1.0, 2.0]) + assert_equal [1.0, 2.0], store.get("k") + assert_equal [1.0, 2.0], moneta["emb:k"] + end + + def test_moneta_store_fails_open_on_backend_errors + store = CACHE::MonetaStore.new(BrokenMoneta.new) + CACHE.enable!(store: store) + provider = CountingProvider.new + # get raises -> miss; set raises -> dropped write. Both swallowed. + v1 = CACHE.fetch_vector(provider, "hello") + v2 = CACHE.fetch_vector(provider, "hello") + assert_equal v1, v2 + assert_equal 2, provider.count, "broken store degrades to pass-through" + end + + def test_moneta_store_ignores_non_array_values + moneta = FakeMoneta.new + moneta.h["emb:poisoned"] = "not a vector" + store = CACHE::MonetaStore.new(moneta) + assert_nil store.get("poisoned") + end + + def test_moneta_store_requires_indexing_duck + assert_raises(ArgumentError) { CACHE::MonetaStore.new(Object.new) } + end + + def test_invalid_provider_response_raises + bad = Class.new(Parse::Embeddings::Provider) do + def dimensions; 3; end + def model_name; "bad"; end + def embed_text(strings, input_type: :search_document) + [[1.0], [2.0]] # two vectors for one input + end + end.new + CACHE.enable! 
+ assert_raises(Parse::Embeddings::InvalidResponseError) do + CACHE.fetch_vector(bad, "hello") + end + end +end diff --git a/test/lib/parse/embeddings_cohere_image_test.rb b/test/lib/parse/embeddings_cohere_image_test.rb index 102956d..5d53583 100644 --- a/test/lib/parse/embeddings_cohere_image_test.rb +++ b/test/lib/parse/embeddings_cohere_image_test.rb @@ -73,8 +73,8 @@ def test_embed_image_rejects_non_string_source err = assert_raises(ArgumentError) do provider.embed_image(["https://1.1.1.1/img.jpg", 12345]) end - assert_match(/sources\[1\] is not a String/, err.message) - assert_match(/URL-only/, err.message) + assert_match(/sources\[1\] must be a URL String/, err.message) + assert_match(/FetchedImage/, err.message) end def test_embed_image_rejects_unknown_input_type diff --git a/test/lib/parse/embeddings_image_fetch_test.rb b/test/lib/parse/embeddings_image_fetch_test.rb new file mode 100644 index 0000000..94713cc --- /dev/null +++ b/test/lib/parse/embeddings_image_fetch_test.rb @@ -0,0 +1,340 @@ +# encoding: UTF-8 +# frozen_string_literal: true + +require_relative "../../test_helper" +require "parse/embeddings" +require "parse/model/file" +require "stringio" + +# Unit tests for Parse::Embeddings::ImageFetch — the v5.5 bytes-fetch +# path: magic-byte MIME sniffing (NEW-NET-4 closure: the Content-Type +# header is never consulted), extension cross-check, type allowlist, +# EXIF/XMP stripping, and the fetch! pipeline over a stubbed +# Parse::File.safe_open_url. +class EmbeddingsImageFetchTest < Minitest::Test + IF = Parse::Embeddings::ImageFetch + + def teardown + Parse::Embeddings.reset! 
+ end + + # ---------- fixture builders ---------- + + def jpeg_with_exif + app0 = "\xFF\xE0".b + [16].pack("n") + "JFIF\x00".b + ("\x00".b * 9) + exif_payload = "Exif\x00\x00".b + ("E".b * 10) + app1 = "\xFF\xE1".b + [2 + exif_payload.bytesize].pack("n") + exif_payload + sos = "\xFF\xDA".b + [4].pack("n") + "\x01\x02".b + entropy = "\x12\x34\x56".b + "\xFF\xD9".b + "\xFF\xD8".b + app0 + app1 + sos + entropy + end + + def png_chunk(type, payload) + [payload.bytesize].pack("N") + type.b + payload.b + "\x00\x00\x00\x00".b + end + + def png_with_exif + sig = "\x89PNG\r\n\x1A\n".b + sig + png_chunk("IHDR", "\x00" * 13) + + png_chunk("eXIf", "EXIFDATA") + + png_chunk("IDAT", "\x01\x02\x03") + + png_chunk("IEND", "") + end + + def webp_chunk(type, payload) + chunk = type.b + [payload.bytesize].pack("V") + payload.b + chunk += "\x00".b if payload.bytesize.odd? + chunk + end + + def webp_with_metadata + vp8x_payload = (0x0C).chr + ("\x00" * 9) # EXIF + XMP flags set + chunks = webp_chunk("VP8X", vp8x_payload) + + webp_chunk("VP8 ", "\x01\x02\x03\x04") + + webp_chunk("EXIF", "EXIFDATA") + + webp_chunk("XMP ", "<xmp/>") + "RIFF".b + [4 + chunks.bytesize].pack("V") + "WEBP".b + chunks + end + + def plain_gif + "GIF89a".b + ("\x00" * 20) + end + + # ---------- sniff_mime ---------- + + def test_sniffs_jpeg + assert_equal "image/jpeg", IF.sniff_mime(jpeg_with_exif) + end + + def test_sniffs_png + assert_equal "image/png", IF.sniff_mime(png_with_exif) + end + + def test_sniffs_gif + assert_equal "image/gif", IF.sniff_mime(plain_gif) + assert_equal "image/gif", IF.sniff_mime("GIF87a".b + ("\x00" * 10)) + end + + def test_sniffs_webp + assert_equal "image/webp", IF.sniff_mime(webp_with_metadata) + end + + def test_sniff_unknown_returns_nil + assert_nil IF.sniff_mime("<html><body>hi</body></html>") + assert_nil IF.sniff_mime("%PDF-1.7 ........") + assert_nil IF.sniff_mime(nil) + assert_nil IF.sniff_mime("short") + end + + def test_sniff_riff_but_not_webp_returns_nil + wav = "RIFF".b + 
[100].pack("V") + "WAVE".b + ("\x00" * 8) + assert_nil IF.sniff_mime(wav) + end + + # ---------- verify! ---------- + + def test_verify_returns_sniffed_mime + assert_equal "image/jpeg", IF.verify!(jpeg_with_exif, url: "https://cdn.example.com/a.jpg") + end + + def test_verify_refuses_empty + err = assert_raises(IF::InvalidImageType) { IF.verify!("".b) } + assert_equal :empty, err.reason + end + + def test_verify_refuses_unknown_magic + err = assert_raises(IF::InvalidImageType) { IF.verify!("<html>not an image</html>") } + assert_equal :unknown_magic, err.reason + end + + def test_verify_refuses_extension_mismatch + # PNG bytes served from a .jpg URL — the MIME-laundering shape. + err = assert_raises(IF::InvalidImageType) do + IF.verify!(png_with_exif, url: "https://1.1.1.1/photo.jpg") + end + assert_equal :extension_mismatch, err.reason + end + + def test_verify_ignores_unrecognized_extension + assert_equal "image/png", IF.verify!(png_with_exif, url: "https://cdn.example.com/blob.bin") + assert_equal "image/png", IF.verify!(png_with_exif, url: "https://cdn.example.com/noext") + end + + def test_verify_extension_check_ignores_hostname_dots + # A dot in the hostname must not be read as a file extension — JPEG + # bytes from a host whose last label spells ".png" are fine when the + # path itself carries no extension. + assert_equal "image/jpeg", IF.verify!(jpeg_with_exif, url: "https://evil.png/blob") + assert_equal "image/jpeg", IF.verify!(jpeg_with_exif, url: "https://cdn.v2.example.com/blob") + # Path-less URL: the pre-URI split-based check read ".png" out of the + # hostname here and raised a false :extension_mismatch. + assert_equal "image/jpeg", IF.verify!(jpeg_with_exif, url: "https://evil.png") + # ...while a real path extension on such a host still cross-checks. 
+ err = assert_raises(IF::InvalidImageType) do + IF.verify!(jpeg_with_exif, url: "https://evil.png/photo.png") + end + assert_equal :extension_mismatch, err.reason + end + + def test_verify_extension_check_ignores_query_string + err = assert_raises(IF::InvalidImageType) do + IF.verify!(png_with_exif, url: "https://1.1.1.1/photo.jpg?w=100&h=100") + end + assert_equal :extension_mismatch, err.reason + end + + def test_verify_enforces_type_allowlist + Parse::Embeddings.allowed_image_types = ["image/png"] + err = assert_raises(IF::InvalidImageType) do + IF.verify!(jpeg_with_exif, url: "https://cdn.example.com/a.jpg") + end + assert_equal :type_not_allowed, err.reason + end + + def test_allowed_image_types_rejects_bad_config + assert_raises(ArgumentError) { Parse::Embeddings.allowed_image_types = [] } + assert_raises(ArgumentError) { Parse::Embeddings.allowed_image_types = "image/png" } + assert_raises(ArgumentError) { Parse::Embeddings.allowed_image_types = ["notamime"] } + end + + # ---------- EXIF stripping ---------- + + def test_strip_jpeg_removes_app1_keeps_app0_and_image_data + original = jpeg_with_exif + stripped = IF.strip_metadata(original, "image/jpeg") + refute_includes stripped, "Exif\x00\x00".b + assert_includes stripped, "JFIF\x00".b + assert stripped.start_with?("\xFF\xD8".b) + assert stripped.end_with?("\xFF\xD9".b) + assert_operator stripped.bytesize, :<, original.bytesize + # Stripping is idempotent and the result still sniffs as JPEG. 
+ assert_equal "image/jpeg", IF.sniff_mime(stripped) + assert_equal stripped, IF.strip_metadata(stripped, "image/jpeg") + end + + def test_strip_png_removes_exif_chunk_keeps_idat + original = png_with_exif + stripped = IF.strip_metadata(original, "image/png") + refute_includes stripped, "eXIf".b + assert_includes stripped, "IDAT".b + assert_includes stripped, "IEND".b + assert_equal "image/png", IF.sniff_mime(stripped) + end + + def test_strip_webp_removes_exif_xmp_and_clears_vp8x_flags + original = webp_with_metadata + stripped = IF.strip_metadata(original, "image/webp") + refute_includes stripped, "EXIFDATA".b + refute_includes stripped, "<xmp/>".b + assert_includes stripped, "VP8 ".b + assert_equal "image/webp", IF.sniff_mime(stripped) + # VP8X flag byte: EXIF (0x08) and XMP (0x04) cleared. + vp8x_at = stripped.index("VP8X".b) + refute_nil vp8x_at + flags = stripped.getbyte(vp8x_at + 8) + assert_equal 0, flags & 0x0C + # RIFF size field patched to match the shrunken payload. + riff_size = stripped.byteslice(4, 4).unpack1("V") + assert_equal stripped.bytesize - 8, riff_size + end + + def test_strip_gif_is_passthrough + assert_equal plain_gif, IF.strip_metadata(plain_gif, "image/gif") + end + + def test_strip_malformed_jpeg_returns_original_with_warning + junk = "\xFF\xD8".b + "garbage that is not a marker stream".b + result = nil + # The fallback is fail-open by design, but no longer silent: bytes + # the walker couldn't parse may carry EXIF/XMP to the provider. + assert_output(nil, /could not parse the image\/jpeg container/) do + result = IF.strip_metadata(junk, "image/jpeg") + end + assert_equal junk, result + end + + def test_strip_truncated_png_returns_original_with_warning + # A chunk whose declared length runs past the end of the buffer stops + # the walk. The walker must bail to the ORIGINAL bytes (not a partial + # rebuild) so the pass-through warning fires — an eXIf chunk past the + # abort point would otherwise be forwarded silently. 
+ sig = "\x89PNG\r\n\x1A\n".b + truncated = sig + png_chunk("IHDR", "\x00" * 13) + + [9999].pack("N") + "IDAT".b + "short".b + + png_chunk("eXIf", "EXIFDATA") + result = nil + assert_output(nil, /could not parse the image\/png container/) do + result = IF.strip_metadata(truncated, "image/png") + end + assert_equal truncated, result + end + + def test_strip_truncated_webp_returns_original_with_warning + chunks = webp_chunk("VP8 ", "\x01\x02\x03\x04") + + "ALPH".b + [9999].pack("V") + "short".b + + webp_chunk("EXIF", "EXIFDATA") + truncated = "RIFF".b + [4 + chunks.bytesize].pack("V") + "WEBP".b + chunks + result = nil + assert_output(nil, /could not parse the image\/webp container/) do + result = IF.strip_metadata(truncated, "image/webp") + end + assert_equal truncated, result + end + + def test_strip_clean_parse_does_not_warn + assert_output(nil, "") { IF.strip_metadata(jpeg_with_exif, "image/jpeg") } + assert_output(nil, "") { IF.strip_metadata(plain_gif, "image/gif") } + end + + # ---------- fetch! pipeline ---------- + + def with_stubbed_download(bytes) + Parse::File.stub(:safe_open_url, ->(_url) { StringIO.new(bytes) }) do + yield + end + end + + def test_fetch_closes_io_when_read_raises + Parse::Embeddings.allowed_image_hosts = ["1.1.1.1"] + io = Class.new(StringIO) do + def read(*) + raise IOError, "connection reset mid-body" + end + end.new("".b) + Parse::File.stub(:safe_open_url, ->(_url) { io }) do + assert_raises(IOError) { IF.fetch!("https://1.1.1.1/a.jpg") } + end + assert io.closed?, "the download handle must be closed even when read raises" + end + + def test_fetch_requires_host_allowlist_but_not_sentinel + # No trust_provider_url_fetch sentinel set — :fetch mode must work + # on allowlist alone (the SDK fetches; no provider egress). 
+ Parse::Embeddings.allowed_image_hosts = ["1.1.1.1"] + with_stubbed_download(jpeg_with_exif) do + img = IF.fetch!("https://1.1.1.1/photo.jpg") + assert_instance_of IF::FetchedImage, img + assert_equal "image/jpeg", img.mime_type + refute_includes img.bytes, "Exif\x00\x00".b + end + end + + def test_fetch_denies_host_not_in_allowlist + Parse::Embeddings.allowed_image_hosts = ["1.1.1.1"] + err = assert_raises(Parse::Embeddings::InvalidImageURL) do + IF.fetch!("https://evil.example.net/photo.jpg") + end + assert_equal :host_not_allowlisted, err.reason + end + + def test_fetch_denies_everything_with_empty_allowlist + err = assert_raises(Parse::Embeddings::InvalidImageURL) do + IF.fetch!("https://1.1.1.1/photo.jpg") + end + assert_equal :host_not_allowlisted, err.reason + end + + def test_forward_mode_still_requires_sentinel + Parse::Embeddings.allowed_image_hosts = ["1.1.1.1"] + assert_raises(Parse::Embeddings::ConfirmationRequired) do + Parse::Embeddings.validate_image_url!("https://1.1.1.1/photo.jpg") + end + end + + def test_fetch_refuses_laundered_content + Parse::Embeddings.allowed_image_hosts = ["1.1.1.1"] + with_stubbed_download("<html>fake image</html>") do + err = assert_raises(IF::InvalidImageType) do + IF.fetch!("https://1.1.1.1/photo.jpg") + end + assert_equal :unknown_magic, err.reason + end + end + + def test_fetch_exif_strip_opt_out + Parse::Embeddings.allowed_image_hosts = ["1.1.1.1"] + with_stubbed_download(jpeg_with_exif) do + img = IF.fetch!("https://1.1.1.1/photo.jpg", exif_strip: false) + assert_includes img.bytes, "Exif\x00\x00".b + end + end + + def test_fetch_enforces_max_bytes + Parse::Embeddings.allowed_image_hosts = ["1.1.1.1"] + with_stubbed_download(jpeg_with_exif) do + assert_raises(ArgumentError) do + IF.fetch!("https://1.1.1.1/photo.jpg", max_bytes: 4) + end + end + end + + # ---------- FetchedImage ---------- + + def test_fetched_image_data_uri_and_safe_inspect + img = IF::FetchedImage.new(bytes: "\x01\x02".b, mime_type: "image/png", + 
url: "https://cdn.example.com/x.png") + assert_equal "data:image/png;base64,#{Base64.strict_encode64("\x01\x02".b)}", img.to_data_uri + refute_includes img.inspect, Base64.strict_encode64("\x01\x02".b) + assert_includes img.inspect, "image/png" + end +end diff --git a/test/lib/parse/embeddings_spend_cap_query_test.rb b/test/lib/parse/embeddings_spend_cap_query_test.rb new file mode 100644 index 0000000..fe66612 --- /dev/null +++ b/test/lib/parse/embeddings_spend_cap_query_test.rb @@ -0,0 +1,150 @@ +# encoding: UTF-8 +# frozen_string_literal: true + +require_relative "../../test_helper" +require "parse/embeddings" + +# Unit tests for the v5.5 spend-cap coverage extension: SpendCap.charge_query! +# (the non-agent embed-path charge), with_precharged suppression, ambient +# cache-tenant identity resolution, and the find_similar(text:) wiring. +class EmbeddingsSpendCapQueryTest < Minitest::Test + CAP = Parse::Embeddings::SpendCap + + def self.register + Parse::Embeddings.register(:fx_capq, Parse::Embeddings::Fixture.new(dimensions: 4)) + end + register + + class CapQItem < Parse::Object + parse_class "CapQItem" + property :title, :string + property :embedding, :vector, dimensions: 4, provider: :fx_capq + end + + def teardown + CAP.reset_all! + Parse::Embeddings::Cache.disable! + end + + def test_charge_query_noop_when_uncapped + assert_nil CAP.charge_query!("hello world") + end + + def test_charge_query_charges_default_bucket + CAP.configure(limit_tokens: 10, window: 60) + CAP.charge_query!("a" * 32) # ~8 tokens + assert_raises(CAP::Exceeded) { CAP.charge_query!("a" * 32) } + end + + def test_charge_query_uses_ambient_cache_tenant + CAP.configure("tenantA", limit_tokens: 5, window: 60) + # tenantA capped at 5; default uncapped. + Parse.with_cache_tenant("tenantA") do + assert_raises(CAP::Exceeded) { CAP.charge_query!("a" * 100) } + end + # Outside the tenant block the default (uncapped) bucket applies. 
+ assert_nil CAP.charge_query!("a" * 100) + end + + def test_explicit_tenant_id_wins + CAP.configure("tenantB", limit_tokens: 5, window: 60) + assert_raises(CAP::Exceeded) { CAP.charge_query!("a" * 100, tenant_id: "tenantB") } + end + + def test_with_precharged_suppresses_charge + CAP.configure(limit_tokens: 5, window: 60) + CAP.with_precharged do + assert CAP.precharged? + assert_nil CAP.charge_query!("a" * 100) + end + refute CAP.precharged? + end + + def test_with_precharged_restores_on_exception + begin + CAP.with_precharged { raise "boom" } + rescue RuntimeError + nil + end + refute CAP.precharged? + end + + def test_find_similar_text_charges_the_cap + CAP.configure(limit_tokens: 5, window: 60) + err = assert_raises(CAP::Exceeded) do + CapQItem.find_similar(text: "a" * 100) + end + assert_includes err.message, "spend cap exceeded" + end + + # ---------- soft-cap warning (warn_at:) ---------- + + def collect_warnings + events = [] + sub = ActiveSupport::Notifications.subscribe(CAP::AS_NOTIFICATION_NAME) do |*, payload| + events << payload + end + yield + events + ensure + ActiveSupport::Notifications.unsubscribe(sub) + end + + def test_warn_at_emits_event_on_threshold_crossing + CAP.configure(limit_tokens: 100, window: 60, warn_at: 0.8) + events = collect_warnings do + CAP.charge!(tenant_id: "t1", tokens: 70) # below threshold + CAP.charge!(tenant_id: "t1", tokens: 15) # crosses 80 + end + assert_equal 1, events.length + payload = events.first + assert_equal "t1", payload[:tenant_id] + assert_equal 85, payload[:used] + assert_equal 100, payload[:limit] + assert_in_delta 80.0, payload[:threshold], 1e-9 + end + + def test_warn_at_fires_once_not_per_charge_above_threshold + CAP.configure(limit_tokens: 100, window: 60, warn_at: 0.5) + events = collect_warnings do + CAP.charge!(tenant_id: "t1", tokens: 60) # crosses 50 + CAP.charge!(tenant_id: "t1", tokens: 10) # already above — no event + CAP.charge!(tenant_id: "t1", tokens: 10) + end + assert_equal 1, 
events.length + end + + def test_warn_at_does_not_fire_below_threshold_or_on_refusal + CAP.configure(limit_tokens: 100, window: 60, warn_at: 0.9) + events = collect_warnings do + CAP.charge!(tenant_id: "t1", tokens: 50) + assert_raises(CAP::Exceeded) { CAP.charge!(tenant_id: "t1", tokens: 60) } + end + assert_empty events + end + + def test_warn_at_validates_range + assert_raises(ArgumentError) { CAP.configure(limit_tokens: 100, warn_at: 0) } + assert_raises(ArgumentError) { CAP.configure(limit_tokens: 100, warn_at: 1.0) } + assert_raises(ArgumentError) { CAP.configure(limit_tokens: 100, warn_at: 2) } + end + + def test_without_warn_at_no_event + CAP.configure(limit_tokens: 100, window: 60) + events = collect_warnings do + CAP.charge!(tenant_id: "t1", tokens: 99) + end + assert_empty events + end + + def test_find_similar_text_inside_precharged_skips_cap + CAP.configure(limit_tokens: 5, window: 60) + # Embedding succeeds (no Exceeded); the call then fails later at + # index resolution because no Mongo/Atlas is configured — that error + # PROVES the embed step got past the cap. + err = assert_raises(StandardError) do + CAP.with_precharged { CapQItem.find_similar(text: "a" * 100) } + end + refute_kind_of CAP::Exceeded, err + end +end diff --git a/test/lib/parse/embeddings_voyage_image_test.rb b/test/lib/parse/embeddings_voyage_image_test.rb index d4d1ee2..b7be9de 100644 --- a/test/lib/parse/embeddings_voyage_image_test.rb +++ b/test/lib/parse/embeddings_voyage_image_test.rb @@ -72,9 +72,8 @@ def test_embed_image_rejects_non_string_source err = assert_raises(ArgumentError) do provider.embed_image(["https://1.1.1.1/img.jpg", 12345]) end - assert_match(/sources\[1\] is not a String/, err.message) - # The URL-only constraint should be called out. 
- assert_match(/URL-only/, err.message) + assert_match(/sources\[1\] must be a URL String/, err.message) + assert_match(/FetchedImage/, err.message) end def test_embed_image_rejects_unknown_input_type diff --git a/test/lib/parse/find_similar_test.rb b/test/lib/parse/find_similar_test.rb index 7007f07..7d6bb2a 100644 --- a/test/lib/parse/find_similar_test.rb +++ b/test/lib/parse/find_similar_test.rb @@ -157,12 +157,28 @@ def test_no_discoverable_index_raises end end - def test_explicit_index_skips_discovery + def test_explicit_index_used_verbatim captured = {} - # Catalog should not be called at all; stub it to fail loudly if it is. - catalog = lambda do |_coll, field:| - flunk "IndexCatalog.find_vector_index must not be called when index: is explicit" + # An explicit index: is the resolved name — discovery never overrides + # it. The catalog IS still consulted for best-effort drift + # verification, but only an index whose name matches the explicit one + # is verified; here the covering index has a different name, so the + # explicit name passes through untouched. + stub_index_catalog(@catalog_stub) do + stub_vector_search(captured) do + SingleVecDoc.find_similar(vector: [0.1, 0.2, 0.3], + index: "explicit_idx", raw: true) + end end + assert_equal "explicit_idx", captured[:index] + end + + def test_explicit_index_tolerates_catalog_failure + captured = {} + # Drift verification for an explicit index: is best-effort — a + # catalog lookup failure (Atlas unreachable, listSearchIndexes + # unsupported) must not fail the query. 
+ catalog = lambda { |_coll, field:| raise "catalog unavailable" } stub_index_catalog(catalog) do stub_vector_search(captured) do SingleVecDoc.find_similar(vector: [0.1, 0.2, 0.3], @@ -172,6 +188,28 @@ def test_explicit_index_skips_discovery assert_equal "explicit_idx", captured[:index] end + def test_explicit_index_matching_catalog_entry_is_drift_verified + Parse::VectorSearch.index_drift_policy = :raise + # The catalog's covering index has the SAME name as the explicit + # kwarg and a drifted dimension count — strict mode must refuse. + drifted = { + "name" => "explicit_idx", + "latestDefinition" => { + "fields" => [{ "type" => "vector", "path" => "embedding", + "numDimensions" => 1536, "similarity" => "cosine" }], + }, + } + stub_index_catalog(->(_coll, field:) { drifted }) do + assert_raises(Parse::Core::VectorSearchable::IndexDriftError) do + SingleVecDoc.find_similar(vector: [0.1, 0.2, 0.3], + index: "explicit_idx", raw: true) + end + end + ensure + Parse::VectorSearch.index_drift_policy = :warn + SingleVecDoc.instance_variable_set(:@_verified_vector_indexes, nil) + end + # ---- pass-through arguments ------------------------------------------ def test_forwards_filter_k_scope_kwargs diff --git a/test/lib/parse/query/constraints/acl_query_constraints_test.rb b/test/lib/parse/query/constraints/acl_query_constraints_test.rb index 767364d..86436e6 100644 --- a/test/lib/parse/query/constraints/acl_query_constraints_test.rb +++ b/test/lib/parse/query/constraints/acl_query_constraints_test.rb @@ -6,8 +6,11 @@ class TestAclQueryConstraints < Minitest::Test extend Minitest::Spec::DSL - # Test ReadableByConstraint - describe "ReadableByConstraint" do + # ReadableByConstraint is now a thin alias of ACLReadableByConstraint + # (its full behavior is covered by acl_readable_by_test.rb). These tests + # pin that the alias resolves to the unified, public-inclusive, + # role-expanding implementation — NOT the removed standalone shape. 
+ describe "ReadableByConstraint (alias of ACLReadableByConstraint)" do it "registers :readable_by operator" do assert_includes Parse::Operation.operators.keys, :readable_by end @@ -19,138 +22,110 @@ class TestAclQueryConstraints < Minitest::Test assert_equal :readable_by, op.operator end - it "builds empty array constraint for no read permissions" do - constraint = Parse::Constraint::ReadableByConstraint.new(:acl.readable_by, []) - result = constraint.build - - assert result.key?("__aggregation_pipeline") - pipeline = result["__aggregation_pipeline"] - assert_instance_of Array, pipeline - assert_equal 1, pipeline.length - - match_stage = pipeline.first["$match"] - assert match_stage.key?("$or") - # Should match empty _rperm or missing _rperm - or_conditions = match_stage["$or"] - assert_equal 2, or_conditions.length - end - - it "builds empty array constraint for 'none' string" do - constraint = Parse::Constraint::ReadableByConstraint.new(:acl.readable_by, "none") - result = constraint.build - - assert result.key?("__aggregation_pipeline") - pipeline = result["__aggregation_pipeline"] - match_stage = pipeline.first["$match"] - assert match_stage.key?("$or") + it "is a subclass of ACLReadableByConstraint" do + assert Parse::Constraint::ReadableByConstraint < Parse::Constraint::ACLReadableByConstraint end - it "builds empty array constraint for :none symbol" do - constraint = Parse::Constraint::ReadableByConstraint.new(:acl.readable_by, :none) - result = constraint.build - - assert result.key?("__aggregation_pipeline") - pipeline = result["__aggregation_pipeline"] - match_stage = pipeline.first["$match"] - assert match_stage.key?("$or") + # Empty intent ([] / "none" / :none / nil) -> explicit-empty _rperm match + # (NOT missing, which Parse Server treats as public). Single $match, no $or. 
+ [[], "none", :none, nil].each do |empty_value| + it "builds explicit-empty match for #{empty_value.inspect}" do + constraint = Parse::Constraint::ReadableByConstraint.new(:acl.readable_by, empty_value) + result = constraint.build + assert_equal( + [{ "$match" => { "_rperm" => { "$exists" => true, "$eq" => [] } } }], + result["__aggregation_pipeline"], + ) + end end - it "builds $in constraint for user ID string" do + it "builds public-inclusive $or for a user ID string" do constraint = Parse::Constraint::ReadableByConstraint.new(:acl.readable_by, "user123") - result = constraint.build - - assert result.key?("__aggregation_pipeline") - pipeline = result["__aggregation_pipeline"] - match_stage = pipeline.first["$match"] - assert_equal({ "$in" => ["user123"] }, match_stage["_rperm"]) - end - - it "builds $in constraint for role string" do - constraint = Parse::Constraint::ReadableByConstraint.new(:acl.readable_by, "role:Admin") - result = constraint.build - - pipeline = result["__aggregation_pipeline"] - match_stage = pipeline.first["$match"] - assert_equal({ "$in" => ["role:Admin"] }, match_stage["_rperm"]) - end - - it "converts :public to *" do - constraint = Parse::Constraint::ReadableByConstraint.new(:acl.readable_by, :public) - result = constraint.build - - pipeline = result["__aggregation_pipeline"] - match_stage = pipeline.first["$match"] - assert_equal({ "$in" => ["*"] }, match_stage["_rperm"]) - end - - it "converts 'public' string to *" do - constraint = Parse::Constraint::ReadableByConstraint.new(:acl.readable_by, "public") - result = constraint.build - - pipeline = result["__aggregation_pipeline"] - match_stage = pipeline.first["$match"] - assert_equal({ "$in" => ["*"] }, match_stage["_rperm"]) - end - - it "handles array of mixed permissions" do + match = constraint.build["__aggregation_pipeline"].first["$match"] + assert_equal( + { "$or" => [ + { "_rperm" => { "$in" => ["user123", "*"] } }, + { "_rperm" => { "$exists" => false } }, + ] }, + match, + ) 
+ end + + [:public, "public", "*"].each do |pub| + it "maps #{pub.inspect} to the public wildcard" do + constraint = Parse::Constraint::ReadableByConstraint.new(:acl.readable_by, pub) + match = constraint.build["__aggregation_pipeline"].first["$match"] + assert_equal( + { "$or" => [ + { "_rperm" => { "$in" => ["*"] } }, + { "_rperm" => { "$exists" => false } }, + ] }, + match, + ) + end + end + + it "handles an array of mixed permissions (public deduped)" do constraint = Parse::Constraint::ReadableByConstraint.new(:acl.readable_by, ["user123", "role:Admin", "*"]) - result = constraint.build - - pipeline = result["__aggregation_pipeline"] - match_stage = pipeline.first["$match"] - in_array = match_stage["_rperm"]["$in"] - assert_includes in_array, "user123" - assert_includes in_array, "role:Admin" - assert_includes in_array, "*" + in_array = constraint.build["__aggregation_pipeline"].first["$match"]["$or"].first["_rperm"]["$in"] + assert_equal(["user123", "role:Admin", "*"], in_array) end - it "extracts user ID from Parse::User" do + it "extracts user ID from Parse::User (role expansion best-effort)" do user = Parse::User.new user.id = "abc123" constraint = Parse::Constraint::ReadableByConstraint.new(:acl.readable_by, user) - result = constraint.build - - pipeline = result["__aggregation_pipeline"] - match_stage = pipeline.first["$match"] - assert_equal({ "$in" => ["abc123"] }, match_stage["_rperm"]) + in_array = constraint.build["__aggregation_pipeline"].first["$match"]["$or"].first["_rperm"]["$in"] + assert_includes in_array, "abc123" + assert_includes in_array, "*" end - it "extracts role name from Parse::Role" do + it "extracts role name from Parse::Role (self always included)" do role = Parse::Role.new role.name = "Editor" constraint = Parse::Constraint::ReadableByConstraint.new(:acl.readable_by, role) - result = constraint.build + in_array = constraint.build["__aggregation_pipeline"].first["$match"]["$or"].first["_rperm"]["$in"] + assert_includes in_array, 
"role:Editor" + assert_includes in_array, "*" + end - pipeline = result["__aggregation_pipeline"] - match_stage = pipeline.first["$match"] - assert_equal({ "$in" => ["role:Editor"] }, match_stage["_rperm"]) + it "strict mode (readable_by_exact) suppresses public and missing-field branches" do + constraint = Parse::Constraint::ACLReadableByExactConstraint.new(:acl.readable_by_exact, "role:Admin") + match = constraint.build["__aggregation_pipeline"].first["$match"] + assert_equal({ "_rperm" => { "$in" => ["role:Admin"] } }, match) end end - # Test WriteableByConstraint - describe "WriteableByConstraint" do + # WriteableByConstraint (British spelling) is now an alias of + # ACLWritableByConstraint — the previous strict, non-expanding fork is gone. + describe "WriteableByConstraint (alias of ACLWritableByConstraint)" do it "registers :writeable_by and :writable_by operators" do assert_includes Parse::Operation.operators.keys, :writeable_by assert_includes Parse::Operation.operators.keys, :writable_by end - it "builds empty array constraint for no write permissions" do - constraint = Parse::Constraint::WriteableByConstraint.new(:acl.writeable_by, []) - result = constraint.build + it ":writeable_by resolves to the same implementation as :writable_by" do + assert Parse::Constraint::WriteableByConstraint < Parse::Constraint::ACLWritableByConstraint + end - assert result.key?("__aggregation_pipeline") - pipeline = result["__aggregation_pipeline"] - match_stage = pipeline.first["$match"] - assert match_stage.key?("$or") + it "builds explicit-empty match for []" do + constraint = Parse::Constraint::WriteableByConstraint.new(:acl.writeable_by, []) + assert_equal( + [{ "$match" => { "_wperm" => { "$exists" => true, "$eq" => [] } } }], + constraint.build["__aggregation_pipeline"], + ) end - it "builds $in constraint for user ID" do + it "builds public-inclusive $or for a user ID (writeable == writable now)" do constraint = 
Parse::Constraint::WriteableByConstraint.new(:acl.writeable_by, "user456") - result = constraint.build - - pipeline = result["__aggregation_pipeline"] - match_stage = pipeline.first["$match"] - assert_equal({ "$in" => ["user456"] }, match_stage["_wperm"]) + match = constraint.build["__aggregation_pipeline"].first["$match"] + assert_equal( + { "$or" => [ + { "_wperm" => { "$in" => ["user456", "*"] } }, + { "_wperm" => { "$exists" => false } }, + ] }, + match, + ) end end @@ -160,19 +135,24 @@ class TestAclQueryConstraints < Minitest::Test assert_includes Parse::Operation.operators.keys, :not_readable_by end - it "builds $nin constraint" do + it "builds $nin constraint including public, with $exists guard" do constraint = Parse::Constraint::NotReadableByConstraint.new(:acl.not_readable_by, "user123") - result = constraint.build + match = constraint.build["__aggregation_pipeline"].first["$match"] + # "not readable by user" must also exclude publicly-readable rows, so + # "*" is added; the $exists:true guard excludes missing-_rperm (public) + # rows that $nin would otherwise match. 
+ assert_equal({ "$exists" => true, "$nin" => ["user123", "*"] }, match["_rperm"]) + end - pipeline = result["__aggregation_pipeline"] - match_stage = pipeline.first["$match"] - assert_equal({ "$nin" => ["user123"] }, match_stage["_rperm"]) + it "for '*' (not_publicly_readable) excludes only public + missing" do + constraint = Parse::Constraint::NotReadableByConstraint.new(:acl.not_readable_by, "*") + match = constraint.build["__aggregation_pipeline"].first["$match"] + assert_equal({ "$exists" => true, "$nin" => ["*"] }, match["_rperm"]) end it "returns empty pipeline for empty array" do constraint = Parse::Constraint::NotReadableByConstraint.new(:acl.not_readable_by, []) result = constraint.build - assert result.key?("__aggregation_pipeline") assert_empty result["__aggregation_pipeline"] end @@ -185,13 +165,10 @@ class TestAclQueryConstraints < Minitest::Test assert_includes Parse::Operation.operators.keys, :not_writable_by end - it "builds $nin constraint" do + it "builds $nin constraint including public, with $exists guard" do constraint = Parse::Constraint::NotWriteableByConstraint.new(:acl.not_writeable_by, "user123") - result = constraint.build - - pipeline = result["__aggregation_pipeline"] - match_stage = pipeline.first["$match"] - assert_equal({ "$nin" => ["user123"] }, match_stage["_wperm"]) + match = constraint.build["__aggregation_pipeline"].first["$match"] + assert_equal({ "$exists" => true, "$nin" => ["user123", "*"] }, match["_wperm"]) end end @@ -202,25 +179,31 @@ class TestAclQueryConstraints < Minitest::Test assert_includes Parse::Operation.operators.keys, :master_key_only end - it "builds constraint for private ACL (true)" do + it "private (true) matches explicit-empty _rperm AND _wperm, excluding missing" do constraint = Parse::Constraint::PrivateAclConstraint.new(:acl.private_acl, true) - result = constraint.build - - pipeline = result["__aggregation_pipeline"] - match_stage = pipeline.first["$match"] - # Should have $and with conditions for 
both _rperm and _wperm being empty - assert match_stage.key?("$and") - assert_equal 2, match_stage["$and"].length - end - - it "builds constraint for non-private ACL (false)" do + match = constraint.build["__aggregation_pipeline"].first["$match"] + assert_equal( + { "$and" => [ + { "_rperm" => { "$exists" => true, "$eq" => [] } }, + { "_wperm" => { "$exists" => true, "$eq" => [] } }, + ] }, + match, + ) + # A missing _rperm is PUBLIC, not private — must NOT appear here. + refute_includes match.to_json, '"$exists":false' + end + + it "non-private (false) is the exact complement ($nor of the private match)" do constraint = Parse::Constraint::PrivateAclConstraint.new(:acl.private_acl, false) - result = constraint.build - - pipeline = result["__aggregation_pipeline"] - match_stage = pipeline.first["$match"] - # Should have $or to match objects with some permissions - assert match_stage.key?("$or") + match = constraint.build["__aggregation_pipeline"].first["$match"] + assert match.key?("$nor") + assert_equal( + [{ "$and" => [ + { "_rperm" => { "$exists" => true, "$eq" => [] } }, + { "_wperm" => { "$exists" => true, "$eq" => [] } }, + ] }], + match["$nor"], + ) end end diff --git a/test/lib/parse/query/hint_mongo_direct_integration_test.rb b/test/lib/parse/query/hint_mongo_direct_integration_test.rb new file mode 100644 index 0000000..7481cf1 --- /dev/null +++ b/test/lib/parse/query/hint_mongo_direct_integration_test.rb @@ -0,0 +1,109 @@ +# encoding: UTF-8 +# frozen_string_literal: true + +require_relative "../../../test_helper_integration" +require "timeout" + +# Integration test for Query#hint against a REAL MongoDB connection. +# +# The companion unit test (hint_mongo_direct_test.rb) proves the SDK forwards +# `hint:` into Parse::MongoDB.aggregate's options. This test proves the driver +# actually applies it end-to-end: +# (a) .hint("<existing index>").results_direct returns the seeded rows. 
+# (b) .hint("<bogus index>").results_direct fails fast with a raw +# Mongo::Error::OperationFailure (the documented stale-hint signal — the +# SDK only wraps code-50 timeouts, so a bad hint propagates unwrapped). +# +# Requires the Docker test stack (PARSE_TEST_USE_DOCKER=true) and the mongo gem. +# Run with: +# PARSE_TEST_USE_DOCKER=true bundle exec ruby -Ilib:test \ +# test/lib/parse/query/hint_mongo_direct_integration_test.rb +class HintItem < Parse::Object + parse_class "HintItem" + property :category, :string + property :label, :string +end + +class HintMongoDirectIntegrationTest < Minitest::Test + include ParseStackIntegrationTest + + # Same URI the other direct integration tests use. + MONGODB_URI = (ENV["PARSE_TEST_MONGO_URI"] || "mongodb://admin:password@localhost:29017/parse_stack_next_it?authSource=admin") + + INDEX_NAME = "hint_integ_category_1" + CATEGORY = "hint_integ_seeds" + + def setup + super + skip "Docker integration tests require PARSE_TEST_USE_DOCKER=true" unless ENV["PARSE_TEST_USE_DOCKER"] == "true" + + begin + require "mongo" + require "parse/mongodb" + Parse::MongoDB.configure(uri: MONGODB_URI, enabled: true) + # configure is lazy (connection deferred to first query). Ping now so an + # auth/connectivity wrinkle surfaces here as a clean skip instead of an + # error mid-test. Use a server ping, not a collection op — the DB was just + # reset, so HintItem's collection doesn't exist yet (listing its indexes + # would raise NamespaceNotFound and skip for the wrong reason). + Parse::MongoDB.client.database.command(ping: 1) + rescue LoadError => e + skip "mongo gem not available: #{e.message}" + rescue => e + skip "MongoDB unavailable: #{e.class}: #{e.message}" + end + end + + def teardown + Parse::MongoDB.reset! if defined?(Parse::MongoDB) + super + end + + # Seed a few rows and create the named index they can be queried under. + def seed_and_index! 
+ 3.times do |i| + item = HintItem.new(category: CATEGORY, label: "item_#{i}") + assert item.save, "setup: failed to save HintItem #{i}" + end + # Let the writes commit before the direct read (mirrors sibling tests). + sleep 0.5 + Parse::MongoDB.collection("HintItem").indexes.create_one({ "category" => 1 }, name: INDEX_NAME) + end + + # (a) A real, named index is honored and rows come back. + def test_hint_named_index_returns_rows + Timeout.timeout(30) do + seed_and_index! + + results = HintItem.query(category: CATEGORY) + .hint(INDEX_NAME) + .results_direct(master: true) + + refute_empty results, + ".hint(#{INDEX_NAME.inspect}).results_direct must return the seeded rows" + assert_equal 3, results.size, + "all seeded rows must be visible under the forced index hint" + assert(results.all? { |r| r.category == CATEGORY }, + "every returned row must belong to the seeded category") + end + end + + # (b) A bogus index name fails fast — the documented stale-hint signal. + def test_hint_nonexistent_index_raises_operation_failure + Timeout.timeout(30) do + item = HintItem.new(category: CATEGORY, label: "negative_case") + assert item.save, "setup: failed to save HintItem" + sleep 0.5 + + err = assert_raises(Mongo::Error::OperationFailure) do + HintItem.query(category: CATEGORY) + .hint("nonexistent_index_xyz") + .results_direct(master: true) + end + # Loose match — wording varies across MongoDB versions; the class is the + # primary assertion. 
+ assert_match(/hint|index|nonexistent_index_xyz/i, err.message, + "OperationFailure message should reference the bad hint") + end + end +end diff --git a/test/lib/parse/query/regex_unicode_integration_test.rb b/test/lib/parse/query/regex_unicode_integration_test.rb new file mode 100644 index 0000000..47c9c4e --- /dev/null +++ b/test/lib/parse/query/regex_unicode_integration_test.rb @@ -0,0 +1,114 @@ +# encoding: UTF-8 +# frozen_string_literal: true + +require_relative "../../../test_helper_integration" +require "timeout" + +# Integration test for the opt-in Unicode regex flag against a REAL +# Parse Server + MongoDB stack. +# +# The companion unit test (regex_unicode_option_unit_test.rb) pins the +# compiled wire shape (`$regex` + `$options: "iu"`). This test proves both +# execution paths actually ACCEPT that shape — MongoDB's documented +# `$options` letters are i/m/x/s, and the `u` flag is only valid on the +# PCRE2 engine (MongoDB 6.1+ / Parse Server 8.3.0+), so a regression here +# would surface as a server-side error on every unicode-flagged query +# while the wire-shape unit test kept passing: +# (a) REST: `.results` through Parse Server. +# (b) mongo-direct: `.results_direct` straight into the driver. +# +# Requires the Docker test stack (PARSE_TEST_USE_DOCKER=true) and the +# mongo gem. Run with: +# PARSE_TEST_USE_DOCKER=true bundle exec ruby -Ilib:test \ +# test/lib/parse/query/regex_unicode_integration_test.rb +class UnicodeRegexItem < Parse::Object + parse_class "UnicodeRegexItem" + property :name, :string +end + +class RegexUnicodeIntegrationTest < Minitest::Test + include ParseStackIntegrationTest + + # Same URI the other direct integration tests use. 
+ MONGODB_URI = (ENV["PARSE_TEST_MONGO_URI"] || "mongodb://admin:password@localhost:29017/parse_stack_next_it?authSource=admin") + + def setup + super + skip "Docker integration tests require PARSE_TEST_USE_DOCKER=true" unless ENV["PARSE_TEST_USE_DOCKER"] == "true" + + begin + require "mongo" + require "parse/mongodb" + Parse::MongoDB.configure(uri: MONGODB_URI, enabled: true) + Parse::MongoDB.client.database.command(ping: 1) + rescue LoadError => e + skip "mongo gem not available: #{e.message}" + rescue => e + skip "MongoDB unavailable: #{e.class}: #{e.message}" + end + end + + def teardown + Parse::MongoDB.reset! if defined?(Parse::MongoDB) + super + end + + def seed! + ["CAFÉ corner", "café latte", "plain coffee"].each_with_index do |name, i| + item = UnicodeRegexItem.new(name: name) + assert item.save, "setup: failed to save UnicodeRegexItem #{i}" + end + # Let the writes commit before the reads (mirrors sibling tests). + sleep 0.5 + end + + # (a) REST path: Parse Server accepts $options "iu" and folds the + # non-ASCII É/é pair case-insensitively. + def test_unicode_contains_executes_over_rest + Timeout.timeout(30) do + seed! + + results = UnicodeRegexItem.query( + :name.contains => { value: "café", unicode: true }, + ).results + + names = results.map(&:name).sort + assert_equal ["CAFÉ corner", "café latte"], names, + "unicode contains over REST must fold É/é and exclude the ASCII row" + end + end + + # (b) mongo-direct path: the driver accepts $options "iu" (PCRE2 engine) + # with identical results. The load-bearing assertion is that no + # OperationFailure is raised — `u` is not one of MongoDB's documented + # i/m/x/s letters, so acceptance is an engine property, not a given. + def test_unicode_contains_executes_mongo_direct + Timeout.timeout(30) do + seed! 
+ + results = UnicodeRegexItem.query( + :name.contains => { value: "café", unicode: true }, + ).results_direct(master: true) + + names = results.map(&:name).sort + assert_equal ["CAFÉ corner", "café latte"], names, + "unicode contains over mongo-direct must fold É/é and exclude the ASCII row" + end + end + + # The `like` hash form (explicit Regexp + unicode flag) executes + # mongo-direct as well. + def test_unicode_like_executes_mongo_direct + Timeout.timeout(30) do + seed! + + results = UnicodeRegexItem.query( + :name.like => { value: /CAFÉ/i, unicode: true }, + ).results_direct(master: true) + + names = results.map(&:name).sort + assert_equal ["CAFÉ corner", "café latte"], names, + "unicode like over mongo-direct must fold É/é and exclude the ASCII row" + end + end +end diff --git a/test/lib/parse/regex_unicode_option_unit_test.rb b/test/lib/parse/regex_unicode_option_unit_test.rb new file mode 100644 index 0000000..cf8bc54 --- /dev/null +++ b/test/lib/parse/regex_unicode_option_unit_test.rb @@ -0,0 +1,99 @@ +# encoding: UTF-8 +# frozen_string_literal: true + +require_relative "../../test_helper" + +# Unit coverage for the opt-in `{ value:, unicode: true }` form on the regex +# builders. The bare-value form must compile byte-for-byte as before; the +# unicode flag adds `u` to the compiled `$options` for correct multibyte +# (e.g. accented / CJK) case-insensitive matching. +class RegexUnicodeOptionUnitTest < Minitest::Test + def where_clause(constraint) + Parse::Query.new("Post").where(constraint).compile(encode: false)[:where] + end + + # -------------------------------------------------------------------------- + # Bare forms are unchanged (back-compat guard). 
+ # -------------------------------------------------------------------------- + + def test_starts_with_bare_form_unchanged + assert_equal({ "name" => { :$regex => "^John", :$options => "i" } }, + where_clause(:name.starts_with => "John")) + end + + def test_contains_bare_form_unchanged + assert_equal({ "title" => { :$regex => ".*parse.*", :$options => "i" } }, + where_clause(:title.contains => "parse")) + end + + def test_ends_with_bare_form_unchanged + assert_equal({ "name" => { :$regex => "\\.pdf$", :$options => "i" } }, + where_clause(:name.ends_with => ".pdf")) + end + + def test_like_bare_form_uses_inline_flags + # The bare Regexp form stringifies to PCRE inline flags and emits no + # $options. This is the pre-existing behavior we must not change. + assert_equal({ "name" => { :$regex => "(?i-mx:Bob)" } }, + where_clause(:name.like => /Bob/i)) + end + + # -------------------------------------------------------------------------- + # Unicode opt-in appends `u`. + # -------------------------------------------------------------------------- + + def test_starts_with_unicode_opt_in + assert_equal({ "name" => { :$regex => "^café", :$options => "iu" } }, + where_clause(:name.starts_with => { value: "café", unicode: true })) + end + + def test_contains_unicode_opt_in + assert_equal({ "title" => { :$regex => ".*café.*", :$options => "iu" } }, + where_clause(:title.contains => { value: "café", unicode: true })) + end + + def test_ends_with_unicode_opt_in + assert_equal({ "title" => { :$regex => "café$", :$options => "iu" } }, + where_clause(:title.ends_with => { value: "café", unicode: true })) + end + + def test_like_unicode_opt_in_with_casefold + assert_equal({ "name" => { :$regex => "café", :$options => "iu" } }, + where_clause(:name.like => { value: /café/i, unicode: true })) + end + + def test_like_unicode_opt_in_without_casefold + assert_equal({ "name" => { :$regex => "café", :$options => "u" } }, + where_clause(:name.like => { value: /café/, unicode: true })) + 
end + + # -------------------------------------------------------------------------- + # Hash form without the flag does not leak `u`. + # -------------------------------------------------------------------------- + + def test_starts_with_hash_without_unicode_keeps_i + assert_equal({ "name" => { :$regex => "^John", :$options => "i" } }, + where_clause(:name.starts_with => { value: "John" })) + end + + def test_starts_with_unicode_false_keeps_i + assert_equal({ "name" => { :$regex => "^John", :$options => "i" } }, + where_clause(:name.starts_with => { value: "John", unicode: false })) + end + + def test_like_hash_form_without_unicode_emits_structured_shape + # The hash form always compiles to the explicit $regex/$options shape + # (not inline flags), so casefold becomes an explicit `i`. + assert_equal({ "name" => { :$regex => "Bob", :$options => "i" } }, + where_clause(:name.like => { value: /Bob/i })) + end + + # -------------------------------------------------------------------------- + # String keys in the opt-in hash are accepted. + # -------------------------------------------------------------------------- + + def test_string_keys_in_opt_in_hash + assert_equal({ "name" => { :$regex => "^café", :$options => "iu" } }, + where_clause(:name.starts_with => { "value" => "café", "unicode" => true })) + end +end diff --git a/test/lib/parse/retrieval_pointer_filter_test.rb b/test/lib/parse/retrieval_pointer_filter_test.rb new file mode 100644 index 0000000..6910013 --- /dev/null +++ b/test/lib/parse/retrieval_pointer_filter_test.rb @@ -0,0 +1,101 @@ +# encoding: UTF-8 +# frozen_string_literal: true + +require_relative "../../test_helper" + +# Unit tests for Parse::Retrieval.translate_pointer_filter_values — the +# v5.5 storage-form translation of pointer VALUES in caller-supplied +# filters: { owner: <Pointer _User$abc> } => { "_p_owner" => "_User$abc" }. 
+class RetrievalPointerFilterTest < Minitest::Test + class PFDoc < Parse::Object + parse_class "PFDoc" + property :title, :string + property :status, :string + belongs_to :owner, as: :user + end + + def translate(filter) + Parse::Retrieval.translate_pointer_filter_values(PFDoc, filter) + end + + def pointer + Parse::Pointer.new("_User", "abc123") + end + + def pointer_hash + { "__type" => "Pointer", "className" => "_User", "objectId" => "abc123" } + end + + def test_nil_and_non_hash_pass_through + assert_nil translate(nil) + assert_equal "x", Parse::Retrieval.translate_pointer_filter_values(PFDoc, "x") + end + + def test_plain_values_untouched + f = { "status" => "published", "title" => "hello" } + assert_equal f, translate(f) + end + + def test_parse_pointer_value_translates_to_storage_form + out = translate({ owner: pointer }) + assert_equal({ "_p_owner" => "_User$abc123" }, out) + end + + def test_wire_pointer_hash_translates + out = translate({ "owner" => pointer_hash }) + assert_equal({ "_p_owner" => "_User$abc123" }, out) + end + + def test_symbol_keyed_pointer_hash_translates + out = translate({ owner: { __type: "Pointer", className: "_User", objectId: "abc123" } }) + assert_equal({ "_p_owner" => "_User$abc123" }, out) + end + + def test_pointer_inside_in_operator + out = translate({ owner: { "$in" => [pointer, pointer_hash] } }) + assert_equal({ "_p_owner" => { "$in" => %w[_User$abc123 _User$abc123] } }, out) + end + + def test_pointer_inside_eq_and_ne + out = translate({ owner: { "$ne" => pointer } }) + assert_equal({ "_p_owner" => { "$ne" => "_User$abc123" } }, out) + end + + def test_operator_hash_without_pointers_untouched + f = { "plays" => { "$gt" => 100 } } + assert_equal f, translate(f) + end + + def test_incomplete_pointer_hash_not_translated + f = { "owner" => { "__type" => "Pointer", "className" => "_User" } } + assert_equal f, translate(f) + end + + def test_translation_is_idempotent + once = translate({ owner: pointer }) + assert_equal once, 
translate(once) + end + + def test_mixed_filter_translates_only_pointer_entries + out = translate({ "status" => "published", :owner => pointer }) + assert_equal({ "status" => "published", "_p_owner" => "_User$abc123" }, out) + end + + def test_retrieve_applies_translation_to_filters + captured = {} + fake = lambda do |text:, k:, field:, filter:, vector_filter:, raw:, **_opts| + captured[:filter] = filter + captured[:vector_filter] = vector_filter + [] + end + PFDoc.stub(:find_similar, fake) do + Parse::Retrieval.retrieve( + query: "find docs", klass: PFDoc, text_field: :title, + filter: { owner: pointer }, + vector_filter: { "owner" => pointer_hash }, + ) + end + assert_equal({ "_p_owner" => "_User$abc123" }, captured[:filter]) + assert_equal({ "_p_owner" => "_User$abc123" }, captured[:vector_filter]) + end +end diff --git a/test/lib/parse/search_index_migrator_tenant_filter_test.rb b/test/lib/parse/search_index_migrator_tenant_filter_test.rb new file mode 100644 index 0000000..b119994 --- /dev/null +++ b/test/lib/parse/search_index_migrator_tenant_filter_test.rb @@ -0,0 +1,101 @@ +# encoding: UTF-8 +# frozen_string_literal: true + +require_relative "../../test_helper" + +# Unit tests for the v5.5 SearchIndexMigrator augmentation: vectorSearch +# index declarations on classes with a registered agent_tenant_scope are +# auto-extended with the scope field as a `type: "filter"` path (so the +# tenant pre-filter Parse::Retrieval folds into $vectorSearch.filter is +# always covered by the deployed index). 
+class SearchIndexMigratorTenantFilterTest < Minitest::Test + class TFDoc < Parse::Object + parse_class "TFDoc" + property :title, :string + property :tenant_key, :string + + mongo_search_index "tfdoc_vec", { + "fields" => [ + { "type" => "vector", "path" => "embedding", + "numDimensions" => 4, "similarity" => "cosine" }, + ], + }, type: "vectorSearch" + + mongo_search_index "tfdoc_lex", { + "mappings" => { "dynamic" => true }, + } + end + + class TFCovered < Parse::Object + parse_class "TFCovered" + property :tenant_key, :string + + mongo_search_index "tfcovered_vec", { + "fields" => [ + { "type" => "vector", "path" => "embedding", + "numDimensions" => 4, "similarity" => "cosine" }, + { "type" => "filter", "path" => "tenantKey" }, + ], + }, type: "vectorSearch" + end + + def with_tenant_scope(class_name, field) + Parse::Agent::MetadataRegistry.register_tenant_scope(class_name, field, from: ->(_a) { "t" }) + yield + ensure + Parse::Agent::MetadataRegistry.instance_variable_get(:@tenant_scope_rules)&.delete(class_name) + end + + def declared_for(klass) + Parse::Schema::SearchIndexMigrator.new(klass).plan[:declared] + end + + def filter_paths(decl) + (decl[:definition]["fields"] || []).select { |f| (f["type"] || f[:type]).to_s == "filter" } + .map { |f| (f["path"] || f[:path]).to_s } + end + + def test_no_tenant_scope_leaves_declaration_untouched + decls = declared_for(TFDoc) + vec = decls.find { |d| d[:name] == "tfdoc_vec" } + assert_empty filter_paths(vec) + end + + def test_tenant_scope_field_auto_added_as_filter_path + with_tenant_scope("TFDoc", :tenant_key) do + decls = declared_for(TFDoc) + vec = decls.find { |d| d[:name] == "tfdoc_vec" } + # tenant_key columnizes to its wire name. + assert_equal ["tenantKey"], filter_paths(vec) + # The vector entry is preserved. 
+ types = vec[:definition]["fields"].map { |f| f["type"] } + assert_includes types, "vector" + end + end + + def test_lexical_declaration_never_augmented + with_tenant_scope("TFDoc", :tenant_key) do + decls = declared_for(TFDoc) + lex = decls.find { |d| d[:name] == "tfdoc_lex" } + refute lex[:definition].key?("fields") + end + end + + def test_already_covered_declaration_unchanged + with_tenant_scope("TFCovered", :tenant_key) do + decls = declared_for(TFCovered) + vec = decls.find { |d| d[:name] == "tfcovered_vec" } + assert_equal ["tenantKey"], filter_paths(vec) + assert_equal 2, vec[:definition]["fields"].length + end + end + + def test_original_declaration_not_mutated + with_tenant_scope("TFDoc", :tenant_key) do + declared_for(TFDoc) + raw = TFDoc.mongo_search_index_declarations.find { |d| d[:name] == "tfdoc_vec" } + assert_equal 1, raw[:definition]["fields"].length, + "augmentation must not write back into the frozen declaration" + end + end +end diff --git a/test/lib/parse/vector_index_drift_test.rb b/test/lib/parse/vector_index_drift_test.rb new file mode 100644 index 0000000..8f20057 --- /dev/null +++ b/test/lib/parse/vector_index_drift_test.rb @@ -0,0 +1,257 @@ +# encoding: UTF-8 +# frozen_string_literal: true + +require_relative "../../test_helper" +require "parse/embeddings" + +# Unit tests for first-query vectorSearch index drift verification +# (Parse::Core::VectorSearchable#verify_vector_index!): dimension / +# similarity mismatch detection, tenant-scope filter-path coverage, +# the warn/raise/ignore policy, and the once-per-(field,index) cache. 
+class VectorIndexDriftTest < Minitest::Test + def self.register + Parse::Embeddings.register(:fx_drift, Parse::Embeddings::Fixture.new(dimensions: 4)) + end + register + + class DriftItem < Parse::Object + parse_class "DriftItem" + property :title, :string + property :embedding, :vector, dimensions: 4, provider: :fx_drift, similarity: :cosine + end + + def teardown + Parse::VectorSearch.index_drift_policy = :warn + DriftItem.instance_variable_set(:@_verified_vector_indexes, nil) + end + + def index_fixture(dims: 4, similarity: "cosine", filters: [], name: "drift_idx") + fields = [{ "type" => "vector", "path" => "embedding", + "numDimensions" => dims, "similarity" => similarity }] + filters.each { |p| fields << { "type" => "filter", "path" => p } } + { "name" => name, "type" => "vectorSearch", + "latestDefinition" => { "fields" => fields } } + end + + def findings_for(idx) + DriftItem.send(:vector_index_drift_findings, :embedding, idx) + end + + def test_in_sync_index_yields_no_findings + assert_empty findings_for(index_fixture) + end + + def test_dimension_mismatch_detected + findings = findings_for(index_fixture(dims: 1536)) + assert_equal 1, findings.length + assert_includes findings.first, "numDimensions=1536" + assert_includes findings.first, "dimensions: 4" + end + + def test_similarity_mismatch_detected + findings = findings_for(index_fixture(similarity: "dotProduct")) + assert_equal 1, findings.length + assert_includes findings.first, "dotProduct" + end + + def test_missing_index_similarity_is_not_drift + idx = index_fixture + idx["latestDefinition"]["fields"].first.delete("similarity") + assert_empty findings_for(idx) + end + + def test_warn_policy_does_not_raise + Parse::VectorSearch.index_drift_policy = :warn + out = capture_warn do + DriftItem.send(:verify_vector_index!, :embedding, index_fixture(dims: 99)) + end + assert_includes out, "[Parse::VectorSearch:DRIFT]" + end + + def test_raise_policy_raises_with_findings + 
Parse::VectorSearch.index_drift_policy = :raise + err = assert_raises(Parse::Core::VectorSearchable::IndexDriftError) do + DriftItem.send(:verify_vector_index!, :embedding, index_fixture(dims: 99)) + end + assert_equal 1, err.findings.length + end + + def test_ignore_policy_skips_verification + Parse::VectorSearch.index_drift_policy = :ignore + out = capture_warn do + DriftItem.send(:verify_vector_index!, :embedding, index_fixture(dims: 99)) + end + assert_equal "", out + end + + def test_raise_policy_raises_on_every_query + Parse::VectorSearch.index_drift_policy = :raise + bad = index_fixture(dims: 99) + assert_raises(Parse::Core::VectorSearchable::IndexDriftError) do + DriftItem.send(:verify_vector_index!, :embedding, bad) + end + # Strict mode: the cached findings keep raising — a drifted index + # must never serve results after the first failure. + assert_raises(Parse::Core::VectorSearchable::IndexDriftError) do + DriftItem.send(:verify_vector_index!, :embedding, bad) + end + end + + def test_findings_computed_once_per_field_index_pair + Parse::VectorSearch.index_drift_policy = :warn + good = index_fixture + capture_warn { DriftItem.send(:verify_vector_index!, :embedding, good) } + cache = DriftItem.instance_variable_get(:@_verified_vector_indexes) + assert_equal [], cache["embedding|drift_idx"] + # Second call returns via the cache without recomputing findings. 
+ DriftItem.stub(:vector_index_drift_findings, ->(*) { flunk "recomputed" }) do + DriftItem.send(:verify_vector_index!, :embedding, good) + end + end + + def test_warn_policy_warns_only_on_first_check + Parse::VectorSearch.index_drift_policy = :warn + bad = index_fixture(dims: 99) + first = capture_warn { DriftItem.send(:verify_vector_index!, :embedding, bad) } + assert_includes first, "[Parse::VectorSearch:DRIFT]" + second = capture_warn { DriftItem.send(:verify_vector_index!, :embedding, bad) } + assert_equal "", second + end + + def test_policy_escalation_after_first_check_takes_effect + # A deployment that boots under :warn and flips to :raise (e.g. in a + # console) should start failing without a process restart. + Parse::VectorSearch.index_drift_policy = :warn + bad = index_fixture(dims: 99) + capture_warn { DriftItem.send(:verify_vector_index!, :embedding, bad) } + Parse::VectorSearch.index_drift_policy = :raise + assert_raises(Parse::Core::VectorSearchable::IndexDriftError) do + DriftItem.send(:verify_vector_index!, :embedding, bad) + end + end + + def test_policy_writer_validates + assert_raises(ArgumentError) { Parse::VectorSearch.index_drift_policy = :loud } + Parse::VectorSearch.index_drift_policy = :raise + assert_equal :raise, Parse::VectorSearch.index_drift_policy + end + + def test_policy_writer_rejects_nil_with_argument_error + err = assert_raises(ArgumentError) { Parse::VectorSearch.index_drift_policy = nil } + assert_includes err.message, "must be one of" + err = assert_raises(ArgumentError) { Parse::VectorSearch.index_drift_policy = 42 } + assert_includes err.message, "must be one of" + end + + # ---- verify_explicit_vector_index (explicit index: kwarg) -------------- + # The auto-discovery path verifies what it resolves; an explicit index: + # kwarg is drift-verified best-effort when the catalog's covering index + # carries the same name, and skipped (never failed) otherwise. 
+ + def test_explicit_index_with_matching_name_is_drift_verified + require "parse/atlas_search" + Parse::VectorSearch.index_drift_policy = :raise + drifted = index_fixture(dims: 99) + Parse::AtlasSearch::IndexCatalog.stub(:find_vector_index, drifted) do + assert_raises(Parse::Core::VectorSearchable::IndexDriftError) do + DriftItem.send(:verify_explicit_vector_index, :embedding, "drift_idx") + end + end + end + + def test_explicit_index_with_different_name_skips_verification + require "parse/atlas_search" + Parse::VectorSearch.index_drift_policy = :raise + # The catalog's covering index ("drift_idx") is drifted, but the + # explicit kwarg targets a different index — an override, not a + # discovery request, so verification is skipped without warning. + drifted = index_fixture(dims: 99) + Parse::AtlasSearch::IndexCatalog.stub(:find_vector_index, drifted) do + out = capture_warn do + DriftItem.send(:verify_explicit_vector_index, :embedding, "other_idx") + end + assert_equal "", out + end + end + + def test_explicit_index_skips_when_catalog_lookup_fails + require "parse/atlas_search" + Parse::VectorSearch.index_drift_policy = :raise + boom = ->(*_a, **_kw) { raise StandardError, "catalog unavailable" } + Parse::AtlasSearch::IndexCatalog.stub(:find_vector_index, boom) do + out = capture_warn do + DriftItem.send(:verify_explicit_vector_index, :embedding, "drift_idx") + end + assert_equal "", out + end + end + + def test_explicit_index_skips_when_catalog_has_no_index + require "parse/atlas_search" + Parse::VectorSearch.index_drift_policy = :raise + Parse::AtlasSearch::IndexCatalog.stub(:find_vector_index, nil) do + out = capture_warn do + DriftItem.send(:verify_explicit_vector_index, :embedding, "drift_idx") + end + assert_equal "", out + end + end + + def test_explicit_index_ignore_policy_skips_catalog_lookup + require "parse/atlas_search" + Parse::VectorSearch.index_drift_policy = :ignore + untouched = ->(*) { flunk "catalog consulted under :ignore" } + 
Parse::AtlasSearch::IndexCatalog.stub(:find_vector_index, untouched) do + out = capture_warn do + DriftItem.send(:verify_explicit_vector_index, :embedding, "drift_idx") + end + assert_equal "", out + end + end + + def test_resolve_with_explicit_index_runs_drift_verification + require "parse/atlas_search" + Parse::VectorSearch.index_drift_policy = :raise + drifted = index_fixture(dims: 99) + Parse::AtlasSearch::IndexCatalog.stub(:find_vector_index, drifted) do + assert_raises(Parse::Core::VectorSearchable::IndexDriftError) do + DriftItem.send(:resolve_vector_index!, :embedding, "drift_idx") + end + end + end + + def test_resolve_with_explicit_index_returns_it_when_in_sync + require "parse/atlas_search" + Parse::VectorSearch.index_drift_policy = :raise + Parse::AtlasSearch::IndexCatalog.stub(:find_vector_index, index_fixture) do + assert_equal "drift_idx", + DriftItem.send(:resolve_vector_index!, :embedding, "drift_idx") + end + end + + def test_tenant_scope_filter_coverage + Parse::Agent::MetadataRegistry.register_tenant_scope("DriftItem", :tenant, from: ->(_a) { "t1" }) + begin + findings = findings_for(index_fixture) # no filter path declared + assert_equal 1, findings.length + assert_includes findings.first, "tenant" + assert_includes findings.first, "filter" + + assert_empty findings_for(index_fixture(filters: ["tenant"])) + ensure + # Remove the registration so other tests see a clean registry. 
+ Parse::Agent::MetadataRegistry.instance_variable_get(:@tenant_scope_rules)&.delete("DriftItem") + end + end + + private + + def capture_warn + old_stderr = $stderr + $stderr = StringIO.new + yield + $stderr.string + ensure + $stderr = old_stderr + end +end diff --git a/test/lib/parse/vector_search_hybrid_security_test.rb b/test/lib/parse/vector_search_hybrid_security_test.rb new file mode 100644 index 0000000..cdb9a10 --- /dev/null +++ b/test/lib/parse/vector_search_hybrid_security_test.rb @@ -0,0 +1,96 @@ +# encoding: UTF-8 +# frozen_string_literal: true + +require_relative "../../test_helper" +require "parse/vector_search/hybrid" + +# Unit tests for the v5.5 hybrid-search security follow-ups: +# - NEW-VEC-2: unsupported_stage_error? no longer matches the broad +# "is not allowed" phrase (authorization errors must not be +# classified as unknown-stage probe results). +# - NEW-VEC-1: _hybrid_score for non-master native results is +# recomputed from the post-ACL visible ordering, so it carries no +# information about hidden rows. +class VectorSearchHybridSecurityTest < Minitest::Test + HYBRID = Parse::VectorSearch::Hybrid + + # ---------- NEW-VEC-2: probe-failure classification ---------- + + def classify(message) + HYBRID.send(:unsupported_stage_error?, StandardError.new(message)) + end + + def test_unknown_stage_phrases_classify_as_unsupported + assert classify("Unrecognized pipeline stage name: '$rankFusion'") + assert classify("Unknown aggregation stage $rankFusion") + assert classify("unknown stage rankFusion in pipeline") + end + + def test_authorization_error_is_not_classified_as_unsupported + # The pre-fix list included "is not allowed", which matches MongoDB + # authorization failures and would cache the wrong probe verdict. 
+ refute classify("user is not allowed to execute command aggregate with $rankFusion") + refute classify("$rankFusion is not allowed in this context") + end + + def test_unrelated_error_is_not_classified_as_unsupported + refute classify("operation exceeded time limit") + refute classify("Unrecognized pipeline stage name: '$weirdStage'") + end + + # ---------- NEW-VEC-1: visible-order score recompute ---------- + + def rows_with_scores(scores) + scores.each_with_index.map do |s, i| + { "_id" => "row#{i}", "_hybrid_score" => s } + end + end + + def test_recompute_replaces_scores_with_visible_rank_function + rows = rows_with_scores([0.0321, 0.0289, 0.0164]) + HYBRID.send(:recompute_scores_from_visible_order!, rows, + k_constant: 60, weights: nil) + # weight 1.0 + 1.0 = 2.0; rank i+1 among VISIBLE rows. + assert_in_delta 2.0 / 61, rows[0]["_hybrid_score"], 1e-12 + assert_in_delta 2.0 / 62, rows[1]["_hybrid_score"], 1e-12 + assert_in_delta 2.0 / 63, rows[2]["_hybrid_score"], 1e-12 + end + + def test_recomputed_scores_are_independent_of_hidden_rows + # Same three visible rows, but in scenario B they survived a much + # deeper fused ranking (huge raw-score gaps from hidden rows between + # them). Post-recompute the two scenarios must be indistinguishable. 
+ visible_a = rows_with_scores([0.0321, 0.0320, 0.0319]) + visible_b = rows_with_scores([0.0321, 0.0150, 0.0021]) + [visible_a, visible_b].each do |rows| + HYBRID.send(:recompute_scores_from_visible_order!, rows, + k_constant: 60, weights: nil) + end + assert_equal visible_a.map { |r| r["_hybrid_score"] }, + visible_b.map { |r| r["_hybrid_score"] } + end + + def test_recompute_preserves_descending_order + rows = rows_with_scores([0.9, 0.5, 0.1, 0.05]) + HYBRID.send(:recompute_scores_from_visible_order!, rows, + k_constant: 60, weights: nil) + scores = rows.map { |r| r["_hybrid_score"] } + assert_equal scores.sort.reverse, scores + end + + def test_recompute_honors_branch_weights + rows = rows_with_scores([0.5]) + HYBRID.send(:recompute_scores_from_visible_order!, rows, + k_constant: 60, weights: { lexical: 0.4, vector: 0.6 }) + assert_in_delta 1.0 / 61, rows[0]["_hybrid_score"], 1e-12 + end + + def test_recompute_handles_empty_and_non_hash_rows + assert_equal [], HYBRID.send(:recompute_scores_from_visible_order!, [], + k_constant: 60, weights: nil) + rows = [nil, { "_id" => "a", "_hybrid_score" => 0.5 }] + HYBRID.send(:recompute_scores_from_visible_order!, rows, + k_constant: 60, weights: nil) + assert_in_delta 2.0 / 62, rows[1]["_hybrid_score"], 1e-12 + end +end diff --git a/test/lib/parse/verify_password_rate_limit_test.rb b/test/lib/parse/verify_password_rate_limit_test.rb new file mode 100644 index 0000000..2262b5c --- /dev/null +++ b/test/lib/parse/verify_password_rate_limit_test.rb @@ -0,0 +1,128 @@ +# encoding: UTF-8 +# frozen_string_literal: true + +require_relative "../../test_helper" + +# Tests that Parse::API::Users#verify_password participates in the client-side +# login rate-limit the same way #login does: it calls check_login_rate_limit! +# before issuing the request and track_login_attempt after, keyed on the BARE +# username so failures share a bucket with #login. +# +# These do NOT stub check_login_rate_limit! 
/ track_login_attempt (that would +# make the assertions vacuous). Instead they include the real module and inject +# a fake #request so verify_password runs end-to-end without a live server. +class VerifyPasswordRateLimitTest < Minitest::Test + + # A minimal response double exposing the two surfaces verify_password touches. + class FakeResponse + def initialize(success) + @success = success + end + + def success? + @success + end + + attr_accessor :parse_class + end + + # Host class that includes the real Users module and lets the test control + # what #request returns (success vs failure) without any HTTP. + class Limiter + include Parse::API::Users + + attr_accessor :next_response + attr_reader :request_count + + def initialize + @request_count = 0 + end + + def request(*) + @request_count += 1 + @next_response + end + end + + def make_limiter(success: false) + l = Limiter.new + l.next_response = FakeResponse.new(success) + l + end + + # ========================================================================= + # Guard: a pre-existing lockout blocks verify_password before any request + # ========================================================================= + + def test_verify_password_raises_when_locked_out + limiter = make_limiter + limiter.send(:login_rate_limits)["alice"] = { + failures: 5, + locked_until: Time.now + 300 + } + + assert_raises(Parse::Error::AccountLockoutError) do + limiter.verify_password("alice", "secret") + end + assert_equal 0, limiter.request_count, + "the rate-limit guard must short-circuit before issuing the request" + end + + # ========================================================================= + # Tracking: repeated verify_password failures accumulate to a lockout + # ========================================================================= + + def test_verify_password_failures_accumulate_to_lockout + limiter = make_limiter(success: false) + failures = Parse::API::Users::LOGIN_MAX_FAILURES + + # The Nth failure trips the 
lockout; the (N+1)th call must be blocked. + failures.times { limiter.verify_password("mallory", "wrong") } + + assert_raises(Parse::Error::AccountLockoutError) do + limiter.verify_password("mallory", "wrong") + end + end + + def test_verify_password_success_clears_failure_counter + limiter = make_limiter(success: false) + # A few failures below the threshold... + failures = Parse::API::Users::LOGIN_MAX_FAILURES - 1 + failures.times { limiter.verify_password("nadia", "wrong") } + + # Precondition: the failures actually accumulated an entry. Without this, + # the test would pass even if track_login_attempt were removed entirely + # (no entry to delete, so the final assert_nil would hold vacuously). + assert_equal failures, limiter.send(:login_rate_limits)["nadia"][:failures], + "failed verify_password calls must accumulate a failure counter" + + # ...then a success deletes the bucket entry entirely. Asserting the entry + # is gone (not merely that the guard passes) is what actually proves the + # success-side track wiring — a non-zero failure count below the lockout + # threshold would also pass the guard. + limiter.next_response = FakeResponse.new(true) + limiter.verify_password("nadia", "correct") + + assert_nil limiter.send(:login_rate_limits)["nadia"], + "a successful verify_password must delete the username's rate-limit entry" + end + + # ========================================================================= + # Shared bucket: verify_password and login key on the same bare username + # ========================================================================= + + def test_verify_password_shares_lockout_bucket_with_login + limiter = make_limiter(success: false) + # Drive login-side failures to the lockout threshold for the username... 
+ Parse::API::Users::LOGIN_MAX_FAILURES.times do + limiter.send(:track_login_attempt, "trudy", false) + end + + # ...and verify_password for the SAME username is now blocked, proving the + # shared (bare-username) bucket — an attacker cannot pivot past a login + # lockout by switching to verify_password. + assert_raises(Parse::Error::AccountLockoutError) do + limiter.verify_password("trudy", "secret") + end + end +end diff --git a/test/lib/parse/webhook_aftersave_payload_fidelity_test.rb b/test/lib/parse/webhook_aftersave_payload_fidelity_test.rb index ad41edc..b71ccff 100644 --- a/test/lib/parse/webhook_aftersave_payload_fidelity_test.rb +++ b/test/lib/parse/webhook_aftersave_payload_fidelity_test.rb @@ -541,12 +541,17 @@ def lifecycle_snapshot(hook) end # Dispatch a payload through the real webhook router with a route block. + # The chained after_save/after_create callbacks fire in run_after_save_chain + # (which call! invokes once per delivery), not in call_route, so we drive it + # here too. It is a no-op for the before_save triggers, whose before_* chain + # still fires inside call_route. def dispatch_lifecycle(trigger, payload_hash, &route) LifecyclePost.seen = [] Parse::Webhooks.instance_variable_set(:@routes, nil) Parse::Webhooks.route(trigger, "LifecyclePost", &route) - Parse::Webhooks.call_route(trigger, "LifecyclePost", - Parse::Webhooks::Payload.new(payload_hash)) + payload = Parse::Webhooks::Payload.new(payload_hash) + Parse::Webhooks.call_route(trigger, "LifecyclePost", payload) + Parse::Webhooks.run_after_save_chain(payload) LifecyclePost.seen.map { |s| s[:hook] } end diff --git a/test/lib/parse/webhook_callbacks_test.rb b/test/lib/parse/webhook_callbacks_test.rb index e2413ad..30bd853 100644 --- a/test/lib/parse/webhook_callbacks_test.rb +++ b/test/lib/parse/webhook_callbacks_test.rb @@ -1,5 +1,84 @@ require_relative "../../test_helper" require "minitest/autorun" +require "stringio" + +# Real Parse::Object model for the run_after_save_chain tests. 
A class-level +# counter records how many times the chained ActiveModel after_save callback +# fired, so a double-fire (or a fire when no route is registered) is visible. +class WebhookChainModel < Parse::Object + parse_class "WebhookChainModel" + + class << self + attr_accessor :after_save_count + end + self.after_save_count = 0 + + after_save :bump_after_save + def bump_after_save + self.class.after_save_count += 1 + end +end + +# Real model whose chained after_create / after_save callbacks can be made to +# raise, to exercise the throw-mid-chain containment in run_after_save_chain. +# Each callback increments its counter BEFORE (optionally) raising, so a test +# can assert the callback actually ran. +class WebhookRaisingModel < Parse::Object + parse_class "WebhookRaisingModel" + + class << self + attr_accessor :after_create_count, :after_save_count, :raise_on + end + self.after_create_count = 0 + self.after_save_count = 0 + self.raise_on = nil # nil, :after_create, or :after_save + + after_create :on_after_create + after_save :on_after_save + + def on_after_create + self.class.after_create_count += 1 + raise "boom in after_create" if self.class.raise_on == :after_create + end + + def on_after_save + self.class.after_save_count += 1 + raise "boom in after_save" if self.class.raise_on == :after_save + end +end + +# Real model with TWO after_save callbacks that BOTH raise, plus an after_create +# probe. Used to prove the containment is at the *chain* level (ActiveModel halts +# the rest of the phase once a callback raises) rather than a per-callback wrap: +# exactly one of the two after_save callbacks should run, not both. Order-agnostic +# (whichever ActiveModel runs first raises and halts the other). 
+class WebhookHaltModel < Parse::Object + parse_class "WebhookHaltModel" + + class << self + attr_accessor :after_save_ran, :after_create_ran + end + self.after_save_ran = [] + self.after_create_ran = 0 + + after_create :note_create + after_save :cb_one + after_save :cb_two + + def note_create + self.class.after_create_ran += 1 + end + + def cb_one + self.class.after_save_ran << :one + raise "boom in cb_one" + end + + def cb_two + self.class.after_save_ran << :two + raise "boom in cb_two" + end +end class WebhookCallbacksTest < Minitest::Test def setup @@ -155,6 +234,11 @@ def test_before_save_callback_handling puts "✅ Client-initiated before_save on update skips before_create" end + # The chained ActiveModel after_save/after_create callbacks no longer fire + # inside call_route -- the dispatch moved to Parse::Webhooks.run_after_save_chain, + # which call! invokes exactly once per delivery (after both the class route and + # the "*" route). So callback-firing assertions drive run_after_save_chain + # directly; result-normalization assertions still drive call_route. 
def test_after_save_callback_handling puts "\n=== Testing After Save Callback Handling ===" @@ -189,6 +273,7 @@ def test_after_save_callback_handling ruby_new_payload.define_singleton_method(:original) { nil } result = Parse::Webhooks.call_route(:after_save, "TestObject", ruby_new_payload) + Parse::Webhooks.run_after_save_chain(ruby_new_payload) refute after_create_called, "after_create should not be called for Ruby-initiated new objects" refute after_save_called, "after_save should not be called for Ruby-initiated new objects" @@ -214,6 +299,7 @@ def test_after_save_callback_handling client_new_payload.define_singleton_method(:original) { nil } result = Parse::Webhooks.call_route(:after_save, "TestObject", client_new_payload) + Parse::Webhooks.run_after_save_chain(client_new_payload) assert after_create_called, "after_create should be called for client-initiated new objects" assert after_save_called, "after_save should be called for client-initiated new objects" @@ -240,6 +326,7 @@ def test_after_save_callback_handling ruby_existing_payload.define_singleton_method(:original) { { "name" => "old" } } result = Parse::Webhooks.call_route(:after_save, "TestObject", ruby_existing_payload) + Parse::Webhooks.run_after_save_chain(ruby_existing_payload) refute after_create_called, "after_create should not be called for existing objects" # Previously this asserted after_save WAS called, which was a bug: a @@ -271,6 +358,7 @@ def test_after_save_callback_handling client_existing_payload.define_singleton_method(:original) { { "name" => "old" } } result = Parse::Webhooks.call_route(:after_save, "TestObject", client_existing_payload) + Parse::Webhooks.run_after_save_chain(client_existing_payload) refute after_create_called, "after_create should not be called for existing objects" assert after_save_called, "after_save should be called for client-initiated existing objects" @@ -312,6 +400,7 @@ def test_after_save_handler_returning_object_still_fires_callbacks 
client_new_payload.define_singleton_method(:original) { nil } result = Parse::Webhooks.call_route(:after_save, "TestObject", client_new_payload) + Parse::Webhooks.run_after_save_chain(client_new_payload) assert after_create_called, "after_create must fire even when handler returns the object" assert after_save_called, "after_save must fire even when handler returns the object" @@ -336,6 +425,7 @@ def test_after_save_handler_returning_object_still_fires_callbacks ruby_payload.define_singleton_method(:original) { nil } result = Parse::Webhooks.call_route(:after_save, "TestObject", ruby_payload) + Parse::Webhooks.run_after_save_chain(ruby_payload) refute after_create_called, "after_create must stay suppressed for trusted-ruby-initiated saves" refute after_save_called, "after_save must stay suppressed for trusted-ruby-initiated saves" @@ -388,6 +478,7 @@ def test_webhook_integration_with_request_idempotency webhook_payload.define_singleton_method(:original) { nil } result = Parse::Webhooks.call_route(:after_save, "TestObject", webhook_payload) + Parse::Webhooks.run_after_save_chain(webhook_payload) refute callback_called, "Ruby callbacks should not be called for Ruby-initiated webhook" assert_equal true, result, "Webhook should still return success" @@ -407,6 +498,7 @@ def test_webhook_integration_with_request_idempotency callback_called = false result = Parse::Webhooks.call_route(:after_save, "TestObject", client_webhook_payload) + Parse::Webhooks.run_after_save_chain(client_webhook_payload) assert callback_called, "Ruby callbacks should be called for client-initiated webhook" assert_equal true, result, "Webhook should return success" @@ -510,4 +602,301 @@ def test_multiple_webhook_handlers assert_equal true, result, "Should return result from last handler" puts "✓ Multiple handlers execute with correct client_initiated flag" end + + # The chain must honor the "unregistered afterSave trigger never fires model + # callbacks" contract that call_route's early `return 
unless routes[...]` + # provided. With the firing moved into run_after_save_chain, this guard is + # the load-bearing correctness check: without it, every afterSave delivery + # for a class with no registered handler would start firing the model's + # callbacks. + def test_run_after_save_chain_does_not_fire_without_a_registered_route + puts "\n=== Testing run_after_save_chain route-present guard ===" + + WebhookChainModel.after_save_count = 0 + + # No after_save route registered for WebhookChainModel (or "*"). + payload = Parse::Webhooks::Payload.new( + "triggerName" => "afterSave", + "object" => { "className" => "WebhookChainModel", "objectId" => "noroute1" }, + ) + assert_kind_of Parse::Object, payload.parse_object, "sanity: payload builds a real object" + + Parse::Webhooks.run_after_save_chain(payload) + + assert_equal 0, WebhookChainModel.after_save_count, + "model after_save must NOT fire when no afterSave route is registered" + puts "✓ No registered route => no model callbacks" + + # Registering a "*" route is enough to satisfy the guard (client-initiated, + # so not suppressed) and the chain fires exactly once. + Parse::Webhooks.route(:after_save, "*") { true } + Parse::Webhooks.run_after_save_chain(payload) + assert_equal 1, WebhookChainModel.after_save_count, + 'a registered "*" route lets the chain fire once' + puts '✓ Registered "*" route => chain fires once' + end + + # Marquee regression: call! dispatches every trigger twice (the specific class + # route AND the generic "*" route). When the chained callbacks fired inside + # call_route, an app that registered BOTH routes ran the model's after_save + # twice per delivery (e.g. two emails). With the dispatch moved into + # run_after_save_chain -- invoked once by call! after both route calls -- the + # model callback fires exactly once regardless of how many routes match. + def test_call_fires_after_save_chain_once_with_both_class_and_wildcard_routes + puts "\n=== Testing call! 
fires after_save chain once (class + wildcard) ===" + + WebhookChainModel.after_save_count = 0 + + class_fires = 0 + wildcard_fires = 0 + Parse::Webhooks.route(:after_save, "WebhookChainModel") { class_fires += 1; true } + Parse::Webhooks.route(:after_save, "*") { wildcard_fires += 1; true } + + body = JSON.generate( + "triggerName" => "afterSave", + "object" => { "className" => "WebhookChainModel", "objectId" => "bothroutes1" }, + ) + + with_rack_webhook_env do + status, _headers, resp = Parse::Webhooks.call( + rack_env(body: body, path: "/webhooks/afterSave/WebhookChainModel") + ) + assert_equal 200, status, "afterSave delivery should succeed" + assert_equal({ "success" => true }, JSON.parse(resp.join)) + end + + # Both route handlers run (specific + wildcard) ... + assert_equal 1, class_fires, "class-route handler should run once" + assert_equal 1, wildcard_fires, "wildcard-route handler should run once" + # ... but the chained model callback fires EXACTLY once across both. + assert_equal 1, WebhookChainModel.after_save_count, + "model after_save must fire exactly once even with both routes registered" + puts "✓ Model after_save fired exactly once across class + wildcard routes" + end + + # The suppression decision (trusted-Ruby-initiated => skip the webhook-side + # callbacks, local run_callbacks :save already fired them) must still hold + # end-to-end through call! AND under the dual class+"*" dispatch -- the exact + # intersection this change touches. Both route handlers run, but the model + # callback fires ZERO times webhook-side. + def test_call_suppresses_trusted_ruby_callbacks_even_with_both_routes + puts "\n=== Testing call! 
suppresses trusted-ruby callbacks (both routes) ===" + + WebhookChainModel.after_save_count = 0 + + class_fires = 0 + wildcard_fires = 0 + Parse::Webhooks.route(:after_save, "WebhookChainModel") { class_fires += 1; true } + Parse::Webhooks.route(:after_save, "*") { wildcard_fires += 1; true } + + # Trusted-Ruby-initiated: _RB_ request id (nested, as Parse Server sends it) + # AND master:true. + body = JSON.generate( + "triggerName" => "afterSave", + "master" => true, + "object" => { "className" => "WebhookChainModel", "objectId" => "trusted1" }, + "headers" => { "x-parse-request-id" => "_RB_trusted_both_routes" }, + ) + + with_rack_webhook_env do + status, _headers, resp = Parse::Webhooks.call( + rack_env(body: body, path: "/webhooks/afterSave/WebhookChainModel") + ) + assert_equal 200, status + assert_equal({ "success" => true }, JSON.parse(resp.join)) + end + + assert_equal 1, class_fires, "class-route handler still runs" + assert_equal 1, wildcard_fires, "wildcard-route handler still runs" + assert_equal 0, WebhookChainModel.after_save_count, + "trusted-ruby-initiated save must NOT fire webhook-side callbacks " \ + "(the local run_callbacks :save is the single fire)" + puts "✓ Trusted-ruby suppression holds through call! with both routes" + end + + # An afterSave UPDATE (original present) with both routes must also fire the + # after_save chain exactly once -- the create-only marquee test doesn't cover + # the update path. + def test_call_fires_after_save_chain_once_on_update_with_both_routes + puts "\n=== Testing call! 
fires after_save once on update (both routes) ===" + + WebhookChainModel.after_save_count = 0 + Parse::Webhooks.route(:after_save, "WebhookChainModel") { true } + Parse::Webhooks.route(:after_save, "*") { true } + + body = JSON.generate( + "triggerName" => "afterSave", + "object" => { "className" => "WebhookChainModel", "objectId" => "upd1" }, + "original" => { "className" => "WebhookChainModel", "objectId" => "upd1" }, + ) + + with_rack_webhook_env do + status, _headers, _resp = Parse::Webhooks.call( + rack_env(body: body, path: "/webhooks/afterSave/WebhookChainModel") + ) + assert_equal 200, status + end + + assert_equal 1, WebhookChainModel.after_save_count, + "after_save fires exactly once on an update with both routes" + puts "✓ Update fires after_save exactly once across both routes" + end + + # The containment is chain-level, not per-callback: when a callback raises, the + # REST of that phase's chain is halted (ActiveModel semantics). With two + # raising after_save callbacks, exactly ONE runs (not both) -- a per-callback + # rescue refactor would let both run and would fail this. The sibling + # after_create phase still runs, and the endpoint stays 200. 
+ def test_call_halts_rest_of_phase_chain_when_a_callback_raises + puts "\n=== Testing chain-level halt when an after_save callback raises ===" + + WebhookHaltModel.after_save_ran = [] + WebhookHaltModel.after_create_ran = 0 + + Parse::Webhooks.route(:after_save, "WebhookHaltModel") { true } + body = JSON.generate( + "triggerName" => "afterSave", + "object" => { "className" => "WebhookHaltModel", "objectId" => "halt1" }, + # no "original" => a create, so after_create runs as the sibling phase + ) + + with_rack_webhook_env do + status, _headers, resp = Parse::Webhooks.call( + rack_env(body: body, path: "/webhooks/afterSave/WebhookHaltModel") + ) + assert_equal 200, status, "endpoint stays 200 despite the raising callback" + assert_equal({ "success" => true }, JSON.parse(resp.join)) + end + + assert_equal 1, WebhookHaltModel.after_create_ran, + "the sibling after_create phase still ran" + assert_equal 1, WebhookHaltModel.after_save_ran.size, + "exactly ONE after_save callback ran -- the raise halted the rest " \ + "of the chain (a per-callback wrap would let both run)" + puts "✓ A raising callback halts the rest of its phase; sibling phase + 200 intact" + end + + # Symmetric to the after_create log assertion: a raising after_save is also + # contained-but-logged (not silently swallowed). 
+ def test_run_after_save_chain_logs_a_raising_after_save + puts "\n=== Testing a raising after_save is logged, not silent ===" + + WebhookRaisingModel.after_create_count = 0 + WebhookRaisingModel.after_save_count = 0 + WebhookRaisingModel.raise_on = :after_save + + Parse::Webhooks.route(:after_save, "WebhookRaisingModel") { true } + payload = Parse::Webhooks::Payload.new( + "triggerName" => "afterSave", + "object" => { "className" => "WebhookRaisingModel", "objectId" => "raise3" }, + ) + + _out, err = capture_io { Parse::Webhooks.run_after_save_chain(payload) } + + assert_equal 1, WebhookRaisingModel.after_save_count, "after_save ran (and raised)" + assert_match(/after_save callback raised/, err, + "the contained after_save failure must be logged") + puts "✓ Raising after_save is logged" + ensure + WebhookRaisingModel.raise_on = nil + end + + # afterSave fires AFTER the object is already persisted, and Parse Server + # discards the response body. So a chained after_create callback that raises + # must not (a) propagate out and skip the unrelated after_save side effects, + # nor (b) crash the dispatcher. run_after_save_chain runs the two phases + # independently, swallowing+logging the raise. + def test_run_after_save_chain_contains_a_raising_after_create_and_still_fires_after_save + puts "\n=== Testing run_after_save_chain contains a raising after_create ===" + + WebhookRaisingModel.after_create_count = 0 + WebhookRaisingModel.after_save_count = 0 + WebhookRaisingModel.raise_on = :after_create + + Parse::Webhooks.route(:after_save, "WebhookRaisingModel") { true } + payload = Parse::Webhooks::Payload.new( + "triggerName" => "afterSave", + "object" => { "className" => "WebhookRaisingModel", "objectId" => "raise1" }, + # no "original" => a create, so after_create runs first + ) + + # Must not raise out of the dispatcher. 
+ _out, err = capture_io { Parse::Webhooks.run_after_save_chain(payload) } + + assert_equal 1, WebhookRaisingModel.after_create_count, + "after_create ran (and raised) once" + assert_equal 1, WebhookRaisingModel.after_save_count, + "after_save still fired even though after_create raised" + # The failure is contained, but NOT silent -- it is logged so the unrelated + # work isn't dropped without a trace. + assert_match(/after_create callback raised/, err, + "the contained after_create failure must be logged, not swallowed silently") + puts "✓ Raising after_create is contained (and logged); after_save still fires" + ensure + WebhookRaisingModel.raise_on = nil + end + + # End-to-end through call!: a chained callback raising must NOT 500 the + # webhook endpoint. call!'s rescue only catches ResponseError / + # ValidationError, so an unguarded StandardError from a callback would escape + # and crash the response. The phase guard keeps the delivery a 200 success + # (the object is already saved; the response is discarded by Parse Server). + def test_call_returns_success_when_a_chained_after_save_callback_raises + puts "\n=== Testing call! survives a raising after_save callback ===" + + WebhookRaisingModel.after_create_count = 0 + WebhookRaisingModel.after_save_count = 0 + WebhookRaisingModel.raise_on = :after_save + + Parse::Webhooks.route(:after_save, "WebhookRaisingModel") { true } + body = JSON.generate( + "triggerName" => "afterSave", + "object" => { "className" => "WebhookRaisingModel", "objectId" => "raise2" }, + ) + + status = nil + resp = nil + with_rack_webhook_env do + status, _headers, resp = Parse::Webhooks.call( + rack_env(body: body, path: "/webhooks/afterSave/WebhookRaisingModel") + ) + end + + assert_equal 200, status, "endpoint must stay 200 when a chained callback raises" + assert_equal({ "success" => true }, JSON.parse(resp.join)) + assert_equal 1, WebhookRaisingModel.after_save_count, + "after_save callback actually ran (and raised)" + puts "✓ call! 
returns success despite a raising chained callback" + ensure + WebhookRaisingModel.raise_on = nil + end + + # ========================================================================== + # Rack entry-point (#call!) harness -- drives the real production path on a + # raw body. Mirrors the helper in webhook_non_object_triggers_test.rb. + # ========================================================================== + def with_rack_webhook_env + saved_key = Parse::Webhooks.instance_variable_get(:@key) + saved_allow = Parse::Webhooks.instance_variable_get(:@allow_unauthenticated) + saved_logging = Parse::Webhooks.logging + Parse::Webhooks.instance_variable_set(:@key, nil) + Parse::Webhooks.instance_variable_set(:@allow_unauthenticated, true) + Parse::Webhooks.logging = false + Parse::Webhooks::ReplayProtection.reset! + capture_io { yield } + ensure + Parse::Webhooks.instance_variable_set(:@key, saved_key) + Parse::Webhooks.instance_variable_set(:@allow_unauthenticated, saved_allow) + Parse::Webhooks.logging = saved_logging + end + + def rack_env(body:, path:) + { + "REQUEST_METHOD" => "POST", + "CONTENT_TYPE" => "application/json", + "PATH_INFO" => path, + "rack.input" => StringIO.new(body), + "CONTENT_LENGTH" => body.bytesize.to_s, + } + end end diff --git a/test/lib/parse/webhook_triggers_test.rb b/test/lib/parse/webhook_triggers_test.rb index fa68dd5..1d2d9ef 100644 --- a/test/lib/parse/webhook_triggers_test.rb +++ b/test/lib/parse/webhook_triggers_test.rb @@ -121,6 +121,10 @@ def test_after_save_trigger ruby_payload.define_singleton_method(:original) { { "name" => "old" } } result = Parse::Webhooks.call_route(:after_save, "TestObject", ruby_payload) + # The chained model callbacks fire in run_after_save_chain (call! invokes it + # once per delivery), not in call_route. This request carries _RB_ but NOT + # master, so it is not trusted-Ruby-initiated and the callbacks still fire. 
+ Parse::Webhooks.run_after_save_chain(ruby_payload) assert hook_called, "after_save hook should be called" assert hook_payload.after_save?, "Payload should identify as after_save" @@ -150,6 +154,7 @@ def test_after_save_trigger test_object.define_singleton_method(:run_after_create_callbacks) { callback_executed = true } result = Parse::Webhooks.call_route(:after_save, "TestObject", client_payload) + Parse::Webhooks.run_after_save_chain(client_payload) assert hook_called, "after_save hook should be called for client" assert hook_payload.after_save?, "Payload should identify as after_save"