Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 32 additions & 8 deletions src/apify/_configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from decimal import Decimal
from logging import getLogger
from pathlib import Path
from typing import Annotated, Any, Self
from typing import TYPE_CHECKING, Annotated, Any, Self

from pydantic import AliasChoices, BeforeValidator, Field, model_validator
from typing_extensions import TypedDict
Expand All @@ -23,6 +23,9 @@
)
from apify._utils import docs_group

if TYPE_CHECKING:
from collections.abc import Callable

logger = getLogger(__name__)


Expand All @@ -34,6 +37,20 @@ def _transform_to_list(value: Any) -> list[str] | None:
return value if isinstance(value, list) else str(value).split(',')


def _default_if_empty(*, default: Any) -> Callable[[Any], Any]:
"""Build a validator that substitutes `default` for an empty-string env var.

The Apify platform sometimes sets an env var to an empty string instead of leaving it unset. For fields whose
target type cannot parse `''` (datetimes, numbers, booleans, ...), passing the value straight through would crash
validation and, in turn, `Actor.init()`. Treat `''` as "not provided" and fall back to the field default instead.
"""

def transform(value: Any) -> Any:
return default if value == '' else value

return transform


class ActorStorages(TypedDict):
"""Mapping of storage aliases to their IDs, grouped by storage type.

Expand All @@ -59,9 +76,9 @@ def _load_storage_keys(data: None | str | ActorStorages) -> ActorStorages | None
`ActorStorages` dict when set programmatically.

Returns:
Normalized storage mapping, or `None` if the input is `None`.
Normalized storage mapping, or `None` if the input is `None` or an empty string.
"""
if data is None:
if data is None or data == '':
return None
storage_mapping = json.loads(data) if isinstance(data, str) else data
return {
Expand Down Expand Up @@ -206,6 +223,7 @@ class Configuration(CrawleeConfiguration):
alias='apify_dedicated_cpus',
description='Number of CPU cores reserved for the actor, based on allocated memory',
),
BeforeValidator(_default_if_empty(default=None)),
] = None

default_dataset_id: Annotated[
Expand Down Expand Up @@ -286,6 +304,7 @@ class Configuration(CrawleeConfiguration):
alias='apify_is_at_home',
description='True if the Actor is running on Apify servers',
),
BeforeValidator(_default_if_empty(default=False)),
] = False

max_paid_dataset_items: Annotated[
Expand All @@ -294,7 +313,7 @@ class Configuration(CrawleeConfiguration):
alias='actor_max_paid_dataset_items',
description='For paid-per-result Actors, the user-set limit on returned results. Do not exceed this limit',
),
BeforeValidator(lambda val: val if val != '' else None),
BeforeValidator(_default_if_empty(default=None)),
] = None

max_total_charge_usd: Annotated[
Expand All @@ -303,7 +322,7 @@ class Configuration(CrawleeConfiguration):
alias='actor_max_total_charge_usd',
description='For pay-per-event Actors, the user-set limit on total charges. Do not exceed this limit',
),
BeforeValidator(lambda val: val if val != '' else None),
BeforeValidator(_default_if_empty(default=None)),
] = None

test_pay_per_event: Annotated[
Expand All @@ -312,6 +331,7 @@ class Configuration(CrawleeConfiguration):
alias='actor_test_pay_per_event',
description='Enable pay-per-event functionality for local development',
),
BeforeValidator(_default_if_empty(default=False)),
] = False

meta_origin: Annotated[
Expand All @@ -328,6 +348,7 @@ class Configuration(CrawleeConfiguration):
alias='apify_metamorph_after_sleep_millis',
description='How long the Actor needs to wait before exiting after triggering a metamorph',
),
BeforeValidator(_default_if_empty(default=timedelta(minutes=5))),
] = timedelta(minutes=5)

proxy_hostname: Annotated[
Expand All @@ -352,6 +373,7 @@ class Configuration(CrawleeConfiguration):
alias='apify_proxy_port',
description='Port to communicate with the Apify proxy',
),
BeforeValidator(_default_if_empty(default=8000)),
] = 8000

proxy_status_url: Annotated[
Expand All @@ -371,6 +393,7 @@ class Configuration(CrawleeConfiguration):
),
description='Date when the Actor was started',
),
BeforeValidator(_default_if_empty(default=None)),
] = None

timeout_at: Annotated[
Expand All @@ -382,7 +405,7 @@ class Configuration(CrawleeConfiguration):
),
description='Date when the Actor will time out',
),
BeforeValidator(lambda val: val if val != '' else None), # We should accept empty environment variables as well
BeforeValidator(_default_if_empty(default=None)),
] = None

standby_url: Annotated[
Expand Down Expand Up @@ -416,7 +439,7 @@ class Configuration(CrawleeConfiguration):
alias='apify_user_is_paying',
description='True if the user calling the Actor is paying user',
),
BeforeValidator(lambda val: False if val == '' else val),
BeforeValidator(_default_if_empty(default=False)),
] = False

web_server_port: Annotated[
Expand All @@ -429,6 +452,7 @@ class Configuration(CrawleeConfiguration):
description='TCP port for the Actor to start an HTTP server on'
'This server can be used to receive external messages or expose monitoring and control interfaces',
),
BeforeValidator(_default_if_empty(default=4321)),
] = 4321

web_server_url: Annotated[
Expand Down Expand Up @@ -470,7 +494,7 @@ class Configuration(CrawleeConfiguration):
alias='apify_charged_actor_event_counts',
description='Counts of events that were charged for the actor',
),
BeforeValidator(lambda data: json.loads(data) if isinstance(data, str) else data or None),
BeforeValidator(lambda data: json.loads(data) if isinstance(data, str) and data else data or None),
] = None

actor_storages: Annotated[
Expand Down
40 changes: 40 additions & 0 deletions tests/unit/actor/test_configuration.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import json
from datetime import timedelta
from decimal import Decimal
from pathlib import Path

Expand Down Expand Up @@ -373,6 +374,13 @@ def test_actor_pricing_info_env_var_empty_becomes_none(monkeypatch: pytest.Monke
assert config.actor_pricing_info is None


def test_charged_event_counts_env_var_empty_string_becomes_none(monkeypatch: pytest.MonkeyPatch) -> None:
"""Test that an empty env var for charged_event_counts is converted to None instead of crashing."""
monkeypatch.setenv('APIFY_CHARGED_ACTOR_EVENT_COUNTS', '')
config = ApifyConfiguration()
assert config.charged_event_counts is None


def test_actor_storage_json_env_var(monkeypatch: pytest.MonkeyPatch) -> None:
"""Test that actor_storages_json is parsed from JSON env var."""
datasets = {'default': 'default_dataset_id', 'custom': 'custom_dataset_id'}
Expand All @@ -392,3 +400,35 @@ def test_actor_storage_json_env_var(monkeypatch: pytest.MonkeyPatch) -> None:
assert config.actor_storages['datasets'] == datasets
assert config.actor_storages['request_queues'] == request_queues
assert config.actor_storages['key_value_stores'] == key_value_stores


def test_actor_storages_env_var_empty_string_becomes_none(monkeypatch: pytest.MonkeyPatch) -> None:
"""Test that an empty env var for actor_storages is converted to None instead of crashing."""
monkeypatch.setenv('ACTOR_STORAGES_JSON', '')

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't this actually less defensive?

Is there any actual use case for having 'ACTOR_STORAGES_JSON' be set as an empty string? The env variable clearly states it should be a json. So either have json there, or don't set the variable.

Code should not try to guess the intention from an ambiguous action. And here we are guessing that an empty env variable probably means the user did not want to set it up in the first place...

config = ApifyConfiguration()
assert config.actor_storages is None


@pytest.mark.parametrize(
('env_var', 'attr', 'expected'),
[
('APIFY_STARTED_AT', 'started_at', None),
('APIFY_TIMEOUT_AT', 'timeout_at', None),
('APIFY_DEDICATED_CPUS', 'dedicated_cpus', None),
('ACTOR_MAX_PAID_DATASET_ITEMS', 'max_paid_dataset_items', None),
('ACTOR_MAX_TOTAL_CHARGE_USD', 'max_total_charge_usd', None),
('APIFY_PROXY_PORT', 'proxy_port', 8000),
('ACTOR_WEB_SERVER_PORT', 'web_server_port', 4321),
('APIFY_METAMORPH_AFTER_SLEEP_MILLIS', 'metamorph_after_sleep', timedelta(minutes=5)),
('APIFY_IS_AT_HOME', 'is_at_home', False),
('ACTOR_TEST_PAY_PER_EVENT', 'test_pay_per_event', False),
('APIFY_USER_IS_PAYING', 'user_is_paying', False),
],
)
def test_typed_env_var_empty_string_falls_back_to_default(
monkeypatch: pytest.MonkeyPatch, env_var: str, attr: str, expected: object
) -> None:
"""Platform may set a typed env var to '' instead of leaving it unset; that must not crash `Actor.init()`."""
monkeypatch.setenv(env_var, '')
config = ApifyConfiguration()
assert getattr(config, attr) == expected
Loading