diff --git a/agent/agent-priv.h b/agent/agent-priv.h index 2c5392e7..c08f989f 100644 --- a/agent/agent-priv.h +++ b/agent/agent-priv.h @@ -89,7 +89,6 @@ struct _NiceAgent GSource *event_source; gboolean full_mode; /* property: full-mode */ - GTimeVal next_check_tv; /* property: next conncheck timestamp */ gchar *stun_server_ip; /* property: STUN server IP */ guint stun_server_port; /* property: STUN server port */ gchar *proxy_ip; /* property: Proxy server IP */ diff --git a/agent/agent.c b/agent/agent.c index 1461b3ce..8d978c07 100644 --- a/agent/agent.c +++ b/agent/agent.c @@ -3281,13 +3281,13 @@ _priv_set_socket_tos (NiceAgent * agent, NiceSocket * sock, gint tos) if (sock->fileno && setsockopt (g_socket_get_fd (sock->fileno), IPPROTO_IP, IP_TOS, (const char *) &tos, sizeof (tos)) < 0) { - GST_WARNING_OBJECT (agent, "Could not set socket ToS", g_strerror (errno)); + GST_WARNING_OBJECT (agent, "Could not set socket ToS: %s", g_strerror (errno)); } #ifdef IPV6_TCLASS if (sock->fileno && setsockopt (g_socket_get_fd (sock->fileno), IPPROTO_IPV6, IPV6_TCLASS, (const char *) &tos, sizeof (tos)) < 0) { - GST_DEBUG_OBJECT (agent, "Could not set IPV6 socket ToS", + GST_DEBUG_OBJECT (agent, "Could not set IPV6 socket ToS: %s", g_strerror (errno)); } #endif diff --git a/agent/conncheck.c b/agent/conncheck.c index 5edf20dc..a876f196 100644 --- a/agent/conncheck.c +++ b/agent/conncheck.c @@ -163,11 +163,9 @@ static void priv_print_stream_diagnostics (NiceAgent* agent, Stream* stream) priv_print_check_list (agent, stream, stream->valid_list, "Valid list"); } -static int priv_timer_expired (GTimeVal *timer, GTimeVal *now) +static int priv_timer_expired (gint64 timer, gint64 now) { - return (now->tv_sec == timer->tv_sec) ? - now->tv_usec >= timer->tv_usec : - now->tv_sec >= timer->tv_sec; + return now >= timer; } static void priv_set_pair_state (NiceAgent* agent, CandidateCheckPair* pair, NiceCheckState new_state) @@ -201,13 +199,163 @@ static CandidateCheckPair* priv_alloc_check_pair (NiceAgent* agent, Stream* stre } /* - * Convert TURN lifetime into a refresh interval IN MILLISECONDS. Refresh 30 seconds before - * expiry, turn message parsing has already checked against a minimum supported lifetime of - * 60 seconds + * Convert a TURN allocation lifetime (seconds, as returned by the server) + * into the time we should wait before sending the next Refresh + * (milliseconds). + * + * History: this function previously returned `(lifetime - 30) * 1000`, + * i.e. it scheduled the refresh 30 s before expiry, despite the comment + * promising "1 minute before expiry". On a lossy path, 30 s is not + * enough to absorb a single retransmission cycle (default STUN timer is + * 600 ms doubling, with 3 retransmissions => up to 9 s, plus server + * processing and possibly a 438 round trip). We now refresh much more + * conservatively: at most halfway through the lifetime, and at least + * 10 s before expiry. `lifetime` is uint32_t and may, in degenerate + * inputs, be very small or zero; guard against underflow. The seconds + * -> milliseconds conversion is done in 64-bit and saturated to + * G_MAXUINT to avoid wrapping when a server reports a pathologically + * large lifetime (the timeout APIs we feed only accept `guint`). + */ +/* + * Test-only knob, mirroring the pattern used in socket/turn.c for + * NICE_TURN_BINDING_TIMEOUT and NICE_TURN_PERMISSION_TIMEOUT. When this + * environment variable is set to a positive integer (in seconds), it + * overrides the RFC-derived schedule computed below so the allocation + * Refresh code path can be exercised quickly in unit tests instead of + * waiting minutes per cycle. Re-read on every refresh-interval + * computation so that a test that sets the variable before + * nice_agent_gather_candidates() is honoured even if some earlier + * code path in the same process already evaluated this helper before + * the variable was set. NOT part of the public API and NOT for + * production use. + */ +#define ENV_NICE_TURN_EXPIRE_TIMEOUT "NICE_TURN_EXPIRE_TIMEOUT" + +static guint +priv_turn_expire_timeout_override_secs (void) +{ + const gchar *v = g_getenv (ENV_NICE_TURN_EXPIRE_TIMEOUT); + gchar *end = NULL; + guint64 parsed; + + if (v == NULL || *v == '\0') + return 0; + + parsed = g_ascii_strtoull (v, &end, 10); + if (end == v || *end != '\0' || parsed == 0 || parsed > G_MAXUINT) + return 0; + + return (guint) parsed; +} + +static guint priv_turn_lifetime_to_refresh_interval(uint32_t lifetime) +{ + uint32_t interval_s; + guint64 interval_ms; + guint override_s; + + if (lifetime <= 20) { + /* Pathological: refresh almost immediately and let the server tell + * us off if anything is wrong. We still want at least one tick. */ + return 1000; + } + + /* Refresh after at most half the lifetime, and at least 10 s before + * expiry, whichever is sooner. */ + interval_s = lifetime / 2; + if (interval_s + 10 > lifetime) + interval_s = lifetime - 10; + if (interval_s < 5) + interval_s = 5; + + /* Test-only override: if NICE_TURN_EXPIRE_TIMEOUT=N (seconds) was set + * at process start, clamp the refresh interval to N seconds so the + * Refresh path fires on a 1-2 s cadence in unit tests instead of the + * normal multi-minute schedule. Mirrors the BINDING / PERMISSION + * knobs handled in socket/turn.c. */ + override_s = priv_turn_expire_timeout_override_secs (); + if (override_s > 0 && override_s < interval_s) + interval_s = override_s; + + interval_ms = (guint64) interval_s * 1000u; + if (interval_ms > G_MAXUINT) + interval_ms = G_MAXUINT; + return (guint) interval_ms; +} + +/* + * The LIFETIME (seconds) we explicitly request in TURN Allocate and + * Refresh requests. RFC 5766 §6.1 recommends 600 s and that is what + * most servers default to. Sending it explicitly avoids surprises when + * a server is configured with a much shorter default. */ -static uint32_t priv_turn_lifetime_to_refresh_interval(uint32_t lifetime) +#define NICE_TURN_REQUESTED_LIFETIME 600 + +/* + * Count the number of OTHER refresh candidates currently alive for the + * same (stream, component) tuple. Used to decide whether a refresh + * failure should be propagated to the application as + * agent_signal_turn_allocation_failure -- if siblings remain, the + * component still has working relay paths and the application should + * not tear the call down. + * + * Walks agent->refresh_list. The caller must hold the agent lock. + */ +static guint +priv_count_sibling_refreshes (NiceAgent *agent, CandidateRefresh *self) { - return (lifetime - 30) * 1000; + GSList *i; + guint n = 0; + + for (i = agent->refresh_list; i; i = i->next) { + CandidateRefresh *other = i->data; + + if (other == self) + continue; + if (other->stream != self->stream) + continue; + if (other->component != self->component) + continue; + n++; + } + return n; +} + +/* + * Wrapper around agent_signal_turn_allocation_failure that suppresses + * the upper-layer fatal signal when at least one sibling refresh + * candidate for the same (stream, component) is still alive. In the + * field a single relay path can hit 437 / exhausted-438-retry while + * other relay paths for the same component continue refreshing fine; + * raising a fatal signal in that case caused the call to be torn down + * even though the component still had working relay paths. + */ +static void +priv_refresh_signal_failure (CandidateRefresh *cand, + const NiceAddress *from, StunMessage *resp, const char *reason) +{ + guint siblings = priv_count_sibling_refreshes (cand->agent, cand); + + if (siblings > 0) { + GST_WARNING_OBJECT (cand->agent, + "%u/%u: TURN refresh failure on cand=%p reason=%s — suppressing " + "fatal signal because %u sibling refresh candidate(s) for this " + "component are still alive", + cand->stream->id, cand->component->id, cand, + reason ? reason : "?", siblings); + return; + } + + GST_WARNING_OBJECT (cand->agent, + "%u/%u: TURN refresh failure on cand=%p reason=%s — no sibling " + "refresh candidates, signalling fatal failure to application", + cand->stream->id, cand->component->id, cand, + reason ? reason : "?"); + + agent_signal_turn_allocation_failure (cand->agent, + cand->stream->id, cand->component->id, from, + cand->turn ? &cand->turn->type : NULL, resp, + reason ? reason : ""); } /* @@ -288,8 +436,7 @@ static gboolean priv_conn_check_initiate (NiceAgent *agent, CandidateCheckPair * * immediately, but be put into the "triggered queue", * see "7.2.1.4 Triggered Checks" */ - g_get_current_time (&pair->next_tick); - g_time_val_add (&pair->next_tick, agent->timer_ta * 1000); + pair->next_tick = g_get_real_time () + (gint64) agent->timer_ta * 1000; priv_set_pair_state (agent, pair, NICE_CHECK_IN_PROGRESS); conn_check_send (agent, pair); return TRUE; @@ -598,14 +745,14 @@ void conn_check_unfreeze_related (NiceAgent *agent, Stream *stream, CandidateChe } } -static void priv_tick_in_progress_check (NiceAgent* agent, Stream* stream, CandidateCheckPair* p, GTimeVal *now) +static void priv_tick_in_progress_check (NiceAgent* agent, Stream* stream, CandidateCheckPair* p, gint64 now) { if (p->stun_message.buffer == NULL) { GST_DEBUG_OBJECT (agent, "%u/%u: STUN connectivity check was cancelled for pair %p(%s), marking as done.", p->stream_id, p->component_id, p, p->foundation); priv_set_pair_state (agent, p, NICE_CHECK_FAILED); - } else if (priv_timer_expired (&p->next_tick, now)) { + } else if (priv_timer_expired (p->next_tick, now)) { switch (stun_timer_refresh (&p->timer)) { case STUN_USAGE_TIMER_RETURN_TIMEOUT: { @@ -639,17 +786,15 @@ static void priv_tick_in_progress_check (NiceAgent* agent, Stream* stream, Candi (gchar *)p->stun_buffer); } - /* note: convert from milli to microseconds for g_time_val_add() */ - p->next_tick = *now; - g_time_val_add (&p->next_tick, timeout * 1000); + /* note: convert from milli to microseconds */ + p->next_tick = now + (gint64) timeout * 1000; break; } case STUN_USAGE_TIMER_RETURN_SUCCESS: { unsigned int timeout = stun_timer_remainder (&p->timer); - /* note: convert from milli to microseconds for g_time_val_add() */ - p->next_tick = *now; - g_time_val_add (&p->next_tick, timeout * 1000); + /* note: convert from milli to microseconds */ + p->next_tick = now + (gint64) timeout * 1000; break; } } @@ -788,7 +933,7 @@ static void priv_nominate_highest_priority_successful_pair (NiceAgent* agent, St } } -static gboolean priv_check_for_regular_nomination (NiceAgent* agent, Stream *stream, GTimeVal *now) +static gboolean priv_check_for_regular_nomination (NiceAgent* agent, Stream *stream, gint64 now) { guint succeeded = 0, nominated = 0; GSList *i; @@ -845,7 +990,7 @@ static gboolean priv_check_for_regular_nomination (NiceAgent* agent, Stream *str * * @return will return FALSE when no more pending timers. */ -static gboolean priv_conn_check_tick_stream (Stream *stream, NiceAgent *agent, GTimeVal *now) +static gboolean priv_conn_check_tick_stream (Stream *stream, NiceAgent *agent, gint64 now) { gboolean keep_timer_going = FALSE; GSList *i; @@ -897,10 +1042,10 @@ static gboolean priv_conn_check_tick_unlocked (gpointer pointer) NiceAgent *agent = pointer; gboolean keep_timer_going = FALSE; GSList *i, *j; - GTimeVal now; + gint64 now; /* step: process ongoing STUN transactions */ - g_get_current_time (&now); + now = g_get_real_time (); /* step: find the highest priority waiting check and send it */ for (i = agent->streams; i ; i = i->next) { @@ -921,7 +1066,7 @@ static gboolean priv_conn_check_tick_unlocked (gpointer pointer) for (j = agent->streams; j; j = j->next) { Stream *stream = j->data; gboolean res = - priv_conn_check_tick_stream (stream, agent, &now); + priv_conn_check_tick_stream (stream, agent, now); if (res) keep_timer_going = res; } @@ -1102,6 +1247,9 @@ static gboolean priv_conn_keepalive_tick (gpointer pointer) } +static gboolean priv_turn_allocate_refresh_retransmissions_tick (gpointer pointer); +static void priv_turn_allocate_refresh_tick_unlocked (CandidateRefresh *cand); + static gboolean priv_turn_allocate_refresh_retransmissions_tick (gpointer pointer) { CandidateRefresh *cand = (CandidateRefresh *) pointer; @@ -1132,18 +1280,43 @@ static gboolean priv_turn_allocate_refresh_retransmissions_tick (gpointer pointe stun_message_id (&cand->stun_message, id); stun_agent_forget_transaction (&cand->stun_agent, id); - agent_signal_turn_allocation_failure(cand->agent, - cand->stream->id, - cand->component->id, - &cand->server, - cand->turn ? &cand->turn->type : NULL, - NULL, - "Allocate/Refresh timed out"); + /* A single retransmission timeout very often just means we lost + * a packet on the wire. Rather than tearing down the entire + * allocation immediately (which would manifest as "media stops + * after ~10 minutes" if the very first refresh happened to be + * lost), give it one more shot. The flag is re-armed after every + * successful refresh. */ + if (cand->tolerate_one_timeout) { + GST_WARNING_OBJECT (agent, + "%u/%u: TURN refresh #%u on cand=%p timed out after %u " + "retransmissions; trying one more refresh before giving up", + cand->stream->id, cand->component->id, + cand->refresh_count, cand, + cand->timer.max_retransmissions); + cand->tolerate_one_timeout = FALSE; + priv_turn_allocate_refresh_tick_unlocked (cand); + agent_unlock (agent); + return FALSE; + } + + GST_WARNING_OBJECT (agent, + "%u/%u: TURN refresh #%u on cand=%p timed out after %u " + "retransmissions (last lifetime %u s); tearing down allocation", + cand->stream->id, cand->component->id, + cand->refresh_count, cand, + cand->timer.max_retransmissions, cand->last_lifetime_s); + + priv_refresh_signal_failure (cand, &cand->server, NULL, + "Allocate/Refresh timed out"); refresh_cancel (cand); break; } case STUN_USAGE_TIMER_RETURN_RETRANSMIT: /* Retransmit */ + GST_DEBUG_OBJECT (agent, + "%u/%u: TURN refresh #%u on cand=%p retransmit (attempt %u/%u)", + cand->stream->id, cand->component->id, cand->refresh_count, cand, + cand->timer.retransmissions, cand->timer.max_retransmissions); nice_socket_send (cand->nicesock, &cand->server, stun_message_length (&cand->stun_message), (gchar *)cand->stun_buffer); @@ -1186,7 +1359,10 @@ static void priv_turn_allocate_refresh_tick_unlocked (CandidateRefresh *cand) buffer_len = stun_usage_turn_create_refresh (&cand->stun_agent, &cand->stun_message, cand->stun_buffer, sizeof(cand->stun_buffer), - cand->stun_resp_msg.buffer == NULL ? NULL : &cand->stun_resp_msg, -1, + cand->stun_resp_msg.buffer == NULL ? NULL : &cand->stun_resp_msg, + /* Speculative-fix #1: ask explicitly for the lifetime we want + * rather than letting the server choose. */ + NICE_TURN_REQUESTED_LIFETIME, username, username_len, password, password_len, turn_compat); @@ -1199,8 +1375,13 @@ static void priv_turn_allocate_refresh_tick_unlocked (CandidateRefresh *cand) cand->msn_turn_password = password; } - GST_DEBUG_OBJECT (cand->agent, "%u/%u: Sending allocate Refresh %u", - cand->stream->id, cand->component->id, buffer_len); + cand->refresh_count++; + GST_DEBUG_OBJECT (cand->agent, + "%u/%u: Sending TURN Refresh #%u on cand=%p (%u bytes), " + "last lifetime %u s, requested lifetime %u s", + cand->stream->id, cand->component->id, + cand->refresh_count, cand, (guint) buffer_len, + cand->last_lifetime_s, (guint) NICE_TURN_REQUESTED_LIFETIME); if (cand->tick_source != NULL) { g_source_destroy (cand->tick_source); @@ -2156,9 +2337,8 @@ int conn_check_send (NiceAgent *agent, CandidateCheckPair *pair) } timeout = stun_timer_remainder (&pair->timer); - /* note: convert from milli to microseconds for g_time_val_add() */ - g_get_current_time (&pair->next_tick); - g_time_val_add (&pair->next_tick, timeout * 1000); + /* note: convert from milli to microseconds */ + pair->next_tick = g_get_real_time () + (gint64) timeout * 1000; } else { GST_DEBUG_OBJECT (agent, "buffer is empty, cancelling conncheck"); pair->stun_message.buffer = NULL; @@ -3018,11 +3198,23 @@ priv_add_new_turn_refresh (CandidateDiscovery *cdisco, NiceCandidate *relay_cand cand->stun_resp_msg.key = NULL; } - GST_DEBUG_OBJECT (agent, "%u/%u: Adding new refresh candidate %p with timeout %d", - cand->stream->id, cand->component->id, cand, priv_turn_lifetime_to_refresh_interval(lifetime)); + /* Initialise robustness counters. The "tolerate one timeout" flag is + * enabled from the start so that a single lost Refresh request does + * not kill the allocation outright. */ + cand->refresh_count = 0; + cand->consecutive_stale_nonce = 0; + cand->last_lifetime_s = lifetime; + cand->tolerate_one_timeout = TRUE; + + GST_INFO_OBJECT (agent, + "%u/%u: TURN allocation succeeded on cand=%p, granted lifetime %u s, " + "scheduling first Refresh in %u ms, sibling refresh candidates for " + "this component: %u", + cand->stream->id, cand->component->id, cand, lifetime, + priv_turn_lifetime_to_refresh_interval(lifetime), + priv_count_sibling_refreshes (agent, cand)); /* step: also start the refresh timer */ - /* refresh should be sent 1 minute before it expires */ cand->timer_source = agent_timeout_add_with_context (agent, priv_turn_lifetime_to_refresh_interval(lifetime), priv_turn_allocate_refresh_tick, cand); @@ -3084,6 +3276,26 @@ static gboolean priv_map_reply_to_relay_request (NiceAgent *agent, StunMessage * /* case: successful allocate, create a new local candidate */ NiceAddress niceaddr; NiceCandidate *relay_cand; + uint16_t resp_nonce_len = 0; + + /* Cache the 200 OK Allocate response so that the freshly + * created refresh candidate can extract the most recent + * NONCE / REALM from it for its very first Refresh. The + * original code only stored stun_resp_msg in the 401/438 + * error path above, which means d->stun_resp_msg still held + * the long-since-superseded NONCE from the unauthenticated + * round trip. priv_add_new_turn_refresh() copies that buffer + * into the new CandidateRefresh, so without this update the + * first Refresh would echo the stale challenge nonce rather + * than the one the server issued in this success response. */ + if (stun_message_find (resp, STUN_ATTRIBUTE_NONCE, &resp_nonce_len) + != NULL) { + d->stun_resp_msg = *resp; + memcpy (d->stun_resp_buffer, resp->buffer, + stun_message_length (resp)); + d->stun_resp_msg.buffer = d->stun_resp_buffer; + d->stun_resp_msg.buffer_len = sizeof (d->stun_resp_buffer); + } /* Server reflexive candidates are only valid for UDP sockets */ if (res == STUN_USAGE_TURN_RETURN_MAPPED_SUCCESS && @@ -3148,6 +3360,18 @@ static gboolean priv_map_reply_to_relay_request (NiceAgent *agent, StunMessage * nice_turn_socket_set_ms_realm(relay_cand->sockptr, &d->stun_message); nice_turn_socket_set_ms_connection_id(relay_cand->sockptr, resp); } + } else { + /* No relay candidate was created (e.g. discovery_add_relay_candidate + * deduplicated against an existing one or failed). In that case + * priv_add_new_turn_refresh() is *not* called and no allocation + * Refresh GSource is ever armed for this discovery, so the server + * will see no REFRESH traffic for the lifetime of this allocation. + * Log loudly so this asymmetric behaviour is visible in CI. */ + GST_WARNING_OBJECT (agent, + "%u/%u: TURN allocation succeeded but no relay candidate was " + "created (granted lifetime %u s); skipping Refresh scheduling " + "for discovery=%p", d->stream->id, d->component->id, + lifetime, d); } d->stun_message.buffer = NULL; @@ -3250,14 +3474,83 @@ static gboolean priv_map_reply_to_relay_refresh (NiceAgent *agent, StunMessage * stun_message_log(resp, FALSE, (struct sockaddr *)&server_address); if (res == STUN_USAGE_TURN_RETURN_RELAY_SUCCESS) { - /* refresh should be sent 1 minute before it expires */ + guint32 next_ms = priv_turn_lifetime_to_refresh_interval(lifetime); + uint16_t old_nonce_len = 0, new_nonce_len = 0; + uint8_t *old_nonce = NULL, *new_nonce = NULL; + gboolean nonce_changed = FALSE; + + /* Cache the latest successful response so that the next + * Refresh uses the most recent NONCE / REALM. The original + * code only updated stun_resp_msg in the 438 Stale Nonce error + * path, which means every refresh after the server rotates its + * nonce costs an extra 438 round trip. Worse, if anything + * goes wrong on that retry, the allocation is torn down. */ + if (cand->stun_resp_msg.buffer != NULL) { + old_nonce = (uint8_t *) stun_message_find (&cand->stun_resp_msg, + STUN_ATTRIBUTE_NONCE, &old_nonce_len); + } + new_nonce = (uint8_t *) stun_message_find (resp, + STUN_ATTRIBUTE_NONCE, &new_nonce_len); + if (new_nonce != NULL) { + nonce_changed = (old_nonce == NULL || + old_nonce_len != new_nonce_len || + memcmp (old_nonce, new_nonce, new_nonce_len) != 0); + cand->stun_resp_msg = *resp; + memcpy (cand->stun_resp_buffer, resp->buffer, + stun_message_length (resp)); + cand->stun_resp_msg.buffer = cand->stun_resp_buffer; + cand->stun_resp_msg.buffer_len = sizeof(cand->stun_resp_buffer); + + /* Push the (possibly rotated) NONCE/REALM down into the + * TURN socket's credential cache so that the next + * CHANNELBIND / CreatePermission renewal authenticates with + * the freshly issued NONCE instead of the previous one. + * Without this, the socket-level cache only updates from + * CHANNELBIND/CreatePermission responses and lags behind + * the Refresh path, causing every renewal after a Refresh + * to incur an avoidable 438 Stale Nonce round trip. */ + if (cand->relay_socket != NULL) + nice_turn_socket_cache_realm_nonce (cand->relay_socket, + &cand->stun_resp_msg); + } + + GST_INFO_OBJECT (cand->agent, + "%u/%u: TURN Refresh #%u SUCCESS on cand=%p " + "(granted lifetime %u s, next refresh in %u ms, " + "nonce_changed=%d, consecutive_stale_nonce=%u)", + cand->stream->id, cand->component->id, cand->refresh_count, + cand, lifetime, next_ms, + nonce_changed, cand->consecutive_stale_nonce); + + cand->last_lifetime_s = lifetime; + cand->consecutive_stale_nonce = 0; + /* Arm the "one free timeout" again, since we have just + * successfully refreshed. */ + cand->tolerate_one_timeout = TRUE; + + /* Cancel any existing long-lifetime timer before scheduling a + * new one. The original code overwrote cand->timer_source + * without destroying the previous GSource, leaking it (and, + * if this branch is reached twice for the same response, + * leaving two timers racing). */ + if (cand->timer_source != NULL) { + g_source_destroy (cand->timer_source); + g_source_unref (cand->timer_source); + cand->timer_source = NULL; + } cand->timer_source = - agent_timeout_add_with_context (cand->agent, priv_turn_lifetime_to_refresh_interval(lifetime), + agent_timeout_add_with_context (cand->agent, next_ms, priv_turn_allocate_refresh_tick, cand); - g_source_destroy (cand->tick_source); - g_source_unref (cand->tick_source); - cand->tick_source = NULL; + if (cand->tick_source != NULL) { + g_source_destroy (cand->tick_source); + g_source_unref (cand->tick_source); + cand->tick_source = NULL; + } + + /* Mark the transaction as found so that the response is not + * subsequently fed to the keepalive matcher. */ + trans_found = TRUE; } else if (res == STUN_USAGE_TURN_RETURN_ERROR) { int code = -1; uint8_t *sent_realm = NULL; @@ -3270,7 +3563,28 @@ static gboolean priv_map_reply_to_relay_refresh (NiceAgent *agent, StunMessage * recv_realm = (uint8_t *) stun_message_find (resp, STUN_ATTRIBUTE_REALM, &recv_realm_len); - /* check for unauthorized error response */ + /* Diagnostic log: include the parsed error code so that a + * field engineer can see what the server actually returned. + * 437 Allocation Mismatch is called out specifically because + * it is a common coturn failure mode (server's view of the + * 5-tuple's allocation has drifted from ours). */ + { + int log_code = -1; + (void) stun_message_find_error (resp, &log_code); + GST_WARNING_OBJECT (cand->agent, + "%u/%u: TURN Refresh #%u ERROR code=%d on cand=%p " + "(consecutive_stale_nonce=%u, siblings=%u)%s", + cand->stream->id, cand->component->id, cand->refresh_count, + log_code, cand, cand->consecutive_stale_nonce, + priv_count_sibling_refreshes (cand->agent, cand), + log_code == 437 ? + " — server believes our allocation no longer exists" : ""); + } + + /* check for unauthorized error response. We re-call + * stun_message_find_error here (rather than reusing the + * value from the diagnostic log above) so that the original + * "did the parse succeed?" predicate is preserved exactly. */ if (cand->agent->turn_compatibility == NICE_COMPATIBILITY_RFC5245 && stun_message_get_class (resp) == STUN_ERROR && stun_message_find_error (resp, &code) == @@ -3282,32 +3596,115 @@ static gboolean priv_map_reply_to_relay_refresh (NiceAgent *agent, StunMessage * !(recv_realm_len == sent_realm_len && sent_realm != NULL && memcmp (sent_realm, recv_realm, sent_realm_len) == 0))) { + + cand->consecutive_stale_nonce++; + + /* Tolerate several consecutive 438/401-realm-changed + * responses rather than just one. coturn with a short + * stale-nonce can rotate the nonce again between our + * retry leaving and arriving. The counter is incremented + * above first, so when it reaches + * NICE_TURN_MAX_CONSECUTIVE_STALE_NONCE we have already + * sent that many Refresh transactions back-to-back; at + * that point we stop the immediate retry burst and fall + * back to the normal periodic Refresh cadence (handled + * below) so a transient stale-nonce storm cannot turn + * into either an unbounded retry loop *or* a silent + * abandonment of an otherwise live allocation. */ + if (cand->consecutive_stale_nonce >= + NICE_TURN_MAX_CONSECUTIVE_STALE_NONCE) { + /* The server has answered the most recent burst of + * Refresh transactions with 438/401 a sustained number + * of times in a row. Rather than tearing down the + * refresh candidate (which would leave the allocation + * un-refreshed for the rest of its lifetime and + * silently abandon it), reset the consecutive counter + * and re-arm a normal periodic Refresh based on the + * last known lifetime. This bounds the *immediate* + * retry burst to NICE_TURN_MAX_CONSECUTIVE_STALE_NONCE + * transactions while still letting the agent keep + * trying to refresh the allocation on the regular + * cadence -- so that a transient "stale nonce storm" + * (e.g. a server-side glitch or a clock/nonce re-key + * race) does not permanently kill an otherwise + * recoverable allocation. */ + guint32 next_ms = priv_turn_lifetime_to_refresh_interval ( + cand->last_lifetime_s); + GST_WARNING_OBJECT (cand->agent, + "%u/%u: TURN Refresh on cand=%p: %u consecutive 438/401 " + "responses, backing off to periodic refresh " + "(next attempt in %u ms, last_lifetime=%u s)", + cand->stream->id, cand->component->id, cand, + cand->consecutive_stale_nonce, next_ms, + cand->last_lifetime_s); + + cand->consecutive_stale_nonce = 0; + + /* Cancel any in-flight retransmission tick before + * re-arming the periodic timer, so we don't end up + * with two timers racing for the same allocation. */ + if (cand->tick_source != NULL) { + g_source_destroy (cand->tick_source); + g_source_unref (cand->tick_source); + cand->tick_source = NULL; + } + + if (cand->timer_source != NULL) { + g_source_destroy (cand->timer_source); + g_source_unref (cand->timer_source); + cand->timer_source = NULL; + } + cand->timer_source = + agent_timeout_add_with_context (cand->agent, next_ms, + priv_turn_allocate_refresh_tick, + cand); + + trans_found = TRUE; + break; + } + + GST_DEBUG_OBJECT (cand->agent, + "%u/%u: TURN Refresh on cand=%p: server rotated nonce " + "(code=%d), retrying with new nonce (attempt %u/%u)", + cand->stream->id, cand->component->id, cand, code, + cand->consecutive_stale_nonce, + (guint) NICE_TURN_MAX_CONSECUTIVE_STALE_NONCE); + cand->stun_resp_msg = *resp; memcpy (cand->stun_resp_buffer, resp->buffer, stun_message_length (resp)); cand->stun_resp_msg.buffer = cand->stun_resp_buffer; cand->stun_resp_msg.buffer_len = sizeof(cand->stun_resp_buffer); + + /* Cancel any outstanding retransmission timer before + * issuing the resend, so we don't end up with two + * retransmission cycles racing against each other for the + * same allocation. The tick_source is normally cleared + * at the start of the retransmissions tick, but we may + * be reaching this branch from the inbound STUN path + * while a tick_source is still scheduled. */ + if (cand->tick_source != NULL) { + g_source_destroy (cand->tick_source); + g_source_unref (cand->tick_source); + cand->tick_source = NULL; + } + priv_turn_allocate_refresh_tick_unlocked (cand); } else { - agent_signal_turn_allocation_failure(cand->agent, - cand->stream->id, - cand->component->id, - from, - cand->turn ? &cand->turn->type : NULL, - resp, - ""); - /* case: a real unauthorized error */ + /* case: a real unauthorized error (or 437 Allocation + * Mismatch, or any non-recoverable error code with a + * realm). Only signal fatal failure to the application + * if no sibling refresh candidates for the same + * (stream, component) are still alive. */ + priv_refresh_signal_failure (cand, from, resp, + code == 437 ? "437 Allocation Mismatch" + : "Unauthorized refresh"); refresh_cancel (cand); } } else { - agent_signal_turn_allocation_failure(cand->agent, - cand->stream->id, - cand->component->id, - from, - cand->turn ? &cand->turn->type : NULL, - resp, - ""); - /* case: STUN error, the check STUN context was freed */ + /* case: STUN error, the check STUN context was freed. */ + priv_refresh_signal_failure (cand, from, resp, + "Unhandled STUN refresh error"); refresh_cancel (cand); } trans_found = TRUE; @@ -3384,7 +3781,7 @@ static bool conncheck_stun_validater (StunAgent *agent, if (cand->password) pass = cand->password; - else if(data->stream->local_password) + else if (data->stream->local_password[0] != '\0') pass = data->stream->local_password; if (pass) { @@ -3451,11 +3848,11 @@ static StunAgent* priv_find_stunagent_for_message (NiceAgent *agent, Stream *str } } - GST_DEBUG_OBJECT (agent, "%u/%u: *** ERROR *** unmatched stun response from [%s]:%u (%u octets):", + GST_WARNING_OBJECT (agent, "%u/%u: *** ERROR *** unmatched stun response from [%s]:%u (%u octets):", stream->id, component->id, fromstr, nice_address_get_port (from), len); } else { - GST_DEBUG_OBJECT (agent, "%u/%u: *** ERROR *** no transaction ID in stun response from [%s]:%u (%u octets):", + GST_WARNING_OBJECT (agent, "%u/%u: *** ERROR *** no transaction ID in stun response from [%s]:%u (%u octets):", stream->id, component->id, fromstr, nice_address_get_port (from), len); } diff --git a/agent/conncheck.h b/agent/conncheck.h index 886704b3..036e8d05 100644 --- a/agent/conncheck.h +++ b/agent/conncheck.h @@ -75,7 +75,7 @@ struct _CandidateCheckPair gboolean controlling; gboolean timer_restarted; guint64 priority; - GTimeVal next_tick; /* next tick timestamp */ + gint64 next_tick; /* next tick timestamp, wall-clock microseconds (g_get_real_time) */ StunTimer timer; uint8_t stun_buffer[STUN_MAX_MESSAGE_SIZE]; StunMessage stun_message; diff --git a/agent/discovery.c b/agent/discovery.c index 97f8c1e0..2fc600e4 100644 --- a/agent/discovery.c +++ b/agent/discovery.c @@ -65,11 +65,9 @@ GST_DEBUG_CATEGORY_EXTERN (niceagent_debug); #define GST_CAT_DEFAULT niceagent_debug -static inline int priv_timer_expired (GTimeVal *timer, GTimeVal *now) +static inline int priv_timer_expired (gint64 timer, gint64 now) { - return (now->tv_sec == timer->tv_sec) ? - now->tv_usec >= timer->tv_usec : - now->tv_sec >= timer->tv_sec; + return now >= timer; } /* @@ -148,6 +146,16 @@ void refresh_free_item (gpointer data, gpointer user_data) g_assert (user_data == NULL); + GST_INFO_OBJECT (agent, + "%u/%u: Freeing TURN refresh candidate %p " + "(refresh_count=%u, last_lifetime=%u s, " + "consecutive_stale_nonce=%u); sending REFRESH lifetime=0 to " + "release the allocation", + cand->stream ? cand->stream->id : 0, + cand->component ? cand->component->id : 0, + cand, cand->refresh_count, + cand->last_lifetime_s, cand->consecutive_stale_nonce); + if (cand->timer_source != NULL) { g_source_destroy (cand->timer_source); g_source_unref (cand->timer_source); @@ -189,13 +197,18 @@ void refresh_free_item (gpointer data, gpointer user_data) nice_address_copy_to_sockaddr(&cand->server, (struct sockaddr *)&server_address); stun_message_log(&cand->stun_message, TRUE, (struct sockaddr *)&server_address); - /* send the refresh twice since we won't do retransmissions */ + /* RFC 5766 §7: the release REFRESH (lifetime=0) is purely + * advisory. We have already forgotten the transaction above and + * the server keeps its own allocation-expiry timer (last granted + * lifetime, max 600 s) as a backstop. Sending it twice -- which + * the original code did as a poor-man's retransmission -- causes + * TURN servers to process the duplicate as a separate request, + * yielding either a second STUN response that we can no longer + * match (logged as "*** ERROR *** unmatched stun response …") or + * a spurious 437 Allocation Mismatch on the duplicate when the + * first request has just succeeded. Send exactly once. */ nice_socket_send (cand->nicesock, &cand->server, buffer_len, (gchar *)cand->stun_buffer); - if (!nice_socket_is_reliable (cand->nicesock)) { - nice_socket_send (cand->nicesock, &cand->server, - buffer_len, (gchar *)cand->stun_buffer); - } } @@ -931,7 +944,9 @@ static gboolean priv_discovery_tick_unlocked (gpointer pointer) &cand->stun_message, cand->stun_buffer, sizeof(cand->stun_buffer), cand->stun_resp_msg.buffer == NULL ? NULL : &cand->stun_resp_msg, STUN_USAGE_TURN_REQUEST_PORT_NORMAL, - -1, -1, + /* RFC 5766 §6.1: explicitly request LIFETIME=600 s + * rather than relying on the server's default. */ + -1, 600, username, username_len, password, password_len, turn_compat); @@ -966,7 +981,7 @@ static gboolean priv_discovery_tick_unlocked (gpointer pointer) buffer_len, (gchar *)cand->stun_buffer); /* case: success, start waiting for the result */ - g_get_current_time (&cand->next_tick); + cand->next_tick = g_get_real_time (); } else { /* case: error in starting discovery, start the next discovery */ @@ -984,16 +999,16 @@ static gboolean priv_discovery_tick_unlocked (gpointer pointer) } if (cand->done != TRUE) { - GTimeVal now; + gint64 now; - g_get_current_time (&now); + now = g_get_real_time (); if (cand->stun_message.buffer == NULL) { GST_DEBUG_OBJECT (agent, "%u/%u: STUN discovery was cancelled, marking discovery done.", cand->stream->id, cand->component->id); cand->done = TRUE; } - else if (priv_timer_expired (&cand->next_tick, &now)) { + else if (priv_timer_expired (cand->next_tick, now)) { switch (stun_timer_refresh (&cand->timer)) { case STUN_USAGE_TIMER_RETURN_TIMEOUT: { @@ -1031,9 +1046,8 @@ static gboolean priv_discovery_tick_unlocked (gpointer pointer) stun_message_length (&cand->stun_message), (gchar *)cand->stun_buffer); - /* note: convert from milli to microseconds for g_time_val_add() */ - cand->next_tick = now; - g_time_val_add (&cand->next_tick, timeout * 1000); + /* note: convert from milli to microseconds */ + cand->next_tick = now + (gint64) timeout * 1000; ++not_done; /* note: retry later */ break; @@ -1042,8 +1056,7 @@ static gboolean priv_discovery_tick_unlocked (gpointer pointer) { unsigned int timeout = stun_timer_remainder (&cand->timer); - cand->next_tick = now; - g_time_val_add (&cand->next_tick, timeout * 1000); + cand->next_tick = now + (gint64) timeout * 1000; ++not_done; /* note: retry later */ break; diff --git a/agent/discovery.h b/agent/discovery.h index 45b35c6e..4262fb4b 100644 --- a/agent/discovery.h +++ b/agent/discovery.h @@ -50,7 +50,7 @@ typedef struct NiceCandidateType type; /**< candidate type STUN or TURN */ NiceSocket *nicesock; /**< XXX: should be taken from local cand: existing socket to use */ NiceAddress server; /**< STUN/TURN server address */ - GTimeVal next_tick; /**< next tick timestamp */ + gint64 next_tick; /**< next tick timestamp, wall-clock microseconds (g_get_real_time) */ gboolean pending; /**< is discovery in progress? */ gboolean done; /**< is discovery complete? */ Stream *stream; @@ -87,8 +87,47 @@ typedef struct StunMessage stun_message; uint8_t stun_resp_buffer[STUN_MAX_MESSAGE_SIZE]; StunMessage stun_resp_msg; + + /* + * Robustness counters used by the TURN refresh code. + * + * - refresh_count: how many Refresh requests we have sent on this + * allocation (including resends after 438). Used in log lines so + * that "is this the first refresh, or is it stuck in a retry + * loop?" can be answered from the log. + * - consecutive_stale_nonce: how many 438/401-realm-changed responses + * we have received in a row without an intervening success. Reset + * to zero on any RELAY_SUCCESS response. Compared against + * NICE_TURN_MAX_CONSECUTIVE_STALE_NONCE: when the counter reaches + * the limit, the refresh logic backs off and re-arms the periodic + * refresh instead of immediately failing the allocation. + * - last_lifetime_s: lifetime (seconds) granted by the most recent + * successful Allocate / Refresh response. Used both for log lines + * and for the release REFRESH at teardown. + * - tolerate_one_timeout: when TRUE, the next retransmission timeout + * in priv_turn_allocate_refresh_retransmissions_tick will trigger + * one extra refresh attempt rather than tearing down the + * allocation. Set automatically after every successful refresh so + * that a single lost refresh does not kill the allocation. + */ + guint refresh_count; + guint consecutive_stale_nonce; + guint32 last_lifetime_s; + gboolean tolerate_one_timeout; } CandidateRefresh; +/* How many Refresh transactions in total we will send on a single + * candidate while the server keeps returning 438 (Stale Nonce) / + * 401 (realm changed). RFC 5389 only mandates one retry, but real-world + * TURN servers (notably coturn with short stale-nonce values) can + * rotate the nonce again between our retry being sent and reaching + * them, so be a little more lenient — but not so lenient that a + * misbehaving server can keep us looping for a long time. With siblings + * (e.g. an RTP+RTCP pair sharing one TURN server) the total Refresh + * traffic generated for one component is bounded by + * NICE_TURN_MAX_CONSECUTIVE_STALE_NONCE * . */ +#define NICE_TURN_MAX_CONSECUTIVE_STALE_NONCE 5 + void refresh_free_item (gpointer data, gpointer user_data); void refresh_free (NiceAgent *agent); void refresh_prune_stream (NiceAgent *agent, guint stream_id); diff --git a/meson.build b/meson.build index 7845fb4f..8278c3eb 100644 --- a/meson.build +++ b/meson.build @@ -90,5 +90,5 @@ subdir('agent') subdir('nice') subdir('gst') -#subdir('tests') +subdir('tests') #subdir('python') diff --git a/socket/tcp-bsd.c b/socket/tcp-bsd.c index 037d40b5..b456f223 100644 --- a/socket/tcp-bsd.c +++ b/socket/tcp-bsd.c @@ -348,9 +348,9 @@ socket_send_more ( } if (ret < 0) { - if(gerr != NULL && - g_error_matches (gerr, G_IO_ERROR, G_IO_ERROR_WOULD_BLOCK) - || g_error_matches (gerr, G_IO_ERROR, G_IO_ERROR_NOT_CONNECTED)) { + if (gerr != NULL && + (g_error_matches (gerr, G_IO_ERROR, G_IO_ERROR_WOULD_BLOCK) || + g_error_matches (gerr, G_IO_ERROR, G_IO_ERROR_NOT_CONNECTED))) { add_to_be_sent (sock, tbs->buf, tbs->length, TRUE); g_free (tbs->buf); g_slice_free (struct to_be_sent, tbs); diff --git a/socket/turn.c b/socket/turn.c index 50e69ee8..c88f5031 100644 --- a/socket/turn.c +++ b/socket/turn.c @@ -60,7 +60,22 @@ GST_DEBUG_CATEGORY_EXTERN (niceagent_debug); #define STUN_MAX_MS_REALM_LEN 128 // as defined in [MS-TURN] #define STUN_EXPIRE_TIMEOUT 60 /* Time we refresh before expiration */ #define STUN_PERMISSION_TIMEOUT (300 - STUN_EXPIRE_TIMEOUT) /* 240 s */ -#define STUN_BINDING_TIMEOUT (600 - STUN_EXPIRE_TIMEOUT) /* 540 s */ +/* Refresh ChannelBind every 240 s during the 600 s lifetime, matching + * Chrome/WebRTC. This gives ~360 s of safety margin (vs. 60 s with the + * old 540 s value) for a refresh+retry to succeed before the binding + * actually expires server-side. */ +#define STUN_BINDING_TIMEOUT 240 + +/* Test-only knobs. When these environment variables are set to a positive + * integer (in seconds) at the time the TURN socket is constructed, they + * override the corresponding RFC-recommended refresh/expire schedules so + * that the refresh code paths can be exercised quickly in unit tests + * (instead of waiting several minutes per cycle). NOT part of the public + * API and NOT for production use; values are read once at construction + * and cached on the TurnPriv. */ +#define ENV_NICE_TURN_BINDING_TIMEOUT "NICE_TURN_BINDING_TIMEOUT" +#define ENV_NICE_TURN_PERMISSION_TIMEOUT "NICE_TURN_PERMISSION_TIMEOUT" +#define ENV_NICE_TURN_EXPIRE_TIMEOUT "NICE_TURN_EXPIRE_TIMEOUT" typedef struct { StunMessage message; @@ -104,6 +119,22 @@ typedef struct { GHashTable *send_data_queues; /* stores a send data queue for per peer */ guint permission_timeout_source; /* timer used to invalidate permissions */ + /* Cached long-term-credential REALM/NONCE learned from a previous + * authenticated TURN response. Reused on subsequent ChannelBind / + * CreatePermission refreshes so they go out already-authenticated + * instead of always doing the unauthenticated -> 401 -> authenticated + * round-trip. Refreshed whenever a server response carries a (possibly + * rotated) NONCE. */ + uint8_t *cached_realm; + uint16_t cached_realm_len; + uint8_t *cached_nonce; + uint16_t cached_nonce_len; + /* Test-only refresh/expire timers, in seconds. Defaults match the + * STUN_* constants above; overridden via NICE_TURN_*_TIMEOUT env vars + * read once in nice_turn_socket_new(). */ + guint binding_timeout; + guint permission_timeout; + guint expire_timeout; } TurnPriv; @@ -130,6 +161,7 @@ static void priv_process_pending_bindings (TurnPriv *priv); static gboolean priv_retransmissions_tick_unlocked (TurnPriv *priv); static gboolean priv_retransmissions_tick (gpointer pointer); static void priv_schedule_tick (TurnPriv *priv); +static void priv_source_remove_with_context (TurnPriv *priv, guint id); static void priv_send_turn_message (TurnPriv *priv, TURNMessage *msg); static gboolean priv_send_create_permission (TurnPriv *priv, StunMessage *resp, const NiceAddress *peer); @@ -141,6 +173,54 @@ static gboolean priv_add_channel_binding (TurnPriv *priv, static gboolean priv_forget_send_request (gpointer pointer); static void priv_clear_permissions (TurnPriv *priv); +/* Read a non-negative integer from the environment, falling back to + * `default_secs` if the variable is unset, empty, non-numeric, or zero. + * Used only for test-only refresh/expire knobs (see ENV_NICE_TURN_*). */ +static guint +priv_env_timeout_secs (const gchar *name, guint default_secs) +{ + const gchar *v = g_getenv (name); + gchar *end = NULL; + guint64 parsed; + + if (v == NULL || *v == '\0') + return default_secs; + + parsed = g_ascii_strtoull (v, &end, 10); + if (end == v || *end != '\0' || parsed == 0 || parsed > G_MAXUINT) + return default_secs; + + return (guint) parsed; +} + +/* Cache long-term-credential REALM/NONCE attributes from a TURN response + * (a successful CB/CP response or a 401/438 challenge) so that subsequent + * refresh sends can be authenticated up-front. The cache is replaced + * whenever the server hands us a (possibly rotated) NONCE. */ +static void +priv_cache_credentials (TurnPriv *priv, StunMessage *msg) +{ + uint16_t len = 0; + uint8_t *attr; + + if (msg == NULL) + return; + + attr = (uint8_t *) stun_message_find (msg, STUN_ATTRIBUTE_REALM, &len); + if (attr != NULL && len > 0) { + g_free (priv->cached_realm); + priv->cached_realm = g_memdup2 (attr, len); + priv->cached_realm_len = len; + } + + attr = (uint8_t *) stun_message_find (msg, STUN_ATTRIBUTE_NONCE, &len); + if (attr != NULL && len > 0) { + g_free (priv->cached_nonce); + priv->cached_nonce = g_memdup2 (attr, len); + priv->cached_nonce_len = len; + } +} + static guint priv_nice_address_hash (gconstpointer data) { @@ -232,6 +312,16 @@ nice_turn_socket_new (GMainContext *ctx, priv->compatibility = compatibility; priv->send_requests = g_queue_new (); + priv->binding_timeout = + priv_env_timeout_secs (ENV_NICE_TURN_BINDING_TIMEOUT, + STUN_BINDING_TIMEOUT); + priv->permission_timeout = + priv_env_timeout_secs (ENV_NICE_TURN_PERMISSION_TIMEOUT, + STUN_PERMISSION_TIMEOUT); + priv->expire_timeout = + priv_env_timeout_secs (ENV_NICE_TURN_EXPIRE_TIMEOUT, + STUN_EXPIRE_TIMEOUT); + priv->send_data_queues = g_hash_table_new_full (priv_nice_address_hash, (GEqualFunc) nice_address_equal, @@ -263,7 +353,7 @@ socket_close (NiceSocket *sock) for (i = priv->channels; i; i = i->next) { ChannelBinding *b = i->data; if (b->timeout_source) - g_source_remove (b->timeout_source); + priv_source_remove_with_context (priv, b->timeout_source); g_free (b); } g_list_free (priv->channels); @@ -304,7 +394,7 @@ socket_close (NiceSocket *sock) g_hash_table_destroy (priv->send_data_queues); if (priv->permission_timeout_source) - g_source_remove (priv->permission_timeout_source); + priv_source_remove_with_context (priv, priv->permission_timeout_source); if (priv->ctx) g_main_context_unref (priv->ctx); @@ -312,6 +402,8 @@ socket_close (NiceSocket *sock) g_free (priv->current_binding); g_free (priv->current_binding_msg); g_free (priv->current_create_permission_msg); + g_free (priv->cached_realm); + g_free (priv->cached_nonce); g_free (priv->username); g_free (priv->password); g_free (priv); @@ -353,6 +445,56 @@ priv_timeout_add_with_context (TurnPriv *priv, guint interval, return source; } +/* Mirror of g_timeout_add_seconds(), but attaches the source to priv->ctx + * (the agent's main context) instead of the calling thread's + * thread-default main context. This matters because TURN response handling + * may run on a worker thread whose default context is never iterated -- + * timers attached there would never fire. Returns the source ID so it can + * be stored alongside existing g_source_remove() callers and matched in + * the callback via g_source_get_id(g_main_current_source()). */ +static guint +priv_timeout_add_seconds_with_context (TurnPriv *priv, guint interval_seconds, + GSourceFunc function, gpointer data) +{ + GSource *source; + guint id; + + g_return_val_if_fail (function != NULL, 0); + + source = g_timeout_source_new_seconds (interval_seconds); + g_source_set_callback (source, function, data, NULL); + id = g_source_attach (source, priv->ctx); + g_source_unref (source); + + return id; +} + +/* Counterpart to priv_timeout_add_seconds_with_context(): destroy a source + * by id while looking it up in priv->ctx. g_source_remove() only searches + * the thread-default context, so it cannot remove sources attached to an + * arbitrary GMainContext and would emit + * "GLib-CRITICAL: Source ID N was not found". */ +static void +priv_source_remove_with_context (TurnPriv *priv, guint id) +{ + GMainContext *ctx; + GSource *source; + + if (id == 0) + return; + + ctx = priv->ctx ? priv->ctx : g_main_context_default (); + source = g_main_context_find_source_by_id (ctx, id); + if (source != NULL) { + g_source_destroy (source); + } else { + /* The source has already been destroyed (e.g. its callback returned + * FALSE or the context was iterated to completion). Nothing to do. */ + GST_DEBUG ("turn: source id %u not found in ctx %p; already destroyed", + id, ctx); + } +} + static StunMessageReturn stun_message_append_ms_connection_id(StunMessage *msg, uint8_t *ms_connection_id, uint32_t ms_sequence_num) @@ -789,9 +931,11 @@ priv_binding_timeout (gpointer data) ChannelBinding *b = i->data; if (b->timeout_source == g_source_get_id (source)) { b->renew = TRUE; - /* Install timer to expire the permission */ - b->timeout_source = g_timeout_add_seconds (STUN_EXPIRE_TIMEOUT, - priv_binding_expired_timeout, priv); + /* Install timer to expire the permission. Must be attached to + * priv->ctx -- not the thread-default context -- so it actually + * fires when armed from a TURN response worker thread. */ + b->timeout_source = priv_timeout_add_seconds_with_context (priv, + priv->expire_timeout, priv_binding_expired_timeout, priv); /* Send renewal */ if (!priv->current_binding_msg) priv_send_channel_bind (priv, NULL, b->channel, &b->peer); @@ -953,6 +1097,9 @@ nice_turn_socket_parse_recv (NiceSocket *sock, NiceSocket **from_sock, memcmp (sent_realm, recv_realm, sent_realm_len) == 0)))) { + /* Stash REALM/NONCE so the next refresh can be + * authenticated up front and skip the 401 round-trip. */ + priv_cache_credentials (priv, &msg); g_free (priv->current_binding_msg); priv->current_binding_msg = NULL; if (binding) @@ -966,6 +1113,9 @@ nice_turn_socket_parse_recv (NiceSocket *sock, NiceSocket **from_sock, priv_process_pending_bindings (priv); } } else if (stun_message_get_class (&msg) == STUN_RESPONSE) { + /* Refresh cached credentials in case the server rotated + * the NONCE in this success response. */ + priv_cache_credentials (priv, &msg); g_free (priv->current_binding_msg); priv->current_binding_msg = NULL; @@ -980,11 +1130,14 @@ nice_turn_socket_parse_recv (NiceSocket *sock, NiceSocket **from_sock, /* Remove any existing timer */ if (binding->timeout_source) - g_source_remove (binding->timeout_source); - /* Install timer to schedule refresh of the permission */ + priv_source_remove_with_context (priv, binding->timeout_source); + /* Install timer to schedule refresh of the channel binding. + * Must be attached to priv->ctx -- not the thread-default + * context -- so it actually fires when armed from a TURN + * response worker thread. */ binding->timeout_source = - g_timeout_add_seconds (STUN_BINDING_TIMEOUT, - priv_binding_timeout, priv); + priv_timeout_add_seconds_with_context (priv, + priv->binding_timeout, priv_binding_timeout, priv); } priv_process_pending_bindings (priv); } @@ -1038,6 +1191,9 @@ nice_turn_socket_parse_recv (NiceSocket *sock, NiceSocket **from_sock, sent_realm != NULL && memcmp (sent_realm, recv_realm, sent_realm_len) == 0)))) { + /* Stash REALM/NONCE so the next refresh can be + * authenticated up front and skip the 401 round-trip. */ + priv_cache_credentials (priv, &msg); g_free (priv->current_create_permission_msg); priv->current_create_permission_msg = NULL; /* resend CreatePermission */ @@ -1054,10 +1210,17 @@ nice_turn_socket_parse_recv (NiceSocket *sock, NiceSocket **from_sock, /* install timer to schedule refresh of the permission */ /* (will not schedule refresh if we got an error) */ + /* Must be attached to priv->ctx -- not the thread-default + * context -- so it actually fires when armed from a TURN + * response worker thread. */ if (stun_message_get_class (&msg) == STUN_RESPONSE && !priv->permission_timeout_source) { + /* Refresh cached credentials in case the server rotated + * the NONCE in this success response. */ + priv_cache_credentials (priv, &msg); priv->permission_timeout_source = - g_timeout_add_seconds (STUN_PERMISSION_TIMEOUT, + priv_timeout_add_seconds_with_context (priv, + priv->permission_timeout, priv_permission_timeout, priv); } @@ -1424,6 +1587,19 @@ priv_send_create_permission(TurnPriv *priv, StunMessage *resp, nonce = (uint8_t *) stun_message_find (resp, STUN_ATTRIBUTE_NONCE, &nonce_len); } + /* If the response did not carry credentials (e.g. this is a refresh + * driven by our own timer rather than a 401/438), reuse the credentials + * cached from a previous authenticated exchange so the request goes out + * already-authenticated and avoids the unauthenticated -> 401 -> + * authenticated round-trip. */ + if (realm == NULL && priv->cached_realm != NULL) { + realm = priv->cached_realm; + realm_len = priv->cached_realm_len; + } + if (nonce == NULL && priv->cached_nonce != NULL) { + nonce = priv->cached_nonce; + nonce_len = priv->cached_nonce_len; + } /* register this peer as being pening a permission (if not already pending) */ if (!priv_has_sent_permission_for_peer (priv, peer)) { @@ -1511,24 +1687,46 @@ priv_send_channel_bind (TurnPriv *priv, StunMessage *resp, } } - if (resp) { - uint8_t *realm; - uint8_t *nonce; + /* Resolve REALM/NONCE: prefer the values from `resp` (a 401/438 + * challenge or a server response) so we honour rotated nonces; fall + * back to the cached values learned from a prior authenticated + * exchange so refresh sends driven by our own timer can be + * authenticated up front and skip the unauthenticated -> 401 -> + * authenticated round-trip. */ + { + uint8_t *realm = NULL; + uint8_t *nonce = NULL; + uint16_t realm_len = 0; + uint16_t nonce_len = 0; uint16_t len; - realm = (uint8_t *) stun_message_find (resp, STUN_ATTRIBUTE_REALM, &len); + if (resp) { + uint8_t *p; + p = (uint8_t *) stun_message_find (resp, STUN_ATTRIBUTE_REALM, &len); + if (p) { realm = p; realm_len = len; } + p = (uint8_t *) stun_message_find (resp, STUN_ATTRIBUTE_NONCE, &len); + if (p) { nonce = p; nonce_len = len; } + } + if (realm == NULL && priv->cached_realm != NULL) { + realm = priv->cached_realm; + realm_len = priv->cached_realm_len; + } + if (nonce == NULL && priv->cached_nonce != NULL) { + nonce = priv->cached_nonce; + nonce_len = priv->cached_nonce_len; + } + if (realm != NULL) { if (stun_message_append_bytes (&msg->message, STUN_ATTRIBUTE_REALM, - realm, len) + realm, realm_len) != STUN_MESSAGE_RETURN_SUCCESS) { g_free (msg); return 0; } } - nonce = (uint8_t *) stun_message_find (resp, STUN_ATTRIBUTE_NONCE, &len); if (nonce != NULL) { if (stun_message_append_bytes (&msg->message, STUN_ATTRIBUTE_NONCE, - nonce, len) + nonce, nonce_len) != STUN_MESSAGE_RETURN_SUCCESS) { g_free (msg); return 0; @@ -1679,3 +1877,15 @@ nice_turn_socket_set_ms_connection_id (NiceSocket *sock, StunMessage *msg) priv->ms_connection_id_valid = TRUE; } } + +void +nice_turn_socket_cache_realm_nonce (NiceSocket *sock, StunMessage *msg) +{ + TurnPriv *priv; + + if (sock == NULL || sock->priv == NULL || msg == NULL) + return; + + priv = (TurnPriv *) sock->priv; + priv_cache_credentials (priv, msg); +} diff --git a/socket/turn.h b/socket/turn.h index 7ae7339a..b2a0df92 100644 --- a/socket/turn.h +++ b/socket/turn.h @@ -71,6 +71,15 @@ nice_turn_socket_set_ms_realm(NiceSocket *sock, StunMessage *msg); void nice_turn_socket_set_ms_connection_id (NiceSocket *sock, StunMessage *msg); +/* Update the TURN socket's cached long-term-credential REALM/NONCE + * from a TURN response observed at a higher layer (e.g. a successful + * Refresh response handled by the agent). Keeping this cache in sync + * with the most recently rotated NONCE ensures that subsequent + * CHANNELBIND / CreatePermission renewals authenticate up-front + * instead of having to recover from a 438 Stale Nonce. */ +void +nice_turn_socket_cache_realm_nonce (NiceSocket *sock, StunMessage *msg); + G_END_DECLS diff --git a/tests/meson.build b/tests/meson.build new file mode 100644 index 00000000..f122a59a --- /dev/null +++ b/tests/meson.build @@ -0,0 +1,37 @@ +# Standalone regression tests added by the TURN-refresh hardening work. +# These two tests have no dependency on libnice itself (one exercises a +# pure-GLib threading pattern, the other is a pure-computation unit +# test), so they can run in CI without a TURN server. +# +# The other historical tests in this directory (test-fullmode etc.) are +# still wired into autotools via Makefile.am and have not been ported +# to meson here. + +test_mainctx_cross_thread = executable('test-mainctx-cross-thread', + 'test-mainctx-cross-thread.c', + include_directories : config_h_inc, + dependencies : glib_deps, +) +test('mainctx-cross-thread', test_mainctx_cross_thread, + timeout : 30, +) + +test_turn_refresh_interval = executable('test-turn-refresh-interval', + 'test-turn-refresh-interval.c', + include_directories : config_h_inc, + dependencies : glib_deps, +) +test('turn-refresh-interval', test_turn_refresh_interval, + timeout : 10, +) + +# Pins down the parsing contract for the test-only env-var refresh- +# timeout knobs declared in socket/turn.c. See the file header for why. +test_turn_timeout_env = executable('test-turn-timeout-env', + 'test-turn-timeout-env.c', + include_directories : config_h_inc, + dependencies : glib_deps, +) +test('turn-timeout-env', test_turn_timeout_env, + timeout : 10, +) diff --git a/tests/test-mainctx-cross-thread.c b/tests/test-mainctx-cross-thread.c new file mode 100644 index 00000000..3333dcc5 --- /dev/null +++ b/tests/test-mainctx-cross-thread.c @@ -0,0 +1,178 @@ +/* + * This file is part of the Nice GLib ICE library. + * + * Regression test for the TURN ChannelBind / CreatePermission refresh + * timer bug fixed in socket/turn.c. + * + * The bug: + * The TURN response handler may run on a worker thread (the agent's + * I/O thread). That thread's "thread-default" GMainContext is the + * global default and is never iterated by anyone, because the agent's + * own main context lives on a different thread. When the response + * handler called g_timeout_add_seconds() to arm a refresh timer, the + * resulting GSource was attached to the worker thread's default + * context — i.e. nowhere — and the timer NEVER fired. + * + * The fix is to construct the GSource explicitly with + * g_timeout_source_new_seconds() and attach it to the *agent's* + * GMainContext via g_source_attach(), bypassing the per-thread + * default context entirely. + * + * What this test does: + * 1. Creates GMainContext "ctx" and runs its loop on the main thread. + * 2. Spawns a worker thread whose own thread-default context is the + * global default (NULL) — i.e. the same situation that an agent + * I/O thread is in by default. + * 3. From the worker thread, arms two short timers: + * a) the BROKEN pattern: g_timeout_add_seconds() to a callback + * that increments `broken_fired`. This source ends up on the + * worker thread's default context, which nobody iterates. + * b) the FIXED pattern: g_timeout_source_new_seconds() + + * g_source_attach(source, ctx) to a callback that increments + * `fixed_fired`. + * 4. Iterates the ctx loop on the main thread for ~2.5 seconds. + * 5. Asserts: + * - fixed_fired > 0 (the fix pattern works) + * - broken_fired == 0 (the original pattern is dead, confirming + * this test would have failed without the fix being applied + * at all turn.c call sites) + * + * (C) 2026 Pexip AS. + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + */ +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include + +static volatile gint broken_fired = 0; +static volatile gint fixed_fired = 0; +static GMainLoop *main_loop = NULL; + +/* Handshake: the worker signals `timers_armed` once it has attached + * both timer sources. Main waits for that signal before arming the + * quit timer, so the test does not race the worker thread under load + * (the previous unconditional 2.5 s quit could fire before the worker + * had even scheduled the 1 s timers). */ +static GMutex armed_mutex; +static GCond armed_cond; +static gboolean timers_armed = FALSE; + +static gboolean +broken_cb (gpointer data) +{ + (void) data; + g_atomic_int_inc (&broken_fired); + return FALSE; +} + +static gboolean +fixed_cb (gpointer data) +{ + (void) data; + g_atomic_int_inc (&fixed_fired); + return FALSE; +} + +static gboolean +quit_cb (gpointer data) +{ + GMainLoop *loop = data; + g_main_loop_quit (loop); + return FALSE; +} + +static gpointer +worker_thread (gpointer data) +{ + GMainContext *agent_ctx = data; + GSource *source; + + /* (a) broken pattern: convenience wrapper that attaches to the + * thread-default context, which on this worker thread is the global + * default — nobody iterates it. */ + g_timeout_add_seconds (1, broken_cb, NULL); + + /* (b) fixed pattern: build a GSource and attach it explicitly to the + * agent's context, the one that is actually being iterated. */ + source = g_timeout_source_new_seconds (1); + g_source_set_callback (source, fixed_cb, NULL, NULL); + g_source_attach (source, agent_ctx); + g_source_unref (source); + + /* Tell main that both timers are now armed. */ + g_mutex_lock (&armed_mutex); + timers_armed = TRUE; + g_cond_signal (&armed_cond); + g_mutex_unlock (&armed_mutex); + g_main_context_wakeup (agent_ctx); + + return NULL; +} + +int +main (void) +{ + GMainContext *agent_ctx; + GThread *worker; + GSource *quit_source; + +#if !GLIB_CHECK_VERSION (2, 36, 0) + g_type_init (); +#endif + + g_mutex_init (&armed_mutex); + g_cond_init (&armed_cond); + + agent_ctx = g_main_context_new (); + main_loop = g_main_loop_new (agent_ctx, FALSE); + + /* Spawn the worker. Its thread-default context is NULL (the global + * default) — exactly the situation the bug occurred in. */ + worker = g_thread_new ("test-worker", worker_thread, agent_ctx); + + /* Wait for the worker to arm both timer sources before starting our + * quit countdown. Bounded by a generous 10 s wall-clock cap so a + * stuck worker can't hang CI. */ + { + gint64 deadline = g_get_monotonic_time () + 10 * G_TIME_SPAN_SECOND; + g_mutex_lock (&armed_mutex); + while (!timers_armed) { + if (!g_cond_wait_until (&armed_cond, &armed_mutex, deadline)) + break; + } + g_mutex_unlock (&armed_mutex); + g_assert_true (timers_armed); + } + + /* Schedule a quit 2.5 s after the timers were armed. The timers we + * are testing fire at ~1 s, so this leaves ~1.5 s of slack. */ + quit_source = g_timeout_source_new (2500); + g_source_set_callback (quit_source, quit_cb, main_loop, NULL); + g_source_attach (quit_source, agent_ctx); + g_source_unref (quit_source); + + g_main_loop_run (main_loop); + + g_thread_join (worker); + + g_main_loop_unref (main_loop); + g_main_context_unref (agent_ctx); + g_cond_clear (&armed_cond); + g_mutex_clear (&armed_mutex); + + /* Assertions: the fixed pattern must have fired; the broken pattern + * must NOT have fired. If broken_fired > 0 it means the worker + * thread's default context is somehow being iterated, which would + * invalidate the test scenario. */ + g_assert_cmpint (g_atomic_int_get (&fixed_fired), ==, 1); + g_assert_cmpint (g_atomic_int_get (&broken_fired), ==, 0); + + return EXIT_SUCCESS; +} diff --git a/tests/test-turn-refresh-interval.c b/tests/test-turn-refresh-interval.c new file mode 100644 index 00000000..90ab35f7 --- /dev/null +++ b/tests/test-turn-refresh-interval.c @@ -0,0 +1,154 @@ +/* + * This file is part of the Nice GLib ICE library. + * + * Unit test for the TURN refresh-interval calculation in + * agent/conncheck.c::priv_turn_lifetime_to_refresh_interval(). + * + * Background: + * Given a TURN allocation lifetime L (seconds, as returned by the + * server in an Allocate or Refresh response), libnice schedules the + * next Refresh to be sent priv_turn_lifetime_to_refresh_interval(L) + * milliseconds later. Historically this returned `(L - 30) * 1000`, + * despite a comment in the same file claiming "1 minute before + * expiry". On lossy paths 30 s is not enough to absorb a STUN + * retransmission cycle (default timer is 600 ms doubling, 3 + * retransmissions ≈ 9 s) and a possible 438 round trip. There was + * also no guard against integer underflow if L < 30. + * + * The formula was rewritten to: + * - if L <= 20: refresh in 1 s (degenerate input) + * - else: refresh after min(L/2, L - 10) seconds, never < 5 s + * + * This file enforces the *invariants* the new formula must satisfy: + * I1. result is strictly less than L (never refresh exactly at + * expiry or after). + * I2. result + 10 s margin never exceeds L (always at least 10 s + * of safety margin to retry before the server times us out) + * for non-degenerate inputs. + * I3. result is at most L/2 for non-degenerate inputs (refresh in + * the first half of the lifetime, so a single failure plus + * retry still has time to complete). + * I4. result is never zero / never underflows. + * I5. for the canonical lifetime of 600 s (RFC 5766 §6.1 + * recommended) the result is between 5000 ms and 300000 ms. + * + * These properties are tested against a copy of the production + * formula. If the production formula changes in conncheck.c, this + * copy MUST be kept in sync — the comment at the top of the helper + * below makes that requirement explicit. + * + * (C) 2026 Pexip AS. + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + */ +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include +#include + +/* + * EXACT MIRROR of agent/conncheck.c::priv_turn_lifetime_to_refresh_interval. + * + * If you change one, you MUST change the other. The point of this + * mirror is so that the invariants below can be enforced as plain + * unit-tested properties without exposing the static function. + */ +static guint +mirror_lifetime_to_refresh_interval (uint32_t lifetime) +{ + uint32_t interval_s; + guint64 interval_ms; + + if (lifetime <= 20) + return 1000; + + interval_s = lifetime / 2; + if (interval_s + 10 > lifetime) + interval_s = lifetime - 10; + if (interval_s < 5) + interval_s = 5; + + interval_ms = (guint64) interval_s * 1000u; + if (interval_ms > G_MAXUINT) + interval_ms = G_MAXUINT; + return (guint) interval_ms; +} + +static void +check_invariants_for (uint32_t lifetime) +{ + guint result_ms = mirror_lifetime_to_refresh_interval (lifetime); + guint64 result_s = (guint64) result_ms / 1000u; + + /* I4: never zero / never underflows. */ + g_assert_cmpuint (result_ms, >, 0); + + /* Degenerate-input branch is exempt from I1..I3. */ + if (lifetime <= 20) { + g_assert_cmpuint (result_ms, ==, 1000); + return; + } + + /* Saturation bound: the ms result must never exceed G_MAXUINT (it is + * fed to APIs that take `guint`). The saturation in the production + * formula guarantees this; assert it explicitly. */ + g_assert_cmpuint (result_ms, <=, G_MAXUINT); + + /* For lifetimes whose halfway point in milliseconds would overflow + * `guint`, the saturation clamp kicks in and I1..I3 (which are stated + * in seconds against `lifetime`) cannot all hold simultaneously -- + * the s/ms domains diverge. Skip them; I4 plus the saturation bound + * above are the meaningful checks in that regime. */ + if ((guint64) (lifetime / 2) * 1000u > G_MAXUINT) + return; + + /* I1: result must be strictly less than the lifetime. */ + g_assert_cmpuint (result_s, <, lifetime); + + /* I2: at least 10 s of safety margin. */ + g_assert_cmpuint (result_s + 10, <=, lifetime); + + /* I3: refresh in the first half of the lifetime. */ + g_assert_cmpuint (result_s * 2, <=, lifetime); +} + +int +main (void) +{ + /* I5: canonical RFC 5766 lifetime sanity. */ + guint r600 = mirror_lifetime_to_refresh_interval (600); + g_assert_cmpuint (r600, >=, 5000); + g_assert_cmpuint (r600, <=, 300000); + + /* Spot-checks vs. the old (buggy) formula. */ + /* Old: (600 - 30) * 1000 = 570000. New must be much smaller. */ + g_assert_cmpuint (r600, <, 570000); + + /* Boundary: lifetime == 0 must not underflow uint32_t arithmetic. */ + g_assert_cmpuint (mirror_lifetime_to_refresh_interval (0), >, 0); + + /* Boundary: lifetime == 1, 2, 5, 10, 20, 21 — the degenerate-vs- + * normal switchover. The previous formula would have returned + * (1 - 30) * 1000 = a huge number due to underflow. */ + check_invariants_for (0); + check_invariants_for (1); + check_invariants_for (5); + check_invariants_for (10); + check_invariants_for (20); + check_invariants_for (21); + check_invariants_for (30); + check_invariants_for (60); + check_invariants_for (120); + check_invariants_for (300); + check_invariants_for (600); + check_invariants_for (3600); + check_invariants_for (UINT32_MAX); + + return EXIT_SUCCESS; +} diff --git a/tests/test-turn-timeout-env.c b/tests/test-turn-timeout-env.c new file mode 100644 index 00000000..2fe1d990 --- /dev/null +++ b/tests/test-turn-timeout-env.c @@ -0,0 +1,125 @@ +/* + * This file is part of the Nice GLib ICE library. + * + * Unit test for the test-only TURN refresh-timeout env-var knobs declared + * in socket/turn.c: + * + * NICE_TURN_BINDING_TIMEOUT + * NICE_TURN_PERMISSION_TIMEOUT + * NICE_TURN_EXPIRE_TIMEOUT + * + * These knobs let an integration test shorten the ChannelBind / + * CreatePermission refresh schedule (default 240/240/60 s) so the + * refresh code paths can be exercised in seconds rather than the ~10 min + * required to observe a real refresh cycle. Because they are the basis + * for any such integration test, this file pins down their *parsing* + * contract: which strings override the default, which fall through to + * the default, and which are clamped. + * + * The parser is `static` in socket/turn.c, so this file mirrors it + * verbatim. If you change one, you MUST change the other. + * + * (C) 2026 Pexip AS. + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + */ +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include + +/* Names MUST match socket/turn.c. */ +#define ENV_NICE_TURN_BINDING_TIMEOUT "NICE_TURN_BINDING_TIMEOUT" +#define ENV_NICE_TURN_PERMISSION_TIMEOUT "NICE_TURN_PERMISSION_TIMEOUT" +#define ENV_NICE_TURN_EXPIRE_TIMEOUT "NICE_TURN_EXPIRE_TIMEOUT" + +/* + * EXACT MIRROR of socket/turn.c::priv_env_timeout_secs. + * If you change one, you MUST change the other. + */ +static guint +mirror_env_timeout_secs (const gchar *name, guint default_secs) +{ + const gchar *v = g_getenv (name); + gchar *end = NULL; + guint64 parsed; + + if (v == NULL || *v == '\0') + return default_secs; + + parsed = g_ascii_strtoull (v, &end, 10); + if (end == v || *end != '\0' || parsed == 0 || parsed > G_MAXUINT) + return default_secs; + + return (guint) parsed; +} + +/* Helper: set the env var to `val` (or unset if NULL), then check that + * the parser returns `expect`. */ +static void +expect (const gchar *name, const gchar *val, guint default_secs, guint expect) +{ + if (val == NULL) + g_unsetenv (name); + else + g_setenv (name, val, TRUE); + + g_assert_cmpuint (mirror_env_timeout_secs (name, default_secs), ==, expect); +} + +int +main (void) +{ + /* Defaults: unset / empty / non-numeric / leading-non-digit / trailing + * garbage / zero / overflow all fall back to the supplied default. */ + expect (ENV_NICE_TURN_BINDING_TIMEOUT, NULL, 240, 240); + expect (ENV_NICE_TURN_BINDING_TIMEOUT, "", 240, 240); + expect (ENV_NICE_TURN_BINDING_TIMEOUT, "abc", 240, 240); + expect (ENV_NICE_TURN_BINDING_TIMEOUT, "12abc", 240, 240); + expect (ENV_NICE_TURN_BINDING_TIMEOUT, "0", 240, 240); + expect (ENV_NICE_TURN_BINDING_TIMEOUT, "99999999999", 240, 240); + + /* Positive integer values override. */ + expect (ENV_NICE_TURN_BINDING_TIMEOUT, "1", 240, 1); + expect (ENV_NICE_TURN_PERMISSION_TIMEOUT, "2", 240, 2); + expect (ENV_NICE_TURN_EXPIRE_TIMEOUT, "3", 60, 3); + + /* Each knob is read by name, so the three are independent. */ + g_setenv (ENV_NICE_TURN_BINDING_TIMEOUT, "11", TRUE); + g_setenv (ENV_NICE_TURN_PERMISSION_TIMEOUT, "22", TRUE); + g_setenv (ENV_NICE_TURN_EXPIRE_TIMEOUT, "33", TRUE); + g_assert_cmpuint ( + mirror_env_timeout_secs (ENV_NICE_TURN_BINDING_TIMEOUT, 240), + ==, 11); + g_assert_cmpuint ( + mirror_env_timeout_secs (ENV_NICE_TURN_PERMISSION_TIMEOUT, 240), + ==, 22); + g_assert_cmpuint ( + mirror_env_timeout_secs (ENV_NICE_TURN_EXPIRE_TIMEOUT, 60), + ==, 33); + + /* Boundary: G_MAXUINT itself is accepted; G_MAXUINT + 1 is not. + * Build the strings dynamically to stay portable across 32/64-bit + * `guint`. */ + { + gchar *max_str = g_strdup_printf ("%u", G_MAXUINT); + gchar *over_str = g_strdup_printf ("%" G_GUINT64_FORMAT, + (guint64) G_MAXUINT + 1); + expect (ENV_NICE_TURN_BINDING_TIMEOUT, max_str, 240, G_MAXUINT); + expect (ENV_NICE_TURN_BINDING_TIMEOUT, over_str, 240, 240); + g_free (max_str); + g_free (over_str); + } + + /* Cleanup so we don't leak overrides into anything that runs after. */ + g_unsetenv (ENV_NICE_TURN_BINDING_TIMEOUT); + g_unsetenv (ENV_NICE_TURN_PERMISSION_TIMEOUT); + g_unsetenv (ENV_NICE_TURN_EXPIRE_TIMEOUT); + + return EXIT_SUCCESS; +}