diff --git a/CHANGELOG.md b/CHANGELOG.md index 25363ae..ba87b80 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,27 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [3.3.17] - 2026-04-26 + +### Changed + +- **HA / API:** Non-primary replicas return **`Result.ServiceUnavailable`** with stable marker `urn:maksit:certs-ui:primary-replica-required` for ACME orchestration; the host maps that to **HTTP 503**, **`Retry-After`**, and **RFC 7807 `ProblemDetails`** (replacing ad-hoc 429-style overload semantics for this case). +- **Helm:** Default **`components.server.service.sessionAffinity`** (`ClientIP`, configurable timeout), **`terminationGracePeriodSeconds`**, and a short **`preStop` sleep** so rolling updates drain connections before the primary lease TTL window. Disable or tune under **`components.server`** if your ingress already pins API traffic. + +## [3.3.16] - 2026-04-26 + +### Changed + +- **HA / primary replica:** A single elected instance holds Postgres lease `certs-ui-primary` (`RuntimeLeaseNames.PrimaryReplica`), renews it periodically, and is the only instance with `IPrimaryReplicaWorkload.IsPrimary` after startup. It runs coordination DDL, identity bootstrap, **all ACME domain flows** (`CertsFlowDomainService`), and **`AutoRenewal`**. Other replicas serve HTTP (identity, health, etc.) and **`AcmeChallengeAsync`** (HTTP-01 token materialization for ingress). Followers reject ACME orchestration at the domain layer until they become primary after failover. +- **Startup:** Removed separate `certs-ui-bootstrap` lease; primary lease serializes first-time admin creation. `PrimaryReplicaShutdownHostedService` (registered last) releases the primary lease on shutdown. 
+ +## [3.3.15] - 2026-04-26 + +### Fixed + +- **Startup / HA:** `InitializationHostedService` no longer takes the bootstrap lease when PostgreSQL already has users. Only the empty-database path waits on the lease (single-writer default admin). Extra replicas used to block on the lease until Kubernetes canceled `StartAsync`, surfacing as `TaskCanceledException` at startup while the first replica held the lease. +- **Startup:** Retry backoff treats `OperationCanceledException` when the host is stopping as shutdown (no misleading “initialization failed” loop); cooperative cancel still ends startup. + ## [3.3.14] - 2026-04-26 ### Fixed diff --git a/src/MaksIT.CertsUI.Engine/DomainServices/CertsFlowDomainService.cs b/src/MaksIT.CertsUI.Engine/DomainServices/CertsFlowDomainService.cs index 41231e4..f902273 100644 --- a/src/MaksIT.CertsUI.Engine/DomainServices/CertsFlowDomainService.cs +++ b/src/MaksIT.CertsUI.Engine/DomainServices/CertsFlowDomainService.cs @@ -57,6 +57,7 @@ public class CertsFlowDomainService : ICertsFlowDomainService { private readonly IAcmeHttpChallengePersistenceService _httpChallenges; private readonly IRuntimeLeaseService _runtimeLease; private readonly IRuntimeInstanceId _runtimeInstance; + private readonly IPrimaryReplicaWorkload _primaryReplica; private readonly string _acmePath; public CertsFlowDomainService( @@ -68,7 +69,8 @@ public class CertsFlowDomainService : ICertsFlowDomainService { ICertsFlowEngineConfiguration config, IAcmeHttpChallengePersistenceService httpChallenges, IRuntimeLeaseService runtimeLease, - IRuntimeInstanceId runtimeInstance) { + IRuntimeInstanceId runtimeInstance, + IPrimaryReplicaWorkload primaryReplica) { _logger = logger; _httpClient = httpClient; _letsEncryptService = letsEncryptService; @@ -78,12 +80,16 @@ public class CertsFlowDomainService : ICertsFlowDomainService { _httpChallenges = httpChallenges; _runtimeLease = runtimeLease; _runtimeInstance = runtimeInstance; + _primaryReplica = primaryReplica; _acmePath = 
config.AcmeFolder; } #region Terms of service public Result GetTermsOfService(Guid sessionId) { + if (!_primaryReplica.IsPrimary) + return Result.ServiceUnavailable(null, CertsFlowPrimaryReplica.ServiceUnavailableMessages); + var result = _letsEncryptService.GetTermsOfServiceUri(sessionId); if (!result.IsSuccess || result.Value == null) return result; @@ -122,10 +128,15 @@ public class CertsFlowDomainService : ICertsFlowDomainService { #region Session, orders, and certificates - public async Task CompleteChallengesAsync(Guid sessionId) => - await _letsEncryptService.CompleteChallenges(sessionId); + public async Task CompleteChallengesAsync(Guid sessionId) { + if (!_primaryReplica.IsPrimary) + return Result.ServiceUnavailable(CertsFlowPrimaryReplica.ServiceUnavailableMessages); + return await _letsEncryptService.CompleteChallenges(sessionId); + } public async Task> ConfigureClientAsync(bool isStaging) { + if (!_primaryReplica.IsPrimary) + return Result.ServiceUnavailable(null, CertsFlowPrimaryReplica.ServiceUnavailableMessages); var sessionId = Guid.NewGuid(); var result = await _letsEncryptService.ConfigureClient(sessionId, isStaging); if (!result.IsSuccess) @@ -134,6 +145,8 @@ public class CertsFlowDomainService : ICertsFlowDomainService { } public async Task> InitAsync(Guid sessionId, Guid? accountId, string description, string[] contacts) { + if (!_primaryReplica.IsPrimary) + return Result.ServiceUnavailable(null, CertsFlowPrimaryReplica.ServiceUnavailableMessages); RegistrationCache? 
cache = null; if (accountId == null) { accountId = Guid.NewGuid(); @@ -154,6 +167,8 @@ public class CertsFlowDomainService : ICertsFlowDomainService { } public async Task?>> NewOrderAsync(Guid sessionId, string[] hostnames, string challengeType) { + if (!_primaryReplica.IsPrimary) + return Result?>.ServiceUnavailable(null, CertsFlowPrimaryReplica.ServiceUnavailableMessages); var holder = _runtimeInstance.InstanceId; var acquired = await _runtimeLease.TryAcquireAsync(RuntimeLeaseNames.AcmeWriter, holder, AcmeWriterLeaseTtl, CancellationToken.None); if (!acquired.IsSuccess) @@ -189,6 +204,8 @@ public class CertsFlowDomainService : ICertsFlowDomainService { } public async Task GetCertificatesAsync(Guid sessionId, string[] hostnames) { + if (!_primaryReplica.IsPrimary) + return Result.ServiceUnavailable(CertsFlowPrimaryReplica.ServiceUnavailableMessages); foreach (var subject in hostnames) { var result = await _letsEncryptService.GetCertificate(sessionId, subject); if (!result.IsSuccess) @@ -204,14 +221,19 @@ public class CertsFlowDomainService : ICertsFlowDomainService { return Result.Ok(); } - public async Task GetOrderAsync(Guid sessionId, string[] hostnames) => - await _letsEncryptService.GetOrder(sessionId, hostnames); + public async Task GetOrderAsync(Guid sessionId, string[] hostnames) { + if (!_primaryReplica.IsPrimary) + return Result.ServiceUnavailable(CertsFlowPrimaryReplica.ServiceUnavailableMessages); + return await _letsEncryptService.GetOrder(sessionId, hostnames); + } #endregion #region Deploy and revoke public async Task?>> ApplyCertificatesAsync(Guid accountId) { + if (!_primaryReplica.IsPrimary) + return Result?>.ServiceUnavailable(null, CertsFlowPrimaryReplica.ServiceUnavailableMessages); var cacheResult = await _registrationCache.LoadAsync(accountId); if (!cacheResult.IsSuccess || cacheResult.Value?.CachedCerts == null) return cacheResult.ToResultOfType?>(_ => null); @@ -231,6 +253,8 @@ public class CertsFlowDomainService : ICertsFlowDomainService 
{ } public async Task RevokeCertificatesAsync(Guid sessionId, string[] hostnames) { + if (!_primaryReplica.IsPrimary) + return Result.ServiceUnavailable(CertsFlowPrimaryReplica.ServiceUnavailableMessages); foreach (var hostname in hostnames) { var result = await _letsEncryptService.RevokeCertificate(sessionId, hostname, RevokeReason.Unspecified); if (!result.IsSuccess) diff --git a/src/MaksIT.CertsUI.Engine/DomainServices/CertsFlowPrimaryReplica.cs b/src/MaksIT.CertsUI.Engine/DomainServices/CertsFlowPrimaryReplica.cs new file mode 100644 index 0000000..3196c98 --- /dev/null +++ b/src/MaksIT.CertsUI.Engine/DomainServices/CertsFlowPrimaryReplica.cs @@ -0,0 +1,16 @@ +namespace MaksIT.CertsUI.Engine.DomainServices; + +/// +/// Stable markers for Result.ServiceUnavailable when ACME is invoked on a non-primary replica. +/// The host maps these to HTTP 503 + Retry-After + RFC 7807 ProblemDetails. +/// +public static class CertsFlowPrimaryReplica { + + /// Machine-readable first line in result messages for detection in MVC. + public const string DiagnosticMarker = "urn:maksit:certs-ui:primary-replica-required"; + + public static readonly string[] ServiceUnavailableMessages = [ + DiagnosticMarker, + "Only the elected primary Certs UI replica runs ACME orchestration. Retry after a short delay; use service session affinity (ClientIP) so interactive flows stay on the primary." + ]; +} diff --git a/src/MaksIT.CertsUI.Engine/RuntimeCoordination/IPrimaryReplicaWorkload.cs b/src/MaksIT.CertsUI.Engine/RuntimeCoordination/IPrimaryReplicaWorkload.cs new file mode 100644 index 0000000..30f8761 --- /dev/null +++ b/src/MaksIT.CertsUI.Engine/RuntimeCoordination/IPrimaryReplicaWorkload.cs @@ -0,0 +1,8 @@ +namespace MaksIT.CertsUI.Engine.RuntimeCoordination; + +/// +/// True when this process is the elected primary replica (Postgres lease) and may run ACME orchestration and background renewal. 
+/// +public interface IPrimaryReplicaWorkload { + bool IsPrimary { get; } +} diff --git a/src/MaksIT.CertsUI.Engine/RuntimeCoordination/RuntimeLeaseNames.cs b/src/MaksIT.CertsUI.Engine/RuntimeCoordination/RuntimeLeaseNames.cs index 3fabcbd..b1cbba4 100644 --- a/src/MaksIT.CertsUI.Engine/RuntimeCoordination/RuntimeLeaseNames.cs +++ b/src/MaksIT.CertsUI.Engine/RuntimeCoordination/RuntimeLeaseNames.cs @@ -3,6 +3,7 @@ namespace MaksIT.CertsUI.Engine.RuntimeCoordination; /// PostgreSQL app_runtime_leases.lease_name values. public static class RuntimeLeaseNames { public const string AcmeWriter = "certs-ui-acme-writer"; - public const string Bootstrap = "certs-ui-bootstrap"; - public const string AutoRenewal = "certs-ui-auto-renewal"; + + /// Single elected instance: identity bootstrap, ACME orchestration, and background renewal. + public const string PrimaryReplica = "certs-ui-primary"; } diff --git a/src/MaksIT.CertsUI.Tests/Services/CertsFlowServiceTests.cs b/src/MaksIT.CertsUI.Tests/Services/CertsFlowServiceTests.cs index e67b54d..21d0447 100644 --- a/src/MaksIT.CertsUI.Tests/Services/CertsFlowServiceTests.cs +++ b/src/MaksIT.CertsUI.Tests/Services/CertsFlowServiceTests.cs @@ -29,7 +29,8 @@ public sealed class CertsFlowServiceTests Mock? httpChallenges = null, Mock? runtimeLease = null, Mock? runtimeInstance = null, - HttpMessageHandler? httpHandler = null) + HttpMessageHandler? httpHandler = null, + Mock? primaryReplica = null) { registrationCache ??= new Mock(); agent ??= new Mock(); @@ -55,6 +56,9 @@ public sealed class CertsFlowServiceTests runtimeInstance ??= new Mock(); if (!runtimeInstanceProvided) runtimeInstance.Setup(i => i.InstanceId).Returns("test-instance"); + var primaryWorkload = primaryReplica ?? new Mock(); + if (primaryReplica is null) + primaryWorkload.Setup(p => p.IsPrimary).Returns(true); var handler = httpHandler ?? 
new StubHttpMessageHandler(_ => new HttpResponseMessage(HttpStatusCode.OK) { Content = new ByteArrayContent([0x25, 0x50, 0x44, 0x46]) }); var httpClient = new HttpClient(handler, disposeHandler: true); return new CertsFlowDomainService( @@ -66,7 +70,8 @@ public sealed class CertsFlowServiceTests new TestCertsFlowEngineConfiguration(fx), httpChallenges.Object, runtimeLease.Object, - runtimeInstance.Object); + runtimeInstance.Object, + primaryWorkload.Object); } [Fact] @@ -85,6 +90,45 @@ public sealed class CertsFlowServiceTests Assert.NotNull(result.Value); } + [Fact] + public async Task ConfigureClientAsync_WhenNotPrimary_ReturnsServiceUnavailableWithMarker() + { + using var fx = new WebApiTestFixture(); + var le = new Mock(); + var primary = new Mock(); + primary.Setup(p => p.IsPrimary).Returns(false); + var sut = CreateSut(fx, le, primaryReplica: primary); + + var result = await sut.ConfigureClientAsync(isStaging: false); + + Assert.False(result.IsSuccess); + Assert.Contains(CertsFlowPrimaryReplica.DiagnosticMarker, result.Messages ?? 
[]); + le.Verify(x => x.ConfigureClient(It.IsAny(), It.IsAny()), Times.Never); + } + + [Fact] + public async Task AcmeChallenge_WhenNotPrimary_StillSucceedsFromDatabase() + { + using var fx = new WebApiTestFixture(); + var name = "challenge-token"; + var le = new Mock(); + var primary = new Mock(); + primary.Setup(p => p.IsPrimary).Returns(false); + var challenges = new Mock(); + challenges.Setup(c => c.GetTokenValueAsync(name, It.IsAny())) + .ReturnsAsync(Result.Ok("body")); + challenges.Setup(c => c.UpsertAsync(It.IsAny(), It.IsAny(), It.IsAny())) + .ReturnsAsync(Result.Ok()); + challenges.Setup(c => c.DeleteOlderThanAsync(It.IsAny(), It.IsAny())) + .ReturnsAsync(Result.Ok(0)); + var sut = CreateSut(fx, le, httpChallenges: challenges, primaryReplica: primary); + + var result = await sut.AcmeChallengeAsync(name, CancellationToken.None); + + Assert.True(result.IsSuccess); + Assert.Equal("body", result.Value); + } + [Fact] public async Task ConfigureClientAsync_WhenConfigureFails_PropagatesFailure() { diff --git a/src/MaksIT.CertsUI/Controllers/CertsFlowController.cs b/src/MaksIT.CertsUI/Controllers/CertsFlowController.cs index 0b4cad2..2f63816 100644 --- a/src/MaksIT.CertsUI/Controllers/CertsFlowController.cs +++ b/src/MaksIT.CertsUI/Controllers/CertsFlowController.cs @@ -1,5 +1,6 @@ using MaksIT.Models.LetsEncryptServer.CertsFlow.Requests; using MaksIT.CertsUI.Authorization.Filters; +using MaksIT.CertsUI.Mvc; using MaksIT.CertsUI.Services; using Microsoft.AspNetCore.Mvc; @@ -20,55 +21,55 @@ namespace MaksIT.CertsUI.Controllers { [HttpPost("configure-client")] public async Task ConfigureClient([FromBody] ConfigureClientRequest requestData) { var result = await _certsFlowService.ConfigureClientAsync(requestData.IsStaging); - return result.ToActionResult(); + return result.ToCertsFlowActionResult(); } [HttpGet("{sessionId}/terms-of-service")] public IActionResult TermsOfService(Guid sessionId) { var result = _certsFlowService.GetTermsOfService(sessionId); - return 
result.ToActionResult(); + return result.ToCertsFlowActionResult(); } [HttpPost("{sessionId}/init/{accountId?}")] public async Task Init(Guid sessionId, Guid? accountId, [FromBody] InitRequest requestData) { var result = await _certsFlowService.InitAsync(sessionId, accountId, requestData.Description, requestData.Contacts); - return result.ToActionResult(); + return result.ToCertsFlowActionResult(); } [HttpPost("{sessionId}/order")] public async Task NewOrder(Guid sessionId, [FromBody] NewOrderRequest requestData) { var result = await _certsFlowService.NewOrderAsync(sessionId, requestData.Hostnames, requestData.ChallengeType); - return result.ToActionResult(); + return result.ToCertsFlowActionResult(); } [HttpPost("{sessionId}/complete-challenges")] public async Task CompleteChallenges(Guid sessionId) { var result = await _certsFlowService.CompleteChallengesAsync(sessionId); - return result.ToActionResult(); + return result.ToCertsFlowActionResult(); } [HttpGet("{sessionId}/order-status")] public async Task GetOrder(Guid sessionId, [FromBody] GetOrderRequest requestData) { var result = await _certsFlowService.GetOrderAsync(sessionId, requestData.Hostnames); - return result.ToActionResult(); + return result.ToCertsFlowActionResult(); } [HttpPost("{sessionId}/certificates/download")] public async Task GetCertificates(Guid sessionId, [FromBody] GetCertificatesRequest requestData) { var result = await _certsFlowService.GetCertificatesAsync(sessionId, requestData.Hostnames); - return result.ToActionResult(); + return result.ToCertsFlowActionResult(); } [HttpPost("{accountId}/certificates/apply")] public async Task ApplyCertificates(Guid accountId) { var result = await _certsFlowService.ApplyCertificatesAsync(accountId); - return result.ToActionResult(); + return result.ToCertsFlowActionResult(); } [HttpPost("{sessionId}/certificates/revoke")] public async Task RevokeCertificates(Guid sessionId, [FromBody] RevokeCertificatesRequest requestData) { var result = await 
_certsFlowService.RevokeCertificatesAsync(sessionId, requestData.Hostnames); - return result.ToActionResult(); + return result.ToCertsFlowActionResult(); } } } diff --git a/src/MaksIT.CertsUI/HostedServices/AutoRenewal.cs b/src/MaksIT.CertsUI/HostedServices/AutoRenewal.cs index ad11cd3..1217929 100644 --- a/src/MaksIT.CertsUI/HostedServices/AutoRenewal.cs +++ b/src/MaksIT.CertsUI/HostedServices/AutoRenewal.cs @@ -1,5 +1,6 @@ using MaksIT.CertsUI.Engine.Domain.Certs; using MaksIT.CertsUI.Engine.Persistance.Services; +using MaksIT.CertsUI.Engine.RuntimeCoordination; using MaksIT.Results; using MaksIT.CertsUI.Services; using Microsoft.Extensions.Options; @@ -11,22 +12,30 @@ namespace MaksIT.CertsUI.HostedServices { private readonly IOptions _appSettings; private readonly ILogger _logger; private readonly IServiceScopeFactory _scopeFactory; + private readonly IPrimaryReplicaWorkload _primaryReplica; private static readonly Random _random = new(); public AutoRenewal( IOptions appSettings, ILogger logger, - IServiceScopeFactory scopeFactory + IServiceScopeFactory scopeFactory, + IPrimaryReplicaWorkload primaryReplica ) { _appSettings = appSettings; _logger = logger; _scopeFactory = scopeFactory; + _primaryReplica = primaryReplica; } protected override async Task ExecuteAsync(CancellationToken stoppingToken) { while (!stoppingToken.IsCancellationRequested) { - _logger.LogInformation("Background service is running."); + if (!_primaryReplica.IsPrimary) { + await Task.Delay(TimeSpan.FromSeconds(10), stoppingToken).ConfigureAwait(false); + continue; + } + + _logger.LogInformation("Background service is running (primary replica)."); using var scope = _scopeFactory.CreateScope(); var cacheService = scope.ServiceProvider.GetRequiredService(); diff --git a/src/MaksIT.CertsUI/HostedServices/InitializationHostedService.cs b/src/MaksIT.CertsUI/HostedServices/InitializationHostedService.cs index 080c9c7..6096c56 100644 --- 
a/src/MaksIT.CertsUI/HostedServices/InitializationHostedService.cs +++ b/src/MaksIT.CertsUI/HostedServices/InitializationHostedService.cs @@ -1,70 +1,92 @@ +using Microsoft.Extensions.Hosting; using Microsoft.Extensions.Options; using MaksIT.CertsUI.Engine; using MaksIT.CertsUI.Engine.DomainServices; using MaksIT.CertsUI.Engine.Infrastructure; using MaksIT.CertsUI.Engine.RuntimeCoordination; +using MaksIT.CertsUI.Infrastructure; namespace MaksIT.CertsUI.HostedServices; /// -/// Runs identity bootstrap before the API starts serving requests. FluentMigrator already ran in Program.cs -/// before the host starts; coordination tables in public are ensured again here before the bootstrap lease. -/// The bootstrap lease ensures only one replica writes against shared . +/// Exactly one instance holds the RuntimeLeaseNames.PrimaryReplica lease and runs coordination DDL plus identity bootstrap. +/// Other instances wait until the database (and optional shared init marker under CertsUIEngineConfiguration.DataFolder) shows bootstrap complete, then start without ACME privileges. 
/// public sealed class InitializationHostedService( ILogger logger, IServiceProvider serviceProvider, IOptions appSettings, - IRuntimeLeaseService runtimeLease, - IRuntimeInstanceId runtimeInstance + PrimaryReplicaGate primaryGate ) : IHostedService { - private static readonly TimeSpan BootstrapLeaseTtl = TimeSpan.FromMinutes(8); - public async Task StartAsync(CancellationToken cancellationToken) { const int delayMilliseconds = 2000; + var appLifetime = serviceProvider.GetRequiredService(); while (!cancellationToken.IsCancellationRequested) { try { - logger.LogInformation("Running startup initialization..."); + logger.LogInformation("Running startup initialization (primary replica election)..."); - var engineConfig = serviceProvider.GetRequiredService(); - await CoordinationTableProvisioner.EnsureAsync(engineConfig.ConnectionString, cancellationToken).ConfigureAwait(false); + if (await primaryGate.TryAcquirePrimaryLeaseAsync(cancellationToken).ConfigureAwait(false)) { + primaryGate.StartLeaseRenewal(appLifetime); + try { + var engineConfig = serviceProvider.GetRequiredService(); + await CoordinationTableProvisioner.EnsureAsync(engineConfig.ConnectionString, cancellationToken).ConfigureAwait(false); - var holder = runtimeInstance.InstanceId; - var acquired = await runtimeLease.TryAcquireAsync(RuntimeLeaseNames.Bootstrap, holder, BootstrapLeaseTtl, cancellationToken).ConfigureAwait(false); - if (!acquired.IsSuccess) - throw new InvalidOperationException(string.Join(", ", acquired.Messages ?? 
["Lease acquire failed."])); - if (!acquired.Value) { - logger.LogInformation("Bootstrap lease held by another instance; waiting..."); - await Task.Delay(delayMilliseconds, cancellationToken).ConfigureAwait(false); - continue; + await using var scope = serviceProvider.CreateAsyncScope(); + var identityDomainService = scope.ServiceProvider.GetRequiredService(); + await EnsureIdentityAsLeaderAsync(appSettings.Value, identityDomainService, cancellationToken).ConfigureAwait(false); + } + catch { + await primaryGate.AbandonPrimaryAsync().ConfigureAwait(false); + throw; + } + + primaryGate.EnablePrimaryWorkload(); + logger.LogInformation("Startup initialization completed; this instance is the primary replica."); + return; } - try { - await using var scope = serviceProvider.CreateAsyncScope(); - var identityDomainService = scope.ServiceProvider.GetRequiredService(); - await EnsureIdentityInitializedAsync(appSettings.Value, identityDomainService, cancellationToken).ConfigureAwait(false); - } - finally { - var released = await runtimeLease.ReleaseAsync(RuntimeLeaseNames.Bootstrap, holder, cancellationToken).ConfigureAwait(false); - if (!released.IsSuccess) - logger.LogWarning("Bootstrap lease release reported failure: {Messages}", string.Join("; ", released.Messages ?? 
[])); + await using (var followerScope = serviceProvider.CreateAsyncScope()) { + var identityFollower = followerScope.ServiceProvider.GetRequiredService(); + var cfg = appSettings.Value; + while (!cancellationToken.IsCancellationRequested) { + if (await IsClusterIdentityReadyAsync(cfg, identityFollower, cancellationToken).ConfigureAwait(false)) { + logger.LogInformation("Startup initialization completed; this instance is a secondary replica."); + return; + } + + logger.LogInformation("Waiting for primary replica to finish database bootstrap..."); + await Task.Delay(delayMilliseconds, cancellationToken).ConfigureAwait(false); + } } - logger.LogInformation("Startup initialization completed."); - return; + cancellationToken.ThrowIfCancellationRequested(); + } + catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) { + logger.LogInformation("Startup initialization canceled (host is stopping)."); + throw; } catch (Exception ex) { + if (cancellationToken.IsCancellationRequested) { + logger.LogInformation(ex, "Startup initialization aborted while stopping host."); + throw new OperationCanceledException("Host stopped during startup initialization.", ex, cancellationToken); + } logger.LogError(ex, "Startup initialization failed. 
Retrying..."); - await Task.Delay(delayMilliseconds, cancellationToken).ConfigureAwait(false); + try { + await Task.Delay(delayMilliseconds, cancellationToken).ConfigureAwait(false); + } + catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) { + logger.LogInformation("Startup initialization retry wait canceled (host is stopping)."); + throw; + } } } } public Task StopAsync(CancellationToken cancellationToken) => Task.CompletedTask; - private static async Task EnsureIdentityInitializedAsync( + private static async Task EnsureIdentityAsLeaderAsync( Configuration appSettings, IIdentityDomainService identityDomainService, CancellationToken cancellationToken @@ -89,4 +111,29 @@ public sealed class InitializationHostedService( await File.WriteAllTextAsync(initPath, string.Empty, cancellationToken).ConfigureAwait(false); } + + private static async Task IsClusterIdentityReadyAsync( + Configuration appSettings, + IIdentityDomainService identityDomainService, + CancellationToken cancellationToken + ) { + var dataDir = appSettings.CertsUIEngineConfiguration.DataFolder; + if (!Directory.Exists(dataDir)) + Directory.CreateDirectory(dataDir); + + var initPath = Path.Combine(dataDir, "init"); + if (File.Exists(initPath)) + return true; + + var count = await identityDomainService.CountUsersAsync(cancellationToken).ConfigureAwait(false); + if (!count.IsSuccess) + throw new InvalidOperationException(string.Join(", ", count.Messages)); + + if (count.Value > 0) { + await File.WriteAllTextAsync(initPath, string.Empty, cancellationToken).ConfigureAwait(false); + return true; + } + + return false; + } } diff --git a/src/MaksIT.CertsUI/HostedServices/PrimaryReplicaShutdownHostedService.cs b/src/MaksIT.CertsUI/HostedServices/PrimaryReplicaShutdownHostedService.cs new file mode 100644 index 0000000..e9ae4f6 --- /dev/null +++ b/src/MaksIT.CertsUI/HostedServices/PrimaryReplicaShutdownHostedService.cs @@ -0,0 +1,14 @@ +using MaksIT.CertsUI.Infrastructure; + 
+namespace MaksIT.CertsUI.HostedServices; + +/// +/// Registered last so StopAsync runs first on shutdown: releases the primary Postgres lease and stops renewal. +/// +public sealed class PrimaryReplicaShutdownHostedService(PrimaryReplicaGate primaryGate) : IHostedService { + + public Task StartAsync(CancellationToken cancellationToken) => Task.CompletedTask; + + public async Task StopAsync(CancellationToken cancellationToken) => + await primaryGate.AbandonPrimaryAsync().ConfigureAwait(false); +} diff --git a/src/MaksIT.CertsUI/Infrastructure/PrimaryReplicaGate.cs b/src/MaksIT.CertsUI/Infrastructure/PrimaryReplicaGate.cs new file mode 100644 index 0000000..757832e --- /dev/null +++ b/src/MaksIT.CertsUI/Infrastructure/PrimaryReplicaGate.cs @@ -0,0 +1,121 @@ +using Microsoft.Extensions.Hosting; +using MaksIT.CertsUI.Engine.Infrastructure; +using MaksIT.CertsUI.Engine.RuntimeCoordination; + +namespace MaksIT.CertsUI.Infrastructure; + +/// +/// Holds the RuntimeLeaseNames.PrimaryReplica lease and renews it while this instance is leader. +/// IsPrimary stays false until EnablePrimaryWorkload runs after successful startup bootstrap. +/// +public sealed class PrimaryReplicaGate( + IRuntimeLeaseService leaseService, + IRuntimeInstanceId runtimeInstance, + ILogger logger +) : IPrimaryReplicaWorkload, IAsyncDisposable { + + private static readonly TimeSpan PrimaryLeaseTtl = TimeSpan.FromSeconds(90); + private static readonly TimeSpan RenewInterval = TimeSpan.FromSeconds(30); + + private readonly object _sync = new(); + private CancellationTokenSource? _renewCts; + private Task? _renewalTask; + private string? _holderId; + private volatile bool _mayRunPrimaryWorkload; + + public bool IsPrimary => _mayRunPrimaryWorkload; + + /// Single attempt to insert/update the primary lease row for this holder. 
+ public async Task TryAcquirePrimaryLeaseAsync(CancellationToken cancellationToken) { + var holder = runtimeInstance.InstanceId; + var acquired = await leaseService.TryAcquireAsync(RuntimeLeaseNames.PrimaryReplica, holder, PrimaryLeaseTtl, cancellationToken).ConfigureAwait(false); + if (!acquired.IsSuccess) + throw new InvalidOperationException(string.Join(", ", acquired.Messages ?? ["Primary lease acquire failed."])); + if (!acquired.Value) + return false; + + lock (_sync) { + _holderId = holder; + _mayRunPrimaryWorkload = false; + } + + return true; + } + + /// After TryAcquirePrimaryLeaseAsync returned true, start renewal (call before long init). + public void StartLeaseRenewal(IHostApplicationLifetime applicationLifetime) { + lock (_sync) { + if (_holderId == null) + throw new InvalidOperationException("Cannot start renewal without an acquired primary lease."); + _renewCts?.Cancel(); + _renewCts?.Dispose(); + _renewCts = CancellationTokenSource.CreateLinkedTokenSource(applicationLifetime.ApplicationStopping); + var holder = _holderId; + var ct = _renewCts.Token; + _renewalTask = RenewalLoopAsync(holder, ct); + } + } + + public void EnablePrimaryWorkload() => _mayRunPrimaryWorkload = true; + + private async Task RenewalLoopAsync(string holderId, CancellationToken cancellationToken) { + try { + while (!cancellationToken.IsCancellationRequested) { + var renewed = await leaseService.TryAcquireAsync(RuntimeLeaseNames.PrimaryReplica, holderId, PrimaryLeaseTtl, cancellationToken).ConfigureAwait(false); + if (!renewed.IsSuccess || !renewed.Value) { + if (logger.IsEnabled(LogLevel.Warning)) + logger.LogWarning("Primary replica lease was not renewed (success={Success}, acquired={Acquired}).", renewed.IsSuccess, renewed.Value); + _mayRunPrimaryWorkload = false; + return; + } + + await Task.Delay(RenewInterval, cancellationToken).ConfigureAwait(false); + } + } + catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) { + // normal shutdown + } + catch (Exception ex) { + 
if (logger.IsEnabled(LogLevel.Error)) + logger.LogError(ex, "Primary replica lease renewal loop failed."); + _mayRunPrimaryWorkload = false; + } + } + + /// Release lease and stop renewal after failed leader bootstrap (instance stays usable for retry). + public async Task AbandonPrimaryAsync() { + _mayRunPrimaryWorkload = false; + Task? renewalToAwait; + CancellationTokenSource? cts; + string? holder; + lock (_sync) { + holder = _holderId; + _holderId = null; + cts = _renewCts; + _renewCts = null; + renewalToAwait = _renewalTask; + _renewalTask = null; + } + + try { + cts?.Cancel(); + if (renewalToAwait != null) + await renewalToAwait.ConfigureAwait(false); + } + catch (Exception ex) { + if (logger.IsEnabled(LogLevel.Debug)) + logger.LogDebug(ex, "Primary renewal task did not end cleanly during abandon."); + } + finally { + cts?.Dispose(); + } + + if (holder != null) { + var released = await leaseService.ReleaseAsync(RuntimeLeaseNames.PrimaryReplica, holder, CancellationToken.None).ConfigureAwait(false); + if (!released.IsSuccess && logger.IsEnabled(LogLevel.Warning)) + logger.LogWarning("Primary lease release (abandon): {Messages}", string.Join("; ", released.Messages ?? 
[])); + } + } + + public async ValueTask DisposeAsync() => await AbandonPrimaryAsync().ConfigureAwait(false); +} diff --git a/src/MaksIT.CertsUI/MaksIT.CertsUI.csproj b/src/MaksIT.CertsUI/MaksIT.CertsUI.csproj index d1780d1..a9b89d9 100644 --- a/src/MaksIT.CertsUI/MaksIT.CertsUI.csproj +++ b/src/MaksIT.CertsUI/MaksIT.CertsUI.csproj @@ -1,7 +1,7 @@ - 3.3.14 + 3.3.17 net10.0 enable enable diff --git a/src/MaksIT.CertsUI/Mvc/CertsFlowResultExtensions.cs b/src/MaksIT.CertsUI/Mvc/CertsFlowResultExtensions.cs new file mode 100644 index 0000000..7d4a0c4 --- /dev/null +++ b/src/MaksIT.CertsUI/Mvc/CertsFlowResultExtensions.cs @@ -0,0 +1,26 @@ +using MaksIT.Results; +using MaksIT.Results.Mvc; +using Microsoft.AspNetCore.Mvc; + +namespace MaksIT.CertsUI.Mvc; + +/// +/// Maps ACME domain results to HTTP: primary-replica required becomes 503 + Retry-After + ProblemDetails. +/// +public static class CertsFlowResultExtensions { + + /// Default retry hint for clients and caches (seconds). + public const int DefaultPrimaryReplicaRetryAfterSeconds = 2; + + public static IActionResult ToCertsFlowActionResult(this Result result) { + if (!result.IsSuccess && PrimaryReplicaRequiredObjectResult.IsPrimaryReplicaResult(result.Messages)) + return PrimaryReplicaRequiredObjectResult.FromMessages(result.Messages, DefaultPrimaryReplicaRetryAfterSeconds); + return result.ToActionResult(); + } + + public static IActionResult ToCertsFlowActionResult(this Result result) { + if (!result.IsSuccess && PrimaryReplicaRequiredObjectResult.IsPrimaryReplicaResult(result.Messages)) + return PrimaryReplicaRequiredObjectResult.FromMessages(result.Messages, DefaultPrimaryReplicaRetryAfterSeconds); + return result.ToActionResult(); + } +} diff --git a/src/MaksIT.CertsUI/Mvc/PrimaryReplicaRequiredObjectResult.cs b/src/MaksIT.CertsUI/Mvc/PrimaryReplicaRequiredObjectResult.cs new file mode 100644 index 0000000..a16ce16 --- /dev/null +++ b/src/MaksIT.CertsUI/Mvc/PrimaryReplicaRequiredObjectResult.cs @@ -0,0 +1,39 
@@ +using MaksIT.CertsUI.Engine.DomainServices; +using Microsoft.AspNetCore.Mvc; + +namespace MaksIT.CertsUI.Mvc; + +/// <summary> +/// HTTP 503 with Retry-After (delay-seconds) and RFC 7807 for primary-replica routing. +/// </summary> +internal sealed class PrimaryReplicaRequiredObjectResult : ObjectResult { + + public PrimaryReplicaRequiredObjectResult(ProblemDetails problemDetails, int retryAfterSeconds) : base(problemDetails) { + ArgumentOutOfRangeException.ThrowIfLessThan(retryAfterSeconds, 1); + StatusCode = StatusCodes.Status503ServiceUnavailable; + DeclaredType = typeof(ProblemDetails); + RetryAfterSeconds = retryAfterSeconds; + } + + public int RetryAfterSeconds { get; } + + public override Task ExecuteResultAsync(ActionContext context) { + context.HttpContext.Response.Headers.RetryAfter = RetryAfterSeconds.ToString(System.Globalization.NumberFormatInfo.InvariantInfo); + return base.ExecuteResultAsync(context); + } + + internal static bool IsPrimaryReplicaResult(IReadOnlyList<string>? messages) => + messages is { Count: > 0 } && string.Equals(messages[0], CertsFlowPrimaryReplica.DiagnosticMarker, StringComparison.Ordinal); + + internal static IActionResult FromMessages(IReadOnlyList<string>? messages, int retryAfterSeconds) { + var detail = (messages is { Count: > 1 } ? messages[1] : null) ?? 
"Only the primary replica runs this operation."; + var pd = new ProblemDetails { + Status = StatusCodes.Status503ServiceUnavailable, + Title = "Primary replica required", + Detail = detail, + Type = CertsFlowPrimaryReplica.DiagnosticMarker, + }; + pd.Extensions["retryAfterSeconds"] = retryAfterSeconds; + return new PrimaryReplicaRequiredObjectResult(pd, retryAfterSeconds); + } +} diff --git a/src/MaksIT.CertsUI/Program.cs b/src/MaksIT.CertsUI/Program.cs index 2866dcf..9b66059 100644 --- a/src/MaksIT.CertsUI/Program.cs +++ b/src/MaksIT.CertsUI/Program.cs @@ -67,9 +67,14 @@ builder.Services.AddOptions().Configure(o => builder.Services.AddScoped(); builder.Services.AddScoped(); +// Primary replica: one elected instance (Postgres lease) runs ACME + renewal; register shutdown last so StopAsync releases the lease first. +builder.Services.AddSingleton(); +builder.Services.AddSingleton(sp => sp.GetRequiredService()); + // Hosted services: initialization first, then autorenewal loop. builder.Services.AddHostedService(); builder.Services.AddHostedService(); +builder.Services.AddHostedService(); // PostgreSQL: prefer Configuration:CertsUIEngineConfiguration:ConnectionString in appsecrets.json; fallback ConnectionStrings:Certs for older files. var certsConnectionString = appSettings.CertsUIEngineConfiguration.ConnectionString diff --git a/src/helm/templates/NOTES.txt b/src/helm/templates/NOTES.txt index bebefaf..75f5978 100644 --- a/src/helm/templates/NOTES.txt +++ b/src/helm/templates/NOTES.txt @@ -32,6 +32,8 @@ Optional per workload under **`components.`**: **`replicaCount`** (default When **`replicaCount` > 1**, the chart creates a **PodDisruptionBudget** (`minAvailable: 1`) for that component. +**Primary replica + ACME:** With multiple **server** pods, exactly one holds the Postgres lease `certs-ui-primary` and runs ACME orchestration (`CertsFlowDomainService`, renewal). Others answer **`AcmeChallengeAsync`** from the database for HTTP-01. 
Interactive UI flows should hit the primary: the chart defaults **`ClientIP`** session affinity on the **server** Service, and clients should retry on **503** (see `Retry-After` / `ProblemDetails`). After unclean failover, the old lease row can linger until its TTL (~90s with defaults); renewals and clean shutdown avoid stuck primaries. + **Server + RWO PVCs:** the default **acme** / **data** volumes use **ReadWriteOnce**. Kubernetes will not schedule a second server pod on the same volume; for multiple server replicas you need **ReadWriteMany** (or equivalent) and an application design that tolerates shared disk (see product HA roadmap). ------------------------------------------------------------ diff --git a/src/helm/templates/deployments.yaml b/src/helm/templates/deployments.yaml index b550a62..917f138 100644 --- a/src/helm/templates/deployments.yaml +++ b/src/helm/templates/deployments.yaml @@ -54,6 +54,9 @@ spec: {{- end }} certs-ui.io/image: {{ printf "%s/%s:%s" $comp.image.registry $comp.image.repository $imageTag | quote }} spec: + {{- with $comp.terminationGracePeriodSeconds }} + terminationGracePeriodSeconds: {{ . }} + {{- end }} {{- include "certs-ui.imagePullSecrets" $root | nindent 6 }} containers: - name: {{ $compName }} @@ -84,6 +87,10 @@ spec: {{- end }} {{- with $comp.resources }} resources: +{{- toYaml . | nindent 12 }} + {{- end }} + {{- with $comp.lifecycle }} + lifecycle: {{- toYaml . 
| nindent 12 }} {{- end }} {{- $p := default dict $comp.persistence -}} diff --git a/src/helm/templates/services.yaml b/src/helm/templates/services.yaml index c66ee75..127c702 100644 --- a/src/helm/templates/services.yaml +++ b/src/helm/templates/services.yaml @@ -13,6 +13,13 @@ metadata: app.kubernetes.io/component: {{ $compName }} spec: type: {{ default "ClusterIP" $svc.type }} + {{- $sa := default dict $svc.sessionAffinity }} + {{- if $sa.enabled }} + sessionAffinity: ClientIP + sessionAffinityConfig: + clientIP: + timeoutSeconds: {{ default 10800 $sa.clientIPTimeoutSeconds }} + {{- end }} ports: - port: {{ default 80 $svc.port }} targetPort: http diff --git a/src/helm/values.yaml b/src/helm/values.yaml index 5860d58..a0aa5d2 100644 --- a/src/helm/values.yaml +++ b/src/helm/values.yaml @@ -83,6 +83,16 @@ components: type: ClusterIP port: 5000 targetPort: 5000 + # ClientIP affinity helps browsers hit the same server pod for multi-step ACME (primary holds orchestration). + sessionAffinity: + enabled: true + clientIPTimeoutSeconds: 10800 + # Give kube-proxy / ingress time to stop sending new connections before SIGKILL (pairs with preStop). + terminationGracePeriodSeconds: 90 + lifecycle: + preStop: + exec: + command: ["/bin/sh", "-c", "sleep 5"] persistence: storageClass: local-path volumes: