(bugfix): ha mode replicas debug

This commit is contained in:
Maksym Sadovnychyy 2026-04-26 16:14:06 +02:00
parent c6d5b3fd1e
commit 23d143aabe
19 changed files with 454 additions and 52 deletions

View File

@ -4,6 +4,27 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [3.3.17] - 2026-04-26
### Changed
- **HA / API:** Non-primary replicas return **`Result.ServiceUnavailable`** with stable marker `urn:maksit:certs-ui:primary-replica-required` for ACME orchestration; the host maps that to **HTTP 503**, **`Retry-After`**, and **RFC 7807 `ProblemDetails`** (replacing ad-hoc 429-style overload semantics for this case).
- **Helm:** Default **`components.server.service.sessionAffinity`** (`ClientIP`, configurable timeout), **`terminationGracePeriodSeconds`**, and a short **`preStop` sleep** so rolling updates drain connections before the primary lease TTL window. Disable or tune under **`components.server`** if your ingress already pins API traffic.
## [3.3.16] - 2026-04-26
### Changed
- **HA / primary replica:** A single elected instance holds Postgres lease `certs-ui-primary` (`RuntimeLeaseNames.PrimaryReplica`), renews it periodically, and is the only instance with `IPrimaryReplicaWorkload.IsPrimary` after startup. It runs coordination DDL, identity bootstrap, **all ACME domain flows** (`CertsFlowDomainService`), and **`AutoRenewal`**. Other replicas serve HTTP (identity, health, etc.) and **`AcmeChallengeAsync`** (HTTP-01 token materialization for ingress). Followers reject ACME orchestration at the domain layer until they become primary after failover.
- **Startup:** Removed separate `certs-ui-bootstrap` lease; primary lease serializes first-time admin creation. `PrimaryReplicaShutdownHostedService` (registered last) releases the primary lease on shutdown.
## [3.3.15] - 2026-04-26
### Fixed
- **Startup / HA:** `InitializationHostedService` no longer takes the bootstrap lease when PostgreSQL already has users. Only the empty-database path waits on the lease (single-writer default admin). Extra replicas used to block on the lease until Kubernetes canceled `StartAsync`, surfacing as `TaskCanceledException` at startup while the first replica held the lease.
- **Startup:** Retry backoff treats `OperationCanceledException` when the host is stopping as shutdown (no misleading “initialization failed” loop); cooperative cancel still ends startup.
## [3.3.14] - 2026-04-26 ## [3.3.14] - 2026-04-26
### Fixed ### Fixed

View File

@ -57,6 +57,7 @@ public class CertsFlowDomainService : ICertsFlowDomainService {
private readonly IAcmeHttpChallengePersistenceService _httpChallenges; private readonly IAcmeHttpChallengePersistenceService _httpChallenges;
private readonly IRuntimeLeaseService _runtimeLease; private readonly IRuntimeLeaseService _runtimeLease;
private readonly IRuntimeInstanceId _runtimeInstance; private readonly IRuntimeInstanceId _runtimeInstance;
private readonly IPrimaryReplicaWorkload _primaryReplica;
private readonly string _acmePath; private readonly string _acmePath;
public CertsFlowDomainService( public CertsFlowDomainService(
@ -68,7 +69,8 @@ public class CertsFlowDomainService : ICertsFlowDomainService {
ICertsFlowEngineConfiguration config, ICertsFlowEngineConfiguration config,
IAcmeHttpChallengePersistenceService httpChallenges, IAcmeHttpChallengePersistenceService httpChallenges,
IRuntimeLeaseService runtimeLease, IRuntimeLeaseService runtimeLease,
IRuntimeInstanceId runtimeInstance) { IRuntimeInstanceId runtimeInstance,
IPrimaryReplicaWorkload primaryReplica) {
_logger = logger; _logger = logger;
_httpClient = httpClient; _httpClient = httpClient;
_letsEncryptService = letsEncryptService; _letsEncryptService = letsEncryptService;
@ -78,12 +80,16 @@ public class CertsFlowDomainService : ICertsFlowDomainService {
_httpChallenges = httpChallenges; _httpChallenges = httpChallenges;
_runtimeLease = runtimeLease; _runtimeLease = runtimeLease;
_runtimeInstance = runtimeInstance; _runtimeInstance = runtimeInstance;
_primaryReplica = primaryReplica;
_acmePath = config.AcmeFolder; _acmePath = config.AcmeFolder;
} }
#region Terms of service #region Terms of service
public Result<string?> GetTermsOfService(Guid sessionId) { public Result<string?> GetTermsOfService(Guid sessionId) {
if (!_primaryReplica.IsPrimary)
return Result<string?>.ServiceUnavailable(null, CertsFlowPrimaryReplica.ServiceUnavailableMessages);
var result = _letsEncryptService.GetTermsOfServiceUri(sessionId); var result = _letsEncryptService.GetTermsOfServiceUri(sessionId);
if (!result.IsSuccess || result.Value == null) if (!result.IsSuccess || result.Value == null)
return result; return result;
@ -122,10 +128,15 @@ public class CertsFlowDomainService : ICertsFlowDomainService {
#region Session, orders, and certificates #region Session, orders, and certificates
public async Task<Result> CompleteChallengesAsync(Guid sessionId) => public async Task<Result> CompleteChallengesAsync(Guid sessionId) {
await _letsEncryptService.CompleteChallenges(sessionId); if (!_primaryReplica.IsPrimary)
return Result.ServiceUnavailable(CertsFlowPrimaryReplica.ServiceUnavailableMessages);
return await _letsEncryptService.CompleteChallenges(sessionId);
}
public async Task<Result<Guid?>> ConfigureClientAsync(bool isStaging) { public async Task<Result<Guid?>> ConfigureClientAsync(bool isStaging) {
if (!_primaryReplica.IsPrimary)
return Result<Guid?>.ServiceUnavailable(null, CertsFlowPrimaryReplica.ServiceUnavailableMessages);
var sessionId = Guid.NewGuid(); var sessionId = Guid.NewGuid();
var result = await _letsEncryptService.ConfigureClient(sessionId, isStaging); var result = await _letsEncryptService.ConfigureClient(sessionId, isStaging);
if (!result.IsSuccess) if (!result.IsSuccess)
@ -134,6 +145,8 @@ public class CertsFlowDomainService : ICertsFlowDomainService {
} }
public async Task<Result<Guid?>> InitAsync(Guid sessionId, Guid? accountId, string description, string[] contacts) { public async Task<Result<Guid?>> InitAsync(Guid sessionId, Guid? accountId, string description, string[] contacts) {
if (!_primaryReplica.IsPrimary)
return Result<Guid?>.ServiceUnavailable(null, CertsFlowPrimaryReplica.ServiceUnavailableMessages);
RegistrationCache? cache = null; RegistrationCache? cache = null;
if (accountId == null) { if (accountId == null) {
accountId = Guid.NewGuid(); accountId = Guid.NewGuid();
@ -154,6 +167,8 @@ public class CertsFlowDomainService : ICertsFlowDomainService {
} }
public async Task<Result<List<string>?>> NewOrderAsync(Guid sessionId, string[] hostnames, string challengeType) { public async Task<Result<List<string>?>> NewOrderAsync(Guid sessionId, string[] hostnames, string challengeType) {
if (!_primaryReplica.IsPrimary)
return Result<List<string>?>.ServiceUnavailable(null, CertsFlowPrimaryReplica.ServiceUnavailableMessages);
var holder = _runtimeInstance.InstanceId; var holder = _runtimeInstance.InstanceId;
var acquired = await _runtimeLease.TryAcquireAsync(RuntimeLeaseNames.AcmeWriter, holder, AcmeWriterLeaseTtl, CancellationToken.None); var acquired = await _runtimeLease.TryAcquireAsync(RuntimeLeaseNames.AcmeWriter, holder, AcmeWriterLeaseTtl, CancellationToken.None);
if (!acquired.IsSuccess) if (!acquired.IsSuccess)
@ -189,6 +204,8 @@ public class CertsFlowDomainService : ICertsFlowDomainService {
} }
public async Task<Result> GetCertificatesAsync(Guid sessionId, string[] hostnames) { public async Task<Result> GetCertificatesAsync(Guid sessionId, string[] hostnames) {
if (!_primaryReplica.IsPrimary)
return Result.ServiceUnavailable(CertsFlowPrimaryReplica.ServiceUnavailableMessages);
foreach (var subject in hostnames) { foreach (var subject in hostnames) {
var result = await _letsEncryptService.GetCertificate(sessionId, subject); var result = await _letsEncryptService.GetCertificate(sessionId, subject);
if (!result.IsSuccess) if (!result.IsSuccess)
@ -204,14 +221,19 @@ public class CertsFlowDomainService : ICertsFlowDomainService {
return Result.Ok(); return Result.Ok();
} }
public async Task<Result> GetOrderAsync(Guid sessionId, string[] hostnames) => public async Task<Result> GetOrderAsync(Guid sessionId, string[] hostnames) {
await _letsEncryptService.GetOrder(sessionId, hostnames); if (!_primaryReplica.IsPrimary)
return Result.ServiceUnavailable(CertsFlowPrimaryReplica.ServiceUnavailableMessages);
return await _letsEncryptService.GetOrder(sessionId, hostnames);
}
#endregion #endregion
#region Deploy and revoke #region Deploy and revoke
public async Task<Result<Dictionary<string, string>?>> ApplyCertificatesAsync(Guid accountId) { public async Task<Result<Dictionary<string, string>?>> ApplyCertificatesAsync(Guid accountId) {
if (!_primaryReplica.IsPrimary)
return Result<Dictionary<string, string>?>.ServiceUnavailable(null, CertsFlowPrimaryReplica.ServiceUnavailableMessages);
var cacheResult = await _registrationCache.LoadAsync(accountId); var cacheResult = await _registrationCache.LoadAsync(accountId);
if (!cacheResult.IsSuccess || cacheResult.Value?.CachedCerts == null) if (!cacheResult.IsSuccess || cacheResult.Value?.CachedCerts == null)
return cacheResult.ToResultOfType<Dictionary<string, string>?>(_ => null); return cacheResult.ToResultOfType<Dictionary<string, string>?>(_ => null);
@ -231,6 +253,8 @@ public class CertsFlowDomainService : ICertsFlowDomainService {
} }
public async Task<Result> RevokeCertificatesAsync(Guid sessionId, string[] hostnames) { public async Task<Result> RevokeCertificatesAsync(Guid sessionId, string[] hostnames) {
if (!_primaryReplica.IsPrimary)
return Result.ServiceUnavailable(CertsFlowPrimaryReplica.ServiceUnavailableMessages);
foreach (var hostname in hostnames) { foreach (var hostname in hostnames) {
var result = await _letsEncryptService.RevokeCertificate(sessionId, hostname, RevokeReason.Unspecified); var result = await _letsEncryptService.RevokeCertificate(sessionId, hostname, RevokeReason.Unspecified);
if (!result.IsSuccess) if (!result.IsSuccess)

View File

@ -0,0 +1,16 @@
namespace MaksIT.CertsUI.Engine.DomainServices;
/// <summary>
/// Stable markers for <c>Result.ServiceUnavailable</c> when ACME is invoked on a non-primary replica.
/// The host maps these to HTTP 503 + <c>Retry-After</c> + RFC 7807 <c>ProblemDetails</c>.
/// </summary>
public static class CertsFlowPrimaryReplica {
/// <summary>Machine-readable first line in result messages for detection in MVC.</summary>
public const string DiagnosticMarker = "urn:maksit:certs-ui:primary-replica-required";
public static readonly string[] ServiceUnavailableMessages = [
DiagnosticMarker,
"Only the elected primary Certs UI replica runs ACME orchestration. Retry after a short delay; use service session affinity (ClientIP) so interactive flows stay on the primary."
];
}

View File

@ -0,0 +1,8 @@
namespace MaksIT.CertsUI.Engine.RuntimeCoordination;
/// <summary>
/// True when this process is the elected primary replica (Postgres lease) and may run ACME orchestration and background renewal.
/// </summary>
public interface IPrimaryReplicaWorkload {
bool IsPrimary { get; }
}

View File

@ -3,6 +3,7 @@ namespace MaksIT.CertsUI.Engine.RuntimeCoordination;
/// <summary>PostgreSQL <c>app_runtime_leases.lease_name</c> values.</summary> /// <summary>PostgreSQL <c>app_runtime_leases.lease_name</c> values.</summary>
public static class RuntimeLeaseNames { public static class RuntimeLeaseNames {
public const string AcmeWriter = "certs-ui-acme-writer"; public const string AcmeWriter = "certs-ui-acme-writer";
public const string Bootstrap = "certs-ui-bootstrap";
public const string AutoRenewal = "certs-ui-auto-renewal"; /// <summary>Single elected instance: identity bootstrap, ACME orchestration, and background renewal.</summary>
public const string PrimaryReplica = "certs-ui-primary";
} }

View File

@ -29,7 +29,8 @@ public sealed class CertsFlowServiceTests
Mock<IAcmeHttpChallengePersistenceService>? httpChallenges = null, Mock<IAcmeHttpChallengePersistenceService>? httpChallenges = null,
Mock<IRuntimeLeaseService>? runtimeLease = null, Mock<IRuntimeLeaseService>? runtimeLease = null,
Mock<IRuntimeInstanceId>? runtimeInstance = null, Mock<IRuntimeInstanceId>? runtimeInstance = null,
HttpMessageHandler? httpHandler = null) HttpMessageHandler? httpHandler = null,
Mock<IPrimaryReplicaWorkload>? primaryReplica = null)
{ {
registrationCache ??= new Mock<IRegistrationCachePersistanceService>(); registrationCache ??= new Mock<IRegistrationCachePersistanceService>();
agent ??= new Mock<IAgentDeploymentService>(); agent ??= new Mock<IAgentDeploymentService>();
@ -55,6 +56,9 @@ public sealed class CertsFlowServiceTests
runtimeInstance ??= new Mock<IRuntimeInstanceId>(); runtimeInstance ??= new Mock<IRuntimeInstanceId>();
if (!runtimeInstanceProvided) if (!runtimeInstanceProvided)
runtimeInstance.Setup(i => i.InstanceId).Returns("test-instance"); runtimeInstance.Setup(i => i.InstanceId).Returns("test-instance");
var primaryWorkload = primaryReplica ?? new Mock<IPrimaryReplicaWorkload>();
if (primaryReplica is null)
primaryWorkload.Setup(p => p.IsPrimary).Returns(true);
var handler = httpHandler ?? new StubHttpMessageHandler(_ => new HttpResponseMessage(HttpStatusCode.OK) { Content = new ByteArrayContent([0x25, 0x50, 0x44, 0x46]) }); var handler = httpHandler ?? new StubHttpMessageHandler(_ => new HttpResponseMessage(HttpStatusCode.OK) { Content = new ByteArrayContent([0x25, 0x50, 0x44, 0x46]) });
var httpClient = new HttpClient(handler, disposeHandler: true); var httpClient = new HttpClient(handler, disposeHandler: true);
return new CertsFlowDomainService( return new CertsFlowDomainService(
@ -66,7 +70,8 @@ public sealed class CertsFlowServiceTests
new TestCertsFlowEngineConfiguration(fx), new TestCertsFlowEngineConfiguration(fx),
httpChallenges.Object, httpChallenges.Object,
runtimeLease.Object, runtimeLease.Object,
runtimeInstance.Object); runtimeInstance.Object,
primaryWorkload.Object);
} }
[Fact] [Fact]
@ -85,6 +90,45 @@ public sealed class CertsFlowServiceTests
Assert.NotNull(result.Value); Assert.NotNull(result.Value);
} }
[Fact]
public async Task ConfigureClientAsync_WhenNotPrimary_ReturnsServiceUnavailableWithMarker()
{
using var fx = new WebApiTestFixture();
var le = new Mock<ILetsEncryptService>();
var primary = new Mock<IPrimaryReplicaWorkload>();
primary.Setup(p => p.IsPrimary).Returns(false);
var sut = CreateSut(fx, le, primaryReplica: primary);
var result = await sut.ConfigureClientAsync(isStaging: false);
Assert.False(result.IsSuccess);
Assert.Contains(CertsFlowPrimaryReplica.DiagnosticMarker, result.Messages ?? []);
le.Verify(x => x.ConfigureClient(It.IsAny<Guid>(), It.IsAny<bool>()), Times.Never);
}
[Fact]
public async Task AcmeChallenge_WhenNotPrimary_StillSucceedsFromDatabase()
{
using var fx = new WebApiTestFixture();
var name = "challenge-token";
var le = new Mock<ILetsEncryptService>();
var primary = new Mock<IPrimaryReplicaWorkload>();
primary.Setup(p => p.IsPrimary).Returns(false);
var challenges = new Mock<IAcmeHttpChallengePersistenceService>();
challenges.Setup(c => c.GetTokenValueAsync(name, It.IsAny<CancellationToken>()))
.ReturnsAsync(Result<string?>.Ok("body"));
challenges.Setup(c => c.UpsertAsync(It.IsAny<string>(), It.IsAny<string>(), It.IsAny<CancellationToken>()))
.ReturnsAsync(Result.Ok());
challenges.Setup(c => c.DeleteOlderThanAsync(It.IsAny<TimeSpan>(), It.IsAny<CancellationToken>()))
.ReturnsAsync(Result<int>.Ok(0));
var sut = CreateSut(fx, le, httpChallenges: challenges, primaryReplica: primary);
var result = await sut.AcmeChallengeAsync(name, CancellationToken.None);
Assert.True(result.IsSuccess);
Assert.Equal("body", result.Value);
}
[Fact] [Fact]
public async Task ConfigureClientAsync_WhenConfigureFails_PropagatesFailure() public async Task ConfigureClientAsync_WhenConfigureFails_PropagatesFailure()
{ {

View File

@ -1,5 +1,6 @@
using MaksIT.Models.LetsEncryptServer.CertsFlow.Requests; using MaksIT.Models.LetsEncryptServer.CertsFlow.Requests;
using MaksIT.CertsUI.Authorization.Filters; using MaksIT.CertsUI.Authorization.Filters;
using MaksIT.CertsUI.Mvc;
using MaksIT.CertsUI.Services; using MaksIT.CertsUI.Services;
using Microsoft.AspNetCore.Mvc; using Microsoft.AspNetCore.Mvc;
@ -20,55 +21,55 @@ namespace MaksIT.CertsUI.Controllers {
[HttpPost("configure-client")] [HttpPost("configure-client")]
public async Task<IActionResult> ConfigureClient([FromBody] ConfigureClientRequest requestData) { public async Task<IActionResult> ConfigureClient([FromBody] ConfigureClientRequest requestData) {
var result = await _certsFlowService.ConfigureClientAsync(requestData.IsStaging); var result = await _certsFlowService.ConfigureClientAsync(requestData.IsStaging);
return result.ToActionResult(); return result.ToCertsFlowActionResult();
} }
[HttpGet("{sessionId}/terms-of-service")] [HttpGet("{sessionId}/terms-of-service")]
public IActionResult TermsOfService(Guid sessionId) { public IActionResult TermsOfService(Guid sessionId) {
var result = _certsFlowService.GetTermsOfService(sessionId); var result = _certsFlowService.GetTermsOfService(sessionId);
return result.ToActionResult(); return result.ToCertsFlowActionResult();
} }
[HttpPost("{sessionId}/init/{accountId?}")] [HttpPost("{sessionId}/init/{accountId?}")]
public async Task<IActionResult> Init(Guid sessionId, Guid? accountId, [FromBody] InitRequest requestData) { public async Task<IActionResult> Init(Guid sessionId, Guid? accountId, [FromBody] InitRequest requestData) {
var result = await _certsFlowService.InitAsync(sessionId, accountId, requestData.Description, requestData.Contacts); var result = await _certsFlowService.InitAsync(sessionId, accountId, requestData.Description, requestData.Contacts);
return result.ToActionResult(); return result.ToCertsFlowActionResult();
} }
[HttpPost("{sessionId}/order")] [HttpPost("{sessionId}/order")]
public async Task<IActionResult> NewOrder(Guid sessionId, [FromBody] NewOrderRequest requestData) { public async Task<IActionResult> NewOrder(Guid sessionId, [FromBody] NewOrderRequest requestData) {
var result = await _certsFlowService.NewOrderAsync(sessionId, requestData.Hostnames, requestData.ChallengeType); var result = await _certsFlowService.NewOrderAsync(sessionId, requestData.Hostnames, requestData.ChallengeType);
return result.ToActionResult(); return result.ToCertsFlowActionResult();
} }
[HttpPost("{sessionId}/complete-challenges")] [HttpPost("{sessionId}/complete-challenges")]
public async Task<IActionResult> CompleteChallenges(Guid sessionId) { public async Task<IActionResult> CompleteChallenges(Guid sessionId) {
var result = await _certsFlowService.CompleteChallengesAsync(sessionId); var result = await _certsFlowService.CompleteChallengesAsync(sessionId);
return result.ToActionResult(); return result.ToCertsFlowActionResult();
} }
[HttpGet("{sessionId}/order-status")] [HttpGet("{sessionId}/order-status")]
public async Task<IActionResult> GetOrder(Guid sessionId, [FromBody] GetOrderRequest requestData) { public async Task<IActionResult> GetOrder(Guid sessionId, [FromBody] GetOrderRequest requestData) {
var result = await _certsFlowService.GetOrderAsync(sessionId, requestData.Hostnames); var result = await _certsFlowService.GetOrderAsync(sessionId, requestData.Hostnames);
return result.ToActionResult(); return result.ToCertsFlowActionResult();
} }
[HttpPost("{sessionId}/certificates/download")] [HttpPost("{sessionId}/certificates/download")]
public async Task<IActionResult> GetCertificates(Guid sessionId, [FromBody] GetCertificatesRequest requestData) { public async Task<IActionResult> GetCertificates(Guid sessionId, [FromBody] GetCertificatesRequest requestData) {
var result = await _certsFlowService.GetCertificatesAsync(sessionId, requestData.Hostnames); var result = await _certsFlowService.GetCertificatesAsync(sessionId, requestData.Hostnames);
return result.ToActionResult(); return result.ToCertsFlowActionResult();
} }
[HttpPost("{accountId}/certificates/apply")] [HttpPost("{accountId}/certificates/apply")]
public async Task<IActionResult> ApplyCertificates(Guid accountId) { public async Task<IActionResult> ApplyCertificates(Guid accountId) {
var result = await _certsFlowService.ApplyCertificatesAsync(accountId); var result = await _certsFlowService.ApplyCertificatesAsync(accountId);
return result.ToActionResult(); return result.ToCertsFlowActionResult();
} }
[HttpPost("{sessionId}/certificates/revoke")] [HttpPost("{sessionId}/certificates/revoke")]
public async Task<IActionResult> RevokeCertificates(Guid sessionId, [FromBody] RevokeCertificatesRequest requestData) { public async Task<IActionResult> RevokeCertificates(Guid sessionId, [FromBody] RevokeCertificatesRequest requestData) {
var result = await _certsFlowService.RevokeCertificatesAsync(sessionId, requestData.Hostnames); var result = await _certsFlowService.RevokeCertificatesAsync(sessionId, requestData.Hostnames);
return result.ToActionResult(); return result.ToCertsFlowActionResult();
} }
} }
} }

View File

@ -1,5 +1,6 @@
using MaksIT.CertsUI.Engine.Domain.Certs; using MaksIT.CertsUI.Engine.Domain.Certs;
using MaksIT.CertsUI.Engine.Persistance.Services; using MaksIT.CertsUI.Engine.Persistance.Services;
using MaksIT.CertsUI.Engine.RuntimeCoordination;
using MaksIT.Results; using MaksIT.Results;
using MaksIT.CertsUI.Services; using MaksIT.CertsUI.Services;
using Microsoft.Extensions.Options; using Microsoft.Extensions.Options;
@ -11,22 +12,30 @@ namespace MaksIT.CertsUI.HostedServices {
private readonly IOptions<Configuration> _appSettings; private readonly IOptions<Configuration> _appSettings;
private readonly ILogger<AutoRenewal> _logger; private readonly ILogger<AutoRenewal> _logger;
private readonly IServiceScopeFactory _scopeFactory; private readonly IServiceScopeFactory _scopeFactory;
private readonly IPrimaryReplicaWorkload _primaryReplica;
private static readonly Random _random = new(); private static readonly Random _random = new();
public AutoRenewal( public AutoRenewal(
IOptions<Configuration> appSettings, IOptions<Configuration> appSettings,
ILogger<AutoRenewal> logger, ILogger<AutoRenewal> logger,
IServiceScopeFactory scopeFactory IServiceScopeFactory scopeFactory,
IPrimaryReplicaWorkload primaryReplica
) { ) {
_appSettings = appSettings; _appSettings = appSettings;
_logger = logger; _logger = logger;
_scopeFactory = scopeFactory; _scopeFactory = scopeFactory;
_primaryReplica = primaryReplica;
} }
protected override async Task ExecuteAsync(CancellationToken stoppingToken) { protected override async Task ExecuteAsync(CancellationToken stoppingToken) {
while (!stoppingToken.IsCancellationRequested) { while (!stoppingToken.IsCancellationRequested) {
_logger.LogInformation("Background service is running."); if (!_primaryReplica.IsPrimary) {
await Task.Delay(TimeSpan.FromSeconds(10), stoppingToken).ConfigureAwait(false);
continue;
}
_logger.LogInformation("Background service is running (primary replica).");
using var scope = _scopeFactory.CreateScope(); using var scope = _scopeFactory.CreateScope();
var cacheService = scope.ServiceProvider.GetRequiredService<ICacheService>(); var cacheService = scope.ServiceProvider.GetRequiredService<ICacheService>();

View File

@ -1,70 +1,92 @@
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Options; using Microsoft.Extensions.Options;
using MaksIT.CertsUI.Engine; using MaksIT.CertsUI.Engine;
using MaksIT.CertsUI.Engine.DomainServices; using MaksIT.CertsUI.Engine.DomainServices;
using MaksIT.CertsUI.Engine.Infrastructure; using MaksIT.CertsUI.Engine.Infrastructure;
using MaksIT.CertsUI.Engine.RuntimeCoordination; using MaksIT.CertsUI.Engine.RuntimeCoordination;
using MaksIT.CertsUI.Infrastructure;
namespace MaksIT.CertsUI.HostedServices; namespace MaksIT.CertsUI.HostedServices;
/// <summary> /// <summary>
/// Runs identity bootstrap before the API starts serving requests. FluentMigrator already ran in <c>Program.cs</c> /// Exactly one instance holds <see cref="RuntimeLeaseNames.PrimaryReplica"/> and runs coordination DDL plus identity bootstrap.
/// before the host starts; coordination tables in <c>public</c> are ensured again here before the bootstrap lease. /// Other instances wait until the database (and optional shared <c>init</c> marker under <see cref="Configuration.CertsUIEngineConfiguration.DataFolder"/>) shows bootstrap complete, then start without ACME privileges.
/// The bootstrap lease ensures only one replica writes against shared <see cref="Configuration.CertsUIEngineConfiguration.DataFolder"/>.
/// </summary> /// </summary>
public sealed class InitializationHostedService( public sealed class InitializationHostedService(
ILogger<InitializationHostedService> logger, ILogger<InitializationHostedService> logger,
IServiceProvider serviceProvider, IServiceProvider serviceProvider,
IOptions<Configuration> appSettings, IOptions<Configuration> appSettings,
IRuntimeLeaseService runtimeLease, PrimaryReplicaGate primaryGate
IRuntimeInstanceId runtimeInstance
) : IHostedService { ) : IHostedService {
private static readonly TimeSpan BootstrapLeaseTtl = TimeSpan.FromMinutes(8);
public async Task StartAsync(CancellationToken cancellationToken) { public async Task StartAsync(CancellationToken cancellationToken) {
const int delayMilliseconds = 2000; const int delayMilliseconds = 2000;
var appLifetime = serviceProvider.GetRequiredService<IHostApplicationLifetime>();
while (!cancellationToken.IsCancellationRequested) { while (!cancellationToken.IsCancellationRequested) {
try { try {
logger.LogInformation("Running startup initialization..."); logger.LogInformation("Running startup initialization (primary replica election)...");
var engineConfig = serviceProvider.GetRequiredService<ICertsEngineConfiguration>(); if (await primaryGate.TryAcquirePrimaryLeaseAsync(cancellationToken).ConfigureAwait(false)) {
await CoordinationTableProvisioner.EnsureAsync(engineConfig.ConnectionString, cancellationToken).ConfigureAwait(false); primaryGate.StartLeaseRenewal(appLifetime);
try {
var engineConfig = serviceProvider.GetRequiredService<ICertsEngineConfiguration>();
await CoordinationTableProvisioner.EnsureAsync(engineConfig.ConnectionString, cancellationToken).ConfigureAwait(false);
var holder = runtimeInstance.InstanceId; await using var scope = serviceProvider.CreateAsyncScope();
var acquired = await runtimeLease.TryAcquireAsync(RuntimeLeaseNames.Bootstrap, holder, BootstrapLeaseTtl, cancellationToken).ConfigureAwait(false); var identityDomainService = scope.ServiceProvider.GetRequiredService<IIdentityDomainService>();
if (!acquired.IsSuccess) await EnsureIdentityAsLeaderAsync(appSettings.Value, identityDomainService, cancellationToken).ConfigureAwait(false);
throw new InvalidOperationException(string.Join(", ", acquired.Messages ?? ["Lease acquire failed."])); }
if (!acquired.Value) { catch {
logger.LogInformation("Bootstrap lease held by another instance; waiting..."); await primaryGate.AbandonPrimaryAsync().ConfigureAwait(false);
await Task.Delay(delayMilliseconds, cancellationToken).ConfigureAwait(false); throw;
continue; }
primaryGate.EnablePrimaryWorkload();
logger.LogInformation("Startup initialization completed; this instance is the primary replica.");
return;
} }
try { await using (var followerScope = serviceProvider.CreateAsyncScope()) {
await using var scope = serviceProvider.CreateAsyncScope(); var identityFollower = followerScope.ServiceProvider.GetRequiredService<IIdentityDomainService>();
var identityDomainService = scope.ServiceProvider.GetRequiredService<IIdentityDomainService>(); var cfg = appSettings.Value;
await EnsureIdentityInitializedAsync(appSettings.Value, identityDomainService, cancellationToken).ConfigureAwait(false); while (!cancellationToken.IsCancellationRequested) {
} if (await IsClusterIdentityReadyAsync(cfg, identityFollower, cancellationToken).ConfigureAwait(false)) {
finally { logger.LogInformation("Startup initialization completed; this instance is a secondary replica.");
var released = await runtimeLease.ReleaseAsync(RuntimeLeaseNames.Bootstrap, holder, cancellationToken).ConfigureAwait(false); return;
if (!released.IsSuccess) }
logger.LogWarning("Bootstrap lease release reported failure: {Messages}", string.Join("; ", released.Messages ?? []));
logger.LogInformation("Waiting for primary replica to finish database bootstrap...");
await Task.Delay(delayMilliseconds, cancellationToken).ConfigureAwait(false);
}
} }
logger.LogInformation("Startup initialization completed."); cancellationToken.ThrowIfCancellationRequested();
return; }
catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) {
logger.LogInformation("Startup initialization canceled (host is stopping).");
throw;
} }
catch (Exception ex) { catch (Exception ex) {
if (cancellationToken.IsCancellationRequested) {
logger.LogInformation(ex, "Startup initialization aborted while stopping host.");
throw new OperationCanceledException("Host stopped during startup initialization.", ex, cancellationToken);
}
logger.LogError(ex, "Startup initialization failed. Retrying..."); logger.LogError(ex, "Startup initialization failed. Retrying...");
await Task.Delay(delayMilliseconds, cancellationToken).ConfigureAwait(false); try {
await Task.Delay(delayMilliseconds, cancellationToken).ConfigureAwait(false);
}
catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) {
logger.LogInformation("Startup initialization retry wait canceled (host is stopping).");
throw;
}
} }
} }
} }
public Task StopAsync(CancellationToken cancellationToken) => Task.CompletedTask; public Task StopAsync(CancellationToken cancellationToken) => Task.CompletedTask;
private static async Task EnsureIdentityInitializedAsync( private static async Task EnsureIdentityAsLeaderAsync(
Configuration appSettings, Configuration appSettings,
IIdentityDomainService identityDomainService, IIdentityDomainService identityDomainService,
CancellationToken cancellationToken CancellationToken cancellationToken
@ -89,4 +111,29 @@ public sealed class InitializationHostedService(
await File.WriteAllTextAsync(initPath, string.Empty, cancellationToken).ConfigureAwait(false); await File.WriteAllTextAsync(initPath, string.Empty, cancellationToken).ConfigureAwait(false);
} }
private static async Task<bool> IsClusterIdentityReadyAsync(
Configuration appSettings,
IIdentityDomainService identityDomainService,
CancellationToken cancellationToken
) {
var dataDir = appSettings.CertsUIEngineConfiguration.DataFolder;
if (!Directory.Exists(dataDir))
Directory.CreateDirectory(dataDir);
var initPath = Path.Combine(dataDir, "init");
if (File.Exists(initPath))
return true;
var count = await identityDomainService.CountUsersAsync(cancellationToken).ConfigureAwait(false);
if (!count.IsSuccess)
throw new InvalidOperationException(string.Join(", ", count.Messages));
if (count.Value > 0) {
await File.WriteAllTextAsync(initPath, string.Empty, cancellationToken).ConfigureAwait(false);
return true;
}
return false;
}
} }

View File

@ -0,0 +1,14 @@
using MaksIT.CertsUI.Infrastructure;
namespace MaksIT.CertsUI.HostedServices;
/// <summary>
/// Registered last so <see cref="IHostedService.StopAsync"/> runs first on shutdown: releases the primary Postgres lease and stops renewal.
/// </summary>
public sealed class PrimaryReplicaShutdownHostedService(PrimaryReplicaGate primaryGate) : IHostedService {
public Task StartAsync(CancellationToken cancellationToken) => Task.CompletedTask;
public async Task StopAsync(CancellationToken cancellationToken) =>
await primaryGate.AbandonPrimaryAsync().ConfigureAwait(false);
}

View File

@ -0,0 +1,121 @@
using Microsoft.Extensions.Hosting;
using MaksIT.CertsUI.Engine.Infrastructure;
using MaksIT.CertsUI.Engine.RuntimeCoordination;
namespace MaksIT.CertsUI.Infrastructure;
/// <summary>
/// Holds <see cref="RuntimeLeaseNames.PrimaryReplica"/> and renews it while this instance is leader.
/// <see cref="IPrimaryReplicaWorkload.IsPrimary"/> stays false until <see cref="EnablePrimaryWorkload"/> runs after successful startup bootstrap.
/// </summary>
public sealed class PrimaryReplicaGate(
IRuntimeLeaseService leaseService,
IRuntimeInstanceId runtimeInstance,
ILogger<PrimaryReplicaGate> logger
) : IPrimaryReplicaWorkload, IAsyncDisposable {
private static readonly TimeSpan PrimaryLeaseTtl = TimeSpan.FromSeconds(90);
private static readonly TimeSpan RenewInterval = TimeSpan.FromSeconds(30);
private readonly object _sync = new();
private CancellationTokenSource? _renewCts;
private Task? _renewalTask;
private string? _holderId;
private volatile bool _mayRunPrimaryWorkload;
public bool IsPrimary => _mayRunPrimaryWorkload;
/// <summary>Single attempt to insert/update the primary lease row for this holder.</summary>
public async Task<bool> TryAcquirePrimaryLeaseAsync(CancellationToken cancellationToken) {
var holder = runtimeInstance.InstanceId;
var acquired = await leaseService.TryAcquireAsync(RuntimeLeaseNames.PrimaryReplica, holder, PrimaryLeaseTtl, cancellationToken).ConfigureAwait(false);
if (!acquired.IsSuccess)
throw new InvalidOperationException(string.Join(", ", acquired.Messages ?? ["Primary lease acquire failed."]));
if (!acquired.Value)
return false;
lock (_sync) {
_holderId = holder;
_mayRunPrimaryWorkload = false;
}
return true;
}
/// <summary>After <see cref="TryAcquirePrimaryLeaseAsync"/> returned true, start renewal (call before long init).</summary>
public void StartLeaseRenewal(IHostApplicationLifetime applicationLifetime) {
lock (_sync) {
if (_holderId == null)
throw new InvalidOperationException("Cannot start renewal without an acquired primary lease.");
_renewCts?.Cancel();
_renewCts?.Dispose();
_renewCts = CancellationTokenSource.CreateLinkedTokenSource(applicationLifetime.ApplicationStopping);
var holder = _holderId;
var ct = _renewCts.Token;
_renewalTask = RenewalLoopAsync(holder, ct);
}
}
public void EnablePrimaryWorkload() => _mayRunPrimaryWorkload = true;
private async Task RenewalLoopAsync(string holderId, CancellationToken cancellationToken) {
try {
while (!cancellationToken.IsCancellationRequested) {
var renewed = await leaseService.TryAcquireAsync(RuntimeLeaseNames.PrimaryReplica, holderId, PrimaryLeaseTtl, cancellationToken).ConfigureAwait(false);
if (!renewed.IsSuccess || !renewed.Value) {
if (logger.IsEnabled(LogLevel.Warning))
logger.LogWarning("Primary replica lease was not renewed (success={Success}, acquired={Acquired}).", renewed.IsSuccess, renewed.Value);
_mayRunPrimaryWorkload = false;
return;
}
await Task.Delay(RenewInterval, cancellationToken).ConfigureAwait(false);
}
}
catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) {
// normal shutdown
}
catch (Exception ex) {
if (logger.IsEnabled(LogLevel.Error))
logger.LogError(ex, "Primary replica lease renewal loop failed.");
_mayRunPrimaryWorkload = false;
}
}
/// <summary>Release lease and stop renewal after failed leader bootstrap (instance stays usable for retry).</summary>
public async Task AbandonPrimaryAsync() {
_mayRunPrimaryWorkload = false;
Task? renewalToAwait;
CancellationTokenSource? cts;
string? holder;
lock (_sync) {
holder = _holderId;
_holderId = null;
cts = _renewCts;
_renewCts = null;
renewalToAwait = _renewalTask;
_renewalTask = null;
}
try {
cts?.Cancel();
if (renewalToAwait != null)
await renewalToAwait.ConfigureAwait(false);
}
catch (Exception ex) {
if (logger.IsEnabled(LogLevel.Debug))
logger.LogDebug(ex, "Primary renewal task did not end cleanly during abandon.");
}
finally {
cts?.Dispose();
}
if (holder != null) {
var released = await leaseService.ReleaseAsync(RuntimeLeaseNames.PrimaryReplica, holder, CancellationToken.None).ConfigureAwait(false);
if (!released.IsSuccess && logger.IsEnabled(LogLevel.Warning))
logger.LogWarning("Primary lease release (abandon): {Messages}", string.Join("; ", released.Messages ?? []));
}
}
public async ValueTask DisposeAsync() => await AbandonPrimaryAsync().ConfigureAwait(false);
}

View File

@ -1,7 +1,7 @@
<Project Sdk="Microsoft.NET.Sdk.Web"> <Project Sdk="Microsoft.NET.Sdk.Web">
<PropertyGroup> <PropertyGroup>
<Version>3.3.14</Version> <Version>3.3.17</Version>
<TargetFramework>net10.0</TargetFramework> <TargetFramework>net10.0</TargetFramework>
<Nullable>enable</Nullable> <Nullable>enable</Nullable>
<ImplicitUsings>enable</ImplicitUsings> <ImplicitUsings>enable</ImplicitUsings>

View File

@ -0,0 +1,26 @@
using MaksIT.Results;
using MaksIT.Results.Mvc;
using Microsoft.AspNetCore.Mvc;
namespace MaksIT.CertsUI.Mvc;
/// <summary>
/// Maps ACME domain results to HTTP: primary-replica required becomes 503 + <c>Retry-After</c> + ProblemDetails.
/// </summary>
public static class CertsFlowResultExtensions {
/// <summary>Default retry hint for clients and caches (seconds).</summary>
public const int DefaultPrimaryReplicaRetryAfterSeconds = 2;
public static IActionResult ToCertsFlowActionResult(this Result result) {
if (!result.IsSuccess && PrimaryReplicaRequiredObjectResult.IsPrimaryReplicaResult(result.Messages))
return PrimaryReplicaRequiredObjectResult.FromMessages(result.Messages, DefaultPrimaryReplicaRetryAfterSeconds);
return result.ToActionResult();
}
public static IActionResult ToCertsFlowActionResult<T>(this Result<T?> result) {
if (!result.IsSuccess && PrimaryReplicaRequiredObjectResult.IsPrimaryReplicaResult(result.Messages))
return PrimaryReplicaRequiredObjectResult.FromMessages(result.Messages, DefaultPrimaryReplicaRetryAfterSeconds);
return result.ToActionResult();
}
}

View File

@ -0,0 +1,39 @@
using MaksIT.CertsUI.Engine.DomainServices;
using Microsoft.AspNetCore.Mvc;
namespace MaksIT.CertsUI.Mvc;
/// <summary>
/// HTTP 503 with <c>Retry-After</c> (delay-seconds) and RFC 7807 <see cref="ProblemDetails"/> for primary-replica routing.
/// </summary>
internal sealed class PrimaryReplicaRequiredObjectResult : ObjectResult {
public PrimaryReplicaRequiredObjectResult(ProblemDetails problemDetails, int retryAfterSeconds) : base(problemDetails) {
ArgumentOutOfRangeException.ThrowIfLessThan(retryAfterSeconds, 1);
StatusCode = StatusCodes.Status503ServiceUnavailable;
DeclaredType = typeof(ProblemDetails);
RetryAfterSeconds = retryAfterSeconds;
}
public int RetryAfterSeconds { get; }
public override Task ExecuteResultAsync(ActionContext context) {
context.HttpContext.Response.Headers.RetryAfter = RetryAfterSeconds.ToString(System.Globalization.NumberFormatInfo.InvariantInfo);
return base.ExecuteResultAsync(context);
}
internal static bool IsPrimaryReplicaResult(IReadOnlyList<string>? messages) =>
messages is { Count: > 0 } && string.Equals(messages[0], CertsFlowPrimaryReplica.DiagnosticMarker, StringComparison.Ordinal);
internal static IActionResult FromMessages(IReadOnlyList<string>? messages, int retryAfterSeconds) {
var detail = (messages is { Count: > 1 } ? messages[1] : null) ?? "Only the primary replica runs this operation.";
var pd = new ProblemDetails {
Status = StatusCodes.Status503ServiceUnavailable,
Title = "Primary replica required",
Detail = detail,
Type = CertsFlowPrimaryReplica.DiagnosticMarker,
};
pd.Extensions["retryAfterSeconds"] = retryAfterSeconds;
return new PrimaryReplicaRequiredObjectResult(pd, retryAfterSeconds);
}
}

View File

@ -67,9 +67,14 @@ builder.Services.AddOptions<JsonOptions>().Configure(o =>
builder.Services.AddScoped<JwtAuthorizationFilter>(); builder.Services.AddScoped<JwtAuthorizationFilter>();
builder.Services.AddScoped<JwtOrApiKeyAuthorizationFilter>(); builder.Services.AddScoped<JwtOrApiKeyAuthorizationFilter>();
// Primary replica: one elected instance (Postgres lease) runs ACME + renewal; register shutdown last so StopAsync releases the lease first.
builder.Services.AddSingleton<PrimaryReplicaGate>();
builder.Services.AddSingleton<IPrimaryReplicaWorkload>(sp => sp.GetRequiredService<PrimaryReplicaGate>());
// Hosted services: initialization first, then autorenewal loop. // Hosted services: initialization first, then autorenewal loop.
builder.Services.AddHostedService<InitializationHostedService>(); builder.Services.AddHostedService<InitializationHostedService>();
builder.Services.AddHostedService<AutoRenewal>(); builder.Services.AddHostedService<AutoRenewal>();
builder.Services.AddHostedService<PrimaryReplicaShutdownHostedService>();
// PostgreSQL: prefer Configuration:CertsUIEngineConfiguration:ConnectionString in appsecrets.json; fallback ConnectionStrings:Certs for older files. // PostgreSQL: prefer Configuration:CertsUIEngineConfiguration:ConnectionString in appsecrets.json; fallback ConnectionStrings:Certs for older files.
var certsConnectionString = appSettings.CertsUIEngineConfiguration.ConnectionString var certsConnectionString = appSettings.CertsUIEngineConfiguration.ConnectionString

View File

@ -32,6 +32,8 @@ Optional per workload under **`components.<name>`**: **`replicaCount`** (default
When **`replicaCount` > 1**, the chart creates a **PodDisruptionBudget** (`minAvailable: 1`) for that component. When **`replicaCount` > 1**, the chart creates a **PodDisruptionBudget** (`minAvailable: 1`) for that component.
**Primary replica + ACME:** With multiple **server** pods, exactly one holds the Postgres lease `certs-ui-primary` and runs ACME orchestration (`CertsFlowDomainService`, renewal). Others answer **`AcmeChallengeAsync`** from the database for HTTP-01. Interactive UI flows should hit the primary: the chart defaults **`ClientIP`** session affinity on the **server** Service, and clients should retry on **503** (see `Retry-After` / `ProblemDetails`). After unclean failover, the old lease row can linger until its TTL (~90s with defaults); renewals and clean shutdown avoid stuck primaries.
**Server + RWO PVCs:** the default **acme** / **data** volumes use **ReadWriteOnce**. Kubernetes will not schedule a second server pod on the same volume; for multiple server replicas you need **ReadWriteMany** (or equivalent) and an application design that tolerates shared disk (see product HA roadmap). **Server + RWO PVCs:** the default **acme** / **data** volumes use **ReadWriteOnce**. Kubernetes will not schedule a second server pod on the same volume; for multiple server replicas you need **ReadWriteMany** (or equivalent) and an application design that tolerates shared disk (see product HA roadmap).
------------------------------------------------------------ ------------------------------------------------------------

View File

@ -54,6 +54,9 @@ spec:
{{- end }} {{- end }}
certs-ui.io/image: {{ printf "%s/%s:%s" $comp.image.registry $comp.image.repository $imageTag | quote }} certs-ui.io/image: {{ printf "%s/%s:%s" $comp.image.registry $comp.image.repository $imageTag | quote }}
spec: spec:
{{- with $comp.terminationGracePeriodSeconds }}
terminationGracePeriodSeconds: {{ . }}
{{- end }}
{{- include "certs-ui.imagePullSecrets" $root | nindent 6 }} {{- include "certs-ui.imagePullSecrets" $root | nindent 6 }}
containers: containers:
- name: {{ $compName }} - name: {{ $compName }}
@ -84,6 +87,10 @@ spec:
{{- end }} {{- end }}
{{- with $comp.resources }} {{- with $comp.resources }}
resources: resources:
{{- toYaml . | nindent 12 }}
{{- end }}
{{- with $comp.lifecycle }}
lifecycle:
{{- toYaml . | nindent 12 }} {{- toYaml . | nindent 12 }}
{{- end }} {{- end }}
{{- $p := default dict $comp.persistence -}} {{- $p := default dict $comp.persistence -}}

View File

@ -13,6 +13,13 @@ metadata:
app.kubernetes.io/component: {{ $compName }} app.kubernetes.io/component: {{ $compName }}
spec: spec:
type: {{ default "ClusterIP" $svc.type }} type: {{ default "ClusterIP" $svc.type }}
{{- $sa := default dict $svc.sessionAffinity }}
{{- if $sa.enabled }}
sessionAffinity: ClientIP
sessionAffinityConfig:
clientIP:
timeoutSeconds: {{ default 10800 $sa.clientIPTimeoutSeconds }}
{{- end }}
ports: ports:
- port: {{ default 80 $svc.port }} - port: {{ default 80 $svc.port }}
targetPort: http targetPort: http

View File

@ -83,6 +83,16 @@ components:
type: ClusterIP type: ClusterIP
port: 5000 port: 5000
targetPort: 5000 targetPort: 5000
# ClientIP affinity helps browsers hit the same server pod for multi-step ACME (primary holds orchestration).
sessionAffinity:
enabled: true
clientIPTimeoutSeconds: 10800
# Give kube-proxy / ingress time to stop sending new connections before SIGKILL (pairs with preStop).
terminationGracePeriodSeconds: 90
lifecycle:
preStop:
exec:
command: ["/bin/sh", "-c", "sleep 5"]
persistence: persistence:
storageClass: local-path storageClass: local-path
volumes: volumes: