(bugfix): coordination table provisioner fixes

This commit is contained in:
Maksym Sadovnychyy 2026-04-26 11:52:42 +02:00
parent bbd6fc5617
commit 86a31999bf
7 changed files with 73 additions and 48 deletions

View File

@ -4,6 +4,18 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [3.3.13] - 2026-04-26
### Fixed
- **HA lease / `42P01`:** Added `CoordinationTableProvisioner` with explicit `public.*` DDL; `InitializationHostedService` calls it immediately before acquiring the bootstrap lease (idempotent, the same DDL as the post-migrate repair). `RuntimeLeaseServiceNpgsql` now uses `public.app_runtime_leases` in SQL so a non-default `search_path` cannot miss the table. Post-migrate verification requires `public.app_runtime_leases` plus `users` or `"VersionInfo"`.
### Upgrade notes (Kubernetes / Helm)
- **Pin container tags to the app semver** (e.g. `3.3.13` for server, client, reverseproxy) via `global.image.tag` and/or `components.*.image.tag`. The chart resolves the effective tag with `global.image.tag` when set (see `src/helm/templates/_helpers.tpl`).
- **Do not rely on `latest` + `imagePullPolicy: IfNotPresent` alone** — nodes keep the first pulled digest, so you can run an old server binary while the OCI chart is already `3.3.13`. Use an explicit semver tag and/or `pullPolicy: Always` (or bump `global.rolloutNonce` / `global.rollme` per chart NOTES) when upgrading.
- **Push all three images** for the tag you pin (`certs-ui/server`, `certs-ui/client`, `certs-ui/reverseproxy`) so every deployment can pull successfully.
## [3.3.12] - 2026-04-26
### Fixed

View File

@ -0,0 +1,38 @@
using Npgsql;
namespace MaksIT.CertsUI.Engine.Infrastructure;
/// <summary>
/// Idempotent DDL for HA coordination tables in schema <c>public</c> (same shape as the AcmeChallengesAndRuntimeLeases migration). Used after FluentMigrator and again before bootstrap lease
/// so <see cref="RuntimeLeaseServiceNpgsql"/> never runs against a missing <c>app_runtime_leases</c>.
/// </summary>
/// <remarks>
/// The DDL runs inside a transaction guarded by a transaction-level advisory lock, so callers that
/// execute it at the same time (for example several replicas starting together) are serialized.
/// </remarks>
public static class CoordinationTableProvisioner {
    /// <summary>
    /// Application-chosen key for <c>pg_advisory_xact_lock</c>. The value is arbitrary (ASCII "CERTSUI1");
    /// it only has to be identical for every caller that provisions these tables.
    /// </summary>
    private const long ProvisioningLockKey = 0x4345525453554931;

    /// <summary>Creates <c>public.acme_http_challenges</c> and <c>public.app_runtime_leases</c> if missing.</summary>
    /// <param name="connectionString">
    /// Npgsql connection string of the target database. When null, empty or whitespace the method returns
    /// without opening a connection.
    /// </param>
    /// <param name="cancellationToken">Token forwarded to every database call.</param>
    /// <returns>A task that completes once the DDL transaction has been committed.</returns>
    public static async Task EnsureAsync(string? connectionString, CancellationToken cancellationToken = default) {
        // Deliberate no-op: nothing to provision without a connection string.
        if (string.IsNullOrWhiteSpace(connectionString))
            return;

        await using var conn = new NpgsqlConnection(connectionString);
        await conn.OpenAsync(cancellationToken).ConfigureAwait(false);

        // PostgreSQL DDL is transactional: either every object below exists after commit, or none was added.
        await using var tx = await conn.BeginTransactionAsync(cancellationToken).ConfigureAwait(false);

        // CREATE TABLE IF NOT EXISTS can still fail when two sessions create the same table concurrently.
        // The advisory lock makes concurrent callers wait here; it is released automatically when the
        // transaction ends (commit or rollback).
        await using (var lockCmd = new NpgsqlCommand("SELECT pg_advisory_xact_lock(@lock_key);", conn, tx)) {
            lockCmd.Parameters.AddWithValue("lock_key", ProvisioningLockKey);
            await lockCmd.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
        }

        await using (var ddlCmd = new NpgsqlCommand(
            """
            CREATE TABLE IF NOT EXISTS public.acme_http_challenges (
            file_name text NOT NULL PRIMARY KEY,
            token_value text NOT NULL,
            created_at_utc timestamp with time zone NOT NULL
            );
            CREATE INDEX IF NOT EXISTS "IX_acme_http_challenges_created_at_utc" ON public.acme_http_challenges (created_at_utc);
            CREATE TABLE IF NOT EXISTS public.app_runtime_leases (
            lease_name text NOT NULL PRIMARY KEY,
            holder_id text NOT NULL,
            version bigint NOT NULL DEFAULT 1,
            acquired_at_utc timestamp with time zone NOT NULL,
            expires_at_utc timestamp with time zone NOT NULL
            );
            """,
            conn,
            tx)) {
            await ddlCmd.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
        }

        await tx.CommitAsync(cancellationToken).ConfigureAwait(false);
    }
}

View File

@ -38,7 +38,7 @@ public sealed class RunMigrationsService(
await EnsureDatabaseExistsAsync(cancellationToken).ConfigureAwait(false);
await BaselineExistingEfDatabaseAsync(cancellationToken).ConfigureAwait(false);
await Task.Run(() => migrationRunner.MigrateUp(), cancellationToken).ConfigureAwait(false);
await EnsureCoordinationTablesAsync(cancellationToken).ConfigureAwait(false);
await CoordinationTableProvisioner.EnsureAsync(config.ConnectionString, cancellationToken).ConfigureAwait(false);
await VerifyCoreSchemaAsync(cancellationToken).ConfigureAwait(false);
logger.LogInformation("Certs database migrations completed.");
}
@ -50,12 +50,12 @@ public sealed class RunMigrationsService(
await using var cmd = new NpgsqlCommand(
"""
SELECT EXISTS (
SELECT 1 FROM information_schema.tables
WHERE table_schema = 'public' AND table_name = 'users')
OR EXISTS (
SELECT 1 FROM information_schema.tables
WHERE table_schema = 'public' AND table_name = 'VersionInfo');
SELECT
EXISTS (SELECT 1 FROM information_schema.tables WHERE table_schema = 'public' AND table_name = 'app_runtime_leases')
AND (
EXISTS (SELECT 1 FROM information_schema.tables WHERE table_schema = 'public' AND table_name = 'users')
OR EXISTS (SELECT 1 FROM information_schema.tables WHERE table_schema = 'public' AND table_name = 'VersionInfo')
);
""",
conn);
@ -64,37 +64,8 @@ public sealed class RunMigrationsService(
return;
throw new InvalidOperationException(
"After FluentMigrator MigrateUp(), the target database still has no \"users\" or \"VersionInfo\" table in schema \"public\". " +
"Confirm the connection string Database= value, that the role can CREATE TABLE, and that FluentMigrator is not in preview/connectionless mode (non-empty connection string).");
}
/// <summary>
/// Idempotent DDL for HA tables from <see cref="AcmeChallengesAndRuntimeLeases"/>.
/// When <c>VersionInfo</c> already lists that migration but the tables are missing (restore drift, partial apply),
/// FluentMigrator will not re-run <c>Up()</c>; this repair keeps lease and HTTP-01 persistence working.
/// </summary>
private async Task EnsureCoordinationTablesAsync(CancellationToken cancellationToken) {
await using var conn = new NpgsqlConnection(config.ConnectionString);
await conn.OpenAsync(cancellationToken).ConfigureAwait(false);
await using var cmd = new NpgsqlCommand(
"""
CREATE TABLE IF NOT EXISTS acme_http_challenges (
file_name text NOT NULL PRIMARY KEY,
token_value text NOT NULL,
created_at_utc timestamp with time zone NOT NULL
);
CREATE INDEX IF NOT EXISTS "IX_acme_http_challenges_created_at_utc" ON acme_http_challenges (created_at_utc);
CREATE TABLE IF NOT EXISTS app_runtime_leases (
lease_name text NOT NULL PRIMARY KEY,
holder_id text NOT NULL,
version bigint NOT NULL DEFAULT 1,
acquired_at_utc timestamp with time zone NOT NULL,
expires_at_utc timestamp with time zone NOT NULL
);
""",
conn);
await cmd.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
"After migrations and coordination DDL, schema \"public\" is missing \"app_runtime_leases\" and/or core tables (\"users\" / \"VersionInfo\"). " +
"Confirm Database= in the connection string, role CREATE privileges, and that FluentMigrator committed (non-empty connection string).");
}
private async Task EnsureDatabaseExistsAsync(CancellationToken cancellationToken) {

View File

@ -29,15 +29,15 @@ public sealed class RuntimeLeaseServiceNpgsql(
await using var cmd = new NpgsqlCommand(
"""
INSERT INTO app_runtime_leases (lease_name, holder_id, version, acquired_at_utc, expires_at_utc)
INSERT INTO public.app_runtime_leases (lease_name, holder_id, version, acquired_at_utc, expires_at_utc)
VALUES (@name, @holder, 1, @acquired, @expires)
ON CONFLICT (lease_name) DO UPDATE
SET holder_id = EXCLUDED.holder_id,
version = app_runtime_leases.version + 1,
version = public.app_runtime_leases.version + 1,
acquired_at_utc = EXCLUDED.acquired_at_utc,
expires_at_utc = EXCLUDED.expires_at_utc
WHERE app_runtime_leases.expires_at_utc < EXCLUDED.acquired_at_utc
OR app_runtime_leases.holder_id = EXCLUDED.holder_id
WHERE public.app_runtime_leases.expires_at_utc < EXCLUDED.acquired_at_utc
OR public.app_runtime_leases.holder_id = EXCLUDED.holder_id
RETURNING holder_id;
""",
conn);
@ -72,7 +72,7 @@ public sealed class RuntimeLeaseServiceNpgsql(
await using var cmd = new NpgsqlCommand(
"""
DELETE FROM app_runtime_leases
DELETE FROM public.app_runtime_leases
WHERE lease_name = @name AND holder_id = @holder;
""",
conn);

View File

@ -1,4 +1,5 @@
using Microsoft.Extensions.Options;
using MaksIT.CertsUI.Engine;
using MaksIT.CertsUI.Engine.DomainServices;
using MaksIT.CertsUI.Engine.Infrastructure;
using MaksIT.CertsUI.Engine.RuntimeCoordination;
@ -7,8 +8,8 @@ namespace MaksIT.CertsUI.HostedServices;
/// <summary>
/// Runs identity bootstrap before the API starts serving requests. FluentMigrator already ran in <c>Program.cs</c>
/// before the host starts. The bootstrap lease ensures only one replica writes against shared
/// <see cref="Configuration.CertsUIEngineConfiguration.DataFolder"/>.
/// before the host starts; coordination tables in <c>public</c> are ensured again here before the bootstrap lease.
/// The bootstrap lease ensures only one replica writes against shared <see cref="Configuration.CertsUIEngineConfiguration.DataFolder"/>.
/// </summary>
public sealed class InitializationHostedService(
ILogger<InitializationHostedService> logger,
@ -27,6 +28,9 @@ public sealed class InitializationHostedService(
try {
logger.LogInformation("Running startup initialization...");
var engineConfig = serviceProvider.GetRequiredService<ICertsEngineConfiguration>();
await CoordinationTableProvisioner.EnsureAsync(engineConfig.ConnectionString, cancellationToken).ConfigureAwait(false);
var holder = runtimeInstance.InstanceId;
var acquired = await runtimeLease.TryAcquireAsync(RuntimeLeaseNames.Bootstrap, holder, BootstrapLeaseTtl, cancellationToken).ConfigureAwait(false);
if (!acquired.IsSuccess)

View File

@ -1,7 +1,7 @@
<Project Sdk="Microsoft.NET.Sdk.Web">
<PropertyGroup>
<Version>3.3.12</Version>
<Version>3.3.13</Version>
<TargetFramework>net10.0</TargetFramework>
<Nullable>enable</Nullable>
<ImplicitUsings>enable</ImplicitUsings>

View File

@ -1,8 +1,8 @@
global:
imagePullSecrets: []
image:
# Uncomment to override every component (global wins when set). Otherwise use each components.*.image.
# No Chart.appVersion.
# When non-empty, overrides every components.*.image.tag (see _helpers.tpl). Production/staging: pin to released
# semver (e.g. 3.3.13) and use pullPolicy Always or bump tag each release — do not rely on :latest + IfNotPresent alone.
# tag: "latest"
# pullPolicy: IfNotPresent
# Optional rollout tuning (see NOTES): pin a fixed pod annotation or add a nonce for frozen/git-rendered manifests.