< Summary

Information
Class: Orchestrator.Commands.Observability.ExportExperimentAnalysis.ExportExperimentAnalysisCommand.RunContext
Assembly: Orchestrator
File(s): /home/runner/work/KicktippAi/KicktippAi/src/Orchestrator/Commands/Observability/ExportExperimentAnalysis/ExportExperimentAnalysisCommand.cs
Line coverage
100%
Covered lines: 5
Uncovered lines: 0
Coverable lines: 5
Total lines: 950
Line coverage: 100%
Branch coverage
N/A
Covered branches: 0
Total branches: 0
Branch coverage: N/A
Method coverage

Feature is only available for sponsors

Upgrade to PRO version

Metrics

Method | Branch coverage | Crap Score | Cyclomatic complexity | Line coverage
.ctor(...) | 100% | 1 | 1 | 100%

File(s)

/home/runner/work/KicktippAi/KicktippAi/src/Orchestrator/Commands/Observability/ExportExperimentAnalysis/ExportExperimentAnalysisCommand.cs

# | Line | Line coverage
 1using System.Globalization;
 2using System.Text.Json;
 3using EHonda.KicktippAi.Core;
 4using CursorBasedComposite = EHonda.Pagination.CursorBased.Composite;
 5using EHonda.Pagination.OffsetBased;
 6using OffsetBasedComposite = EHonda.Pagination.OffsetBased.Composite;
 7using Microsoft.Extensions.Logging;
 8using Orchestrator.Commands.Observability.Experiments;
 9using Orchestrator.Infrastructure.Langfuse;
 10using Spectre.Console;
 11using Spectre.Console.Cli;
 12
 13namespace Orchestrator.Commands.Observability.ExportExperimentAnalysis;
 14
 15public sealed class ExportExperimentAnalysisCommand : AsyncCommand<ExportExperimentAnalysisSettings>
 16{
 17    private readonly IAnsiConsole _console;
 18    private readonly ILangfusePublicApiClient _langfuseClient;
 19    private readonly ILogger<ExportExperimentAnalysisCommand> _logger;
 20
 21    public ExportExperimentAnalysisCommand(
 22        IAnsiConsole console,
 23        ILangfusePublicApiClient langfuseClient,
 24        ILogger<ExportExperimentAnalysisCommand> logger)
 25    {
           // Stores the injected console, Langfuse API client and logger in the
           // read-only fields used by the rest of the command.
 26        _console = console;
 27        _langfuseClient = langfuseClient;
 28        _logger = logger;
 29    }
 30
 31    protected override async Task<int> ExecuteAsync(CommandContext context, ExportExperimentAnalysisSettings settings, C
 32    {
           // Command entry point. Loads the dataset and every requested run, checks that the
           // runs are comparable, builds the analysis bundle, writes it as JSON to the resolved
           // output path and prints a JSON summary to the console.
           // Returns 0 on success; any exception is logged, printed and mapped to exit code 1.
 33        try
 34        {
 35            var runNames = settings.GetParsedRunNames();
 36            var runContexts = new List<RunContext>();
 37
 38            PreparedExperimentSupport.ReportProgress(
 39                $"Exporting experiment analysis for dataset '{settings.DatasetName}' across {runNames.Count} run(s).");
 40            _logger.LogInformation(
 41                "Exporting experiment analysis for dataset {DatasetName} across {RunCount} runs.",
 42                settings.DatasetName,
 43                runNames.Count);
 44            var dataset = await _langfuseClient.GetDatasetAsync(settings.DatasetName, cancellationToken)
 45                          ?? throw new InvalidOperationException($"Dataset '{settings.DatasetName}' could not be found."
 46
               // Per run: fetch the run, parse its metadata, list all run items (rejecting
               // duplicate dataset item ids) and load its aggregate scores.
 47            for (var runIndex = 0; runIndex < runNames.Count; runIndex += 1)
 48            {
 49                var runName = runNames[runIndex];
 50                PreparedExperimentSupport.ReportProgress(
 51                    $"Loading run {runIndex + 1}/{runNames.Count}: '{runName}'.");
 52                var datasetRun = await _langfuseClient.GetDatasetRunAsync(settings.DatasetName, runName, cancellationTok
 53                    ?? throw new InvalidOperationException(
 54                        $"Dataset run '{runName}' could not be found in dataset '{settings.DatasetName}'.");
 55
 56                var runMetadata = DeserializeRunMetadata(datasetRun.Metadata, runName);
 57                var datasetRunItems = await ListAllDatasetRunItemsAsync(datasetRun, cancellationToken);
 58                EnsureDistinctDatasetItems(datasetRunItems, runName);
 59                var aggregateScores = await LoadAggregateScoresAsync(datasetRun.Id, runMetadata, cancellationToken);
 60
 61                runContexts.Add(new RunContext(datasetRun, runMetadata, datasetRunItems, aggregateScores));
 62            }
 63
               // Throws unless there is at least one run and all runs share task type and item set.
 64            ValidateComparableRuns(runContexts);
 65
 66            PreparedExperimentSupport.ReportProgress("Loading dataset items for the comparable run set.");
 67            var datasetItemsById = await LoadDatasetItemsAsync(settings.DatasetName, runContexts, cancellationToken);
 68            PreparedExperimentSupport.ReportProgress("Loading traces and observations for the comparable run set.");
 69            var tracesById = await LoadTracesAsync(runContexts, cancellationToken);
 70            PreparedExperimentSupport.ReportProgress("Building normalized analysis rows.");
 71            var rows = BuildRows(runContexts, datasetItemsById, tracesById);
 72            var bundle = BuildBundle(settings.DatasetName, dataset, runContexts, rows);
 73
 74            var outputPath = ResolveOutputPath(settings, bundle.TaskType, settings.DatasetName);
               // NOTE(review): if ResolveOutputPath can return a bare file name or a root path,
               // GetDirectoryName yields "" or null and CreateDirectory would throw - confirm it
               // always returns a path that has a directory component.
 75            Directory.CreateDirectory(Path.GetDirectoryName(outputPath)!);
 76            await File.WriteAllTextAsync(
 77                outputPath,
 78                JsonSerializer.Serialize(bundle, PreparedExperimentCommandSupport.JsonOptions),
 79                cancellationToken);
 80
 81            PreparedExperimentSupport.ReportProgress(
 82                $"Wrote experiment analysis bundle with {bundle.Rows.Count} row(s) to '{outputPath}'.");
 83
               // Summary object serialized to the console as the command's final output.
 84            var summary = new
 85            {
 86                settings.DatasetName,
 87                bundle.TaskType,
 88                bundle.PrimaryMetricName,
 89                runCount = bundle.Runs.Count,
 90                rowCount = bundle.Rows.Count,
 91                outputPath
 92            };
 93
 94            _console.WriteLine(JsonSerializer.Serialize(summary, PreparedExperimentCommandSupport.JsonOptions));
 95            return 0;
 96        }
           // NOTE(review): this handler also catches OperationCanceledException, so a cancelled
           // export is reported as an error with exit code 1 - confirm that is intended.
 97        catch (Exception ex)
 98        {
 99            _logger.LogError(ex, "Error exporting experiment analysis bundle");
 100            _console.MarkupLine($"[red]Error:[/] {Markup.Escape(ex.Message)}");
 101            return 1;
 102        }
 103    }
 104
 105    private async Task<Dictionary<string, LangfuseDatasetItem>> LoadDatasetItemsAsync(
 106        string datasetName,
 107        IReadOnlyList<RunContext> runContexts,
 108        CancellationToken cancellationToken)
 109    {
            // Pages through the dataset's items (100 per page) and keeps only those referenced
            // by the given runs, keyed by item id. Throws InvalidOperationException when a
            // referenced item id is never returned by the listing.
 110        const int pageSize = 100;
            // NOTE(review): ToHashSet already de-duplicates; the preceding Distinct looks redundant.
 111        var requiredDatasetItemIds = runContexts
 112            .SelectMany(context => context.DatasetRunItems.Select(item => item.DatasetItemId))
 113            .Distinct(StringComparer.Ordinal)
 114            .ToHashSet(StringComparer.Ordinal);
 115        var result = new Dictionary<string, LangfuseDatasetItem>(StringComparer.Ordinal);
 116
            // Nothing is referenced, so skip the API listing entirely.
 117        if (requiredDatasetItemIds.Count == 0)
 118        {
 119            return result;
 120        }
 121
 122        await foreach (var datasetItem in EnumerateOffsetPaginatedItemsAsync(
 123                           (page, ct) => _langfuseClient.ListDatasetItemsAsync(
 124                               new LangfuseListDatasetItemsRequest(DatasetName: datasetName, Page: page, Limit: pageSize
 125                               ct),
 126                           cancellationToken))
 127        {
 128            if (requiredDatasetItemIds.Contains(datasetItem.Id))
 129            {
 130                result[datasetItem.Id] = datasetItem;
 131            }
 132
                // Stop enumerating as soon as every required item has been collected.
 133            if (result.Count == requiredDatasetItemIds.Count)
 134            {
 135                break;
 136            }
 137        }
 138
 139        var missingDatasetItemIds = requiredDatasetItemIds
 140            .Except(result.Keys, StringComparer.Ordinal)
 141            .ToArray();
 142        if (missingDatasetItemIds.Length > 0)
 143        {
 144            throw new InvalidOperationException(
 145                $"Dataset item(s) could not be loaded from Langfuse: {string.Join(", ", missingDatasetItemIds)}.");
 146        }
 147
 148        _logger.LogInformation(
 149            "Loaded {Count} dataset items for dataset {DatasetName} via paginated dataset listing.",
 150            result.Count,
 151            datasetName);
 152
 153        return result;
 154    }
 155
 156    private async Task<Dictionary<string, LangfuseTraceWithDetails>> LoadTracesAsync(
 157        IReadOnlyList<RunContext> runContexts,
 158        CancellationToken cancellationToken)
 159    {
            // For every run, lists the run's traces and observations (both looked up by run
            // name) and combines them into one trace-with-observations entry per trace id
            // referenced by the run's items. Throws when a referenced trace is not returned.
 160        var result = new Dictionary<string, LangfuseTraceWithDetails>(StringComparer.Ordinal);
 161
 162        foreach (var runContext in runContexts)
 163        {
 164            var expectedTraceIds = runContext.DatasetRunItems
 165                .Select(item => item.TraceId)
 166                .ToHashSet(StringComparer.Ordinal);
 167            var traces = await ListRunTracesAsync(runContext.DatasetRun.Name, expectedTraceIds, cancellationToken);
 168            var observationsByTraceId = await ListRunObservationsAsync(runContext.DatasetRun.Name, expectedTraceIds, can
 169
 170            foreach (var traceId in expectedTraceIds)
 171            {
 172                if (!traces.TryGetValue(traceId, out var trace))
 173                {
 174                    throw new InvalidOperationException($"Trace '{traceId}' could not be loaded from Langfuse.");
 175                }
 176
                    // Rebuild the trace with the listed observations attached; a trace without
                    // any listed observations gets an empty list. The fifth argument is passed
                    // as an empty collection - its meaning is not visible in this file section.
 177                result[traceId] = new LangfuseTraceWithDetails(
 178                    trace.Id,
 179                    trace.Name,
 180                    trace.Metadata,
 181                    trace.Output,
 182                    [],
 183                    observationsByTraceId.TryGetValue(traceId, out var observations) ? observations : [],
 184                    trace.Tags);
 185            }
 186        }
 187
 188        _logger.LogInformation(
 189            "Loaded {Count} traces for comparable export using per-run session batching.",
 190            result.Count);
 191
 192        return result;
 193    }
 194
 195    private async Task<Dictionary<string, LangfuseTraceWithDetails>> ListRunTracesAsync(
 196        string runName,
 197        IReadOnlySet<string> expectedTraceIds,
 198        CancellationToken cancellationToken)
 199    {
            // Pages through the traces whose SessionId equals the run name (100 per page) and
            // returns those whose id is in expectedTraceIds, keyed by trace id.
            // Missing traces are not an error here; only the loaded/expected counts are logged.
 200        const int pageSize = 100;
 201        var result = new Dictionary<string, LangfuseTraceWithDetails>(StringComparer.Ordinal);
 202
 203        if (expectedTraceIds.Count == 0)
 204        {
 205            return result;
 206        }
 207
 208        await foreach (var trace in EnumerateOffsetPaginatedItemsAsync(
 209                           (page, ct) => _langfuseClient.ListTracesAsync(
 210                               new LangfuseListTracesRequest(SessionId: runName, Page: page, Limit: pageSize, Fields: "i
 211                               ct),
 212                           cancellationToken))
 213        {
 214            if (expectedTraceIds.Contains(trace.Id))
 215            {
 216                result[trace.Id] = trace;
 217            }
 218
                // Stop enumerating once every expected trace has been seen.
 219            if (result.Count == expectedTraceIds.Count)
 220            {
 221                break;
 222            }
 223        }
 224
 225        _logger.LogInformation(
 226            "Loaded {LoadedCount}/{ExpectedCount} trace shells for run {RunName} via sessionId listing.",
 227            result.Count,
 228            expectedTraceIds.Count,
 229            runName);
 230
 231        return result;
 232    }
 233
 234    private async Task<Dictionary<string, IReadOnlyList<LangfuseObservationDetail>>> ListRunObservationsAsync(
 235        string runName,
 236        IReadOnlySet<string> expectedTraceIds,
 237        CancellationToken cancellationToken)
 238    {
            // Enumerates the observations whose SessionId equals the run name via cursor
            // pagination (limit 1000 per request) and groups those belonging to an expected
            // trace by trace id. Traces without observations have no entry in the result.
 239        const int limit = 1000;
 240        var observationsByTraceId = new Dictionary<string, List<LangfuseObservationDetail>>(StringComparer.Ordinal);
 241
            // No expected traces: return the (empty) grouping without calling the API.
 242        if (expectedTraceIds.Count == 0)
 243        {
 244            return observationsByTraceId.ToDictionary(
 245                pair => pair.Key,
 246                pair => (IReadOnlyList<LangfuseObservationDetail>)pair.Value,
 247                StringComparer.Ordinal);
 248        }
 249
 250        await foreach (var observation in EnumerateCursorPaginatedItemsAsync(
 251                           (cursor, ct) => _langfuseClient.ListObservationsAsync(
 252                               new LangfuseListObservationsRequest(SessionId: runName, Limit: limit, Cursor: cursor, Fie
 253                               ct),
 254                           cancellationToken))
 255        {
                // Ignore observations that belong to traces outside this run's item set.
 256            if (!expectedTraceIds.Contains(observation.TraceId))
 257            {
 258                continue;
 259            }
 260
 261            if (!observationsByTraceId.TryGetValue(observation.TraceId, out var observations))
 262            {
 263                observations = [];
 264                observationsByTraceId[observation.TraceId] = observations;
 265            }
 266
 267            observations.Add(observation);
 268        }
 269
 270        _logger.LogInformation(
 271            "Loaded observations for {TraceCount} traces in run {RunName} via sessionId observation listing.",
 272            observationsByTraceId.Count,
 273            runName);
 274
            // Re-project so callers receive read-only list values.
 275        return observationsByTraceId.ToDictionary(
 276            pair => pair.Key,
 277            pair => (IReadOnlyList<LangfuseObservationDetail>)pair.Value,
 278            StringComparer.Ordinal);
 279    }
 280
 281    private static PreparedExperimentAnalysisBundle BuildBundle(
 282        string datasetName,
 283        LangfuseDataset dataset,
 284        IReadOnlyList<RunContext> runContexts,
 285        IReadOnlyList<PreparedExperimentAnalysisRow> rows)
 286    {
            // Assembles the export bundle: one summary per run (ordered by run name, ordinal)
            // plus the already-built rows. The task type is taken from the first run; indexing
            // runContexts[0] relies on the caller having rejected an empty run list beforehand
            // (ExecuteAsync calls ValidateComparableRuns before this method).
 287        var taskType = runContexts[0].RunMetadata.TaskType ?? throw new InvalidOperationException("Run metadata missing 
 288        var primaryMetricName = ResolvePrimaryMetricName(taskType);
 289        var runSummaries = runContexts.Select(context => new PreparedExperimentAnalysisRun(
 290                context.DatasetRun.Name,
 291                context.DatasetRun.Id,
 292                taskType,
 293                ResolveRunDisplayName(context.RunMetadata, context.DatasetRun.Name),
 294                context.RunMetadata.PromptKey,
 295                context.RunMetadata.PromptSource,
 296                context.RunMetadata.LangfusePromptName,
 297                context.RunMetadata.LangfusePromptLabel,
 298                context.RunMetadata.LangfusePromptVersion,
 299                context.RunMetadata.ReasoningEffort,
 300                context.RunMetadata.MaxOutputTokenCount,
 301                context.RunMetadata.SliceKind,
 302                context.RunMetadata.SliceKey,
 303                context.RunMetadata.SourcePoolKey,
 304                context.RunMetadata.SelectedItemIdsHash,
 305                context.RunMetadata.SelectedItemIdsCount,
 306                context.RunMetadata.SampleSize,
 307                context.RunMetadata.MatchCount,
 308                context.RunMetadata.Repetitions,
 309                context.RunMetadata.Parallelism,
 310                context.RunMetadata.EvaluationTimestampPolicyKey,
 311                context.RunMetadata.EvaluationTime,
 312                context.RunMetadata.StartedAtUtc,
 313                context.RunMetadata.BatchStrategy,
 314                context.RunMetadata.BatchSize,
 315                context.RunMetadata.BatchCount,
 316                context.AggregateScores,
 317                ResolvePrimaryMetricValue(taskType, context.AggregateScores),
 318                context.DatasetRunItems.Count,
 319                context.RunMetadata.RunSubjectKind,
 320                context.RunMetadata.RunSubjectId,
 321                context.RunMetadata.RunSubjectDisplayName))
 322            .OrderBy(run => run.RunName, StringComparer.Ordinal)
 323            .ToList();
 324
            // The bundle carries the current UTC time (formatted by ExperimentArtifactSupport)
            // and the dataset metadata only when LangfuseJsonUtilities reports it as defined.
 325        return new PreparedExperimentAnalysisBundle(
 326            datasetName,
 327            taskType,
 328            primaryMetricName,
 329            ExperimentArtifactSupport.FormatStartedAtUtc(DateTimeOffset.UtcNow),
 330            dataset.Description,
 331            LangfuseJsonUtilities.IsDefined(dataset.Metadata) ? dataset.Metadata : default,
 332            runSummaries,
 333            rows);
 334    }
 335
 336    private static List<PreparedExperimentAnalysisRow> BuildRows(
 337        IReadOnlyList<RunContext> runContexts,
 338        IReadOnlyDictionary<string, LangfuseDatasetItem> datasetItemsById,
 339        IReadOnlyDictionary<string, LangfuseTraceWithDetails> tracesById)
 340    {
            // Produces one analysis row per (run, dataset run item). Runs are visited in
            // ordinal run-name order and items ordered by dataset item id, so the output
            // order does not depend on the order in which runs were requested.
 341        var rows = new List<PreparedExperimentAnalysisRow>();
 342
 343        foreach (var context in runContexts.OrderBy(context => context.DatasetRun.Name, StringComparer.Ordinal))
 344        {
 345            foreach (var datasetRunItem in context.DatasetRunItems.OrderBy(item => item.DatasetItemId, StringComparer.Or
 346            {
                    // Indexer lookups: a missing dataset item or trace id throws KeyNotFoundException.
 347                var datasetItem = datasetItemsById[datasetRunItem.DatasetItemId];
 348                var trace = tracesById[datasetRunItem.TraceId];
 349                var predictionObservation = SelectPredictionObservation(trace, context.RunMetadata);
 350                var prediction = ExtractPrediction(trace.Output, predictionObservation?.Output, context.DatasetRun.Name,
 351                var expectedOutput = ExtractExpectedOutput(datasetItem.ExpectedOutput, context.DatasetRun.Name, datasetR
 352                var metadata = ExtractDatasetItemMetadata(datasetItem.Input, datasetItem.Metadata, datasetRunItem.Datase
 353                var kicktippPoints = CalculateKicktippPoints(prediction, expectedOutput);
 354                var sourceDatasetItemId = ResolveSourceDatasetItemId(
 355                    context.RunMetadata,
 356                    datasetRunItem.DatasetItemId,
 357                    trace.Metadata,
 358                    context.DatasetRun.Name);
 359
                    // NOTE(review): DatasetItemId is passed both as the first argument and again
                    // before sourceDatasetItemId; the row's parameter names are not visible
                    // here - confirm both positions are meant to receive the same id.
 360                rows.Add(new PreparedExperimentAnalysisRow(
 361                    datasetRunItem.DatasetItemId,
 362                    context.DatasetRun.Id,
 363                    context.DatasetRun.Name,
 364                    context.RunMetadata.TaskType ?? throw new InvalidOperationException($"Run '{context.DatasetRun.Name}
 365                    ResolveRunDisplayName(context.RunMetadata, context.DatasetRun.Name),
 366                    context.RunMetadata.PromptKey,
 367                    context.RunMetadata.ReasoningEffort,
 368                    context.RunMetadata.SliceKind,
 369                    context.RunMetadata.SliceKey,
 370                    context.RunMetadata.SourcePoolKey,
 371                    datasetRunItem.DatasetItemId,
 372                    sourceDatasetItemId,
 373                    datasetRunItem.TraceId,
 374                    predictionObservation?.Id,
 375                    metadata.Matchday,
 376                    metadata.HomeTeam,
 377                    metadata.AwayTeam,
 378                    metadata.StartsAt,
 379                    metadata.TippSpielId,
 380                    prediction.HomeGoals,
 381                    prediction.AwayGoals,
 382                    expectedOutput.HomeGoals,
 383                    expectedOutput.AwayGoals,
 384                    kicktippPoints,
 385                    prediction.Status,
 386                    context.RunMetadata.RunSubjectKind,
 387                    context.RunMetadata.RunSubjectId,
 388                    context.RunMetadata.RunSubjectDisplayName,
 389                    metadata.FixtureIndex,
 390                    metadata.RepetitionIndex));
 391            }
 392        }
 393
 394        return rows;
 395    }
 396
 397    private static void ValidateComparableRuns(IReadOnlyList<RunContext> runContexts)
 398    {
            // Throws InvalidOperationException unless there is at least one run and every
            // other run matches the first (baseline) run in task type, selected-item hash
            // (when both runs have one) and the exact set of dataset item ids.
 399        if (runContexts.Count < 1)
 400        {
 401            throw new InvalidOperationException("At least one run is required to export an experiment analysis bundle.")
 402        }
 403
 404        var baseline = runContexts[0];
 405        var baselineTaskType = baseline.RunMetadata.TaskType ?? throw new InvalidOperationException(
 406            $"Run '{baseline.DatasetRun.Name}' is missing task type metadata.");
            // Sorted so the later SequenceEqual compares item sets independent of listing order.
 407        var baselineItemIds = baseline.DatasetRunItems
 408            .Select(item => item.DatasetItemId)
 409            .OrderBy(item => item, StringComparer.Ordinal)
 410            .ToArray();
 411
 412        foreach (var candidate in runContexts.Skip(1))
 413        {
 414            var taskType = candidate.RunMetadata.TaskType ?? throw new InvalidOperationException(
 415                $"Run '{candidate.DatasetRun.Name}' is missing task type metadata.");
                // Task types are compared case-insensitively.
 416            if (!string.Equals(baselineTaskType, taskType, StringComparison.OrdinalIgnoreCase))
 417            {
 418                throw new InvalidOperationException(
 419                    $"Run '{candidate.DatasetRun.Name}' has task type '{taskType}', but expected '{baselineTaskType}'.")
 420            }
 421
                // The hash check is skipped when either run has no (or a blank) hash.
 422            if (!string.IsNullOrWhiteSpace(baseline.RunMetadata.SelectedItemIdsHash)
 423                && !string.IsNullOrWhiteSpace(candidate.RunMetadata.SelectedItemIdsHash)
 424                && !string.Equals(
 425                    baseline.RunMetadata.SelectedItemIdsHash,
 426                    candidate.RunMetadata.SelectedItemIdsHash,
 427                    StringComparison.Ordinal))
 428            {
 429                throw new InvalidOperationException(
 430                    $"Run '{candidate.DatasetRun.Name}' has selectedItemIdsHash '{candidate.RunMetadata.SelectedItemIdsH
 431            }
 432
 433            var candidateItemIds = candidate.DatasetRunItems
 434                .Select(item => item.DatasetItemId)
 435                .OrderBy(item => item, StringComparer.Ordinal)
 436                .ToArray();
 437
 438            if (!baselineItemIds.SequenceEqual(candidateItemIds, StringComparer.Ordinal))
 439            {
 440                throw new InvalidOperationException(
 441                    $"Run '{candidate.DatasetRun.Name}' does not contain the same prepared dataset item set as '{baselin
 442            }
 443        }
 444    }
 445
 446    private static void EnsureDistinctDatasetItems(
 447        IReadOnlyList<LangfuseDatasetRunItem> datasetRunItems,
 448        string runName)
 449    {
            // Throws InvalidOperationException when two run items reference the same dataset
            // item id (ordinal comparison); only the first duplicated id found is reported.
 450        var duplicateItemId = datasetRunItems
 451            .GroupBy(item => item.DatasetItemId, StringComparer.Ordinal)
 452            .FirstOrDefault(group => group.Count() > 1)
 453            ?.Key;
 454
 455        if (duplicateItemId is not null)
 456        {
 457            throw new InvalidOperationException(
 458                $"Run '{runName}' contains duplicate dataset item id '{duplicateItemId}', which is not supported for com
 459        }
 460    }
 461
 462    private async Task<IReadOnlyList<LangfuseDatasetRunItem>> ListAllDatasetRunItemsAsync(
 463        LangfuseDatasetRunWithItems datasetRun,
 464        CancellationToken cancellationToken)
 465    {
            // Collects every item of the given dataset run by paging through the run-item
            // listing (100 per page), addressed by the run's dataset id and run name.
 466        const int pageSize = 100;
 467        var items = new List<LangfuseDatasetRunItem>();
 468
 469        await foreach (var item in EnumerateOffsetPaginatedItemsAsync(
 470                           (page, ct) => _langfuseClient.ListDatasetRunItemsAsync(
 471                               datasetRun.DatasetId,
 472                               datasetRun.Name,
 473                               page,
 474                               pageSize,
 475                               ct),
 476                           cancellationToken))
 477        {
 478            items.Add(item);
 479        }
 480
 481        return items;
 482    }
 483
 484    private async Task<ExperimentAggregateScores> LoadAggregateScoresAsync(
 485        string datasetRunId,
 486        PreparedExperimentRunMetadata runMetadata,
 487        CancellationToken cancellationToken)
 488    {
            // Lists all scores attached to the dataset run (100 per page) and returns the
            // values of the "total_kicktipp_points" and "avg_kicktipp_points" scores.
            // Throws InvalidOperationException when either score, or its value, is missing.
 489        const int pageSize = 100;
 490        var scores = new List<LangfuseScore>();
 491
 492        await foreach (var score in EnumerateOffsetPaginatedItemsAsync(
 493                           (page, ct) => _langfuseClient.ListScoresAsync(
 494                               new LangfuseListScoresRequest(DatasetRunId: datasetRunId, Page: page, Limit: pageSize),
 495                               ct),
 496                           cancellationToken))
 497        {
 498            scores.Add(score);
 499        }
 500
            // First match wins if a score name occurs more than once.
 501        var total = scores.FirstOrDefault(score => string.Equals(score.Name, "total_kicktipp_points", StringComparison.O
 502        var average = scores.FirstOrDefault(score => string.Equals(score.Name, "avg_kicktipp_points", StringComparison.O
 503
 504        if (total?.Value is null || average?.Value is null)
 505        {
                // NOTE(review): the label used in the error message is the run's StartedAtUtc
                // value (falling back to the run id), not a run name - confirm this is intended.
 506            var runName = string.IsNullOrWhiteSpace(runMetadata.StartedAtUtc)
 507                ? datasetRunId
 508                : runMetadata.StartedAtUtc;
 509            throw new InvalidOperationException(
 510                $"Dataset run '{runName}' is missing one or more aggregate Langfuse scores (expected total_kicktipp_poin
 511            );
 512        }
 513
 514        return new ExperimentAggregateScores(total.Value.Value, average.Value.Value);
 515    }
 516
 517    private IAsyncEnumerable<TItem> EnumerateOffsetPaginatedItemsAsync<TItem>(
 518        Func<int, CancellationToken, Task<LangfusePaginatedResponse<TItem>>> pageRetriever,
 519        CancellationToken cancellationToken)
 520    {
            // Wraps a page-number based Langfuse listing call in an offset-based pagination
            // handler and returns the items of all pages as one async stream.
            // Each request asks for the page after the previously returned one (page 1 first);
            // the offset state is derived from Meta.Page * Meta.Limit inside a checked context,
            // so an integer overflow throws instead of wrapping.
 521        var paginationHandler = new OffsetBasedComposite.PaginationHandlerBuilder<LangfusePaginatedResponse<TItem>, TIte
 522            .WithPageRetriever((previousPage, ct) => pageRetriever(GetNextPageNumber(previousPage), ct))
 523            .WithOffsetStateExtractor(static page => new OffsetState<int>(checked(page.Meta.Page * page.Meta.Limit), pag
 524            .WithItemExtractor(static page => page.Data)
 525            .Build();
 526
 527        return paginationHandler.GetAllItemsAsync(cancellationToken);
 528    }
 529
 530    private IAsyncEnumerable<TItem> EnumerateCursorPaginatedItemsAsync<TItem>(
 531        Func<string?, CancellationToken, Task<LangfuseCursorPaginatedResponse<TItem>>> pageRetriever,
 532        CancellationToken cancellationToken)
 533    {
            // Wraps a cursor based Langfuse listing call in a cursor-based pagination handler
            // and returns the items of all pages as one async stream. The first request passes
            // a null cursor; each later request passes the previous page's Meta.Cursor.
 534        var paginationHandler = new CursorBasedComposite.PaginationHandlerBuilder<LangfuseCursorPaginatedResponse<TItem>
 535            .WithPageRetriever((previousPage, ct) => pageRetriever(previousPage?.Meta.Cursor, ct))
 536            .WithCursorExtractor(static page => page.Meta.Cursor)
 537            .WithItemExtractor(static page => page.Data)
 538            .Build();
 539
 540        return paginationHandler.GetAllItemsAsync(cancellationToken);
 541    }
 542
 543    private static int GetNextPageNumber<TItem>(LangfusePaginatedResponse<TItem>? previousPage)
 544    {
            // Page 1 is requested first; afterwards the page following the one just received.
 545        return previousPage is null ? 1 : previousPage.Meta.Page + 1;
 546    }
 547
 548    private static PreparedExperimentRunMetadata DeserializeRunMetadata(JsonElement metadata, string runName)
 549    {
            // Converts the run's JSON metadata into PreparedExperimentRunMetadata.
            // Missing metadata, a null deserialization result and JsonException are all
            // surfaced as InvalidOperationException naming the run.
 550        if (!LangfuseJsonUtilities.IsDefined(metadata))
 551        {
 552            throw new InvalidOperationException($"Dataset run '{runName}' is missing run metadata.");
 553        }
 554
 555        try
 556        {
 557            var deserialized = metadata.Deserialize<PreparedExperimentRunMetadata>(PreparedExperimentCommandSupport.Json
 558            return deserialized ?? throw new InvalidOperationException($"Dataset run '{runName}' metadata could not be d
 559        }
 560        catch (JsonException ex)
 561        {
                // Keep the JsonException as inner exception so the parse failure stays diagnosable.
 562            throw new InvalidOperationException($"Dataset run '{runName}' metadata could not be deserialized.", ex);
 563        }
 564    }
 565
 566    private static PredictionOutput ExtractPrediction(
 567        JsonElement traceOutput,
 568        JsonElement? observationOutput,
 569        string runName,
 570        string traceId)
 571    {
            // Parses the prediction from the trace output first and falls back to the
            // observation output (when one is supplied). Throws InvalidOperationException
            // when neither payload can be parsed by TryExtractPredictionOutput.
 572        if (TryExtractPredictionOutput(traceOutput, out var prediction))
 573        {
 574            return prediction;
 575        }
 576
 577        if (observationOutput is JsonElement candidate && TryExtractPredictionOutput(candidate, out prediction))
 578        {
 579            return prediction;
 580        }
 581
 582        throw new InvalidOperationException(
 583            $"Run '{runName}' trace '{traceId}' does not expose a parseable prediction payload in trace or observation o
 584    }
 585
 586    private static ExpectedOutput ExtractExpectedOutput(JsonElement expectedOutput, string runName, string datasetItemId
 587    {
            // Reads the expected scoreline from a JSON object. Two shapes are accepted:
            // integer "homeGoals" and "awayGoals" properties, or a string "score" property
            // that TryParseScoreString can parse. Anything else throws InvalidOperationException.
 588        if (expectedOutput.ValueKind == JsonValueKind.Object)
 589        {
 590            if (expectedOutput.TryGetProperty("homeGoals", out var homeGoals)
 591                && expectedOutput.TryGetProperty("awayGoals", out var awayGoals)
 592                && homeGoals.TryGetInt32(out var home)
 593                && awayGoals.TryGetInt32(out var away))
 594            {
 595                return new ExpectedOutput(home, away);
 596            }
 597
 598            if (expectedOutput.TryGetProperty("score", out var score)
 599                && score.ValueKind == JsonValueKind.String
 600                && TryParseScoreString(score.GetString(), out var parsed))
 601            {
 602                return parsed;
 603            }
 604        }
 605
 606        throw new InvalidOperationException(
 607            $"Run '{runName}' dataset item '{datasetItemId}' does not expose a parseable expected scoreline.");
 608    }
 609
 610    private static DatasetItemMetadata ExtractDatasetItemMetadata(JsonElement input, JsonElement metadata, string datase
 611    {
            // Builds the typed metadata record for a dataset item. The metadata element must
            // be a JSON object, otherwise InvalidOperationException is thrown.
 612        if (metadata.ValueKind != JsonValueKind.Object)
 613        {
 614            throw new InvalidOperationException($"Dataset item '{datasetItemId}' metadata is missing or not an object.")
 615        }
 616
            // homeTeam, awayTeam and matchday are required metadata properties; startsAt is
            // required too but is read from the item's input rather than its metadata.
            // tippSpielId, fixtureIndex and repetitionIndex are optional.
 617        return new DatasetItemMetadata(
 618            GetRequiredStringProperty(metadata, "homeTeam", datasetItemId),
 619            GetRequiredStringProperty(metadata, "awayTeam", datasetItemId),
 620            GetRequiredIntProperty(metadata, "matchday", datasetItemId),
 621            GetRequiredStringProperty(input, "startsAt", datasetItemId),
 622            GetOptionalStringProperty(metadata, "tippSpielId"),
 623            GetOptionalIntProperty(metadata, "fixtureIndex"),
 624            GetOptionalIntProperty(metadata, "repetitionIndex"));
 625    }
 626
 627    private static int CalculateKicktippPoints(PredictionOutput prediction, ExpectedOutput expectedOutput)
 628    {
            // Scores a prediction against the expected result. Predictions whose status is
            // not "placed" (case-insensitive) or that lack either goal count score 0.
 629        if (!string.Equals(prediction.Status, "placed", StringComparison.OrdinalIgnoreCase)
 630            || prediction.HomeGoals is not int homeGoals
 631            || prediction.AwayGoals is not int awayGoals)
 632        {
 633            return 0;
 634        }
 635
            // Delegate the scoring to the shared helper and take its KicktippPoints value.
 636        return PreparedExperimentSupport.CalculateScores(
 637            new Prediction(homeGoals, awayGoals),
 638            expectedOutput.HomeGoals,
 639            expectedOutput.AwayGoals).KicktippPoints;
 640    }
 641
 642    private static LangfuseObservationDetail? SelectPredictionObservation(
 643        LangfuseTraceWithDetails trace,
 644        PreparedExperimentRunMetadata runMetadata)
 645    {
            // Picks the observation expected to carry the prediction for a trace.
            // Preferred name: the run metadata's ObservationName when set; otherwise
            // "community-match-prediction" for task type "community-to-date" and
            // "predict-match" for every other task type.
 646        var observations = trace.Observations ?? [];
 647        var preferredObservationName = string.IsNullOrWhiteSpace(runMetadata.ObservationName)
 648            ? string.Equals(runMetadata.TaskType, "community-to-date", StringComparison.OrdinalIgnoreCase)
 649                ? "community-match-prediction"
 650                : "predict-match"
 651            : runMetadata.ObservationName;
 652
            // Fallback order as far as the visible lines show: preferred name, then the two
            // well-known observation names, then the first observation of type "GENERATION".
 653        return observations.FirstOrDefault(observation => string.Equals(observation.Name, preferredObservationName, Stri
 654               ?? observations.FirstOrDefault(observation => string.Equals(observation.Name, "community-match-prediction
 655               ?? observations.FirstOrDefault(observation => string.Equals(observation.Name, "predict-match", StringComp
 656               ?? observations.FirstOrDefault(observation => string.Equals(observation.Type, "GENERATION", StringCompari
 657    }
 658
 659    private static string ResolveSourceDatasetItemId(
 660        PreparedExperimentRunMetadata runMetadata,
 661        string datasetItemId,
 662        JsonElement traceMetadata,
 663        string runName)
 664    {
 665        if (runMetadata.DatasetItemIdMap.Count > 0)
 666        {
 667            var reverseMatch = runMetadata.DatasetItemIdMap
 668                .FirstOrDefault(pair => string.Equals(pair.Value, datasetItemId, StringComparison.Ordinal));
 669            if (!string.IsNullOrWhiteSpace(reverseMatch.Key))
 670            {
 671                return reverseMatch.Key;
 672            }
 673        }
 674
 675        if (TryDeriveSourceDatasetItemId(datasetItemId, out var derived))
 676        {
 677            return derived;
 678        }
 679
 680        var fromTraceMetadata = GetOptionalStringProperty(traceMetadata, "sourceDatasetItemId");
 681        if (!string.IsNullOrWhiteSpace(fromTraceMetadata))
 682        {
 683            return fromTraceMetadata!;
 684        }
 685
 686        throw new InvalidOperationException(
 687            $"Run '{runName}' dataset item '{datasetItemId}' could not be mapped back to a source dataset item id.");
 688    }
 689
 690    private static bool TryDeriveSourceDatasetItemId(string datasetItemId, out string sourceDatasetItemId)
 691    {
 692        var repeatedMatchSliceIndex = datasetItemId.IndexOf("__repeated-match-slice__", StringComparison.Ordinal);
 693        if (repeatedMatchSliceIndex >= 0)
 694        {
 695            sourceDatasetItemId = datasetItemId[..repeatedMatchSliceIndex];
 696            return true;
 697        }
 698
 699        var repeatedMatchIndex = datasetItemId.IndexOf("__repeated-match__", StringComparison.Ordinal);
 700        if (repeatedMatchIndex >= 0)
 701        {
 702            sourceDatasetItemId = datasetItemId[..repeatedMatchIndex];
 703            return true;
 704        }
 705
 706        var sliceIndex = datasetItemId.IndexOf("__slice__", StringComparison.Ordinal);
 707        if (sliceIndex >= 0)
 708        {
 709            sourceDatasetItemId = datasetItemId[..sliceIndex];
 710            return true;
 711        }
 712
 713        sourceDatasetItemId = string.Empty;
 714        return false;
 715    }
 716
 717    private static bool TryExtractPredictionOutput(JsonElement value, out PredictionOutput prediction)
 718    {
 719        if (value.ValueKind == JsonValueKind.Object)
 720        {
 721            if (value.TryGetProperty("status", out var statusProperty)
 722                && statusProperty.ValueKind == JsonValueKind.String)
 723            {
 724                var status = NormalizePredictionStatus(statusProperty.GetString());
 725                var hasHomeGoals = TryGetNullableIntProperty(value, "homeGoals", out var parsedHomeGoals);
 726                var hasAwayGoals = TryGetNullableIntProperty(value, "awayGoals", out var parsedAwayGoals);
 727
 728                if (hasHomeGoals && hasAwayGoals)
 729                {
 730                    prediction = new PredictionOutput(status, parsedHomeGoals, parsedAwayGoals);
 731                    return true;
 732                }
 733
 734                if (string.Equals(status, "missed", StringComparison.OrdinalIgnoreCase))
 735                {
 736                    prediction = new PredictionOutput("missed", null, null);
 737                    return true;
 738                }
 739            }
 740
 741            if (value.TryGetProperty("homeGoals", out var homeGoals)
 742                && value.TryGetProperty("awayGoals", out var awayGoals)
 743                && homeGoals.TryGetInt32(out var home)
 744                && awayGoals.TryGetInt32(out var away))
 745            {
 746                prediction = new PredictionOutput("placed", home, away);
 747                return true;
 748            }
 749        }
 750
 751        if (value.ValueKind == JsonValueKind.String)
 752        {
 753            var raw = value.GetString();
 754            if (TryParseScoreString(raw, out var parsedScore))
 755            {
 756                prediction = new PredictionOutput("placed", parsedScore.HomeGoals, parsedScore.AwayGoals);
 757                return true;
 758            }
 759
 760            if (!string.IsNullOrWhiteSpace(raw))
 761            {
 762                try
 763                {
 764                    using var document = JsonDocument.Parse(raw);
 765                    if (TryExtractPredictionOutput(document.RootElement, out prediction))
 766                    {
 767                        return true;
 768                    }
 769                }
 770                catch (JsonException)
 771                {
 772                }
 773            }
 774        }
 775
 776        prediction = default;
 777        return false;
 778    }
 779
 780    private static bool TryGetNullableIntProperty(JsonElement value, string propertyName, out int? result)
 781    {
 782        result = null;
 783        if (!value.TryGetProperty(propertyName, out var property))
 784        {
 785            return false;
 786        }
 787
 788        if (property.ValueKind == JsonValueKind.Null)
 789        {
 790            return true;
 791        }
 792
 793        if (property.TryGetInt32(out var parsed))
 794        {
 795            result = parsed;
 796            return true;
 797        }
 798
 799        return false;
 800    }
 801
 802    private static bool TryParseScoreString(string? value, out ExpectedOutput expectedOutput)
 803    {
 804        expectedOutput = default;
 805        if (string.IsNullOrWhiteSpace(value))
 806        {
 807            return false;
 808        }
 809
 810        var segments = value.Split(':', StringSplitOptions.TrimEntries);
 811        if (segments.Length != 2)
 812        {
 813            return false;
 814        }
 815
 816        if (!int.TryParse(segments[0], NumberStyles.Integer, CultureInfo.InvariantCulture, out var homeGoals)
 817            || !int.TryParse(segments[1], NumberStyles.Integer, CultureInfo.InvariantCulture, out var awayGoals))
 818        {
 819            return false;
 820        }
 821
 822        expectedOutput = new ExpectedOutput(homeGoals, awayGoals);
 823        return true;
 824    }
 825
 826    private static string NormalizePredictionStatus(string? status)
 827    {
 828        return string.Equals(status, "missed", StringComparison.OrdinalIgnoreCase)
 829            ? "missed"
 830            : "placed";
 831    }
 832
 833    private static string ResolveRunDisplayName(PreparedExperimentRunMetadata runMetadata, string runName)
 834    {
 835        if (!string.IsNullOrWhiteSpace(runMetadata.RunSubjectDisplayName))
 836        {
 837            return runMetadata.RunSubjectDisplayName;
 838        }
 839
 840        if (!string.IsNullOrWhiteSpace(runMetadata.Model))
 841        {
 842            return PreparedExperimentSupport.BuildRunSubjectDisplayName(
 843                runMetadata.Model,
 844                runMetadata.ReasoningEffort);
 845        }
 846
 847        throw new InvalidOperationException($"Run '{runName}' is missing comparable display metadata.");
 848    }
 849
 850    private static string ResolvePrimaryMetricName(string taskType)
 851    {
 852        return IsAveragePrimaryMetricTask(taskType)
 853            ? "avg_kicktipp_points"
 854            : "total_kicktipp_points";
 855    }
 856
 857    private static double ResolvePrimaryMetricValue(string taskType, ExperimentAggregateScores aggregateScores)
 858    {
 859        return IsAveragePrimaryMetricTask(taskType)
 860            ? aggregateScores.AvgKicktippPoints
 861            : aggregateScores.TotalKicktippPoints;
 862    }
 863
 864    private static bool IsAveragePrimaryMetricTask(string taskType)
 865    {
 866        return string.Equals(taskType, "repeated-match", StringComparison.OrdinalIgnoreCase)
 867               || string.Equals(taskType, "repeated-match-slice", StringComparison.OrdinalIgnoreCase);
 868    }
 869
        /// <summary>
        /// Determines the absolute path the analysis export is written to.
        /// </summary>
        /// <param name="settings">Command settings; a non-blank <c>OutputPath</c> takes precedence.</param>
        /// <param name="taskType">Task type, used as one directory level of the default path.</param>
        /// <param name="datasetName">Dataset name; each '/'-separated part becomes a directory level.</param>
        /// <returns>
        /// The full path of <c>settings.OutputPath</c> when it is set; otherwise the full path of a default
        /// location under "artifacts/langfuse-experiments/analysis" whose file name starts with "comparison-"
        /// followed by a UTC timestamp.
        /// </returns>
 870    private static string ResolveOutputPath(
 871        ExportExperimentAnalysisSettings settings,
 872        string taskType,
 873        string datasetName)
 874    {
 875        if (!string.IsNullOrWhiteSpace(settings.OutputPath))
 876        {
 877            return Path.GetFullPath(settings.OutputPath);
 878        }
 879
            // The timestamp format uses '-' instead of ':' between the time components.
            // NOTE(review): the end of the next statement is cut off in this listing - confirm against the source.
 880        var timestamp = DateTimeOffset.UtcNow.ToString("yyyy-MM-ddTHH-mm-ssZ", CultureInfo.InvariantCulture).ToLowerInva
            // Dataset name parts are trimmed; empty parts are dropped.
 881        var datasetSegments = datasetName.Split('/', StringSplitOptions.RemoveEmptyEntries)
 882            .Select(segment => segment.Trim())
 883            .Where(segment => segment.Length > 0)
 884            .ToList();
 885
            // NOTE(review): the file-name suffix is cut off in this listing (presumably ".json") - confirm.
 886        return Path.GetFullPath(Path.Combine(
 887            ["artifacts", "langfuse-experiments", "analysis", taskType, .. datasetSegments, $"comparison-{timestamp}.jso
 888    }
 889
 890    private static string GetRequiredStringProperty(JsonElement metadata, string propertyName, string datasetItemId)
 891    {
 892        if (!metadata.TryGetProperty(propertyName, out var property)
 893            || property.ValueKind != JsonValueKind.String
 894            || string.IsNullOrWhiteSpace(property.GetString()))
 895        {
 896            throw new InvalidOperationException($"Dataset item '{datasetItemId}' metadata is missing '{propertyName}'.")
 897        }
 898
 899        return property.GetString()!;
 900    }
 901
 902    private static int GetRequiredIntProperty(JsonElement metadata, string propertyName, string datasetItemId)
 903    {
 904        if (!metadata.TryGetProperty(propertyName, out var property)
 905            || !property.TryGetInt32(out var value))
 906        {
 907            throw new InvalidOperationException($"Dataset item '{datasetItemId}' metadata is missing '{propertyName}'.")
 908        }
 909
 910        return value;
 911    }
 912
 913    private static string? GetOptionalStringProperty(JsonElement metadata, string propertyName)
 914    {
 915        return metadata.ValueKind == JsonValueKind.Object
 916               && metadata.TryGetProperty(propertyName, out var property)
 917               && property.ValueKind == JsonValueKind.String
 918               && !string.IsNullOrWhiteSpace(property.GetString())
 919            ? property.GetString()
 920            : null;
 921    }
 922
 923    private static int? GetOptionalIntProperty(JsonElement metadata, string propertyName)
 924    {
 925        return metadata.ValueKind == JsonValueKind.Object
 926               && metadata.TryGetProperty(propertyName, out var property)
 927               && property.TryGetInt32(out var value)
 928            ? value
 929            : null;
 930    }
 931
        /// <summary>
        /// Groups one dataset run with its run metadata, its run items and its aggregate scores.
        /// </summary>
        /// <param name="DatasetRun">The dataset run, including its items.</param>
        /// <param name="RunMetadata">Prepared experiment metadata of the run.</param>
        /// <param name="DatasetRunItems">Run items associated with the run.</param>
        /// <param name="AggregateScores">Aggregate scores of the run.</param>
1932    private sealed record RunContext(
1933        LangfuseDatasetRunWithItems DatasetRun,
1934        PreparedExperimentRunMetadata RunMetadata,
1935        IReadOnlyList<LangfuseDatasetRunItem> DatasetRunItems,
1936        ExperimentAggregateScores AggregateScores);
 937
        /// <summary>
        /// A prediction extracted from a JSON payload.
        /// </summary>
        /// <param name="Status">Prediction status; the extraction code in this class produces "placed" or "missed".</param>
        /// <param name="HomeGoals">Predicted home goals, or <c>null</c> when none are available (as for a missed prediction).</param>
        /// <param name="AwayGoals">Predicted away goals, or <c>null</c> when none are available (as for a missed prediction).</param>
 938    private readonly record struct PredictionOutput(string Status, int? HomeGoals, int? AwayGoals);
 939
        /// <summary>
        /// A score as a pair of goal counts; used as the expected result when scoring a prediction and
        /// as the result of parsing a <c>home:away</c> score string.
        /// </summary>
        /// <param name="HomeGoals">Goals of the home team.</param>
        /// <param name="AwayGoals">Goals of the away team.</param>
 940    private readonly record struct ExpectedOutput(int HomeGoals, int AwayGoals);
 941
        /// <summary>
        /// Match details describing one dataset item.
        /// </summary>
        /// <remarks>
        /// NOTE(review): presumably populated from the dataset item's JSON properties "homeTeam",
        /// "awayTeam", "matchday", "startsAt", "tippSpielId", "fixtureIndex" and "repetitionIndex" -
        /// confirm against the code that constructs this record.
        /// </remarks>
        /// <param name="HomeTeam">Home team (required).</param>
        /// <param name="AwayTeam">Away team (required).</param>
        /// <param name="Matchday">Matchday number (required).</param>
        /// <param name="StartsAt">Start of the match, kept as text (required).</param>
        /// <param name="TippSpielId">Optional Tippspiel id.</param>
        /// <param name="FixtureIndex">Optional fixture index.</param>
        /// <param name="RepetitionIndex">Optional repetition index.</param>
 942    private readonly record struct DatasetItemMetadata(
 943        string HomeTeam,
 944        string AwayTeam,
 945        int Matchday,
 946        string StartsAt,
 947        string? TippSpielId,
 948        int? FixtureIndex,
 949        int? RepetitionIndex);
 950}