| | | 1 | | using System.Globalization; |
| | | 2 | | using System.Text.Json; |
| | | 3 | | using EHonda.KicktippAi.Core; |
| | | 4 | | using CursorBasedComposite = EHonda.Pagination.CursorBased.Composite; |
| | | 5 | | using EHonda.Pagination.OffsetBased; |
| | | 6 | | using OffsetBasedComposite = EHonda.Pagination.OffsetBased.Composite; |
| | | 7 | | using Microsoft.Extensions.Logging; |
| | | 8 | | using Orchestrator.Commands.Observability.Experiments; |
| | | 9 | | using Orchestrator.Infrastructure.Langfuse; |
| | | 10 | | using Spectre.Console; |
| | | 11 | | using Spectre.Console.Cli; |
| | | 12 | | |
| | | 13 | | namespace Orchestrator.Commands.Observability.ExportExperimentAnalysis; |
| | | 14 | | |
| | | 15 | | public sealed class ExportExperimentAnalysisCommand : AsyncCommand<ExportExperimentAnalysisSettings> |
| | | 16 | | { |
// Console that receives the final JSON summary and any error output.
private readonly IAnsiConsole _console;

// Langfuse public API client used for every dataset, run, trace, observation and score lookup.
private readonly ILangfusePublicApiClient _langfuseClient;

// Structured logger for progress and failure diagnostics.
private readonly ILogger<ExportExperimentAnalysisCommand> _logger;

/// <summary>
/// Creates the command and stores the injected collaborators.
/// </summary>
/// <param name="console">Console used for the summary and for error output.</param>
/// <param name="langfuseClient">Client used to query Langfuse.</param>
/// <param name="logger">Logger used by this command.</param>
public ExportExperimentAnalysisCommand(
    IAnsiConsole console,
    ILangfusePublicApiClient langfuseClient,
    ILogger<ExportExperimentAnalysisCommand> logger)
{
    (_console, _langfuseClient, _logger) = (console, langfuseClient, logger);
}
| | | 30 | | |
// NOTE(review): the signature line is cut off in this view; the last parameter is presumably
// the CancellationToken that the body refers to as cancellationToken - confirm.
/// <summary>
/// Exports an experiment analysis bundle for the configured dataset and run names.
/// Loads the dataset and every named run (metadata, run items, aggregate scores), validates
/// that the runs are comparable, loads dataset items and traces, builds the analysis rows and
/// the bundle, writes the bundle as JSON to the resolved output path, and finally prints a
/// small JSON summary to the console.
/// </summary>
/// <returns>0 on success; 1 when any exception is caught (it is logged and printed to the console).</returns>
| | | 31 | | protected override async Task<int> ExecuteAsync(CommandContext context, ExportExperimentAnalysisSettings settings, C |
| | | 32 | | { |
| | | 33 | | try |
| | | 34 | | { |
| | | 35 | | var runNames = settings.GetParsedRunNames(); |
| | | 36 | | var runContexts = new List<RunContext>(); |
| | | 37 | | |
| | | 38 | | PreparedExperimentSupport.ReportProgress( |
| | | 39 | | $"Exporting experiment analysis for dataset '{settings.DatasetName}' across {runNames.Count} run(s)."); |
| | | 40 | | _logger.LogInformation( |
| | | 41 | | "Exporting experiment analysis for dataset {DatasetName} across {RunCount} runs.", |
| | | 42 | | settings.DatasetName, |
| | | 43 | | runNames.Count); |
// Fail fast when the dataset itself cannot be found.
| | | 44 | | var dataset = await _langfuseClient.GetDatasetAsync(settings.DatasetName, cancellationToken) |
| | | 45 | | ?? throw new InvalidOperationException($"Dataset '{settings.DatasetName}' could not be found." |
| | | 46 | | |
// Runs are loaded one after another, in the order returned by GetParsedRunNames.
| | | 47 | | for (var runIndex = 0; runIndex < runNames.Count; runIndex += 1) |
| | | 48 | | { |
| | | 49 | | var runName = runNames[runIndex]; |
| | | 50 | | PreparedExperimentSupport.ReportProgress( |
| | | 51 | | $"Loading run {runIndex + 1}/{runNames.Count}: '{runName}'."); |
| | | 52 | | var datasetRun = await _langfuseClient.GetDatasetRunAsync(settings.DatasetName, runName, cancellationTok |
| | | 53 | | ?? throw new InvalidOperationException( |
| | | 54 | | $"Dataset run '{runName}' could not be found in dataset '{settings.DatasetName}'."); |
| | | 55 | | |
// Per run: parse the metadata, page through all run items, reject duplicate
// dataset item ids, then load the aggregate scores attached to the run.
| | | 56 | | var runMetadata = DeserializeRunMetadata(datasetRun.Metadata, runName); |
| | | 57 | | var datasetRunItems = await ListAllDatasetRunItemsAsync(datasetRun, cancellationToken); |
| | | 58 | | EnsureDistinctDatasetItems(datasetRunItems, runName); |
| | | 59 | | var aggregateScores = await LoadAggregateScoresAsync(datasetRun.Id, runMetadata, cancellationToken); |
| | | 60 | | |
| | | 61 | | runContexts.Add(new RunContext(datasetRun, runMetadata, datasetRunItems, aggregateScores)); |
| | | 62 | | } |
| | | 63 | | |
// Throws when the runs differ in task type, selected-item hash or dataset item set.
| | | 64 | | ValidateComparableRuns(runContexts); |
| | | 65 | | |
| | | 66 | | PreparedExperimentSupport.ReportProgress("Loading dataset items for the comparable run set."); |
| | | 67 | | var datasetItemsById = await LoadDatasetItemsAsync(settings.DatasetName, runContexts, cancellationToken); |
| | | 68 | | PreparedExperimentSupport.ReportProgress("Loading traces and observations for the comparable run set."); |
| | | 69 | | var tracesById = await LoadTracesAsync(runContexts, cancellationToken); |
| | | 70 | | PreparedExperimentSupport.ReportProgress("Building normalized analysis rows."); |
| | | 71 | | var rows = BuildRows(runContexts, datasetItemsById, tracesById); |
| | | 72 | | var bundle = BuildBundle(settings.DatasetName, dataset, runContexts, rows); |
| | | 73 | | |
// NOTE(review): Path.GetDirectoryName can return null or an empty string (e.g. for a bare
// file name); this assumes ResolveOutputPath yields a path with a directory part - confirm.
| | | 74 | | var outputPath = ResolveOutputPath(settings, bundle.TaskType, settings.DatasetName); |
| | | 75 | | Directory.CreateDirectory(Path.GetDirectoryName(outputPath)!); |
| | | 76 | | await File.WriteAllTextAsync( |
| | | 77 | | outputPath, |
| | | 78 | | JsonSerializer.Serialize(bundle, PreparedExperimentCommandSupport.JsonOptions), |
| | | 79 | | cancellationToken); |
| | | 80 | | |
| | | 81 | | PreparedExperimentSupport.ReportProgress( |
| | | 82 | | $"Wrote experiment analysis bundle with {bundle.Rows.Count} row(s) to '{outputPath}'."); |
| | | 83 | | |
// Anonymous summary object serialized with the same JSON options as the bundle.
| | | 84 | | var summary = new |
| | | 85 | | { |
| | | 86 | | settings.DatasetName, |
| | | 87 | | bundle.TaskType, |
| | | 88 | | bundle.PrimaryMetricName, |
| | | 89 | | runCount = bundle.Runs.Count, |
| | | 90 | | rowCount = bundle.Rows.Count, |
| | | 91 | | outputPath |
| | | 92 | | }; |
| | | 93 | | |
| | | 94 | | _console.WriteLine(JsonSerializer.Serialize(summary, PreparedExperimentCommandSupport.JsonOptions)); |
| | | 95 | | return 0; |
| | | 96 | | } |
// Catch-all: every failure is turned into exit code 1 instead of being rethrown.
// NOTE(review): this presumably also covers cancellation exceptions - confirm that is intended.
| | | 97 | | catch (Exception ex) |
| | | 98 | | { |
| | | 99 | | _logger.LogError(ex, "Error exporting experiment analysis bundle"); |
// The message is escaped so that markup characters in it are printed literally.
| | | 100 | | _console.MarkupLine($"[red]Error:[/] {Markup.Escape(ex.Message)}"); |
| | | 101 | | return 1; |
| | | 102 | | } |
| | | 103 | | } |
| | | 104 | | |
/// <summary>
/// Loads the dataset items referenced by any of the given runs. The dataset item listing is
/// paged through with a page size of 100 and only items whose id is referenced are kept;
/// enumeration stops as soon as every referenced id has been found.
/// </summary>
/// <param name="datasetName">Dataset whose items are listed.</param>
/// <param name="runContexts">Runs whose DatasetRunItems define the required dataset item ids.</param>
/// <param name="cancellationToken">Token passed to the pagination helper.</param>
/// <returns>Dictionary of the required dataset items keyed by id (ordinal comparison).</returns>
/// <exception cref="InvalidOperationException">At least one referenced dataset item was not returned by the listing.</exception>
| | | 105 | | private async Task<Dictionary<string, LangfuseDatasetItem>> LoadDatasetItemsAsync( |
| | | 106 | | string datasetName, |
| | | 107 | | IReadOnlyList<RunContext> runContexts, |
| | | 108 | | CancellationToken cancellationToken) |
| | | 109 | | { |
| | | 110 | | const int pageSize = 100; |
// Union of the dataset item ids across all runs. ToHashSet already de-duplicates,
// so the preceding Distinct call does not change the resulting set.
| | | 111 | | var requiredDatasetItemIds = runContexts |
| | | 112 | | .SelectMany(context => context.DatasetRunItems.Select(item => item.DatasetItemId)) |
| | | 113 | | .Distinct(StringComparer.Ordinal) |
| | | 114 | | .ToHashSet(StringComparer.Ordinal); |
| | | 115 | | var result = new Dictionary<string, LangfuseDatasetItem>(StringComparer.Ordinal); |
| | | 116 | | |
// Nothing is referenced, so no request is made at all.
| | | 117 | | if (requiredDatasetItemIds.Count == 0) |
| | | 118 | | { |
| | | 119 | | return result; |
| | | 120 | | } |
| | | 121 | | |
// NOTE(review): the request construction on the next-but-one line is cut off in this view;
// further request arguments may exist.
| | | 122 | | await foreach (var datasetItem in EnumerateOffsetPaginatedItemsAsync( |
| | | 123 | | (page, ct) => _langfuseClient.ListDatasetItemsAsync( |
| | | 124 | | new LangfuseListDatasetItemsRequest(DatasetName: datasetName, Page: page, Limit: pageSize |
| | | 125 | | ct), |
| | | 126 | | cancellationToken)) |
| | | 127 | | { |
| | | 128 | | if (requiredDatasetItemIds.Contains(datasetItem.Id)) |
| | | 129 | | { |
| | | 130 | | result[datasetItem.Id] = datasetItem; |
| | | 131 | | } |
| | | 132 | | |
// Early exit: every required id has been collected, so remaining items are not enumerated.
| | | 133 | | if (result.Count == requiredDatasetItemIds.Count) |
| | | 134 | | { |
| | | 135 | | break; |
| | | 136 | | } |
| | | 137 | | } |
| | | 138 | | |
// Anything still missing after the listing ended is reported in a single exception.
| | | 139 | | var missingDatasetItemIds = requiredDatasetItemIds |
| | | 140 | | .Except(result.Keys, StringComparer.Ordinal) |
| | | 141 | | .ToArray(); |
| | | 142 | | if (missingDatasetItemIds.Length > 0) |
| | | 143 | | { |
| | | 144 | | throw new InvalidOperationException( |
| | | 145 | | $"Dataset item(s) could not be loaded from Langfuse: {string.Join(", ", missingDatasetItemIds)}."); |
| | | 146 | | } |
| | | 147 | | |
| | | 148 | | _logger.LogInformation( |
| | | 149 | | "Loaded {Count} dataset items for dataset {DatasetName} via paginated dataset listing.", |
| | | 150 | | result.Count, |
| | | 151 | | datasetName); |
| | | 152 | | |
| | | 153 | | return result; |
| | | 154 | | } |
| | | 155 | | |
/// <summary>
/// Loads, run by run, the traces referenced by each run's dataset run items together with
/// their observations, and combines them into one dictionary keyed by trace id.
/// </summary>
/// <param name="runContexts">Runs whose DatasetRunItems define the expected trace ids.</param>
/// <param name="cancellationToken">Token passed to the listing helpers.</param>
/// <returns>Trace details keyed by trace id (ordinal comparison).</returns>
/// <exception cref="InvalidOperationException">An expected trace was not returned by the trace listing.</exception>
| | | 156 | | private async Task<Dictionary<string, LangfuseTraceWithDetails>> LoadTracesAsync( |
| | | 157 | | IReadOnlyList<RunContext> runContexts, |
| | | 158 | | CancellationToken cancellationToken) |
| | | 159 | | { |
| | | 160 | | var result = new Dictionary<string, LangfuseTraceWithDetails>(StringComparer.Ordinal); |
| | | 161 | | |
| | | 162 | | foreach (var runContext in runContexts) |
| | | 163 | | { |
| | | 164 | | var expectedTraceIds = runContext.DatasetRunItems |
| | | 165 | | .Select(item => item.TraceId) |
| | | 166 | | .ToHashSet(StringComparer.Ordinal); |
// Both helpers receive the run name and use it as the session id of their listing request.
| | | 167 | | var traces = await ListRunTracesAsync(runContext.DatasetRun.Name, expectedTraceIds, cancellationToken); |
| | | 168 | | var observationsByTraceId = await ListRunObservationsAsync(runContext.DatasetRun.Name, expectedTraceIds, can |
| | | 169 | | |
| | | 170 | | foreach (var traceId in expectedTraceIds) |
| | | 171 | | { |
// A missing trace is fatal; missing observations are not (an empty list is used below).
| | | 172 | | if (!traces.TryGetValue(traceId, out var trace)) |
| | | 173 | | { |
| | | 174 | | throw new InvalidOperationException($"Trace '{traceId}' could not be loaded from Langfuse."); |
| | | 175 | | } |
| | | 176 | | |
// Rebuild the trace with the separately loaded observations attached. The fifth constructor
// argument is passed as an empty collection; its meaning is not visible in this file section.
| | | 177 | | result[traceId] = new LangfuseTraceWithDetails( |
| | | 178 | | trace.Id, |
| | | 179 | | trace.Name, |
| | | 180 | | trace.Metadata, |
| | | 181 | | trace.Output, |
| | | 182 | | [], |
| | | 183 | | observationsByTraceId.TryGetValue(traceId, out var observations) ? observations : [], |
| | | 184 | | trace.Tags); |
| | | 185 | | } |
| | | 186 | | } |
| | | 187 | | |
| | | 188 | | _logger.LogInformation( |
| | | 189 | | "Loaded {Count} traces for comparable export using per-run session batching.", |
| | | 190 | | result.Count); |
| | | 191 | | |
| | | 192 | | return result; |
| | | 193 | | } |
| | | 194 | | |
/// <summary>
/// Pages through the trace listing filtered by session id (the run name is used as the session
/// id) with a page size of 100 and keeps the traces whose id is expected. Enumeration stops
/// once all expected traces were found. Missing traces are not an error here; the method only
/// logs how many were loaded.
/// </summary>
/// <param name="runName">Run name, sent as SessionId of the listing request.</param>
/// <param name="expectedTraceIds">Trace ids to keep.</param>
/// <param name="cancellationToken">Token passed to the pagination helper.</param>
/// <returns>The traces that were found, keyed by trace id (ordinal comparison).</returns>
| | | 195 | | private async Task<Dictionary<string, LangfuseTraceWithDetails>> ListRunTracesAsync( |
| | | 196 | | string runName, |
| | | 197 | | IReadOnlySet<string> expectedTraceIds, |
| | | 198 | | CancellationToken cancellationToken) |
| | | 199 | | { |
| | | 200 | | const int pageSize = 100; |
| | | 201 | | var result = new Dictionary<string, LangfuseTraceWithDetails>(StringComparer.Ordinal); |
| | | 202 | | |
// No expected ids means no request is issued.
| | | 203 | | if (expectedTraceIds.Count == 0) |
| | | 204 | | { |
| | | 205 | | return result; |
| | | 206 | | } |
| | | 207 | | |
// NOTE(review): the Fields argument of the request is cut off in this view, so the exact
// set of trace fields requested cannot be stated here.
| | | 208 | | await foreach (var trace in EnumerateOffsetPaginatedItemsAsync( |
| | | 209 | | (page, ct) => _langfuseClient.ListTracesAsync( |
| | | 210 | | new LangfuseListTracesRequest(SessionId: runName, Page: page, Limit: pageSize, Fields: "i |
| | | 211 | | ct), |
| | | 212 | | cancellationToken)) |
| | | 213 | | { |
| | | 214 | | if (expectedTraceIds.Contains(trace.Id)) |
| | | 215 | | { |
| | | 216 | | result[trace.Id] = trace; |
| | | 217 | | } |
| | | 218 | | |
// Early exit once every expected trace has been seen.
| | | 219 | | if (result.Count == expectedTraceIds.Count) |
| | | 220 | | { |
| | | 221 | | break; |
| | | 222 | | } |
| | | 223 | | } |
| | | 224 | | |
| | | 225 | | _logger.LogInformation( |
| | | 226 | | "Loaded {LoadedCount}/{ExpectedCount} trace shells for run {RunName} via sessionId listing.", |
| | | 227 | | result.Count, |
| | | 228 | | expectedTraceIds.Count, |
| | | 229 | | runName); |
| | | 230 | | |
| | | 231 | | return result; |
| | | 232 | | } |
| | | 233 | | |
/// <summary>
/// Walks the cursor-paginated observation listing filtered by session id (the run name is
/// used as the session id, limit 1000 per request) and groups the observations that belong
/// to an expected trace by their trace id. Unlike the trace listing there is no early exit:
/// the listing is always consumed to the end.
/// </summary>
/// <param name="runName">Run name, sent as SessionId of the listing request.</param>
/// <param name="expectedTraceIds">Trace ids whose observations are kept.</param>
/// <param name="cancellationToken">Token passed to the pagination helper.</param>
/// <returns>Observations grouped by trace id; traces without observations have no entry.</returns>
| | | 234 | | private async Task<Dictionary<string, IReadOnlyList<LangfuseObservationDetail>>> ListRunObservationsAsync( |
| | | 235 | | string runName, |
| | | 236 | | IReadOnlySet<string> expectedTraceIds, |
| | | 237 | | CancellationToken cancellationToken) |
| | | 238 | | { |
| | | 239 | | const int limit = 1000; |
| | | 240 | | var observationsByTraceId = new Dictionary<string, List<LangfuseObservationDetail>>(StringComparer.Ordinal); |
| | | 241 | | |
// Nothing expected: return an empty dictionary (converted to the read-only value type) without any request.
| | | 242 | | if (expectedTraceIds.Count == 0) |
| | | 243 | | { |
| | | 244 | | return observationsByTraceId.ToDictionary( |
| | | 245 | | pair => pair.Key, |
| | | 246 | | pair => (IReadOnlyList<LangfuseObservationDetail>)pair.Value, |
| | | 247 | | StringComparer.Ordinal); |
| | | 248 | | } |
| | | 249 | | |
// NOTE(review): the request construction is cut off in this view (after "Fie"), so the
// requested field set cannot be stated here.
| | | 250 | | await foreach (var observation in EnumerateCursorPaginatedItemsAsync( |
| | | 251 | | (cursor, ct) => _langfuseClient.ListObservationsAsync( |
| | | 252 | | new LangfuseListObservationsRequest(SessionId: runName, Limit: limit, Cursor: cursor, Fie |
| | | 253 | | ct), |
| | | 254 | | cancellationToken)) |
| | | 255 | | { |
// Observations of traces outside the expected set are ignored.
| | | 256 | | if (!expectedTraceIds.Contains(observation.TraceId)) |
| | | 257 | | { |
| | | 258 | | continue; |
| | | 259 | | } |
| | | 260 | | |
// Create the per-trace bucket on first use.
| | | 261 | | if (!observationsByTraceId.TryGetValue(observation.TraceId, out var observations)) |
| | | 262 | | { |
| | | 263 | | observations = []; |
| | | 264 | | observationsByTraceId[observation.TraceId] = observations; |
| | | 265 | | } |
| | | 266 | | |
| | | 267 | | observations.Add(observation); |
| | | 268 | | } |
| | | 269 | | |
| | | 270 | | _logger.LogInformation( |
| | | 271 | | "Loaded observations for {TraceCount} traces in run {RunName} via sessionId observation listing.", |
| | | 272 | | observationsByTraceId.Count, |
| | | 273 | | runName); |
| | | 274 | | |
// Re-project so the values are exposed as IReadOnlyList instead of the mutable List used while grouping.
| | | 275 | | return observationsByTraceId.ToDictionary( |
| | | 276 | | pair => pair.Key, |
| | | 277 | | pair => (IReadOnlyList<LangfuseObservationDetail>)pair.Value, |
| | | 278 | | StringComparer.Ordinal); |
| | | 279 | | } |
| | | 280 | | |
/// <summary>
/// Assembles the analysis bundle: one run summary per run context (ordered by run name,
/// ordinal), the prepared rows, the dataset description and metadata, and a timestamp taken
/// from the current UTC time.
/// </summary>
/// <param name="datasetName">Dataset name stored in the bundle.</param>
/// <param name="dataset">Dataset whose Description and Metadata are copied into the bundle.</param>
/// <param name="runContexts">Loaded runs; the first entry supplies the task type for the whole bundle.</param>
/// <param name="rows">Already built analysis rows.</param>
/// <returns>The bundle that is later serialized to JSON.</returns>
/// <exception cref="InvalidOperationException">The first run has no task type in its metadata.</exception>
| | | 281 | | private static PreparedExperimentAnalysisBundle BuildBundle( |
| | | 282 | | string datasetName, |
| | | 283 | | LangfuseDataset dataset, |
| | | 284 | | IReadOnlyList<RunContext> runContexts, |
| | | 285 | | IReadOnlyList<PreparedExperimentAnalysisRow> rows) |
| | | 286 | | { |
// Indexing [0] assumes at least one run context; callers in this file validate that beforehand.
| | | 287 | | var taskType = runContexts[0].RunMetadata.TaskType ?? throw new InvalidOperationException("Run metadata missing |
| | | 288 | | var primaryMetricName = ResolvePrimaryMetricName(taskType); |
// Every summary uses the task type of the first run, not the run's own TaskType value.
| | | 289 | | var runSummaries = runContexts.Select(context => new PreparedExperimentAnalysisRun( |
| | | 290 | | context.DatasetRun.Name, |
| | | 291 | | context.DatasetRun.Id, |
| | | 292 | | taskType, |
| | | 293 | | ResolveRunDisplayName(context.RunMetadata, context.DatasetRun.Name), |
| | | 294 | | context.RunMetadata.PromptKey, |
| | | 295 | | context.RunMetadata.PromptSource, |
| | | 296 | | context.RunMetadata.LangfusePromptName, |
| | | 297 | | context.RunMetadata.LangfusePromptLabel, |
| | | 298 | | context.RunMetadata.LangfusePromptVersion, |
| | | 299 | | context.RunMetadata.ReasoningEffort, |
| | | 300 | | context.RunMetadata.MaxOutputTokenCount, |
| | | 301 | | context.RunMetadata.SliceKind, |
| | | 302 | | context.RunMetadata.SliceKey, |
| | | 303 | | context.RunMetadata.SourcePoolKey, |
| | | 304 | | context.RunMetadata.SelectedItemIdsHash, |
| | | 305 | | context.RunMetadata.SelectedItemIdsCount, |
| | | 306 | | context.RunMetadata.SampleSize, |
| | | 307 | | context.RunMetadata.MatchCount, |
| | | 308 | | context.RunMetadata.Repetitions, |
| | | 309 | | context.RunMetadata.Parallelism, |
| | | 310 | | context.RunMetadata.EvaluationTimestampPolicyKey, |
| | | 311 | | context.RunMetadata.EvaluationTime, |
| | | 312 | | context.RunMetadata.StartedAtUtc, |
| | | 313 | | context.RunMetadata.BatchStrategy, |
| | | 314 | | context.RunMetadata.BatchSize, |
| | | 315 | | context.RunMetadata.BatchCount, |
| | | 316 | | context.AggregateScores, |
| | | 317 | | ResolvePrimaryMetricValue(taskType, context.AggregateScores), |
| | | 318 | | context.DatasetRunItems.Count, |
| | | 319 | | context.RunMetadata.RunSubjectKind, |
| | | 320 | | context.RunMetadata.RunSubjectId, |
| | | 321 | | context.RunMetadata.RunSubjectDisplayName)) |
| | | 322 | | .OrderBy(run => run.RunName, StringComparer.Ordinal) |
| | | 323 | | .ToList(); |
| | | 324 | | |
// The timestamp is the time of this call (DateTimeOffset.UtcNow), so repeated exports differ in this field.
// Dataset metadata is only carried over when LangfuseJsonUtilities.IsDefined accepts it; otherwise default is stored.
| | | 325 | | return new PreparedExperimentAnalysisBundle( |
| | | 326 | | datasetName, |
| | | 327 | | taskType, |
| | | 328 | | primaryMetricName, |
| | | 329 | | ExperimentArtifactSupport.FormatStartedAtUtc(DateTimeOffset.UtcNow), |
| | | 330 | | dataset.Description, |
| | | 331 | | LangfuseJsonUtilities.IsDefined(dataset.Metadata) ? dataset.Metadata : default, |
| | | 332 | | runSummaries, |
| | | 333 | | rows); |
| | | 334 | | } |
| | | 335 | | |
/// <summary>
/// Builds one analysis row per dataset run item. Runs are visited in run-name order and,
/// within a run, items are visited in dataset-item-id order. For each item the prediction,
/// the expected output, the dataset item metadata, the Kicktipp points and the source
/// dataset item id are resolved and combined into a row.
/// </summary>
/// <param name="runContexts">Loaded runs.</param>
/// <param name="datasetItemsById">Dataset items keyed by id; must contain every referenced id.</param>
/// <param name="tracesById">Trace details keyed by id; must contain every referenced trace id.</param>
/// <returns>The rows in the iteration order described above.</returns>
// NOTE(review): several lines of this method are cut off in this view (the item ordering
// comparer and the trailing arguments of the Extract* calls), so their full argument lists
// cannot be confirmed from here.
| | | 336 | | private static List<PreparedExperimentAnalysisRow> BuildRows( |
| | | 337 | | IReadOnlyList<RunContext> runContexts, |
| | | 338 | | IReadOnlyDictionary<string, LangfuseDatasetItem> datasetItemsById, |
| | | 339 | | IReadOnlyDictionary<string, LangfuseTraceWithDetails> tracesById) |
| | | 340 | | { |
| | | 341 | | var rows = new List<PreparedExperimentAnalysisRow>(); |
| | | 342 | | |
| | | 343 | | foreach (var context in runContexts.OrderBy(context => context.DatasetRun.Name, StringComparer.Ordinal)) |
| | | 344 | | { |
| | | 345 | | foreach (var datasetRunItem in context.DatasetRunItems.OrderBy(item => item.DatasetItemId, StringComparer.Or |
| | | 346 | | { |
// Indexer lookups: a missing id surfaces as the dictionary's own lookup exception,
// not as a domain-specific error.
| | | 347 | | var datasetItem = datasetItemsById[datasetRunItem.DatasetItemId]; |
| | | 348 | | var trace = tracesById[datasetRunItem.TraceId]; |
// The selected observation may be null; its output is then passed on as null.
| | | 349 | | var predictionObservation = SelectPredictionObservation(trace, context.RunMetadata); |
| | | 350 | | var prediction = ExtractPrediction(trace.Output, predictionObservation?.Output, context.DatasetRun.Name, |
| | | 351 | | var expectedOutput = ExtractExpectedOutput(datasetItem.ExpectedOutput, context.DatasetRun.Name, datasetR |
| | | 352 | | var metadata = ExtractDatasetItemMetadata(datasetItem.Input, datasetItem.Metadata, datasetRunItem.Datase |
| | | 353 | | var kicktippPoints = CalculateKicktippPoints(prediction, expectedOutput); |
| | | 354 | | var sourceDatasetItemId = ResolveSourceDatasetItemId( |
| | | 355 | | context.RunMetadata, |
| | | 356 | | datasetRunItem.DatasetItemId, |
| | | 357 | | trace.Metadata, |
| | | 358 | | context.DatasetRun.Name); |
| | | 359 | | |
// Note that datasetRunItem.DatasetItemId is passed twice (first and eleventh argument).
| | | 360 | | rows.Add(new PreparedExperimentAnalysisRow( |
| | | 361 | | datasetRunItem.DatasetItemId, |
| | | 362 | | context.DatasetRun.Id, |
| | | 363 | | context.DatasetRun.Name, |
| | | 364 | | context.RunMetadata.TaskType ?? throw new InvalidOperationException($"Run '{context.DatasetRun.Name} |
| | | 365 | | ResolveRunDisplayName(context.RunMetadata, context.DatasetRun.Name), |
| | | 366 | | context.RunMetadata.PromptKey, |
| | | 367 | | context.RunMetadata.ReasoningEffort, |
| | | 368 | | context.RunMetadata.SliceKind, |
| | | 369 | | context.RunMetadata.SliceKey, |
| | | 370 | | context.RunMetadata.SourcePoolKey, |
| | | 371 | | datasetRunItem.DatasetItemId, |
| | | 372 | | sourceDatasetItemId, |
| | | 373 | | datasetRunItem.TraceId, |
| | | 374 | | predictionObservation?.Id, |
| | | 375 | | metadata.Matchday, |
| | | 376 | | metadata.HomeTeam, |
| | | 377 | | metadata.AwayTeam, |
| | | 378 | | metadata.StartsAt, |
| | | 379 | | metadata.TippSpielId, |
| | | 380 | | prediction.HomeGoals, |
| | | 381 | | prediction.AwayGoals, |
| | | 382 | | expectedOutput.HomeGoals, |
| | | 383 | | expectedOutput.AwayGoals, |
| | | 384 | | kicktippPoints, |
| | | 385 | | prediction.Status, |
| | | 386 | | context.RunMetadata.RunSubjectKind, |
| | | 387 | | context.RunMetadata.RunSubjectId, |
| | | 388 | | context.RunMetadata.RunSubjectDisplayName, |
| | | 389 | | metadata.FixtureIndex, |
| | | 390 | | metadata.RepetitionIndex)); |
| | | 391 | | } |
| | | 392 | | } |
| | | 393 | | |
| | | 394 | | return rows; |
| | | 395 | | } |
| | | 396 | | |
/// <summary>
/// Verifies that all runs can be compared with each other. The first run is the baseline;
/// every other run must have the same task type (case-insensitive), the same
/// SelectedItemIdsHash when both sides provide one (case-sensitive), and exactly the same
/// set of dataset item ids.
/// </summary>
/// <param name="runContexts">Loaded runs, baseline first.</param>
/// <exception cref="InvalidOperationException">
/// No run was supplied, a run lacks task type metadata, or a run differs from the baseline
/// in one of the compared aspects.
/// </exception>
| | | 397 | | private static void ValidateComparableRuns(IReadOnlyList<RunContext> runContexts) |
| | | 398 | | { |
| | | 399 | | if (runContexts.Count < 1) |
| | | 400 | | { |
| | | 401 | | throw new InvalidOperationException("At least one run is required to export an experiment analysis bundle.") |
| | | 402 | | } |
| | | 403 | | |
| | | 404 | | var baseline = runContexts[0]; |
| | | 405 | | var baselineTaskType = baseline.RunMetadata.TaskType ?? throw new InvalidOperationException( |
| | | 406 | | $"Run '{baseline.DatasetRun.Name}' is missing task type metadata."); |
// Sorted once so that each candidate can be compared with SequenceEqual.
| | | 407 | | var baselineItemIds = baseline.DatasetRunItems |
| | | 408 | | .Select(item => item.DatasetItemId) |
| | | 409 | | .OrderBy(item => item, StringComparer.Ordinal) |
| | | 410 | | .ToArray(); |
| | | 411 | | |
| | | 412 | | foreach (var candidate in runContexts.Skip(1)) |
| | | 413 | | { |
| | | 414 | | var taskType = candidate.RunMetadata.TaskType ?? throw new InvalidOperationException( |
| | | 415 | | $"Run '{candidate.DatasetRun.Name}' is missing task type metadata."); |
| | | 416 | | if (!string.Equals(baselineTaskType, taskType, StringComparison.OrdinalIgnoreCase)) |
| | | 417 | | { |
| | | 418 | | throw new InvalidOperationException( |
| | | 419 | | $"Run '{candidate.DatasetRun.Name}' has task type '{taskType}', but expected '{baselineTaskType}'.") |
| | | 420 | | } |
| | | 421 | | |
// The hash check is skipped when either side has no (or only a blank) hash.
| | | 422 | | if (!string.IsNullOrWhiteSpace(baseline.RunMetadata.SelectedItemIdsHash) |
| | | 423 | | && !string.IsNullOrWhiteSpace(candidate.RunMetadata.SelectedItemIdsHash) |
| | | 424 | | && !string.Equals( |
| | | 425 | | baseline.RunMetadata.SelectedItemIdsHash, |
| | | 426 | | candidate.RunMetadata.SelectedItemIdsHash, |
| | | 427 | | StringComparison.Ordinal)) |
| | | 428 | | { |
| | | 429 | | throw new InvalidOperationException( |
| | | 430 | | $"Run '{candidate.DatasetRun.Name}' has selectedItemIdsHash '{candidate.RunMetadata.SelectedItemIdsH |
| | | 431 | | } |
| | | 432 | | |
| | | 433 | | var candidateItemIds = candidate.DatasetRunItems |
| | | 434 | | .Select(item => item.DatasetItemId) |
| | | 435 | | .OrderBy(item => item, StringComparer.Ordinal) |
| | | 436 | | .ToArray(); |
| | | 437 | | |
// The item id check always runs, independent of the hash check above.
| | | 438 | | if (!baselineItemIds.SequenceEqual(candidateItemIds, StringComparer.Ordinal)) |
| | | 439 | | { |
| | | 440 | | throw new InvalidOperationException( |
| | | 441 | | $"Run '{candidate.DatasetRun.Name}' does not contain the same prepared dataset item set as '{baselin |
| | | 442 | | } |
| | | 443 | | } |
| | | 444 | | } |
| | | 445 | | |
/// <summary>
/// Rejects a run in which the same dataset item id occurs on more than one run item.
/// </summary>
/// <param name="datasetRunItems">Run items of a single run.</param>
/// <param name="runName">Run name used in the error message.</param>
/// <exception cref="InvalidOperationException">A dataset item id occurs more than once; only the first duplicate found is reported.</exception>
| | | 446 | | private static void EnsureDistinctDatasetItems( |
| | | 447 | | IReadOnlyList<LangfuseDatasetRunItem> datasetRunItems, |
| | | 448 | | string runName) |
| | | 449 | | { |
// Key of the first group with more than one member, or null when all ids are unique.
| | | 450 | | var duplicateItemId = datasetRunItems |
| | | 451 | | .GroupBy(item => item.DatasetItemId, StringComparer.Ordinal) |
| | | 452 | | .FirstOrDefault(group => group.Count() > 1) |
| | | 453 | | ?.Key; |
| | | 454 | | |
| | | 455 | | if (duplicateItemId is not null) |
| | | 456 | | { |
| | | 457 | | throw new InvalidOperationException( |
| | | 458 | | $"Run '{runName}' contains duplicate dataset item id '{duplicateItemId}', which is not supported for com |
| | | 459 | | } |
| | | 460 | | } |
| | | 461 | | |
/// <summary>
/// Gathers every item of the given dataset run by walking the offset-paginated run-item
/// listing (page size argument 100) until the pagination helper yields no further items.
/// </summary>
/// <param name="datasetRun">Run whose DatasetId and Name identify the listing to page through.</param>
/// <param name="cancellationToken">Token handed to the pagination helper.</param>
/// <returns>All run items in the order the listing produced them.</returns>
private async Task<IReadOnlyList<LangfuseDatasetRunItem>> ListAllDatasetRunItemsAsync(
    LangfuseDatasetRunWithItems datasetRun,
    CancellationToken cancellationToken)
{
    const int itemsPerPage = 100;

    // The lambda is invoked by the pagination helper with the page number to fetch.
    var runItemStream = EnumerateOffsetPaginatedItemsAsync(
        (pageNumber, token) => _langfuseClient.ListDatasetRunItemsAsync(
            datasetRun.DatasetId,
            datasetRun.Name,
            pageNumber,
            itemsPerPage,
            token),
        cancellationToken);

    List<LangfuseDatasetRunItem> collected = [];
    await foreach (var runItem in runItemStream)
    {
        collected.Add(runItem);
    }

    return collected;
}
| | | 483 | | |
/// <summary>
/// Loads all scores attached to a dataset run (offset pagination, page size 100) and picks
/// the scores named "total_kicktipp_points" and "avg_kicktipp_points" from them.
/// </summary>
/// <param name="datasetRunId">Id of the dataset run whose scores are listed.</param>
/// <param name="runMetadata">Run metadata; only StartedAtUtc is read, for the error message.</param>
/// <param name="cancellationToken">Token passed to the pagination helper.</param>
/// <returns>The total and average score values of the run.</returns>
/// <exception cref="InvalidOperationException">One of the two scores is absent or has no value.</exception>
| | | 484 | | private async Task<ExperimentAggregateScores> LoadAggregateScoresAsync( |
| | | 485 | | string datasetRunId, |
| | | 486 | | PreparedExperimentRunMetadata runMetadata, |
| | | 487 | | CancellationToken cancellationToken) |
| | | 488 | | { |
| | | 489 | | const int pageSize = 100; |
| | | 490 | | var scores = new List<LangfuseScore>(); |
| | | 491 | | |
| | | 492 | | await foreach (var score in EnumerateOffsetPaginatedItemsAsync( |
| | | 493 | | (page, ct) => _langfuseClient.ListScoresAsync( |
| | | 494 | | new LangfuseListScoresRequest(DatasetRunId: datasetRunId, Page: page, Limit: pageSize), |
| | | 495 | | ct), |
| | | 496 | | cancellationToken)) |
| | | 497 | | { |
| | | 498 | | scores.Add(score); |
| | | 499 | | } |
| | | 500 | | |
// First score with a matching name wins. The string comparison mode is cut off in this view.
| | | 501 | | var total = scores.FirstOrDefault(score => string.Equals(score.Name, "total_kicktipp_points", StringComparison.O |
| | | 502 | | var average = scores.FirstOrDefault(score => string.Equals(score.Name, "avg_kicktipp_points", StringComparison.O |
| | | 503 | | |
| | | 504 | | if (total?.Value is null || average?.Value is null) |
| | | 505 | | { |
// NOTE(review): despite its name, this label is the run's StartedAtUtc value (or the run id
// when that is blank), not the run name - the run name is not available in this method.
| | | 506 | | var runName = string.IsNullOrWhiteSpace(runMetadata.StartedAtUtc) |
| | | 507 | | ? datasetRunId |
| | | 508 | | : runMetadata.StartedAtUtc; |
| | | 509 | | throw new InvalidOperationException( |
| | | 510 | | $"Dataset run '{runName}' is missing one or more aggregate Langfuse scores (expected total_kicktipp_poin |
| | | 511 | | ); |
| | | 512 | | } |
| | | 513 | | |
| | | 514 | | return new ExperimentAggregateScores(total.Value.Value, average.Value.Value); |
| | | 515 | | } |
| | | 516 | | |
/// <summary>
/// Turns a page-number based fetch function into a flat asynchronous item sequence by
/// building an offset-based pagination handler around it.
/// </summary>
/// <param name="pageRetriever">Fetches the page with the given page number.</param>
/// <param name="cancellationToken">Token passed to the pagination handler.</param>
/// <returns>All items of all pages, as produced by the pagination handler.</returns>
// NOTE(review): the builder's type arguments and the second OffsetState argument are cut off
// in this view; how the handler decides that the last page was reached is not visible here.
| | | 517 | | private IAsyncEnumerable<TItem> EnumerateOffsetPaginatedItemsAsync<TItem>( |
| | | 518 | | Func<int, CancellationToken, Task<LangfusePaginatedResponse<TItem>>> pageRetriever, |
| | | 519 | | CancellationToken cancellationToken) |
| | | 520 | | { |
| | | 521 | | var paginationHandler = new OffsetBasedComposite.PaginationHandlerBuilder<LangfusePaginatedResponse<TItem>, TIte |
// The next page number is derived from the previous page (1 when there is none).
| | | 522 | | .WithPageRetriever((previousPage, ct) => pageRetriever(GetNextPageNumber(previousPage), ct)) |
// Offset = Page * Limit, computed in a checked context so that overflow throws instead of wrapping.
| | | 523 | | .WithOffsetStateExtractor(static page => new OffsetState<int>(checked(page.Meta.Page * page.Meta.Limit), pag |
| | | 524 | | .WithItemExtractor(static page => page.Data) |
| | | 525 | | .Build(); |
| | | 526 | | |
| | | 527 | | return paginationHandler.GetAllItemsAsync(cancellationToken); |
| | | 528 | | } |
| | | 529 | | |
/// <summary>
/// Turns a cursor based fetch function into a flat asynchronous item sequence by building a
/// cursor-based pagination handler around it.
/// </summary>
/// <param name="pageRetriever">Fetches a page for the given cursor; null is passed for the first page.</param>
/// <param name="cancellationToken">Token passed to the pagination handler.</param>
/// <returns>All items of all pages, as produced by the pagination handler.</returns>
// NOTE(review): the builder's type arguments are cut off in this view.
| | | 530 | | private IAsyncEnumerable<TItem> EnumerateCursorPaginatedItemsAsync<TItem>( |
| | | 531 | | Func<string?, CancellationToken, Task<LangfuseCursorPaginatedResponse<TItem>>> pageRetriever, |
| | | 532 | | CancellationToken cancellationToken) |
| | | 533 | | { |
| | | 534 | | var paginationHandler = new CursorBasedComposite.PaginationHandlerBuilder<LangfuseCursorPaginatedResponse<TItem> |
// The cursor reported by the previous page is forwarded to the next request.
| | | 535 | | .WithPageRetriever((previousPage, ct) => pageRetriever(previousPage?.Meta.Cursor, ct)) |
| | | 536 | | .WithCursorExtractor(static page => page.Meta.Cursor) |
| | | 537 | | .WithItemExtractor(static page => page.Data) |
| | | 538 | | .Build(); |
| | | 539 | | |
| | | 540 | | return paginationHandler.GetAllItemsAsync(cancellationToken); |
| | | 541 | | } |
| | | 542 | | |
/// <summary>
/// Determines which page number to request next: 1 when no page has been fetched yet,
/// otherwise the page number reported by the previous page plus one.
/// </summary>
/// <param name="previousPage">The page fetched last, or null before the first request.</param>
/// <returns>The page number for the next request.</returns>
private static int GetNextPageNumber<TItem>(LangfusePaginatedResponse<TItem>? previousPage)
{
    if (previousPage is not { } lastPage)
    {
        return 1;
    }

    return lastPage.Meta.Page + 1;
}
| | | 547 | | |
/// <summary>
/// Converts the raw JSON metadata of a dataset run into PreparedExperimentRunMetadata.
/// </summary>
/// <param name="metadata">Raw run metadata.</param>
/// <param name="runName">Run name used in error messages.</param>
/// <returns>The deserialized metadata; never null.</returns>
/// <exception cref="InvalidOperationException">
/// The metadata is not defined (per LangfuseJsonUtilities.IsDefined), deserializes to null,
/// or is not valid for the target type (the JsonException is kept as inner exception).
/// </exception>
| | | 548 | | private static PreparedExperimentRunMetadata DeserializeRunMetadata(JsonElement metadata, string runName) |
| | | 549 | | { |
| | | 550 | | if (!LangfuseJsonUtilities.IsDefined(metadata)) |
| | | 551 | | { |
| | | 552 | | throw new InvalidOperationException($"Dataset run '{runName}' is missing run metadata."); |
| | | 553 | | } |
| | | 554 | | |
| | | 555 | | try |
| | | 556 | | { |
| | | 557 | | var deserialized = metadata.Deserialize<PreparedExperimentRunMetadata>(PreparedExperimentCommandSupport.Json |
| | | 558 | | return deserialized ?? throw new InvalidOperationException($"Dataset run '{runName}' metadata could not be d |
| | | 559 | | } |
// Only JSON errors are translated; the InvalidOperationException thrown above passes through unchanged.
| | | 560 | | catch (JsonException ex) |
| | | 561 | | { |
| | | 562 | | throw new InvalidOperationException($"Dataset run '{runName}' metadata could not be deserialized.", ex); |
| | | 563 | | } |
| | | 564 | | } |
| | | 565 | | |
/// <summary>
/// Extracts the prediction for a trace. The trace output is tried first; the observation
/// output is only consulted when the trace output yields no prediction.
/// </summary>
/// <param name="traceOutput">Output of the trace.</param>
/// <param name="observationOutput">Output of the selected observation, or null when there is none.</param>
/// <param name="runName">Run name used in the error message.</param>
/// <param name="traceId">Trace id used in the error message.</param>
/// <returns>The parsed prediction.</returns>
/// <exception cref="InvalidOperationException">Neither output could be parsed into a prediction.</exception>
| | | 566 | | private static PredictionOutput ExtractPrediction( |
| | | 567 | | JsonElement traceOutput, |
| | | 568 | | JsonElement? observationOutput, |
| | | 569 | | string runName, |
| | | 570 | | string traceId) |
| | | 571 | | { |
| | | 572 | | if (TryExtractPredictionOutput(traceOutput, out var prediction)) |
| | | 573 | | { |
| | | 574 | | return prediction; |
| | | 575 | | } |
| | | 576 | | |
// Fallback: unwrap the nullable observation output and try it with the same parser.
| | | 577 | | if (observationOutput is JsonElement candidate && TryExtractPredictionOutput(candidate, out prediction)) |
| | | 578 | | { |
| | | 579 | | return prediction; |
| | | 580 | | } |
| | | 581 | | |
| | | 582 | | throw new InvalidOperationException( |
| | | 583 | | $"Run '{runName}' trace '{traceId}' does not expose a parseable prediction payload in trace or observation o |
| | | 584 | | } |
| | | 585 | | |
/// <summary>
/// Reads the expected scoreline of a dataset item. Two JSON object shapes are accepted:
/// integer properties "homeGoals" and "awayGoals", or a string property "score" that
/// TryParseScoreString can parse. The integer form takes precedence.
/// </summary>
/// <returns>The expected home and away goals.</returns>
/// <exception cref="InvalidOperationException">The value is not an object or matches neither shape.</exception>
| | | 586 | | private static ExpectedOutput ExtractExpectedOutput(JsonElement expectedOutput, string runName, string datasetItemId |
| | | 587 | | { |
| | | 588 | | if (expectedOutput.ValueKind == JsonValueKind.Object) |
| | | 589 | | { |
// Shape 1: both goal counts present and readable as 32-bit integers.
| | | 590 | | if (expectedOutput.TryGetProperty("homeGoals", out var homeGoals) |
| | | 591 | | && expectedOutput.TryGetProperty("awayGoals", out var awayGoals) |
| | | 592 | | && homeGoals.TryGetInt32(out var home) |
| | | 593 | | && awayGoals.TryGetInt32(out var away)) |
| | | 594 | | { |
| | | 595 | | return new ExpectedOutput(home, away); |
| | | 596 | | } |
| | | 597 | | |
// Shape 2: a textual score; the accepted text format is defined by TryParseScoreString.
| | | 598 | | if (expectedOutput.TryGetProperty("score", out var score) |
| | | 599 | | && score.ValueKind == JsonValueKind.String |
| | | 600 | | && TryParseScoreString(score.GetString(), out var parsed)) |
| | | 601 | | { |
| | | 602 | | return parsed; |
| | | 603 | | } |
| | | 604 | | } |
| | | 605 | | |
| | | 606 | | throw new InvalidOperationException( |
| | | 607 | | $"Run '{runName}' dataset item '{datasetItemId}' does not expose a parseable expected scoreline."); |
| | | 608 | | } |
| | | 609 | | |
/// <summary>
/// Collects the match descriptors of a dataset item. "homeTeam", "awayTeam" and "matchday"
/// are required and read from the item metadata, "startsAt" is required and read from the
/// item input, and "tippSpielId", "fixtureIndex" and "repetitionIndex" are optional
/// metadata properties.
/// </summary>
/// <returns>The collected values.</returns>
/// <exception cref="InvalidOperationException">The metadata is not a JSON object.</exception>
// NOTE(review): only metadata is checked for being an object here; how the Get* helpers react
// to a missing property or a non-object input is defined outside this file section.
| | | 610 | | private static DatasetItemMetadata ExtractDatasetItemMetadata(JsonElement input, JsonElement metadata, string datase |
| | | 611 | | { |
| | | 612 | | if (metadata.ValueKind != JsonValueKind.Object) |
| | | 613 | | { |
| | | 614 | | throw new InvalidOperationException($"Dataset item '{datasetItemId}' metadata is missing or not an object.") |
| | | 615 | | } |
| | | 616 | | |
| | | 617 | | return new DatasetItemMetadata( |
| | | 618 | | GetRequiredStringProperty(metadata, "homeTeam", datasetItemId), |
| | | 619 | | GetRequiredStringProperty(metadata, "awayTeam", datasetItemId), |
| | | 620 | | GetRequiredIntProperty(metadata, "matchday", datasetItemId), |
// startsAt is the one value taken from the item input rather than from its metadata.
| | | 621 | | GetRequiredStringProperty(input, "startsAt", datasetItemId), |
| | | 622 | | GetOptionalStringProperty(metadata, "tippSpielId"), |
| | | 623 | | GetOptionalIntProperty(metadata, "fixtureIndex"), |
| | | 624 | | GetOptionalIntProperty(metadata, "repetitionIndex")); |
| | | 625 | | } |
| | | 626 | | |
/// <summary>
/// Scores a prediction against the expected result. Points are only computed when the
/// prediction status is "placed" (case-insensitive) and both predicted goal counts are
/// present; in every other case the result is 0.
/// </summary>
/// <param name="prediction">Prediction extracted from the trace or observation output.</param>
/// <param name="expectedOutput">Expected home and away goals.</param>
/// <returns>The Kicktipp points reported by PreparedExperimentSupport.CalculateScores, or 0.</returns>
private static int CalculateKicktippPoints(PredictionOutput prediction, ExpectedOutput expectedOutput)
{
    var wasPlaced = string.Equals(prediction.Status, "placed", StringComparison.OrdinalIgnoreCase);

    if (wasPlaced && prediction is { HomeGoals: int predictedHome, AwayGoals: int predictedAway })
    {
        var scores = PreparedExperimentSupport.CalculateScores(
            new Prediction(predictedHome, predictedAway),
            expectedOutput.HomeGoals,
            expectedOutput.AwayGoals);

        return scores.KicktippPoints;
    }

    // Not placed, or at least one goal count is missing.
    return 0;
}
| | | 641 | | |
/// <summary>
/// Picks the observation of a trace that most likely carries the prediction. The preferred
/// name is runMetadata.ObservationName when it is non-blank; otherwise it is
/// "community-match-prediction" for task type "community-to-date" (case-insensitive) and
/// "predict-match" for every other task type. If no observation has the preferred name, the
/// names "community-match-prediction" and "predict-match" are tried in that order, followed
/// by the first observation whose Type matches "GENERATION".
/// </summary>
/// <param name="trace">Trace whose observations are searched; a null observation list is treated as empty.</param>
/// <param name="runMetadata">Run metadata supplying ObservationName and TaskType.</param>
/// <returns>The selected observation; the nullable return type indicates that there may be none.</returns>
// NOTE(review): the comparison arguments on the four lookup lines are cut off in this view,
// so the case sensitivity of the name and type matches cannot be confirmed from here.
| | | 642 | | private static LangfuseObservationDetail? SelectPredictionObservation( |
| | | 643 | | LangfuseTraceWithDetails trace, |
| | | 644 | | PreparedExperimentRunMetadata runMetadata) |
| | | 645 | | { |
| | | 646 | | var observations = trace.Observations ?? []; |
| | | 647 | | var preferredObservationName = string.IsNullOrWhiteSpace(runMetadata.ObservationName) |
| | | 648 | | ? string.Equals(runMetadata.TaskType, "community-to-date", StringComparison.OrdinalIgnoreCase) |
| | | 649 | | ? "community-match-prediction" |
| | | 650 | | : "predict-match" |
| | | 651 | | : runMetadata.ObservationName; |
| | | 652 | | |
// Fallback chain: each lookup is only evaluated when the previous one found nothing.
| | | 653 | | return observations.FirstOrDefault(observation => string.Equals(observation.Name, preferredObservationName, Stri |
| | | 654 | | ?? observations.FirstOrDefault(observation => string.Equals(observation.Name, "community-match-prediction |
| | | 655 | | ?? observations.FirstOrDefault(observation => string.Equals(observation.Name, "predict-match", StringComp |
| | | 656 | | ?? observations.FirstOrDefault(observation => string.Equals(observation.Type, "GENERATION", StringCompari |
| | | 657 | | } |
| | | 658 | | |
/// <summary>
/// Maps a dataset item id used by a run back to the id of the source dataset item.
/// </summary>
/// <remarks>
/// Three sources are consulted in order: the id map in the run metadata (searched by value),
/// the well-known markers embedded in the id itself, and the "sourceDatasetItemId" entry of the trace metadata.
/// </remarks>
/// <param name="runMetadata">Run metadata carrying the dataset item id map.</param>
/// <param name="datasetItemId">Dataset item id as it appears in the run.</param>
/// <param name="traceMetadata">Metadata of the trace belonging to the run item.</param>
/// <param name="runName">Run name, used only in the error message.</param>
/// <returns>The resolved source dataset item id.</returns>
/// <exception cref="InvalidOperationException">None of the three sources yields an id.</exception>
private static string ResolveSourceDatasetItemId(
    PreparedExperimentRunMetadata runMetadata,
    string datasetItemId,
    JsonElement traceMetadata,
    string runName)
{
    // Reverse lookup: only the first entry whose value equals the run item id is considered.
    foreach (var mapping in runMetadata.DatasetItemIdMap)
    {
        if (!string.Equals(mapping.Value, datasetItemId, StringComparison.Ordinal))
        {
            continue;
        }

        if (!string.IsNullOrWhiteSpace(mapping.Key))
        {
            return mapping.Key;
        }

        // A blank key on the first match falls through to the remaining strategies.
        break;
    }

    if (TryDeriveSourceDatasetItemId(datasetItemId, out var derivedId))
    {
        return derivedId;
    }

    var traceSuppliedId = GetOptionalStringProperty(traceMetadata, "sourceDatasetItemId");
    if (string.IsNullOrWhiteSpace(traceSuppliedId))
    {
        throw new InvalidOperationException(
            $"Run '{runName}' dataset item '{datasetItemId}' could not be mapped back to a source dataset item id.");
    }

    return traceSuppliedId;
}
| | | 689 | | |
/// <summary>
/// Derives the source dataset item id from a run item id that embeds one of the known markers.
/// </summary>
/// <param name="datasetItemId">Dataset item id as it appears in the run.</param>
/// <param name="sourceDatasetItemId">
/// Everything before the first occurrence of the matched marker, or an empty string when no marker is present.
/// </param>
/// <returns><c>true</c> when a marker was found; otherwise <c>false</c>.</returns>
private static bool TryDeriveSourceDatasetItemId(string datasetItemId, out string sourceDatasetItemId)
{
    // Markers are tried by priority, not by their position inside the id.
    string[] markers = ["__repeated-match-slice__", "__repeated-match__", "__slice__"];

    foreach (var marker in markers)
    {
        var markerStart = datasetItemId.IndexOf(marker, StringComparison.Ordinal);
        if (markerStart < 0)
        {
            continue;
        }

        sourceDatasetItemId = datasetItemId.Substring(0, markerStart);
        return true;
    }

    sourceDatasetItemId = string.Empty;
    return false;
}
| | | 716 | | |
/// <summary>
/// Attempts to read a prediction from a JSON value.
/// </summary>
/// <remarks>
/// Shapes are tried in this order:
/// <list type="number">
/// <item>an object with a string "status" and both "homeGoals" and "awayGoals" present;</item>
/// <item>an object with a string "status" that normalizes to "missed" (goals may be absent);</item>
/// <item>an object with integer "homeGoals" and "awayGoals", treated as "placed";</item>
/// <item>a string of the form "home:away", treated as "placed";</item>
/// <item>a string containing JSON, which is parsed and processed recursively.</item>
/// </list>
/// </remarks>
/// <param name="value">JSON value to inspect.</param>
/// <param name="prediction">The extracted prediction, or <c>default</c> on failure.</param>
/// <returns><c>true</c> when a prediction could be extracted; otherwise <c>false</c>.</returns>
private static bool TryExtractPredictionOutput(JsonElement value, out PredictionOutput prediction)
{
    if (value.ValueKind == JsonValueKind.Object)
    {
        if (value.TryGetProperty("status", out var statusProperty)
            && statusProperty.ValueKind == JsonValueKind.String)
        {
            var status = NormalizePredictionStatus(statusProperty.GetString());
            var hasHomeGoals = TryGetNullableIntProperty(value, "homeGoals", out var parsedHomeGoals);
            var hasAwayGoals = TryGetNullableIntProperty(value, "awayGoals", out var parsedAwayGoals);

            if (hasHomeGoals && hasAwayGoals)
            {
                prediction = new PredictionOutput(status, parsedHomeGoals, parsedAwayGoals);
                return true;
            }

            // A missed prediction is valid even without goal properties.
            if (string.Equals(status, "missed", StringComparison.OrdinalIgnoreCase))
            {
                prediction = new PredictionOutput("missed", null, null);
                return true;
            }
        }

        // JsonElement.TryGetInt32 throws InvalidOperationException for any kind other than Number
        // (null, string, ...), so the kind is checked first to keep this method a true "Try".
        if (value.TryGetProperty("homeGoals", out var homeGoals)
            && value.TryGetProperty("awayGoals", out var awayGoals)
            && homeGoals.ValueKind == JsonValueKind.Number
            && awayGoals.ValueKind == JsonValueKind.Number
            && homeGoals.TryGetInt32(out var home)
            && awayGoals.TryGetInt32(out var away))
        {
            prediction = new PredictionOutput("placed", home, away);
            return true;
        }
    }

    if (value.ValueKind == JsonValueKind.String)
    {
        var raw = value.GetString();
        if (TryParseScoreString(raw, out var parsedScore))
        {
            prediction = new PredictionOutput("placed", parsedScore.HomeGoals, parsedScore.AwayGoals);
            return true;
        }

        if (!string.IsNullOrWhiteSpace(raw))
        {
            try
            {
                // The string may itself be serialized JSON; unwrap it and retry.
                using var document = JsonDocument.Parse(raw);
                if (TryExtractPredictionOutput(document.RootElement, out prediction))
                {
                    return true;
                }
            }
            catch (JsonException)
            {
                // Deliberately ignored: the text is not JSON, so extraction simply fails below.
            }
        }
    }

    prediction = default;
    return false;
}
| | | 779 | | |
/// <summary>
/// Attempts to read a property that holds either a 32-bit integer or JSON <c>null</c>.
/// </summary>
/// <param name="value">JSON object that may contain the property.</param>
/// <param name="propertyName">Name of the property to read.</param>
/// <param name="result">The integer value, or <c>null</c> when the property is JSON null or the read fails.</param>
/// <returns>
/// <c>true</c> when the property exists and is JSON null or a number that fits in an <see cref="int"/>;
/// <c>false</c> when <paramref name="value"/> is not an object, the property is missing, or it holds
/// anything else (string, boolean, fractional or out-of-range number, ...).
/// </returns>
private static bool TryGetNullableIntProperty(JsonElement value, string propertyName, out int? result)
{
    result = null;

    // TryGetProperty throws for non-object elements, so reject those up front.
    if (value.ValueKind != JsonValueKind.Object
        || !value.TryGetProperty(propertyName, out var property))
    {
        return false;
    }

    switch (property.ValueKind)
    {
        case JsonValueKind.Null:
            return true;

        // TryGetInt32 throws InvalidOperationException unless the element is a Number,
        // so it is only called after the kind has been confirmed.
        case JsonValueKind.Number when property.TryGetInt32(out var parsed):
            result = parsed;
            return true;

        default:
            return false;
    }
}
| | | 801 | | |
/// <summary>
/// Attempts to parse a score written as "home:away" (for example "2:1").
/// </summary>
/// <param name="value">Text to parse; whitespace around each side of the colon is ignored.</param>
/// <param name="expectedOutput">The parsed score, or <c>default</c> on failure.</param>
/// <returns>
/// <c>true</c> when the text contains exactly one colon and both sides parse as invariant-culture
/// integers; otherwise <c>false</c>.
/// </returns>
private static bool TryParseScoreString(string? value, out ExpectedOutput expectedOutput)
{
    expectedOutput = default;

    // Exactly two colon-separated parts, both of which must be integers.
    if (!string.IsNullOrWhiteSpace(value)
        && value.Split(':', StringSplitOptions.TrimEntries) is [var homeText, var awayText]
        && int.TryParse(homeText, NumberStyles.Integer, CultureInfo.InvariantCulture, out var home)
        && int.TryParse(awayText, NumberStyles.Integer, CultureInfo.InvariantCulture, out var away))
    {
        expectedOutput = new ExpectedOutput(home, away);
        return true;
    }

    return false;
}
| | | 825 | | |
/// <summary>
/// Collapses a raw status value onto the two statuses used by the export.
/// </summary>
/// <param name="status">Raw status text; may be <c>null</c>.</param>
/// <returns>"missed" when the input equals "missed" ignoring case; "placed" for everything else, including <c>null</c>.</returns>
private static string NormalizePredictionStatus(string? status)
{
    if ("missed".Equals(status, StringComparison.OrdinalIgnoreCase))
    {
        return "missed";
    }

    return "placed";
}
| | | 832 | | |
/// <summary>
/// Determines the display name under which a run appears in the export.
/// </summary>
/// <param name="runMetadata">Run metadata holding the explicit display name, model and reasoning effort.</param>
/// <param name="runName">Run name, used only in the error message.</param>
/// <returns>
/// The explicit run subject display name when it is not blank; otherwise the name built by
/// <c>PreparedExperimentSupport.BuildRunSubjectDisplayName</c> from the model and reasoning effort.
/// </returns>
/// <exception cref="InvalidOperationException">Both the display name and the model are blank.</exception>
private static string ResolveRunDisplayName(PreparedExperimentRunMetadata runMetadata, string runName)
{
    var explicitName = runMetadata.RunSubjectDisplayName;
    if (!string.IsNullOrWhiteSpace(explicitName))
    {
        return explicitName;
    }

    if (string.IsNullOrWhiteSpace(runMetadata.Model))
    {
        throw new InvalidOperationException($"Run '{runName}' is missing comparable display metadata.");
    }

    return PreparedExperimentSupport.BuildRunSubjectDisplayName(
        runMetadata.Model,
        runMetadata.ReasoningEffort);
}
| | | 849 | | |
/// <summary>
/// Returns the name of the primary metric for a task type.
/// </summary>
/// <param name="taskType">Task type of the run.</param>
/// <returns>
/// "avg_kicktipp_points" when <c>IsAveragePrimaryMetricTask</c> reports <c>true</c> for the task type;
/// otherwise "total_kicktipp_points".
/// </returns>
private static string ResolvePrimaryMetricName(string taskType)
{
    if (IsAveragePrimaryMetricTask(taskType))
    {
        return "avg_kicktipp_points";
    }

    return "total_kicktipp_points";
}
| | | 856 | | |
/// <summary>
/// Returns the value of the primary metric for a task type.
/// </summary>
/// <param name="taskType">Task type of the run.</param>
/// <param name="aggregateScores">Aggregate scores of the run.</param>
/// <returns>
/// The average Kicktipp points when <c>IsAveragePrimaryMetricTask</c> reports <c>true</c> for the task
/// type; otherwise the total Kicktipp points.
/// </returns>
private static double ResolvePrimaryMetricValue(string taskType, ExperimentAggregateScores aggregateScores) =>
    IsAveragePrimaryMetricTask(taskType)
        ? aggregateScores.AvgKicktippPoints
        : aggregateScores.TotalKicktippPoints;
| | | 863 | | |
/// <summary>
/// Tells whether a task type uses the average as its primary metric.
/// </summary>
/// <param name="taskType">Task type of the run.</param>
/// <returns>
/// <c>true</c> for "repeated-match" and "repeated-match-slice" (compared ignoring case); otherwise <c>false</c>.
/// </returns>
private static bool IsAveragePrimaryMetricTask(string taskType)
{
    string[] averagedTaskTypes = ["repeated-match", "repeated-match-slice"];

    foreach (var candidate in averagedTaskTypes)
    {
        if (string.Equals(taskType, candidate, StringComparison.OrdinalIgnoreCase))
        {
            return true;
        }
    }

    return false;
}
| | | 869 | | |
/// <summary>
/// Determines the absolute path of the file the analysis is written to.
/// </summary>
/// <remarks>
/// When <c>settings.OutputPath</c> is not blank it is used as given, converted to a full path.
/// Otherwise a default path is built from the segments "artifacts", "langfuse-experiments", "analysis",
/// the task type, the dataset name split on '/' (segments trimmed, empty ones dropped) and a file name
/// starting with "comparison-" followed by the current UTC time formatted as "yyyy-MM-ddTHH-mm-ssZ".
/// NOTE(review): the end of the timestamp expression and the file extension are cut off in this view - confirm.
/// </remarks>
/// <param name="settings">Command settings providing the optional explicit output path.</param>
/// <param name="taskType">Task type, used as a directory segment of the default path.</param>
/// <param name="datasetName">Dataset name whose '/'-separated parts become directory segments.</param>
/// <returns>The full output path produced by <c>Path.GetFullPath</c>.</returns>
| | | 870 | | private static string ResolveOutputPath( |
| | | 871 | | ExportExperimentAnalysisSettings settings, |
| | | 872 | | string taskType, |
| | | 873 | | string datasetName) |
| | | 874 | | { |
| | | 875 | | if (!string.IsNullOrWhiteSpace(settings.OutputPath)) |
| | | 876 | | { |
| | | 877 | | return Path.GetFullPath(settings.OutputPath); |
| | | 878 | | } |
| | | 879 | | |
| | | 880 | | var timestamp = DateTimeOffset.UtcNow.ToString("yyyy-MM-ddTHH-mm-ssZ", CultureInfo.InvariantCulture).ToLowerInva |
| | | 881 | | var datasetSegments = datasetName.Split('/', StringSplitOptions.RemoveEmptyEntries) |
| | | 882 | | .Select(segment => segment.Trim()) |
| | | 883 | | .Where(segment => segment.Length > 0) |
| | | 884 | | .ToList(); |
| | | 885 | | |
| | | 886 | | return Path.GetFullPath(Path.Combine( |
| | | 887 | | ["artifacts", "langfuse-experiments", "analysis", taskType, .. datasetSegments, $"comparison-{timestamp}.jso |
| | | 888 | | } |
| | | 889 | | |
/// <summary>
/// Reads a mandatory, non-blank string property from a JSON object.
/// </summary>
/// <param name="metadata">JSON element expected to be an object containing the property.</param>
/// <param name="propertyName">Name of the property to read.</param>
/// <param name="datasetItemId">Dataset item id, used only in the error message.</param>
/// <returns>The property value.</returns>
/// <exception cref="InvalidOperationException">
/// <paramref name="metadata"/> is not an object, the property is missing, it is not a string, or it is blank.
/// </exception>
private static string GetRequiredStringProperty(JsonElement metadata, string propertyName, string datasetItemId)
{
    // The object check keeps TryGetProperty from throwing its own, less helpful
    // InvalidOperationException when the element is null/undefined/an array.
    if (metadata.ValueKind == JsonValueKind.Object
        && metadata.TryGetProperty(propertyName, out var property)
        && property.ValueKind == JsonValueKind.String
        && property.GetString() is { } text
        && !string.IsNullOrWhiteSpace(text))
    {
        return text;
    }

    throw new InvalidOperationException($"Dataset item '{datasetItemId}' metadata is missing '{propertyName}'.");
}
| | | 901 | | |
/// <summary>
/// Reads a mandatory 32-bit integer property from a JSON object.
/// </summary>
/// <param name="metadata">JSON element expected to be an object containing the property.</param>
/// <param name="propertyName">Name of the property to read.</param>
/// <param name="datasetItemId">Dataset item id, used only in the error message.</param>
/// <returns>The property value.</returns>
/// <exception cref="InvalidOperationException">
/// <paramref name="metadata"/> is not an object, the property is missing, or it is not a number that
/// fits in an <see cref="int"/>.
/// </exception>
private static int GetRequiredIntProperty(JsonElement metadata, string propertyName, string datasetItemId)
{
    // Both kind checks prevent the JSON API from throwing its own InvalidOperationException
    // (TryGetProperty on a non-object, TryGetInt32 on a non-number) before the descriptive one below.
    if (metadata.ValueKind == JsonValueKind.Object
        && metadata.TryGetProperty(propertyName, out var property)
        && property.ValueKind == JsonValueKind.Number
        && property.TryGetInt32(out var value))
    {
        return value;
    }

    throw new InvalidOperationException($"Dataset item '{datasetItemId}' metadata is missing '{propertyName}'.");
}
| | | 912 | | |
/// <summary>
/// Reads an optional string property from a JSON element.
/// </summary>
/// <param name="metadata">JSON element that may be an object containing the property.</param>
/// <param name="propertyName">Name of the property to read.</param>
/// <returns>
/// The property value, or <c>null</c> when <paramref name="metadata"/> is not an object, the property
/// is missing, it is not a string, or it is blank.
/// </returns>
private static string? GetOptionalStringProperty(JsonElement metadata, string propertyName)
{
    if (metadata.ValueKind != JsonValueKind.Object
        || !metadata.TryGetProperty(propertyName, out var property)
        || property.ValueKind != JsonValueKind.String)
    {
        return null;
    }

    var text = property.GetString();
    return string.IsNullOrWhiteSpace(text) ? null : text;
}
| | | 922 | | |
/// <summary>
/// Reads an optional 32-bit integer property from a JSON element.
/// </summary>
/// <param name="metadata">JSON element that may be an object containing the property.</param>
/// <param name="propertyName">Name of the property to read.</param>
/// <returns>
/// The property value, or <c>null</c> when <paramref name="metadata"/> is not an object, the property
/// is missing, or it is not a number that fits in an <see cref="int"/>.
/// </returns>
private static int? GetOptionalIntProperty(JsonElement metadata, string propertyName)
{
    // TryGetInt32 throws InvalidOperationException for non-number elements (string, null, ...),
    // so the kind is verified first; an optional getter must answer null instead of crashing.
    if (metadata.ValueKind == JsonValueKind.Object
        && metadata.TryGetProperty(propertyName, out var property)
        && property.ValueKind == JsonValueKind.Number
        && property.TryGetInt32(out var value))
    {
        return value;
    }

    return null;
}
| | | 931 | | |
/// <summary>
/// Groups the data held for one run: the dataset run, its run metadata, its run items and its aggregate scores.
/// </summary>
| | 1 | 932 | | private sealed record RunContext( |
| | 1 | 933 | | LangfuseDatasetRunWithItems DatasetRun, |
| | 1 | 934 | | PreparedExperimentRunMetadata RunMetadata, |
| | 1 | 935 | | IReadOnlyList<LangfuseDatasetRunItem> DatasetRunItems, |
| | 1 | 936 | | ExperimentAggregateScores AggregateScores); |
| | | 937 | | |
/// <summary>
/// A prediction read from a run output: a status ("placed" or "missed" as produced by the extraction
/// helpers in this class) plus the predicted goals, which are <c>null</c> when no score is available.
/// </summary>
| | | 938 | | private readonly record struct PredictionOutput(string Status, int? HomeGoals, int? AwayGoals); |
| | | 939 | | |
/// <summary>
/// A score expressed as home goals and away goals; also used as the result type when parsing "home:away" text.
/// </summary>
| | | 940 | | private readonly record struct ExpectedOutput(int HomeGoals, int AwayGoals); |
| | | 941 | | |
/// <summary>
/// Descriptive fields of a dataset item. In this class they are populated from the item's JSON:
/// "homeTeam", "awayTeam" and "matchday" are mandatory metadata entries, "startsAt" is a mandatory
/// input entry, and "tippSpielId", "fixtureIndex" and "repetitionIndex" are optional metadata entries
/// that are <c>null</c> when absent.
/// </summary>
| | | 942 | | private readonly record struct DatasetItemMetadata( |
| | | 943 | | string HomeTeam, |
| | | 944 | | string AwayTeam, |
| | | 945 | | int Matchday, |
| | | 946 | | string StartsAt, |
| | | 947 | | string? TippSpielId, |
| | | 948 | | int? FixtureIndex, |
| | | 949 | | int? RepetitionIndex); |
| | | 950 | | } |