| | | 1 | | using System.ComponentModel; |
| | | 2 | | using Spectre.Console; |
| | | 3 | | using Spectre.Console.Cli; |
| | | 4 | | |
| | | 5 | | namespace Orchestrator.Commands.Observability.Experiments; |
| | | 6 | | |
| | | 7 | | public abstract class RunExperimentSettingsBase : CommandSettings |
| | | 8 | | { |
// Values accepted for --reasoning-effort. Membership checks use ordinal
// (case-sensitive) comparison.
private static readonly HashSet<string> AllowedReasoningEfforts = new HashSet<string>(
    new[] { "none", "minimal", "low", "medium", "high", "xhigh" },
    StringComparer.Ordinal);
| | | 18 | | |
/// <summary>
/// Model to execute for the experiment run (positional argument 0, <c>&lt;MODEL&gt;</c>).
/// <c>ValidateCommon</c> rejects a null, empty, or whitespace-only value.
/// </summary>
[CommandArgument(0, "<MODEL>")]
[Description("The model to execute for the experiment run")]
public string Model { get; set; } = string.Empty;

/// <summary>
/// Path to the prepared experiment manifest JSON file (<c>--manifest</c>).
/// Declared as an option, but <c>ValidateCommon</c> treats it as required.
/// </summary>
[CommandOption("--manifest")]
[Description("Path to the prepared experiment manifest JSON file")]
public string ManifestPath { get; set; } = string.Empty;

/// <summary>
/// Langfuse dataset run name (<c>--run-name</c>).
/// Declared as an option, but <c>ValidateCommon</c> treats it as required.
/// </summary>
[CommandOption("--run-name")]
[Description("Langfuse dataset run name")]
public string RunName { get; set; } = string.Empty;

/// <summary>
/// Optional Langfuse dataset run description (<c>--run-description</c>); null when omitted.
/// </summary>
[CommandOption("--run-description")]
[Description("Optional Langfuse dataset run description")]
public string? RunDescription { get; set; }
| | | 34 | | |
| | | 35 | | [CommandOption("--run-metadata-file")] |
| | | 36 | | [Description("Optional path to an experiment run metadata JSON file. When omitted, metadata is built from the manife |
| | | 37 | | public string? RunMetadataFile { get; set; } |
| | | 38 | | |
/// <summary>
/// Prompt variant identifier used in run metadata and trace tags (<c>--prompt-key</c>).
/// Defaults to <c>prompt-v1</c>; <c>ValidateCommon</c> rejects an empty or whitespace-only value.
/// </summary>
[CommandOption("--prompt-key")]
[Description("Prompt variant identifier used in run metadata and trace tags")]
[DefaultValue("prompt-v1")]
public string PromptKey { get; set; } = "prompt-v1";

/// <summary>
/// Prompt source for experiment predictions (<c>--prompt-source</c>), defaulting to <c>local</c>.
/// The value is trimmed and lower-cased before it is compared, and only
/// <c>local</c> and <c>langfuse</c> pass validation.
/// </summary>
[CommandOption("--prompt-source")]
[Description("Prompt source for experiment predictions: local or langfuse")]
[DefaultValue("local")]
public string PromptSource { get; set; } = "local";

/// <summary>
/// Langfuse hosted prompt name (<c>--langfuse-prompt-name</c>).
/// Required when the prompt source is <c>langfuse</c>; rejected by validation for any other source.
/// </summary>
[CommandOption("--langfuse-prompt-name")]
[Description("Langfuse hosted prompt name when --prompt-source langfuse is used")]
public string? LangfusePromptName { get; set; }

/// <summary>
/// Langfuse hosted prompt label (<c>--langfuse-prompt-label</c>), defaulting to <c>production</c>.
/// <c>CreateRunOptions</c> forwards it only when the prompt source is <c>langfuse</c>.
/// </summary>
[CommandOption("--langfuse-prompt-label")]
[Description("Langfuse hosted prompt label when --prompt-source langfuse is used")]
[DefaultValue("production")]
public string? LangfusePromptLabel { get; set; } = "production";

/// <summary>
/// Optional Langfuse hosted prompt version (<c>--langfuse-prompt-version</c>).
/// Must be at least 1 when the prompt source is <c>langfuse</c>; supplying it with any
/// other source fails validation.
/// </summary>
[CommandOption("--langfuse-prompt-version")]
[Description("Optional Langfuse hosted prompt version when --prompt-source langfuse is used")]
public int? LangfusePromptVersion { get; set; }
| | | 61 | | |
| | | 62 | | [CommandOption("--reasoning-effort")] |
| | | 63 | | [Description("Optional OpenAI reasoning effort for experiment predictions: none, minimal, low, medium, high, or xhig |
| | | 64 | | public string? ReasoningEffort { get; set; } |
| | | 65 | | |
/// <summary>
/// Maximum OpenAI output tokens per prediction (<c>--max-output-tokens</c>).
/// <c>ValidateCommon</c> rejects values below 1; null means the option was not supplied.
/// </summary>
/// <remarks>
/// NOTE(review): the help text advertises a default of 10000, but no default is applied
/// in this class — presumably the consumer of the run options applies it; confirm.
/// </remarks>
[CommandOption("--max-output-tokens")]
[Description("Maximum OpenAI output tokens per prediction. Defaults to 10000")]
public int? MaxOutputTokenCount { get; set; }

/// <summary>
/// Selects the justification prompt variant when reconstructing historical prompts
/// (<c>--include-justification</c>). Validation rejects it together with the
/// <c>langfuse</c> prompt source.
/// </summary>
[CommandOption("--include-justification")]
[Description("Use the justification prompt variant when reconstructing historical prompts")]
[DefaultValue(false)]
public bool IncludeJustification { get; set; }
| | | 74 | | |
| | | 75 | | [CommandOption("--evaluation-time")] |
| | | 76 | | [Description("Optional exact evaluation time in NodaTime invariant ZonedDateTime 'G' format, for example '2026-03-15 |
| | | 77 | | public string? EvaluationTime { get; set; } |
| | | 78 | | |
| | | 79 | | [CommandOption("--evaluation-policy-kind")] |
| | | 80 | | [Description("Optional evaluation policy kind. Defaults to 'relative' when no run metadata file or exact evaluation |
| | | 81 | | public string? EvaluationPolicyKind { get; set; } |
| | | 82 | | |
| | | 83 | | [CommandOption("--evaluation-policy-offset")] |
| | | 84 | | [Description("Optional evaluation policy offset. Defaults to '-12:00:00' when no run metadata file or exact evaluati |
| | | 85 | | public string? EvaluationPolicyOffset { get; set; } |
| | | 86 | | |
/// <summary>
/// Optional hosted dataset name override (<c>--dataset-name</c>); null when omitted.
/// </summary>
[CommandOption("--dataset-name")]
[Description("Optional hosted dataset name override")]
public string? DatasetName { get; set; }

/// <summary>
/// When set, an existing dataset run with the same name is deleted before starting
/// (<c>--replace-run</c>).
/// </summary>
/// <remarks>
/// NOTE(review): this flag is not passed through <c>CreateRunOptions</c>; presumably the
/// command reads it from the settings directly — confirm.
/// </remarks>
[CommandOption("--replace-run")]
[Description("Delete an existing dataset run with the same name before starting")]
[DefaultValue(false)]
public bool ReplaceRun { get; set; }
| | | 95 | | |
| | | 96 | | protected ValidationResult ValidateCommon() |
| | | 97 | | { |
| | | 98 | | if (string.IsNullOrWhiteSpace(Model)) |
| | | 99 | | { |
| | | 100 | | return ValidationResult.Error("Model is required"); |
| | | 101 | | } |
| | | 102 | | |
| | | 103 | | if (string.IsNullOrWhiteSpace(ManifestPath)) |
| | | 104 | | { |
| | | 105 | | return ValidationResult.Error("--manifest is required"); |
| | | 106 | | } |
| | | 107 | | |
| | | 108 | | if (string.IsNullOrWhiteSpace(RunName)) |
| | | 109 | | { |
| | | 110 | | return ValidationResult.Error("--run-name is required"); |
| | | 111 | | } |
| | | 112 | | |
| | | 113 | | if (string.IsNullOrWhiteSpace(PromptKey)) |
| | | 114 | | { |
| | | 115 | | return ValidationResult.Error("--prompt-key must be a non-empty string"); |
| | | 116 | | } |
| | | 117 | | |
| | | 118 | | if (!string.IsNullOrWhiteSpace(ReasoningEffort)) |
| | | 119 | | { |
| | | 120 | | var normalizedReasoningEffort = ReasoningEffort.Trim().ToLowerInvariant(); |
| | | 121 | | if (!AllowedReasoningEfforts.Contains(normalizedReasoningEffort)) |
| | | 122 | | { |
| | | 123 | | return ValidationResult.Error("--reasoning-effort must be one of: none, minimal, low, medium, high, xhig |
| | | 124 | | } |
| | | 125 | | |
| | | 126 | | ReasoningEffort = normalizedReasoningEffort; |
| | | 127 | | } |
| | | 128 | | |
| | | 129 | | if (MaxOutputTokenCount is < 1) |
| | | 130 | | { |
| | | 131 | | return ValidationResult.Error("--max-output-tokens must be at least 1 when provided"); |
| | | 132 | | } |
| | | 133 | | |
| | | 134 | | var normalizedPromptSource = PromptSource.Trim().ToLowerInvariant(); |
| | | 135 | | if (normalizedPromptSource is not ("local" or "langfuse")) |
| | | 136 | | { |
| | | 137 | | return ValidationResult.Error("--prompt-source must be either 'local' or 'langfuse'"); |
| | | 138 | | } |
| | | 139 | | |
| | | 140 | | if (normalizedPromptSource == "langfuse") |
| | | 141 | | { |
| | | 142 | | if (IncludeJustification) |
| | | 143 | | { |
| | | 144 | | return ValidationResult.Error("--prompt-source langfuse does not support --include-justification in this |
| | | 145 | | } |
| | | 146 | | |
| | | 147 | | if (string.IsNullOrWhiteSpace(LangfusePromptName)) |
| | | 148 | | { |
| | | 149 | | return ValidationResult.Error("--langfuse-prompt-name is required when --prompt-source langfuse is used" |
| | | 150 | | } |
| | | 151 | | |
| | | 152 | | if (LangfusePromptVersion is < 1) |
| | | 153 | | { |
| | | 154 | | return ValidationResult.Error("--langfuse-prompt-version must be at least 1 when provided"); |
| | | 155 | | } |
| | | 156 | | } |
| | | 157 | | else if (!string.IsNullOrWhiteSpace(LangfusePromptName) || LangfusePromptVersion is not null) |
| | | 158 | | { |
| | | 159 | | return ValidationResult.Error("Langfuse prompt options require --prompt-source langfuse"); |
| | | 160 | | } |
| | | 161 | | |
| | | 162 | | var hasEvaluationPolicyKind = !string.IsNullOrWhiteSpace(EvaluationPolicyKind); |
| | | 163 | | var hasEvaluationPolicyOffset = !string.IsNullOrWhiteSpace(EvaluationPolicyOffset); |
| | | 164 | | |
| | | 165 | | if (hasEvaluationPolicyKind != hasEvaluationPolicyOffset) |
| | | 166 | | { |
| | | 167 | | return ValidationResult.Error("--evaluation-policy-kind and --evaluation-policy-offset must be provided toge |
| | | 168 | | } |
| | | 169 | | |
| | | 170 | | if (!string.IsNullOrWhiteSpace(EvaluationTime) && hasEvaluationPolicyKind) |
| | | 171 | | { |
| | | 172 | | return ValidationResult.Error("--evaluation-time cannot be combined with --evaluation-policy-kind/--evaluati |
| | | 173 | | } |
| | | 174 | | |
| | | 175 | | if (!string.IsNullOrWhiteSpace(EvaluationTime)) |
| | | 176 | | { |
| | | 177 | | try |
| | | 178 | | { |
| | | 179 | | _ = Commands.Observability.EvaluationTimeParser.Parse(EvaluationTime); |
| | | 180 | | } |
| | | 181 | | catch (ArgumentException ex) |
| | | 182 | | { |
| | | 183 | | return ValidationResult.Error(ex.Message); |
| | | 184 | | } |
| | | 185 | | } |
| | | 186 | | |
| | | 187 | | if (hasEvaluationPolicyKind) |
| | | 188 | | { |
| | | 189 | | try |
| | | 190 | | { |
| | | 191 | | _ = Commands.Observability.EvaluationTimestampPolicyParser.Parse(EvaluationPolicyKind, EvaluationPolicyO |
| | | 192 | | } |
| | | 193 | | catch (ArgumentException ex) |
| | | 194 | | { |
| | | 195 | | return ValidationResult.Error(ex.Message); |
| | | 196 | | } |
| | | 197 | | } |
| | | 198 | | |
| | | 199 | | return ValidationResult.Success(); |
| | | 200 | | } |
| | | 201 | | |
/// <summary>
/// Builds the <see cref="PreparedExperimentRunOptions"/> for this invocation from the
/// parsed settings plus the batching parameters chosen by the derived command.
/// </summary>
/// <param name="batchStrategy">Batch strategy identifier forwarded unchanged.</param>
/// <param name="batchSize">Optional batch size forwarded unchanged.</param>
/// <param name="batchCount">Optional batch count forwarded unchanged.</param>
/// <param name="parallelism">Optional parallelism forwarded unchanged.</param>
/// <returns>A new options instance populated from the current property values.</returns>
/// <remarks>
/// The prompt source is trimmed and lower-cased here. <see cref="ReasoningEffort"/> is
/// forwarded as-is, so it is only normalized if <c>ValidateCommon</c> ran beforehand.
/// </remarks>
private protected PreparedExperimentRunOptions CreateRunOptions(
    string batchStrategy,
    int? batchSize = null,
    int? batchCount = null,
    int? parallelism = null)
{
    var promptSource = PromptSource.Trim().ToLowerInvariant();

    // Langfuse prompt coordinates are only meaningful for the hosted prompt source;
    // for any other source they are passed as null.
    string? promptName = null;
    string? promptLabel = null;
    int? promptVersion = null;
    if (promptSource == "langfuse")
    {
        promptName = LangfusePromptName;
        promptLabel = LangfusePromptLabel;
        promptVersion = LangfusePromptVersion;
    }

    return new PreparedExperimentRunOptions(
        Model,
        PromptKey,
        IncludeJustification,
        EvaluationTime,
        EvaluationPolicyKind,
        EvaluationPolicyOffset,
        DatasetName,
        promptSource,
        promptName,
        promptLabel,
        promptVersion,
        batchStrategy,
        batchSize,
        batchCount,
        ReasoningEffort,
        MaxOutputTokenCount,
        parallelism);
}
| | | 232 | | } |
| | | 233 | | |
/// <summary>
/// Settings for an experiment run command that accepts an optional
/// <c>--batch-size</c> override on top of the shared experiment options.
/// </summary>
public sealed class RunSliceSettings : RunExperimentSettingsBase
{
    /// <summary>Optional batch size override; must be at least 1 when supplied.</summary>
    [CommandOption("--batch-size")]
    [Description("Optional batch size override")]
    public int? BatchSize { get; set; }

    /// <summary>
    /// Runs the shared validation first and returns its failure unchanged; otherwise
    /// checks that <see cref="BatchSize"/>, when provided, is positive.
    /// </summary>
    /// <returns>The first validation error encountered, or success.</returns>
    public override ValidationResult Validate()
    {
        var sharedResult = ValidateCommon();
        if (sharedResult.Successful == false)
        {
            return sharedResult;
        }

        var batchSizeIsInvalid = BatchSize.HasValue && BatchSize.Value < 1;
        return batchSizeIsInvalid
            ? ValidationResult.Error("--batch-size must be at least 1 when provided")
            : ValidationResult.Success();
    }

    /// <summary>
    /// Converts these settings into run options using the <c>simple-batched</c> strategy
    /// and the optional batch size.
    /// </summary>
    internal PreparedExperimentRunOptions ToRunOptions() =>
        CreateRunOptions("simple-batched", BatchSize);
}