| | | 1 | | using System.Globalization; |
| | | 2 | | using System.Text; |
| | | 3 | | using System.Text.RegularExpressions; |
| | | 4 | | using CsvHelper; |
| | | 5 | | using CsvHelper.Configuration; |
| | | 6 | | using DuckDB.NET.Data; |
| | | 7 | | using EHonda.KicktippAi.Core; |
| | | 8 | | using Microsoft.Extensions.Logging; |
| | | 9 | | |
| | | 10 | | namespace Orchestrator.Commands.Operations.CollectContext; |
| | | 11 | | |
| | | 12 | | public interface IWm26LineupSource |
| | | 13 | | { |
| | | 14 | | Task<Wm26LineupCollection> CollectAsync( |
| | | 15 | | Wm26LineupSourceRequest request, |
| | | 16 | | CancellationToken cancellationToken = default); |
| | | 17 | | } |
| | | 18 | | |
| | | 19 | | internal interface IWm26TransfermarktDuckDbProvider |
| | | 20 | | { |
| | | 21 | | Task<string> GetDatabasePathAsync( |
| | | 22 | | string? configuredPath, |
| | | 23 | | CancellationToken cancellationToken = default); |
| | | 24 | | } |
| | | 25 | | |
| | | 26 | | internal sealed class Wm26LineupSource : IWm26LineupSource |
| | | 27 | | { |
| | | 28 | | private const string MissingValue = "N/A"; |
| | | 29 | | |
| | | 30 | | private static readonly IReadOnlyList<string> RequiredSeedColumns = |
| | | 31 | | [ |
| | | 32 | | "Team_Slug", |
| | | 33 | | "Team", |
| | | 34 | | "Data_Collected_At", |
| | | 35 | | "Role", |
| | | 36 | | "Name", |
| | | 37 | | "Transfermarkt_National_Team_Id", |
| | | 38 | | "Transfermarkt_Player_Id" |
| | | 39 | | ]; |
| | | 40 | | |
| | | 41 | | private static readonly IReadOnlyList<string> OutputColumns = |
| | | 42 | | [ |
| | | 43 | | "Team", |
| | | 44 | | "Data_Collected_At", |
| | | 45 | | "Role", |
| | | 46 | | "Name", |
| | | 47 | | "Age", |
| | | 48 | | "Position", |
| | | 49 | | "Market_Value_EUR" |
| | | 50 | | ]; |
| | | 51 | | |
| | | 52 | | private static readonly Regex NonAlphanumericRegex = new("[^a-z0-9]+", RegexOptions.Compiled); |
| | | 53 | | |
| | | 54 | | private readonly IWm26TransfermarktDuckDbProvider _duckDbProvider; |
| | | 55 | | |
| | | 56 | | public Wm26LineupSource(IWm26TransfermarktDuckDbProvider duckDbProvider) |
| | | 57 | | { |
| | | 58 | | _duckDbProvider = duckDbProvider; |
| | | 59 | | } |
| | | 60 | | |
| | | 61 | | public async Task<Wm26LineupCollection> CollectAsync( |
| | | 62 | | Wm26LineupSourceRequest request, |
| | | 63 | | CancellationToken cancellationToken = default) |
| | | 64 | | { |
| | | 65 | | ArgumentNullException.ThrowIfNull(request); |
| | | 66 | | |
| | | 67 | | var seedPath = ResolvePath(request.SeedPath); |
| | | 68 | | var teamsPath = ResolvePath(request.TeamsPath); |
| | | 69 | | var teams = ReadTeamManifest(teamsPath); |
| | | 70 | | var seedRows = ReadSeedRows(seedPath); |
| | | 71 | | var databasePath = await _duckDbProvider.GetDatabasePathAsync(request.DuckDbPath, cancellationToken); |
| | | 72 | | |
| | | 73 | | var enrichedRows = EnrichRows(seedRows, databasePath); |
| | | 74 | | var groupedRows = GroupRowsByManifest(teams, enrichedRows); |
| | | 75 | | ValidateCoaches(groupedRows); |
| | | 76 | | |
| | | 77 | | var contextDocuments = groupedRows |
| | | 78 | | .Select(entry => new Wm26LineupDocument( |
| | | 79 | | $"lineup-{entry.Team.Slug}.csv", |
| | | 80 | | RenderCsv(entry.Rows), |
| | | 81 | | entry.Team.Name, |
| | | 82 | | entry.Rows.Count(row => string.Equals(row.Role, "Player", StringComparison.Ordinal)), |
| | | 83 | | entry.Rows.Count == 0)) |
| | | 84 | | .ToList(); |
| | | 85 | | |
| | | 86 | | var aggregateRows = groupedRows.SelectMany(entry => entry.Rows); |
| | | 87 | | var kpiContent = RenderCsv(aggregateRows); |
| | | 88 | | var headerOnlyTeams = contextDocuments |
| | | 89 | | .Where(document => document.IsHeaderOnly) |
| | | 90 | | .Select(document => new Wm26LineupTeam(GetSlugFromDocumentName(document.DocumentName), document.TeamName)) |
| | | 91 | | .ToList(); |
| | | 92 | | var missingSourceData = BuildMissingSourceData(enrichedRows); |
| | | 93 | | |
| | | 94 | | return new Wm26LineupCollection( |
| | | 95 | | seedPath, |
| | | 96 | | teamsPath, |
| | | 97 | | databasePath, |
| | | 98 | | seedRows.Count, |
| | | 99 | | enrichedRows.Count, |
| | | 100 | | contextDocuments, |
| | | 101 | | kpiContent, |
| | | 102 | | headerOnlyTeams, |
| | | 103 | | missingSourceData); |
| | | 104 | | } |
| | | 105 | | |
| | | 106 | | private static string ResolvePath(string value) |
| | | 107 | | { |
| | | 108 | | ArgumentException.ThrowIfNullOrWhiteSpace(value); |
| | | 109 | | return Path.GetFullPath(value); |
| | | 110 | | } |
| | | 111 | | |
| | | 112 | | private static string GetSlugFromDocumentName(string documentName) |
| | | 113 | | { |
| | | 114 | | return documentName["lineup-".Length..^4]; |
| | | 115 | | } |
| | | 116 | | |
| | | 117 | | private static IReadOnlyList<Wm26LineupTeam> ReadTeamManifest(string teamsPath) |
| | | 118 | | { |
| | | 119 | | if (!File.Exists(teamsPath)) |
| | | 120 | | { |
| | | 121 | | throw new FileNotFoundException($"Team manifest CSV not found: {teamsPath}", teamsPath); |
| | | 122 | | } |
| | | 123 | | |
| | | 124 | | using var reader = new StreamReader(teamsPath, Encoding.UTF8, detectEncodingFromByteOrderMarks: true); |
| | | 125 | | using var csv = CreateReader(reader); |
| | | 126 | | csv.Read(); |
| | | 127 | | csv.ReadHeader(); |
| | | 128 | | ValidateColumns(csv, ["Team_Slug", "Team"], "Team manifest CSV"); |
| | | 129 | | |
| | | 130 | | var teams = new List<Wm26LineupTeam>(); |
| | | 131 | | var slugs = new HashSet<string>(StringComparer.OrdinalIgnoreCase); |
| | | 132 | | while (csv.Read()) |
| | | 133 | | { |
| | | 134 | | var lineNumber = csv.Context?.Parser?.Row ?? 0; |
| | | 135 | | var slug = GetTrimmedField(csv, "Team_Slug"); |
| | | 136 | | var team = GetTrimmedField(csv, "Team"); |
| | | 137 | | |
| | | 138 | | if (string.IsNullOrWhiteSpace(slug)) |
| | | 139 | | { |
| | | 140 | | throw new InvalidOperationException($"Team manifest line {lineNumber}: missing Team_Slug"); |
| | | 141 | | } |
| | | 142 | | |
| | | 143 | | if (string.IsNullOrWhiteSpace(team)) |
| | | 144 | | { |
| | | 145 | | throw new InvalidOperationException($"Team manifest line {lineNumber}: missing Team"); |
| | | 146 | | } |
| | | 147 | | |
| | | 148 | | if (!slugs.Add(slug)) |
| | | 149 | | { |
| | | 150 | | throw new InvalidOperationException($"Team manifest line {lineNumber}: duplicate Team_Slug {slug}"); |
| | | 151 | | } |
| | | 152 | | |
| | | 153 | | teams.Add(new Wm26LineupTeam(slug, team)); |
| | | 154 | | } |
| | | 155 | | |
| | | 156 | | if (teams.Count == 0) |
| | | 157 | | { |
| | | 158 | | throw new InvalidOperationException("Team manifest CSV has no team rows"); |
| | | 159 | | } |
| | | 160 | | |
| | | 161 | | return teams; |
| | | 162 | | } |
| | | 163 | | |
| | | 164 | | private static List<Wm26LineupSeedRow> ReadSeedRows(string seedPath) |
| | | 165 | | { |
| | | 166 | | if (!File.Exists(seedPath)) |
| | | 167 | | { |
| | | 168 | | throw new FileNotFoundException($"Lineup seed CSV not found: {seedPath}", seedPath); |
| | | 169 | | } |
| | | 170 | | |
| | | 171 | | using var reader = new StreamReader(seedPath, Encoding.UTF8, detectEncodingFromByteOrderMarks: true); |
| | | 172 | | using var csv = CreateReader(reader); |
| | | 173 | | csv.Read(); |
| | | 174 | | csv.ReadHeader(); |
| | | 175 | | ValidateColumns(csv, RequiredSeedColumns, "Lineup seed CSV"); |
| | | 176 | | |
| | | 177 | | var rows = new List<Wm26LineupSeedRow>(); |
| | | 178 | | while (csv.Read()) |
| | | 179 | | { |
| | | 180 | | var lineNumber = csv.Context?.Parser?.Row ?? 0; |
| | | 181 | | var row = new Wm26LineupSeedRow( |
| | | 182 | | GetTrimmedField(csv, "Team_Slug"), |
| | | 183 | | GetTrimmedField(csv, "Team"), |
| | | 184 | | GetTrimmedField(csv, "Data_Collected_At"), |
| | | 185 | | GetTrimmedField(csv, "Role"), |
| | | 186 | | GetTrimmedField(csv, "Name"), |
| | | 187 | | GetTrimmedField(csv, "Transfermarkt_National_Team_Id"), |
| | | 188 | | GetTrimmedField(csv, "Transfermarkt_Player_Id"), |
| | | 189 | | GetOptionalField(csv, "Age"), |
| | | 190 | | GetOptionalField(csv, "Position"), |
| | | 191 | | GetOptionalField(csv, "Market_Value_EUR")); |
| | | 192 | | |
| | | 193 | | ValidateSeedRow(row, lineNumber); |
| | | 194 | | rows.Add(row); |
| | | 195 | | } |
| | | 196 | | |
| | | 197 | | if (rows.Count == 0) |
| | | 198 | | { |
| | | 199 | | throw new InvalidOperationException("Lineup seed CSV has no lineup rows"); |
| | | 200 | | } |
| | | 201 | | |
| | | 202 | | return rows; |
| | | 203 | | } |
| | | 204 | | |
| | | 205 | | private static CsvReader CreateReader(TextReader reader) |
| | | 206 | | { |
| | | 207 | | return new CsvReader( |
| | | 208 | | reader, |
| | | 209 | | new CsvConfiguration(CultureInfo.InvariantCulture) |
| | | 210 | | { |
| | | 211 | | BadDataFound = null, |
| | | 212 | | MissingFieldFound = null, |
| | | 213 | | TrimOptions = TrimOptions.Trim |
| | | 214 | | }); |
| | | 215 | | } |
| | | 216 | | |
| | | 217 | | private static void ValidateColumns(CsvReader csv, IReadOnlyList<string> requiredColumns, string label) |
| | | 218 | | { |
| | | 219 | | var headers = csv.HeaderRecord ?? []; |
| | | 220 | | var missing = requiredColumns |
| | | 221 | | .Where(column => !headers.Contains(column, StringComparer.Ordinal)) |
| | | 222 | | .ToList(); |
| | | 223 | | |
| | | 224 | | if (missing.Count > 0) |
| | | 225 | | { |
| | | 226 | | throw new InvalidOperationException($"{label} is missing required column(s): {string.Join(", ", missing)}"); |
| | | 227 | | } |
| | | 228 | | } |
| | | 229 | | |
| | | 230 | | private static string GetTrimmedField(CsvReader csv, string column) |
| | | 231 | | { |
| | | 232 | | return (csv.GetField(column) ?? string.Empty).Trim(); |
| | | 233 | | } |
| | | 234 | | |
| | | 235 | | private static string GetOptionalField(CsvReader csv, string column) |
| | | 236 | | { |
| | | 237 | | var headers = csv.HeaderRecord ?? []; |
| | | 238 | | return headers.Contains(column, StringComparer.Ordinal) |
| | | 239 | | ? GetTrimmedField(csv, column) |
| | | 240 | | : string.Empty; |
| | | 241 | | } |
| | | 242 | | |
| | | 243 | | private static void ValidateSeedRow(Wm26LineupSeedRow row, int lineNumber) |
| | | 244 | | { |
| | | 245 | | foreach (var (column, value) in new[] |
| | | 246 | | { |
| | | 247 | | ("Team_Slug", row.TeamSlug), |
| | | 248 | | ("Team", row.Team), |
| | | 249 | | ("Data_Collected_At", row.DataCollectedAt), |
| | | 250 | | ("Role", row.Role) |
| | | 251 | | }) |
| | | 252 | | { |
| | | 253 | | if (string.IsNullOrWhiteSpace(value)) |
| | | 254 | | { |
| | | 255 | | throw new InvalidOperationException($"Line {lineNumber}: missing {column}"); |
| | | 256 | | } |
| | | 257 | | } |
| | | 258 | | |
| | | 259 | | if (!IsValidRole(row.Role)) |
| | | 260 | | { |
| | | 261 | | throw new InvalidOperationException($"Line {lineNumber}: unsupported Role {row.Role}"); |
| | | 262 | | } |
| | | 263 | | |
| | | 264 | | if (string.Equals(row.Role, "Player", StringComparison.Ordinal) |
| | | 265 | | && string.IsNullOrWhiteSpace(row.Name) |
| | | 266 | | && string.IsNullOrWhiteSpace(row.TransfermarktPlayerId)) |
| | | 267 | | { |
| | | 268 | | throw new InvalidOperationException($"Line {lineNumber}: Player row needs Name or Transfermarkt_Player_Id"); |
| | | 269 | | } |
| | | 270 | | |
| | | 271 | | ValidateAge(row.Age, lineNumber); |
| | | 272 | | ValidateMarketValue(row.MarketValueEur, row.Role, lineNumber); |
| | | 273 | | |
| | | 274 | | if (!DateOnly.TryParseExact( |
| | | 275 | | row.DataCollectedAt, |
| | | 276 | | "yyyy-MM-dd", |
| | | 277 | | CultureInfo.InvariantCulture, |
| | | 278 | | DateTimeStyles.None, |
| | | 279 | | out _)) |
| | | 280 | | { |
| | | 281 | | throw new InvalidOperationException( |
| | | 282 | | $"Line {lineNumber}: Data_Collected_At must use YYYY-MM-DD, got {row.DataCollectedAt}"); |
| | | 283 | | } |
| | | 284 | | } |
| | | 285 | | |
| | | 286 | | private static void ValidateOutputRow(Wm26LineupOutputRow row, int lineNumber) |
| | | 287 | | { |
| | | 288 | | foreach (var (column, value) in new[] |
| | | 289 | | { |
| | | 290 | | ("Team", row.Team), |
| | | 291 | | ("Data_Collected_At", row.DataCollectedAt), |
| | | 292 | | ("Role", row.Role), |
| | | 293 | | ("Name", row.Name), |
| | | 294 | | ("Position", row.Position) |
| | | 295 | | }) |
| | | 296 | | { |
| | | 297 | | if (string.IsNullOrWhiteSpace(value)) |
| | | 298 | | { |
| | | 299 | | throw new InvalidOperationException($"Line {lineNumber}: missing {column}"); |
| | | 300 | | } |
| | | 301 | | } |
| | | 302 | | |
| | | 303 | | if (string.Equals(row.Role, "Player", StringComparison.Ordinal) |
| | | 304 | | && string.IsNullOrWhiteSpace(row.Age)) |
| | | 305 | | { |
| | | 306 | | throw new InvalidOperationException($"Line {lineNumber}: missing Age"); |
| | | 307 | | } |
| | | 308 | | |
| | | 309 | | if (!IsValidRole(row.Role)) |
| | | 310 | | { |
| | | 311 | | throw new InvalidOperationException($"Line {lineNumber}: unsupported Role {row.Role}"); |
| | | 312 | | } |
| | | 313 | | |
| | | 314 | | ValidateAge(row.Age, lineNumber); |
| | | 315 | | ValidateMarketValue(row.MarketValueEur, row.Role, lineNumber); |
| | | 316 | | } |
| | | 317 | | |
| | | 318 | | private static bool IsValidRole(string role) |
| | | 319 | | { |
| | | 320 | | return string.Equals(role, "Player", StringComparison.Ordinal) |
| | | 321 | | || string.Equals(role, "Coach", StringComparison.Ordinal); |
| | | 322 | | } |
| | | 323 | | |
| | | 324 | | private static void ValidateAge(string age, int lineNumber) |
| | | 325 | | { |
| | | 326 | | if (!string.IsNullOrWhiteSpace(age) |
| | | 327 | | && !string.Equals(age, MissingValue, StringComparison.OrdinalIgnoreCase) |
| | | 328 | | && !age.All(char.IsDigit)) |
| | | 329 | | { |
| | | 330 | | throw new InvalidOperationException($"Line {lineNumber}: Age must be numeric or N/A when provided"); |
| | | 331 | | } |
| | | 332 | | } |
| | | 333 | | |
| | | 334 | | private static void ValidateMarketValue(string marketValue, string role, int lineNumber) |
| | | 335 | | { |
| | | 336 | | var normalized = marketValue.Replace(".", string.Empty, StringComparison.Ordinal); |
| | | 337 | | if (!string.IsNullOrWhiteSpace(marketValue) |
| | | 338 | | && !string.Equals(marketValue, MissingValue, StringComparison.OrdinalIgnoreCase) |
| | | 339 | | && !normalized.All(char.IsDigit)) |
| | | 340 | | { |
| | | 341 | | throw new InvalidOperationException( |
| | | 342 | | $"Line {lineNumber}: Market_Value_EUR must be numeric, N/A, or empty"); |
| | | 343 | | } |
| | | 344 | | |
| | | 345 | | if (string.Equals(role, "Player", StringComparison.Ordinal) && normalized == "0") |
| | | 346 | | { |
| | | 347 | | throw new InvalidOperationException( |
| | | 348 | | $"Line {lineNumber}: Market_Value_EUR must use N/A instead of 0 when unavailable"); |
| | | 349 | | } |
| | | 350 | | } |
| | | 351 | | |
| | | 352 | | private static List<Wm26LineupOutputRow> EnrichRows( |
| | | 353 | | IReadOnlyList<Wm26LineupSeedRow> seedRows, |
| | | 354 | | string databasePath) |
| | | 355 | | { |
| | | 356 | | using var connection = new DuckDBConnection($"Data Source={databasePath}"); |
| | | 357 | | connection.Open(); |
| | | 358 | | |
| | | 359 | | var enrichedRows = new List<Wm26LineupOutputRow>(); |
| | | 360 | | var errors = new List<string>(); |
| | | 361 | | for (var index = 0; index < seedRows.Count; index++) |
| | | 362 | | { |
| | | 363 | | var lineNumber = index + 2; |
| | | 364 | | var seedRow = seedRows[index]; |
| | | 365 | | try |
| | | 366 | | { |
| | | 367 | | var row = string.Equals(seedRow.Role, "Coach", StringComparison.Ordinal) |
| | | 368 | | ? EnrichCoachRow(connection, seedRow) |
| | | 369 | | : EnrichPlayerRow(connection, seedRow); |
| | | 370 | | ValidateOutputRow(row, lineNumber); |
| | | 371 | | enrichedRows.Add(row); |
| | | 372 | | } |
| | | 373 | | catch (Exception ex) when (ex is InvalidOperationException or FormatException) |
| | | 374 | | { |
| | | 375 | | errors.Add($"Line {lineNumber}: {ex.Message}"); |
| | | 376 | | } |
| | | 377 | | } |
| | | 378 | | |
| | | 379 | | if (errors.Count > 0) |
| | | 380 | | { |
| | | 381 | | throw new InvalidOperationException( |
| | | 382 | | "Lineup enrichment failed:" + Environment.NewLine + |
| | | 383 | | string.Join(Environment.NewLine, errors.Select(error => $"- {error}"))); |
| | | 384 | | } |
| | | 385 | | |
| | | 386 | | return enrichedRows; |
| | | 387 | | } |
| | | 388 | | |
| | | 389 | | private static Wm26LineupOutputRow EnrichCoachRow( |
| | | 390 | | DuckDBConnection connection, |
| | | 391 | | Wm26LineupSeedRow row) |
| | | 392 | | { |
| | | 393 | | var coachName = row.Name; |
| | | 394 | | if (string.IsNullOrWhiteSpace(coachName) && !string.IsNullOrWhiteSpace(row.TransfermarktNationalTeamId)) |
| | | 395 | | { |
| | | 396 | | coachName = GetCoachName(connection, row.TransfermarktNationalTeamId); |
| | | 397 | | } |
| | | 398 | | |
| | | 399 | | if (string.IsNullOrWhiteSpace(coachName)) |
| | | 400 | | { |
| | | 401 | | throw new InvalidOperationException( |
| | | 402 | | "Coach row needs Name or Transfermarkt_National_Team_Id with national_teams.coach_name"); |
| | | 403 | | } |
| | | 404 | | |
| | | 405 | | return new Wm26LineupOutputRow( |
| | | 406 | | row.TeamSlug, |
| | | 407 | | row.Team, |
| | | 408 | | row.DataCollectedAt, |
| | | 409 | | "Coach", |
| | | 410 | | coachName, |
| | | 411 | | row.Age, |
| | | 412 | | string.IsNullOrWhiteSpace(row.Position) ? "Coach" : row.Position, |
| | | 413 | | string.Empty); |
| | | 414 | | } |
| | | 415 | | |
| | | 416 | | private static Wm26LineupOutputRow EnrichPlayerRow( |
| | | 417 | | DuckDBConnection connection, |
| | | 418 | | Wm26LineupSeedRow row) |
| | | 419 | | { |
| | | 420 | | var player = ResolvePlayer(connection, row); |
| | | 421 | | if (player is null) |
| | | 422 | | { |
| | | 423 | | return new Wm26LineupOutputRow( |
| | | 424 | | row.TeamSlug, |
| | | 425 | | row.Team, |
| | | 426 | | row.DataCollectedAt, |
| | | 427 | | "Player", |
| | | 428 | | row.Name, |
| | | 429 | | ProvidedValue(row.Age) ?? MissingValue, |
| | | 430 | | ProvidedValue(row.Position) ?? MissingValue, |
| | | 431 | | ProvidedValue(row.MarketValueEur) ?? MissingValue); |
| | | 432 | | } |
| | | 433 | | |
| | | 434 | | var collectedAt = DateOnly.ParseExact(row.DataCollectedAt, "yyyy-MM-dd", CultureInfo.InvariantCulture); |
| | | 435 | | return new Wm26LineupOutputRow( |
| | | 436 | | row.TeamSlug, |
| | | 437 | | row.Team, |
| | | 438 | | row.DataCollectedAt, |
| | | 439 | | "Player", |
| | | 440 | | string.IsNullOrWhiteSpace(row.Name) ? player.Name : row.Name, |
| | | 441 | | ProvidedValue(row.Age) ?? CalculateAgeOrMissing(player.DateOfBirth, collectedAt), |
| | | 442 | | string.IsNullOrWhiteSpace(player.Position) ? ProvidedValue(row.Position) ?? MissingValue : player.Position, |
| | | 443 | | ProvidedValue(row.MarketValueEur) ?? FormatMarketValueOrMissing(player.MarketValueInEur)); |
| | | 444 | | } |
| | | 445 | | |
| | | 446 | | private static Wm26LineupPlayerRecord? ResolvePlayer( |
| | | 447 | | DuckDBConnection connection, |
| | | 448 | | Wm26LineupSeedRow row) |
| | | 449 | | { |
| | | 450 | | if (!string.IsNullOrWhiteSpace(row.TransfermarktPlayerId)) |
| | | 451 | | { |
| | | 452 | | return GetPlayerById(connection, row.TransfermarktPlayerId); |
| | | 453 | | } |
| | | 454 | | |
| | | 455 | | if (string.IsNullOrWhiteSpace(row.TransfermarktNationalTeamId)) |
| | | 456 | | { |
| | | 457 | | return null; |
| | | 458 | | } |
| | | 459 | | |
| | | 460 | | var candidates = GetPlayersByNationalTeamId(connection, row.TransfermarktNationalTeamId); |
| | | 461 | | var normalizedName = NormalizeName(row.Name); |
| | | 462 | | var matches = candidates |
| | | 463 | | .Where(candidate => NormalizeName(candidate.Name) == normalizedName) |
| | | 464 | | .ToList(); |
| | | 465 | | |
| | | 466 | | return matches.Count switch |
| | | 467 | | { |
| | | 468 | | 0 => null, |
| | | 469 | | 1 => matches[0], |
| | | 470 | | _ => throw new InvalidOperationException( |
| | | 471 | | $"Player {row.Name} matched multiple Transfermarkt players: {string.Join(", ", matches.Select(match => m |
| | | 472 | | }; |
| | | 473 | | } |
| | | 474 | | |
| | | 475 | | private static Wm26LineupPlayerRecord? GetPlayerById(DuckDBConnection connection, string playerId) |
| | | 476 | | { |
| | | 477 | | using var command = connection.CreateCommand(); |
| | | 478 | | command.CommandText = |
| | | 479 | | """ |
| | | 480 | | select |
| | | 481 | | player_id, |
| | | 482 | | name, |
| | | 483 | | date_of_birth, |
| | | 484 | | position, |
| | | 485 | | market_value_in_eur, |
| | | 486 | | current_national_team_id |
| | | 487 | | from players |
| | | 488 | | where cast(player_id as varchar) = $player_id |
| | | 489 | | """; |
| | | 490 | | command.Parameters.Add(new DuckDBParameter("player_id", playerId)); |
| | | 491 | | |
| | | 492 | | using var reader = command.ExecuteReader(); |
| | | 493 | | return reader.Read() ? ReadPlayer(reader) : null; |
| | | 494 | | } |
| | | 495 | | |
| | | 496 | | private static IReadOnlyList<Wm26LineupPlayerRecord> GetPlayersByNationalTeamId( |
| | | 497 | | DuckDBConnection connection, |
| | | 498 | | string nationalTeamId) |
| | | 499 | | { |
| | | 500 | | using var command = connection.CreateCommand(); |
| | | 501 | | command.CommandText = |
| | | 502 | | """ |
| | | 503 | | select |
| | | 504 | | player_id, |
| | | 505 | | name, |
| | | 506 | | date_of_birth, |
| | | 507 | | position, |
| | | 508 | | market_value_in_eur, |
| | | 509 | | current_national_team_id |
| | | 510 | | from players |
| | | 511 | | where cast(current_national_team_id as varchar) = $national_team_id |
| | | 512 | | """; |
| | | 513 | | command.Parameters.Add(new DuckDBParameter("national_team_id", nationalTeamId)); |
| | | 514 | | |
| | | 515 | | using var reader = command.ExecuteReader(); |
| | | 516 | | var players = new List<Wm26LineupPlayerRecord>(); |
| | | 517 | | while (reader.Read()) |
| | | 518 | | { |
| | | 519 | | players.Add(ReadPlayer(reader)); |
| | | 520 | | } |
| | | 521 | | |
| | | 522 | | return players; |
| | | 523 | | } |
| | | 524 | | |
| | | 525 | | private static string GetCoachName(DuckDBConnection connection, string nationalTeamId) |
| | | 526 | | { |
| | | 527 | | using var command = connection.CreateCommand(); |
| | | 528 | | command.CommandText = |
| | | 529 | | """ |
| | | 530 | | select coach_name |
| | | 531 | | from national_teams |
| | | 532 | | where cast(national_team_id as varchar) = $national_team_id |
| | | 533 | | """; |
| | | 534 | | command.Parameters.Add(new DuckDBParameter("national_team_id", nationalTeamId)); |
| | | 535 | | |
| | | 536 | | var value = command.ExecuteScalar(); |
| | | 537 | | return value is null or DBNull ? string.Empty : Convert.ToString(value, CultureInfo.InvariantCulture)?.Trim() ?? |
| | | 538 | | } |
| | | 539 | | |
| | | 540 | | private static Wm26LineupPlayerRecord ReadPlayer(System.Data.Common.DbDataReader reader) |
| | | 541 | | { |
| | | 542 | | return new Wm26LineupPlayerRecord( |
| | | 543 | | Convert.ToString(reader.GetValue(0), CultureInfo.InvariantCulture) ?? string.Empty, |
| | | 544 | | Convert.ToString(reader.GetValue(1), CultureInfo.InvariantCulture)?.Trim() ?? string.Empty, |
| | | 545 | | reader.IsDBNull(2) ? null : reader.GetValue(2), |
| | | 546 | | reader.IsDBNull(3) ? string.Empty : Convert.ToString(reader.GetValue(3), CultureInfo.InvariantCulture)?.Trim |
| | | 547 | | reader.IsDBNull(4) ? null : reader.GetValue(4), |
| | | 548 | | reader.IsDBNull(5) ? string.Empty : Convert.ToString(reader.GetValue(5), CultureInfo.InvariantCulture)?.Trim |
| | | 549 | | } |
| | | 550 | | |
| | | 551 | | private static string? ProvidedValue(string value) |
| | | 552 | | { |
| | | 553 | | return string.IsNullOrWhiteSpace(value) || string.Equals(value, MissingValue, StringComparison.OrdinalIgnoreCase |
| | | 554 | | ? null |
| | | 555 | | : value; |
| | | 556 | | } |
| | | 557 | | |
| | | 558 | | private static string CalculateAgeOrMissing(object? dateOfBirth, DateOnly collectedAt) |
| | | 559 | | { |
| | | 560 | | if (dateOfBirth is null or DBNull) |
| | | 561 | | { |
| | | 562 | | return MissingValue; |
| | | 563 | | } |
| | | 564 | | |
| | | 565 | | DateOnly born; |
| | | 566 | | if (dateOfBirth is DateTime dateTime) |
| | | 567 | | { |
| | | 568 | | born = DateOnly.FromDateTime(dateTime); |
| | | 569 | | } |
| | | 570 | | else if (dateOfBirth is DateOnly dateOnly) |
| | | 571 | | { |
| | | 572 | | born = dateOnly; |
| | | 573 | | } |
| | | 574 | | else |
| | | 575 | | { |
| | | 576 | | var text = Convert.ToString(dateOfBirth, CultureInfo.InvariantCulture)?.Trim(); |
| | | 577 | | if (string.IsNullOrWhiteSpace(text) |
| | | 578 | | || !DateOnly.TryParseExact( |
| | | 579 | | text[..Math.Min(text.Length, 10)], |
| | | 580 | | "yyyy-MM-dd", |
| | | 581 | | CultureInfo.InvariantCulture, |
| | | 582 | | DateTimeStyles.None, |
| | | 583 | | out born)) |
| | | 584 | | { |
| | | 585 | | return MissingValue; |
| | | 586 | | } |
| | | 587 | | } |
| | | 588 | | |
| | | 589 | | var age = collectedAt.Year - born.Year; |
| | | 590 | | if (collectedAt.Month < born.Month || (collectedAt.Month == born.Month && collectedAt.Day < born.Day)) |
| | | 591 | | { |
| | | 592 | | age--; |
| | | 593 | | } |
| | | 594 | | |
| | | 595 | | return age < 0 ? MissingValue : age.ToString(CultureInfo.InvariantCulture); |
| | | 596 | | } |
| | | 597 | | |
| | | 598 | | private static string FormatMarketValueOrMissing(object? value) |
| | | 599 | | { |
| | | 600 | | if (value is null or DBNull) |
| | | 601 | | { |
| | | 602 | | return MissingValue; |
| | | 603 | | } |
| | | 604 | | |
| | | 605 | | if (!long.TryParse(Convert.ToString(value, CultureInfo.InvariantCulture), NumberStyles.Integer, CultureInfo.Inva |
| | | 606 | | { |
| | | 607 | | return MissingValue; |
| | | 608 | | } |
| | | 609 | | |
| | | 610 | | return marketValue == 0 ? MissingValue : marketValue.ToString(CultureInfo.InvariantCulture); |
| | | 611 | | } |
| | | 612 | | |
| | | 613 | | private static IReadOnlyList<Wm26GroupedLineupRows> GroupRowsByManifest( |
| | | 614 | | IReadOnlyList<Wm26LineupTeam> teams, |
| | | 615 | | IReadOnlyList<Wm26LineupOutputRow> rows) |
| | | 616 | | { |
| | | 617 | | var grouped = teams |
| | | 618 | | .Select(team => new Wm26GroupedLineupRows(team, [])) |
| | | 619 | | .ToDictionary(entry => entry.Team.Slug, StringComparer.OrdinalIgnoreCase); |
| | | 620 | | |
| | | 621 | | foreach (var row in rows) |
| | | 622 | | { |
| | | 623 | | if (!grouped.TryGetValue(row.TeamSlug, out var group)) |
| | | 624 | | { |
| | | 625 | | group = new Wm26GroupedLineupRows(new Wm26LineupTeam(row.TeamSlug, row.Team), []); |
| | | 626 | | grouped[row.TeamSlug] = group; |
| | | 627 | | } |
| | | 628 | | |
| | | 629 | | group.Rows.Add(row); |
| | | 630 | | } |
| | | 631 | | |
| | | 632 | | return grouped.Values.ToList(); |
| | | 633 | | } |
| | | 634 | | |
| | | 635 | | private static void ValidateCoaches(IReadOnlyList<Wm26GroupedLineupRows> groupedRows) |
| | | 636 | | { |
| | | 637 | | var teamsWithoutCoach = groupedRows |
| | | 638 | | .Where(group => group.Rows.Count > 0 |
| | | 639 | | && group.Rows.All(row => !string.Equals(row.Role, "Coach", StringComparison.Ordinal))) |
| | | 640 | | .Select(group => group.Team.Slug) |
| | | 641 | | .ToList(); |
| | | 642 | | |
| | | 643 | | if (teamsWithoutCoach.Count > 0) |
| | | 644 | | { |
| | | 645 | | throw new InvalidOperationException( |
| | | 646 | | $"Lineup source has teams without Coach rows: {string.Join(", ", teamsWithoutCoach)}"); |
| | | 647 | | } |
| | | 648 | | } |
| | | 649 | | |
| | | 650 | | private static IReadOnlyList<Wm26LineupMissingData> BuildMissingSourceData( |
| | | 651 | | IReadOnlyList<Wm26LineupOutputRow> rows) |
| | | 652 | | { |
| | | 653 | | return rows |
| | | 654 | | .Where(row => string.Equals(row.Role, "Player", StringComparison.Ordinal)) |
| | | 655 | | .Select(row => new |
| | | 656 | | { |
| | | 657 | | Row = row, |
| | | 658 | | Fields = new[] { ("Age", row.Age), ("Position", row.Position), ("Market_Value_EUR", row.MarketValueEur) |
| | | 659 | | .Where(field => string.Equals(field.Item2, MissingValue, StringComparison.OrdinalIgnoreCase)) |
| | | 660 | | .Select(field => field.Item1) |
| | | 661 | | .ToList() |
| | | 662 | | }) |
| | | 663 | | .Where(entry => entry.Fields.Count > 0) |
| | | 664 | | .Select(entry => new Wm26LineupMissingData( |
| | | 665 | | entry.Row.TeamSlug, |
| | | 666 | | entry.Row.Team, |
| | | 667 | | entry.Row.Name, |
| | | 668 | | entry.Fields)) |
| | | 669 | | .ToList(); |
| | | 670 | | } |
| | | 671 | | |
| | | 672 | | private static string RenderCsv(IEnumerable<Wm26LineupOutputRow> rows) |
| | | 673 | | { |
| | | 674 | | using var writer = new StringWriter(CultureInfo.InvariantCulture); |
| | | 675 | | using var csv = new CsvWriter( |
| | | 676 | | writer, |
| | | 677 | | new CsvConfiguration(CultureInfo.InvariantCulture) |
| | | 678 | | { |
| | | 679 | | NewLine = "\r\n" |
| | | 680 | | }); |
| | | 681 | | |
| | | 682 | | foreach (var column in OutputColumns) |
| | | 683 | | { |
| | | 684 | | csv.WriteField(column); |
| | | 685 | | } |
| | | 686 | | |
| | | 687 | | csv.NextRecord(); |
| | | 688 | | |
| | | 689 | | foreach (var row in rows) |
| | | 690 | | { |
| | | 691 | | csv.WriteField(row.Team); |
| | | 692 | | csv.WriteField(row.DataCollectedAt); |
| | | 693 | | csv.WriteField(row.Role); |
| | | 694 | | csv.WriteField(row.Name); |
| | | 695 | | csv.WriteField(row.Age); |
| | | 696 | | csv.WriteField(row.Position); |
| | | 697 | | csv.WriteField(FormatMarketValueForOutput(row.MarketValueEur)); |
| | | 698 | | csv.NextRecord(); |
| | | 699 | | } |
| | | 700 | | |
| | | 701 | | return writer.ToString(); |
| | | 702 | | } |
| | | 703 | | |
| | | 704 | | private static string FormatMarketValueForOutput(string value) |
| | | 705 | | { |
| | | 706 | | if (string.IsNullOrWhiteSpace(value) || string.Equals(value, MissingValue, StringComparison.OrdinalIgnoreCase)) |
| | | 707 | | { |
| | | 708 | | return value; |
| | | 709 | | } |
| | | 710 | | |
| | | 711 | | var digits = value.Replace(".", string.Empty, StringComparison.Ordinal); |
| | | 712 | | return long.TryParse(digits, NumberStyles.Integer, CultureInfo.InvariantCulture, out var marketValue) |
| | | 713 | | ? marketValue.ToString("N0", CultureInfo.InvariantCulture).Replace(",", ".", StringComparison.Ordinal) |
| | | 714 | | : value; |
| | | 715 | | } |
| | | 716 | | |
| | | 717 | | private static string NormalizeName(string value) |
| | | 718 | | { |
| | | 719 | | var normalized = value.Normalize(NormalizationForm.FormKD); |
| | | 720 | | var builder = new StringBuilder(); |
| | | 721 | | |
| | | 722 | | foreach (var character in normalized) |
| | | 723 | | { |
| | | 724 | | var category = CharUnicodeInfo.GetUnicodeCategory(character); |
| | | 725 | | if (category != UnicodeCategory.NonSpacingMark) |
| | | 726 | | { |
| | | 727 | | builder.Append(char.ToLowerInvariant(character)); |
| | | 728 | | } |
| | | 729 | | } |
| | | 730 | | |
| | | 731 | | return NonAlphanumericRegex.Replace(builder.ToString(), " ").Trim(); |
| | | 732 | | } |
| | | 733 | | |
| | | 734 | | private sealed record Wm26LineupSeedRow( |
| | | 735 | | string TeamSlug, |
| | | 736 | | string Team, |
| | | 737 | | string DataCollectedAt, |
| | | 738 | | string Role, |
| | | 739 | | string Name, |
| | | 740 | | string TransfermarktNationalTeamId, |
| | | 741 | | string TransfermarktPlayerId, |
| | | 742 | | string Age, |
| | | 743 | | string Position, |
| | | 744 | | string MarketValueEur); |
| | | 745 | | |
| | | 746 | | private sealed record Wm26LineupOutputRow( |
| | | 747 | | string TeamSlug, |
| | | 748 | | string Team, |
| | | 749 | | string DataCollectedAt, |
| | | 750 | | string Role, |
| | | 751 | | string Name, |
| | | 752 | | string Age, |
| | | 753 | | string Position, |
| | | 754 | | string MarketValueEur); |
| | | 755 | | |
| | | 756 | | private sealed record Wm26LineupPlayerRecord( |
| | | 757 | | string PlayerId, |
| | | 758 | | string Name, |
| | | 759 | | object? DateOfBirth, |
| | | 760 | | string Position, |
| | | 761 | | object? MarketValueInEur, |
| | | 762 | | string CurrentNationalTeamId); |
| | | 763 | | |
| | 1 | 764 | | private sealed record Wm26GroupedLineupRows(Wm26LineupTeam Team, List<Wm26LineupOutputRow> Rows); |
| | | 765 | | } |
| | | 766 | | |
| | | 767 | | internal sealed class Wm26TransfermarktDuckDbProvider : IWm26TransfermarktDuckDbProvider |
| | | 768 | | { |
| | | 769 | | public const string DefaultDuckDbUrl = |
| | | 770 | | "https://pub-e682421888d945d684bcae8890b0ec20.r2.dev/data/transfermarkt-datasets.duckdb"; |
| | | 771 | | |
| | | 772 | | public const string DefaultCachePath = "data/wm26/lineups/private/data/transfermarkt-datasets.duckdb"; |
| | | 773 | | |
| | | 774 | | private readonly HttpClient _httpClient; |
| | | 775 | | private readonly ILogger<Wm26TransfermarktDuckDbProvider> _logger; |
| | | 776 | | |
| | | 777 | | public Wm26TransfermarktDuckDbProvider( |
| | | 778 | | HttpClient httpClient, |
| | | 779 | | ILogger<Wm26TransfermarktDuckDbProvider> logger) |
| | | 780 | | { |
| | | 781 | | _httpClient = httpClient; |
| | | 782 | | _logger = logger; |
| | | 783 | | } |
| | | 784 | | |
| | | 785 | | public async Task<string> GetDatabasePathAsync( |
| | | 786 | | string? configuredPath, |
| | | 787 | | CancellationToken cancellationToken = default) |
| | | 788 | | { |
| | | 789 | | if (!string.IsNullOrWhiteSpace(configuredPath)) |
| | | 790 | | { |
| | | 791 | | var path = Path.GetFullPath(configuredPath); |
| | | 792 | | if (!File.Exists(path)) |
| | | 793 | | { |
| | | 794 | | throw new FileNotFoundException($"DuckDB database not found: {path}", path); |
| | | 795 | | } |
| | | 796 | | |
| | | 797 | | return path; |
| | | 798 | | } |
| | | 799 | | |
| | | 800 | | var cachePath = Path.GetFullPath(DefaultCachePath); |
| | | 801 | | Directory.CreateDirectory(Path.GetDirectoryName(cachePath)!); |
| | | 802 | | var tempPath = $"{cachePath}.download"; |
| | | 803 | | |
| | | 804 | | _logger.LogInformation("Refreshing Transfermarkt DuckDB snapshot from {Url}", DefaultDuckDbUrl); |
| | | 805 | | using var response = await _httpClient.GetAsync(DefaultDuckDbUrl, HttpCompletionOption.ResponseHeadersRead, canc |
| | | 806 | | response.EnsureSuccessStatusCode(); |
| | | 807 | | |
| | | 808 | | await using (var source = await response.Content.ReadAsStreamAsync(cancellationToken)) |
| | | 809 | | await using (var target = new FileStream(tempPath, FileMode.Create, FileAccess.Write, FileShare.None)) |
| | | 810 | | { |
| | | 811 | | await source.CopyToAsync(target, cancellationToken); |
| | | 812 | | } |
| | | 813 | | |
| | | 814 | | File.Move(tempPath, cachePath, overwrite: true); |
| | | 815 | | return cachePath; |
| | | 816 | | } |
| | | 817 | | } |
| | | 818 | | |
| | | 819 | | public sealed record Wm26LineupSourceRequest( |
| | | 820 | | string SeedPath, |
| | | 821 | | string TeamsPath, |
| | | 822 | | string? DuckDbPath); |
| | | 823 | | |
| | | 824 | | public sealed record Wm26LineupCollection( |
| | | 825 | | string SeedPath, |
| | | 826 | | string TeamsPath, |
| | | 827 | | string DuckDbPath, |
| | | 828 | | int SeedRowCount, |
| | | 829 | | int EnrichedRowCount, |
| | | 830 | | IReadOnlyList<Wm26LineupDocument> ContextDocuments, |
| | | 831 | | string KpiContent, |
| | | 832 | | IReadOnlyList<Wm26LineupTeam> HeaderOnlyTeams, |
| | | 833 | | IReadOnlyList<Wm26LineupMissingData> MissingSourceData); |
| | | 834 | | |
| | | 835 | | public sealed record Wm26LineupDocument( |
| | | 836 | | string DocumentName, |
| | | 837 | | string Content, |
| | | 838 | | string TeamName, |
| | | 839 | | int PlayerCount, |
| | | 840 | | bool IsHeaderOnly); |
| | | 841 | | |
| | | 842 | | public sealed record Wm26LineupTeam(string Slug, string Name); |
| | | 843 | | |
| | | 844 | | public sealed record Wm26LineupMissingData( |
| | | 845 | | string TeamSlug, |
| | | 846 | | string TeamName, |
| | | 847 | | string PlayerName, |
| | | 848 | | IReadOnlyList<string> Fields); |