Coverage for apps / backend / worker / cli.py: 29%
508 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-05-08 08:37 +1200
« prev ^ index » next coverage.py v7.13.5, created at 2026-05-08 08:37 +1200
1"""Worker CLI for TipSharks ingestion and computation tasks."""
3import asyncio
4import json
5import os
6import subprocess
7import sys
8from calendar import monthrange
9from datetime import date, timedelta
11import click
12from rich.console import Console
13from rich.table import Table
15from packages.core.common.logging import get_logger, setup_logging
16from packages.core.common.settings import get_settings
17from packages.core.common.utils import parse_date
18from packages.core.storage.database import get_session
19from packages.core.storage.ingestion import IngestionService
20from packages.core.storage.models import Meeting, Race
21from packages.core.storage.repositories import StarterRepository, normalize_race_data
# Set up logging at import time, before any command runs.
setup_logging()
# Module-level logger named after this module.
logger = get_logger(__name__)
# Shared Rich console used by every command for user-facing output.
console = Console()
def _subtract_months(d: date, months: int) -> date:
    """Return the date that lies *months* calendar months before *d*.

    The day of month is preserved when the target month is long enough and
    is otherwise clamped to the last day of that month.

    Args:
        d: Starting date.
        months: Number of months to subtract (must be >= 0).

    Returns:
        Date *months* months before *d*.
    """
    # Use a zero-based month index so floor division yields the year carry
    # (negative when the subtraction crosses a year boundary).
    year_shift, month_index = divmod(d.month - 1 - months, 12)
    year = d.year + year_shift
    month = month_index + 1
    _, days_in_month = monthrange(year, month)
    return date(year, month, min(d.day, days_in_month))
@click.group()
def cli() -> None:
    """TipSharks worker CLI.

    Commands for ingesting TAB racing data and computing ratings.
    """
    # Intentionally empty: this is only the Click group that the commands
    # below register themselves on via @cli.command().
@cli.command("normalize-races")
@click.option("--from", "date_from", required=False, help="Start date (YYYY-MM-DD)")
@click.option("--to", "date_to", required=False, help="End date (YYYY-MM-DD)")
@click.option(
    "--commit-every",
    default=200,
    show_default=True,
    # Reject 0 (ZeroDivisionError in the modulo check below) and negative
    # intervals at the CLI boundary.
    type=click.IntRange(min=1),
    help="Commit interval for database updates",
)
def normalize_races(
    date_from: str | None, date_to: str | None, commit_every: int
) -> None:
    """Normalize race/starter JSON structure and backfill starters from raw JSON."""
    # Parameter notes (kept as comments because Click shows the docstring as
    # the command's --help text):
    #   date_from / date_to: optional inclusive bounds on Meeting.meeting_date;
    #       an omitted bound leaves that side of the range open.
    #   commit_every: number of races processed between intermediate commits;
    #       always >= 1 thanks to the IntRange option type.
    start_date = parse_date(date_from) if date_from else None
    end_date = parse_date(date_to) if date_to else None

    with get_session() as session:
        query = session.query(Race).join(Meeting)
        if start_date:
            query = query.filter(Meeting.meeting_date >= start_date)
        if end_date:
            query = query.filter(Meeting.meeting_date <= end_date)

        # Only primary keys are fetched up front; each race is re-loaded by id
        # inside the loop, in meeting-date / race-number order.
        race_ids = [
            race_id
            for (race_id,) in query.with_entities(Race.id)
            .order_by(Meeting.meeting_date, Race.race_number)
            .all()
        ]
        total = len(race_ids)
        console.print(f"[bold]Normalizing {total} races[/bold]")

        updated_races = 0
        upserted_starters = 0

        for idx, race_id in enumerate(race_ids, 1):
            race = session.query(Race).filter(Race.id == race_id).one()
            normalized = normalize_race_data(race.raw_json or {})
            # Write back only when normalization actually changed something.
            if normalized != race.raw_json:
                race.raw_json = normalized
                updated_races += 1

            # Starters are read from the nested "raw_json" mapping of the
            # normalized payload; anything that is not a list of dicts is
            # skipped.
            starters = (normalized.get("raw_json") or {}).get("starters")
            if isinstance(starters, list):
                for starter in starters:
                    if not isinstance(starter, dict):
                        continue
                    placing = starter.get("placing")
                    StarterRepository.upsert(session, race.id, starter, placing=placing)
                    upserted_starters += 1

            if idx % commit_every == 0:
                session.commit()
                console.print(f"[dim]Processed {idx}/{total} races[/dim]")

        # Flush whatever remains after the last full batch.
        session.commit()

        console.print(
            f"[green]✓ Updated races: {updated_races}, upserted starters: {upserted_starters}[/green]"
        )
@cli.command()
@click.option("--from", "date_from", required=False, help="Start date (YYYY-MM-DD)")
@click.option("--to", "date_to", required=False, help="End date (YYYY-MM-DD)")
@click.option(
    "--date", "single_date", required=False, help="Single date to ingest (YYYY-MM-DD)"
)
@click.option(
    "--category",
    "category",
    required=False,
    type=click.Choice(["T", "H", "G"], case_sensitive=False),
    help="Racing category: T (Thoroughbred), H (Harness), G (Greyhound)",
)
def ingest(date_from: str, date_to: str, single_date: str, category: str):
    """Ingest meetings and races from TAB API.

    Examples:
        ingest --from 2024-01-01 --to 2024-01-31
        ingest --date 2024-01-15
        ingest --date 2024-01-15 --category H
    """
    # --date is mutually exclusive with the --from/--to pair.
    if single_date and (date_from or date_to):
        console.print("[red]Error: Cannot use --date with --from/--to[/red]")
        sys.exit(1)

    if single_date:
        # A single day is ingested as a one-day range.
        start_date = end_date = parse_date(single_date)
    elif date_from and date_to:
        start_date, end_date = parse_date(date_from), parse_date(date_to)
    else:
        # Neither --date nor a complete --from/--to pair was supplied.
        console.print("[red]Error: Must specify either --date or --from/--to[/red]")
        sys.exit(1)

    configured = get_settings()
    if category:
        effective_category = category.upper()
    else:
        effective_category = configured.tab.default_category

    console.print(
        f"\n[bold]Ingesting {effective_category} meetings from {start_date} to {end_date}[/bold]\n"
    )

    try:
        with get_session() as session:
            service = IngestionService(session)
            ingest_job = service.ingest_date_range(
                start_date, end_date, category=effective_category
            )
            meetings, races, starters = asyncio.run(ingest_job)

            # Summarise what was ingested plus the service's own error counter.
            error_count = service.stats["errors"]
            results = Table(title="Ingestion Results")
            results.add_column("Entity", style="cyan")
            results.add_column("Count", style="green", justify="right")
            for label, count in (
                ("Meetings", meetings),
                ("Races", races),
                ("Starters", starters),
            ):
                results.add_row(label, str(count))
            results.add_row(
                "Errors",
                str(error_count),
                style="red" if error_count > 0 else "green",
            )
            console.print(results)

            if error_count > 0:
                console.print(
                    f"\n[yellow]⚠ {error_count} errors occurred during ingestion[/yellow]"
                )
            else:
                console.print("\n[green]✓ Ingestion completed successfully[/green]")

    except Exception as e:
        # Top-level boundary: log with traceback, tell the user, exit non-zero.
        logger.error(f"Ingestion failed: {e}", exc_info=True)
        console.print(f"\n[red]✗ Ingestion failed: {e}[/red]")
        sys.exit(1)
@cli.command()
@click.option("--from", "date_from", required=True, help="Start date (YYYY-MM-DD)")
@click.option("--to", "date_to", required=True, help="End date (YYYY-MM-DD)")
@click.option("--clear", is_flag=True, help="Clear existing ratings before recompute")
def recompute(date_from: str, date_to: str, clear: bool):
    """Recompute ratings from stored race results.

    This will deterministically recompute all ratings in the date range.

    Examples:
        recompute --from 2024-01-01 --to 2024-12-31
        recompute --from 2024-01-01 --to 2024-12-31 --clear
    """
    range_start, range_end = parse_date(date_from), parse_date(date_to)

    console.print(
        f"\n[bold]Recomputing ratings from {range_start} to {range_end}[/bold]\n"
    )

    if clear:
        # Only announces the step; the clearing itself is requested through
        # clear_existing below.
        console.print("[yellow]Clearing existing ratings...[/yellow]")

    try:
        # Imported lazily to avoid a circular dependency at module import time.
        from packages.core.ratings.recompute import recompute_ratings

        with get_session() as session:
            created = recompute_ratings(
                session, range_start, range_end, clear_existing=clear
            )

            console.print(
                f"\n[green]✓ Recompute complete: {created} rating snapshots created[/green]"
            )

    except Exception as e:
        # Top-level boundary: log with traceback, tell the user, exit non-zero.
        logger.error(f"Recompute failed: {e}", exc_info=True)
        console.print(f"\n[red]✗ Recompute failed: {e}[/red]")
        sys.exit(1)
# Counters tracked by the HRNZ scrape commands.
_SCRAPE_STAT_KEYS = (
    "meetings",
    "races",
    "starters",
    "horses",
    "drivers",
    "trainers",
    "errors",
)


def _is_outside_date_range(scraped: dict, start_date: date, end_date: date) -> bool:
    """Return True when the scraped meeting's date lies outside the range.

    Args:
        scraped: Scraped meeting payload; its optional "date" entry is parsed
            with ``date.fromisoformat``.
        start_date: Inclusive lower bound.
        end_date: Inclusive upper bound.

    Returns:
        False when the payload has no date (such meetings are never filtered
        out), otherwise whether the date falls outside the inclusive range.

    Raises:
        ValueError: If the "date" entry is not an ISO-formatted date string.
    """
    meeting_date_str = scraped.get("date")
    if not meeting_date_str:
        return False
    meeting_date = date.fromisoformat(meeting_date_str)
    return not (start_date <= meeting_date <= end_date)


def _import_scraped_meeting(scraped: dict, overwrite: bool) -> dict[str, int]:
    """Map one scraped HRNZ meeting and store it in a single transaction.

    Args:
        scraped: Scraped meeting payload as returned by the scraper.
        overwrite: When True, the existing meeting row with the same id is
            deleted before the import.

    Returns:
        Number of meetings, races, starters, horses, drivers and trainers
        upserted. Nothing is returned (the exception propagates) when any step
        fails, so callers only count meetings that were actually committed.
    """
    from packages.core.storage.repositories import (
        DriverRepository,
        HorseRepository,
        MeetingRepository,
        RaceRepository,
        TrainerRepository,
    )
    from packages.hrnz_scraper.mapper import HRNZDataMapper

    mapper = HRNZDataMapper()
    meeting = mapper.map_meeting(scraped)
    entities = mapper.map_entities(scraped)
    meeting_id = meeting["meeting"]

    with get_session() as session:
        if overwrite:
            # Deliberately NOT committed on its own: the delete stays in the
            # same transaction as the re-import, so a failure further down
            # cannot leave the meeting deleted without its replacement.
            # NOTE(review): relies on get_session() discarding uncommitted
            # work when an exception escapes the block — confirm.
            session.query(Meeting).filter(Meeting.id == meeting_id).delete(
                synchronize_session=False
            )

        MeetingRepository.upsert(session, meeting)

        horse_count = 0
        for horse in entities["horses"]:
            HorseRepository.upsert(
                session, horse["id"], horse["name"], horse.get("raw_json")
            )
            horse_count += 1

        driver_count = 0
        for driver in entities["drivers"]:
            DriverRepository.upsert(session, driver["name"], driver_id=driver.get("id"))
            driver_count += 1

        trainer_count = 0
        for trainer in entities["trainers"]:
            TrainerRepository.upsert(
                session, trainer["name"], trainer_id=trainer.get("id")
            )
            trainer_count += 1

        # Starters are mapped against database race ids, so races go first.
        races = mapper.map_races(scraped, meeting_id)
        race_id_map = {}
        for race in races:
            race_obj = RaceRepository.upsert(session, meeting_id, race)
            race_id_map[race["race_number"]] = race_obj.id

        starters = mapper.map_starters(scraped, race_id_map)
        for starter in starters:
            StarterRepository.upsert(
                session, starter["race_id"], starter, starter.get("placing")
            )

        session.commit()

    return {
        "meetings": 1,
        "races": len(races),
        "starters": len(starters),
        "horses": horse_count,
        "drivers": driver_count,
        "trainers": trainer_count,
    }


def _print_scrape_summary(title: str, stats: dict[str, int]) -> None:
    """Print the results table and closing hints for a scrape run.

    Args:
        title: Title of the results table.
        stats: Counters keyed by the names in ``_SCRAPE_STAT_KEYS``.
    """
    table = Table(title=title)
    table.add_column("Entity", style="cyan")
    table.add_column("Count", style="green", justify="right")

    for label, key in (
        ("Meetings", "meetings"),
        ("Races", "races"),
        ("Starters", "starters"),
        ("Horses", "horses"),
        ("Drivers", "drivers"),
        ("Trainers", "trainers"),
    ):
        table.add_row(label, str(stats[key]))
    error_count = stats["errors"]
    table.add_row(
        "Errors",
        str(error_count),
        style="red" if error_count > 0 else "green",
    )

    console.print("\n")
    console.print(table)

    if error_count > 0:
        console.print(
            f"\n[yellow]⚠ {error_count} errors occurred during scraping[/yellow]"
        )
        console.print(
            "[yellow]Tip: Run 'recompute' to compute ratings for imported data[/yellow]"
        )
    else:
        console.print("\n[green]✓ Scraping completed successfully[/green]")
        console.print(
            "[green]Tip: Run 'recompute' to compute ratings for imported data[/green]"
        )


@cli.command()
@click.option(
    "--urls",
    "urls_file",
    required=True,
    help="File containing HRNZ result URLs (one per line)",
)
@click.option(
    "--overwrite/--no-overwrite",
    default=False,
    show_default=True,
    help="Overwrite existing meetings/races/starters for matching HRNZ meeting IDs",
)
@click.option(
    "--from",
    "date_from",
    required=False,
    help="Filter: Start date (YYYY-MM-DD)",
)
@click.option(
    "--to",
    "date_to",
    required=False,
    help="Filter: End date (YYYY-MM-DD)",
)
def scrape_hrnz(urls_file: str, overwrite: bool, date_from: str, date_to: str):
    """Import historical data from HRNZ results archive.

    Scrapes HRNZ InfoHorse results pages and imports into database.

    WARNING: Web scraping should only be used if official API access is unavailable.
    Always check HRNZ's Terms of Service before scraping.

    Examples:
        scrape-hrnz --urls hrnz_urls.txt
        scrape-hrnz --urls hrnz_urls.txt --from 2024-01-01 --to 2024-12-31

    URL File Format:
        102402rs.htm
        102502rs.htm
        102602rs.htm
    """
    from packages.hrnz_scraper import HRNZScraper

    # Omitted filters fall back to a wide fixed window.
    # NOTE(review): the 2030-12-31 default silently skips later meetings;
    # consider date.max once the printed filter line may change.
    start_date = parse_date(date_from) if date_from else date(2000, 1, 1)
    end_date = parse_date(date_to) if date_to else date(2030, 12, 31)

    # One URL per line; blank lines are ignored.
    try:
        with open(urls_file, encoding="utf-8") as f:
            urls = [line.strip() for line in f if line.strip()]
    except FileNotFoundError:
        console.print(f"[red]Error: File not found: {urls_file}[/red]")
        sys.exit(1)
    except (OSError, UnicodeDecodeError) as e:
        # Unreadable or undecodable file: report it instead of a traceback.
        console.print(f"[red]Error: Cannot read {urls_file}: {e}[/red]")
        sys.exit(1)

    console.print(f"\n[bold]Scraping {len(urls)} HRNZ meetings[/bold]")
    console.print(f"Date filter: {start_date} to {end_date}\n")
    console.print(
        "[yellow]⚠ Using web scraper - please ensure compliance with HRNZ ToS[/yellow]\n"
    )

    stats = dict.fromkeys(_SCRAPE_STAT_KEYS, 0)

    try:

        async def scrape_and_import() -> None:
            async with HRNZScraper() as scraper:
                for idx, url in enumerate(urls, 1):
                    # A failing meeting is counted and skipped so that one bad
                    # page does not abort the whole run.
                    try:
                        console.print(f"[{idx}/{len(urls)}] Scraping {url}...")
                        scraped = await scraper.get_meeting_results(url)

                        if _is_outside_date_range(scraped, start_date, end_date):
                            console.print(" [dim]Skipped (outside date range)[/dim]")
                            continue

                        imported = _import_scraped_meeting(scraped, overwrite)
                        # Counted only now, i.e. after a successful commit.
                        for key, count in imported.items():
                            stats[key] += count

                        console.print(
                            f" [green]✓ Imported: {imported['races']} races, "
                            f"{imported['starters']} starters[/green]"
                        )

                    except Exception as e:
                        stats["errors"] += 1
                        logger.error(f"Failed to scrape {url}: {e}")
                        console.print(f" [red]✗ Error: {e}[/red]")
                        continue

        asyncio.run(scrape_and_import())

        _print_scrape_summary("HRNZ Scraping Results", stats)

    except Exception as e:
        # Top-level boundary: log with traceback, tell the user, exit non-zero.
        logger.error(f"Scraping failed: {e}", exc_info=True)
        console.print(f"\n[red]✗ Scraping failed: {e}[/red]")
        sys.exit(1)


@cli.command()
@click.option(
    "--from",
    "date_from",
    required=True,
    help="Start date (YYYY-MM-DD)",
)
@click.option(
    "--to",
    "date_to",
    required=True,
    help="End date (YYYY-MM-DD)",
)
@click.option(
    "--race-type",
    "race_day_type",
    type=click.Choice(["OfficialRaces", "TrialRaces", "WorkoutRaces"]),
    default="OfficialRaces",
    show_default=True,
    help="Race day type filter from the results enquiry",
)
@click.option(
    "--club",
    "club_no",
    default="",
    show_default=True,
    help="Optional club number filter (leave blank for all clubs)",
)
@click.option(
    "--overwrite/--no-overwrite",
    default=False,
    show_default=True,
    help="Overwrite existing meetings/races/starters for matching HRNZ meeting IDs",
)
def scrape_hrnz_enquiry(
    date_from: str, date_to: str, race_day_type: str, club_no: str, overwrite: bool
):
    """Scrape HRNZ historical results via the Results Enquiry page."""
    from packages.hrnz_scraper import HRNZHistoricalResultsScraper

    start_date = parse_date(date_from)
    end_date = parse_date(date_to)

    console.print(
        f"\n[bold]Scraping HRNZ results from {start_date} to {end_date}[/bold]\n"
    )
    console.print(
        "[yellow]⚠ Using web scraper - please ensure compliance with HRNZ ToS[/yellow]\n"
    )

    stats = dict.fromkeys(_SCRAPE_STAT_KEYS, 0)

    try:

        async def scrape_and_import() -> None:
            async with HRNZHistoricalResultsScraper() as scraper:
                # The meeting list is materialised first so progress can be
                # reported as "idx/total".
                meetings = [
                    meeting
                    async for meeting in scraper.iter_meetings(
                        start_date,
                        end_date,
                        race_day_type=race_day_type,
                        club_no=club_no,
                    )
                ]
                for idx, meeting_meta in enumerate(meetings, 1):
                    # A failing meeting is counted and skipped so that one bad
                    # page does not abort the whole run.
                    try:
                        url = meeting_meta["results_url"]
                        console.print(f"[{idx}/{len(meetings)}] Scraping {url}...")

                        scraped = await scraper.get_meeting_results(url, meeting_meta)

                        # Fall back to the enquiry listing's date when the
                        # results page did not provide one.
                        if not scraped.get("date") and meeting_meta.get("meeting_date"):
                            scraped["date"] = meeting_meta["meeting_date"].isoformat()

                        if not scraped.get("races"):
                            console.print(" [dim]Skipped (no races found)[/dim]")
                            continue

                        if _is_outside_date_range(scraped, start_date, end_date):
                            console.print(" [dim]Skipped (outside date range)[/dim]")
                            continue

                        imported = _import_scraped_meeting(scraped, overwrite)
                        # Counted only now, i.e. after a successful commit.
                        for key, count in imported.items():
                            stats[key] += count

                        console.print(
                            f" [green]✓ Imported: {imported['races']} races, "
                            f"{imported['starters']} starters[/green]"
                        )
                    except Exception as e:
                        stats["errors"] += 1
                        logger.error(
                            f"Failed to scrape {meeting_meta.get('results_url')}: {e}"
                        )
                        console.print(f" [red]✗ Error: {e}[/red]")
                        continue

        asyncio.run(scrape_and_import())

        _print_scrape_summary("HRNZ Results Enquiry Scraping Results", stats)

    except Exception as e:
        # Top-level boundary: log with traceback, tell the user, exit non-zero.
        logger.error(f"Scraping failed: {e}", exc_info=True)
        console.print(f"\n[red]✗ Scraping failed: {e}[/red]")
        sys.exit(1)
@cli.command()
def refresh_club_codes():
    """Refresh HRNZ club codes by scraping the HRNZ results pages.

    Fetches the latest list of club codes from HRNZ, compares them with
    the hardcoded ``HRNZ_ALL_CLUB_CODES`` list, and reports any
    differences (new, missing, or unmatched codes).
    """
    # Aliased on import so the helper does not share this command's name.
    from packages.hrnz_scraper.club_refresh import (
        generate_diff_report,
        refresh_club_codes as fetch_club_codes,
    )

    console.print("\n[bold cyan]Refreshing HRNZ club codes...[/bold cyan]\n")

    try:
        result = fetch_club_codes()
        console.print(generate_diff_report(result))

        new_codes = result.get("new", [])
        # NOTE(review): assumes "missing" lists hardcoded codes that were NOT
        # found online (as the messages below imply) — confirm against
        # packages.hrnz_scraper.club_refresh.
        missing_codes = result.get("missing", [])

        if result.get("error"):
            console.print(f"\n[yellow]⚠ {result['error']}[/yellow]")

        if new_codes:
            console.print(
                "\n[yellow]⚠ New club codes detected. "
                "Update HRNZ_ALL_CLUB_CODES in packages/core/common/settings.py "
                "to include these codes.[/yellow]"
            )
            console.print(f"\nAdd to settings.py:\n {new_codes!r}")

        if missing_codes:
            # Previously this case printed "All hardcoded club codes were
            # found online", contradicting the non-empty missing list.
            console.print(
                f"\n[yellow]⚠ {len(missing_codes)} hardcoded club code(s) "
                f"were not found online: {missing_codes!r}[/yellow]"
            )
        elif new_codes:
            # Nothing is missing; the hardcoded list is merely incomplete.
            console.print(
                "\n[green]✓ All hardcoded club codes were found online.[/green]"
            )
        else:
            console.print(
                "\n[green]✓ Hardcoded club codes are up to date "
                f"(all {len(result.get('fetched', []))} codes confirmed).[/green]"
            )

    except Exception as e:
        # Top-level boundary: log with traceback, tell the user, exit non-zero.
        logger.error(f"Club code refresh failed: {e}", exc_info=True)
        console.print(f"\n[red]✗ Club code refresh failed: {e}[/red]")
        sys.exit(1)
@cli.command()
def info():
    """Display configuration information."""
    settings = get_settings()

    def tick(enabled) -> str:
        """Render a truthy/falsy setting as a check mark or a cross."""
        return "✓" if enabled else "✗"

    rating = settings.rating
    rows = [
        # TAB API settings
        ("TAB Base URL", settings.tab.base_url),
        ("TAB Default Category", settings.tab.default_category),
        ("TAB Default Country", settings.tab.default_country),
        ("TAB Mock Mode", tick(settings.tab.mock_mode)),
        # Database: only the text after the last "@" is shown (the whole URL
        # when it contains no "@") — presumably to keep credentials out of
        # the output.
        ("Database URL", settings.database.url.rpartition("@")[2]),
        ("Log Level", settings.logging.level),
        # Rating settings
        ("Elo Scale (C)", str(rating.elo_scale_c)),
        ("Elo K", str(rating.elo_k_base)),
        ("Driver Weight (α)", str(rating.driver_weight_alpha)),
        ("Trainer Weight (β)", str(rating.trainer_weight_beta)),
        ("Enable Driver", tick(rating.enable_driver)),
        ("Enable Trainer", tick(rating.enable_trainer)),
        ("Enable Adjustments", tick(rating.enable_adjustments)),
    ]

    table = Table(title="TipSharks Configuration")
    table.add_column("Setting", style="cyan")
    table.add_column("Value", style="yellow")
    for label, value in rows:
        table.add_row(label, value)

    console.print(table)
@cli.command()
@click.option("--from", "date_from", required=True, help="Start date (YYYY-MM-DD)")
@click.option("--to", "date_to", required=True, help="End date (YYYY-MM-DD)")
@click.option(
    "--out", "output_file", required=False, help="Output JSON file path (optional)"
)
def data_quality(date_from, date_to, output_file):
    """Generate data quality report for date range.

    Validates data completeness, accuracy, and identifies issues.
    """
    from packages.core.common.data_quality import (
        DataQualityValidator,
        check_data_freshness,
    )

    # At most this many issues are listed per severity table.
    max_listed = 20

    console.print("[bold cyan]Generating Data Quality Report...[/bold cyan]")

    start_date = parse_date(date_from)
    end_date = parse_date(date_to)

    console.print(f"Date range: {start_date} to {end_date}")

    with get_session() as session:
        report = DataQualityValidator(session).generate_report(start_date, end_date)

        # A freshness problem, if any, is appended as one more issue.
        stale = check_data_freshness(session)
        if stale:
            report.issues.append(stale)

        # --- Summary table ---
        info_count = len([i for i in report.issues if i.severity == "info"])
        summary = Table(title="Data Quality Summary")
        summary.add_column("Metric", style="cyan")
        summary.add_column("Value", style="yellow")
        for label, value in (
            ("Total Meetings", str(report.total_meetings)),
            ("Total Races", str(report.total_races)),
            ("Total Starters", str(report.total_starters)),
            ("Errors", f"[red]{report.error_count}[/red]"),
            ("Warnings", f"[yellow]{report.warning_count}[/yellow]"),
            ("Info", str(info_count)),
        ):
            summary.add_row(label, value)
        console.print(summary)

        # --- Metrics table (only when the report carries metrics) ---
        if report.metrics:
            metrics = Table(title="Data Metrics")
            metrics.add_column("Metric", style="cyan")
            metrics.add_column("Value", style="yellow")
            for key, value in report.metrics.items():
                metrics.add_row(key.replace("_", " ").title(), str(value))
            console.print(metrics)

        # --- One table per severity, most severe first ---
        if report.issues:
            console.print("\n[bold]Issues by Category:[/bold]")

            for severity in ("error", "warning", "info"):
                matching = [i for i in report.issues if i.severity == severity]
                if not matching:
                    continue

                listing = Table(title=f"{severity.upper()} Issues")
                listing.add_column("Category", style="cyan")
                listing.add_column("Message", style="yellow")
                listing.add_column("Race ID", style="magenta")
                for issue in matching[:max_listed]:
                    race_label = str(issue.race_id) if issue.race_id else "-"
                    listing.add_row(issue.category, issue.message, race_label)
                console.print(listing)

                hidden = len(matching) - max_listed
                if hidden > 0:
                    console.print(
                        f"[dim]... and {hidden} more {severity} issues[/dim]"
                    )

        # --- Optional JSON export of the full report ---
        if output_file:
            serialized_issues = []
            for issue in report.issues:
                serialized_issues.append(
                    {
                        "severity": issue.severity,
                        "category": issue.category,
                        "message": issue.message,
                        "race_id": issue.race_id,
                        "meeting_id": issue.meeting_id,
                        "starter_id": issue.starter_id,
                        "details": issue.details,
                    }
                )
            report_data = {
                "start_date": report.start_date.isoformat(),
                "end_date": report.end_date.isoformat(),
                "generated_at": report.generated_at.isoformat(),
                "summary": {
                    "total_meetings": report.total_meetings,
                    "total_races": report.total_races,
                    "total_starters": report.total_starters,
                    "error_count": report.error_count,
                    "warning_count": report.warning_count,
                },
                "metrics": report.metrics,
                "issues": serialized_issues,
            }

            with open(output_file, "w") as f:
                json.dump(report_data, f, indent=2)

            console.print(f"\n[green]Report saved to {output_file}[/green]")

        # The process exit status mirrors the result: 1 on errors, else 0.
        if not report.has_errors:
            console.print("\n[bold green]✓ Data quality check PASSED[/bold green]")
            sys.exit(0)
        console.print("\n[bold red]⚠ Data quality check FAILED[/bold red]")
        sys.exit(1)
def _iter_week_windows(start: date, end: date):
    """Yield consecutive inclusive ``(window_start, window_end)`` date pairs.

    Each window spans at most seven days and the last one is truncated at
    *end*. Nothing is yielded when *start* is after *end*.
    """
    current = start
    while current <= end:
        window_end = min(current + timedelta(days=6), end)
        yield current, window_end
        current = window_end + timedelta(days=1)


def _run_accuracy_evaluation(start_date: date, end_date: date):
    """Run ``scripts/evaluate_accuracy.py`` in a subprocess and parse its JSON.

    Args:
        start_date: Value passed to the script's ``--from`` option.
        end_date: Value passed to the script's ``--to`` option.

    Returns:
        The object decoded from the script's standard output.

    Raises:
        RuntimeError: If the script exits non-zero; the message carries the
            script's stderr, which would otherwise be lost because the output
            is captured.
        subprocess.TimeoutExpired: If the script runs longer than 600 seconds.
        json.JSONDecodeError: If the script's stdout is not valid JSON.
    """
    # The script is located three directories above this file, in "scripts".
    script_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        "..",
        "..",
        "..",
        "scripts",
        "evaluate_accuracy.py",
    )
    try:
        completed = subprocess.run(
            [
                sys.executable,
                script_path,
                "--from",
                start_date.isoformat(),
                "--to",
                end_date.isoformat(),
                "--json",
            ],
            capture_output=True,
            text=True,
            check=True,
            timeout=600,
        )
    except subprocess.CalledProcessError as exc:
        stderr_text = (exc.stderr or "").strip() or "no stderr output"
        raise RuntimeError(
            f"evaluate_accuracy.py exited with status {exc.returncode}: {stderr_text}"
        ) from exc
    return json.loads(completed.stdout)


@cli.command()
@click.option(
    "--months",
    default=12,
    type=click.IntRange(1, 24),
    show_default=True,
    help="Number of months to backfill (max 24)",
)
@click.option(
    "--category",
    type=click.Choice(["T", "H", "G"], case_sensitive=False),
    help="Racing category: T (Thoroughbred), H (Harness), G (Greyhound)",
)
@click.option(
    "--source",
    type=click.Choice(["tab", "ingest"], case_sensitive=False),
    default="tab",
    show_default=True,
    help="Data source: tab (TAB API) or ingest (tab-api-ingest service)",
)
@click.option(
    "--clear",
    is_flag=True,
    help="Clear existing ratings before recompute",
)
@click.option(
    "--learn-adjustments",
    is_flag=True,
    help="Learn barrier/handicap adjustments during recompute",
)
@click.option(
    "--skip-eval",
    is_flag=True,
    help="Skip accuracy evaluation after recompute",
)
def backfill(months, category, source, clear, learn_adjustments, skip_eval):
    """Backfill historical data and recompute ratings.

    Ingests data week by week from (today - months) to today,
    then recomputes all ratings and evaluates prediction accuracy.

    Examples:

        backfill --months 12 --category H --source tab

        backfill --months 6 --category T --clear --learn-adjustments

        backfill --months 3 --source ingest --skip-eval
    """
    # Calculate date range
    today = date.today()
    start_date = _subtract_months(today, months)

    settings = get_settings()
    effective_category = category.upper() if category else settings.tab.default_category

    console.print(
        f"\n[bold]Backfilling {effective_category} data "
        f"from {start_date} to {today}[/bold]"
    )
    console.print(
        f"Source: {source}, Months: {months}, "
        f"Clear: {clear}, Learn adjustments: {learn_adjustments}\n"
    )

    # Track totals
    totals: dict[str, int] = {
        "meetings": 0,
        "races": 0,
        "starters": 0,
        "errors": 0,
    }
    failed_weeks: list[tuple[date, date, str]] = []

    # ── Week-by-week ingestion ───────────────────────────────────
    # Each week gets its own session and event loop run, so a failing week is
    # recorded and the remaining weeks are still attempted.
    week_count = 0
    for current, week_end in _iter_week_windows(start_date, today):
        week_count += 1
        console.print(
            f"[bold cyan]Week {week_count}: {current} to {week_end}[/bold cyan]"
        )

        try:
            with get_session() as session:
                service = IngestionService(session, source=source)
                meetings, races, starters = asyncio.run(
                    service.ingest_date_range(
                        current, week_end, category=effective_category
                    )
                )
                totals["meetings"] += meetings
                totals["races"] += races
                totals["starters"] += starters
                totals["errors"] += service.stats["errors"]

                console.print(
                    f" [green]✓ {meetings} meetings, "
                    f"{races} races, {starters} starters[/green]"
                )
        except Exception as e:
            logger.error(f"Week {current} to {week_end} failed: {e}", exc_info=True)
            console.print(f" [red]✗ Week failed: {e}[/red]")
            failed_weeks.append((current, week_end, str(e)))
            totals["errors"] += 1

    # ── Ingestion summary ────────────────────────────────────────
    console.print("\n[bold]Ingestion complete:[/bold]")
    console.print(f" Weeks processed: {week_count}")
    console.print(f" Total meetings: {totals['meetings']}")
    console.print(f" Total races: {totals['races']}")
    console.print(f" Total starters: {totals['starters']}")

    if failed_weeks:
        console.print(f"\n[yellow]⚠ {len(failed_weeks)} week(s) had errors:[/yellow]")
        for fw_start, fw_end, fw_err in failed_weeks:
            console.print(f" [yellow]{fw_start} to {fw_end}: {fw_err}[/yellow]")

    # ── Recompute ratings ────────────────────────────────────────
    snapshot_count = 0
    if totals["meetings"] == 0:
        console.print("\n[yellow]No data ingested — skipping recompute.[/yellow]")
    else:
        console.print(
            f"\n[bold]Recomputing ratings from {start_date} to {today}...[/bold]"
        )
        try:
            from packages.core.ratings.recompute import recompute_ratings

            with get_session() as session:
                snapshot_count = recompute_ratings(
                    session,
                    start_date,
                    today,
                    clear_existing=clear,
                    learn_adjustments=learn_adjustments,
                )
            console.print(f"[green]✓ {snapshot_count} rating snapshots created[/green]")
        except Exception as e:
            # A failed recompute aborts the command with a non-zero status.
            logger.error(f"Recompute failed: {e}", exc_info=True)
            console.print(f"\n[red]✗ Recompute failed: {e}[/red]")
            sys.exit(1)

    # ── Evaluation ───────────────────────────────────────────────
    # Best effort: a failing evaluation is reported but does not fail the
    # backfill.
    brier: float | None = None
    winner_acc: float | None = None
    if not skip_eval and totals["meetings"] > 0:
        console.print("\n[bold]Evaluating prediction accuracy...[/bold]")
        try:
            data = _run_accuracy_evaluation(start_date, today)
            if "all" in data and data["all"]:
                brier = data["all"]["brier"]
                winner_acc = data["all"]["winner_acc"]
                console.print(f" [green]Winner accuracy: {winner_acc:.4f}[/green]")
                console.print(f" [green]Brier score: {brier:.4f}[/green]")
            else:
                console.print(" [yellow]Evaluation returned no results[/yellow]")
        except Exception as e:
            logger.error(f"Evaluation failed: {e}", exc_info=True)
            console.print(f" [yellow]⚠ Evaluation skipped due to error: {e}[/yellow]")

    # ── Final summary table ──────────────────────────────────────
    table = Table(title="Backfill Summary")
    table.add_column("Metric", style="cyan")
    table.add_column("Value", style="green", justify="right")

    table.add_row("Weeks processed", str(week_count))
    table.add_row("Meetings ingested", str(totals["meetings"]))
    table.add_row("Races ingested", str(totals["races"]))
    table.add_row("Starters ingested", str(totals["starters"]))
    table.add_row("Rating snapshots", str(snapshot_count))
    if brier is not None:
        table.add_row("Brier score", f"{brier:.4f}")
    if winner_acc is not None:
        table.add_row("Winner accuracy", f"{winner_acc:.4f}")
    table.add_row(
        "Errors",
        str(totals["errors"]),
        style="red" if totals["errors"] > 0 else "green",
    )
    if failed_weeks:
        table.add_row("Failed weeks", str(len(failed_weeks)), style="red")

    console.print("\n")
    console.print(table)

    if totals["errors"] > 0:
        console.print(
            f"\n[yellow]⚠ Completed with {totals['errors']} error(s)[/yellow]"
        )
    else:
        console.print("\n[green]✓ Backfill completed successfully[/green]")
# Invoke the Click group when this module is executed as a script.
if __name__ == "__main__":
    cli()