Coverage for packages / hrnz_scraper / club_refresh.py: 0%
186 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-05-08 08:37 +1200
« prev ^ index » next coverage.py v7.13.5, created at 2026-05-08 08:37 +1200
1"""HRNZ club code auto-refresh module.
3Fetches the HRNZ results index page to discover all active club codes,
4then compares them against the hardcoded HRNZ_ALL_CLUB_CODES list.
6Usage:
7 from packages.hrnz_scraper.club_refresh import refresh_club_codes
8 result = refresh_club_codes()
9 # result = {"fetched": [...], "new": [...], "missing": [...], "unmatched": [...], "source": "hrnz", "error": None}
10"""
12from __future__ import annotations
14import json
15import re
16from pathlib import Path
17from typing import Any
19import httpx
21from packages.core.common.logging import get_logger
# Module-level logger, named after this module.
logger = get_logger(__name__)

# HRNZ URL patterns.
# Base URL of the infohorse results directory; it is fetched directly as an
# index page and also used as the prefix for per-meeting result page URLs.
INFOHORSE_BASE = "https://infohorse.hrnz.co.nz/datahrs/results/"
# Results index page on harness.org.nz, used as a second discovery source.
HARNESS_ORG_NZ = "https://www.harness.org.nz/racing/results/"

# Fallback cache path (relative to project root): directory name and file name
# of the JSON club-code cache.
# NOTE(review): the file name is spelled "hrzn" rather than "hrnz" - possibly a
# typo, but renaming it would orphan any existing cache file; confirm first.
DEFAULT_CACHE_DIR = "data"
DEFAULT_CACHE_FILE = "hrzn_club_codes.json"
def _get_cache_path() -> Path:
    """Locate (and create the directory for) the club code cache file.

    Climbs at most six directory levels above this file, stopping at the
    first directory that contains ``pyproject.toml``. If none is found, the
    sixth ancestor is used as the root.

    Returns:
        ``<root>/DEFAULT_CACHE_DIR/DEFAULT_CACHE_FILE``. The cache directory
        is created as a side effect if it does not already exist.
    """
    project_root = Path(__file__).resolve()
    levels_left = 6
    while levels_left:
        # Step one level up; the first step lands on this file's directory.
        project_root = project_root.parent
        if (project_root / "pyproject.toml").exists():
            break
        levels_left -= 1

    target_dir = project_root / DEFAULT_CACHE_DIR
    target_dir.mkdir(parents=True, exist_ok=True)
    return target_dir / DEFAULT_CACHE_FILE
52def _extract_club_codes_from_html(html: str) -> set[str]:
53 """Extract 2-digit club codes from HRNZ results page HTML.
55 Looks for URL patterns like ``010741rs.htm`` or ``102402rs.htm``
56 where the two middle digits are the club code (e.g., ``41``, ``24``).
58 Args:
59 html: Raw HTML content of an HRNZ results or index page.
61 Returns:
62 Set of unique 2-digit club code strings found in the page.
63 """
64 codes: set[str] = set()
66 # Pattern: looks for mmddCCrs.htm where CC is the 2-digit club code
67 # Matches any 6+ digits followed by "rs.htm" or similar
68 patterns = [
69 r"(?<!\d)(\d{2})(?:\d{2})(\d{2})rs\.htm", # mmddCCrs.htm
70 r"(?<!\d)(\d{2})(\d{2})(\d{2})rs\.htm", # yymmddCCrs.htm
71 ]
73 for pattern in patterns:
74 for match in re.finditer(pattern, html, re.IGNORECASE):
75 # The last 2-digit capture group is the club code
76 code = match.group(match.lastindex or len(match.groups()))
77 if code.isdigit() and 0 <= int(code) <= 99:
78 codes.add(code)
80 # Also look for explicit links with club codes in query params
81 club_param_pattern = r"[?&]club(?:_no|code|id)?[= ](\d{1,2})(?:&|$|\s)"
82 for match in re.finditer(club_param_pattern, html, re.IGNORECASE):
83 code = match.group(1).zfill(2)
84 if code.isdigit() and 0 <= int(code) <= 99:
85 codes.add(code)
87 # Also scan for any isolated 2-digit numbers near "club" or "meeting" context
88 context_pattern = r"(?:club|meeting|venue)[^<]*?(\d{2})[^<]*?(?:rs\.htm|results?)"
89 for match in re.finditer(context_pattern, html, re.IGNORECASE):
90 code = match.group(1)
91 if code.isdigit() and 0 <= int(code) <= 99:
92 codes.add(code)
94 return codes
def _try_fetch_infohorse_index() -> str | None:
    """Download the infohorse results directory index, if it is served as HTML.

    Any failure (HTTP error, network problem or other exception) is logged
    as a warning and reported as ``None`` instead of being raised.

    Returns:
        The page body as a string, or ``None`` when the request fails or the
        response is not HTML.
    """
    try:
        response = httpx.get(INFOHORSE_BASE, timeout=15.0, follow_redirects=True)
        response.raise_for_status()
        mime = response.headers.get("content-type", "")
        # A case-insensitive "html" test also covers an exact "text/html" hit.
        if "html" not in mime.lower():
            logger.info(
                "Infohorse index returned non-HTML (%s); trying next source", mime
            )
            return None
        logger.info("Fetched infohorse index page (%d bytes)", len(response.text))
        return response.text
    except httpx.HTTPError as exc:
        logger.warning("Failed to fetch infohorse index: %s", exc)
        return None
    except Exception as exc:
        logger.warning("Unexpected error fetching infohorse index: %s", exc)
        return None
def _try_fetch_harness_index() -> str | None:
    """Download the harness.org.nz results index page.

    Failures are logged as warnings and turned into ``None``; nothing is
    raised to the caller.

    Returns:
        The page body as a string, or ``None`` if the request failed.
    """
    try:
        response = httpx.get(HARNESS_ORG_NZ, timeout=15.0, follow_redirects=True)
        response.raise_for_status()
        page = response.text
        logger.info("Fetched harness.org.nz index page (%d bytes)", len(page))
        return page
    except httpx.HTTPError as http_err:
        logger.warning("Failed to fetch harness.org.nz index: %s", http_err)
    except Exception as other_err:
        logger.warning("Unexpected error fetching harness.org.nz index: %s", other_err)
    # Reached only when one of the handlers above ran.
    return None
def _try_fetch_todays_meeting() -> str | None:
    """Probe recent meeting result pages and return the first HTML one found.

    Builds ``<mmdd>41rs.htm`` URLs under ``INFOHORSE_BASE`` for yesterday,
    two days ago and three days ago (club code "41" is used as the probe)
    and returns the first response that has status 200 and an HTML content
    type. HTTP errors on a probe are skipped silently.

    Returns:
        HTML string of the first usable page, or ``None`` if no probe
        succeeded.
    """
    from datetime import date, timedelta

    reference_day = date.today()
    # Start at yesterday: HRNZ pages may not be published for today yet.
    for offset in (1, 2, 3):
        probe_day = reference_day - timedelta(days=offset)
        url = f"{INFOHORSE_BASE}{probe_day.strftime('%m%d')}41rs.htm"
        try:
            resp = httpx.get(url, timeout=15.0, follow_redirects=True)
            if resp.status_code != 200:
                continue
            mime = resp.headers.get("content-type", "")
            if "html" not in mime.lower():
                continue
            logger.info("Fetched meeting page %s (%d bytes)", url, len(resp.text))
            return resp.text
        except httpx.HTTPError:
            continue

    logger.warning("Could not fetch any recent meeting page for club code discovery")
    return None
def fetch_club_codes_from_hrnz() -> set[str]:
    """Collect club codes from every HRNZ source that responds.

    Sources are consulted in this order:
      1. the infohorse.hrnz.co.nz results directory index,
      2. the harness.org.nz/racing/results/ index page,
      3. a recent meeting page probe - only when the first two sources
         produced no codes at all.

    Returns:
        Set of 2-digit club code strings (e.g., {"02", "07", "15", ...}).
        Empty when no source yielded anything.
    """
    discovered: set[str] = set()

    def _collect(page: str | None, message: str) -> None:
        # Parse one fetched page (if any) and merge its codes into the result.
        if not page:
            return
        found = _extract_club_codes_from_html(page)
        logger.info(message, len(found))
        discovered.update(found)

    _collect(
        _try_fetch_infohorse_index(),
        "Extracted %d club codes from infohorse index",
    )
    _collect(
        _try_fetch_harness_index(),
        "Extracted %d club codes from harness.org.nz index",
    )
    if not discovered:
        _collect(
            _try_fetch_todays_meeting(),
            "Extracted %d club codes from meeting page probe",
        )

    return discovered
def load_cached_codes() -> list[str]:
    """Load previously cached club codes from disk.

    This is a best-effort read: a missing, unreadable or malformed cache
    never raises, it simply yields an empty list. Entries are converted to
    strings, stripped of surrounding whitespace and zero-padded to two
    characters; non-numeric entries are dropped.

    Returns:
        List of club code strings, or an empty list if no usable cache
        exists.
    """
    try:
        # Resolving the path may create the cache directory, which can fail
        # with OSError (e.g. read-only filesystem) - treat that as "no cache".
        cache_path = _get_cache_path()
        if not cache_path.exists():
            return []
        data = json.loads(cache_path.read_text(encoding="utf-8"))
    except (json.JSONDecodeError, UnicodeDecodeError, OSError) as exc:
        logger.warning("Failed to load cached club codes: %s", exc)
        return []

    # Valid JSON is not necessarily a dict (could be a list, string, ...).
    raw_codes = data.get("club_codes", []) if isinstance(data, dict) else None
    if not isinstance(raw_codes, list):
        logger.warning("Ignoring malformed club code cache: %s", cache_path)
        return []

    # Strip before both the digit check and the padding so " 7" becomes "07".
    stripped = (str(item).strip() for item in raw_codes)
    codes = [text.zfill(2) for text in stripped if text.isascii() and text.isdigit()]
    logger.info("Loaded %d club codes from cache: %s", len(codes), cache_path)
    return codes
def save_cached_codes(codes: list[str]) -> None:
    """Save club codes to disk cache.

    The codes are de-duplicated and sorted before being written as JSON
    under the ``club_codes`` key. This is a best-effort write: any
    ``OSError`` - including failure to create the cache directory - is
    logged as a warning and swallowed.

    Args:
        codes: List of 2-digit club code strings.
    """
    unique_codes = sorted(set(codes))
    payload = json.dumps(
        {"club_codes": unique_codes, "source": "hrnz_club_refresh"},
        indent=2,
    )
    try:
        # Path resolution creates the cache directory, so it belongs inside
        # the same handler as the write itself.
        cache_path = _get_cache_path()
        cache_path.write_text(payload, encoding="utf-8")
    except OSError as exc:
        logger.warning("Failed to save club codes to cache: %s", exc)
        return
    # Report what was actually written, i.e. the de-duplicated count.
    logger.info("Saved %d club codes to cache: %s", len(unique_codes), cache_path)
def refresh_club_codes(
    hardcoded_codes: list[str] | None = None,
    use_cache_fallback: bool = True,
    save_cache: bool = True,
) -> dict[str, Any]:
    """Fetch current HRNZ club codes and compare with the hardcoded list.

    A live fetch is attempted first. If it yields nothing (or raises) and
    ``use_cache_fallback`` is set, the on-disk cache is used instead. If
    neither produces codes, the hardcoded list itself is returned together
    with an error message.

    Args:
        hardcoded_codes: The hardcoded list to compare against.
            Defaults to importing ``HRNZ_ALL_CLUB_CODES`` from settings.
        use_cache_fallback: If True and live fetch fails, try loading from
            disk cache.
        save_cache: If True, save the fetched codes to disk cache. Only
            codes obtained from a live fetch are written; codes that were
            themselves read from the cache are not re-saved.

    Returns:
        Dict with keys:
        - ``fetched``: Full set of codes discovered from HRNZ (sorted list)
        - ``new``: Codes found online but NOT in hardcoded list (sorted list)
        - ``missing``: Codes in hardcoded list but NOT found online (sorted list)
        - ``unmatched``: Hardcoded codes with no online confirmation (sorted
          list; currently the same contents as ``missing``)
        - ``source``: Where the codes came from (``"hrnz"``, ``"cache"``, or ``"hardcoded"``)
        - ``error``: Error message if fetching failed entirely, else ``None``
    """
    if hardcoded_codes is None:
        # Lazy import to avoid circular dependency at module level
        from packages.core.common.settings import HRNZ_ALL_CLUB_CODES

        hardcoded_codes = HRNZ_ALL_CLUB_CODES

    # str() lets integer codes in the input be normalised as well.
    hardcoded_set = {str(c).zfill(2) for c in hardcoded_codes}

    logger.info(
        "Refreshing HRNZ club codes (hardcoded list has %d codes)", len(hardcoded_set)
    )

    fetched: set[str] = set()
    source = "hardcoded"

    # Try live fetch
    try:
        fetched = fetch_club_codes_from_hrnz()
    except Exception as exc:
        logger.error("Error during HRNZ club code fetch: %s", exc, exc_info=True)

    if fetched:
        source = "hrnz"
    elif use_cache_fallback:
        # Fallback to cache; the source label must stay "cache" from here on.
        cached = load_cached_codes()
        if cached:
            fetched = {c.zfill(2) for c in cached}
            source = "cache"
            logger.info("Using cached club codes (%d codes)", len(fetched))

    if not fetched:
        # No codes fetched from any source; return hardcoded as fallback
        logger.warning(
            "Could not fetch HRNZ club codes from any source; returning hardcoded list as fallback"
        )
        return {
            "fetched": sorted(hardcoded_set),
            "new": [],
            "missing": [],
            "unmatched": [],
            "source": "hardcoded",
            "error": "Could not fetch club codes from HRNZ; using hardcoded/cached list",
        }

    fetched_sorted = sorted(fetched)
    new_codes = sorted(fetched - hardcoded_set)
    missing_codes = sorted(hardcoded_set - fetched)
    # Separate list object so callers mutating one key do not affect the other.
    unmatched_codes = list(missing_codes)

    # Re-saving codes that came from the cache would only rewrite stale data.
    if save_cache and source == "hrnz":
        save_cached_codes(fetched_sorted)

    logger.info(
        "Club code refresh complete: %d fetched, %d new, %d missing",
        len(fetched),
        len(new_codes),
        len(missing_codes),
    )

    if new_codes:
        logger.warning(
            "NEW HRNZ club codes found (not in hardcoded list): %s",
            ", ".join(new_codes),
        )
    if missing_codes:
        logger.info(
            "Hardcoded codes not found online (may be inactive): %s",
            ", ".join(missing_codes),
        )

    return {
        "fetched": fetched_sorted,
        "new": new_codes,
        "missing": missing_codes,
        "unmatched": unmatched_codes,
        "source": source,
        "error": None,
    }
def generate_diff_report(result: dict[str, Any]) -> str:
    """Generate a human-readable diff report from a refresh result.

    Args:
        result: The dict returned by ``refresh_club_codes()``. Missing or
            ``None`` list entries are treated as empty.

    Returns:
        Formatted report string suitable for console output.
    """
    fetched = result.get("fetched") or []
    new_codes = result.get("new") or []
    missing_codes = result.get("missing") or []

    # The hardcoded list is the fetched codes that are not new, plus the
    # hardcoded codes that were not found: |fetched| - |new| + |missing|.
    hardcoded_total = len(fetched) - len(new_codes) + len(missing_codes)

    rule = "=" * 60
    lines: list[str] = [
        rule,
        "HRNZ CLUB CODE REFRESH REPORT",
        rule,
        f"Source: {result.get('source', 'unknown')}",
        f"Total codes found: {len(fetched)}",
        f"Hardcoded codes: {hardcoded_total}",
    ]

    if result.get("error"):
        lines.append(f"\n⚠ Error: {result['error']}")

    if new_codes:
        lines.append(f"\n🆕 NEW codes (not in hardcoded list): {len(new_codes)}")
        lines.extend(f" + {code}" for code in new_codes)
    else:
        lines.append("\n✅ No new codes found")

    if missing_codes:
        lines.append(
            f"\n🗑️ MISSING codes (in hardcoded but not found): {len(missing_codes)}"
        )
        lines.extend(f" - {code}" for code in missing_codes)
    else:
        lines.append("\n✅ No missing codes")

    lines.append(rule)
    return "\n".join(lines)
if __name__ == "__main__":
    # Manual smoke test: configure logging, run one refresh against the
    # default hardcoded list and print the resulting report.
    from packages.core.common.logging import setup_logging

    setup_logging()
    print(generate_diff_report(refresh_club_codes()))