Coverage for packages / hrnz_scraper / club_refresh.py: 0%

186 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-05-08 08:37 +1200

1"""HRNZ club code auto-refresh module. 

2 

3Fetches the HRNZ results index page to discover all active club codes, 

4then compares them against the hardcoded HRNZ_ALL_CLUB_CODES list. 

5 

6Usage: 

7 from packages.hrnz_scraper.club_refresh import refresh_club_codes 

8 result = refresh_club_codes() 

9 # result = {"fetched": [...], "new": [...], "missing": [...], "unmatched": [...], "source": "...", "error": None} 

10""" 

11 

12from __future__ import annotations 

13 

14import json 

15import re 

16from pathlib import Path 

17from typing import Any 

18 

19import httpx 

20 

21from packages.core.common.logging import get_logger 

22 

# Module-level logger, named after this module.
logger = get_logger(__name__)

# HRNZ URL patterns
# Base URL of the infohorse results directory; meeting result pages are
# addressed by appending a file name to this prefix.
INFOHORSE_BASE = "https://infohorse.hrnz.co.nz/datahrs/results/"
# Results index page on harness.org.nz, used as a second discovery source.
HARNESS_ORG_NZ = "https://www.harness.org.nz/racing/results/"

# Fallback cache path (relative to project root): directory and file name of
# the JSON file that stores the most recently saved set of club codes.
DEFAULT_CACHE_DIR = "data"
# NOTE(review): the file name reads "hrzn" rather than "hrnz" -- possibly a
# typo. Renaming it would orphan any existing cache file, so confirm first.
DEFAULT_CACHE_FILE = "hrzn_club_codes.json"

32 

33 

def _get_cache_path() -> Path:
    """Locate the club code cache file, creating its directory if needed.

    The project root is taken to be the nearest ancestor directory of this
    module that contains a ``pyproject.toml``. At most six ancestors are
    inspected; if none of them qualifies, the sixth ancestor is used as the
    root.

    Returns:
        Path of the cache file inside ``<root>/DEFAULT_CACHE_DIR``. The
        directory is created as a side effect; the file itself is not.
    """
    root = Path(__file__).resolve()
    for _ in range(6):
        # Step one level up, then stop as soon as the marker file is present.
        root = root.parent
        if (root / "pyproject.toml").exists():
            break

    cache_file = root / DEFAULT_CACHE_DIR / DEFAULT_CACHE_FILE
    cache_file.parent.mkdir(parents=True, exist_ok=True)
    return cache_file

50 

51 

52def _extract_club_codes_from_html(html: str) -> set[str]: 

53 """Extract 2-digit club codes from HRNZ results page HTML. 

54 

55 Looks for URL patterns like ``010741rs.htm`` or ``102402rs.htm`` 

56 where the two middle digits are the club code (e.g., ``41``, ``24``). 

57 

58 Args: 

59 html: Raw HTML content of an HRNZ results or index page. 

60 

61 Returns: 

62 Set of unique 2-digit club code strings found in the page. 

63 """ 

64 codes: set[str] = set() 

65 

66 # Pattern: looks for mmddCCrs.htm where CC is the 2-digit club code 

67 # Matches any 6+ digits followed by "rs.htm" or similar 

68 patterns = [ 

69 r"(?<!\d)(\d{2})(?:\d{2})(\d{2})rs\.htm", # mmddCCrs.htm 

70 r"(?<!\d)(\d{2})(\d{2})(\d{2})rs\.htm", # yymmddCCrs.htm 

71 ] 

72 

73 for pattern in patterns: 

74 for match in re.finditer(pattern, html, re.IGNORECASE): 

75 # The last 2-digit capture group is the club code 

76 code = match.group(match.lastindex or len(match.groups())) 

77 if code.isdigit() and 0 <= int(code) <= 99: 

78 codes.add(code) 

79 

80 # Also look for explicit links with club codes in query params 

81 club_param_pattern = r"[?&]club(?:_no|code|id)?[= ](\d{1,2})(?:&|$|\s)" 

82 for match in re.finditer(club_param_pattern, html, re.IGNORECASE): 

83 code = match.group(1).zfill(2) 

84 if code.isdigit() and 0 <= int(code) <= 99: 

85 codes.add(code) 

86 

87 # Also scan for any isolated 2-digit numbers near "club" or "meeting" context 

88 context_pattern = r"(?:club|meeting|venue)[^<]*?(\d{2})[^<]*?(?:rs\.htm|results?)" 

89 for match in re.finditer(context_pattern, html, re.IGNORECASE): 

90 code = match.group(1) 

91 if code.isdigit() and 0 <= int(code) <= 99: 

92 codes.add(code) 

93 

94 return codes 

95 

96 

def _try_fetch_infohorse_index() -> str | None:
    """Download the infohorse results directory listing, if it is HTML.

    The request follows redirects and uses a 15 second timeout. Failures
    (``httpx.HTTPError`` as well as any other exception) are logged as
    warnings and never propagate to the caller.

    Returns:
        The response body when the request succeeds and the
        ``content-type`` header mentions HTML; otherwise ``None``.
    """
    try:
        response = httpx.get(INFOHORSE_BASE, timeout=15.0, follow_redirects=True)
        response.raise_for_status()
        media_type = response.headers.get("content-type", "")
        if "html" not in media_type.lower():
            logger.info(
                "Infohorse index returned non-HTML (%s); trying next source", media_type
            )
            return None
        body = response.text
        logger.info("Fetched infohorse index page (%d bytes)", len(body))
        return body
    except httpx.HTTPError as exc:
        logger.warning("Failed to fetch infohorse index: %s", exc)
    except Exception as exc:
        logger.warning("Unexpected error fetching infohorse index: %s", exc)
    return None

120 

121 

def _try_fetch_harness_index() -> str | None:
    """Download the harness.org.nz results index page.

    The request follows redirects and uses a 15 second timeout. Unlike the
    infohorse fetch, the response content type is not inspected. Failures
    (``httpx.HTTPError`` as well as any other exception) are logged as
    warnings and never propagate to the caller.

    Returns:
        The response body on success, otherwise ``None``.
    """
    try:
        response = httpx.get(HARNESS_ORG_NZ, timeout=15.0, follow_redirects=True)
        response.raise_for_status()
        body = response.text
        logger.info("Fetched harness.org.nz index page (%d bytes)", len(body))
        return body
    except httpx.HTTPError as exc:
        logger.warning("Failed to fetch harness.org.nz index: %s", exc)
    except Exception as exc:
        logger.warning("Unexpected error fetching harness.org.nz index: %s", exc)
    return None

139 

140 

def _try_fetch_todays_meeting() -> str | None:
    """Probe for a recent meeting results page to mine for club codes.

    Despite the name, today's page is not requested: the probe walks back
    from yesterday to three days ago, building ``<mmdd>41rs.htm`` under
    ``INFOHORSE_BASE`` (club code ``41`` serves as the fixed probe code).
    The first response with status 200 and an HTML content type wins.
    Non-200 or non-HTML responses and ``httpx.HTTPError`` failures are
    skipped without logging; other exceptions propagate to the caller.

    Returns:
        HTML of the first usable page, or ``None`` if no probe succeeded.
    """
    from datetime import date, timedelta

    reference_day = date.today()
    for offset in (1, 2, 3):
        probe_day = reference_day - timedelta(days=offset)
        url = INFOHORSE_BASE + probe_day.strftime("%m%d") + "41rs.htm"
        try:
            response = httpx.get(url, timeout=15.0, follow_redirects=True)
            if response.status_code != 200:
                continue
            if "html" not in response.headers.get("content-type", "").lower():
                continue
            page = response.text
            logger.info("Fetched meeting page %s (%d bytes)", url, len(page))
            return page
        except httpx.HTTPError:
            continue

    logger.warning("Could not fetch any recent meeting page for club code discovery")
    return None

173 

174 

def fetch_club_codes_from_hrnz() -> set[str]:
    """Collect every club code that can be discovered from HRNZ sources.

    Both index pages are always consulted and their codes merged:

    1. infohorse.hrnz.co.nz results directory index
    2. harness.org.nz/racing/results/ index page

    Only when neither yields a code is a recent meeting page probed as a
    last resort.

    Returns:
        Set of 2-digit club code strings (e.g., {"02", "07", "15", ...}).
        May be empty if all sources fail.
    """
    discovered: set[str] = set()

    # Each entry pairs a page fetcher with the log line for its result.
    index_sources = (
        (_try_fetch_infohorse_index, "Extracted %d club codes from infohorse index"),
        (_try_fetch_harness_index, "Extracted %d club codes from harness.org.nz index"),
    )
    for fetch_page, log_template in index_sources:
        page = fetch_page()
        if page:
            found = _extract_club_codes_from_html(page)
            logger.info(log_template, len(found))
            discovered |= found

    if discovered:
        return discovered

    # Nothing from the index pages: fall back to probing a meeting page.
    probe_page = _try_fetch_todays_meeting()
    if probe_page:
        found = _extract_club_codes_from_html(probe_page)
        logger.info("Extracted %d club codes from meeting page probe", len(found))
        discovered |= found

    return discovered

212 

213 

def load_cached_codes() -> list[str]:
    """Load previously cached club codes from disk.

    Loading is best-effort: a missing file, an unreadable or undecodable
    file, invalid JSON, or a payload of unexpected shape all result in an
    empty list (with a warning logged for everything except a missing file).

    Returns:
        List of club code strings, stripped of surrounding whitespace and
        zero-padded to at least two characters. Entries that are not purely
        numeric are dropped. Empty list if no usable cache exists.
    """
    try:
        # Resolving the path may create the cache directory, which can fail
        # with OSError, so it belongs inside the guarded section.
        cache_path = _get_cache_path()
        if not cache_path.exists():
            return []
        payload = json.loads(cache_path.read_text(encoding="utf-8"))
    except (OSError, ValueError) as exc:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        logger.warning("Failed to load cached club codes: %s", exc)
        return []

    raw_codes = payload.get("club_codes", []) if isinstance(payload, dict) else None
    if not isinstance(raw_codes, list):
        logger.warning("Ignoring malformed club code cache: %s", cache_path)
        return []

    # Strip before both validating and padding so " 7" becomes "07".
    stripped = (str(item).strip() for item in raw_codes)
    codes = [text.zfill(2) for text in stripped if text.isdigit()]
    logger.info("Loaded %d club codes from cache: %s", len(codes), cache_path)
    return codes

234 

235 

def save_cached_codes(codes: list[str]) -> None:
    """Save club codes to disk cache.

    The codes are de-duplicated and sorted before being written as JSON
    under the ``club_codes`` key. Saving is best-effort: any ``OSError``
    (including failure to create the cache directory) is logged as a
    warning and swallowed.

    Args:
        codes: List of 2-digit club code strings.
    """
    unique_codes = sorted(set(codes))
    payload = json.dumps(
        {"club_codes": unique_codes, "source": "hrnz_club_refresh"},
        indent=2,
    )
    try:
        # Path resolution may create the directory, so guard it as well.
        cache_path = _get_cache_path()
        cache_path.write_text(payload, encoding="utf-8")
    except OSError as exc:
        logger.warning("Failed to save club codes to cache: %s", exc)
        return

    # Report the number of codes actually written, not the raw input length.
    logger.info("Saved %d club codes to cache: %s", len(unique_codes), cache_path)

253 

254 

def refresh_club_codes(
    hardcoded_codes: list[str] | None = None,
    use_cache_fallback: bool = True,
    save_cache: bool = True,
) -> dict[str, Any]:
    """Fetch current HRNZ club codes and compare with the hardcoded list.

    Args:
        hardcoded_codes: The hardcoded list to compare against.
            Defaults to importing ``HRNZ_ALL_CLUB_CODES`` from settings.
        use_cache_fallback: If True and live fetch fails, try loading from
            disk cache.
        save_cache: If True, save codes obtained from a live fetch to the
            disk cache. Codes that were themselves read from the cache are
            not written back.

    Returns:
        Dict with keys:
        - ``fetched``: Full set of codes discovered (sorted list); the
          hardcoded list itself when nothing could be fetched
        - ``new``: Codes found but NOT in hardcoded list (sorted list)
        - ``missing``: Codes in hardcoded list but NOT found (sorted list)
        - ``unmatched``: Hardcoded codes with no confirmation (sorted list);
          currently always equal to ``missing``
        - ``source``: Where the codes came from (``"hrnz"``, ``"cache"``,
          or ``"hardcoded"``)
        - ``error``: Error message if fetching failed entirely, else None
    """
    if hardcoded_codes is None:
        # Lazy import to avoid circular dependency at module level
        from packages.core.common.settings import HRNZ_ALL_CLUB_CODES

        hardcoded_codes = HRNZ_ALL_CLUB_CODES

    # str() tolerates integer entries; string entries are unaffected.
    hardcoded_set = {str(c).zfill(2) for c in hardcoded_codes}

    logger.info(
        "Refreshing HRNZ club codes (hardcoded list has %d codes)", len(hardcoded_set)
    )

    fetched: set[str] = set()
    source = "hardcoded"

    # Try live fetch. This is a deliberate catch-all boundary: whatever goes
    # wrong while scraping must not prevent the cache/hardcoded fallbacks.
    try:
        fetched = fetch_club_codes_from_hrnz()
    except Exception as exc:
        logger.error("Error during HRNZ club code fetch: %s", exc, exc_info=True)

    if fetched:
        source = "hrnz"
    elif use_cache_fallback:
        # Fallback to cache; keep the "cache" label so callers can tell the
        # data is not fresh.
        cached = load_cached_codes()
        if cached:
            fetched = {c.zfill(2) for c in cached}
            source = "cache"
            logger.info("Using cached club codes (%d codes)", len(fetched))

    if not fetched:
        # No codes fetched from any source; return hardcoded as fallback
        logger.warning(
            "Could not fetch HRNZ club codes from any source; returning hardcoded list as fallback"
        )
        return {
            "fetched": sorted(hardcoded_set),
            "new": [],
            "missing": [],
            "unmatched": [],
            "source": "hardcoded",
            "error": "Could not fetch club codes from HRNZ; using hardcoded/cached list",
        }

    fetched_sorted = sorted(fetched)
    new_codes = sorted(fetched - hardcoded_set)
    missing_codes = sorted(hardcoded_set - fetched)
    unmatched_codes = list(missing_codes)

    # Only live results refresh the cache; rewriting cached data is pointless.
    if save_cache and source == "hrnz":
        save_cached_codes(fetched_sorted)

    logger.info(
        "Club code refresh complete: %d fetched, %d new, %d missing",
        len(fetched),
        len(new_codes),
        len(missing_codes),
    )

    if new_codes:
        logger.warning(
            "NEW HRNZ club codes found (not in hardcoded list): %s",
            ", ".join(new_codes),
        )
    if missing_codes:
        logger.info(
            "Hardcoded codes not found online (may be inactive): %s",
            ", ".join(missing_codes),
        )

    return {
        "fetched": fetched_sorted,
        "new": new_codes,
        "missing": missing_codes,
        "unmatched": unmatched_codes,
        "source": source,
        "error": None,
    }

356 

357 

def generate_diff_report(result: dict[str, Any]) -> str:
    """Generate a human-readable diff report from a refresh result.

    Args:
        result: The dict returned by ``refresh_club_codes()``. Missing or
            ``None`` list entries are treated as empty.

    Returns:
        Formatted report string suitable for console output.
    """
    fetched = result.get("fetched") or []
    new_codes = result.get("new") or []
    missing_codes = result.get("missing") or []

    # The hardcoded list consists of the fetched codes it already contained
    # (fetched minus new) plus the ones that were not found (missing).
    hardcoded_total = len(fetched) - len(new_codes) + len(missing_codes)

    rule = "=" * 60
    lines: list[str] = [
        rule,
        "HRNZ CLUB CODE REFRESH REPORT",
        rule,
        f"Source: {result.get('source', 'unknown')}",
        f"Total codes found: {len(fetched)}",
        f"Hardcoded codes: {hardcoded_total}",
    ]

    error = result.get("error")
    if error:
        lines.append(f"\n⚠ Error: {error}")

    if new_codes:
        lines.append(f"\n🆕 NEW codes (not in hardcoded list): {len(new_codes)}")
        lines.extend(f" + {code}" for code in new_codes)
    else:
        lines.append("\n✅ No new codes found")

    if missing_codes:
        lines.append(
            f"\n🗑️ MISSING codes (in hardcoded but not found): {len(missing_codes)}"
        )
        lines.extend(f" - {code}" for code in missing_codes)
    else:
        lines.append("\n✅ No missing codes")

    lines.append(rule)
    return "\n".join(lines)

401 

402 

if __name__ == "__main__":
    # Manual smoke test: configure logging, run a single refresh with the
    # default arguments and print the resulting report to stdout.
    from packages.core.common.logging import setup_logging

    setup_logging()
    print(generate_diff_report(refresh_club_codes()))

409 print(generate_diff_report(result))