Coverage for packages / hrnz_scraper / scraper.py: 36%

262 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-05-08 08:37 +1200

1"""HRNZ web scraper for extracting historical race data using Playwright. 

2 

3This scraper extracts race results from the HRNZ InfoHorse results archive. 

4It respects rate limits and implements polite scraping practices. 

5 

6WARNING: Web scraping should only be used if official API access is not available. 

7Always check HRNZ's Terms of Service and consider contacting them for official data access. 

8""" 

9 

10import asyncio 

11import re 

12from datetime import datetime 

13from typing import Any 

14from urllib.parse import urljoin 

15 

16from bs4 import BeautifulSoup 

17 

18try: 

19 from playwright.async_api import Browser, Page, async_playwright 

20except ImportError: # pragma: no cover - optional dependency for scraping runtime 

21 async_playwright = None 

22 Browser = Page = Any 

23 

24from packages.core.common.logging import get_logger 

25from packages.hrnz_scraper.proxy import build_decodo_proxy 

26 

27logger = get_logger(__name__) 

28 

29 

class HRNZScraperError(Exception):
    """Root exception type for failures raised by the HRNZ scraper.

    Callers can catch this single type to handle any scraper-specific error.
    """

34 

35 

36class HRNZScraper: 

37 """Scraper for HRNZ InfoHorse results archive using Playwright. 

38 

39 This scraper extracts race meetings, races, and results from the 

40 publicly accessible HRNZ results archive at infohorse.hrnz.co.nz. 

41 

42 Example: 

43 >>> async with HRNZScraper() as scraper: 

44 >>> meeting = await scraper.get_meeting_results('010741rs.htm') 

45 """ 

46 

47 BASE_URL = "https://infohorse.hrnz.co.nz/datahrs/results/" 

48 

49 # Rate limiting: 1 request per 2 seconds to be polite 

50 RATE_LIMIT_DELAY = 2.0 

51 

52 def __init__(self, timeout: float = 30000): # Playwright uses milliseconds 

53 """Initialize HRNZ scraper. 

54 

55 Args: 

56 timeout: Request timeout in milliseconds (default: 30000ms = 30s) 

57 """ 

58 self.timeout = timeout 

59 self._playwright = None 

60 self._browser: Browser | None = None 

61 self._last_request_time = 0.0 

62 

63 async def __aenter__(self): 

64 """Async context manager entry.""" 

65 await self._ensure_browser() 

66 return self 

67 

68 async def __aexit__(self, exc_type, exc_val, exc_tb): 

69 """Async context manager exit.""" 

70 await self.close() 

71 

72 async def _ensure_browser(self): 

73 """Ensure Playwright browser is initialized.""" 

74 if async_playwright is None: 

75 raise ImportError( 

76 "playwright is required for HRNZ scraping; install it or use the API ingest path" 

77 ) 

78 if self._browser is None: 

79 self._playwright = await async_playwright().start() 

80 self._browser = await self._playwright.chromium.launch(headless=True) 

81 logger.info("Playwright browser initialized") 

82 

83 async def close(self): 

84 """Close Playwright browser.""" 

85 if self._browser is not None: 

86 await self._browser.close() 

87 self._browser = None 

88 if self._playwright is not None: 

89 await self._playwright.stop() 

90 self._playwright = None 

91 logger.info("Playwright browser closed") 

92 

93 async def _rate_limited_fetch(self, url: str) -> str: 

94 """Fetch page with rate limiting and wait for content. 

95 

96 Args: 

97 url: URL to fetch 

98 

99 Returns: 

100 Page HTML content 

101 

102 Raises: 

103 HRNZScraperError: If request fails 

104 """ 

105 await self._ensure_browser() 

106 

107 # Enforce rate limit 

108 import time 

109 

110 elapsed = time.time() - self._last_request_time 

111 if elapsed < self.RATE_LIMIT_DELAY: 

112 await asyncio.sleep(self.RATE_LIMIT_DELAY - elapsed) 

113 

114 logger.debug(f"Fetching: {url}") 

115 

116 try: 

117 proxy = build_decodo_proxy() 

118 context = ( 

119 await self._browser.new_context(proxy=proxy) 

120 if proxy 

121 else await self._browser.new_context() 

122 ) 

123 page: Page = await context.new_page() 

124 try: 

125 # Navigate to page and wait for network idle 

126 await page.goto(url, timeout=self.timeout, wait_until="networkidle") 

127 

128 # Wait a bit for any JavaScript to finish rendering 

129 await page.wait_for_timeout(1000) 

130 

131 # Get the HTML content 

132 content = await page.content() 

133 

134 self._last_request_time = time.time() 

135 return content 

136 finally: 

137 await page.close() 

138 await context.close() 

139 except Exception as e: 

140 raise HRNZScraperError(f"Failed to fetch {url}: {e}") from e 

141 

142 async def get_meeting_results(self, url: str) -> dict[str, Any]: 

143 """Scrape results from a specific meeting page. 

144 

145 Args: 

146 url: URL to meeting results page (e.g., '010741rs.htm' or full URL) 

147 

148 Returns: 

149 Dictionary containing meeting and race data 

150 

151 Example: 

152 >>> meeting = await scraper.get_meeting_results('102402rs.htm') 

153 >>> print(f"Found {len(meeting['races'])} races") 

154 """ 

155 # Construct full URL if relative path given 

156 if not url.startswith("http"): 

157 url = urljoin(self.BASE_URL, url) 

158 

159 html = await self._rate_limited_fetch(url) 

160 soup = BeautifulSoup(html, "html.parser") 

161 

162 # Extract meeting information 

163 meeting_data = self._parse_meeting_header(soup) 

164 

165 # Extract races from tables 

166 races = self._parse_races(soup) 

167 

168 meeting_data["races"] = races 

169 meeting_data["source_url"] = url 

170 

171 logger.info( 

172 f"Scraped meeting: {meeting_data.get('venue')} " 

173 f"on {meeting_data.get('date')} - {len(races)} races" 

174 ) 

175 

176 return meeting_data 

177 

178 def _parse_meeting_header(self, soup: BeautifulSoup) -> dict[str, Any]: 

179 """Parse meeting header information from HRNZ page. 

180 

181 HRNZ uses specific structure: 

182 - h1 tag for venue/club name 

183 - div.hrnz-content__date for date 

184 - h5 tag for meeting details 

185 

186 Args: 

187 soup: BeautifulSoup object 

188 

189 Returns: 

190 Meeting metadata 

191 """ 

192 meeting = {} 

193 

194 # Find venue from h1 tag 

195 h1 = soup.find("h1") 

196 if h1: 

197 venue = h1.get_text(strip=True) 

198 # Clean up venue name (remove "Inc" suffix, etc.) 

199 venue = venue.replace(" Inc", "").replace(" Inc.", "").strip() 

200 meeting["venue"] = venue 

201 logger.debug(f"Found venue: {venue}") 

202 

203 # Find date from specific div class 

204 date_div = soup.find("div", class_="hrnz-content__date") 

205 if date_div: 

206 date_text = date_div.get_text(strip=True) 

207 meeting["date_raw"] = date_text 

208 # Format: "Wednesday, 7 January" or "Wednesday, 7 January 2026" 

209 parsed_date = self._parse_date(date_text) 

210 if parsed_date: 

211 meeting["date"] = parsed_date 

212 logger.debug(f"Found date: {parsed_date}") 

213 

214 # Find meeting name from h5 in hrnz-field__meeting 

215 meeting_div = soup.find("div", class_="hrnz-field__meeting") 

216 if meeting_div: 

217 h5 = meeting_div.find("h5") 

218 if h5: 

219 meeting_name = h5.get_text(strip=True) 

220 # Extract just the meeting name part 

221 if " at " in meeting_name: 

222 meeting_name = meeting_name.split(" at ")[0].strip() 

223 meeting["name"] = meeting_name 

224 logger.debug(f"Found meeting name: {meeting_name}") 

225 

226 return meeting 

227 

228 def _parse_date(self, date_str: str) -> str | None: 

229 """Parse date string into ISO format. 

230 

231 Args: 

232 date_str: Date string in various formats 

233 

234 Returns: 

235 ISO format date string (YYYY-MM-DD) or None 

236 """ 

237 import datetime as dt 

238 

239 # Clean up the date string 

240 date_str = date_str.strip().replace("\xa0", " ") 

241 

242 # Try various date formats 

243 formats = [ 

244 "%A, %d %B %Y", # "Wednesday, 7 January 2026" 

245 "%A, %d %B", # "Wednesday, 7 January" (no year) 

246 "%d %B %Y", # "7 January 2026" 

247 "%d %B", # "7 January" (no year) 

248 "%d/%m/%Y", 

249 "%d-%m-%Y", 

250 "%d/%m/%y", 

251 "%d-%m-%y", 

252 ] 

253 

254 for fmt in formats: 

255 try: 

256 parsed = datetime.strptime(date_str.strip(), fmt) 

257 

258 # If no year in format, assume current year 

259 if "%Y" not in fmt and "%y" not in fmt: 

260 current_year = dt.datetime.now().year 

261 parsed = parsed.replace(year=current_year) 

262 

263 # Handle 2-digit years 

264 if parsed.year < 100: 

265 # Assume 20xx for years 00-50, 19xx for 51-99 

266 if parsed.year <= 50: 

267 parsed = parsed.replace(year=parsed.year + 2000) 

268 else: 

269 parsed = parsed.replace(year=parsed.year + 1900) 

270 

271 return parsed.date().isoformat() 

272 except ValueError: 

273 continue 

274 

275 logger.warning(f"Could not parse date: {date_str}") 

276 return None 

277 

278 def _parse_races(self, soup: BeautifulSoup) -> list[dict[str, Any]]: 

279 """Parse all races from meeting page. 

280 

281 HRNZ results are in HTML tables. Each table represents one race. 

282 Tables have headers: PlacePl, BookBk, Horse, Barrier, Hcap, Stakes, etc. 

283 

284 Args: 

285 soup: BeautifulSoup object 

286 

287 Returns: 

288 List of race dictionaries 

289 """ 

290 races = [] 

291 

292 # Find all tables - HRNZ has one table per race 

293 tables = soup.find_all("table") 

294 

295 logger.debug(f"Found {len(tables)} tables on page") 

296 

297 for idx, table in enumerate(tables, 1): 

298 try: 

299 race = self._parse_race_table(table, idx) 

300 if race and race.get("starters") and len(race["starters"]) > 0: 

301 races.append(race) 

302 logger.debug( 

303 f"Parsed race {idx}: {len(race.get('starters', []))} starters" 

304 ) 

305 else: 

306 logger.debug(f"Table {idx} had no starters, skipping") 

307 except Exception as e: 

308 logger.warning(f"Failed to parse table {idx}: {e}") 

309 continue 

310 

311 return races 

312 

313 def _parse_race_table( 

314 self, table: BeautifulSoup, race_number: int 

315 ) -> dict[str, Any]: 

316 """Parse individual race table. 

317 

318 Args: 

319 table: BeautifulSoup table element 

320 race_number: Race number (fallback if not in HTML) 

321 

322 Returns: 

323 Race dictionary with starters 

324 """ 

325 race = {"race_number": race_number, "starters": []} 

326 

327 # Try to extract race details from table caption or headers 

328 caption = table.find("caption") 

329 if caption: 

330 caption_text = caption.get_text(strip=True) 

331 # Parse race info from caption (e.g., "Race 1 - 2200m - Mobile") 

332 race_info_match = re.search( 

333 r"Race\s+(\d+).*?(\d+)m", caption_text, re.IGNORECASE 

334 ) 

335 if race_info_match: 

336 race["race_number"] = int(race_info_match.group(1)) 

337 race["distance_m"] = int(race_info_match.group(2)) 

338 

339 # Parse rows 

340 rows = table.find_all("tr") 

341 header_map = self._build_header_map(rows) 

342 

343 for row in rows: 

344 cells = row.find_all(["td", "th"]) 

345 if len(cells) < 4: # Not enough data for a starter 

346 continue 

347 

348 # Skip header rows 

349 if row.find("th"): 

350 continue 

351 

352 starter = self._parse_starter_row(cells, header_map) 

353 if starter: 

354 race["starters"].append(starter) 

355 

356 return race 

357 

358 @staticmethod 

359 def _build_header_map(rows: list) -> dict[str, int]: 

360 """Build a header map from column names to indices.""" 

361 for row in rows: 

362 headers = row.find_all("th") 

363 if not headers: 

364 continue 

365 header_map = {} 

366 for idx, header in enumerate(headers): 

367 text = header.get_text(strip=True) 

368 if text: 

369 header_map[text.strip().lower()] = idx 

370 if header_map: 

371 return header_map 

372 return {} 

373 

374 def _parse_starter_row( 

375 self, cells: list, header_map: dict[str, int] 

376 ) -> dict[str, Any] | None: 

377 """Parse a single starter row from race table. 

378 

379 HRNZ table structure (as of 2026): 

380 cells[0] = PlacePl (placing) 

381 cells[1] = BookBk (book number) 

382 cells[2] = Horse (with link containing UUID) 

383 cells[3] = Barrier 

384 cells[4] = Hcap (handicap) 

385 cells[5] = Stakes 

386 cells[6] = Fav (favorite odds) 

387 cells[7] = Time 

388 cells[8] = Margin 

389 cells[9] = Time/Margin 

390 

391 Args: 

392 cells: List of table cells 

393 

394 Returns: 

395 Starter dictionary or None 

396 """ 

397 try: 

398 if len(cells) < 5: # Need at least place, book, horse, barrier, hcap 

399 return None 

400 

401 starter = {} 

402 

403 def _cell_by_label(label: str) -> Any | None: 

404 target = label.lower() 

405 for cell in cells: 

406 data_label = cell.get("data-label") 

407 if data_label and data_label.strip().lower() == target: 

408 return cell 

409 idx = header_map.get(target) 

410 if idx is not None and idx < len(cells): 

411 return cells[idx] 

412 return None 

413 

414 # cells[0] = PlacePl (placing) 

415 placing_cell = ( 

416 _cell_by_label("Placing") or _cell_by_label("Place") or cells[0] 

417 ) 

418 pos_text = placing_cell.get_text(strip=True) 

419 if pos_text: 

420 pos_match = re.match(r"(\d+)", pos_text) 

421 if pos_match: 

422 starter["placing"] = int(pos_match.group(1)) 

423 elif pos_text.upper() in ("DNS", "DNF", "DSQ", "LR", "SCR", "NP"): 

424 starter["did_not_finish"] = True 

425 starter["placing"] = None 

426 else: 

427 starter["placing"] = None 

428 

429 # cells[2] = Horse (name and UUID from link) 

430 horse_cell = _cell_by_label("Horse") or cells[2] 

431 horse_link = horse_cell.find("a") 

432 if horse_link: 

433 starter["horse_name"] = horse_link.get_text(strip=True) 

434 horse_href = horse_link.get("href", "") 

435 horse_uuid = self._extract_uuid(horse_href) 

436 if horse_uuid: 

437 starter["horse_id"] = horse_uuid 

438 else: 

439 # No link, just text 

440 horse_name = horse_cell.get_text(strip=True) 

441 if horse_name: 

442 starter["horse_name"] = horse_name 

443 

444 # cells[3] = Barrier 

445 barrier_cell = ( 

446 _cell_by_label("Barrier") or _cell_by_label("Draw") or cells[3] 

447 ) 

448 barrier_text = barrier_cell.get_text(strip=True) 

449 if barrier_text: 

450 barrier_match = re.match(r"(\d+)", barrier_text) 

451 if barrier_match: 

452 starter["barrier"] = int(barrier_match.group(1)) 

453 

454 # cells[4] = Hcap (handicap in meters) 

455 # Can be "fr" (front), "10" (10m), "20" (20m), etc. 

456 hcap_cell = _cell_by_label("Hcap") or _cell_by_label("HCP") or cells[4] 

457 hcap_text = hcap_cell.get_text(strip=True) 

458 if hcap_text: 

459 if hcap_text.lower() == "fr": 

460 starter["handicap_m"] = 0 # Front = 0 handicap 

461 else: 

462 hcap_match = re.match(r"(\d+)", hcap_text) 

463 if hcap_match: 

464 starter["handicap_m"] = int(hcap_match.group(1)) 

465 

466 # cells[7] = Time (race time) 

467 time_text = cells[7].get_text(strip=True) if len(cells) > 7 else "" 

468 if time_text and time_text != "": 

469 starter["race_time"] = time_text 

470 

471 # cells[8] = Margin 

472 margin_text = cells[8].get_text(strip=True) if len(cells) > 8 else "" 

473 if margin_text and margin_text != "": 

474 starter["margin"] = margin_text 

475 

476 driver_cell = _cell_by_label("Driver") 

477 if driver_cell: 

478 driver_link = driver_cell.find("a") 

479 if driver_link: 

480 driver_name = driver_link.get_text(strip=True) 

481 if driver_name: 

482 starter["driver_name"] = driver_name 

483 driver_href = driver_link.get("href", "") 

484 driver_uuid = self._extract_uuid(driver_href) 

485 if driver_uuid: 

486 starter["driver_id"] = driver_uuid 

487 else: 

488 driver_name = driver_cell.get_text(strip=True) 

489 if driver_name: 

490 starter["driver_name"] = driver_name 

491 

492 trainer_cell = _cell_by_label("Trainer") 

493 if trainer_cell: 

494 trainer_link = trainer_cell.find("a") 

495 if trainer_link: 

496 trainer_name = trainer_link.get_text(strip=True) 

497 if trainer_name: 

498 starter["trainer_name"] = trainer_name 

499 trainer_href = trainer_link.get("href", "") 

500 trainer_uuid = self._extract_uuid(trainer_href) 

501 if trainer_uuid: 

502 starter["trainer_id"] = trainer_uuid 

503 else: 

504 trainer_name = trainer_cell.get_text(strip=True) 

505 if trainer_name: 

506 starter["trainer_name"] = trainer_name 

507 

508 # Only return if we got at least a horse name 

509 if starter.get("horse_name"): 

510 return starter 

511 

512 except Exception as e: 

513 logger.debug(f"Error parsing starter row: {e}") 

514 

515 return None 

516 

517 def _extract_uuid(self, href: str) -> str | None: 

518 """Extract UUID from href string. 

519 

520 Args: 

521 href: href attribute value 

522 

523 Returns: 

524 UUID string or None 

525 """ 

526 uuid_match = re.search(r"([0-9A-F-]{36})", href, re.IGNORECASE) 

527 if uuid_match: 

528 return uuid_match.group(1) 

529 return None