Coverage for packages / hrnz_scraper / scraper.py: 36%
262 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-05-08 08:37 +1200
« prev ^ index » next coverage.py v7.13.5, created at 2026-05-08 08:37 +1200
1"""HRNZ web scraper for extracting historical race data using Playwright.
3This scraper extracts race results from the HRNZ InfoHorse results archive.
4It respects rate limits and implements polite scraping practices.
6WARNING: Web scraping should only be used if official API access is not available.
7Always check HRNZ's Terms of Service and consider contacting them for official data access.
8"""
10import asyncio
11import re
12from datetime import datetime
13from typing import Any
14from urllib.parse import urljoin
16from bs4 import BeautifulSoup
18try:
19 from playwright.async_api import Browser, Page, async_playwright
20except ImportError: # pragma: no cover - optional dependency for scraping runtime
21 async_playwright = None
22 Browser = Page = Any
24from packages.core.common.logging import get_logger
25from packages.hrnz_scraper.proxy import build_decodo_proxy
27logger = get_logger(__name__)
class HRNZScraperError(Exception):
    """Root of the exception hierarchy raised by the HRNZ scraper."""
36class HRNZScraper:
37 """Scraper for HRNZ InfoHorse results archive using Playwright.
39 This scraper extracts race meetings, races, and results from the
40 publicly accessible HRNZ results archive at infohorse.hrnz.co.nz.
42 Example:
43 >>> async with HRNZScraper() as scraper:
44 >>> meeting = await scraper.get_meeting_results('010741rs.htm')
45 """
47 BASE_URL = "https://infohorse.hrnz.co.nz/datahrs/results/"
49 # Rate limiting: 1 request per 2 seconds to be polite
50 RATE_LIMIT_DELAY = 2.0
52 def __init__(self, timeout: float = 30000): # Playwright uses milliseconds
53 """Initialize HRNZ scraper.
55 Args:
56 timeout: Request timeout in milliseconds (default: 30000ms = 30s)
57 """
58 self.timeout = timeout
59 self._playwright = None
60 self._browser: Browser | None = None
61 self._last_request_time = 0.0
63 async def __aenter__(self):
64 """Async context manager entry."""
65 await self._ensure_browser()
66 return self
68 async def __aexit__(self, exc_type, exc_val, exc_tb):
69 """Async context manager exit."""
70 await self.close()
72 async def _ensure_browser(self):
73 """Ensure Playwright browser is initialized."""
74 if async_playwright is None:
75 raise ImportError(
76 "playwright is required for HRNZ scraping; install it or use the API ingest path"
77 )
78 if self._browser is None:
79 self._playwright = await async_playwright().start()
80 self._browser = await self._playwright.chromium.launch(headless=True)
81 logger.info("Playwright browser initialized")
83 async def close(self):
84 """Close Playwright browser."""
85 if self._browser is not None:
86 await self._browser.close()
87 self._browser = None
88 if self._playwright is not None:
89 await self._playwright.stop()
90 self._playwright = None
91 logger.info("Playwright browser closed")
93 async def _rate_limited_fetch(self, url: str) -> str:
94 """Fetch page with rate limiting and wait for content.
96 Args:
97 url: URL to fetch
99 Returns:
100 Page HTML content
102 Raises:
103 HRNZScraperError: If request fails
104 """
105 await self._ensure_browser()
107 # Enforce rate limit
108 import time
110 elapsed = time.time() - self._last_request_time
111 if elapsed < self.RATE_LIMIT_DELAY:
112 await asyncio.sleep(self.RATE_LIMIT_DELAY - elapsed)
114 logger.debug(f"Fetching: {url}")
116 try:
117 proxy = build_decodo_proxy()
118 context = (
119 await self._browser.new_context(proxy=proxy)
120 if proxy
121 else await self._browser.new_context()
122 )
123 page: Page = await context.new_page()
124 try:
125 # Navigate to page and wait for network idle
126 await page.goto(url, timeout=self.timeout, wait_until="networkidle")
128 # Wait a bit for any JavaScript to finish rendering
129 await page.wait_for_timeout(1000)
131 # Get the HTML content
132 content = await page.content()
134 self._last_request_time = time.time()
135 return content
136 finally:
137 await page.close()
138 await context.close()
139 except Exception as e:
140 raise HRNZScraperError(f"Failed to fetch {url}: {e}") from e
142 async def get_meeting_results(self, url: str) -> dict[str, Any]:
143 """Scrape results from a specific meeting page.
145 Args:
146 url: URL to meeting results page (e.g., '010741rs.htm' or full URL)
148 Returns:
149 Dictionary containing meeting and race data
151 Example:
152 >>> meeting = await scraper.get_meeting_results('102402rs.htm')
153 >>> print(f"Found {len(meeting['races'])} races")
154 """
155 # Construct full URL if relative path given
156 if not url.startswith("http"):
157 url = urljoin(self.BASE_URL, url)
159 html = await self._rate_limited_fetch(url)
160 soup = BeautifulSoup(html, "html.parser")
162 # Extract meeting information
163 meeting_data = self._parse_meeting_header(soup)
165 # Extract races from tables
166 races = self._parse_races(soup)
168 meeting_data["races"] = races
169 meeting_data["source_url"] = url
171 logger.info(
172 f"Scraped meeting: {meeting_data.get('venue')} "
173 f"on {meeting_data.get('date')} - {len(races)} races"
174 )
176 return meeting_data
178 def _parse_meeting_header(self, soup: BeautifulSoup) -> dict[str, Any]:
179 """Parse meeting header information from HRNZ page.
181 HRNZ uses specific structure:
182 - h1 tag for venue/club name
183 - div.hrnz-content__date for date
184 - h5 tag for meeting details
186 Args:
187 soup: BeautifulSoup object
189 Returns:
190 Meeting metadata
191 """
192 meeting = {}
194 # Find venue from h1 tag
195 h1 = soup.find("h1")
196 if h1:
197 venue = h1.get_text(strip=True)
198 # Clean up venue name (remove "Inc" suffix, etc.)
199 venue = venue.replace(" Inc", "").replace(" Inc.", "").strip()
200 meeting["venue"] = venue
201 logger.debug(f"Found venue: {venue}")
203 # Find date from specific div class
204 date_div = soup.find("div", class_="hrnz-content__date")
205 if date_div:
206 date_text = date_div.get_text(strip=True)
207 meeting["date_raw"] = date_text
208 # Format: "Wednesday, 7 January" or "Wednesday, 7 January 2026"
209 parsed_date = self._parse_date(date_text)
210 if parsed_date:
211 meeting["date"] = parsed_date
212 logger.debug(f"Found date: {parsed_date}")
214 # Find meeting name from h5 in hrnz-field__meeting
215 meeting_div = soup.find("div", class_="hrnz-field__meeting")
216 if meeting_div:
217 h5 = meeting_div.find("h5")
218 if h5:
219 meeting_name = h5.get_text(strip=True)
220 # Extract just the meeting name part
221 if " at " in meeting_name:
222 meeting_name = meeting_name.split(" at ")[0].strip()
223 meeting["name"] = meeting_name
224 logger.debug(f"Found meeting name: {meeting_name}")
226 return meeting
228 def _parse_date(self, date_str: str) -> str | None:
229 """Parse date string into ISO format.
231 Args:
232 date_str: Date string in various formats
234 Returns:
235 ISO format date string (YYYY-MM-DD) or None
236 """
237 import datetime as dt
239 # Clean up the date string
240 date_str = date_str.strip().replace("\xa0", " ")
242 # Try various date formats
243 formats = [
244 "%A, %d %B %Y", # "Wednesday, 7 January 2026"
245 "%A, %d %B", # "Wednesday, 7 January" (no year)
246 "%d %B %Y", # "7 January 2026"
247 "%d %B", # "7 January" (no year)
248 "%d/%m/%Y",
249 "%d-%m-%Y",
250 "%d/%m/%y",
251 "%d-%m-%y",
252 ]
254 for fmt in formats:
255 try:
256 parsed = datetime.strptime(date_str.strip(), fmt)
258 # If no year in format, assume current year
259 if "%Y" not in fmt and "%y" not in fmt:
260 current_year = dt.datetime.now().year
261 parsed = parsed.replace(year=current_year)
263 # Handle 2-digit years
264 if parsed.year < 100:
265 # Assume 20xx for years 00-50, 19xx for 51-99
266 if parsed.year <= 50:
267 parsed = parsed.replace(year=parsed.year + 2000)
268 else:
269 parsed = parsed.replace(year=parsed.year + 1900)
271 return parsed.date().isoformat()
272 except ValueError:
273 continue
275 logger.warning(f"Could not parse date: {date_str}")
276 return None
278 def _parse_races(self, soup: BeautifulSoup) -> list[dict[str, Any]]:
279 """Parse all races from meeting page.
281 HRNZ results are in HTML tables. Each table represents one race.
282 Tables have headers: PlacePl, BookBk, Horse, Barrier, Hcap, Stakes, etc.
284 Args:
285 soup: BeautifulSoup object
287 Returns:
288 List of race dictionaries
289 """
290 races = []
292 # Find all tables - HRNZ has one table per race
293 tables = soup.find_all("table")
295 logger.debug(f"Found {len(tables)} tables on page")
297 for idx, table in enumerate(tables, 1):
298 try:
299 race = self._parse_race_table(table, idx)
300 if race and race.get("starters") and len(race["starters"]) > 0:
301 races.append(race)
302 logger.debug(
303 f"Parsed race {idx}: {len(race.get('starters', []))} starters"
304 )
305 else:
306 logger.debug(f"Table {idx} had no starters, skipping")
307 except Exception as e:
308 logger.warning(f"Failed to parse table {idx}: {e}")
309 continue
311 return races
313 def _parse_race_table(
314 self, table: BeautifulSoup, race_number: int
315 ) -> dict[str, Any]:
316 """Parse individual race table.
318 Args:
319 table: BeautifulSoup table element
320 race_number: Race number (fallback if not in HTML)
322 Returns:
323 Race dictionary with starters
324 """
325 race = {"race_number": race_number, "starters": []}
327 # Try to extract race details from table caption or headers
328 caption = table.find("caption")
329 if caption:
330 caption_text = caption.get_text(strip=True)
331 # Parse race info from caption (e.g., "Race 1 - 2200m - Mobile")
332 race_info_match = re.search(
333 r"Race\s+(\d+).*?(\d+)m", caption_text, re.IGNORECASE
334 )
335 if race_info_match:
336 race["race_number"] = int(race_info_match.group(1))
337 race["distance_m"] = int(race_info_match.group(2))
339 # Parse rows
340 rows = table.find_all("tr")
341 header_map = self._build_header_map(rows)
343 for row in rows:
344 cells = row.find_all(["td", "th"])
345 if len(cells) < 4: # Not enough data for a starter
346 continue
348 # Skip header rows
349 if row.find("th"):
350 continue
352 starter = self._parse_starter_row(cells, header_map)
353 if starter:
354 race["starters"].append(starter)
356 return race
358 @staticmethod
359 def _build_header_map(rows: list) -> dict[str, int]:
360 """Build a header map from column names to indices."""
361 for row in rows:
362 headers = row.find_all("th")
363 if not headers:
364 continue
365 header_map = {}
366 for idx, header in enumerate(headers):
367 text = header.get_text(strip=True)
368 if text:
369 header_map[text.strip().lower()] = idx
370 if header_map:
371 return header_map
372 return {}
374 def _parse_starter_row(
375 self, cells: list, header_map: dict[str, int]
376 ) -> dict[str, Any] | None:
377 """Parse a single starter row from race table.
379 HRNZ table structure (as of 2026):
380 cells[0] = PlacePl (placing)
381 cells[1] = BookBk (book number)
382 cells[2] = Horse (with link containing UUID)
383 cells[3] = Barrier
384 cells[4] = Hcap (handicap)
385 cells[5] = Stakes
386 cells[6] = Fav (favorite odds)
387 cells[7] = Time
388 cells[8] = Margin
389 cells[9] = Time/Margin
391 Args:
392 cells: List of table cells
394 Returns:
395 Starter dictionary or None
396 """
397 try:
398 if len(cells) < 5: # Need at least place, book, horse, barrier, hcap
399 return None
401 starter = {}
403 def _cell_by_label(label: str) -> Any | None:
404 target = label.lower()
405 for cell in cells:
406 data_label = cell.get("data-label")
407 if data_label and data_label.strip().lower() == target:
408 return cell
409 idx = header_map.get(target)
410 if idx is not None and idx < len(cells):
411 return cells[idx]
412 return None
414 # cells[0] = PlacePl (placing)
415 placing_cell = (
416 _cell_by_label("Placing") or _cell_by_label("Place") or cells[0]
417 )
418 pos_text = placing_cell.get_text(strip=True)
419 if pos_text:
420 pos_match = re.match(r"(\d+)", pos_text)
421 if pos_match:
422 starter["placing"] = int(pos_match.group(1))
423 elif pos_text.upper() in ("DNS", "DNF", "DSQ", "LR", "SCR", "NP"):
424 starter["did_not_finish"] = True
425 starter["placing"] = None
426 else:
427 starter["placing"] = None
429 # cells[2] = Horse (name and UUID from link)
430 horse_cell = _cell_by_label("Horse") or cells[2]
431 horse_link = horse_cell.find("a")
432 if horse_link:
433 starter["horse_name"] = horse_link.get_text(strip=True)
434 horse_href = horse_link.get("href", "")
435 horse_uuid = self._extract_uuid(horse_href)
436 if horse_uuid:
437 starter["horse_id"] = horse_uuid
438 else:
439 # No link, just text
440 horse_name = horse_cell.get_text(strip=True)
441 if horse_name:
442 starter["horse_name"] = horse_name
444 # cells[3] = Barrier
445 barrier_cell = (
446 _cell_by_label("Barrier") or _cell_by_label("Draw") or cells[3]
447 )
448 barrier_text = barrier_cell.get_text(strip=True)
449 if barrier_text:
450 barrier_match = re.match(r"(\d+)", barrier_text)
451 if barrier_match:
452 starter["barrier"] = int(barrier_match.group(1))
454 # cells[4] = Hcap (handicap in meters)
455 # Can be "fr" (front), "10" (10m), "20" (20m), etc.
456 hcap_cell = _cell_by_label("Hcap") or _cell_by_label("HCP") or cells[4]
457 hcap_text = hcap_cell.get_text(strip=True)
458 if hcap_text:
459 if hcap_text.lower() == "fr":
460 starter["handicap_m"] = 0 # Front = 0 handicap
461 else:
462 hcap_match = re.match(r"(\d+)", hcap_text)
463 if hcap_match:
464 starter["handicap_m"] = int(hcap_match.group(1))
466 # cells[7] = Time (race time)
467 time_text = cells[7].get_text(strip=True) if len(cells) > 7 else ""
468 if time_text and time_text != "":
469 starter["race_time"] = time_text
471 # cells[8] = Margin
472 margin_text = cells[8].get_text(strip=True) if len(cells) > 8 else ""
473 if margin_text and margin_text != "":
474 starter["margin"] = margin_text
476 driver_cell = _cell_by_label("Driver")
477 if driver_cell:
478 driver_link = driver_cell.find("a")
479 if driver_link:
480 driver_name = driver_link.get_text(strip=True)
481 if driver_name:
482 starter["driver_name"] = driver_name
483 driver_href = driver_link.get("href", "")
484 driver_uuid = self._extract_uuid(driver_href)
485 if driver_uuid:
486 starter["driver_id"] = driver_uuid
487 else:
488 driver_name = driver_cell.get_text(strip=True)
489 if driver_name:
490 starter["driver_name"] = driver_name
492 trainer_cell = _cell_by_label("Trainer")
493 if trainer_cell:
494 trainer_link = trainer_cell.find("a")
495 if trainer_link:
496 trainer_name = trainer_link.get_text(strip=True)
497 if trainer_name:
498 starter["trainer_name"] = trainer_name
499 trainer_href = trainer_link.get("href", "")
500 trainer_uuid = self._extract_uuid(trainer_href)
501 if trainer_uuid:
502 starter["trainer_id"] = trainer_uuid
503 else:
504 trainer_name = trainer_cell.get_text(strip=True)
505 if trainer_name:
506 starter["trainer_name"] = trainer_name
508 # Only return if we got at least a horse name
509 if starter.get("horse_name"):
510 return starter
512 except Exception as e:
513 logger.debug(f"Error parsing starter row: {e}")
515 return None
517 def _extract_uuid(self, href: str) -> str | None:
518 """Extract UUID from href string.
520 Args:
521 href: href attribute value
523 Returns:
524 UUID string or None
525 """
526 uuid_match = re.search(r"([0-9A-F-]{36})", href, re.IGNORECASE)
527 if uuid_match:
528 return uuid_match.group(1)
529 return None