Coverage for packages / hrnz_scraper / historical_scraper.py: 9%

514 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-05-08 08:37 +1200

1"""HRNZ scraper for historical results via the Results Enquiry page.""" 

2 

3from __future__ import annotations 

4 

5import asyncio 

6import os 

7import re 

8from datetime import date 

9from typing import Any 

10from urllib.parse import parse_qs, urlencode, urljoin, urlparse 

11 

12from bs4 import BeautifulSoup 

13 

14try: 

15 from playwright.async_api import Browser, Page, async_playwright 

16except ImportError: # pragma: no cover - optional dependency for scraping runtime 

17 async_playwright = None 

18 Browser = Page = Any 

19 

20from packages.core.common.logging import get_logger 

21from packages.hrnz_scraper.proxy import build_decodo_proxy 

22 

23logger = get_logger(__name__) 

24 

25 

26class HRNZHistoricalResultsScraper: 

27 """Scraper for HRNZ historical results enquiry.""" 

28 

    # Scheme and host that relative hrefs found on result pages are joined to.
    BASE_URL = "https://harness.hrnz.co.nz"
    # Path appended to BASE_URL when building the monthly raceday search URL.
    RESULTS_PATH = "/gws/ws/r/infohorsews/wsd06x"
    # Absolute URL with a "ResultsSearch" page type.
    # NOTE(review): not referenced anywhere in the visible part of this file.
    SEARCH_URL = (
        "https://harness.hrnz.co.nz/gws/ws/r/infohorsews/wsd08x"
        "?Arg=hrnzg-Ptype&Arg=ResultsSearch&Arg=hrnzg-rSite&Arg=TRUE"
    )

    # Minimum number of seconds between two consecutive page fetches.
    RATE_LIMIT_DELAY = 2.0

37 

38 def __init__(self, timeout: float = 30000): 

39 """Initialize HRNZ historical scraper. 

40 

41 Args: 

42 timeout: Request timeout in milliseconds (default: 30000ms = 30s) 

43 """ 

44 env_timeout = os.getenv("HRNZ_PLAYWRIGHT_TIMEOUT_MS", "").strip() 

45 if env_timeout: 

46 try: 

47 timeout = float(env_timeout) 

48 except ValueError: 

49 logger.warning( 

50 "Invalid HRNZ_PLAYWRIGHT_TIMEOUT_MS=%s; using default.", env_timeout 

51 ) 

52 self.timeout = timeout 

53 self._playwright = None 

54 self._browser: Browser | None = None 

55 self._last_request_time = 0.0 

56 

    async def __aenter__(self):
        """Start the browser when entering ``async with`` and return this scraper."""
        await self._ensure_browser()
        return self

60 

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Close the browser when leaving ``async with``.

        Returns ``None``, so an exception raised inside the block propagates.
        """
        await self.close()

63 

64 async def _ensure_browser(self): 

65 if async_playwright is None: 

66 raise ImportError( 

67 "playwright is required for HRNZ scraping; install it or use the API ingest path" 

68 ) 

69 if self._browser is None: 

70 self._playwright = await async_playwright().start() 

71 self._browser = await self._playwright.chromium.launch(headless=True) 

72 logger.info("Playwright browser initialized") 

73 

74 async def close(self): 

75 if self._browser is not None: 

76 await self._browser.close() 

77 self._browser = None 

78 if self._playwright is not None: 

79 await self._playwright.stop() 

80 self._playwright = None 

81 logger.info("Playwright browser closed") 

82 

83 async def _rate_limited_fetch(self, url: str) -> str: 

84 await self._ensure_browser() 

85 

86 import time 

87 

88 elapsed = time.time() - self._last_request_time 

89 if elapsed < self.RATE_LIMIT_DELAY: 

90 await asyncio.sleep(self.RATE_LIMIT_DELAY - elapsed) 

91 

92 logger.debug(f"Fetching: {url}") 

93 

94 try: 

95 proxy = build_decodo_proxy() 

96 context = ( 

97 await self._browser.new_context(proxy=proxy) 

98 if proxy 

99 else await self._browser.new_context() 

100 ) 

101 page: Page = await context.new_page() 

102 try: 

103 await page.goto( 

104 url, timeout=self.timeout, wait_until="domcontentloaded" 

105 ) 

106 await page.wait_for_timeout(1000) 

107 content = await page.content() 

108 self._last_request_time = time.time() 

109 return content 

110 finally: 

111 await page.close() 

112 await context.close() 

113 except Exception as e: 

114 raise RuntimeError(f"Failed to fetch {url}: {e}") from e 

115 

116 async def iter_meetings( 

117 self, 

118 start_date: date, 

119 end_date: date, 

120 race_day_type: str = "OfficialRaces", 

121 club_no: str = "", 

122 ): 

123 """Yield meeting metadata within a date range.""" 

124 seen_racedays: set[str] = set() 

125 current = date(start_date.year, start_date.month, 1) 

126 

127 while current <= end_date: 

128 meetings = await self.list_meetings_for_month( 

129 current.year, 

130 current.month, 

131 race_day_type=race_day_type, 

132 club_no=club_no, 

133 ) 

134 for meeting in meetings: 

135 meeting_date = meeting.get("meeting_date") 

136 if meeting_date and not (start_date <= meeting_date <= end_date): 

137 continue 

138 raceday_id = meeting.get("raceday_id") 

139 if raceday_id and raceday_id in seen_racedays: 

140 continue 

141 if raceday_id: 

142 seen_racedays.add(raceday_id) 

143 yield meeting 

144 

145 if current.month == 12: 

146 current = date(current.year + 1, 1, 1) 

147 else: 

148 current = date(current.year, current.month + 1, 1) 

149 

150 async def list_meetings_for_month( 

151 self, 

152 year: int, 

153 month: int, 

154 race_day_type: str = "OfficialRaces", 

155 club_no: str = "", 

156 ) -> list[dict[str, Any]]: 

157 """Fetch and parse the raceday list for a month.""" 

158 url = self._build_raceday_search_url(year, month, race_day_type, club_no) 

159 html = await self._rate_limited_fetch(url) 

160 return self._parse_raceday_list(html, year) 

161 

162 async def get_meeting_results( 

163 self, results_url: str, meeting_meta: dict[str, Any] | None = None 

164 ): 

165 """Scrape results from a meeting results page.""" 

166 html = await self._rate_limited_fetch(results_url) 

167 soup = BeautifulSoup(html, "html.parser") 

168 

169 meeting = self._parse_meeting_header(soup) 

170 if meeting_meta: 

171 meeting.setdefault("raceday_id", meeting_meta.get("raceday_id")) 

172 meeting.setdefault("meeting_time", meeting_meta.get("meeting_time")) 

173 meeting.setdefault("venue", meeting_meta.get("meeting_name")) 

174 meeting.setdefault("name", meeting_meta.get("meeting_name")) 

175 if "date" not in meeting and meeting_meta.get("meeting_date"): 

176 meeting["date"] = meeting_meta["meeting_date"].isoformat() 

177 meeting["source_url"] = results_url 

178 

179 races = self._parse_races(soup) 

180 if not races: 

181 race_links = self._parse_race_links(soup) 

182 if not race_links: 

183 logger.warning("No race links found for meeting page: %s", results_url) 

184 races = [] 

185 for race_link in race_links: 

186 race_html = await self._rate_limited_fetch(race_link["results_url"]) 

187 race_soup = BeautifulSoup(race_html, "html.parser") 

188 race = self._parse_race_page(race_soup, race_link) 

189 if race and race.get("starters"): 

190 races.append(race) 

191 if race_links and not races: 

192 logger.warning( 

193 "Race links found but no starters parsed for meeting page: %s", 

194 results_url, 

195 ) 

196 meeting["races"] = races 

197 

198 logger.info( 

199 "Scraped meeting: %s on %s (%s races)", 

200 meeting.get("venue"), 

201 meeting.get("date"), 

202 len(races), 

203 ) 

204 

205 return meeting 

206 

207 def _build_raceday_search_url( 

208 self, year: int, month: int, race_day_type: str, club_no: str 

209 ) -> str: 

210 params = [ 

211 ("Arg", "hrnzg-Ptype"), 

212 ("Arg", "RaceResults"), 

213 ("Arg", "hrnzg-rSite"), 

214 ("Arg", "TRUE"), 

215 ("Arg", "hrnzg-ResultsType"), 

216 ("Arg", "RacedaySearch"), 

217 ("Arg", "hrnzg-ResultsYear"), 

218 ("Arg", str(year)), 

219 ("Arg", "hrnzg-ResultsMonth"), 

220 ("Arg", str(month)), 

221 ("Arg", "hrnzg-ResultsDay"), 

222 ("Arg", "1"), 

223 ("Arg", "hrnzg-ResultsRacedayType"), 

224 ("Arg", race_day_type), 

225 ("Arg", "hrnzg-ResultsClubNo"), 

226 ("Arg", club_no), 

227 ] 

228 return f"{self.BASE_URL}{self.RESULTS_PATH}?{urlencode(params)}" 

229 

230 def _parse_raceday_list(self, html: str, year: int) -> list[dict[str, Any]]: 

231 soup = BeautifulSoup(html, "html.parser") 

232 table = soup.find("table") # First table holds raceday list 

233 if not table: 

234 logger.warning("No raceday list table found") 

235 return [] 

236 

237 meetings = [] 

238 

239 for row in table.find_all("tr"): 

240 if row.find("th"): 

241 continue 

242 

243 for link in row.find_all("a", href=True): 

244 href = link.get("href") 

245 if not href or "RacesDisplay" not in href: 

246 continue 

247 meeting_name = link.get_text(strip=True) 

248 if not meeting_name: 

249 continue 

250 results_url = urljoin(self.BASE_URL, href) 

251 raceday_id = self._extract_raceday_id(results_url) 

252 meetings.append( 

253 { 

254 "raceday_id": raceday_id, 

255 "meeting_name": meeting_name, 

256 "meeting_date": None, 

257 "meeting_time": None, 

258 "results_url": results_url, 

259 } 

260 ) 

261 

262 return meetings 

263 

264 def _parse_raceday_header( 

265 self, header_text: str, year: int 

266 ) -> tuple[date | None, str | None]: 

267 header_text = header_text.replace("\xa0", " ").strip() 

268 date_match = re.search(r"(\d{1,2})\s+([A-Za-z]{3})", header_text) 

269 time_match = re.search(r"(\d{1,2}:\d{2})", header_text) 

270 if not date_match: 

271 return None, None 

272 

273 day = int(date_match.group(1)) 

274 month_str = date_match.group(2).lower() 

275 month_map = { 

276 "jan": 1, 

277 "feb": 2, 

278 "mar": 3, 

279 "apr": 4, 

280 "may": 5, 

281 "jun": 6, 

282 "jul": 7, 

283 "aug": 8, 

284 "sep": 9, 

285 "oct": 10, 

286 "nov": 11, 

287 "dec": 12, 

288 } 

289 month = month_map.get(month_str) 

290 if not month: 

291 return None, None 

292 try: 

293 meeting_date = date(year, month, day) 

294 except ValueError: 

295 return None, None 

296 

297 meeting_time = time_match.group(1) if time_match else None 

298 return meeting_date, meeting_time 

299 

300 def _parse_meeting_header(self, soup: BeautifulSoup) -> dict[str, Any]: 

301 meeting: dict[str, Any] = {} 

302 

303 h1 = soup.find("h1") 

304 if h1: 

305 venue = h1.get_text(strip=True) 

306 venue = venue.replace(" Inc", "").replace(" Inc.", "").strip() 

307 meeting["venue"] = venue 

308 

309 date_div = soup.find("div", class_="hrnz-content__date") 

310 if date_div: 

311 date_text = date_div.get_text(strip=True) 

312 meeting["date_raw"] = date_text 

313 parsed_date = self._parse_date(date_text) 

314 if parsed_date: 

315 meeting["date"] = parsed_date 

316 

317 meeting_div = soup.find("div", class_="hrnz-field__meeting") 

318 if meeting_div: 

319 h5 = meeting_div.find("h5") 

320 if h5: 

321 meeting_name = h5.get_text(strip=True) 

322 if " at " in meeting_name: 

323 meeting_name = meeting_name.split(" at ")[0].strip() 

324 meeting["name"] = meeting_name 

325 

326 return meeting 

327 

328 def _parse_date(self, date_str: str) -> str | None: 

329 import datetime as dt 

330 from datetime import datetime 

331 

332 date_str = date_str.strip().replace("\xa0", " ") 

333 

334 formats = [ 

335 "%A, %d %B %Y", 

336 "%A, %d %B", 

337 "%d %B %Y", 

338 "%d %B", 

339 "%d/%m/%Y", 

340 "%d-%m-%Y", 

341 "%d/%m/%y", 

342 "%d-%m-%y", 

343 ] 

344 

345 for fmt in formats: 

346 try: 

347 parsed = datetime.strptime(date_str.strip(), fmt) 

348 if "%Y" not in fmt and "%y" not in fmt: 

349 current_year = dt.datetime.now().year 

350 parsed = parsed.replace(year=current_year) 

351 if parsed.year < 100: 

352 parsed = parsed.replace( 

353 year=parsed.year + (2000 if parsed.year <= 50 else 1900) 

354 ) 

355 return parsed.date().isoformat() 

356 except ValueError: 

357 continue 

358 

359 logger.warning("Could not parse date: %s", date_str) 

360 return None 

361 

362 def _parse_races(self, soup: BeautifulSoup) -> list[dict[str, Any]]: 

363 races = [] 

364 race_sections = soup.find_all("div", class_="hrnz-race") 

365 

366 for section in race_sections: 

367 race = self._parse_race_section(section) 

368 if race and race.get("starters"): 

369 races.append(race) 

370 

371 return races 

372 

373 def _parse_race_links(self, soup: BeautifulSoup) -> list[dict[str, Any]]: 

374 race_links = [] 

375 seen_urls: set[str] = set() 

376 tables = soup.find_all("table") 

377 for table in tables: 

378 headers = [th.get_text(" ", strip=True) for th in table.find_all("th")] 

379 if "Race" not in " ".join(headers): 

380 continue 

381 for row in table.find_all("tr"): 

382 cells = row.find_all("td") 

383 if len(cells) < 2: 

384 continue 

385 race_number_text = cells[0].get_text(" ", strip=True) 

386 name_text = cells[1].get_text(" ", strip=True) 

387 link = row.find("a", href=True) 

388 if not link: 

389 continue 

390 href = link.get("href", "") 

391 if "RaceDisplay" not in href: 

392 continue 

393 results_url = urljoin(self.BASE_URL, href) 

394 if results_url in seen_urls: 

395 continue 

396 race_number = None 

397 match = re.search(r"R(\d+)", race_number_text, re.IGNORECASE) 

398 if match: 

399 race_number = int(match.group(1)) 

400 race_links.append( 

401 { 

402 "race_number": race_number, 

403 "name": name_text, 

404 "results_url": results_url, 

405 } 

406 ) 

407 seen_urls.add(results_url) 

408 

409 if race_links: 

410 return race_links 

411 

412 for link in soup.find_all("a", href=True): 

413 href = link.get("href", "") 

414 if "RaceDisplay" not in href: 

415 continue 

416 results_url = urljoin(self.BASE_URL, href) 

417 if results_url in seen_urls: 

418 continue 

419 text = link.get_text(" ", strip=True) 

420 race_number = None 

421 match = re.search(r"Race\s*(\d+)|R(\d+)", text, re.IGNORECASE) 

422 if match: 

423 race_number = int(match.group(1) or match.group(2)) 

424 race_links.append( 

425 { 

426 "race_number": race_number, 

427 "name": text or None, 

428 "results_url": results_url, 

429 } 

430 ) 

431 seen_urls.add(results_url) 

432 return race_links 

433 

434 def _parse_race_page( 

435 self, soup: BeautifulSoup, race_meta: dict[str, Any] | None = None 

436 ) -> dict[str, Any] | None: 

437 race: dict[str, Any] = {"starters": []} 

438 if race_meta: 

439 if race_meta.get("race_number") is not None: 

440 race["race_number"] = race_meta["race_number"] 

441 if race_meta.get("name"): 

442 race["name"] = race_meta["name"] 

443 

444 title_tag = None 

445 for h5 in soup.find_all("h5"): 

446 if re.search( 

447 r"Race\s+\d+\s*-", h5.get_text(" ", strip=True), re.IGNORECASE 

448 ): 

449 title_tag = h5 

450 break 

451 

452 if title_tag: 

453 title_text = title_tag.get_text(" ", strip=True) 

454 match = re.search(r"Race\s+(\d+)\s*-\s*(.*)", title_text, re.IGNORECASE) 

455 if match: 

456 race["race_number"] = int(match.group(1)) 

457 name_part = match.group(2) 

458 if "," in name_part: 

459 name_part = name_part.split(",", 1)[0].strip() 

460 race["name"] = name_part.strip() 

461 distance_match = re.search(r"(\d{3,4})m", title_text, re.IGNORECASE) 

462 if distance_match: 

463 race["distance_m"] = int(distance_match.group(1)) 

464 

465 page_text = soup.get_text(" ", strip=True) 

466 weather = self._extract_label_value(page_text, "Weather") 

467 track_condition = self._extract_label_value(page_text, "Track") 

468 if weather: 

469 race["weather"] = weather 

470 if track_condition: 

471 race["track_condition"] = track_condition 

472 

473 name_upper = race.get("name", "").upper() 

474 if "MOBILE" in name_upper: 

475 race["start_type"] = "Mobile" 

476 elif "STANDING" in name_upper or "STAND" in name_upper: 

477 race["start_type"] = "Standing" 

478 

479 if "PACE" in name_upper: 

480 race["gait"] = "Pace" 

481 elif "TROT" in name_upper: 

482 race["gait"] = "Trot" 

483 

484 table = soup.find("table", class_="hrnz-table--participants") 

485 if table: 

486 race["starters"] = self._parse_race_table(table) 

487 

488 return race 

489 

490 def _parse_race_section(self, section: BeautifulSoup) -> dict[str, Any] | None: 

491 header = section.find("div", class_="hrnz-race__header") 

492 if not header: 

493 return None 

494 

495 race: dict[str, Any] = {"starters": []} 

496 

497 race_number = None 

498 number_dd = header.find("dd") 

499 if number_dd: 

500 match = re.search(r"(\\d+)", number_dd.get_text(strip=True)) 

501 if match: 

502 race_number = int(match.group(1)) 

503 

504 if not race_number: 

505 race_id = section.get("id", "") 

506 match = re.search(r"race-(\\d+)", race_id) 

507 if match: 

508 race_number = int(match.group(1)) 

509 

510 if race_number: 

511 race["race_number"] = race_number 

512 

513 name = "" 

514 name_tag = header.find("h3") 

515 if name_tag: 

516 name = name_tag.get_text(strip=True) 

517 race["name"] = name 

518 

519 details_tag = header.find("h4") 

520 details_text = "" 

521 if details_tag: 

522 details_text = details_tag.get_text(" ", strip=True) 

523 race["details"] = details_text 

524 

525 meta_text = header.get_text(" ", strip=True) 

526 weather = self._extract_label_value(meta_text, "Weather") 

527 track_condition = self._extract_label_value(meta_text, "Track") 

528 if weather: 

529 race["weather"] = weather 

530 if track_condition: 

531 race["track_condition"] = track_condition 

532 

533 distance_match = re.search(r"(\\d{3,4})m", details_text, re.IGNORECASE) 

534 if distance_match: 

535 race["distance_m"] = int(distance_match.group(1)) 

536 

537 name_upper = name.upper() 

538 if "MOBILE" in name_upper: 

539 race["start_type"] = "Mobile" 

540 elif "STANDING" in name_upper or "STAND" in name_upper: 

541 race["start_type"] = "Standing" 

542 

543 if "PACE" in name_upper: 

544 race["gait"] = "Pace" 

545 elif "TROT" in name_upper: 

546 race["gait"] = "Trot" 

547 

548 table = section.find("table", class_="hrnz-table--participants") 

549 if table: 

550 race["starters"] = self._parse_race_table(table) 

551 

552 return race 

553 

554 def _parse_race_table(self, table: BeautifulSoup) -> list[dict[str, Any]]: 

555 starters = [] 

556 rows = table.find_all("tr") 

557 header_map = self._build_header_map(rows) 

558 placing_headers = {"placing", "place", "pos", "position", "finish", "fin"} 

559 has_placing_column = any(key in header_map for key in placing_headers) 

560 row_index = 0 

561 

562 for row in rows: 

563 if row.find("th"): 

564 continue 

565 cells = row.find_all(["td", "th"]) 

566 if len(cells) < 4: 

567 continue 

568 row_index += 1 

569 fallback_placing = None if has_placing_column else row_index 

570 starter = self._parse_starter_row( 

571 cells, header_map, fallback_placing=fallback_placing 

572 ) 

573 if starter: 

574 starters.append(starter) 

575 

576 return starters 

577 

578 @staticmethod 

579 def _build_header_map(rows: list) -> dict[str, int]: 

580 for row in rows: 

581 headers = row.find_all("th") 

582 if not headers: 

583 continue 

584 header_map = {} 

585 for idx, header in enumerate(headers): 

586 text = header.get_text(strip=True) 

587 if text: 

588 header_map[text.strip().lower()] = idx 

589 if header_map: 

590 return header_map 

591 return {} 

592 

593 def _parse_starter_row( 

594 self, 

595 cells: list, 

596 header_map: dict[str, int], 

597 fallback_placing: int | None = None, 

598 ) -> dict[str, Any] | None: 

599 try: 

600 starter: dict[str, Any] = {} 

601 

602 def _cell_by_label(label: str) -> Any | None: 

603 target = label.lower() 

604 for cell in cells: 

605 data_label = cell.get("data-label") 

606 if data_label and data_label.strip().lower() == target: 

607 return cell 

608 idx = header_map.get(target) 

609 if idx is not None and idx < len(cells): 

610 return cells[idx] 

611 return None 

612 

613 placing_cell = ( 

614 _cell_by_label("Placing") 

615 or _cell_by_label("Place") 

616 or _cell_by_label("Pos") 

617 or _cell_by_label("Position") 

618 or _cell_by_label("Finish") 

619 or _cell_by_label("Fin") 

620 or cells[0] 

621 ) 

622 pos_text = placing_cell.get_text(strip=True) 

623 if pos_text: 

624 pos_upper = pos_text.upper() 

625 if pos_upper in ("SCR", "SCRATCH", "S"): 

626 return None 

627 pos_match = re.match(r"(\\d+)", pos_text) 

628 if pos_match: 

629 starter["placing"] = int(pos_match.group(1)) 

630 elif pos_upper in ("DNS", "DNF", "DSQ", "LR", "NP"): 

631 starter["did_not_finish"] = True 

632 starter["placing"] = None 

633 else: 

634 starter["placing"] = None 

635 elif fallback_placing and not starter.get("did_not_finish"): 

636 starter["placing"] = fallback_placing 

637 

638 book_cell = _cell_by_label("Book") or _cell_by_label("Bk") 

639 if book_cell: 

640 book_text = book_cell.get_text(strip=True) 

641 if book_text and book_text.isdigit(): 

642 starter["runner_number"] = int(book_text) 

643 

644 horse_cell = _cell_by_label("Horse") or cells[2] 

645 horse_link = horse_cell.find("a") 

646 if horse_link: 

647 starter["horse_name"] = horse_link.get_text(strip=True) 

648 horse_href = horse_link.get("href", "") 

649 horse_uuid = self._extract_uuid(horse_href) 

650 if horse_uuid: 

651 starter["horse_id"] = horse_uuid 

652 else: 

653 horse_name = horse_cell.get_text(strip=True) 

654 if horse_name: 

655 starter["horse_name"] = horse_name 

656 

657 barrier_cell = _cell_by_label("Barrier") or _cell_by_label("Draw") 

658 if barrier_cell: 

659 barrier_text = barrier_cell.get_text(strip=True) 

660 if barrier_text: 

661 barrier_match = re.match(r"(\\d+)", barrier_text) 

662 if barrier_match: 

663 starter["barrier"] = int(barrier_match.group(1)) 

664 elif re.search(r"[A-Za-z]", barrier_text): 

665 starter["barrier_position"] = barrier_text 

666 

667 hcap_cell = _cell_by_label("Hcap") or _cell_by_label("HCP") 

668 if hcap_cell: 

669 hcap_text = hcap_cell.get_text(strip=True) 

670 if hcap_text: 

671 if hcap_text.lower().startswith("fr"): 

672 starter["handicap_m"] = 0 

673 else: 

674 hcap_match = re.match(r"(\\d+)", hcap_text) 

675 if hcap_match: 

676 starter["handicap_m"] = int(hcap_match.group(1)) 

677 

678 time_cell = _cell_by_label("Time") or _cell_by_label("Time/ Margin") 

679 if time_cell: 

680 time_text = time_cell.get_text(strip=True) 

681 if time_text: 

682 starter["race_time"] = time_text 

683 

684 margin_cell = _cell_by_label("Margin") 

685 if margin_cell: 

686 margin_text = margin_cell.get_text(strip=True) 

687 if margin_text: 

688 starter["margin"] = margin_text 

689 

690 driver_cell = _cell_by_label("Driver") 

691 if driver_cell: 

692 driver_link = driver_cell.find("a") 

693 if driver_link: 

694 driver_name = driver_link.get_text(strip=True) 

695 if driver_name: 

696 starter["driver_name"] = driver_name 

697 driver_href = driver_link.get("href", "") 

698 driver_uuid = self._extract_uuid(driver_href) 

699 if driver_uuid: 

700 starter["driver_id"] = driver_uuid 

701 else: 

702 driver_name = driver_cell.get_text(strip=True) 

703 if driver_name: 

704 starter["driver_name"] = driver_name 

705 

706 trainer_cell = _cell_by_label("Trainer") 

707 if trainer_cell: 

708 trainer_link = trainer_cell.find("a") 

709 if trainer_link: 

710 trainer_name = trainer_link.get_text(strip=True) 

711 if trainer_name: 

712 starter["trainer_name"] = trainer_name 

713 trainer_href = trainer_link.get("href", "") 

714 trainer_uuid = self._extract_uuid(trainer_href) 

715 if trainer_uuid: 

716 starter["trainer_id"] = trainer_uuid 

717 else: 

718 trainer_name = trainer_cell.get_text(strip=True) 

719 if trainer_name: 

720 starter["trainer_name"] = trainer_name 

721 

722 if starter.get("horse_name"): 

723 return starter 

724 

725 except Exception as e: 

726 logger.debug("Error parsing starter row: %s", e) 

727 

728 return None 

729 

730 @staticmethod 

731 def _extract_uuid(href: str) -> str | None: 

732 uuid_match = re.search(r"([0-9A-F-]{36})", href, re.IGNORECASE) 

733 if uuid_match: 

734 return uuid_match.group(1) 

735 return None 

736 

737 @staticmethod 

738 def _extract_raceday_id(url: str) -> str | None: 

739 parsed = urlparse(url) 

740 args = parse_qs(parsed.query).get("Arg", []) 

741 for idx, value in enumerate(args): 

742 if value == "hrnzg-RacedayID" and idx + 1 < len(args): 

743 return args[idx + 1] 

744 return None 

745 

746 @staticmethod 

747 def _extract_label_value(text: str, label: str) -> str | None: 

748 pattern = rf"{re.escape(label)}:\\s*([^\\n\\r]+)" 

749 match = re.search(pattern, text, re.IGNORECASE) 

750 if not match: 

751 return None 

752 value = match.group(1) 

753 value = re.split(r"(Weather:|Track:)", value, maxsplit=1)[0] 

754 return value.strip().strip(";")