"""HRNZ web scraper for extracting historical race data using Playwright.

This scraper extracts race results from the HRNZ InfoHorse results archive.
It respects rate limits and implements polite scraping practices.

WARNING: Web scraping should only be used if official API access is not available.
Always check HRNZ's Terms of Service and consider contacting them for official data access.
"""

import asyncio
import re
from datetime import datetime
from typing import Any
from urllib.parse import urljoin

from bs4 import BeautifulSoup

try:
    from playwright.async_api import Browser, Page, async_playwright
except ImportError:  # pragma: no cover - optional dependency for scraping runtime
    async_playwright = None
    Browser = Page = Any

from packages.core.common.logging import get_logger
from packages.hrnz_scraper.proxy import build_decodo_proxy

logger = get_logger(__name__)


class HRNZScraperError(Exception):
    """Root of the exception hierarchy used by the HRNZ scraper."""


class HRNZScraper:
    """Scraper for HRNZ InfoHorse results archive using Playwright.

    This scraper extracts race meetings, races, and results from the
    publicly accessible HRNZ results archive at infohorse.hrnz.co.nz.

    Example:
        >>> async with HRNZScraper() as scraper:
        ...     meeting = await scraper.get_meeting_results('010741rs.htm')
    """

    BASE_URL = "https://infohorse.hrnz.co.nz/datahrs/results/"

    # Rate limiting: 1 request per 2 seconds to be polite
    RATE_LIMIT_DELAY = 2.0

    def __init__(self, timeout: float = 30000):  # Playwright uses milliseconds
        """Initialize HRNZ scraper.

        Args:
            timeout: Request timeout in milliseconds (default: 30000ms = 30s)
        """
        self.timeout = timeout
        self._playwright = None
        self._browser: Browser | None = None
        self._last_request_time = 0.0

    async def __aenter__(self) -> "HRNZScraper":
        """Async context manager entry.

        Makes sure the Playwright browser is running before the ``async with``
        body executes.

        Returns:
            This scraper instance, bound to the ``as`` target
        """
        await self._ensure_browser()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
        """Async context manager exit.

        Closes the browser via ``close()``. The exception details are ignored and
        nothing is returned, so an exception raised inside the ``async with``
        body is not suppressed.

        Args:
            exc_type: Exception class raised in the body, or None
            exc_val: Exception instance raised in the body, or None
            exc_tb: Traceback of the exception, or None
        """
        await self.close()

    async def _ensure_browser(self):
        """Ensure Playwright browser is initialized."""
        if async_playwright is None:
            raise ImportError(
                "playwright is required for HRNZ scraping; install it or use the API ingest path"
            )
        if self._browser is None:
            self._playwright = await async_playwright().start()
            self._browser = await self._playwright.chromium.launch(headless=True)
            logger.info("Playwright browser initialized")

    async def close(self):
        """Close Playwright browser."""
        if self._browser is not None:
            await self._browser.close()
            self._browser = None
        if self._playwright is not None:
            await self._playwright.stop()
            self._playwright = None
            logger.info("Playwright browser closed")

    async def _rate_limited_fetch(self, url: str) -> str:
        """Fetch page with rate limiting and wait for content.

        Args:
            url: URL to fetch

        Returns:
            Page HTML content

        Raises:
            HRNZScraperError: If request fails
        """
        await self._ensure_browser()

        # Enforce rate limit
        import time

        elapsed = time.time() - self._last_request_time
        if elapsed < self.RATE_LIMIT_DELAY:
            await asyncio.sleep(self.RATE_LIMIT_DELAY - elapsed)

        logger.debug(f"Fetching: {url}")

        try:
            proxy = build_decodo_proxy()
            context = (
                await self._browser.new_context(proxy=proxy)
                if proxy
                else await self._browser.new_context()
            )
            page: Page = await context.new_page()
            try:
                # Navigate to page and wait for network idle
                await page.goto(url, timeout=self.timeout, wait_until="networkidle")

                # Wait a bit for any JavaScript to finish rendering
                await page.wait_for_timeout(1000)

                # Get the HTML content
                content = await page.content()

                self._last_request_time = time.time()
                return content
            finally:
                await page.close()
                await context.close()
        except Exception as e:
            raise HRNZScraperError(f"Failed to fetch {url}: {e}") from e

    async def get_meeting_results(self, url: str) -> dict[str, Any]:
        """Scrape results from a specific meeting page.

        Args:
            url: URL to meeting results page (e.g., '010741rs.htm' or full URL)

        Returns:
            Dictionary containing meeting and race data

        Example:
            >>> meeting = await scraper.get_meeting_results('102402rs.htm')
            >>> print(f"Found {len(meeting['races'])} races")
        """
        # Construct full URL if relative path given
        if not url.startswith("http"):
            url = urljoin(self.BASE_URL, url)

        html = await self._rate_limited_fetch(url)
        soup = BeautifulSoup(html, "html.parser")

        # Extract meeting information
        meeting_data = self._parse_meeting_header(soup)

        # Extract races from tables
        races = self._parse_races(soup)

        meeting_data["races"] = races
        meeting_data["source_url"] = url

        logger.info(
            f"Scraped meeting: {meeting_data.get('venue')} "
            f"on {meeting_data.get('date')} - {len(races)} races"
        )

        return meeting_data

    def _parse_meeting_header(self, soup: BeautifulSoup) -> dict[str, Any]:
        """Parse meeting header information from HRNZ page.

        HRNZ uses specific structure:
        - h1 tag for venue/club name
        - div.hrnz-content__date for date
        - h5 tag for meeting details

        Args:
            soup: BeautifulSoup object

        Returns:
            Meeting metadata
        """
        meeting = {}

        # Find venue from h1 tag
        h1 = soup.find("h1")
        if h1:
            venue = h1.get_text(strip=True)
            # Clean up venue name (remove "Inc" suffix, etc.)
            venue = venue.replace(" Inc", "").replace(" Inc.", "").strip()
            meeting["venue"] = venue
            logger.debug(f"Found venue: {venue}")

        # Find date from specific div class
        date_div = soup.find("div", class_="hrnz-content__date")
        if date_div:
            date_text = date_div.get_text(strip=True)
            meeting["date_raw"] = date_text
            # Format: "Wednesday, 7 January" or "Wednesday, 7 January 2026"
            parsed_date = self._parse_date(date_text)
            if parsed_date:
                meeting["date"] = parsed_date
                logger.debug(f"Found date: {parsed_date}")

        # Find meeting name from h5 in hrnz-field__meeting
        meeting_div = soup.find("div", class_="hrnz-field__meeting")
        if meeting_div:
            h5 = meeting_div.find("h5")
            if h5:
                meeting_name = h5.get_text(strip=True)
                # Extract just the meeting name part
                if " at " in meeting_name:
                    meeting_name = meeting_name.split(" at ")[0].strip()
                meeting["name"] = meeting_name
                logger.debug(f"Found meeting name: {meeting_name}")

        return meeting

    def _parse_date(self, date_str: str) -> str | None:
        """Parse date string into ISO format.

        Args:
            date_str: Date string in various formats

        Returns:
            ISO format date string (YYYY-MM-DD) or None
        """
        import datetime as dt

        # Clean up the date string
        date_str = date_str.strip().replace("\xa0", " ")

        # Try various date formats
        formats = [
            "%A, %d %B %Y",  # "Wednesday, 7 January 2026"
            "%A, %d %B",  # "Wednesday, 7 January" (no year)
            "%d %B %Y",  # "7 January 2026"
            "%d %B",  # "7 January" (no year)
            "%d/%m/%Y",
            "%d-%m-%Y",
            "%d/%m/%y",
            "%d-%m-%y",
        ]

        for fmt in formats:
            try:
                parsed = datetime.strptime(date_str.strip(), fmt)

                # If no year in format, assume current year
                if "%Y" not in fmt and "%y" not in fmt:
                    current_year = dt.datetime.now().year
                    parsed = parsed.replace(year=current_year)

                # Handle 2-digit years
                if parsed.year < 100:
                    # Assume 20xx for years 00-50, 19xx for 51-99
                    if parsed.year <= 50:
                        parsed = parsed.replace(year=parsed.year + 2000)
                    else:
                        parsed = parsed.replace(year=parsed.year + 1900)

                return parsed.date().isoformat()
            except ValueError:
                continue

        logger.warning(f"Could not parse date: {date_str}")
        return None

    def _parse_races(self, soup: BeautifulSoup) -> list[dict[str, Any]]:
        """Parse all races from meeting page.

        HRNZ results are in HTML tables. Each table represents one race.
        Tables have headers: PlacePl, BookBk, Horse, Barrier, Hcap, Stakes, etc.

        Args:
            soup: BeautifulSoup object

        Returns:
            List of race dictionaries
        """
        races = []

        # Find all tables - HRNZ has one table per race
        tables = soup.find_all("table")

        logger.debug(f"Found {len(tables)} tables on page")

        for idx, table in enumerate(tables, 1):
            try:
                race = self._parse_race_table(table, idx)
                if race and race.get("starters") and len(race["starters"]) > 0:
                    races.append(race)
                    logger.debug(
                        f"Parsed race {idx}: {len(race.get('starters', []))} starters"
                    )
                else:
                    logger.debug(f"Table {idx} had no starters, skipping")
            except Exception as e:
                logger.warning(f"Failed to parse table {idx}: {e}")
                continue

        return races

    def _parse_race_table(
        self, table: BeautifulSoup, race_number: int
    ) -> dict[str, Any]:
        """Parse individual race table.

        Args:
            table: BeautifulSoup table element
            race_number: Race number (fallback if not in HTML)

        Returns:
            Race dictionary with starters
        """
        race = {"race_number": race_number, "starters": []}

        # Try to extract race details from table caption or headers
        caption = table.find("caption")
        if caption:
            caption_text = caption.get_text(strip=True)
            # Parse race info from caption (e.g., "Race 1 - 2200m - Mobile")
            race_info_match = re.search(
                r"Race\s+(\d+).*?(\d+)m", caption_text, re.IGNORECASE
            )
            if race_info_match:
                race["race_number"] = int(race_info_match.group(1))
                race["distance_m"] = int(race_info_match.group(2))

        # Parse rows
        rows = table.find_all("tr")
        header_map = self._build_header_map(rows)

        for row in rows:
            cells = row.find_all(["td", "th"])
            if len(cells) < 4:  # Not enough data for a starter
                continue

            # Skip header rows
            if row.find("th"):
                continue

            starter = self._parse_starter_row(cells, header_map)
            if starter:
                race["starters"].append(starter)

        return race

    @staticmethod
    def _build_header_map(rows: list) -> dict[str, int]:
        """Build a header map from column names to indices."""
        for row in rows:
            headers = row.find_all("th")
            if not headers:
                continue
            header_map = {}
            for idx, header in enumerate(headers):
                text = header.get_text(strip=True)
                if text:
                    header_map[text.strip().lower()] = idx
            if header_map:
                return header_map
        return {}

    def _parse_starter_row(
        self, cells: list, header_map: dict[str, int]
    ) -> dict[str, Any] | None:
        """Parse a single starter row from race table.

        HRNZ table structure (as of 2026):
        cells[0] = PlacePl (placing)
        cells[1] = BookBk (book number)
        cells[2] = Horse (with link containing UUID)
        cells[3] = Barrier
        cells[4] = Hcap (handicap)
        cells[5] = Stakes
        cells[6] = Fav (favorite odds)
        cells[7] = Time
        cells[8] = Margin
        cells[9] = Time/Margin

        Args:
            cells: List of table cells

        Returns:
            Starter dictionary or None
        """
        try:
            if len(cells) < 5:  # Need at least place, book, horse, barrier, hcap
                return None

            starter = {}

            def _cell_by_label(label: str) -> Any | None:
                target = label.lower()
                for cell in cells:
                    data_label = cell.get("data-label")
                    if data_label and data_label.strip().lower() == target:
                        return cell
                idx = header_map.get(target)
                if idx is not None and idx < len(cells):
                    return cells[idx]
                return None

            # cells[0] = PlacePl (placing)
            placing_cell = (
                _cell_by_label("Placing") or _cell_by_label("Place") or cells[0]
            )
            pos_text = placing_cell.get_text(strip=True)
            if pos_text:
                pos_match = re.match(r"(\d+)", pos_text)
                if pos_match:
                    starter["placing"] = int(pos_match.group(1))
                elif pos_text.upper() in ("DNS", "DNF", "DSQ", "LR", "SCR", "NP"):
                    starter["did_not_finish"] = True
                    starter["placing"] = None
                else:
                    starter["placing"] = None

            # cells[2] = Horse (name and UUID from link)
            horse_cell = _cell_by_label("Horse") or cells[2]
            horse_link = horse_cell.find("a")
            if horse_link:
                starter["horse_name"] = horse_link.get_text(strip=True)
                horse_href = horse_link.get("href", "")
                horse_uuid = self._extract_uuid(horse_href)
                if horse_uuid:
                    starter["horse_id"] = horse_uuid
            else:
                # No link, just text
                horse_name = horse_cell.get_text(strip=True)
                if horse_name:
                    starter["horse_name"] = horse_name

            # cells[3] = Barrier
            barrier_cell = (
                _cell_by_label("Barrier") or _cell_by_label("Draw") or cells[3]
            )
            barrier_text = barrier_cell.get_text(strip=True)
            if barrier_text:
                barrier_match = re.match(r"(\d+)", barrier_text)
                if barrier_match:
                    starter["barrier"] = int(barrier_match.group(1))

            # cells[4] = Hcap (handicap in meters)
            # Can be "fr" (front), "10" (10m), "20" (20m), etc.
            hcap_cell = _cell_by_label("Hcap") or _cell_by_label("HCP") or cells[4]
            hcap_text = hcap_cell.get_text(strip=True)
            if hcap_text:
                if hcap_text.lower() == "fr":
                    starter["handicap_m"] = 0  # Front = 0 handicap
                else:
                    hcap_match = re.match(r"(\d+)", hcap_text)
                    if hcap_match:
                        starter["handicap_m"] = int(hcap_match.group(1))

            # cells[7] = Time (race time)
            time_text = cells[7].get_text(strip=True) if len(cells) > 7 else ""
            if time_text and time_text != "":
                starter["race_time"] = time_text

            # cells[8] = Margin
            margin_text = cells[8].get_text(strip=True) if len(cells) > 8 else ""
            if margin_text and margin_text != "":
                starter["margin"] = margin_text

            driver_cell = _cell_by_label("Driver")
            if driver_cell:
                driver_link = driver_cell.find("a")
                if driver_link:
                    driver_name = driver_link.get_text(strip=True)
                    if driver_name:
                        starter["driver_name"] = driver_name
                    driver_href = driver_link.get("href", "")
                    driver_uuid = self._extract_uuid(driver_href)
                    if driver_uuid:
                        starter["driver_id"] = driver_uuid
                else:
                    driver_name = driver_cell.get_text(strip=True)
                    if driver_name:
                        starter["driver_name"] = driver_name

            trainer_cell = _cell_by_label("Trainer")
            if trainer_cell:
                trainer_link = trainer_cell.find("a")
                if trainer_link:
                    trainer_name = trainer_link.get_text(strip=True)
                    if trainer_name:
                        starter["trainer_name"] = trainer_name
                    trainer_href = trainer_link.get("href", "")
                    trainer_uuid = self._extract_uuid(trainer_href)
                    if trainer_uuid:
                        starter["trainer_id"] = trainer_uuid
                else:
                    trainer_name = trainer_cell.get_text(strip=True)
                    if trainer_name:
                        starter["trainer_name"] = trainer_name

            # Only return if we got at least a horse name
            if starter.get("horse_name"):
                return starter

        except Exception as e:
            logger.debug(f"Error parsing starter row: {e}")

        return None

    def _extract_uuid(self, href: str) -> str | None:
        """Extract UUID from href string.

        Args:
            href: href attribute value

        Returns:
            UUID string or None
        """
        uuid_match = re.search(r"([0-9A-F-]{36})", href, re.IGNORECASE)
        if uuid_match:
            return uuid_match.group(1)
        return None
