"""HRNZ scraper for historical results via the Results Enquiry page."""

from __future__ import annotations

import asyncio
import os
import re
from datetime import date
from typing import Any
from urllib.parse import parse_qs, urlencode, urljoin, urlparse

from bs4 import BeautifulSoup

try:
    from playwright.async_api import Browser, Page, async_playwright
except ImportError:  # pragma: no cover - optional dependency for scraping runtime
    async_playwright = None
    Browser = Page = Any

from packages.core.common.logging import get_logger
from packages.hrnz_scraper.proxy import build_decodo_proxy

logger = get_logger(__name__)


class HRNZHistoricalResultsScraper:
    """Scraper for HRNZ historical results enquiry.

    Pages are loaded through a headless Playwright Chromium browser and the
    resulting HTML is parsed with BeautifulSoup. The class can be used as an
    async context manager, which starts the browser on entry and closes it on
    exit.
    """

    # Scheme and host; relative links found in scraped pages are resolved
    # against this value.
    BASE_URL = "https://harness.hrnz.co.nz"
    # Path of the endpoint queried for the monthly raceday list.
    RESULTS_PATH = "/gws/ws/r/infohorsews/wsd06x"
    # Full URL of the results search page.
    # NOTE(review): not referenced by any method visible in this class —
    # confirm whether external callers still use it.
    SEARCH_URL = (
        "https://harness.hrnz.co.nz/gws/ws/r/infohorsews/wsd08x"
        "?Arg=hrnzg-Ptype&Arg=ResultsSearch&Arg=hrnzg-rSite&Arg=TRUE"
    )

    # Minimum spacing, in seconds, enforced between page fetches.
    RATE_LIMIT_DELAY = 2.0

    def __init__(self, timeout: float = 30000):
        """Initialize HRNZ historical scraper.

        Args:
            timeout: Request timeout in milliseconds (default: 30000ms = 30s)
        """
        env_timeout = os.getenv("HRNZ_PLAYWRIGHT_TIMEOUT_MS", "").strip()
        if env_timeout:
            try:
                timeout = float(env_timeout)
            except ValueError:
                logger.warning(
                    "Invalid HRNZ_PLAYWRIGHT_TIMEOUT_MS=%s; using default.", env_timeout
                )
        self.timeout = timeout
        self._playwright = None
        self._browser: Browser | None = None
        self._last_request_time = 0.0

    async def __aenter__(self):
        await self._ensure_browser()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        await self.close()

    async def _ensure_browser(self):
        if async_playwright is None:
            raise ImportError(
                "playwright is required for HRNZ scraping; install it or use the API ingest path"
            )
        if self._browser is None:
            self._playwright = await async_playwright().start()
            self._browser = await self._playwright.chromium.launch(headless=True)
            logger.info("Playwright browser initialized")

    async def close(self):
        if self._browser is not None:
            await self._browser.close()
            self._browser = None
        if self._playwright is not None:
            await self._playwright.stop()
            self._playwright = None
            logger.info("Playwright browser closed")

    async def _rate_limited_fetch(self, url: str) -> str:
        await self._ensure_browser()

        import time

        elapsed = time.time() - self._last_request_time
        if elapsed < self.RATE_LIMIT_DELAY:
            await asyncio.sleep(self.RATE_LIMIT_DELAY - elapsed)

        logger.debug(f"Fetching: {url}")

        try:
            proxy = build_decodo_proxy()
            context = (
                await self._browser.new_context(proxy=proxy)
                if proxy
                else await self._browser.new_context()
            )
            page: Page = await context.new_page()
            try:
                await page.goto(
                    url, timeout=self.timeout, wait_until="domcontentloaded"
                )
                await page.wait_for_timeout(1000)
                content = await page.content()
                self._last_request_time = time.time()
                return content
            finally:
                await page.close()
                await context.close()
        except Exception as e:
            raise RuntimeError(f"Failed to fetch {url}: {e}") from e

    async def iter_meetings(
        self,
        start_date: date,
        end_date: date,
        race_day_type: str = "OfficialRaces",
        club_no: str = "",
    ):
        """Yield meeting metadata within a date range."""
        seen_racedays: set[str] = set()
        current = date(start_date.year, start_date.month, 1)

        while current <= end_date:
            meetings = await self.list_meetings_for_month(
                current.year,
                current.month,
                race_day_type=race_day_type,
                club_no=club_no,
            )
            for meeting in meetings:
                meeting_date = meeting.get("meeting_date")
                if meeting_date and not (start_date <= meeting_date <= end_date):
                    continue
                raceday_id = meeting.get("raceday_id")
                if raceday_id and raceday_id in seen_racedays:
                    continue
                if raceday_id:
                    seen_racedays.add(raceday_id)
                yield meeting

            if current.month == 12:
                current = date(current.year + 1, 1, 1)
            else:
                current = date(current.year, current.month + 1, 1)

    async def list_meetings_for_month(
        self,
        year: int,
        month: int,
        race_day_type: str = "OfficialRaces",
        club_no: str = "",
    ) -> list[dict[str, Any]]:
        """Fetch and parse the raceday list for a month."""
        url = self._build_raceday_search_url(year, month, race_day_type, club_no)
        html = await self._rate_limited_fetch(url)
        return self._parse_raceday_list(html, year)

    async def get_meeting_results(
        self, results_url: str, meeting_meta: dict[str, Any] | None = None
    ):
        """Scrape results from a meeting results page."""
        html = await self._rate_limited_fetch(results_url)
        soup = BeautifulSoup(html, "html.parser")

        meeting = self._parse_meeting_header(soup)
        if meeting_meta:
            meeting.setdefault("raceday_id", meeting_meta.get("raceday_id"))
            meeting.setdefault("meeting_time", meeting_meta.get("meeting_time"))
            meeting.setdefault("venue", meeting_meta.get("meeting_name"))
            meeting.setdefault("name", meeting_meta.get("meeting_name"))
            if "date" not in meeting and meeting_meta.get("meeting_date"):
                meeting["date"] = meeting_meta["meeting_date"].isoformat()
        meeting["source_url"] = results_url

        races = self._parse_races(soup)
        if not races:
            race_links = self._parse_race_links(soup)
            if not race_links:
                logger.warning("No race links found for meeting page: %s", results_url)
            races = []
            for race_link in race_links:
                race_html = await self._rate_limited_fetch(race_link["results_url"])
                race_soup = BeautifulSoup(race_html, "html.parser")
                race = self._parse_race_page(race_soup, race_link)
                if race and race.get("starters"):
                    races.append(race)
            if race_links and not races:
                logger.warning(
                    "Race links found but no starters parsed for meeting page: %s",
                    results_url,
                )
        meeting["races"] = races

        logger.info(
            "Scraped meeting: %s on %s (%s races)",
            meeting.get("venue"),
            meeting.get("date"),
            len(races),
        )

        return meeting

    def _build_raceday_search_url(
        self, year: int, month: int, race_day_type: str, club_no: str
    ) -> str:
        params = [
            ("Arg", "hrnzg-Ptype"),
            ("Arg", "RaceResults"),
            ("Arg", "hrnzg-rSite"),
            ("Arg", "TRUE"),
            ("Arg", "hrnzg-ResultsType"),
            ("Arg", "RacedaySearch"),
            ("Arg", "hrnzg-ResultsYear"),
            ("Arg", str(year)),
            ("Arg", "hrnzg-ResultsMonth"),
            ("Arg", str(month)),
            ("Arg", "hrnzg-ResultsDay"),
            ("Arg", "1"),
            ("Arg", "hrnzg-ResultsRacedayType"),
            ("Arg", race_day_type),
            ("Arg", "hrnzg-ResultsClubNo"),
            ("Arg", club_no),
        ]
        return f"{self.BASE_URL}{self.RESULTS_PATH}?{urlencode(params)}"

    def _parse_raceday_list(self, html: str, year: int) -> list[dict[str, Any]]:
        soup = BeautifulSoup(html, "html.parser")
        table = soup.find("table")  # First table holds raceday list
        if not table:
            logger.warning("No raceday list table found")
            return []

        meetings = []

        for row in table.find_all("tr"):
            if row.find("th"):
                continue

            for link in row.find_all("a", href=True):
                href = link.get("href")
                if not href or "RacesDisplay" not in href:
                    continue
                meeting_name = link.get_text(strip=True)
                if not meeting_name:
                    continue
                results_url = urljoin(self.BASE_URL, href)
                raceday_id = self._extract_raceday_id(results_url)
                meetings.append(
                    {
                        "raceday_id": raceday_id,
                        "meeting_name": meeting_name,
                        "meeting_date": None,
                        "meeting_time": None,
                        "results_url": results_url,
                    }
                )

        return meetings

    def _parse_raceday_header(
        self, header_text: str, year: int
    ) -> tuple[date | None, str | None]:
        header_text = header_text.replace("\xa0", " ").strip()
        date_match = re.search(r"(\d{1,2})\s+([A-Za-z]{3})", header_text)
        time_match = re.search(r"(\d{1,2}:\d{2})", header_text)
        if not date_match:
            return None, None

        day = int(date_match.group(1))
        month_str = date_match.group(2).lower()
        month_map = {
            "jan": 1,
            "feb": 2,
            "mar": 3,
            "apr": 4,
            "may": 5,
            "jun": 6,
            "jul": 7,
            "aug": 8,
            "sep": 9,
            "oct": 10,
            "nov": 11,
            "dec": 12,
        }
        month = month_map.get(month_str)
        if not month:
            return None, None
        try:
            meeting_date = date(year, month, day)
        except ValueError:
            return None, None

        meeting_time = time_match.group(1) if time_match else None
        return meeting_date, meeting_time

    def _parse_meeting_header(self, soup: BeautifulSoup) -> dict[str, Any]:
        meeting: dict[str, Any] = {}

        h1 = soup.find("h1")
        if h1:
            venue = h1.get_text(strip=True)
            venue = venue.replace(" Inc", "").replace(" Inc.", "").strip()
            meeting["venue"] = venue

        date_div = soup.find("div", class_="hrnz-content__date")
        if date_div:
            date_text = date_div.get_text(strip=True)
            meeting["date_raw"] = date_text
            parsed_date = self._parse_date(date_text)
            if parsed_date:
                meeting["date"] = parsed_date

        meeting_div = soup.find("div", class_="hrnz-field__meeting")
        if meeting_div:
            h5 = meeting_div.find("h5")
            if h5:
                meeting_name = h5.get_text(strip=True)
                if " at " in meeting_name:
                    meeting_name = meeting_name.split(" at ")[0].strip()
                meeting["name"] = meeting_name

        return meeting

    def _parse_date(self, date_str: str) -> str | None:
        import datetime as dt
        from datetime import datetime

        date_str = date_str.strip().replace("\xa0", " ")

        formats = [
            "%A, %d %B %Y",
            "%A, %d %B",
            "%d %B %Y",
            "%d %B",
            "%d/%m/%Y",
            "%d-%m-%Y",
            "%d/%m/%y",
            "%d-%m-%y",
        ]

        for fmt in formats:
            try:
                parsed = datetime.strptime(date_str.strip(), fmt)
                if "%Y" not in fmt and "%y" not in fmt:
                    current_year = dt.datetime.now().year
                    parsed = parsed.replace(year=current_year)
                if parsed.year < 100:
                    parsed = parsed.replace(
                        year=parsed.year + (2000 if parsed.year <= 50 else 1900)
                    )
                return parsed.date().isoformat()
            except ValueError:
                continue

        logger.warning("Could not parse date: %s", date_str)
        return None

    def _parse_races(self, soup: BeautifulSoup) -> list[dict[str, Any]]:
        races = []
        race_sections = soup.find_all("div", class_="hrnz-race")

        for section in race_sections:
            race = self._parse_race_section(section)
            if race and race.get("starters"):
                races.append(race)

        return races

    def _parse_race_links(self, soup: BeautifulSoup) -> list[dict[str, Any]]:
        race_links = []
        seen_urls: set[str] = set()
        tables = soup.find_all("table")
        for table in tables:
            headers = [th.get_text(" ", strip=True) for th in table.find_all("th")]
            if "Race" not in " ".join(headers):
                continue
            for row in table.find_all("tr"):
                cells = row.find_all("td")
                if len(cells) < 2:
                    continue
                race_number_text = cells[0].get_text(" ", strip=True)
                name_text = cells[1].get_text(" ", strip=True)
                link = row.find("a", href=True)
                if not link:
                    continue
                href = link.get("href", "")
                if "RaceDisplay" not in href:
                    continue
                results_url = urljoin(self.BASE_URL, href)
                if results_url in seen_urls:
                    continue
                race_number = None
                match = re.search(r"R(\d+)", race_number_text, re.IGNORECASE)
                if match:
                    race_number = int(match.group(1))
                race_links.append(
                    {
                        "race_number": race_number,
                        "name": name_text,
                        "results_url": results_url,
                    }
                )
                seen_urls.add(results_url)

        if race_links:
            return race_links

        for link in soup.find_all("a", href=True):
            href = link.get("href", "")
            if "RaceDisplay" not in href:
                continue
            results_url = urljoin(self.BASE_URL, href)
            if results_url in seen_urls:
                continue
            text = link.get_text(" ", strip=True)
            race_number = None
            match = re.search(r"Race\s*(\d+)|R(\d+)", text, re.IGNORECASE)
            if match:
                race_number = int(match.group(1) or match.group(2))
            race_links.append(
                {
                    "race_number": race_number,
                    "name": text or None,
                    "results_url": results_url,
                }
            )
            seen_urls.add(results_url)
        return race_links

    def _parse_race_page(
        self, soup: BeautifulSoup, race_meta: dict[str, Any] | None = None
    ) -> dict[str, Any] | None:
        race: dict[str, Any] = {"starters": []}
        if race_meta:
            if race_meta.get("race_number") is not None:
                race["race_number"] = race_meta["race_number"]
            if race_meta.get("name"):
                race["name"] = race_meta["name"]

        title_tag = None
        for h5 in soup.find_all("h5"):
            if re.search(
                r"Race\s+\d+\s*-", h5.get_text(" ", strip=True), re.IGNORECASE
            ):
                title_tag = h5
                break

        if title_tag:
            title_text = title_tag.get_text(" ", strip=True)
            match = re.search(r"Race\s+(\d+)\s*-\s*(.*)", title_text, re.IGNORECASE)
            if match:
                race["race_number"] = int(match.group(1))
                name_part = match.group(2)
                if "," in name_part:
                    name_part = name_part.split(",", 1)[0].strip()
                race["name"] = name_part.strip()
            distance_match = re.search(r"(\d{3,4})m", title_text, re.IGNORECASE)
            if distance_match:
                race["distance_m"] = int(distance_match.group(1))

        page_text = soup.get_text(" ", strip=True)
        weather = self._extract_label_value(page_text, "Weather")
        track_condition = self._extract_label_value(page_text, "Track")
        if weather:
            race["weather"] = weather
        if track_condition:
            race["track_condition"] = track_condition

        name_upper = race.get("name", "").upper()
        if "MOBILE" in name_upper:
            race["start_type"] = "Mobile"
        elif "STANDING" in name_upper or "STAND" in name_upper:
            race["start_type"] = "Standing"

        if "PACE" in name_upper:
            race["gait"] = "Pace"
        elif "TROT" in name_upper:
            race["gait"] = "Trot"

        table = soup.find("table", class_="hrnz-table--participants")
        if table:
            race["starters"] = self._parse_race_table(table)

        return race

    def _parse_race_section(self, section: BeautifulSoup) -> dict[str, Any] | None:
        header = section.find("div", class_="hrnz-race__header")
        if not header:
            return None

        race: dict[str, Any] = {"starters": []}

        race_number = None
        number_dd = header.find("dd")
        if number_dd:
            match = re.search(r"(\\d+)", number_dd.get_text(strip=True))
            if match:
                race_number = int(match.group(1))

        if not race_number:
            race_id = section.get("id", "")
            match = re.search(r"race-(\\d+)", race_id)
            if match:
                race_number = int(match.group(1))

        if race_number:
            race["race_number"] = race_number

        name = ""
        name_tag = header.find("h3")
        if name_tag:
            name = name_tag.get_text(strip=True)
            race["name"] = name

        details_tag = header.find("h4")
        details_text = ""
        if details_tag:
            details_text = details_tag.get_text(" ", strip=True)
            race["details"] = details_text

        meta_text = header.get_text(" ", strip=True)
        weather = self._extract_label_value(meta_text, "Weather")
        track_condition = self._extract_label_value(meta_text, "Track")
        if weather:
            race["weather"] = weather
        if track_condition:
            race["track_condition"] = track_condition

        distance_match = re.search(r"(\\d{3,4})m", details_text, re.IGNORECASE)
        if distance_match:
            race["distance_m"] = int(distance_match.group(1))

        name_upper = name.upper()
        if "MOBILE" in name_upper:
            race["start_type"] = "Mobile"
        elif "STANDING" in name_upper or "STAND" in name_upper:
            race["start_type"] = "Standing"

        if "PACE" in name_upper:
            race["gait"] = "Pace"
        elif "TROT" in name_upper:
            race["gait"] = "Trot"

        table = section.find("table", class_="hrnz-table--participants")
        if table:
            race["starters"] = self._parse_race_table(table)

        return race

    def _parse_race_table(self, table: BeautifulSoup) -> list[dict[str, Any]]:
        starters = []
        rows = table.find_all("tr")
        header_map = self._build_header_map(rows)
        placing_headers = {"placing", "place", "pos", "position", "finish", "fin"}
        has_placing_column = any(key in header_map for key in placing_headers)
        row_index = 0

        for row in rows:
            if row.find("th"):
                continue
            cells = row.find_all(["td", "th"])
            if len(cells) < 4:
                continue
            row_index += 1
            fallback_placing = None if has_placing_column else row_index
            starter = self._parse_starter_row(
                cells, header_map, fallback_placing=fallback_placing
            )
            if starter:
                starters.append(starter)

        return starters

    @staticmethod
    def _build_header_map(rows: list) -> dict[str, int]:
        for row in rows:
            headers = row.find_all("th")
            if not headers:
                continue
            header_map = {}
            for idx, header in enumerate(headers):
                text = header.get_text(strip=True)
                if text:
                    header_map[text.strip().lower()] = idx
            if header_map:
                return header_map
        return {}

    def _parse_starter_row(
        self,
        cells: list,
        header_map: dict[str, int],
        fallback_placing: int | None = None,
    ) -> dict[str, Any] | None:
        try:
            starter: dict[str, Any] = {}

            def _cell_by_label(label: str) -> Any | None:
                target = label.lower()
                for cell in cells:
                    data_label = cell.get("data-label")
                    if data_label and data_label.strip().lower() == target:
                        return cell
                idx = header_map.get(target)
                if idx is not None and idx < len(cells):
                    return cells[idx]
                return None

            placing_cell = (
                _cell_by_label("Placing")
                or _cell_by_label("Place")
                or _cell_by_label("Pos")
                or _cell_by_label("Position")
                or _cell_by_label("Finish")
                or _cell_by_label("Fin")
                or cells[0]
            )
            pos_text = placing_cell.get_text(strip=True)
            if pos_text:
                pos_upper = pos_text.upper()
                if pos_upper in ("SCR", "SCRATCH", "S"):
                    return None
                pos_match = re.match(r"(\\d+)", pos_text)
                if pos_match:
                    starter["placing"] = int(pos_match.group(1))
                elif pos_upper in ("DNS", "DNF", "DSQ", "LR", "NP"):
                    starter["did_not_finish"] = True
                    starter["placing"] = None
                else:
                    starter["placing"] = None
            elif fallback_placing and not starter.get("did_not_finish"):
                starter["placing"] = fallback_placing

            book_cell = _cell_by_label("Book") or _cell_by_label("Bk")
            if book_cell:
                book_text = book_cell.get_text(strip=True)
                if book_text and book_text.isdigit():
                    starter["runner_number"] = int(book_text)

            horse_cell = _cell_by_label("Horse") or cells[2]
            horse_link = horse_cell.find("a")
            if horse_link:
                starter["horse_name"] = horse_link.get_text(strip=True)
                horse_href = horse_link.get("href", "")
                horse_uuid = self._extract_uuid(horse_href)
                if horse_uuid:
                    starter["horse_id"] = horse_uuid
            else:
                horse_name = horse_cell.get_text(strip=True)
                if horse_name:
                    starter["horse_name"] = horse_name

            barrier_cell = _cell_by_label("Barrier") or _cell_by_label("Draw")
            if barrier_cell:
                barrier_text = barrier_cell.get_text(strip=True)
                if barrier_text:
                    barrier_match = re.match(r"(\\d+)", barrier_text)
                    if barrier_match:
                        starter["barrier"] = int(barrier_match.group(1))
                    elif re.search(r"[A-Za-z]", barrier_text):
                        starter["barrier_position"] = barrier_text

            hcap_cell = _cell_by_label("Hcap") or _cell_by_label("HCP")
            if hcap_cell:
                hcap_text = hcap_cell.get_text(strip=True)
                if hcap_text:
                    if hcap_text.lower().startswith("fr"):
                        starter["handicap_m"] = 0
                    else:
                        hcap_match = re.match(r"(\\d+)", hcap_text)
                        if hcap_match:
                            starter["handicap_m"] = int(hcap_match.group(1))

            time_cell = _cell_by_label("Time") or _cell_by_label("Time/ Margin")
            if time_cell:
                time_text = time_cell.get_text(strip=True)
                if time_text:
                    starter["race_time"] = time_text

            margin_cell = _cell_by_label("Margin")
            if margin_cell:
                margin_text = margin_cell.get_text(strip=True)
                if margin_text:
                    starter["margin"] = margin_text

            driver_cell = _cell_by_label("Driver")
            if driver_cell:
                driver_link = driver_cell.find("a")
                if driver_link:
                    driver_name = driver_link.get_text(strip=True)
                    if driver_name:
                        starter["driver_name"] = driver_name
                    driver_href = driver_link.get("href", "")
                    driver_uuid = self._extract_uuid(driver_href)
                    if driver_uuid:
                        starter["driver_id"] = driver_uuid
                else:
                    driver_name = driver_cell.get_text(strip=True)
                    if driver_name:
                        starter["driver_name"] = driver_name

            trainer_cell = _cell_by_label("Trainer")
            if trainer_cell:
                trainer_link = trainer_cell.find("a")
                if trainer_link:
                    trainer_name = trainer_link.get_text(strip=True)
                    if trainer_name:
                        starter["trainer_name"] = trainer_name
                    trainer_href = trainer_link.get("href", "")
                    trainer_uuid = self._extract_uuid(trainer_href)
                    if trainer_uuid:
                        starter["trainer_id"] = trainer_uuid
                else:
                    trainer_name = trainer_cell.get_text(strip=True)
                    if trainer_name:
                        starter["trainer_name"] = trainer_name

            if starter.get("horse_name"):
                return starter

        except Exception as e:
            logger.debug("Error parsing starter row: %s", e)

        return None

    @staticmethod
    def _extract_uuid(href: str) -> str | None:
        uuid_match = re.search(r"([0-9A-F-]{36})", href, re.IGNORECASE)
        if uuid_match:
            return uuid_match.group(1)
        return None

    @staticmethod
    def _extract_raceday_id(url: str) -> str | None:
        parsed = urlparse(url)
        args = parse_qs(parsed.query).get("Arg", [])
        for idx, value in enumerate(args):
            if value == "hrnzg-RacedayID" and idx + 1 < len(args):
                return args[idx + 1]
        return None

    @staticmethod
    def _extract_label_value(text: str, label: str) -> str | None:
        pattern = rf"{re.escape(label)}:\\s*([^\\n\\r]+)"
        match = re.search(pattern, text, re.IGNORECASE)
        if not match:
            return None
        value = match.group(1)
        value = re.split(r"(Weather:|Track:)", value, maxsplit=1)[0]
        return value.strip().strip(";")
