"""HRNZ club code auto-refresh module.

Fetches the HRNZ results index page to discover all active club codes,
then compares them against the hardcoded HRNZ_ALL_CLUB_CODES list.

Usage:
    from packages.hrnz_scraper.club_refresh import refresh_club_codes
    result = refresh_club_codes()
    # result = {"fetched": [...], "new": [...], "missing": [...], "unmatched": [...],
    #           "source": "...", "error": ...}
"""

from __future__ import annotations

import json
import re
from pathlib import Path
from typing import Any

import httpx

from packages.core.common.logging import get_logger

# Module-level logger obtained from the project's logging helper.
logger = get_logger(__name__)

# HRNZ URL patterns
# Base URL of the infohorse results directory. It is fetched directly as an
# index page and also used as the prefix when probing individual
# "<mmdd><club>rs.htm" meeting pages.
INFOHORSE_BASE = "https://infohorse.hrnz.co.nz/datahrs/results/"
# Results index page on harness.org.nz, used as a second discovery source.
HARNESS_ORG_NZ = "https://www.harness.org.nz/racing/results/"

# Fallback cache path (relative to project root)
# DEFAULT_CACHE_DIR is the directory name and DEFAULT_CACHE_FILE the JSON file
# name inside it.
# NOTE(review): the file name is spelled "hrzn" rather than "hrnz"; this looks
# like a typo, but renaming it would orphan existing cache files - confirm
# before changing.
DEFAULT_CACHE_DIR = "data"
DEFAULT_CACHE_FILE = "hrzn_club_codes.json"


def _get_cache_path() -> Path:
    """Locate the club code cache file, creating its directory if needed.

    Climbs at most six directory levels above this file and stops at the
    first one containing ``pyproject.toml``. When no such directory is found,
    the last directory visited is used as the base instead.

    Returns:
        Path of the cache file inside the ``DEFAULT_CACHE_DIR`` directory.
    """
    project_root = Path(__file__).resolve()
    levels_remaining = 6
    while levels_remaining > 0:
        project_root = project_root.parent
        if project_root.joinpath("pyproject.toml").exists():
            break
        levels_remaining -= 1

    # The directory is created eagerly so callers can write straight away.
    target_dir = project_root / DEFAULT_CACHE_DIR
    target_dir.mkdir(parents=True, exist_ok=True)
    return target_dir / DEFAULT_CACHE_FILE


# Result-file links look like ``<mm><dd><CC>rs.htm``; only the trailing CC pair
# is captured. The lookbehind anchors the match to the start of the digit run,
# so a filename with more than six digits is not partially matched.
_RESULT_FILE_RE = re.compile(r"(?<!\d)\d{4}(\d{2})rs\.htm", re.IGNORECASE)

# ``club`` / ``club_no`` / ``clubcode`` / ``clubid`` query parameters. The
# separator may be HTML-escaped (``&amp;``), and the value may be followed by
# anything except another digit (for example the closing quote of an href).
_CLUB_PARAM_RE = re.compile(
    r"(?:[?&]|&amp;)club(?:_no|code|id)?[= ](\d{1,2})(?!\d)", re.IGNORECASE
)

# A standalone two-digit number between a club/meeting/venue keyword and a
# results marker, all within one tag or text node (``[^<]`` never crosses
# ``<``). The lookarounds keep digits that belong to a longer number (dates,
# years, file names) from being mistaken for a club code.
_CONTEXT_RE = re.compile(
    r"(?:club|meeting|venue)[^<]*?(?<!\d)(\d{2})(?!\d)[^<]*?(?:rs\.htm|results?)",
    re.IGNORECASE,
)


def _extract_club_codes_from_html(html: str) -> set[str]:
    """Extract 2-digit club codes from HRNZ results page HTML.

    Three heuristics are applied and their results merged:

    1. Result file names such as ``010741rs.htm`` or ``102402rs.htm``, where
       the last two digits before ``rs.htm`` are the club code (``41``,
       ``02``) and the four digits before them are the month and day.
    2. ``club``-style query parameters (``?club=7``, ``&clubcode=41``);
       single-digit values are zero-padded.
    3. An isolated two-digit number appearing after the word "club",
       "meeting" or "venue" and before "rs.htm" / "result(s)" within the same
       tag or text node.

    Args:
        html: Raw HTML content of an HRNZ results or index page.

    Returns:
        Set of unique 2-digit club code strings found in the page; empty when
        nothing matches.
    """
    codes: set[str] = {m.group(1) for m in _RESULT_FILE_RE.finditer(html)}
    codes.update(m.group(1).zfill(2) for m in _CLUB_PARAM_RE.finditer(html))
    codes.update(m.group(1) for m in _CONTEXT_RE.finditer(html))
    return codes


def _try_fetch_infohorse_index() -> str | None:
    """Download the infohorse results directory listing.

    Returns:
        The response body when the request succeeds and the ``Content-Type``
        header mentions HTML; ``None`` for non-HTML responses or any error
        (errors are logged as warnings).
    """
    try:
        response = httpx.get(INFOHORSE_BASE, timeout=15.0, follow_redirects=True)
        response.raise_for_status()

        content_type = response.headers.get("content-type", "")
        looks_like_html = (
            "text/html" in content_type or "html" in content_type.lower()
        )
        if not looks_like_html:
            logger.info(
                "Infohorse index returned non-HTML (%s); trying next source",
                content_type,
            )
            return None

        logger.info("Fetched infohorse index page (%d bytes)", len(response.text))
        return response.text
    except httpx.HTTPError as exc:
        # Transport failures and non-2xx statuses raised by raise_for_status().
        logger.warning("Failed to fetch infohorse index: %s", exc)
    except Exception as exc:
        # Anything else is reported but never propagated to the caller.
        logger.warning("Unexpected error fetching infohorse index: %s", exc)
    return None


def _try_fetch_harness_index() -> str | None:
    """Download the harness.org.nz results index page.

    Returns:
        The response body on success, or ``None`` when the request fails for
        any reason (the failure is logged as a warning).
    """
    try:
        response = httpx.get(HARNESS_ORG_NZ, timeout=15.0, follow_redirects=True)
        response.raise_for_status()
        logger.info(
            "Fetched harness.org.nz index page (%d bytes)", len(response.text)
        )
        page = response.text
    except httpx.HTTPError as exc:
        # Transport failures and non-2xx statuses raised by raise_for_status().
        logger.warning("Failed to fetch harness.org.nz index: %s", exc)
        page = None
    except Exception as exc:
        # Anything else is reported but never propagated to the caller.
        logger.warning("Unexpected error fetching harness.org.nz index: %s", exc)
        page = None
    return page


def _try_fetch_todays_meeting() -> str | None:
    """Probe recent meeting pages to find one usable for club code discovery.

    Despite the name, today's page is never requested: the probe starts at
    yesterday and goes back three days, because results may not be published
    for the current day yet. Each probe requests the results page for club
    code ``41`` on that date (``<mmdd>41rs.htm`` under ``INFOHORSE_BASE``).

    NOTE(review): dates come from ``date.today()``, i.e. the local clock of
    the machine running this code - confirm that is close enough to NZ time.

    Returns:
        HTML of the first probe that answers 200 with an HTML content type,
        or ``None`` if every probe fails.
    """
    from datetime import date, timedelta

    today = date.today()
    for days_ago in range(1, 4):
        probe_date = today - timedelta(days=days_ago)
        # Club code "41" is used as the probe target.
        url = f"{INFOHORSE_BASE}{probe_date.strftime('%m%d')}41rs.htm"

        try:
            resp = httpx.get(url, timeout=15.0, follow_redirects=True)
        except httpx.HTTPError as exc:
            # A failed probe is expected (no meeting that day); record it at
            # debug level instead of dropping it silently, then try the next.
            logger.debug("Meeting page probe failed for %s: %s", url, exc)
            continue

        if resp.status_code != 200:
            logger.debug(
                "Meeting page probe %s returned HTTP %d", url, resp.status_code
            )
            continue

        content_type = resp.headers.get("content-type", "")
        if "html" not in content_type.lower():
            logger.debug(
                "Meeting page probe %s returned non-HTML (%s)", url, content_type
            )
            continue

        logger.info("Fetched meeting page %s (%d bytes)", url, len(resp.text))
        return resp.text

    logger.warning("Could not fetch any recent meeting page for club code discovery")
    return None


def fetch_club_codes_from_hrnz() -> set[str]:
    """Collect club codes from every HRNZ source that responds.

    Sources, in order:
    1. infohorse.hrnz.co.nz results directory index
    2. harness.org.nz/racing/results/ index page
    3. A recent meeting page (probe-based), consulted only when the first
       two sources produced no codes at all

    Returns:
        Set of 2-digit club code strings (e.g., {"02", "07", "15", ...}).
        May be empty if all sources fail.
    """
    discovered: set[str] = set()

    # Source 1: infohorse directory listing.
    index_html = _try_fetch_infohorse_index()
    if index_html:
        found = _extract_club_codes_from_html(index_html)
        logger.info("Extracted %d club codes from infohorse index", len(found))
        discovered |= found

    # Source 2: harness.org.nz results page; always tried and merged in.
    harness_html = _try_fetch_harness_index()
    if harness_html:
        found = _extract_club_codes_from_html(harness_html)
        logger.info("Extracted %d club codes from harness.org.nz index", len(found))
        discovered |= found

    if discovered:
        return discovered

    # Source 3: last resort, probe an individual meeting page.
    probe_html = _try_fetch_todays_meeting()
    if probe_html:
        found = _extract_club_codes_from_html(probe_html)
        logger.info("Extracted %d club codes from meeting page probe", len(found))
        discovered |= found

    return discovered


def load_cached_codes() -> list[str]:
    """Load previously cached club codes from disk.

    The cache is expected to be a JSON object with a ``club_codes`` list.
    Entries are stripped of surrounding whitespace, kept only when they are
    one or two digits, and zero-padded to two characters. Any problem
    (unreadable file, invalid JSON, unexpected structure) is logged and
    treated as "no cache".

    Returns:
        List of 2-digit club code strings, or empty list if no usable cache
        exists.
    """
    try:
        # Resolving the path may create the cache directory, which can itself
        # fail with OSError, so it belongs inside the guarded section.
        cache_path = _get_cache_path()
        if not cache_path.exists():
            return []
        data = json.loads(cache_path.read_text(encoding="utf-8"))
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        logger.warning("Failed to load cached club codes: %s", exc)
        return []

    codes = data.get("club_codes", []) if isinstance(data, dict) else None
    if not isinstance(codes, list):
        logger.warning(
            "Ignoring cached club codes with unexpected structure: %s", cache_path
        )
        return []

    normalized: list[str] = []
    for raw in codes:
        # Strip before padding so an entry like " 7" becomes "07".
        text = str(raw).strip()
        if text.isdigit() and len(text) <= 2:
            normalized.append(text.zfill(2))

    logger.info("Loaded %d club codes from cache: %s", len(normalized), cache_path)
    return normalized


def save_cached_codes(codes: list[str]) -> None:
    """Save club codes to disk cache (best effort).

    The codes are de-duplicated and sorted, then written as a JSON object
    with ``club_codes`` and ``source`` keys. Filesystem errors, including a
    failure to create the cache directory, are logged and otherwise ignored
    so that a read-only environment cannot break a refresh.

    Args:
        codes: List of 2-digit club code strings.
    """
    unique_codes = sorted(set(codes))
    payload = json.dumps(
        {"club_codes": unique_codes, "source": "hrnz_club_refresh"},
        indent=2,
    )

    try:
        # Path resolution creates the directory and may raise OSError too.
        cache_path = _get_cache_path()
        cache_path.write_text(payload, encoding="utf-8")
    except OSError as exc:
        logger.warning("Failed to save club codes to cache: %s", exc)
        return

    # Report what was actually written, not the possibly duplicated input.
    logger.info("Saved %d club codes to cache: %s", len(unique_codes), cache_path)


def refresh_club_codes(
    hardcoded_codes: list[str] | None = None,
    use_cache_fallback: bool = True,
    save_cache: bool = True,
) -> dict[str, Any]:
    """Fetch current HRNZ club codes and compare with the hardcoded list.

    Codes are taken from a live fetch when it yields anything; otherwise,
    if allowed, from the disk cache; otherwise the hardcoded list itself is
    returned together with an error message.

    Args:
        hardcoded_codes: The hardcoded list to compare against.
            Defaults to importing ``HRNZ_ALL_CLUB_CODES`` from settings.
            Entries are converted to zero-padded 2-character strings.
        use_cache_fallback: If True and live fetch fails, try loading from
            disk cache.
        save_cache: If True, save live-fetched codes to disk cache. Codes
            that were themselves loaded from the cache are not written back.

    Returns:
        Dict with keys:
            - ``fetched``: Full set of codes discovered (sorted list); the
              hardcoded list when nothing could be discovered
            - ``new``: Codes found but NOT in hardcoded list (sorted list)
            - ``missing``: Codes in hardcoded list but NOT found (sorted list)
            - ``unmatched``: Hardcoded codes with no confirmation (sorted
              list); currently computed identically to ``missing``
            - ``source``: Where the codes came from (``"hrnz"``, ``"cache"``,
              or ``"hardcoded"``)
            - ``error``: Error message if fetching failed entirely, else None
    """
    if hardcoded_codes is None:
        # Lazy import to avoid circular dependency at module level
        from packages.core.common.settings import HRNZ_ALL_CLUB_CODES

        hardcoded_codes = HRNZ_ALL_CLUB_CODES

    # str() lets integer codes (e.g. 7) be normalized the same way as "7".
    hardcoded_set = {str(c).zfill(2) for c in hardcoded_codes}

    logger.info(
        "Refreshing HRNZ club codes (hardcoded list has %d codes)", len(hardcoded_set)
    )

    fetched: set[str] = set()
    source = "hardcoded"

    # Try live fetch; any failure just means falling through to the cache.
    try:
        fetched = fetch_club_codes_from_hrnz()
    except Exception as exc:
        logger.error("Error during HRNZ club code fetch: %s", exc, exc_info=True)

    if fetched:
        source = "hrnz"
    elif use_cache_fallback:
        cached = load_cached_codes()
        if cached:
            fetched = {c.zfill(2) for c in cached}
            # Keep "cache" as the reported source so callers can tell stale
            # data from a successful live fetch.
            source = "cache"
            logger.info("Using cached club codes (%d codes)", len(fetched))

    if not fetched:
        # No codes from any source; return hardcoded as fallback
        logger.warning(
            "Could not fetch HRNZ club codes from any source; returning hardcoded list as fallback"
        )
        return {
            "fetched": sorted(hardcoded_set),
            "new": [],
            "missing": [],
            "unmatched": [],
            "source": "hardcoded",
            "error": "Could not fetch club codes from HRNZ; using hardcoded/cached list",
        }

    fetched_sorted = sorted(fetched)
    new_codes = sorted(fetched - hardcoded_set)
    missing_codes = sorted(hardcoded_set - fetched)
    unmatched_codes = list(missing_codes)

    # Only live results refresh the cache; rewriting the cache with its own
    # contents would add nothing.
    if save_cache and source == "hrnz":
        save_cached_codes(fetched_sorted)

    logger.info(
        "Club code refresh complete: %d fetched, %d new, %d missing",
        len(fetched),
        len(new_codes),
        len(missing_codes),
    )

    if new_codes:
        logger.warning(
            "NEW HRNZ club codes found (not in hardcoded list): %s",
            ", ".join(new_codes),
        )
    if missing_codes:
        logger.info(
            "Hardcoded codes not found online (may be inactive): %s",
            ", ".join(missing_codes),
        )

    return {
        "fetched": fetched_sorted,
        "new": new_codes,
        "missing": missing_codes,
        "unmatched": unmatched_codes,
        "source": source,
        "error": None,
    }


def generate_diff_report(result: dict[str, Any]) -> str:
    """Generate a human-readable diff report from a refresh result.

    Keys that are absent or ``None`` are treated as empty. The hardcoded
    total is reconstructed from the result as
    ``len(fetched) - len(new) + len(missing)``: fetched codes that are not
    new are exactly the hardcoded codes that were found, and the missing
    ones are the hardcoded codes that were not.

    Args:
        result: The dict returned by ``refresh_club_codes()``.

    Returns:
        Formatted report string suitable for console output.
    """
    fetched = result.get("fetched") or []
    new_codes = result.get("new") or []
    missing_codes = result.get("missing") or []
    hardcoded_total = len(fetched) - len(new_codes) + len(missing_codes)

    rule = "=" * 60
    lines: list[str] = [
        rule,
        "HRNZ CLUB CODE REFRESH REPORT",
        rule,
        f"Source: {result.get('source', 'unknown')}",
        f"Total codes found: {len(fetched)}",
        f"Hardcoded codes: {hardcoded_total}",
    ]

    if result.get("error"):
        lines.append(f"\n⚠ Error: {result['error']}")

    if new_codes:
        lines.append(f"\n🆕 NEW codes (not in hardcoded list): {len(new_codes)}")
        lines.extend(f"  + {code}" for code in new_codes)
    else:
        lines.append("\n✅ No new codes found")

    if missing_codes:
        lines.append(
            f"\n🗑️  MISSING codes (in hardcoded but not found): {len(missing_codes)}"
        )
        lines.extend(f"  - {code}" for code in missing_codes)
    else:
        lines.append("\n✅ No missing codes")

    lines.append(rule)
    return "\n".join(lines)


if __name__ == "__main__":
    # Manual smoke test: configure logging, run a refresh with the default
    # settings and print the resulting report to the console.
    from packages.core.common.logging import setup_logging

    setup_logging()
    print(generate_diff_report(refresh_club_codes()))
