"""Data quality validation and monitoring for harness racing data."""

from dataclasses import dataclass, field
from datetime import date, datetime

from sqlalchemy import func
from sqlalchemy.orm import Session

from packages.core.common.logging import get_logger
from packages.core.storage.models import Meeting, Race, Starter

logger = get_logger(__name__)


@dataclass
class ValidationIssue:
    """Represents a data quality issue."""

    severity: str  # "error", "warning", "info"
    category: str  # "placing", "missing_data", "suspicious", "completeness"
    message: str
    race_id: int | None = None
    meeting_id: int | None = None
    starter_id: int | None = None
    details: dict = field(default_factory=dict)


@dataclass
class DataQualityReport:
    """Summary of a data quality run over a date range.

    Holds the headline totals, every issue collected during the run, a
    dict of computed metrics and the time the report object was created.
    """

    start_date: date
    end_date: date
    total_meetings: int
    total_races: int
    total_starters: int
    issues: list[ValidationIssue] = field(default_factory=list)
    metrics: dict = field(default_factory=dict)
    generated_at: datetime = field(default_factory=datetime.now)

    def _count_by_severity(self, level: str) -> int:
        """Return how many collected issues carry the given severity."""
        matching = [issue for issue in self.issues if issue.severity == level]
        return len(matching)

    @property
    def error_count(self) -> int:
        """Number of issues whose severity is "error"."""
        return self._count_by_severity("error")

    @property
    def warning_count(self) -> int:
        """Number of issues whose severity is "warning"."""
        return self._count_by_severity("warning")

    @property
    def has_errors(self) -> bool:
        """True when at least one error-level issue was recorded."""
        return self.error_count != 0


class DataQualityValidator:
    """Validates data quality for harness racing data."""

    def __init__(self, session: Session):
        """Initialize validator.

        Args:
            session: Database session
        """
        self.session = session

    def validate_race(
        self, race: Race, starters: list[Starter]
    ) -> list[ValidationIssue]:
        """Validate a single race and its starters.

        Args:
            race: Race to validate
            starters: List of starters in the race

        Returns:
            List of validation issues found
        """
        issues = []

        # Validate placing sequence
        issues.extend(self._validate_placings(race, starters))

        # Validate data completeness
        issues.extend(self._validate_completeness(race, starters))

        # Detect suspicious results
        issues.extend(self._detect_suspicious_results(race, starters))

        return issues

    def _validate_placings(
        self, race: Race, starters: list[Starter]
    ) -> list[ValidationIssue]:
        """Validate placing sequence is valid.

        Checks:
        - No gaps in placing sequence (1, 2, 3, ... or 1, 2, 4 if 3 DNF)
        - No duplicate placings
        - First place exists
        - Placings are positive integers

        Args:
            race: Race instance
            starters: List of starters

        Returns:
            List of validation issues
        """
        issues = []

        # Get finished starters only
        finished = [
            s for s in starters if s.placing is not None and not s.did_not_finish
        ]

        if not finished:
            issues.append(
                ValidationIssue(
                    severity="warning",
                    category="placing",
                    message="No finishers in race",
                    race_id=race.id,
                    details={"starter_count": len(starters)},
                )
            )
            return issues

        placings = [s.placing for s in finished]

        # Check for duplicates
        if len(placings) != len(set(placings)):
            duplicates = [p for p in set(placings) if placings.count(p) > 1]
            issues.append(
                ValidationIssue(
                    severity="error",
                    category="placing",
                    message=f"Duplicate placings found: {duplicates}",
                    race_id=race.id,
                    details={"duplicates": duplicates},
                )
            )

        # Check first place exists
        if 1 not in placings:
            issues.append(
                ValidationIssue(
                    severity="error",
                    category="placing",
                    message="No first place finisher",
                    race_id=race.id,
                    details={"placings": sorted(placings)},
                )
            )

        # Check for gaps (allowing for DNF)
        sorted_placings = sorted(placings)
        expected = list(range(1, len(placings) + 1))

        # Allow small gaps (up to 2) for DNF cases
        max_gap = max(sorted_placings) - len(sorted_placings)
        if max_gap > 2:
            issues.append(
                ValidationIssue(
                    severity="warning",
                    category="placing",
                    message=f"Large gap in placing sequence (gap={max_gap})",
                    race_id=race.id,
                    details={"placings": sorted_placings, "expected": expected},
                )
            )

        # Check for invalid placing values
        for starter in finished:
            if starter.placing <= 0:
                issues.append(
                    ValidationIssue(
                        severity="error",
                        category="placing",
                        message=f"Invalid placing value: {starter.placing}",
                        race_id=race.id,
                        starter_id=starter.id,
                    )
                )

        return issues

    def _validate_completeness(
        self, race: Race, starters: list[Starter]
    ) -> list[ValidationIssue]:
        """Validate data completeness.

        Checks:
        - All starters have horse_id
        - Missing driver assignments
        - Missing trainer assignments
        - Missing barrier positions

        Args:
            race: Race instance
            starters: List of starters

        Returns:
            List of validation issues
        """
        issues = []

        missing_horses = sum(1 for s in starters if not s.horse_id)
        missing_drivers = sum(1 for s in starters if not s.driver_id)
        missing_trainers = sum(1 for s in starters if not s.trainer_id)
        missing_barriers = sum(1 for s in starters if s.barrier is None)

        if missing_horses > 0:
            issues.append(
                ValidationIssue(
                    severity="error",
                    category="missing_data",
                    message=f"{missing_horses} starters missing horse_id",
                    race_id=race.id,
                    details={"count": missing_horses},
                )
            )

        if missing_drivers > 0:
            issues.append(
                ValidationIssue(
                    severity="warning",
                    category="missing_data",
                    message=f"{missing_drivers} starters missing driver assignment",
                    race_id=race.id,
                    details={"count": missing_drivers},
                )
            )

        if missing_trainers > 0:
            issues.append(
                ValidationIssue(
                    severity="warning",
                    category="missing_data",
                    message=f"{missing_trainers} starters missing trainer assignment",
                    race_id=race.id,
                    details={"count": missing_trainers},
                )
            )

        if missing_barriers > 0:
            issues.append(
                ValidationIssue(
                    severity="warning",
                    category="missing_data",
                    message=f"{missing_barriers} starters missing barrier position",
                    race_id=race.id,
                    details={"count": missing_barriers},
                )
            )

        return issues

    def _detect_suspicious_results(
        self, race: Race, starters: list[Starter]
    ) -> list[ValidationIssue]:
        """Detect suspicious or anomalous results.

        Checks:
        - Very few starters (< 3)
        - Too many DNFs (> 50% of field)
        - Unusual handicap values

        Args:
            race: Race instance
            starters: List of starters

        Returns:
            List of validation issues
        """
        issues = []

        # Check field size
        if len(starters) < 3:
            issues.append(
                ValidationIssue(
                    severity="warning",
                    category="suspicious",
                    message=f"Very small field size: {len(starters)} starters",
                    race_id=race.id,
                    details={"starter_count": len(starters)},
                )
            )

        # Check DNF rate
        dnf_count = sum(1 for s in starters if s.did_not_finish)
        if len(starters) > 0:
            dnf_rate = dnf_count / len(starters)
            if dnf_rate > 0.5:
                issues.append(
                    ValidationIssue(
                        severity="warning",
                        category="suspicious",
                        message=f"High DNF rate: {dnf_rate:.1%} ({dnf_count}/{len(starters)})",
                        race_id=race.id,
                        details={"dnf_count": dnf_count, "dnf_rate": dnf_rate},
                    )
                )

        # Check for unusual handicaps
        handicaps = [s.handicap_m for s in starters if s.handicap_m is not None]
        if handicaps:
            max_handicap = max(handicaps)
            if max_handicap > 100:  # > 100m back is very unusual
                issues.append(
                    ValidationIssue(
                        severity="info",
                        category="suspicious",
                        message=f"Unusually large handicap: {max_handicap}m",
                        race_id=race.id,
                        details={"max_handicap": max_handicap},
                    )
                )

        return issues

    def generate_report(self, from_date: date, to_date: date) -> DataQualityReport:
        """Generate comprehensive data quality report for date range.

        Args:
            from_date: Start date (inclusive)
            to_date: End date (inclusive)

        Returns:
            DataQualityReport with all issues and metrics
        """
        logger.info(f"Generating data quality report from {from_date} to {to_date}")

        # Get all meetings in date range
        meetings = (
            self.session.query(Meeting)
            .filter(
                Meeting.meeting_date >= from_date,
                Meeting.meeting_date <= to_date,
            )
            .all()
        )

        all_issues = []
        total_races = 0
        total_starters = 0

        # Validate each race
        for meeting in meetings:
            for race in meeting.races:
                total_races += 1
                starters = race.starters
                total_starters += len(starters)

                race_issues = self.validate_race(race, starters)
                all_issues.extend(race_issues)

        # Compute additional metrics
        metrics = self._compute_metrics(meetings, total_races, total_starters)

        report = DataQualityReport(
            start_date=from_date,
            end_date=to_date,
            total_meetings=len(meetings),
            total_races=total_races,
            total_starters=total_starters,
            issues=all_issues,
            metrics=metrics,
        )

        logger.info(
            f"Data quality report complete: {report.error_count} errors, "
            f"{report.warning_count} warnings across {total_races} races"
        )

        return report

    def _compute_metrics(
        self, meetings: list[Meeting], total_races: int, total_starters: int
    ) -> dict:
        """Compute data quality metrics.

        Args:
            meetings: List of meetings
            total_races: Total race count
            total_starters: Total starter count

        Returns:
            Dictionary of metrics
        """
        if not meetings or total_races == 0:
            return {}

        # Compute averages
        avg_races_per_meeting = total_races / len(meetings) if meetings else 0
        avg_starters_per_race = total_starters / total_races if total_races else 0

        # Compute completeness metrics
        total_with_driver = (
            self.session.query(func.count(Starter.id))
            .join(Starter.race)
            .join(Race.meeting)
            .filter(
                Meeting.meeting_date >= meetings[0].meeting_date,
                Meeting.meeting_date <= meetings[-1].meeting_date,
                Starter.driver_id.isnot(None),
            )
            .scalar()
        )

        total_with_trainer = (
            self.session.query(func.count(Starter.id))
            .join(Starter.race)
            .join(Race.meeting)
            .filter(
                Meeting.meeting_date >= meetings[0].meeting_date,
                Meeting.meeting_date <= meetings[-1].meeting_date,
                Starter.trainer_id.isnot(None),
            )
            .scalar()
        )

        driver_completeness = (
            total_with_driver / total_starters if total_starters > 0 else 0
        )
        trainer_completeness = (
            total_with_trainer / total_starters if total_starters > 0 else 0
        )

        return {
            "avg_races_per_meeting": round(avg_races_per_meeting, 2),
            "avg_starters_per_race": round(avg_starters_per_race, 2),
            "driver_assignment_rate": round(driver_completeness, 3),
            "trainer_assignment_rate": round(trainer_completeness, 3),
        }


def check_data_freshness(
    session: Session, max_age_days: int = 7
) -> ValidationIssue | None:
    """Report whether the most recent meeting is recent enough.

    The meeting with the latest meeting date is looked up and its age in
    days relative to today is compared against ``max_age_days``.

    Args:
        session: Database session
        max_age_days: Maximum acceptable age of most recent meeting

    Returns:
        An error issue when no meeting exists, a warning issue when the
        newest meeting is older than ``max_age_days``, None otherwise
    """
    newest = session.query(Meeting).order_by(Meeting.meeting_date.desc()).first()

    if newest:
        age_in_days = (date.today() - newest.meeting_date).days
        if age_in_days <= max_age_days:
            return None

        stale_details = {
            "latest_meeting_date": newest.meeting_date.isoformat(),
            "days_old": age_in_days,
        }
        return ValidationIssue(
            severity="warning",
            category="completeness",
            message=f"Data may be stale: most recent meeting is {age_in_days} days old",
            meeting_id=newest.id,
            details=stale_details,
        )

    # Nothing in the table at all
    return ValidationIssue(
        severity="error",
        category="completeness",
        message="No meetings found in database",
    )
