Coverage for packages / hrnz_scraper / mapper.py: 0%
94 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-05-08 08:37 +1200
« prev ^ index » next coverage.py v7.13.5, created at 2026-05-08 08:37 +1200
1"""Mapper to convert HRNZ scraped data to TipSharks data models."""
3import hashlib
4from datetime import datetime
5from typing import Any
7from packages.core.common.logging import get_logger
9logger = get_logger(__name__)
class HRNZDataMapper:
    """Maps HRNZ scraped data to TipSharks database format.

    Converts the dictionaries returned by HRNZScraper into the format
    expected by TipSharks repositories for database insertion.

    All methods are static; the class only serves as a namespace.
    """

    # Hashed identifiers are reduced modulo this value so that they fit in a
    # PostgreSQL INTEGER column.
    _ID_MODULUS = 2147483647

    # (result key, scraped field prefix) for every entity kind on a starter.
    _ENTITY_ROLES = (
        ("horses", "horse"),
        ("drivers", "driver"),
        ("trainers", "trainer"),
    )

    @staticmethod
    def _short_digest(seed: str) -> str:
        """Return the first 8 hex characters of the MD5 digest of ``seed``."""
        # MD5 is only used to derive stable identifiers, never for security,
        # so flag it accordingly (keeps working on FIPS-restricted builds).
        digest = hashlib.md5(seed.encode(), usedforsecurity=False)
        return digest.hexdigest()[:8]

    @staticmethod
    def _stable_int_id(identifier: Any, fallback_name: Any) -> int:
        """Derive a deterministic integer ID from an identifier or a name.

        Args:
            identifier: Scraped ID (e.g. a UUID string); used when truthy.
            fallback_name: Name hashed instead when ``identifier`` is falsy.

        Returns:
            Integer in ``[0, _ID_MODULUS)``.
        """
        seed = identifier if identifier else fallback_name
        if seed is None:
            # Explicit null name and no ID: hash the same placeholder that is
            # used as the default for a missing name.
            seed = "Unknown"
        # str() lets non-string identifiers (ints, UUID objects) be hashed.
        hex_prefix = HRNZDataMapper._short_digest(str(seed))
        return int(hex_prefix, 16) % HRNZDataMapper._ID_MODULUS

    @staticmethod
    def _participant(
        scraped_starter: dict[str, Any], role: str
    ) -> tuple[Any, Any, int]:
        """Return ``(scraped_id, name, integer_id)`` for one role of a starter.

        Reads ``"<role>_id"`` and ``"<role>_name"`` (name defaults to
        ``"Unknown"`` when the key is absent) from ``scraped_starter``.
        """
        scraped_id = scraped_starter.get(f"{role}_id")
        name = scraped_starter.get(f"{role}_name", "Unknown")
        return scraped_id, name, HRNZDataMapper._stable_int_id(scraped_id, name)

    @staticmethod
    def map_meeting(scraped_meeting: dict[str, Any]) -> dict[str, Any]:
        """Map scraped meeting data to TipSharks meeting format.

        Args:
            scraped_meeting: Meeting dict from HRNZScraper

        Returns:
            Meeting dict compatible with MeetingRepository.upsert(). The
            "meeting" ID is the first 8 hex characters of the MD5 digest of
            "<venue>_<date>", so the same venue and date always produce the
            same ID.
        """
        date_str = scraped_meeting.get("date", "")
        venue = scraped_meeting.get("venue", "Unknown")

        meeting_id = HRNZDataMapper._short_digest(f"{venue}_{date_str}")

        return {
            "meeting": meeting_id,  # "meeting" key for compatibility with TAB format
            "date": date_str,  # Keep ISO format string
            "name": venue,  # "name" key for compatibility with TAB format
            "category": "H",  # Harness racing
            "raw_json": scraped_meeting,  # Store original scraped data
        }

    @staticmethod
    def map_races(
        scraped_meeting: dict[str, Any], meeting_id: str
    ) -> list[dict[str, Any]]:
        """Map scraped races to TipSharks race format.

        The scraped data carries no start time, so each race is given a
        synthetic one: the meeting date at 12:00 plus 30 minutes for every
        race number above 1. "race_datetime" is omitted when the meeting has
        no date, the date cannot be parsed, or the race number is not numeric.

        Args:
            scraped_meeting: Meeting dict from HRNZScraper
            meeting_id: TipSharks meeting ID

        Returns:
            List of race dicts compatible with RaceRepository.upsert()
        """
        # Function-scope import: the module header only imports `datetime`.
        from datetime import timedelta

        meeting_date_str = scraped_meeting.get("date", "")

        # The base timestamp is the same for every race, so parse it once.
        first_race_start = None
        if meeting_date_str:
            try:
                first_race_start = datetime.fromisoformat(
                    f"{meeting_date_str}T12:00:00"
                )
            except ValueError as e:
                logger.warning(f"Could not parse race datetime: {e}")

        races = []

        # `or []` also covers an explicit `"races": None`.
        for scraped_race in scraped_meeting.get("races") or []:
            race = {
                "meeting_id": meeting_id,
                "race_number": scraped_race.get("race_number", 0),
                "distance_m": scraped_race.get("distance_m", 2000),
                "start_type": scraped_race.get("start_type", "Standing"),
                "name": scraped_race.get("name", ""),
                "weather": scraped_race.get("weather"),
                "track_condition": scraped_race.get("track_condition"),
                "raw_json": scraped_race,  # Store original scraped data
            }

            if first_race_start is not None:
                try:
                    # 30 minutes between consecutive races.
                    offset = timedelta(minutes=(race["race_number"] - 1) * 30)
                    # Store as ISO string for JSON serialization
                    race["race_datetime"] = (first_race_start + offset).isoformat()
                except (TypeError, OverflowError) as e:
                    # Null / non-numeric / absurd race number: keep the race
                    # but leave its datetime unset.
                    logger.warning(
                        f"Could not compute race datetime for race "
                        f"{race['race_number']!r}: {e}"
                    )

            # HRNZ doesn't always specify the gait, default to pace
            race["gait"] = "Pace"

            races.append(race)

        return races

    @staticmethod
    def map_starters(
        scraped_meeting: dict[str, Any],
        race_id_map: dict[int, int],
    ) -> list[dict[str, Any]]:
        """Map scraped starters to TipSharks starter format.

        Races whose race number has no (truthy) entry in ``race_id_map`` are
        skipped with a warning. Horse, driver and trainer IDs are derived with
        the same hashing as in ``map_entities``, so the IDs produced by both
        methods for the same scraped starter are equal.

        Args:
            scraped_meeting: Meeting dict from HRNZScraper
            race_id_map: Mapping of race_number to database race ID

        Returns:
            List of starter dicts compatible with StarterRepository.upsert()
        """
        starters = []

        for scraped_race in scraped_meeting.get("races") or []:
            race_number = scraped_race.get("race_number")
            race_id = race_id_map.get(race_number)

            if not race_id:
                logger.warning(
                    f"No database race_id for race {race_number}, skipping starters"
                )
                continue

            # `or []` also covers an explicit `"starters": None`.
            for scraped_starter in scraped_race.get("starters") or []:
                _, horse_name, horse_id = HRNZDataMapper._participant(
                    scraped_starter, "horse"
                )
                _, driver_name, driver_id = HRNZDataMapper._participant(
                    scraped_starter, "driver"
                )
                _, trainer_name, trainer_id = HRNZDataMapper._participant(
                    scraped_starter, "trainer"
                )

                starters.append(
                    {
                        "race_id": race_id,
                        "horse_id": horse_id,
                        # "name" field for StarterRepository compatibility
                        "name": horse_name,
                        "driver_name": driver_name,
                        "trainer_name": trainer_name,
                        "driver_id": driver_id,
                        "trainer_id": trainer_id,
                        "runner_number": scraped_starter.get("runner_number"),
                        "barrier": scraped_starter.get("barrier", 1),
                        "barrier_position": scraped_starter.get("barrier_position"),
                        "handicap_m": scraped_starter.get("handicap_m", 0),
                        "placing": scraped_starter.get("placing"),
                        "did_not_finish": bool(
                            scraped_starter.get("did_not_finish")
                        ),
                        "raw_json": scraped_starter,
                    }
                )

        return starters

    @staticmethod
    def map_entities(
        scraped_meeting: dict[str, Any],
    ) -> dict[str, list[dict[str, Any]]]:
        """Extract and map all entities (horses, drivers, trainers).

        Entities are de-duplicated by their derived integer ID; the first
        occurrence in the scraped data wins.

        Args:
            scraped_meeting: Meeting dict from HRNZScraper

        Returns:
            Dict with 'horses', 'drivers', 'trainers' lists. Each entry has
            "id", "name" and a "raw_json" dict recording the source and the
            original scraped ID under "uuid".
        """
        found: dict[str, dict[int, dict[str, Any]]] = {
            bucket: {} for bucket, _ in HRNZDataMapper._ENTITY_ROLES
        }

        for scraped_race in scraped_meeting.get("races") or []:
            for scraped_starter in scraped_race.get("starters") or []:
                for bucket, role in HRNZDataMapper._ENTITY_ROLES:
                    scraped_id, name, entity_id = HRNZDataMapper._participant(
                        scraped_starter, role
                    )
                    if entity_id not in found[bucket]:
                        found[bucket][entity_id] = {
                            "id": entity_id,
                            "name": name,
                            "raw_json": {
                                "source": "hrnz_scraper",
                                "uuid": scraped_id,
                            },
                        }

        return {bucket: list(entries.values()) for bucket, entries in found.items()}