Coverage for packages / hrnz_scraper / mapper.py: 0%

94 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-05-08 08:37 +1200

1"""Mapper to convert HRNZ scraped data to TipSharks data models.""" 

2 

3import hashlib 

4from datetime import datetime 

5from typing import Any 

6 

7from packages.core.common.logging import get_logger 

8 

9logger = get_logger(__name__) 

10 

11 

class HRNZDataMapper:
    """Maps HRNZ scraped data to TipSharks database format.

    Converts the dictionaries returned by HRNZScraper into the format
    expected by TipSharks repositories for database insertion.

    All methods are static. The input dictionaries are never mutated; they
    are referenced (not copied) under the ``raw_json`` keys of the output.
    """

    # Hashed IDs are reduced modulo this value (2**31 - 1) so that they fit
    # in a PostgreSQL INTEGER column.
    _MAX_INT_ID = 2147483647

    # Fallback used when the scraper supplies no (or a null) name.
    _UNKNOWN = "Unknown"

    @staticmethod
    def _hashed_int_id(seed: Any) -> int:
        """Return a deterministic integer ID derived from *seed*.

        The first 8 hex digits of the MD5 digest of ``str(seed)`` are read as
        an integer and reduced modulo ``_MAX_INT_ID``. MD5 is used purely as
        a stable fingerprint, not for security, hence ``usedforsecurity=False``
        (the digest itself is unaffected by that flag).
        """
        digest = hashlib.md5(
            str(seed).encode(), usedforsecurity=False
        ).hexdigest()
        return int(digest[:8], 16) % HRNZDataMapper._MAX_INT_ID

    @staticmethod
    def _resolve_entity(
        scraped_starter: dict[str, Any], kind: str
    ) -> tuple[Any, Any, int]:
        """Resolve the source ID, display name and integer ID of an entity.

        Args:
            scraped_starter: Starter dict from HRNZScraper
            kind: Key prefix of the entity: "horse", "driver" or "trainer"

        Returns:
            Tuple of (source ID as scraped, name, hashed integer ID). The
            integer ID is hashed from the source ID when it is truthy and
            from the name otherwise. A missing or null name becomes
            "Unknown".
        """
        source_id = scraped_starter.get(f"{kind}_id")
        name = scraped_starter.get(f"{kind}_name")
        if name is None:
            # A key that is present but null must not bypass the default;
            # otherwise hashing the name would fail on None.
            name = HRNZDataMapper._UNKNOWN
        seed = source_id if source_id else name
        return source_id, name, HRNZDataMapper._hashed_int_id(seed)

    @staticmethod
    def map_meeting(scraped_meeting: dict[str, Any]) -> dict[str, Any]:
        """Map scraped meeting data to TipSharks meeting format.

        Args:
            scraped_meeting: Meeting dict from HRNZScraper

        Returns:
            Meeting dict compatible with MeetingRepository.upsert()
        """
        date_str = scraped_meeting.get("date", "")
        venue = scraped_meeting.get("venue")
        if venue is None:
            # Treat an explicit null venue the same as a missing one.
            venue = HRNZDataMapper._UNKNOWN

        # Deterministic meeting ID: first 8 hex digits of md5("venue_date").
        meeting_id_seed = f"{venue}_{date_str}"
        meeting_id = hashlib.md5(
            meeting_id_seed.encode(), usedforsecurity=False
        ).hexdigest()[:8]

        return {
            "meeting": meeting_id,  # "meeting" key for compatibility with TAB format
            "date": date_str,  # Keep ISO format string
            "name": venue,  # "name" key for compatibility with TAB format
            "category": "H",  # Harness racing
            "raw_json": scraped_meeting,  # Store original scraped data
        }

    @staticmethod
    def map_races(
        scraped_meeting: dict[str, Any], meeting_id: str
    ) -> list[dict[str, Any]]:
        """Map scraped races to TipSharks race format.

        Each race gets a synthetic ``race_datetime``: the meeting date at
        12:00 plus 30 minutes per race after the first. The key is omitted
        when the meeting has no date, the date cannot be parsed, or the race
        number is not usable in arithmetic; a warning is logged in the
        latter two cases.

        Args:
            scraped_meeting: Meeting dict from HRNZScraper
            meeting_id: TipSharks meeting ID

        Returns:
            List of race dicts compatible with RaceRepository.upsert()
        """
        from datetime import timedelta

        scraped_races = scraped_meeting.get("races") or []
        meeting_date_str = scraped_meeting.get("date", "")

        # The meeting date is the same for every race, so parse it once.
        first_race_start = None
        if meeting_date_str and scraped_races:
            try:
                # Default to 12:00 PM because no specific time is scraped.
                first_race_start = datetime.fromisoformat(
                    f"{meeting_date_str}T12:00:00"
                )
            except ValueError as e:
                logger.warning(f"Could not parse race datetime: {e}")

        races = []
        for scraped_race in scraped_races:
            race = {
                "meeting_id": meeting_id,
                "race_number": scraped_race.get("race_number", 0),
                "distance_m": scraped_race.get("distance_m", 2000),
                "start_type": scraped_race.get("start_type", "Standing"),
                "name": scraped_race.get("name", ""),
                "weather": scraped_race.get("weather"),
                "track_condition": scraped_race.get("track_condition"),
                "raw_json": scraped_race,  # Store original scraped data
            }

            if first_race_start is not None:
                try:
                    # Race number offset: 30 minutes per race.
                    offset = timedelta(minutes=(race["race_number"] - 1) * 30)
                    # Stored as an ISO string for JSON serialization.
                    race["race_datetime"] = (first_race_start + offset).isoformat()
                except (TypeError, ValueError, OverflowError) as e:
                    # A null or non-numeric race number must not abort the
                    # whole meeting; the race is kept without a datetime.
                    logger.warning(
                        "Could not compute race datetime for race "
                        f"{race['race_number']!r}: {e}"
                    )

            # HRNZ doesn't always specify the gait, so default to pace.
            race["gait"] = "Pace"

            races.append(race)

        return races

    @staticmethod
    def map_starters(
        scraped_meeting: dict[str, Any],
        race_id_map: dict[int, int],
    ) -> list[dict[str, Any]]:
        """Map scraped starters to TipSharks starter format.

        Races whose number has no entry in ``race_id_map`` are skipped with a
        warning. Horse, driver and trainer IDs are the same hashed integers
        that ``map_entities`` produces for the same starter.

        Args:
            scraped_meeting: Meeting dict from HRNZScraper
            race_id_map: Mapping of race_number to database race ID

        Returns:
            List of starter dicts compatible with StarterRepository.upsert()
        """
        resolve = HRNZDataMapper._resolve_entity
        starters = []

        for scraped_race in scraped_meeting.get("races") or []:
            race_number = scraped_race.get("race_number")
            race_id = race_id_map.get(race_number)

            # Compare against None explicitly: 0 is a usable database ID.
            if race_id is None:
                logger.warning(
                    f"No database race_id for race {race_number}, skipping starters"
                )
                continue

            for scraped_starter in scraped_race.get("starters") or []:
                _, horse_name, horse_id = resolve(scraped_starter, "horse")
                _, driver_name, driver_id = resolve(scraped_starter, "driver")
                _, trainer_name, trainer_id = resolve(scraped_starter, "trainer")

                starters.append(
                    {
                        "race_id": race_id,
                        "horse_id": horse_id,
                        "name": horse_name,  # "name" field for StarterRepository compatibility
                        "driver_name": driver_name,
                        "trainer_name": trainer_name,
                        "driver_id": driver_id,
                        "trainer_id": trainer_id,
                        "runner_number": scraped_starter.get("runner_number"),
                        "barrier": scraped_starter.get("barrier", 1),
                        "barrier_position": scraped_starter.get("barrier_position"),
                        "handicap_m": scraped_starter.get("handicap_m", 0),
                        "placing": scraped_starter.get("placing"),
                        "did_not_finish": bool(scraped_starter.get("did_not_finish")),
                        "raw_json": scraped_starter,
                    }
                )

        return starters

    @staticmethod
    def map_entities(
        scraped_meeting: dict[str, Any],
    ) -> dict[str, list[dict[str, Any]]]:
        """Extract and map all entities (horses, drivers, trainers).

        Entities are de-duplicated by their hashed integer ID; the first
        occurrence in race/starter order wins.

        Args:
            scraped_meeting: Meeting dict from HRNZScraper

        Returns:
            Dict with 'horses', 'drivers', 'trainers' lists
        """
        resolve = HRNZDataMapper._resolve_entity

        # Output key -> (starter key prefix, entities seen so far by ID).
        buckets: dict[str, tuple[str, dict[int, dict[str, Any]]]] = {
            "horses": ("horse", {}),
            "drivers": ("driver", {}),
            "trainers": ("trainer", {}),
        }

        for scraped_race in scraped_meeting.get("races") or []:
            for scraped_starter in scraped_race.get("starters") or []:
                for kind, seen in buckets.values():
                    source_id, name, entity_id = resolve(scraped_starter, kind)
                    if entity_id not in seen:
                        seen[entity_id] = {
                            "id": entity_id,
                            "name": name,
                            "raw_json": {"source": "hrnz_scraper", "uuid": source_id},
                        }

        return {key: list(seen.values()) for key, (_, seen) in buckets.items()}