
    %isH                         S r SSKrSSKrSSKJr  SSKJr  SSKJr  SSKJ	r	   SSK
JrJrJr  SSKJr  SS	KJr  \" \5      r " S
 S\5      r " S S5      rg! \ a	    Sr\=rr N8f = f)a  HRNZ web scraper for extracting historical race data using Playwright.

This scraper extracts race results from the HRNZ InfoHorse results archive.
It respects rate limits and implements polite scraping practices.

WARNING: Web scraping should only be used if official API access is not available.
Always check HRNZ's Terms of Service and consider contacting them for official data access.
    N)datetime)Any)urljoin)BeautifulSoup)BrowserPageasync_playwright)
get_logger)build_decodo_proxyc                       \ rS rSrSrSrg)HRNZScraperError   z'Base exception for HRNZ scraper errors. N)__name__
__module____qualname____firstlineno____doc____static_attributes__r       B/root/tipsharks/tipsharks-elo-api/packages/hrnz_scraper/scraper.pyr   r      s    1r   r   c                   \   \ rS rSrSrSrSrS S\4S jjrS r	S r
S	 rS
 rS\S\4S jrS\S\\\4   4S jrS\S\\\4   4S jrS\S\S-  4S jrS\S\\\\4      4S jrS\S\S\\\4   4S jr\S\S\\\4   4S j5       rS\S\\\4   S\\\4   S-  4S jrS\S\S-  4S jrSrg)!HRNZScraper$   a@  Scraper for HRNZ InfoHorse results archive using Playwright.

This scraper extracts race meetings, races, and results from the
publicly accessible HRNZ results archive at infohorse.hrnz.co.nz.

Example:
    >>> async with HRNZScraper() as scraper:
    >>>     meeting = await scraper.get_meeting_results('010741rs.htm')
z-https://infohorse.hrnz.co.nz/datahrs/results/g       @timeoutc                 :    Xl         SU l        SU l        SU l        g)zfInitialize HRNZ scraper.

Args:
    timeout: Request timeout in milliseconds (default: 30000ms = 30s)
Ng        )r   _playwright_browser_last_request_time)selfr   s     r   __init__HRNZScraper.__init__4   s      (,"%r   c                 B   #    U R                  5       I Sh  vN   U $  N7f)zAsync context manager entry.N)_ensure_browserr    s    r   
__aenter__HRNZScraper.__aenter__?   s!     ""$$$ 	%s   c                 @   #    U R                  5       I Sh  vN   g N7f)zAsync context manager exit.N)close)r    exc_typeexc_valexc_tbs       r   	__aexit__HRNZScraper.__aexit__D   s     jjls   c                 (  #    [         c  [        S5      eU R                  ck  [        5       R                  5       I Sh  vN U l        U R                  R
                  R                  SS9I Sh  vN U l        [        R                  S5        gg NP N"7f)z)Ensure Playwright browser is initialized.NzOplaywright is required for HRNZ scraping; install it or use the API ingest pathT)headlesszPlaywright browser initialized)	r	   ImportErrorr   startr   chromiumlaunchloggerinfor%   s    r   r$   HRNZScraper._ensure_browserH   sz     #a  == %5%7%=%=%??D"&"2"2";";"B"BD"B"QQDMKK89 !?Qs!   ;BB/B-B.!BBc                   #    U R                   b)  U R                   R                  5       I Sh  vN   SU l         U R                  b?  U R                  R                  5       I Sh  vN   SU l        [        R                  S5        gg NX N$7f)zClose Playwright browser.NzPlaywright browser closed)r   r)   r   stopr5   r6   r%   s    r   r)   HRNZScraper.closeS   st     ==$--%%''' DM'""'')))#DKK34 ( ( *s!   +B
B5B
#B$#B
B
urlreturnc                   #    U R                  5       I Sh  vN   SSKnUR                  5       U R                  -
  nX0R                  :  a+  [        R
                  " U R                  U-
  5      I Sh  vN   [        R                  SU 35         [        5       nU(       a!  U R                  R                  US9I Sh  vN O!U R                  R                  5       I Sh  vN nUR                  5       I Sh  vN n UR                  XR                  SS9I Sh  vN   UR                  S5      I Sh  vN   UR                  5       I Sh  vN nUR                  5       U l        UUR!                  5       I Sh  vN   UR!                  5       I Sh  vN   $  GN GN( N N N N Nq N[ N/ N! UR!                  5       I Sh  vN    UR!                  5       I Sh  vN    f = f! ["         a  n[%        SU S	U 35      UeSnAff = f7f)
zFetch page with rate limiting and wait for content.

Args:
    url: URL to fetch

Returns:
    Page HTML content

Raises:
    HRNZScraperError: If request fails
Nr   z
Fetching: )proxynetworkidle)r   
wait_untili  zFailed to fetch : )r$   timer   RATE_LIMIT_DELAYasynciosleepr5   debugr   r   new_contextnew_pagegotor   wait_for_timeoutcontentr)   	Exceptionr   )	r    r;   rB   elapsedr>   contextpagerK   es	            r   _rate_limited_fetchHRNZScraper._rate_limited_fetch]   s     ""$$$ 	))+ 7 77***-- 5 5 ?@@@z#'(	I&(E  mm//e/<<<==4466 
  '//11D&ii\\miTTT ++D111 !%.*.))+'jjl""mmo%%C 	% A =61 U 2 /
 #% jjl""mmo%% 	I"%5cU"QC#@AqH	Is  HFAH1F2H-G# <F=!G# FG# 6F 7G# <F, F"F, 2F$3F, 
F&F, %G# 8F(9G# F*G# HHG# G#  G# "F, $F, &F, (G# *G# ,G  GG GG  G# #
H-G??HHc           
        #    UR                  S5      (       d  [        U R                  U5      nU R                  U5      I Sh  vN n[	        US5      nU R                  U5      nU R                  U5      nXTS'   XS'   [        R                  SUR                  S5       SUR                  S	5       S
[        U5       S35        U$  N7f)a:  Scrape results from a specific meeting page.

Args:
    url: URL to meeting results page (e.g., '010741rs.htm' or full URL)

Returns:
    Dictionary containing meeting and race data

Example:
    >>> meeting = await scraper.get_meeting_results('102402rs.htm')
    >>> print(f"Found {len(meeting['races'])} races")
httpNzhtml.parserraces
source_urlzScraped meeting: venuez on datez - z races)
startswithr   BASE_URLrQ   r   _parse_meeting_header_parse_racesr5   r6   getlen)r    r;   htmlsoupmeeting_datarU   s         r   get_meeting_resultsHRNZScraper.get_meeting_results   s      ~~f%%$---C--c22T=1 11$7 !!$' %W%(\" 0 0 9: ;""6*+3s5zl&B	

 # 3s   ACCBCr`   c                    0 nUR                  S5      nU(       a[  UR                  SS9nUR                  SS5      R                  SS5      R                  5       nXBS'   [        R                  SU 35        UR                  S	S
S9nU(       aG  UR                  SS9nXbS'   U R                  U5      nU(       a  XrS'   [        R                  SU 35        UR                  S	SS9nU(       ak  UR                  S5      n	U	(       aS  U	R                  SS9n
SU
;   a"  U
R                  S5      S   R                  5       n
XS'   [        R                  SU
 35        U$ )zParse meeting header information from HRNZ page.

HRNZ uses specific structure:
- h1 tag for venue/club name
- div.hrnz-content__date for date
- h5 tag for meeting details

Args:
    soup: BeautifulSoup object

Returns:
    Meeting metadata
h1Tstripz Inc z Inc.rW   zFound venue: divzhrnz-content__date)class_date_rawrX   zFound date: zhrnz-field__meetingh5z at r   namezFound meeting name: )findget_textreplacerg   r5   rF   _parse_datesplit)r    r`   meetingre   rW   date_div	date_textparsed_datemeeting_divrl   meeting_names              r   r[   !HRNZScraper._parse_meeting_header   sM     YYt_KKdK+EMM&"-55grBHHJE$GLL=01 99U+?9@ )))5I"+J**95K"-|K=9: ii.CiD!!$'B!{{{6\)#/#5#5f#=a#@#F#F#HL".3L>BCr   date_strNc                 r   SSK nUR                  5       R                  SS5      n/ SQnU H  n [         R                  " UR                  5       U5      nSU;  a9  SU;  a3  UR                   R	                  5       R
                  nUR                  US9nUR
                  S	:  aI  UR
                  S
::  a  UR                  UR
                  S-   S9nOUR                  UR
                  S-   S9nUR                  5       R                  5       s  $    [        R                  SU 35        g! [         a     GM
  f = f)zParse date string into ISO format.

Args:
    date_str: Date string in various formats

Returns:
    ISO format date string (YYYY-MM-DD) or None
r   N     )z%A, %d %B %Yz	%A, %d %Bz%d %B %Yz%d %Bz%d/%m/%Yz%d-%m-%Yz%d/%m/%yz%d-%m-%yz%Yz%y)yeard   2   i  il  zCould not parse date: )r   rg   rp   strptimenowr~   rX   	isoformat
ValueErrorr5   warning)r    rz   dtformatsfmtparsedcurrent_years          r   rq   HRNZScraper._parse_date   s    	 >>#++FC8	
 C!**8>>+;SA s?t3#%;;??#4#9#9L#^^^>F ;;${{b(!'V[[45G!H!'V[[45G!H{{}..00# * 	/z:;	  s   CD''
D65D6c                 D   / nUR                  S5      n[        R                  S[        U5       S35        [	        US5       H  u  pE U R                  XT5      nU(       ap  UR                  S5      (       aZ  [        US   5      S:  aH  UR                  U5        [        R                  SU S[        UR                  S/ 5      5       S	35        M  [        R                  S
U S35        M     U$ ! [         a&  n[        R                  SU SU 35         SnAM  SnAff = f)zParse all races from meeting page.

HRNZ results are in HTML tables. Each table represents one race.
Tables have headers: PlacePl, BookBk, Horse, Barrier, Hcap, Stakes, etc.

Args:
    soup: BeautifulSoup object

Returns:
    List of race dictionaries
tablezFound z tables on page   startersr   zParsed race rA   z	 starterszTable z had no starters, skippingzFailed to parse table N)
find_allr5   rF   r^   	enumerate_parse_race_tabler]   appendrL   r   )r    r`   rU   tablesidxr   racerP   s           r   r\   HRNZScraper._parse_races  s     w'vc&k]/:;#FA.JC--e9DHHZ00Sj9I5JQ5NLL&LL&se2c$((:r2J.K-LIV LL6#.H!IJ / 	  !7uBqcBCs   BC/C//
D9DDr   race_numberc                 V   U/ S.nUR                  S5      nU(       av  UR                  SS9n[        R                  " SU[        R                  5      nU(       a:  [        UR                  S5      5      US'   [        UR                  S5      5      US	'   UR                  S
5      nU R                  U5      nU Hm  n	U	R                  SS/5      n
[        U
5      S:  a  M'  U	R                  S5      (       a  M?  U R                  X5      nU(       d  MY  US   R                  U5        Mo     U$ )zParse individual race table.

Args:
    table: BeautifulSoup table element
    race_number: Race number (fallback if not in HTML)

Returns:
    Race dictionary with starters
)r   r   captionTrf   zRace\s+(\d+).*?(\d+)mr   r      
distance_mtrtdth   r   )rn   ro   research
IGNORECASEintgroupr   _build_header_mapr^   _parse_starter_rowr   )r    r   r   r   r   caption_textrace_info_matchrows
header_maprowcellsstarters               r   r   HRNZScraper._parse_race_table9  s     +; **Y'"++$+7L ii(,O &)/*?*?*B&C]#%()>)>q)A%B\" ~~d#++D1
CLL$.E5zA~ xx~~--e@GwZ ''0  r   r   c                     U  Hv  nUR                  S5      nU(       d  M  0 n[        U5       H=  u  pEUR                  SS9nU(       d  M  XCUR                  5       R	                  5       '   M?     U(       d  Mt  Us  $    0 $ )z0Build a header map from column names to indices.r   Trf   )r   r   ro   rg   lower)r   r   headersr   r   headertexts          r   r   HRNZScraper._build_header_mapf  sx     Cll4(GJ(1T247:tzz|1134  2 z!!  	r   r   r   c                 :  ^^  [        T5      S:  a  g0 nS[        S[        S-  4UU4S jjnU" S5      =(       d    U" S5      =(       d    TS   nUR                  S	S
9nU(       a`  [        R
                  " SU5      nU(       a  [        UR                  S5      5      US'   O$UR                  5       S;   a  S	US'   SUS'   OSUS'   U" S5      =(       d    TS   nUR                  S5      n	U	(       aA  U	R                  S	S
9US'   U	R                  SS5      n
U R                  U
5      nU(       a  XS'   OUR                  S	S
9nU(       a  XS'   U" S5      =(       d    U" S5      =(       d    TS   nUR                  S	S
9nU(       a;  [        R
                  " SU5      nU(       a  [        UR                  S5      5      US'   U" S5      =(       d    U" S5      =(       d    TS   nUR                  S	S
9nU(       aU  UR                  5       S:X  a  SUS'   O;[        R
                  " SU5      nU(       a  [        UR                  S5      5      US'   [        T5      S :  a  TS    R                  S	S
9OSnU(       a  US:w  a  UUS!'   [        T5      S":  a  TS"   R                  S	S
9OSnU(       a  US:w  a  UUS#'   U" S$5      nU(       a~  UR                  S5      nU(       aK  UR                  S	S
9nU(       a  UUS%'   UR                  SS5      nU R                  U5      nU(       a  UUS&'   OUR                  S	S
9nU(       a  UUS%'   U" S'5      nU(       a~  UR                  S5      nU(       aK  UR                  S	S
9nU(       a  UUS('   UR                  SS5      nU R                  U5      nU(       a  UUS)'   OUR                  S	S
9nU(       a  UUS('   UR                  S5      (       a  U$  g! [         a"  n[        R                  S*U 35         SnAgSnAff = f)+a  Parse a single starter row from race table.

HRNZ table structure (as of 2026):
cells[0] = PlacePl (placing)
cells[1] = BookBk (book number)
cells[2] = Horse (with link containing UUID)
cells[3] = Barrier
cells[4] = Hcap (handicap)
cells[5] = Stakes
cells[6] = Fav (favorite odds)
cells[7] = Time
cells[8] = Margin
cells[9] = Time/Margin

Args:
    cells: List of table cells

Returns:
    Starter dictionary or None
   Nlabelr<   c                   > U R                  5       nT HC  nUR                  S5      nU(       d  M  UR                  5       R                  5       U:X  d  MA  Us  $    TR                  U5      nUb  U[        T5      :  a  TU   $ g )Nz
data-label)r   r]   rg   r^   )r   targetcell
data_labelr   r   r   s        r   _cell_by_label6HRNZScraper._parse_starter_row.<locals>._cell_by_label  sv    !D!%,!7J!zj&6&6&8&>&>&@F&J# " !nnV,?sSZ'7 :%r   PlacingPlacer   Trf   z(\d+)r   placing)DNSDNFDSQLRSCRNPdid_not_finishHorser   a
horse_namehrefrh   horse_idBarrierDraw   barrierHcapHCPr   fr
handicap_m   	race_time   marginDriverdriver_name	driver_idTrainertrainer_name
trainer_idzError parsing starter row: )r^   strr   ro   r   matchr   r   upperrn   r]   _extract_uuidr   rL   r5   rF   ) r    r   r   r   r   placing_cellpos_text	pos_match
horse_cell
horse_link
horse_href
horse_uuidr   barrier_cellbarrier_textbarrier_match	hcap_cell	hcap_text
hcap_match	time_textmargin_textdriver_celldriver_linkr   driver_hrefdriver_uuidtrainer_celltrainer_linkr   trainer_hreftrainer_uuidrP   s     ``                             r   r   HRNZScraper._parse_starter_rowv  s   .t	<5zA~G	c 	cDj 	 	 y)P^G-DPa  $,,4,8HHHXx8	),Y__Q-?)@GI&^^%)QQ04G,-)-GI&)-GI& (0<E!HJ#-J(2(;(;$(;(G%'^^FB7
!//
;
*4J' (00t0<
,6L) y)O^F-COuQx  (00t0<L "< @ ),]-@-@-C)DGI& 'v.S.2GS5QR8I!***6I??$,,-GL)!#(I!>J!03J4D4DQ4G0H- :=Uaa)))5RIY"_'0$ <?u:>%(++$+7rK{b0$/!(2K)..s3"-"6"6T"6"BK"1<."-//&""=K"&"4"4["AK"/:,"-"6"6T"6"BK"1<.))4L+005#/#8#8t#8#DL#2>/#/#3#3FB#?L#'#5#5l#CL#0<-#/#8#8t#8#DL#2>/ {{<(( )   	<LL6qc:;;	<s   O. OO. .
P8PPr   c                     [         R                  " SU[         R                  5      nU(       a  UR                  S5      $ g)zgExtract UUID from href string.

Args:
    href: href attribute value

Returns:
    UUID string or None
z([0-9A-F-]{36})r   N)r   r   r   r   )r    r   
uuid_matchs      r   r   HRNZScraper._extract_uuid  s2     YY14G
##A&&r   )r   r   r   r   )i0u  )r   r   r   r   r   rZ   rC   floatr!   r&   r-   r$   r)   r   rQ   dictr   rb   r   r[   rq   listr\   r   r   staticmethodr   r   r   r   r   r   r   r   r   $   s\    ?H 	& 	&
	:5/IS /IS /Ib"S "T#s(^ "H0- 0DcN 0d0C 0C$J 0d! !4S#X3G !F+"+14+	c3h+Z  c3h  MM'+CH~M	c3h$	M^# #* r   r   )r   rD   r   r   typingr   urllib.parser   bs4r   playwright.async_apir   r   r	   r1   packages.core.common.loggingr
   packages.hrnz_scraper.proxyr   r   r5   rL   r   r   r   r   r   <module>r	     st     	     DD
 4 :	H		y 	m m!  Gds   
A A'&A'