2022-10-30 18:20:59 +00:00
import json
2022-10-30 03:35:36 +00:00
import re
2022-10-30 18:24:57 +00:00
from urllib . parse import urlparse
2022-10-30 04:46:49 +00:00
import httpx
import parsel
2022-10-30 18:20:59 +00:00
import pyjq
2024-10-09 23:11:11 +00:00
import logging
2022-10-30 04:46:49 +00:00
2024-10-09 23:11:11 +00:00
logger = logging . getLogger ( __name__ )
2022-10-30 03:35:36 +00:00
class ScrapeTarget:
    """One scrape job: fetch a URL, select a fragment of the response and
    extract a numeric value from it.

    The response is interpreted either as HTML (``selector`` is a CSS
    selector evaluated with parsel) or as JSON (``selector`` is a jq
    expression evaluated with pyjq), depending on ``parser``.
    """

    def __init__(self, product_name, url, selector, target_name=None, regex=None, parser=None):
        """Configure the target and create its HTTP client.

        Args:
            product_name: Name of the product this target belongs to.
            url: URL to query.
            selector: CSS selector (``parser='html'``) or jq expression
                (``parser='json'``) locating the value in the response.
            target_name: Name of the target; falls back to the hostname of
                ``url`` when empty or omitted.
            regex: Pattern used to pull the number out of the selected text;
                defaults to an integer with an optional two-digit fraction.
            parser: ``'html'`` or ``'json'``; defaults to ``'html'``.

        Raises:
            ValueError: If ``parser`` is not one of the supported parsers.
        """
        self.product_name = product_name
        self.target_name = target_name if target_name else urlparse(url).hostname
        self.url = url
        self.selector = selector
        self.regex = re.compile(regex if regex else r'[0-9]+(\.[0-9]{2})?')
        self.parser = parser if parser else 'html'
        self.headers = {}

        # sanity check -- done before the HTTP client is created so that a
        # rejected configuration does not leave an unclosed client behind
        valid_parsers = ('html', 'json')
        if self.parser not in valid_parsers:
            raise ValueError(
                "Invalid parser configured (got '%s' but need one of %s) product: '%s', target: '%s'"
                % (self.parser, valid_parsers, self.product_name, self.target_name)
            )

        self.client = httpx.Client(
            follow_redirects=True,
            http2=True,
        )

    def query_target(self):
        """Fetch the target URL and return the extracted value as a float.

        Raises:
            ScrapeError: If the selector matches nothing, the regex does not
                match the selected text, or the parser is invalid.

        Errors raised by the HTTP client, ``json.loads`` or ``float()`` are
        not caught here and propagate to the caller.
        """
        # some sites get suspicious if we talk to them in HTTP/1.1 (maybe because it doesn't match our user-agent?)
        # we use httpx to have HTTP2 support and circumvent that issue
        query_response = self.client.get(
            self.url,
            headers=self.headers,
        )
        logger.info('Status: %s', query_response.status_code)
        query_response_text = query_response.text
        logger.debug('Response: %s', query_response_text)

        # parse the response and match the selector
        selector_match = self._match_selector(query_response_text)
        if not selector_match:
            raise ScrapeError('Failed to match selector!')

        # match the regex and convert the result to float
        return self._parse_value(selector_match)

    def _match_selector(self, response_text):
        """Apply the configured selector to the response body; return the selected text or None."""
        if self.parser == 'html':
            # parse response as html
            return parsel.Selector(text=response_text).css(self.selector).get()
        if self.parser == 'json':
            # parse response as json
            first_result = pyjq.first(self.selector, json.loads(response_text))
            # A missing result comes back as None; stringifying it would give
            # the truthy text 'None' and hide the selector failure behind a
            # misleading regex failure.
            return None if first_result is None else str(first_result)
        raise ScrapeError('Invalid parser!')

    def _parse_value(self, selector_match):
        """Return the first regex match found in the selected text, converted to float."""
        regex_match = self.regex.search(selector_match)
        if not regex_match:
            raise ScrapeError('Failed to match regex!')
        return float(regex_match.group(0))
class ScrapeError(Exception):
    """Error raised when scraping a target fails."""

    def __init__(self, msg):
        # Hand the message to the base exception so str(error) returns it.
        Exception.__init__(self, msg)