2022-10-30 18:20:59 +00:00
import json
2022-10-30 03:35:36 +00:00
import re
2022-10-30 18:24:57 +00:00
from urllib . parse import urlparse
2024-10-10 02:37:34 +00:00
# import httpx
import requests
2022-10-30 04:46:49 +00:00
import parsel
2022-10-30 18:20:59 +00:00
import pyjq
2024-10-09 23:11:11 +00:00
import logging
2022-10-30 04:46:49 +00:00
2024-10-09 23:11:11 +00:00
logger = logging . getLogger ( __name__ )
2022-10-30 03:35:36 +00:00
class ScrapeTarget:
    """One scrape source for a product: a URL plus the rules to pull a price out of it.

    The response body is parsed either as HTML (``selector`` is a CSS
    selector evaluated with parsel) or as JSON (``selector`` is a jq program
    evaluated with pyjq).  The selected text is then searched with ``regex``
    and the matched text is converted to a float.
    """

    # Digits with optional comma separators and an optional two-digit
    # decimal part, e.g. "1,234.56".
    _DEFAULT_REGEX = r'([0-9]+,?)+(\.[0-9]{2})?'
    _VALID_PARSERS = ('html', 'json')

    def __init__(self, product_name, url, selector, target_name=None, regex=None, parser=None, headers=None):
        """Configure the target.

        Args:
            product_name: Name of the product this target belongs to.
            url: URL fetched by ``query_target``.
            selector: CSS selector (html parser) or jq program (json parser).
            target_name: Label for this target; defaults to the hostname of ``url``.
            regex: Pattern searched in the selected text; defaults to a
                number with optional comma separators and two decimals.
            parser: ``'html'`` or ``'json'``; defaults to ``'html'``.
            headers: Extra request headers.  The mapping is copied, never
                modified; ``Referer`` and ``DNT`` are added when absent.

        Raises:
            ValueError: If ``parser`` is not one of the supported parsers.
        """
        self.product_name = product_name
        self.target_name = target_name if target_name else urlparse(url).hostname
        self.url = url
        self.selector = selector
        self.regex = re.compile(regex if regex else self._DEFAULT_REGEX)
        self.parser = parser if parser else 'html'
        self.headers = self._prepare_headers(headers)
        # One session per target, reused by every query_target() call.
        self.session = requests.Session()

        # sanity check
        if self.parser not in self._VALID_PARSERS:
            raise ValueError(
                "Invalid parser configured (got '%s' but need one of %s) product: '%s', target: '%s'"
                % (self.parser, self._VALID_PARSERS, self.product_name, self.target_name)
            )

    @staticmethod
    def _prepare_headers(headers):
        """Return a fresh header dict with the default Referer/DNT filled in.

        Working on a copy keeps instances from sharing one dict (the old
        ``headers={}`` default was mutated in place) and leaves the caller's
        mapping untouched.
        """
        prepared = dict(headers) if headers else {}
        prepared.setdefault('Referer', 'google.com')
        prepared.setdefault('DNT', '1')
        return prepared

    def query_target(self):
        """Fetch the URL and return the scraped value as a float.

        Raises:
            ScrapeError: If the selector matches nothing, the regex does not
                match the selected text, or the configured parser is unknown.
        """
        query_response = self.session.get(
            self.url,
            headers=self.headers,
        )
        logger.info('Status: %s', query_response.status_code)
        query_response_text = query_response.text
        logger.debug('Response: %s', query_response_text)

        # parse the response and match the selector
        selector_match = self._match_selector(query_response_text)
        if not selector_match:
            raise ScrapeError('Failed to match selector!')

        # match the regex and convert the result to float
        return self._extract_price(selector_match)

    def _match_selector(self, response_text):
        """Apply the configured selector to the response body; None when nothing matches."""
        if self.parser == 'html':
            # parse response as html
            return parsel.Selector(text=response_text).css(self.selector).get()
        if self.parser == 'json':
            # parse response as json
            matched = pyjq.first(self.selector, json.loads(response_text))
            # str(None) is the truthy string 'None', which would slip past the
            # caller's "no match" check, so keep None as None.
            return None if matched is None else str(matched)
        # Only reachable if self.parser was changed after construction.
        raise ScrapeError('Invalid parser!')

    def _extract_price(self, text):
        """Search ``text`` with the configured regex and return the match as a float."""
        regex_match = self.regex.search(text)
        if not regex_match:
            raise ScrapeError('Failed to match regex!')
        # Drop comma separators so float() accepts the number.
        return float(regex_match.group(0).replace(',', ''))
class ScrapeError(Exception):
    """Raised when a scrape target cannot be turned into a value."""

    def __init__(self, msg):
        # Pass the message straight to Exception so str(error) yields it.
        Exception.__init__(self, msg)