diff --git a/ecommerce_exporter/cli.py b/ecommerce_exporter/cli.py index 1c1d580..b296ca4 100644 --- a/ecommerce_exporter/cli.py +++ b/ecommerce_exporter/cli.py @@ -65,14 +65,7 @@ def main(): ) args = parser.parse_args() - scrape_targets = parse_config(os.path.abspath(args.config)) - - # setup the headers for each scrape targets - for scrape_target in scrape_targets: - scrape_target.headers = { - 'accept': '*/*', - 'user-agent': args.user_agent, - } + scrape_targets = parse_config(os.path.abspath(args.config), user_agent=args.user_agent) # start the http server to server the prometheus metrics logger.info("serving metrics on http://%s:%s/metrics", args.listen_address, args.listen_port) @@ -103,7 +96,7 @@ def main(): ).inc() time.sleep(args.interval * 60) -def parse_config(config_filename): +def parse_config(config_filename, user_agent): result = [] logger.info('Loading configurations from %s', config_filename) with open(config_filename, 'r') as f: @@ -125,6 +118,9 @@ def parse_config(config_filename): target_name=target.get('name'), regex=target.get('regex'), parser=target.get('parser'), + headers = { + 'User-Agent': user_agent, + }, )) return result diff --git a/ecommerce_exporter/scrape_target.py b/ecommerce_exporter/scrape_target.py index d29d7e7..1256e9c 100644 --- a/ecommerce_exporter/scrape_target.py +++ b/ecommerce_exporter/scrape_target.py @@ -3,25 +3,27 @@ import re from urllib.parse import urlparse -import httpx +# import httpx +import requests import parsel import pyjq import logging logger = logging.getLogger(__name__) class ScrapeTarget: - def __init__(self, product_name, url, selector, target_name=None, regex=None, parser=None): + def __init__(self, product_name, url, selector, target_name=None, regex=None, parser=None, headers={}): self.product_name = product_name self.target_name = target_name if target_name else urlparse(url).hostname self.url = url self.selector = selector - self.regex = re.compile(regex if regex else r'[0-9]+(\.[0-9]{2})?') + self.regex = re.compile(regex if regex else r'([0-9]+,?)+(\.[0-9]{2})?') self.parser = parser if parser else 'html' - self.headers = {} - self.client = httpx.Client( - follow_redirects=True, - http2=True, - ) + if 'Referer' not in headers: + headers['Referer'] = 'google.com' + if 'DNT' not in headers: + headers['DNT'] = '1' + self.headers = headers + self.session = requests.Session() # sanity check valid_parsers = ('html', 'json') @@ -29,13 +31,12 @@ class ScrapeTarget: raise ValueError("Invalid parser configured (got '%s' but need one of %s) product: '%s', target: '%s'" % (self.parser, valid_parsers, self.product_name, self.target_name)) def query_target(self): - # some sites get suspicious if we talk to them in HTTP/1.1 (maybe because it doesn't match our user-agent?) - # we use httpx to have HTTP2 support and circumvent that issue - query_response = self.client.get( + query_response = self.session.get( self.url, headers=self.headers, ) logger.info('Status: %s', query_response.status_code) + # self.client.cookies.update(query_response.cookies) query_response_text = query_response.text logger.debug('Response: %s', query_response_text) @@ -58,7 +59,7 @@ class ScrapeTarget: # match the regex regex_match = self.regex.search(selector_match) if regex_match: - str_result = regex_match.group(0) + str_result = regex_match.group(0).replace(',', '') # convert the result to float float_result = float(str_result) return float_result diff --git a/setup.cfg b/setup.cfg index 6aa03af..6108e81 100644 --- a/setup.cfg +++ b/setup.cfg @@ -16,7 +16,7 @@ setup_requires = setuptools_scm install_requires= PyYAML~=6.0 - httpx[http2]~=0.23.0 + requests~=2.32.0 parsel~=1.6.0 pyjq~=2.6.0 prometheus-client~=0.15.0