From 4a1f6d052c420942eb629bb40039450a214733e6 Mon Sep 17 00:00:00 2001
From: Massaki Archambault
Date: Sun, 30 Oct 2022 00:46:49 -0400
Subject: [PATCH] setup scraper

---
 dealwatch/cli.py           | 38 +++++++++++++++++++++++++++++++++++---
 dealwatch/scrape_target.py | 31 +++++++++++++++++++++++++++++--
 setup.cfg                  |  2 +-
 3 files changed, 65 insertions(+), 6 deletions(-)

diff --git a/dealwatch/cli.py b/dealwatch/cli.py
index cf63f14..c1e937b 100644
--- a/dealwatch/cli.py
+++ b/dealwatch/cli.py
@@ -2,20 +2,52 @@ import argparse
 
 import yaml
 
+from prometheus_client import start_http_server
+
 from dealwatch.scrape_target import ScrapeTarget
 
 def main():
-    parser = argparse.ArgumentParser("An utility to scrape e-commerce target price fluctuations")
+    parser = argparse.ArgumentParser("A utility to scrape e-commerce product prices and expose them as Prometheus metrics")
     parser.add_argument(
         '-c', '--config',
         help='The configuration file. (default: %(default)s)',
         type=str,
         default='dealwatch.yml',
     )
+    parser.add_argument(
+        '--user-agent',
+        help='The user-agent to spoof. (default: %(default)s)',
+        type=str,
+        default='Mozilla/5.0 (X11; Linux x86_64; rv:106.0) Gecko/20100101 Firefox/106.0',
+    )
+    parser.add_argument(
+        '-p', '--listen-port',
+        help='The listen port for the http server. (default: %(default)s)',
+        type=int,
+        default=8000,
+    )
+    parser.add_argument(
+        '-a', '--listen-address',
+        help='The listen address for the http server. (default: %(default)s)',
+        type=str,
+        default='0.0.0.0',
+    )
     args = parser.parse_args()
 
-    products = parse_config(args.config)
-    print(products)
+    scrape_targets = parse_config(args.config)
+
+    # set up the headers for each scrape target
+    for scrape_target in scrape_targets:
+        scrape_target.headers = {
+            'Accept': '*/*',
+            'User-Agent': args.user_agent,
+        }
+
+    # start the http server to serve the prometheus metrics
+    start_http_server(args.listen_port, args.listen_address)
+
+    for scrape_target in scrape_targets:
+        print(scrape_target.query_target())
 
 def parse_config(config_filename):
     result = []
diff --git a/dealwatch/scrape_target.py b/dealwatch/scrape_target.py
index d618119..71a20b5 100644
--- a/dealwatch/scrape_target.py
+++ b/dealwatch/scrape_target.py
@@ -1,9 +1,36 @@
 import re
 
+import httpx
+import parsel
+
 class ScrapeTarget:
     def __init__(self, product_name, target_name, url, selector, regex=None):
         self.product_name = product_name
         self.target_name = target_name
         self.url = url
-        self.selector = selector
-        self.regex = re.compile(regex if regex else r'[0-9]+(\.[0-9]{2})?')
\ No newline at end of file
+        self.selector = selector+'::text'
+        self.regex = re.compile(regex if regex else r'[0-9]+(\.[0-9]{2})?')
+        self.headers = {}
+
+    def query_target(self):
+        print('Query product %s, target %s' % (self.product_name, self.target_name))
+        # some sites get suspicious if we talk to them in HTTP/1.1
+        # we use httpx for its HTTP/2 support to circumvent that issue
+        query_response = httpx.get(
+            url=self.url,
+            headers=self.headers,
+            follow_redirects=True,
+        ).text
+        selector = parsel.Selector(text=query_response)
+
+        # Match the selector
+        selector_match = selector.css(self.selector).get()
+        if selector_match:
+            # Match the regex
+            regex_match = self.regex.search(selector_match)
+            if regex_match:
+                str_result = regex_match.group(0)
+                # Convert the result to a float
+                float_result = float(str_result)
+                return float_result
+        return None
\ No newline at end of file
diff --git a/setup.cfg b/setup.cfg
index ea44a1f..38d1319 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -11,7 +11,7 @@
 setup_requires = setuptools_scm
 install_requires=
     PyYAML~=6.0
-    requests~=2.28.1
+    httpx~=0.23.0
     parsel~=1.6.0
     prometheus-client~=0.15.0
 
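
A note on the HTTP/2 comment in query_target: the module-level httpx.get() helper speaks HTTP/1.1 only; HTTP/2 has to be enabled on an httpx.Client and needs the optional h2 dependency (installed via the httpx[http2] extra). Below is a minimal sketch of how the request could be made HTTP/2-capable, assuming the same headers dict that cli.py builds is passed in; the helper name fetch_page is illustrative and not part of the patch.

    import httpx

    def fetch_page(url: str, headers: dict) -> str:
        """Fetch a page, negotiating HTTP/2 when the server supports it."""
        # http2=True needs the optional 'h2' package: pip install 'httpx[http2]'
        with httpx.Client(http2=True, headers=headers, follow_redirects=True) as client:
            response = client.get(url)
            # response.http_version reports which protocol was actually negotiated
            return response.text

If that route is taken, the pin in setup.cfg would also need the extra, e.g. httpx[http2]~=0.23.0.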
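
The patch starts the Prometheus HTTP endpoint but, for now, only prints the scraped values. As a rough sketch of where this is headed, a price could eventually be published through prometheus_client along the lines below; the metric name and labels are assumptions for illustration, not something defined in the patch.

    from prometheus_client import Gauge

    # Hypothetical metric; name and labels are illustrative only.
    PRODUCT_PRICE = Gauge(
        'dealwatch_product_price',
        'Last scraped price of a product on a given target',
        ['product_name', 'target_name'],
    )

    def publish_price(product_name: str, target_name: str, price: float) -> None:
        # Record the latest scraped price so the /metrics endpoint can expose it.
        PRODUCT_PRICE.labels(product_name=product_name, target_name=target_name).set(price)

With something like this in place, the print(scrape_target.query_target()) loop in main() would feed the gauge instead of stdout.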