diff --git a/.gitignore b/.gitignore index 8dcae0b..14b95c9 100644 --- a/.gitignore +++ b/.gitignore @@ -202,4 +202,4 @@ tags [._]*.un~ ### Project-specific -dealwatch.yml \ No newline at end of file +webscraping-exporter.yml \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 3424c11..2f59fd9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,4 +2,4 @@ FROM python:3.10 COPY . /tmp/package RUN pip install --no-cache-dir /tmp/package && \ rm -r /tmp/package -ENTRYPOINT ["dealwatch"] \ No newline at end of file +ENTRYPOINT ["webscraping-exporter"] \ No newline at end of file diff --git a/README.md b/README.md index 677ed60..c5666c5 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,2 @@ -# dealwatch +# webscraping-exporter diff --git a/setup.cfg b/setup.cfg index 1f2fd7d..f9d5585 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [metadata] -name = dealwatch +name = webscraping-exporter author = badjware author_email = marchambault.badjware.dev platform = any @@ -18,6 +18,6 @@ install_requires= [options.entry_points] console_scripts = - dealwatch = dealwatch.cli:main + webscraping-exporter = webscraping_exporter.cli:main [tool.setuptools_scm] \ No newline at end of file diff --git a/dealwatch.example.yml b/webscraping-exporter.exemple.yml similarity index 100% rename from dealwatch.example.yml rename to webscraping-exporter.exemple.yml diff --git a/dealwatch/__init__.py b/webscraping_exporter/__init__.py similarity index 100% rename from dealwatch/__init__.py rename to webscraping_exporter/__init__.py diff --git a/dealwatch/cli.py b/webscraping_exporter/cli.py similarity index 68% rename from dealwatch/cli.py rename to webscraping_exporter/cli.py index 8b99c93..beb143b 100644 --- a/dealwatch/cli.py +++ b/webscraping_exporter/cli.py @@ -1,11 +1,28 @@ import argparse +from itertools import product import time import yaml -from prometheus_client import start_http_server +from prometheus_client import start_http_server, Gauge, Counter -from dealwatch.scrape_target import ScrapeTarget +from webscraping_exporter.scrape_target import ScrapeTarget + +WEBSCRAPING_SCRAPE_TARGET_VALUE = Gauge( + 'webscraping_scrape_target_value', + 'The value scraped from a scrape target', + ['product_name', 'target_name'], +) +WEBSCRAPING_SCRAPE_TARGET_SUCCESS = Counter( + 'webscraping_scrape_target_success_total', + 'The number of successful scrape and parse of a scrape target', + ['product_name', 'target_name'], +) +WEBSCRAPING_SCRAPE_TARGET_FAILURE = Counter( + 'webscraping_scrape_target_failure_total', + 'The number of failed scrape and parse of a scrape target', + ['product_name', 'target_name'], +) def main(): parser = argparse.ArgumentParser("An utility to scrape e-commerce product price and expose them as prometheus metrics") @@ -13,13 +30,13 @@ def main(): '-c', '--config', help='The configuration file. (default: %(default)s)', type=str, - default='dealwatch.yml', + default='webscraping-exporter.yml', ) parser.add_argument( '-i', '--interval', help='The target scrape interval, in minutes. (default: %(default)s)', type=float, - default=10, + default=15, ) parser.add_argument( '--user-agent', @@ -56,7 +73,12 @@ def main(): # start the main loop while True: for scrape_target in scrape_targets: - print(scrape_target.query_target()) + value = scrape_target.query_target() + if value is not None: + WEBSCRAPING_SCRAPE_TARGET_VALUE.labels(product_name=scrape_target.product_name,target_name=scrape_target.target_name).set(value) + WEBSCRAPING_SCRAPE_TARGET_SUCCESS.labels(product_name=scrape_target.product_name,target_name=scrape_target.target_name).inc() + else: + WEBSCRAPING_SCRAPE_TARGET_FAILURE.labels(product_name=scrape_target.product_name,target_name=scrape_target.target_name).inc() time.sleep(args.interval * 60) def parse_config(config_filename): diff --git a/dealwatch/scrape_target.py b/webscraping_exporter/scrape_target.py similarity index 96% rename from dealwatch/scrape_target.py rename to webscraping_exporter/scrape_target.py index 5907a47..8ba311c 100644 --- a/dealwatch/scrape_target.py +++ b/webscraping_exporter/scrape_target.py @@ -19,7 +19,7 @@ class ScrapeTarget: def query_target(self): print('Query product %s, target %s' % (self.product_name, self.target_name)) - # some sites get suspicious if we talk to them in HTTP/1.1 + # some sites get suspicious if we talk to them in HTTP/1.1 (maybe because it doesn't match our user-agent?) # we use httpx to have HTTP2 support and circumvent that issue query_response = httpx.get( url=self.url,