error management

2022-10-30 22:44:02 -04:00 · 2022-10-30 22:44:02 -04:00 · b38df4b298
parent 0e22353da8
commit b38df4b298
2 changed files with 35 additions and 20 deletions
--- a/webscraping_exporter/cli.py
+++ b/webscraping_exporter/cli.py
@@ -1,12 +1,13 @@
 import argparse
-from itertools import product
+import os
 import time

 import yaml

+from httpx import RequestError
 from prometheus_client import start_http_server, Gauge, Counter

-from webscraping_exporter.scrape_target import ScrapeTarget
+from webscraping_exporter.scrape_target import ScrapeError, ScrapeTarget

 WEBSCRAPING_SCRAPE_TARGET_VALUE = Gauge(
    'webscraping_scrape_target_value',
@@ -21,7 +22,7 @@ WEBSCRAPING_SCRAPE_TARGET_SUCCESS = Counter(
 WEBSCRAPING_SCRAPE_TARGET_FAILURE = Counter(
    'webscraping_scrape_target_failure_total',
    'The number of failed scrape and parse of a scrape target',
-    ['product_name', 'target_name'],
+    ['product_name', 'target_name', 'exception'],
 )

 def main():
@@ -58,7 +59,7 @@ def main():
    )

    args = parser.parse_args()
-    scrape_targets = parse_config(args.config)
+    scrape_targets = parse_config(os.path.abspath(args.config))

    # setup the headers for each scrape targets
    for scrape_target in scrape_targets:
@@ -73,12 +74,24 @@ def main():
    # start the main loop
    while True:
        for scrape_target in scrape_targets:
+            try:
+                print("Starting scrape. product: '%s', target '%s'" % (scrape_target.product_name, scrape_target.target_name))
                value = scrape_target.query_target()
-            if value is not None:
-                WEBSCRAPING_SCRAPE_TARGET_VALUE.labels(product_name=scrape_target.product_name,target_name=scrape_target.target_name).set(value)
-                WEBSCRAPING_SCRAPE_TARGET_SUCCESS.labels(product_name=scrape_target.product_name,target_name=scrape_target.target_name).inc()
-            else:
-                WEBSCRAPING_SCRAPE_TARGET_FAILURE.labels(product_name=scrape_target.product_name,target_name=scrape_target.target_name).inc()
+                WEBSCRAPING_SCRAPE_TARGET_VALUE.labels(
+                    product_name=scrape_target.product_name,
+                    target_name=scrape_target.target_name
+                ).set(value)
+                WEBSCRAPING_SCRAPE_TARGET_SUCCESS.labels(
+                    product_name=scrape_target.product_name,
+                    target_name=scrape_target.target_name,
+                ).inc()
+            except (RequestError, ScrapeError) as e:
+                print("Failed to scrape! product: '%s', target: '%s', message: '%s'" % (scrape_target.product_name, scrape_target.target_name, e))
+                WEBSCRAPING_SCRAPE_TARGET_FAILURE.labels(
+                    product_name=scrape_target.product_name,
+                    target_name=scrape_target.target_name,
+                    exception=e.__class__.__name__,
+                ).inc()
        time.sleep(args.interval * 60)

 def parse_config(config_filename):
--- a/webscraping_exporter/scrape_target.py
+++ b/webscraping_exporter/scrape_target.py
@@ -17,8 +17,12 @@ class ScrapeTarget:
        self.parser = parser if parser else 'html'
        self.headers = {}

+        # sanity check
+        valid_parsers = ('html', 'json')
+        if self.parser not in valid_parsers:
+            raise ValueError("Invalid parser configured (got '%s' but need one of %s) product: '%s', target: '%s'" % (self.parser, valid_parsers, self.product_name, self.target_name))
+
    def query_target(self):
-        print('Query product %s, target %s' % (self.product_name, self.target_name))
        # some sites get suspicious if we talk to them in HTTP/1.1 (maybe because it doesn't match our user-agent?)
        # we use httpx to have HTTP2 support and circumvent that issue
        query_response = httpx.get(
@@ -38,14 +42,10 @@ class ScrapeTarget:
            query_response_json = json.loads(query_response)
            selector_match = str(pyjq.first(self.selector, query_response_json))
        else:
-            # TODO: better error handling
-            print('invalid parser!')
-            return None
+            raise ScrapeError('Invalid parser!')

        if not selector_match:
-            # TODO: better error handling
-            print('no selector_match!')
-            return None
+            raise ScrapeError('Failed to match selector!')

        # match the regex
        regex_match = self.regex.search(selector_match)
@@ -55,6 +55,8 @@ class ScrapeTarget:
            float_result = float(str_result)
            return float_result
        else:
-            # TODO: better error handling
-            print('no regex match!')
-            return None
+            raise ScrapeError('Failed to match regex!')
+
+class ScrapeError(Exception):
+    def __init__(self, msg):
+        super().__init__(msg)