From b38df4b2984d321e834590bea29fcfb35663b540 Mon Sep 17 00:00:00 2001
From: Massaki Archambault <marchambault@badjware.dev>
Date: Sun, 30 Oct 2022 22:44:02 -0400
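Subject: [PATCH] error management

ScrapeTarget.query_target() now raises ScrapeError instead of printing a
message and returning None when the parser is invalid or when the selector
or the regex fails to match. The configured parser is also validated when
the ScrapeTarget is set up, raising ValueError for anything other than
'html' or 'json'.

The main loop catches httpx.RequestError and ScrapeError, logs the failure
and increments webscraping_scrape_target_failure_total, which gains an
'exception' label holding the exception class name. The "Starting scrape"
log line moves from query_target() to the main loop, and the config path
is resolved to an absolute path before it is parsed.
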
---
 webscraping_exporter/cli.py           | 33 +++++++++++++++++++--------
 webscraping_exporter/scrape_target.py | 22 ++++++++++--------
 2 files changed, 35 insertions(+), 20 deletions(-)

diff --git a/webscraping_exporter/cli.py b/webscraping_exporter/cli.py
index beb143b..7dcd02c 100644
--- a/webscraping_exporter/cli.py
+++ b/webscraping_exporter/cli.py
@@ -1,12 +1,13 @@
 import argparse
-from itertools import product
+import os
 import time
 
 import yaml
 
+from httpx import RequestError
 from prometheus_client import start_http_server, Gauge, Counter
 
-from webscraping_exporter.scrape_target import ScrapeTarget
+from webscraping_exporter.scrape_target import ScrapeError, ScrapeTarget
 
 WEBSCRAPING_SCRAPE_TARGET_VALUE = Gauge(
     'webscraping_scrape_target_value',
@@ -21,7 +22,7 @@ WEBSCRAPING_SCRAPE_TARGET_SUCCESS = Counter(
 WEBSCRAPING_SCRAPE_TARGET_FAILURE = Counter(
     'webscraping_scrape_target_failure_total',
     'The number of failed scrape and parse of a scrape target',
-    ['product_name', 'target_name'],
+    ['product_name', 'target_name', 'exception'],
 )
 
 def main():
@@ -58,7 +59,7 @@ def main():
     )
 
     args = parser.parse_args()
-    scrape_targets = parse_config(args.config)
+    scrape_targets = parse_config(os.path.abspath(args.config))
 
     # setup the headers for each scrape targets
     for scrape_target in scrape_targets:
@@ -73,12 +74,24 @@ def main():
     # start the main loop
     while True:
         for scrape_target in scrape_targets:
-            value = scrape_target.query_target()
-            if value is not None:
-                WEBSCRAPING_SCRAPE_TARGET_VALUE.labels(product_name=scrape_target.product_name,target_name=scrape_target.target_name).set(value)
-                WEBSCRAPING_SCRAPE_TARGET_SUCCESS.labels(product_name=scrape_target.product_name,target_name=scrape_target.target_name).inc()
-            else:
-                WEBSCRAPING_SCRAPE_TARGET_FAILURE.labels(product_name=scrape_target.product_name,target_name=scrape_target.target_name).inc()
+            # resolve the label values once; the logs and all three metrics reuse them
+            product, target = scrape_target.product_name, scrape_target.target_name
+            print("Starting scrape. product: '%s', target '%s'" % (product, target))
+            # only the scrape itself is guarded, the bookkeeping happens outside the try
+            try:
+                value = scrape_target.query_target()
+            except (RequestError, ScrapeError) as e:
+                # the exception label records which kind of error made the scrape fail
+                print("Failed to scrape! product: '%s', target: '%s', message: '%s'" % (product, target, e))
+                WEBSCRAPING_SCRAPE_TARGET_FAILURE.labels(
+                    product_name=product,
+                    target_name=target,
+                    exception=e.__class__.__name__,
+                ).inc()
+            else:
+                # success path: publish the scraped value, then count the scrape
+                WEBSCRAPING_SCRAPE_TARGET_VALUE.labels(product_name=product, target_name=target).set(value)
+                WEBSCRAPING_SCRAPE_TARGET_SUCCESS.labels(product_name=product, target_name=target).inc()
         time.sleep(args.interval * 60)
 
 def parse_config(config_filename):
diff --git a/webscraping_exporter/scrape_target.py b/webscraping_exporter/scrape_target.py
index 8ba311c..ffce4e6 100644
--- a/webscraping_exporter/scrape_target.py
+++ b/webscraping_exporter/scrape_target.py
@@ -17,8 +17,12 @@ class ScrapeTarget:
         self.parser = parser if parser else 'html'
         self.headers = {}
 
+        # fail fast: an unsupported parser would otherwise make every scrape fail
+        supported_parsers = ('html', 'json')
+        if self.parser not in supported_parsers:
+            raise ValueError("Invalid parser configured (got '%s' but need one of %s) product: '%s', target: '%s'" % (self.parser, supported_parsers, self.product_name, self.target_name))
+
     def query_target(self):
-        print('Query product %s, target %s' % (self.product_name, self.target_name))
         # some sites get suspicious if we talk to them in HTTP/1.1 (maybe because it doesn't match our user-agent?)
         # we use httpx to have HTTP2 support and circumvent that issue
         query_response = httpx.get(
@@ -38,14 +42,10 @@ class ScrapeTarget:
             query_response_json = json.loads(query_response)
             selector_match = str(pyjq.first(self.selector, query_response_json))
         else:
-            # TODO: better error handling
-            print('invalid parser!')
-            return None
+            raise ScrapeError('Invalid parser!')
 
         if not selector_match:
-            # TODO: better error handling
-            print('no selector_match!')
-            return None
+            raise ScrapeError('Failed to match selector!')
 
         # match the regex
         regex_match = self.regex.search(selector_match)
@@ -55,6 +55,8 @@ class ScrapeTarget:
             float_result = float(str_result)
             return float_result
         else:
-            # TODO: better error handling
-            print('no regex match!')
-            return None
\ No newline at end of file
+            raise ScrapeError('Failed to match regex!')
+
+
+class ScrapeError(Exception):
+    """Raised by ScrapeTarget.query_target when no value can be extracted (invalid parser, selector or regex mismatch)."""