From bda9b433182f90d8388b560bd2405da854e414d7 Mon Sep 17 00:00:00 2001
From: Massaki Archambault
Date: Sun, 30 Oct 2022 14:20:59 -0400
Subject: [PATCH] json parser

---
 dealwatch/cli.py           | 15 +++++++++--
 dealwatch/scrape_target.py | 51 +++++++++++++++++++++++++++-----------
 setup.cfg                  |  1 +
 3 files changed, 51 insertions(+), 16 deletions(-)

diff --git a/dealwatch/cli.py b/dealwatch/cli.py
index c1e937b..c82d583 100644
--- a/dealwatch/cli.py
+++ b/dealwatch/cli.py
@@ -1,4 +1,5 @@
 import argparse
+import time
 
 import yaml
 
@@ -14,6 +15,12 @@ def main():
         type=str,
         default='dealwatch.yml',
     )
+    parser.add_argument(
+        '-i', '--interval',
+        help='The target scrape interval, in minutes. (default: %(default)s)',
+        type=float,
+        default=10,
+    )
     parser.add_argument(
         '--user-agent',
         help='The user-agent to spoof. (default: %(default)s)',
@@ -46,8 +53,11 @@
     # start the http server to server the prometheus metrics
     start_http_server(args.listen_port, args.listen_address)
 
-    for scrape_target in scrape_targets:
-        print(scrape_target.query_target())
+    # start the main loop
+    while True:
+        for scrape_target in scrape_targets:
+            print(scrape_target.query_target())
+        time.sleep(args.interval * 60)
 
 def parse_config(config_filename):
     result = []
@@ -70,6 +80,7 @@ def parse_config(config_filename):
             url=get_field_or_die(target, 'url'),
             selector=get_field_or_die(target, 'selector'),
             regex=target.get('regex'),
+            parser=target.get('parser'),
         ))
     return result
 
diff --git a/dealwatch/scrape_target.py b/dealwatch/scrape_target.py
index 71a20b5..b81d135 100644
--- a/dealwatch/scrape_target.py
+++ b/dealwatch/scrape_target.py
@@ -1,15 +1,19 @@
+from email import parser
+import json
 import re
 
 import httpx
 import parsel
+import pyjq
 
 class ScrapeTarget:
-    def __init__(self, product_name, target_name, url, selector, regex=None):
+    def __init__(self, product_name, target_name, url, selector, regex=None, parser=None):
         self.product_name = product_name
         self.target_name = target_name
         self.url = url
-        self.selector = selector+'::text'
+        self.selector = selector
         self.regex = re.compile(regex if regex else r'[0-9]+(\.[0-9]{2})?')
+        self.parser = parser if parser else 'html'
         self.headers = {}
 
     def query_target(self):
@@ -21,16 +25,35 @@ class ScrapeTarget:
             headers=self.headers,
             follow_redirects=True,
         ).text
 
-        selector = parsel.Selector(text=query_response)
-        # Match the selector
-        selector_match = selector.css(self.selector).get()
-        if selector_match:
-            # Match the regex
-            regex_match = self.regex.search(selector_match)
-            if regex_match:
-                str_result = regex_match.group(0)
-                # Convert the reult to float
-                float_result = float(str_result)
-                return float_result
-        return None
\ No newline at end of file
+        # parse the response and match the selector
+        selector_match = ''
+        if self.parser == 'html':
+            # parse response as html
+            selector = parsel.Selector(text=query_response)
+            selector_match = selector.css(self.selector).get()
+        elif self.parser == 'json':
+            # parse response as json
+            query_response_json = json.loads(query_response)
+            selector_match = str(pyjq.first(self.selector, query_response_json))
+        else:
+            # TODO: better error handling
+            print('invalid parser!')
+            return None
+
+        if not selector_match:
+            # TODO: better error handling
+            print('no selector_match!')
+            return None
+
+        # match the regex
+        regex_match = self.regex.search(selector_match)
+        if regex_match:
+            str_result = regex_match.group(0)
+            # convert the result to float
+            float_result = float(str_result)
+            return float_result
+        else:
+            # TODO: better error handling
+            print('no regex match!')
+            return None
\ No newline at end of file
diff --git a/setup.cfg b/setup.cfg
index 38d1319..1f2fd7d 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -13,6 +13,7 @@ install_requires=
     PyYAML~=6.0
     httpx~=0.23.0
     parsel~=1.6.0
+    pyjq~=2.6.0
     prometheus-client~=0.15.0
 
 [options.entry_points]