From 687ac4317d9c1bace651296f89ada8433327fdd6 Mon Sep 17 00:00:00 2001
From: Massaki Archambault
Date: Sat, 29 Oct 2022 23:35:36 -0400
Subject: [PATCH] configuration file parsing

---
Review notes (text here is not part of the commit message):
* cli.py: the parser text is passed as `description=`; given positionally,
  argparse takes it as `prog` and prints it in place of the program name.
* cli.py: an empty configuration file or a non-mapping entry is reported as
  a missing required field instead of failing with AttributeError.
* dealwatch.example.yml: rewritten to the `products` -> `targets` layout
  that parse_config() reads; the previous layout could not be loaded.
* scrape_target.py: added __repr__ so that print(products) is readable.
* Hunks were edited by hand: the blob ids on the `index` lines still refer
  to the original revision of this patch.

 .gitignore                 |  2 ++
 dealwatch.example.yml      |  7 ++++
 dealwatch/cli.py           | 74 ++++++++++++++++++++++++++++++++++++++
 dealwatch/main.py          |  5 ---
 dealwatch/scrape_target.py | 23 ++++++++++++
 setup.cfg                  |  5 ++-
 6 files changed, 110 insertions(+), 6 deletions(-)
 create mode 100644 dealwatch.example.yml
 create mode 100644 dealwatch/cli.py
 delete mode 100644 dealwatch/main.py
 create mode 100644 dealwatch/scrape_target.py

diff --git a/.gitignore b/.gitignore
index 8703878..8dcae0b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -201,3 +201,5 @@ tags
 # Persistent undo
 [._]*.un~
 
+### Project-specific
+dealwatch.yml
diff --git a/dealwatch.example.yml b/dealwatch.example.yml
new file mode 100644
index 0000000..4343e4e
--- /dev/null
+++ b/dealwatch.example.yml
@@ -0,0 +1,7 @@
+products:
+  - name: Intel i7-12700K
+    targets:
+      - name: amazon.ca
+        url: https://www.amazon.ca/Intel-i7-12700K-Desktop-Processor-Unlocked/dp/B09FXNVDBJ/?_encoding=UTF8&pd_rd_w=BXQyU&content-id=amzn1.sym.b09e9731-f0de-43db-b62a-8954bcec282c&pf_rd_p=b09e9731-f0de-43db-b62a-8954bcec282c&pf_rd_r=Z2HRQ8TYGA943PQFTW1Q&pd_rd_wg=AG2TD&pd_rd_r=e4766451-3584-4c4f-8235-bcd4a316909a&ref_=pd_gw_ci_mcx_mr_hp_atf_m
+        selector: .a-offscreen
+        regex: '[0-9]+(\.[0-9]{2})?'
diff --git a/dealwatch/cli.py b/dealwatch/cli.py
new file mode 100644
index 0000000..cf63f14
--- /dev/null
+++ b/dealwatch/cli.py
@@ -0,0 +1,74 @@
+import argparse
+
+import yaml
+
+from dealwatch.scrape_target import ScrapeTarget
+
+
+def main():
+    """Parse the command line, load the configuration and print the targets."""
+    # Pass the text as `description`: the first positional parameter of
+    # ArgumentParser is `prog`, which would replace the program name in usage.
+    parser = argparse.ArgumentParser(
+        description='A utility to scrape e-commerce target price fluctuations',
+    )
+    parser.add_argument(
+        '-c', '--config',
+        help='The configuration file. (default: %(default)s)',
+        type=str,
+        default='dealwatch.yml',
+    )
+
+    args = parser.parse_args()
+    products = parse_config(args.config)
+    print(products)
+
+
+def parse_config(config_filename):
+    """Load the YAML configuration and flatten it into ScrapeTarget objects.
+
+    The file must hold a `products` list. Every product needs a `name` and a
+    `targets` list; every target needs `name`, `url` and `selector` and may
+    set `regex`.
+
+    Returns a list with one ScrapeTarget per target of every product.
+    Raises Exception when a required field is missing.
+    """
+    result = []
+    print('Loading configurations from %s' % config_filename)
+    with open(config_filename, 'r', encoding='utf-8') as f:
+        # safe_load returns None for an empty document; use an empty mapping
+        # so the missing `products` field is reported like any other.
+        config = yaml.safe_load(f) or {}
+
+    # iterate through products listed in the configuration
+    products = get_field_or_die(config, 'products')
+    for product in products:
+        product_name = get_field_or_die(product, 'name')
+
+        # iterate through the targets listed for each product
+        targets = get_field_or_die(product, 'targets')
+        for target in targets:
+            # Create a ScrapeTarget for each target to scrape
+            result.append(ScrapeTarget(
+                product_name=product_name,
+                target_name=get_field_or_die(target, 'name'),
+                url=get_field_or_die(target, 'url'),
+                selector=get_field_or_die(target, 'selector'),
+                regex=target.get('regex'),
+            ))
+    return result
+
+
+def get_field_or_die(mapping, field_name):
+    """Return mapping[field_name]; raise Exception if it is absent or None."""
+    # A malformed entry (e.g. a bare string where a mapping is expected) has
+    # no usable fields; report it the same way as a missing field.
+    value = mapping.get(field_name) if isinstance(mapping, dict) else None
+    if value is None:
+        raise Exception('Missing required field: %s' % field_name)
+    return value
+
+
+if __name__ == '__main__':
+    main()
diff --git a/dealwatch/main.py b/dealwatch/main.py
deleted file mode 100644
index 59cb9cd..0000000
--- a/dealwatch/main.py
+++ /dev/null
@@ -1,5 +0,0 @@
-def main():
-    print("Hello world")
-
-if __name__ == '__main__':
-    main()
\ No newline at end of file
diff --git a/dealwatch/scrape_target.py b/dealwatch/scrape_target.py
new file mode 100644
index 0000000..d618119
--- /dev/null
+++ b/dealwatch/scrape_target.py
@@ -0,0 +1,23 @@
+import re
+
+# Pattern compiled when a target does not provide its own `regex`.
+DEFAULT_REGEX = r'[0-9]+(\.[0-9]{2})?'
+
+
+class ScrapeTarget:
+    """One configured target of a product: names, url, selector and regex."""
+
+    def __init__(self, product_name, target_name, url, selector, regex=None):
+        self.product_name = product_name
+        self.target_name = target_name
+        self.url = url
+        self.selector = selector
+        # A falsy regex (None or '') falls back to the default pattern.
+        self.regex = re.compile(regex if regex else DEFAULT_REGEX)
+
+    def __repr__(self):
+        # Readable output for print(), which cli.main uses on the parsed list.
+        return '%s(product_name=%r, target_name=%r, url=%r, selector=%r, regex=%r)' % (
+            type(self).__name__, self.product_name, self.target_name,
+            self.url, self.selector, self.regex.pattern,
+        )
diff --git a/setup.cfg b/setup.cfg
index 890f926..ea44a1f 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -10,10 +10,13 @@ setup_requires =
     setuptools
     setuptools_scm
 install_requires=
+    PyYAML~=6.0
+    requests~=2.28.1
+    parsel~=1.6.0
     prometheus-client~=0.15.0
 
 [options.entry_points]
 console_scripts =
-    dealwatch = dealwatch.main:main
+    dealwatch = dealwatch.cli:main
 
 [tool.setuptools_scm]
\ No newline at end of file