rename project to webscraping-exporter
parent 51968e443a
commit 0e22353da8
.gitignore
@@ -202,4 +202,4 @@ tags
 [._]*.un~

 ### Project-specific
-dealwatch.yml
+webscraping-exporter.yml
Dockerfile
@@ -2,4 +2,4 @@ FROM python:3.10
 COPY . /tmp/package
 RUN pip install --no-cache-dir /tmp/package && \
     rm -r /tmp/package
-ENTRYPOINT ["dealwatch"]
+ENTRYPOINT ["webscraping-exporter"]
setup.cfg
@@ -1,5 +1,5 @@
 [metadata]
-name = dealwatch
+name = webscraping-exporter
 author = badjware
 author_email = marchambault.badjware.dev
 platform = any
@@ -18,6 +18,6 @@ install_requires=

 [options.entry_points]
 console_scripts =
-    dealwatch = dealwatch.cli:main
+    webscraping-exporter = webscraping_exporter.cli:main

 [tool.setuptools_scm]
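The console_scripts change above means the installed command is renamed from dealwatch to webscraping-exporter, with the target moved into the webscraping_exporter package. After a pip install ., the mapping can be verified through importlib.metadata; a hedged sketch (the group and entry point name come from the hunk above, the rest is standard library, Python 3.10+):

    # Hedged sketch: inspect the installed console_scripts entry point.
    from importlib.metadata import entry_points

    for ep in entry_points(group='console_scripts'):
        if ep.name == 'webscraping-exporter':
            print(ep.value)  # expected: webscraping_exporter.cli:main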
dealwatch/cli.py → webscraping_exporter/cli.py
@@ -1,11 +1,28 @@
 import argparse
+from itertools import product
 import time

 import yaml

-from prometheus_client import start_http_server
+from prometheus_client import start_http_server, Gauge, Counter

-from dealwatch.scrape_target import ScrapeTarget
+from webscraping_exporter.scrape_target import ScrapeTarget

+WEBSCRAPING_SCRAPE_TARGET_VALUE = Gauge(
+    'webscraping_scrape_target_value',
+    'The value scraped from a scrape target',
+    ['product_name', 'target_name'],
+)
+WEBSCRAPING_SCRAPE_TARGET_SUCCESS = Counter(
+    'webscraping_scrape_target_success_total',
+    'The number of successful scrape and parse of a scrape target',
+    ['product_name', 'target_name'],
+)
+WEBSCRAPING_SCRAPE_TARGET_FAILURE = Counter(
+    'webscraping_scrape_target_failure_total',
+    'The number of failed scrape and parse of a scrape target',
+    ['product_name', 'target_name'],
+)
+
 def main():
     parser = argparse.ArgumentParser("An utility to scrape e-commerce product price and expose them as prometheus metrics")
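For context on the metric objects introduced above: prometheus_client registers Gauge and Counter instances in a process-global registry when they are constructed, and start_http_server exposes the whole registry over HTTP. A minimal sketch of that pattern (the port and label values below are placeholders, not taken from this commit):

    # Minimal sketch of the prometheus_client pattern used above.
    from prometheus_client import start_http_server, Gauge

    VALUE = Gauge(
        'webscraping_scrape_target_value',
        'The value scraped from a scrape target',
        ['product_name', 'target_name'],
    )

    start_http_server(8000)  # serves http://localhost:8000/metrics on a daemon thread
    VALUE.labels(product_name='example', target_name='example-store').set(19.99)
    # a real exporter keeps the process alive, as the commit's main loop does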
@@ -13,13 +30,13 @@ def main():
         '-c', '--config',
         help='The configuration file. (default: %(default)s)',
         type=str,
-        default='dealwatch.yml',
+        default='webscraping-exporter.yml',
     )
     parser.add_argument(
         '-i', '--interval',
         help='The target scrape interval, in minutes. (default: %(default)s)',
         type=float,
-        default=10,
+        default=15,
     )
     parser.add_argument(
         '--user-agent',
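One detail worth noting in the flags above: %(default)s in a help string is interpolated by argparse when help text is rendered, so the bumped interval default of 15 shows up in --help automatically. A hedged sketch of the same argument handling (ArgumentParser's first positional parameter is prog, so description= is passed explicitly here; the test value fed to parse_args is illustrative):

    # Hedged sketch of the argument handling above.
    import argparse

    parser = argparse.ArgumentParser(
        description='Scrape e-commerce product prices and expose them as Prometheus metrics')
    parser.add_argument('-c', '--config', type=str, default='webscraping-exporter.yml',
                        help='The configuration file. (default: %(default)s)')
    parser.add_argument('-i', '--interval', type=float, default=15,
                        help='The target scrape interval, in minutes. (default: %(default)s)')

    args = parser.parse_args(['--interval', '5'])
    print(args.config, args.interval)  # webscraping-exporter.yml 5.0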
@@ -56,7 +73,12 @@ def main():
     # start the main loop
     while True:
         for scrape_target in scrape_targets:
-            print(scrape_target.query_target())
+            value = scrape_target.query_target()
+            if value is not None:
+                WEBSCRAPING_SCRAPE_TARGET_VALUE.labels(product_name=scrape_target.product_name,target_name=scrape_target.target_name).set(value)
+                WEBSCRAPING_SCRAPE_TARGET_SUCCESS.labels(product_name=scrape_target.product_name,target_name=scrape_target.target_name).inc()
+            else:
+                WEBSCRAPING_SCRAPE_TARGET_FAILURE.labels(product_name=scrape_target.product_name,target_name=scrape_target.target_name).inc()
         time.sleep(args.interval * 60)

 def parse_config(config_filename):
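The loop above keys every metric update by (product_name, target_name). In prometheus_client, .labels(...) returns a child time series for that label combination, created on first use and reused on every later call, which is why successes and failures can accumulate independently per target. A small sketch of that labels behaviour (metric and label names here are illustrative, not from the commit):

    # Hedged sketch: .labels(...) creates or reuses one child per label combination.
    from prometheus_client import Counter

    FAILURES = Counter('demo_scrape_failure_total', 'demo counter', ['target_name'])

    FAILURES.labels(target_name='example-store').inc()  # child created, value 1
    FAILURES.labels(target_name='example-store').inc()  # same child, value 2
    FAILURES.labels(target_name='other-store').inc()    # separate child, value 1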
dealwatch/scrape_target.py → webscraping_exporter/scrape_target.py
@@ -19,7 +19,7 @@ class ScrapeTarget:

     def query_target(self):
         print('Query product %s, target %s' % (self.product_name, self.target_name))
-        # some sites get suspicious if we talk to them in HTTP/1.1
+        # some sites get suspicious if we talk to them in HTTP/1.1 (maybe because it doesn't match our user-agent?)
         # we use httpx to have HTTP2 support and circumvent that issue
         query_response = httpx.get(
             url=self.url,
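On the HTTP/2 comment above: httpx only negotiates HTTP/2 through a Client constructed with http2=True (which needs the h2 package, installable via pip install httpx[http2]); the module-level httpx.get helper speaks HTTP/1.1. A hedged sketch of the Client form (the URL and user-agent are placeholders):

    # Hedged sketch: HTTP/2 in httpx requires a Client with http2=True.
    import httpx

    with httpx.Client(http2=True, headers={'user-agent': 'Mozilla/5.0'}) as client:
        response = client.get('https://example.com/product')
        print(response.http_version)  # 'HTTP/2' when the server negotiates it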