1
0
Fork 0

Compare commits

..

10 Commits

Author SHA1 Message Date
Massaki Archambault 4f0d37224b use bullseye in docker image 2024-10-10 00:19:50 -04:00
Massaki Archambault a0b13f2fe7 drop httpx in favour of requests 2024-10-09 22:37:34 -04:00
Massaki Archambault 6a13728220 improve logging 2024-10-09 19:11:11 -04:00
Massaki Archambault bfd9a94cac do not crash on unhandled exception 2024-10-09 18:21:30 -04:00
Massaki Archambault 65ce1d3be1 improve readme
continuous-integration/drone/push Build is passing Details
2022-11-06 14:37:28 -05:00
Massaki Archambault b94844e694 fetch tags
continuous-integration/drone/tag Build is passing Details
continuous-integration/drone/push Build is passing Details
2022-11-06 01:26:30 -04:00
Massaki Archambault 14c73b99f6 fix typo
continuous-integration/drone/tag Build is failing Details
continuous-integration/drone/push Build is passing Details
2022-11-06 01:18:54 -04:00
Massaki Archambault fc40e2df46 fix python docker image tag
continuous-integration/drone/push Build is passing Details
continuous-integration/drone/tag Build was killed Details
2022-11-06 00:56:09 -04:00
Massaki Archambault 311152793b plugins/python is bugged so do the upload to pypi manually
continuous-integration/drone/push Build was killed Details
continuous-integration/drone/tag Build was killed Details
2022-11-06 00:42:53 -04:00
Massaki Archambault a919efa17f fix typo
continuous-integration/drone/tag Build was killed Details
2022-11-06 00:29:59 -04:00
7 changed files with 86 additions and 41 deletions

View File

@ -80,10 +80,14 @@ trigger:
- tag
steps:
- name: publish pypi
- name: fetch tags
image: alpine/git
commands:
- git fetch --tags
- name: publish pypi
image: plugins/pypi
settings:
username:
from_secret: pypi_username
password:
from_secret: pypi_username
from_secret: pypi_password

1
.python-version Normal file
View File

@ -0,0 +1 @@
3.10

View File

@ -1,4 +1,4 @@
FROM python:3.10
FROM python:3.10-bullseye
COPY . /tmp/package
RUN pip install --no-cache-dir /tmp/package && \
rm -r /tmp/package

View File

@ -4,6 +4,8 @@ ecommerce-exporter is a [prometheus](https://prometheus.io/) exporter that websc
## Install
### Using docker
aarch64 and amd64 docker images are available on [docker hub](https://hub.docker.com/r/badjware/ecommerce-exporter). You can pull them using:
``` sh
docker pull badjware/ecommerce-exporter
@ -11,6 +13,13 @@ docker pull badjware/ecommerce-exporter
This is the recommended way of running the exporter.
### Using pip
Alternatively, if you prefer to avoid having to use docker, you can install ecommerce-exporter as a standard python package.
``` sh
pip install ecommerce-exporter
```
## Usage
Download the [example configuration file](ecommerce-exporter.example.yml) and edit it to configure the e-commerce sites you wish to scrape. You can configure multiple products and multiple targets in the same configuration file.
@ -49,7 +58,7 @@ options:
Finding the correct value for a selector will require some effort. Once you find the correct selector to use, you should be able to use the same one across the whole site.
## html parser
### html parser
The general procedure to figure out the selector for a site using an html parser is as follows:
1. Open up the product page in your browser.
@ -67,7 +76,7 @@ Below is a table with examples of some CSS selectors that match the html element
| canadacomputer.com | `.price-show-panel .h2-big strong::text` |
| memoryexpress.com | `.GrandTotal` |
## json parser
### json parser
The general procedure to figure out the selector for a site using a json parser is as follows:
1. Open up the development tool of your browser using the F12 key.
@ -84,3 +93,20 @@ Below is a table with examples of some jq selectors that match the json field co
| --- | --- | --- |
| newegg.ca | `.MainItem.UnitCost` | https://www.newegg.ca/product/api/ProductRealtime?ItemNumber=19-118-343&RecommendItem=&BestSellerItemList=9SIAA4YGC82324%2C9SIADGEGMY7603%2C9SIAVH1J0A6685&IsVATPrice=true |
| bestbuy.ca | `.[] \| .salePrice,.regularPrice` | https://www.bestbuy.ca/api/offers/v1/products/15778672/offers |
## Developing
Setup a virtualenv and activate it:
``` sh
python -m venv env
source env/bin/activate
```
Setup the project as an editable install:
``` sh
pip install -e .
```
You can now run the exporter using `ecommerce-exporter` while your virtualenv is active.
Happy hacking!

View File

@ -1,13 +1,19 @@
import argparse
import os
import time
import logging
import yaml
from httpx import RequestError
from prometheus_client import start_http_server, Gauge, Counter
from ecommerce_exporter.scrape_target import ScrapeError, ScrapeTarget
from ecommerce_exporter.scrape_target import ScrapeTarget
logging.basicConfig(
format=os.environ.get('LOG_FORMAT', '[%(asctime)s] [%(levelname)-8s] %(message)s'),
level=os.environ.get('LOG_LEVEL', 'INFO')
)
logger = logging.getLogger(__name__)
ECOMMERCE_SCRAPE_TARGET_VALUE = Gauge(
'ecommerce_scrape_target_value',
@ -43,7 +49,7 @@ def main():
'--user-agent',
help='The user-agent to spoof. (default: %(default)s)',
type=str,
default='Mozilla/5.0 (X11; Linux x86_64; rv:106.0) Gecko/20100101 Firefox/106.0',
default='Mozilla/5.0 (X11; Linux x86_64; rv:131.0) Gecko/20100101 Firefox/131.0',
)
parser.add_argument(
'-p', '--listen-port',
@ -59,23 +65,17 @@ def main():
)
args = parser.parse_args()
scrape_targets = parse_config(os.path.abspath(args.config))
# setup the headers for each scrape targets
for scrape_target in scrape_targets:
scrape_target.headers = {
'Accept': '*/*',
'User-Agent': args.user_agent,
}
scrape_targets = parse_config(os.path.abspath(args.config), user_agent=args.user_agent)
# start the http server to server the prometheus metrics
logger.info("serving metrics on http://%s:%s/metrics", args.listen_address, args.listen_port)
start_http_server(args.listen_port, args.listen_address)
# start the main loop
while True:
for scrape_target in scrape_targets:
try:
print("Starting scrape. product: '%s', target '%s'" % (scrape_target.product_name, scrape_target.target_name))
logger.info("Starting scrape. product: '%s', target '%s'", scrape_target.product_name, scrape_target.target_name)
value = scrape_target.query_target()
ECOMMERCE_SCRAPE_TARGET_VALUE.labels(
product_name=scrape_target.product_name,
@ -85,8 +85,10 @@ def main():
product_name=scrape_target.product_name,
target_name=scrape_target.target_name,
).inc()
except (RequestError, ScrapeError) as e:
print("Failed to scrape! product: '%s', target: '%s', message: '%s'" % (scrape_target.product_name, scrape_target.target_name, e))
except KeyboardInterrupt:
return
except Exception as e:
logger.error("Failed to scrape! product: '%s', target: '%s', message: '%s'" , scrape_target.product_name, scrape_target.target_name, e)
ECOMMERCE_SCRAPE_TARGET_FAILURE.labels(
product_name=scrape_target.product_name,
target_name=scrape_target.target_name,
@ -94,9 +96,9 @@ def main():
).inc()
time.sleep(args.interval * 60)
def parse_config(config_filename):
def parse_config(config_filename, user_agent):
result = []
print('Loading configurations from %s' % config_filename)
logger.info('Loading configurations from %s', config_filename)
with open(config_filename, 'r') as f:
config = yaml.safe_load(f)
@ -116,6 +118,9 @@ def parse_config(config_filename):
target_name=target.get('name'),
regex=target.get('regex'),
parser=target.get('parser'),
headers = {
'User-Agent': user_agent,
},
))
return result

View File

@ -3,19 +3,27 @@ import re
from urllib.parse import urlparse
import httpx
# import httpx
import requests
import parsel
import pyjq
import logging
logger = logging.getLogger(__name__)
class ScrapeTarget:
def __init__(self, product_name, url, selector, target_name=None, regex=None, parser=None):
def __init__(self, product_name, url, selector, target_name=None, regex=None, parser=None, headers={}):
self.product_name = product_name
self.target_name = target_name if target_name else urlparse(url).hostname
self.url = url
self.selector = selector
self.regex = re.compile(regex if regex else r'[0-9]+(\.[0-9]{2})?')
self.regex = re.compile(regex if regex else r'([0-9]+,?)+(\.[0-9]{2})?')
self.parser = parser if parser else 'html'
self.headers = {}
if 'Referer' not in headers:
headers['Referer'] = 'google.com'
if 'DNT' not in headers:
headers['DNT'] = '1'
self.headers = headers
self.session = requests.Session()
# sanity check
valid_parsers = ('html', 'json')
@ -23,23 +31,24 @@ class ScrapeTarget:
raise ValueError("Invalid parser configured (got '%s' but need one of %s) product: '%s', target: '%s'" % (self.parser, valid_parsers, self.product_name, self.target_name))
def query_target(self):
# some sites get suspicious if we talk to them in HTTP/1.1 (maybe because it doesn't match our user-agent?)
# we use httpx to have HTTP2 support and circumvent that issue
query_response = httpx.get(
url=self.url,
query_response = self.session.get(
self.url,
headers=self.headers,
follow_redirects=True,
).text
)
logger.info('Status: %s', query_response.status_code)
# self.client.cookies.update(query_response.cookies)
query_response_text = query_response.text
logger.debug('Response: %s', query_response_text)
# parse the response and match the selector
selector_match = ''
if self.parser == 'html':
# parse response as html
selector = parsel.Selector(text=query_response)
selector = parsel.Selector(text=query_response_text)
selector_match = selector.css(self.selector).get()
elif self.parser == 'json':
# parse response as json
query_response_json = json.loads(query_response)
query_response_json = json.loads(query_response_text)
selector_match = str(pyjq.first(self.selector, query_response_json))
else:
raise ScrapeError('Invalid parser!')
@ -50,7 +59,7 @@ class ScrapeTarget:
# match the regex
regex_match = self.regex.search(selector_match)
if regex_match:
str_result = regex_match.group(0)
str_result = regex_match.group(0).replace(',', '')
# convert the result to float
float_result = float(str_result)
return float_result

View File

@ -3,7 +3,7 @@ name = ecommerce-exporter
description = ecommerce-exporter is a prometheus exporter that exports the price of products on e-commerce sites as prometheus metrics.
url = https://code.badjware.dev/badjware/ecommerce-exporter
author = badjware
author_email = marchambault.badjware.dev
author_email = marchambault@badjware.dev
licence = MIT Licence
classifers =
Programming Language :: Python
@ -16,7 +16,7 @@ setup_requires =
setuptools_scm
install_requires=
PyYAML~=6.0
httpx~=0.23.0
requests~=2.32.0
parsel~=1.6.0
pyjq~=2.6.0
prometheus-client~=0.15.0