Compare commits

8 Commits:

- 4f0d37224b
- a0b13f2fe7
- 6a13728220
- bfd9a94cac
- 65ce1d3be1
- b94844e694
- 14c73b99f6
- fc40e2df46
.drone.yml (20 changed lines)

```diff
@@ -80,14 +80,14 @@ trigger:
   - tag
 
 steps:
-  - name: publish pypi
-    image: python:10
+  - name: fetch tags
+    image: alpine/git
     commands:
-      - pip install twine
-      - python setup.py sdist
-      - twine upload dist/*
-    environment:
-      TWINE_USERNAME:
-        from_secret: pypi_username
-      TWINE_PASSWORD:
-        from_secret: pypi_username
+      - git fetch --tags
+  - name: publish pypi
+    image: plugins/pypi
+    settings:
+      username:
+        from_secret: pypi_username
+      password:
+        from_secret: pypi_password
```
```diff
@@ -0,0 +1 @@
+3.10
```

```diff
@@ -1,4 +1,4 @@
-FROM python:3.10
+FROM python:3.10-bullseye
 COPY . /tmp/package
 RUN pip install --no-cache-dir /tmp/package && \
     rm -r /tmp/package
```
README.md (30 changed lines)

````diff
@@ -4,6 +4,8 @@ ecommerce-exporter is a [prometheus](https://prometheus.io/) exporter that websc
 
 ## Install
 
+### Using docker
+
 An aarch64 and an amd64 docker images are available on [docker hub](https://hub.docker.com/r/badjware/ecommerce-exporter). You can pull it using:
 ``` sh
 docker pull badjware/ecommerce-exporter
@@ -11,6 +13,13 @@ docker pull badjware/ecommerce-exporter
 
 This is the recommended way of running the exporter.
 
+### Using pip
+
+Alternatively, if you prefer to avoid having to use docker, you can install ecommerce-exporter as a standard python package.
+``` sh
+pip install ecommerce-exporter
+```
+
 ## Usage
 
 Download the [example configuration file](ecommerce-exporter.example.yml) and edit it to configure the e-commerce sites you wish to scrape. You can configure multiple products and multiple targets in the same configuration file.
@@ -49,7 +58,7 @@ options:
 
 Finding the correct value for a selector will require some effort. Once you find the correct selector to use, you should be able to use the same one across the whole site.
 
-## html parser
+### html parser
 
 The general procedure to figure out the selector for a site using an html parser is as follow:
 1. Open up the product page in your browser.
@@ -67,7 +76,7 @@ Below is a table with examples of some CSS selectors that match the html element
 | canadacomputer.com | `.price-show-panel .h2-big strong::text` |
 | memoryexpress.com | `.GrandTotal` |
 
-## json parser
+### json parser
 
 The general procedure to figure out the selector for a site using an json parser is as follow:
 1. Open up the development tool of your browser using the F12 key.
@@ -84,3 +93,20 @@ Below is a table with examples of some jq selectors that match the json field co
 | --- | --- | --- |
 | newegg.ca | `.MainItem.UnitCost` | https://www.newegg.ca/product/api/ProductRealtime?ItemNumber=19-118-343&RecommendItem=&BestSellerItemList=9SIAA4YGC82324%2C9SIADGEGMY7603%2C9SIAVH1J0A6685&IsVATPrice=true |
 | bestbuy.ca | `.[] \| .salePrice,.regularPrice` | https://www.bestbuy.ca/api/offers/v1/products/15778672/offers |
+
+## Developing
+
+Setup a virtualenv and activate it:
+``` sh
+python -m venv env
+source env/bin/activate
+```
+
+Setup the project as an editable install:
+``` sh
+pip install -e .
+```
+
+You can now run the exporter using `ecommerce-exporter` while your virtualenv is active.
+
+Happy hacking!
````
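As a quick illustration of the selector mechanics the README section above describes, here is a hedged sketch of both parser modes using the same parsel and pyjq libraries the exporter itself imports; the HTML snippet and JSON payload are invented stand-ins, not real responses from these sites:

```python
# Hedged sketch of the two selector styles the README describes.
# The markup and payload below are made-up examples, not real site responses.
import json

import parsel  # CSS selectors, used by the html parser
import pyjq    # jq expressions, used by the json parser

# html parser: a `::text` CSS selector extracts the text of the price element
html = '<div class="GrandTotal">$1,299.99</div>'
print(parsel.Selector(text=html).css('.GrandTotal::text').get())  # $1,299.99

# json parser: a jq path extracts the price field from an API response
payload = json.loads('{"MainItem": {"UnitCost": 1299.99}}')
print(pyjq.first('.MainItem.UnitCost', payload))  # 1299.99
```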
```diff
@@ -1,13 +1,19 @@
 import argparse
 import os
 import time
+import logging
 
 import yaml
 
-from httpx import RequestError
 from prometheus_client import start_http_server, Gauge, Counter
 
-from ecommerce_exporter.scrape_target import ScrapeError, ScrapeTarget
+from ecommerce_exporter.scrape_target import ScrapeTarget
+
+logging.basicConfig(
+    format=os.environ.get('LOG_FORMAT', '[%(asctime)s] [%(levelname)-8s] %(message)s'),
+    level=os.environ.get('LOG_LEVEL', 'INFO')
+)
+logger = logging.getLogger(__name__)
 
 ECOMMERCE_SCRAPE_TARGET_VALUE = Gauge(
     'ecommerce_scrape_target_value',
@@ -43,7 +49,7 @@ def main():
         '--user-agent',
         help='The user-agent to spoof. (default: %(default)s)',
         type=str,
-        default='Mozilla/5.0 (X11; Linux x86_64; rv:106.0) Gecko/20100101 Firefox/106.0',
+        default='Mozilla/5.0 (X11; Linux x86_64; rv:131.0) Gecko/20100101 Firefox/131.0',
     )
     parser.add_argument(
         '-p', '--listen-port',
@@ -59,23 +65,17 @@ def main():
     )
 
     args = parser.parse_args()
-    scrape_targets = parse_config(os.path.abspath(args.config))
-
-    # setup the headers for each scrape targets
-    for scrape_target in scrape_targets:
-        scrape_target.headers = {
-            'Accept': '*/*',
-            'User-Agent': args.user_agent,
-        }
+    scrape_targets = parse_config(os.path.abspath(args.config), user_agent=args.user_agent)
 
     # start the http server to server the prometheus metrics
+    logger.info("serving metrics on http://%s:%s/metrics", args.listen_address, args.listen_port)
     start_http_server(args.listen_port, args.listen_address)
 
     # start the main loop
     while True:
         for scrape_target in scrape_targets:
             try:
-                print("Starting scrape. product: '%s', target '%s'" % (scrape_target.product_name, scrape_target.target_name))
+                logger.info("Starting scrape. product: '%s', target '%s'", scrape_target.product_name, scrape_target.target_name)
                 value = scrape_target.query_target()
                 ECOMMERCE_SCRAPE_TARGET_VALUE.labels(
                     product_name=scrape_target.product_name,
@@ -85,8 +85,10 @@ def main():
                     product_name=scrape_target.product_name,
                     target_name=scrape_target.target_name,
                 ).inc()
-            except (RequestError, ScrapeError) as e:
-                print("Failed to scrape! product: '%s', target: '%s', message: '%s'" % (scrape_target.product_name, scrape_target.target_name, e))
+            except KeyboardInterrupt:
+                return
+            except Exception as e:
+                logger.error("Failed to scrape! product: '%s', target: '%s', message: '%s'", scrape_target.product_name, scrape_target.target_name, e)
                 ECOMMERCE_SCRAPE_TARGET_FAILURE.labels(
                     product_name=scrape_target.product_name,
                     target_name=scrape_target.target_name,
@@ -94,9 +96,9 @@ def main():
                 ).inc()
         time.sleep(args.interval * 60)
 
-def parse_config(config_filename):
+def parse_config(config_filename, user_agent):
     result = []
-    print('Loading configurations from %s' % config_filename)
+    logger.info('Loading configurations from %s', config_filename)
     with open(config_filename, 'r') as f:
         config = yaml.safe_load(f)
 
@@ -116,6 +118,9 @@ def parse_config(config_filename):
                 target_name=target.get('name'),
                 regex=target.get('regex'),
                 parser=target.get('parser'),
+                headers = {
+                    'User-Agent': user_agent,
+                },
             ))
     return result
 
```
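The change above swaps print() calls for a logger configured entirely from the environment. A minimal standalone sketch of that pattern, with the diff's defaults and a hypothetical config path:

```python
# Hedged sketch of the env-driven logging introduced above; the LOG_FORMAT and
# LOG_LEVEL defaults mirror the diff, and the path is a hypothetical example.
import logging
import os

logging.basicConfig(
    format=os.environ.get('LOG_FORMAT', '[%(asctime)s] [%(levelname)-8s] %(message)s'),
    level=os.environ.get('LOG_LEVEL', 'INFO'),
)
logger = logging.getLogger(__name__)

logger.info('Loading configurations from %s', '/tmp/ecommerce-exporter.yml')
logger.debug('only emitted when LOG_LEVEL=DEBUG is set')
```

Exporting `LOG_LEVEL=DEBUG` also surfaces the raw response bodies that `scrape_target.py` logs at debug level below.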
ecommerce_exporter/scrape_target.py

```diff
@@ -3,19 +3,27 @@ import re
 
 from urllib.parse import urlparse
 
-import httpx
+# import httpx
+import requests
 import parsel
 import pyjq
+import logging
 
+logger = logging.getLogger(__name__)
 class ScrapeTarget:
-    def __init__(self, product_name, url, selector, target_name=None, regex=None, parser=None):
+    def __init__(self, product_name, url, selector, target_name=None, regex=None, parser=None, headers={}):
         self.product_name = product_name
         self.target_name = target_name if target_name else urlparse(url).hostname
         self.url = url
         self.selector = selector
-        self.regex = re.compile(regex if regex else r'[0-9]+(\.[0-9]{2})?')
+        self.regex = re.compile(regex if regex else r'([0-9]+,?)+(\.[0-9]{2})?')
         self.parser = parser if parser else 'html'
-        self.headers = {}
+        if 'Referer' not in headers:
+            headers['Referer'] = 'google.com'
+        if 'DNT' not in headers:
+            headers['DNT'] = '1'
+        self.headers = headers
+        self.session = requests.Session()
 
         # sanity check
         valid_parsers = ('html', 'json')
```
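A hedged sketch of how the new constructor defaults behave: caller-supplied headers are preserved, while Referer and DNT are filled in when absent. The product name, URL, and user-agent string here are invented values:

```python
# Hypothetical usage of the updated ScrapeTarget constructor shown above.
from ecommerce_exporter.scrape_target import ScrapeTarget

target = ScrapeTarget(
    product_name='example-gpu',             # invented example values
    url='https://example.com/product/123',
    selector='.GrandTotal::text',
    headers={'User-Agent': 'Mozilla/5.0 ...'},
)
print(target.headers['Referer'])  # google.com
print(target.headers['DNT'])      # 1
```

One caveat worth flagging: `headers={}` is a mutable default argument, and the Referer/DNT fill-in mutates it in place, so targets constructed without an explicit `headers` dict would share one; `parse_config` sidesteps this by passing a fresh dict per target.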
```diff
@@ -23,23 +31,24 @@ class ScrapeTarget:
             raise ValueError("Invalid parser configured (got '%s' but need one of %s) product: '%s', target: '%s'" % (self.parser, valid_parsers, self.product_name, self.target_name))
 
     def query_target(self):
-        # some sites get suspicious if we talk to them in HTTP/1.1 (maybe because it doesn't match our user-agent?)
-        # we use httpx to have HTTP2 support and circumvent that issue
-        query_response = httpx.get(
-            url=self.url,
+        query_response = self.session.get(
+            self.url,
             headers=self.headers,
-            follow_redirects=True,
-        ).text
+        )
+        logger.info('Status: %s', query_response.status_code)
+        # self.client.cookies.update(query_response.cookies)
+        query_response_text = query_response.text
+        logger.debug('Response: %s', query_response_text)
 
         # parse the response and match the selector
         selector_match = ''
         if self.parser == 'html':
             # parse response as html
-            selector = parsel.Selector(text=query_response)
+            selector = parsel.Selector(text=query_response_text)
             selector_match = selector.css(self.selector).get()
         elif self.parser == 'json':
             # parse response as json
-            query_response_json = json.loads(query_response)
+            query_response_json = json.loads(query_response_text)
             selector_match = str(pyjq.first(self.selector, query_response_json))
         else:
             raise ScrapeError('Invalid parser!')
```
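The move from a one-shot `httpx.get` to a per-target `requests.Session` means cookies set by a site persist across scrape iterations, and requests follows redirects by default, which is presumably why the explicit `follow_redirects=True` flag disappears. A minimal sketch under those assumptions, with an invented URL:

```python
# Hedged sketch of the session behaviour relied on above: cookies from one
# response are replayed on the next request to the same target.
import requests

session = requests.Session()
resp = session.get('https://example.com/product/123')  # invented URL
# any Set-Cookie headers in resp are now stored on the session...
print(session.cookies.get_dict())
# ...and sent automatically on the next scrape of the same target
resp = session.get('https://example.com/product/123')
```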
```diff
@@ -50,7 +59,7 @@ class ScrapeTarget:
         # match the regex
         regex_match = self.regex.search(selector_match)
         if regex_match:
-            str_result = regex_match.group(0)
+            str_result = regex_match.group(0).replace(',', '')
             # convert the result to float
             float_result = float(str_result)
             return float_result
```
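A worked example of the regex change: the repeated `([0-9]+,?)` group lets the default pattern span thousands separators, which the `.replace(',', '')` above then strips before the float conversion:

```python
# Worked example of the old vs. new default price regex; the price string is
# an invented example, not scraped data.
import re

old_regex = re.compile(r'[0-9]+(\.[0-9]{2})?')
new_regex = re.compile(r'([0-9]+,?)+(\.[0-9]{2})?')

price = '$1,299.99'
print(old_regex.search(price).group(0))  # 1 (stops at the comma)
print(new_regex.search(price).group(0))  # 1,299.99
print(float(new_regex.search(price).group(0).replace(',', '')))  # 1299.99
```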