drop httpx in favour of requests

parent 6a13728220
commit a0b13f2fe7
@@ -65,14 +65,7 @@ def main():
     )

     args = parser.parse_args()
-    scrape_targets = parse_config(os.path.abspath(args.config))
+    scrape_targets = parse_config(os.path.abspath(args.config), user_agent=args.user_agent)

-    # setup the headers for each scrape targets
-    for scrape_target in scrape_targets:
-        scrape_target.headers = {
-            'accept': '*/*',
-            'user-agent': args.user_agent,
-        }
-
     # start the http server to server the prometheus metrics
     logger.info("serving metrics on http://%s:%s/metrics", args.listen_address, args.listen_port)
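The hunk above moves header construction out of main() and into parse_config(), which now receives the user agent directly. A minimal runnable sketch of the new flow, with ScrapeTarget stubbed out (names other than parse_config and user_agent are illustrative, not from the project):

import os

class Target:  # hypothetical stand-in for ScrapeTarget
    def __init__(self, url, headers=None):
        self.url = url
        self.headers = headers or {}

def parse_config(config_filename, user_agent):
    # the real function reads YAML from config_filename; hardcoded for the sketch
    raw_targets = [{'url': 'https://example.com/product'}]
    return [Target(t['url'], headers={'User-Agent': user_agent}) for t in raw_targets]

scrape_targets = parse_config(os.path.abspath('config.yml'), user_agent='my-agent/1.0')
for target in scrape_targets:
    print(target.url, target.headers)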
@@ -103,7 +96,7 @@ def main():
        ).inc()
        time.sleep(args.interval * 60)

-def parse_config(config_filename):
+def parse_config(config_filename, user_agent):
    result = []
    logger.info('Loading configurations from %s', config_filename)
    with open(config_filename, 'r') as f:
@@ -125,6 +118,9 @@ def parse_config(config_filename):
                target_name=target.get('name'),
                regex=target.get('regex'),
                parser=target.get('parser'),
+                headers = {
+                    'User-Agent': user_agent,
+                },
            ))
    return result

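The keys read via target.get() above are the only config fields visible in this hunk. A hypothetical entry and the kwargs it produces might look like this (the config layout itself is assumed):

# hypothetical config entry; only the keys below appear in the diff
target = {
    'name': 'example-store',
    'regex': r'([0-9]+,?)+(\.[0-9]{2})?',
    'parser': 'html',
}
user_agent = 'price-scraper/1.0'  # supplied by main() via its user-agent argument

kwargs = dict(
    target_name=target.get('name'),
    regex=target.get('regex'),
    parser=target.get('parser'),
    headers={'User-Agent': user_agent},
)
print(kwargs)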
@@ -3,25 +3,27 @@ import re

 from urllib.parse import urlparse

-import httpx
+# import httpx
+import requests
 import parsel
 import pyjq
 import logging

 logger = logging.getLogger(__name__)
 class ScrapeTarget:
-    def __init__(self, product_name, url, selector, target_name=None, regex=None, parser=None):
+    def __init__(self, product_name, url, selector, target_name=None, regex=None, parser=None, headers={}):
         self.product_name = product_name
         self.target_name = target_name if target_name else urlparse(url).hostname
         self.url = url
         self.selector = selector
-        self.regex = re.compile(regex if regex else r'[0-9]+(\.[0-9]{2})?')
+        self.regex = re.compile(regex if regex else r'([0-9]+,?)+(\.[0-9]{2})?')
         self.parser = parser if parser else 'html'
-        self.headers = {}
-        self.client = httpx.Client(
-            follow_redirects=True,
-            http2=True,
-        )
+        if 'Referer' not in headers:
+            headers['Referer'] = 'google.com'
+        if 'DNT' not in headers:
+            headers['DNT'] = '1'
+        self.headers = headers
+        self.session = requests.Session()

         # sanity check
         valid_parsers = ('html', 'json')
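One caveat with the new signature: headers={} is a mutable default that __init__ then mutates, so every instance constructed without an explicit headers argument shares (and re-writes) the same dict. Whether that bites here depends on the call sites, but a short demonstration of the pitfall, plus the usual None idiom that avoids it:

def risky(headers={}):
    headers.setdefault('DNT', '1')
    return headers

print(risky() is risky())  # True: both calls return the very same dict object

def safe(headers=None):
    headers = dict(headers or {})  # fresh copy on every call
    headers.setdefault('DNT', '1')
    return headers

print(safe() is safe())  # False: each call gets its own dict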
@@ -29,13 +31,12 @@ class ScrapeTarget:
             raise ValueError("Invalid parser configured (got '%s' but need one of %s) product: '%s', target: '%s'" % (self.parser, valid_parsers, self.product_name, self.target_name))

     def query_target(self):
-        # some sites get suspicious if we talk to them in HTTP/1.1 (maybe because it doesn't match our user-agent?)
-        # we use httpx to have HTTP2 support and circumvent that issue
-        query_response = self.client.get(
+        query_response = self.session.get(
             self.url,
             headers=self.headers,
         )
         logger.info('Status: %s', query_response.status_code)
+        # self.client.cookies.update(query_response.cookies)
         query_response_text = query_response.text
         logger.debug('Response: %s', query_response_text)

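The commented-out cookie line above hints at why the explicit handling went away: a requests.Session persists cookies across requests on its own and, unlike httpx.Client, follows redirects by default (though requests has no HTTP/2 support). A quick check against httpbin.org, an endpoint chosen for the demo rather than taken from the project:

import requests

session = requests.Session()
# this endpoint sets a cookie and then redirects to /cookies
resp = session.get(
    'https://httpbin.org/cookies/set?seen=1',
    headers={'User-Agent': 'demo/1.0', 'DNT': '1'},
)
print(resp.status_code)             # 200: the redirect was followed
print(session.cookies.get('seen'))  # '1': the session kept the cookie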
@@ -58,7 +59,7 @@ class ScrapeTarget:
             # match the regex
             regex_match = self.regex.search(selector_match)
             if regex_match:
-                str_result = regex_match.group(0)
+                str_result = regex_match.group(0).replace(',', '')
                 # convert the result to float
                 float_result = float(str_result)
                 return float_result
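Taken together, the regex change and the .replace(',', '') let prices with thousands separators parse cleanly; a small check of the new default pattern:

import re

regex = re.compile(r'([0-9]+,?)+(\.[0-9]{2})?')
for text in ('$1,299.99', '43.50', '2,000'):
    match = regex.search(text)
    if match:
        print(float(match.group(0).replace(',', '')))
# prints 1299.99, 43.5 and 2000.0; the old pattern stopped at the first comma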