drop httpx in favour of requests

parent 6a13728220
commit a0b13f2fe7
@@ -65,14 +65,7 @@ def main():
     )
 
     args = parser.parse_args()
-    scrape_targets = parse_config(os.path.abspath(args.config))
-
-    # set up the headers for each scrape target
-    for scrape_target in scrape_targets:
-        scrape_target.headers = {
-            'accept': '*/*',
-            'user-agent': args.user_agent,
-        }
+    scrape_targets = parse_config(os.path.abspath(args.config), user_agent=args.user_agent)
 
     # start the http server to serve the prometheus metrics
     logger.info("serving metrics on http://%s:%s/metrics", args.listen_address, args.listen_port)
@@ -103,7 +96,7 @@ def main():
         ).inc()
         time.sleep(args.interval * 60)
 
-def parse_config(config_filename):
+def parse_config(config_filename, user_agent):
     result = []
     logger.info('Loading configurations from %s', config_filename)
     with open(config_filename, 'r') as f:
@@ -125,6 +118,9 @@ def parse_config(config_filename):
             target_name=target.get('name'),
             regex=target.get('regex'),
             parser=target.get('parser'),
+            headers={
+                'User-Agent': user_agent,
+            },
         ))
     return result
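Taken together, the hunks above move header construction out of main() and into parse_config(): previously main() patched a headers dict onto each target after the fact, clobbering anything __init__ had set, while now the user agent flows through parse_config() so __init__ receives the headers and can layer its own defaults on top. A minimal sketch of the resulting call, assuming the argparse names shown above:

    targets = parse_config(os.path.abspath(args.config), user_agent=args.user_agent)
    for target in targets:
        # each target now carries its headers from construction time, e.g.
        # {'User-Agent': ..., 'Referer': 'google.com', 'DNT': '1'}
        print(target.target_name, target.headers)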
@@ -3,25 +3,27 @@ import re
 
 from urllib.parse import urlparse
 
-import httpx
+# import httpx
+import requests
 import parsel
 import pyjq
 import logging
 
 logger = logging.getLogger(__name__)
 class ScrapeTarget:
-    def __init__(self, product_name, url, selector, target_name=None, regex=None, parser=None):
+    def __init__(self, product_name, url, selector, target_name=None, regex=None, parser=None, headers={}):
         self.product_name = product_name
         self.target_name = target_name if target_name else urlparse(url).hostname
         self.url = url
         self.selector = selector
-        self.regex = re.compile(regex if regex else r'[0-9]+(\.[0-9]{2})?')
+        self.regex = re.compile(regex if regex else r'([0-9]+,?)+(\.[0-9]{2})?')
         self.parser = parser if parser else 'html'
-        self.headers = {}
-        self.client = httpx.Client(
-            follow_redirects=True,
-            http2=True,
-        )
+        if 'Referer' not in headers:
+            headers['Referer'] = 'google.com'
+        if 'DNT' not in headers:
+            headers['DNT'] = '1'
+        self.headers = headers
+        self.session = requests.Session()
 
         # sanity check
         valid_parsers = ('html', 'json')
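One pitfall in the new signature: headers={} is a mutable default that __init__ then mutates in place, so every ScrapeTarget built without an explicit headers argument shares (and keeps re-mutating) the same dict. A small self-contained sketch of the same defaulting logic without that pitfall (the class name is illustrative, not part of the codebase):

    class ScrapeTargetSketch:
        def __init__(self, url, headers=None):
            headers = dict(headers) if headers else {}   # fresh dict per instance
            headers.setdefault('Referer', 'google.com')  # same defaults as the hunk above
            headers.setdefault('DNT', '1')
            self.url = url
            self.headers = headers

    a = ScrapeTargetSketch('https://example.com')
    b = ScrapeTargetSketch('https://example.net')
    assert a.headers is not b.headers  # with headers={} as the default, both would share one dict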
@@ -29,13 +31,12 @@ class ScrapeTarget:
             raise ValueError("Invalid parser configured (got '%s' but need one of %s) product: '%s', target: '%s'" % (self.parser, valid_parsers, self.product_name, self.target_name))
 
     def query_target(self):
-        # some sites get suspicious if we talk to them in HTTP/1.1 (maybe because it doesn't match our user-agent?)
-        # we use httpx to have HTTP2 support and circumvent that issue
-        query_response = self.client.get(
+        query_response = self.session.get(
             self.url,
             headers=self.headers,
         )
         logger.info('Status: %s', query_response.status_code)
+        # self.client.cookies.update(query_response.cookies)
         query_response_text = query_response.text
         logger.debug('Response: %s', query_response_text)
 
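The deleted comments point at the one capability requests does not replace: requests (via urllib3) only speaks HTTP/1.1, so sites that fingerprint HTTP/1.1 traffic may still object. A quick sketch for checking what a response actually negotiated (the URL is only an example):

    import requests

    resp = requests.get('https://example.com')
    print(resp.raw.version)  # urllib3 reports 11 for HTTP/1.1 (10 for HTTP/1.0)

    # For comparison, httpx exposes the negotiated version directly and only
    # attempts HTTP/2 when asked (requires the h2 extra):
    # import httpx
    # with httpx.Client(http2=True) as client:
    #     print(client.get('https://example.com').http_version)  # e.g. 'HTTP/2'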
@@ -58,7 +59,7 @@ class ScrapeTarget:
         # match the regex
         regex_match = self.regex.search(selector_match)
         if regex_match:
-            str_result = regex_match.group(0)
+            str_result = regex_match.group(0).replace(',', '')
             # convert the result to float
             float_result = float(str_result)
             return float_result
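The widened default regex is what lets comma-grouped prices match, and the .replace(',', '') above strips the separators back out before float(). A quick check of both defaults against a sample string:

    import re

    old_default = re.compile(r'[0-9]+(\.[0-9]{2})?')
    new_default = re.compile(r'([0-9]+,?)+(\.[0-9]{2})?')

    text = 'Price: $1,299.99'
    print(old_default.search(text).group(0))   # '1' -- the comma stops the match
    match = new_default.search(text).group(0)  # '1,299.99'
    print(float(match.replace(',', '')))       # 1299.99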