1
0
Fork 0

add grafana-agent

This commit is contained in:
Massaki Archambault 2022-05-26 16:16:05 -04:00
parent 56ffc03025
commit 826c8d33e0
6 changed files with 135 additions and 0 deletions

View File

@ -0,0 +1,10 @@
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: grafana-agent-wal-pvc
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 5Gi

View File

@ -0,0 +1,18 @@
resources:
- https://raw.githubusercontent.com/grafana/agent/v0.24.2/production/kubernetes/agent-bare.yaml
- grafana-agent-volume.yaml
secretGenerator:
- name: grafana-agent
behavior: create
literals:
- agent.yaml=
patchesJson6902:
- target:
version: v1
kind: StatefulSet
name: grafana-agent
path: patches/grafana-agent-statefulset-patch.yaml
namespace: default

View File

@ -0,0 +1,13 @@
- op: remove
path: /spec/volumeClaimTemplates
- op: add
path: /spec/template/spec/volumes
value:
- name: grafana-agent
secret:
secretName: grafana-agent
- name: agent-wal
persistentVolumeClaim:
claimName: grafana-agent-wal-pvc

View File

@ -3,6 +3,7 @@ bases:
- ../../bases/node-exporter - ../../bases/node-exporter
- ../../bases/blackbox-exporter - ../../bases/blackbox-exporter
- ../../bases/kube-state-metrics - ../../bases/kube-state-metrics
- ../../bases/grafana-agent
- servicemonitors/any-namespaces-http.yaml - servicemonitors/any-namespaces-http.yaml
- servicemonitors/kubelet.yaml - servicemonitors/kubelet.yaml
- servicemonitors/longhorn.yaml - servicemonitors/longhorn.yaml

View File

@ -0,0 +1,88 @@
metrics:
wal_directory: /var/lib/agent/wal
global:
scrape_interval: 60s
external_labels:
cluster: cloud
configs:
- name: integrations
remote_write:
- url: https://prometheus-prod-10-prod-us-central-0.grafana.net/api/prom/push
basic_auth:
username: 443422
password: ${ssm:/k3s/prod/monitoring/grafana-cloud/password}
scrape_configs:
- bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
job_name: integrations/kubernetes/cadvisor
kubernetes_sd_configs:
- role: node
metric_relabel_configs:
- source_labels: [__name__]
regex: namespace_memory:kube_pod_container_resource_requests:sum|kubelet_running_containers|container_cpu_usage_seconds_total|kube_pod_container_info|container_network_receive_packets_dropped_total|kube_pod_status_phase|kubelet_pod_start_duration_seconds_count|kubelet_cgroup_manager_duration_seconds_bucket|kube_horizontalpodautoscaler_status_desired_replicas|cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits|node_namespace_pod_container:container_memory_swap|kube_statefulset_status_replicas_ready|kube_horizontalpodautoscaler_spec_max_replicas|cluster:namespace:pod_memory:active:kube_pod_container_resource_requests|process_cpu_seconds_total|process_resident_memory_bytes|kubelet_server_expiration_renew_errors|kube_daemonset.*|container_fs_reads_total|machine_memory_bytes|kubelet_volume_stats_inodes_used|volume_manager_total_volumes|kube_statefulset_status_replicas|namespace_cpu:kube_pod_container_resource_limits:sum|kube_pod_container_resource_requests|kube_pod_container_resource_limits|kubelet_pod_worker_duration_seconds_count|namespace_workload_pod:kube_pod_owner:relabel|kubelet_cgroup_manager_duration_seconds_count|container_cpu_cfs_throttled_periods_total|kube_node_spec_taint|container_fs_reads_bytes_total|kubelet_certificate_manager_client_ttl_seconds|container_network_receive_bytes_total|kubelet_running_container_count|kube_daemonset_status_number_available|kube_node_status_allocatable|container_fs_writes_total|kube_namespace_status_phase|kubelet_volume_stats_available_bytes|kubelet_pleg_relist_duration_seconds_bucket|kubelet_runtime_operations_errors_total|kube_pod_container_status_waiting_reason|kube_replicaset_owner|kube_resourcequota|kube_pod_info|kubelet_pleg_relist_duration_seconds_count|kube_deployment_status_replicas_available|cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests|kubelet_running_pods|kube_statefulset_status_replicas_updated|kube_deployment_status_replicas_updated|kube_job_spec_completions|kube_daemonset_status_number_misscheduled|kubelet_certificate_manager_server_ttl_seconds|container_network_transmit_bytes_total|container_memory_cache|kubelet_volume_stats_capacity_bytes|node_namespace_pod_container:container_memory_cache|container_memory_rss|container_memory_swap|storage_operation_duration_seconds_count|kube_replicaset.*|kube_pod_owner|cluster:namespace:pod_memory:active:kube_pod_container_resource_limits|kubelet_volume_stats_inodes|kube_daemonset_status_desired_number_scheduled|kube_daemonset_status_updated_number_scheduled|kube_statefulset.*|kube_node_info|go_goroutines|kubelet_pod_worker_duration_seconds_bucket|kubelet_node_config_error|container_cpu_cfs_periods_total|kubelet_pleg_relist_interval_seconds_bucket|kube_job.*|container_network_receive_packets_total|container_fs_writes_bytes_total|kubelet_running_pod_count|kube_deployment_spec_replicas|up|kube_node_status_capacity|namespace_cpu:kube_pod_container_resource_requests:sum|node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate|container_memory_working_set_bytes|kubelet_node_name|node_namespace_pod_container:container_memory_rss|storage_operation_errors_total|kube_statefulset_metadata_generation|container_network_transmit_packets_total|kubelet_runtime_operations_total|kube_statefulset_status_observed_generation|kube_horizontalpodautoscaler_status_current_replicas|kubernetes_build_info|kubelet_certificate_manager_client_expiration_renew_errors|kube_job_failed|namespace_workload_pod|node_namespace_pod_container:container_memory_working_set_bytes|kube_statefulset_replicas|kube_deployment_status_observed_generation|kube_pod_container_status_restarts_total|kube_daemonset_status_current_number_scheduled|kube_pod_start_time|namespace_memory:kube_pod_container_resource_limits:sum|container_network_transmit_packets_dropped_total|rest_client_requests_total|kube_deployment_metadata_generation|kube_statefulset_status_update_revision|kube_job_status_succeeded|kube_horizontalpodautoscaler_spec_min_replicas|kube_statefulset_status_current_revision|node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile|kube_node_status_condition
action: keep
relabel_configs:
- replacement: kubernetes.default.svc.cluster.local:443
target_label: __address__
- regex: (.+)
replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
source_labels:
- __meta_kubernetes_node_name
target_label: __metrics_path__
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: false
server_name: kubernetes
- bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
job_name: integrations/kubernetes/kubelet
kubernetes_sd_configs:
- role: node
metric_relabel_configs:
- source_labels: [__name__]
regex: namespace_memory:kube_pod_container_resource_requests:sum|kubelet_running_containers|container_cpu_usage_seconds_total|kube_pod_container_info|container_network_receive_packets_dropped_total|kube_pod_status_phase|kubelet_pod_start_duration_seconds_count|kubelet_cgroup_manager_duration_seconds_bucket|kube_horizontalpodautoscaler_status_desired_replicas|cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits|node_namespace_pod_container:container_memory_swap|kube_statefulset_status_replicas_ready|kube_horizontalpodautoscaler_spec_max_replicas|cluster:namespace:pod_memory:active:kube_pod_container_resource_requests|process_cpu_seconds_total|process_resident_memory_bytes|kubelet_server_expiration_renew_errors|kube_daemonset.*|container_fs_reads_total|machine_memory_bytes|kubelet_volume_stats_inodes_used|volume_manager_total_volumes|kube_statefulset_status_replicas|namespace_cpu:kube_pod_container_resource_limits:sum|kube_pod_container_resource_requests|kube_pod_container_resource_limits|kubelet_pod_worker_duration_seconds_count|namespace_workload_pod:kube_pod_owner:relabel|kubelet_cgroup_manager_duration_seconds_count|container_cpu_cfs_throttled_periods_total|kube_node_spec_taint|container_fs_reads_bytes_total|kubelet_certificate_manager_client_ttl_seconds|container_network_receive_bytes_total|kubelet_running_container_count|kube_daemonset_status_number_available|kube_node_status_allocatable|container_fs_writes_total|kube_namespace_status_phase|kubelet_volume_stats_available_bytes|kubelet_pleg_relist_duration_seconds_bucket|kubelet_runtime_operations_errors_total|kube_pod_container_status_waiting_reason|kube_replicaset_owner|kube_resourcequota|kube_pod_info|kubelet_pleg_relist_duration_seconds_count|kube_deployment_status_replicas_available|cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests|kubelet_running_pods|kube_statefulset_status_replicas_updated|kube_deployment_status_replicas_updated|kube_job_spec_completions|kube_daemonset_status_number_misscheduled|kubelet_certificate_manager_server_ttl_seconds|container_network_transmit_bytes_total|container_memory_cache|kubelet_volume_stats_capacity_bytes|node_namespace_pod_container:container_memory_cache|container_memory_rss|container_memory_swap|storage_operation_duration_seconds_count|kube_replicaset.*|kube_pod_owner|cluster:namespace:pod_memory:active:kube_pod_container_resource_limits|kubelet_volume_stats_inodes|kube_daemonset_status_desired_number_scheduled|kube_daemonset_status_updated_number_scheduled|kube_statefulset.*|kube_node_info|go_goroutines|kubelet_pod_worker_duration_seconds_bucket|kubelet_node_config_error|container_cpu_cfs_periods_total|kubelet_pleg_relist_interval_seconds_bucket|kube_job.*|container_network_receive_packets_total|container_fs_writes_bytes_total|kubelet_running_pod_count|kube_deployment_spec_replicas|up|kube_node_status_capacity|namespace_cpu:kube_pod_container_resource_requests:sum|node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate|container_memory_working_set_bytes|kubelet_node_name|node_namespace_pod_container:container_memory_rss|storage_operation_errors_total|kube_statefulset_metadata_generation|container_network_transmit_packets_total|kubelet_runtime_operations_total|kube_statefulset_status_observed_generation|kube_horizontalpodautoscaler_status_current_replicas|kubernetes_build_info|kubelet_certificate_manager_client_expiration_renew_errors|kube_job_failed|namespace_workload_pod|node_namespace_pod_container:container_memory_working_set_bytes|kube_statefulset_replicas|kube_deployment_status_observed_generation|kube_pod_container_status_restarts_total|kube_daemonset_status_current_number_scheduled|kube_pod_start_time|namespace_memory:kube_pod_container_resource_limits:sum|container_network_transmit_packets_dropped_total|rest_client_requests_total|kube_deployment_metadata_generation|kube_statefulset_status_update_revision|kube_job_status_succeeded|kube_horizontalpodautoscaler_spec_min_replicas|kube_statefulset_status_current_revision|node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile|kube_node_status_condition
action: keep
relabel_configs:
- replacement: kubernetes.default.svc.cluster.local:443
target_label: __address__
- regex: (.+)
replacement: /api/v1/nodes/${1}/proxy/metrics
source_labels:
- __meta_kubernetes_node_name
target_label: __metrics_path__
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: false
server_name: kubernetes
- job_name: integrations/kubernetes/kube-state-metrics
kubernetes_sd_configs:
- role: pod
metric_relabel_configs:
- source_labels: [__name__]
regex: namespace_memory:kube_pod_container_resource_requests:sum|kubelet_running_containers|container_cpu_usage_seconds_total|kube_pod_container_info|container_network_receive_packets_dropped_total|kube_pod_status_phase|kubelet_pod_start_duration_seconds_count|kubelet_cgroup_manager_duration_seconds_bucket|kube_horizontalpodautoscaler_status_desired_replicas|cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits|node_namespace_pod_container:container_memory_swap|kube_statefulset_status_replicas_ready|kube_horizontalpodautoscaler_spec_max_replicas|cluster:namespace:pod_memory:active:kube_pod_container_resource_requests|process_cpu_seconds_total|process_resident_memory_bytes|kubelet_server_expiration_renew_errors|kube_daemonset.*|container_fs_reads_total|machine_memory_bytes|kubelet_volume_stats_inodes_used|volume_manager_total_volumes|kube_statefulset_status_replicas|namespace_cpu:kube_pod_container_resource_limits:sum|kube_pod_container_resource_requests|kube_pod_container_resource_limits|kubelet_pod_worker_duration_seconds_count|namespace_workload_pod:kube_pod_owner:relabel|kubelet_cgroup_manager_duration_seconds_count|container_cpu_cfs_throttled_periods_total|kube_node_spec_taint|container_fs_reads_bytes_total|kubelet_certificate_manager_client_ttl_seconds|container_network_receive_bytes_total|kubelet_running_container_count|kube_daemonset_status_number_available|kube_node_status_allocatable|container_fs_writes_total|kube_namespace_status_phase|kubelet_volume_stats_available_bytes|kubelet_pleg_relist_duration_seconds_bucket|kubelet_runtime_operations_errors_total|kube_pod_container_status_waiting_reason|kube_replicaset_owner|kube_resourcequota|kube_pod_info|kubelet_pleg_relist_duration_seconds_count|kube_deployment_status_replicas_available|cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests|kubelet_running_pods|kube_statefulset_status_replicas_updated|kube_deployment_status_replicas_updated|kube_job_spec_completions|kube_daemonset_status_number_misscheduled|kubelet_certificate_manager_server_ttl_seconds|container_network_transmit_bytes_total|container_memory_cache|kubelet_volume_stats_capacity_bytes|node_namespace_pod_container:container_memory_cache|container_memory_rss|container_memory_swap|storage_operation_duration_seconds_count|kube_replicaset.*|kube_pod_owner|cluster:namespace:pod_memory:active:kube_pod_container_resource_limits|kubelet_volume_stats_inodes|kube_daemonset_status_desired_number_scheduled|kube_daemonset_status_updated_number_scheduled|kube_statefulset.*|kube_node_info|go_goroutines|kubelet_pod_worker_duration_seconds_bucket|kubelet_node_config_error|container_cpu_cfs_periods_total|kubelet_pleg_relist_interval_seconds_bucket|kube_job.*|container_network_receive_packets_total|container_fs_writes_bytes_total|kubelet_running_pod_count|kube_deployment_spec_replicas|up|kube_node_status_capacity|namespace_cpu:kube_pod_container_resource_requests:sum|node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate|container_memory_working_set_bytes|kubelet_node_name|node_namespace_pod_container:container_memory_rss|storage_operation_errors_total|kube_statefulset_metadata_generation|container_network_transmit_packets_total|kubelet_runtime_operations_total|kube_statefulset_status_observed_generation|kube_horizontalpodautoscaler_status_current_replicas|kubernetes_build_info|kubelet_certificate_manager_client_expiration_renew_errors|kube_job_failed|namespace_workload_pod|node_namespace_pod_container:container_memory_working_set_bytes|kube_statefulset_replicas|kube_deployment_status_observed_generation|kube_pod_container_status_restarts_total|kube_daemonset_status_current_number_scheduled|kube_pod_start_time|namespace_memory:kube_pod_container_resource_limits:sum|container_network_transmit_packets_dropped_total|rest_client_requests_total|kube_deployment_metadata_generation|kube_statefulset_status_update_revision|kube_job_status_succeeded|kube_horizontalpodautoscaler_spec_min_replicas|kube_statefulset_status_current_revision|node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile|kube_node_status_condition
action: keep
relabel_configs:
- action: keep
regex: kube-state-metrics
source_labels:
- __meta_kubernetes_pod_label_app_kubernetes_io_name
integrations:
eventhandler:
cache_path: /var/lib/agent/eventhandler.cache
logs_instance: integrations
logs:
configs:
- name: integrations
clients:
- url: https://logs-prod3.grafana.net/loki/api/v1/push
basic_auth:
username: 220681
password: ${ssm:/k3s/prod/monitoring/grafana-cloud/password}
external_labels:
cluster: cloud
job: integrations/kubernetes/eventhandler
positions:
filename: /tmp/positions.yaml
target_config:
sync_period: 10s

View File

@ -62,6 +62,11 @@ secretGenerator:
behavior: replace behavior: replace
literals: literals:
- password=${ssm:/k3s/prod/nextcloud/redis/password} - password=${ssm:/k3s/prod/nextcloud/redis/password}
- name: grafana-agent
namespace: monitoring
behavior: replace
files:
- agent.yaml=configurations/grafana-agent/agent.yaml
# - name: grafana-cloud-credentials # - name: grafana-cloud-credentials
# type: Opaque # type: Opaque
# namespace: monitoring # namespace: monitoring