From 826c8d33e0eca187efa9cc082a0c4131bdb378ee Mon Sep 17 00:00:00 2001 From: Massaki Archambault Date: Thu, 26 May 2022 16:16:05 -0400 Subject: [PATCH] add grafana-agent --- .../grafana-agent/grafana-agent-volume.yaml | 10 +++ .../bases/grafana-agent/kustomization.yaml | 18 ++++ .../grafana-agent-statefulset-patch.yaml | 13 +++ .../namespaces/monitoring/kustomization.yaml | 1 + .../configurations/grafana-agent/agent.yaml | 88 +++++++++++++++++++ kustomize/overlays/prod/kustomization.yaml | 5 ++ 6 files changed, 135 insertions(+) create mode 100644 kustomize/bases/grafana-agent/grafana-agent-volume.yaml create mode 100644 kustomize/bases/grafana-agent/kustomization.yaml create mode 100644 kustomize/bases/grafana-agent/patches/grafana-agent-statefulset-patch.yaml create mode 100644 kustomize/overlays/prod/configurations/grafana-agent/agent.yaml diff --git a/kustomize/bases/grafana-agent/grafana-agent-volume.yaml b/kustomize/bases/grafana-agent/grafana-agent-volume.yaml new file mode 100644 index 0000000..f22ff07 --- /dev/null +++ b/kustomize/bases/grafana-agent/grafana-agent-volume.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: grafana-agent-wal-pvc +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 5Gi diff --git a/kustomize/bases/grafana-agent/kustomization.yaml b/kustomize/bases/grafana-agent/kustomization.yaml new file mode 100644 index 0000000..8b1419d --- /dev/null +++ b/kustomize/bases/grafana-agent/kustomization.yaml @@ -0,0 +1,18 @@ +resources: + - https://raw.githubusercontent.com/grafana/agent/v0.24.2/production/kubernetes/agent-bare.yaml + - grafana-agent-volume.yaml + +secretGenerator: + - name: grafana-agent + behavior: create + literals: + - agent.yaml= + +patchesJson6902: + - target: + version: v1 + kind: StatefulSet + name: grafana-agent + path: patches/grafana-agent-statefulset-patch.yaml + +namespace: default diff --git a/kustomize/bases/grafana-agent/patches/grafana-agent-statefulset-patch.yaml b/kustomize/bases/grafana-agent/patches/grafana-agent-statefulset-patch.yaml new file mode 100644 index 0000000..e3a303b --- /dev/null +++ b/kustomize/bases/grafana-agent/patches/grafana-agent-statefulset-patch.yaml @@ -0,0 +1,13 @@ +- op: remove + path: /spec/volumeClaimTemplates +- op: add + path: /spec/template/spec/volumes + value: + - name: grafana-agent + secret: + secretName: grafana-agent + - name: agent-wal + persistentVolumeClaim: + claimName: grafana-agent-wal-pvc + + diff --git a/kustomize/namespaces/monitoring/kustomization.yaml b/kustomize/namespaces/monitoring/kustomization.yaml index 6f160c6..d4e3815 100644 --- a/kustomize/namespaces/monitoring/kustomization.yaml +++ b/kustomize/namespaces/monitoring/kustomization.yaml @@ -3,6 +3,7 @@ bases: - ../../bases/node-exporter - ../../bases/blackbox-exporter - ../../bases/kube-state-metrics + - ../../bases/grafana-agent - servicemonitors/any-namespaces-http.yaml - servicemonitors/kubelet.yaml - servicemonitors/longhorn.yaml diff --git a/kustomize/overlays/prod/configurations/grafana-agent/agent.yaml b/kustomize/overlays/prod/configurations/grafana-agent/agent.yaml new file mode 100644 index 0000000..0f94595 --- /dev/null +++ b/kustomize/overlays/prod/configurations/grafana-agent/agent.yaml @@ -0,0 +1,88 @@ +metrics: + wal_directory: /var/lib/agent/wal + global: + scrape_interval: 60s + external_labels: + cluster: cloud + configs: + - name: integrations + remote_write: + - url: https://prometheus-prod-10-prod-us-central-0.grafana.net/api/prom/push + basic_auth: + username: 443422 + password: ${ssm:/k3s/prod/monitoring/grafana-cloud/password} + scrape_configs: + - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: integrations/kubernetes/cadvisor + kubernetes_sd_configs: + - role: node + metric_relabel_configs: + - source_labels: [__name__] + regex: namespace_memory:kube_pod_container_resource_requests:sum|kubelet_running_containers|container_cpu_usage_seconds_total|kube_pod_container_info|container_network_receive_packets_dropped_total|kube_pod_status_phase|kubelet_pod_start_duration_seconds_count|kubelet_cgroup_manager_duration_seconds_bucket|kube_horizontalpodautoscaler_status_desired_replicas|cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits|node_namespace_pod_container:container_memory_swap|kube_statefulset_status_replicas_ready|kube_horizontalpodautoscaler_spec_max_replicas|cluster:namespace:pod_memory:active:kube_pod_container_resource_requests|process_cpu_seconds_total|process_resident_memory_bytes|kubelet_server_expiration_renew_errors|kube_daemonset.*|container_fs_reads_total|machine_memory_bytes|kubelet_volume_stats_inodes_used|volume_manager_total_volumes|kube_statefulset_status_replicas|namespace_cpu:kube_pod_container_resource_limits:sum|kube_pod_container_resource_requests|kube_pod_container_resource_limits|kubelet_pod_worker_duration_seconds_count|namespace_workload_pod:kube_pod_owner:relabel|kubelet_cgroup_manager_duration_seconds_count|container_cpu_cfs_throttled_periods_total|kube_node_spec_taint|container_fs_reads_bytes_total|kubelet_certificate_manager_client_ttl_seconds|container_network_receive_bytes_total|kubelet_running_container_count|kube_daemonset_status_number_available|kube_node_status_allocatable|container_fs_writes_total|kube_namespace_status_phase|kubelet_volume_stats_available_bytes|kubelet_pleg_relist_duration_seconds_bucket|kubelet_runtime_operations_errors_total|kube_pod_container_status_waiting_reason|kube_replicaset_owner|kube_resourcequota|kube_pod_info|kubelet_pleg_relist_duration_seconds_count|kube_deployment_status_replicas_available|cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests|kubelet_running_pods|kube_statefulset_status_replicas_updated|kube_deployment_status_replicas_updated|kube_job_spec_completions|kube_daemonset_status_number_misscheduled|kubelet_certificate_manager_server_ttl_seconds|container_network_transmit_bytes_total|container_memory_cache|kubelet_volume_stats_capacity_bytes|node_namespace_pod_container:container_memory_cache|container_memory_rss|container_memory_swap|storage_operation_duration_seconds_count|kube_replicaset.*|kube_pod_owner|cluster:namespace:pod_memory:active:kube_pod_container_resource_limits|kubelet_volume_stats_inodes|kube_daemonset_status_desired_number_scheduled|kube_daemonset_status_updated_number_scheduled|kube_statefulset.*|kube_node_info|go_goroutines|kubelet_pod_worker_duration_seconds_bucket|kubelet_node_config_error|container_cpu_cfs_periods_total|kubelet_pleg_relist_interval_seconds_bucket|kube_job.*|container_network_receive_packets_total|container_fs_writes_bytes_total|kubelet_running_pod_count|kube_deployment_spec_replicas|up|kube_node_status_capacity|namespace_cpu:kube_pod_container_resource_requests:sum|node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate|container_memory_working_set_bytes|kubelet_node_name|node_namespace_pod_container:container_memory_rss|storage_operation_errors_total|kube_statefulset_metadata_generation|container_network_transmit_packets_total|kubelet_runtime_operations_total|kube_statefulset_status_observed_generation|kube_horizontalpodautoscaler_status_current_replicas|kubernetes_build_info|kubelet_certificate_manager_client_expiration_renew_errors|kube_job_failed|namespace_workload_pod|node_namespace_pod_container:container_memory_working_set_bytes|kube_statefulset_replicas|kube_deployment_status_observed_generation|kube_pod_container_status_restarts_total|kube_daemonset_status_current_number_scheduled|kube_pod_start_time|namespace_memory:kube_pod_container_resource_limits:sum|container_network_transmit_packets_dropped_total|rest_client_requests_total|kube_deployment_metadata_generation|kube_statefulset_status_update_revision|kube_job_status_succeeded|kube_horizontalpodautoscaler_spec_min_replicas|kube_statefulset_status_current_revision|node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile|kube_node_status_condition + action: keep + relabel_configs: + - replacement: kubernetes.default.svc.cluster.local:443 + target_label: __address__ + - regex: (.+) + replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor + source_labels: + - __meta_kubernetes_node_name + target_label: __metrics_path__ + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: false + server_name: kubernetes + - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: integrations/kubernetes/kubelet + kubernetes_sd_configs: + - role: node + metric_relabel_configs: + - source_labels: [__name__] + regex: namespace_memory:kube_pod_container_resource_requests:sum|kubelet_running_containers|container_cpu_usage_seconds_total|kube_pod_container_info|container_network_receive_packets_dropped_total|kube_pod_status_phase|kubelet_pod_start_duration_seconds_count|kubelet_cgroup_manager_duration_seconds_bucket|kube_horizontalpodautoscaler_status_desired_replicas|cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits|node_namespace_pod_container:container_memory_swap|kube_statefulset_status_replicas_ready|kube_horizontalpodautoscaler_spec_max_replicas|cluster:namespace:pod_memory:active:kube_pod_container_resource_requests|process_cpu_seconds_total|process_resident_memory_bytes|kubelet_server_expiration_renew_errors|kube_daemonset.*|container_fs_reads_total|machine_memory_bytes|kubelet_volume_stats_inodes_used|volume_manager_total_volumes|kube_statefulset_status_replicas|namespace_cpu:kube_pod_container_resource_limits:sum|kube_pod_container_resource_requests|kube_pod_container_resource_limits|kubelet_pod_worker_duration_seconds_count|namespace_workload_pod:kube_pod_owner:relabel|kubelet_cgroup_manager_duration_seconds_count|container_cpu_cfs_throttled_periods_total|kube_node_spec_taint|container_fs_reads_bytes_total|kubelet_certificate_manager_client_ttl_seconds|container_network_receive_bytes_total|kubelet_running_container_count|kube_daemonset_status_number_available|kube_node_status_allocatable|container_fs_writes_total|kube_namespace_status_phase|kubelet_volume_stats_available_bytes|kubelet_pleg_relist_duration_seconds_bucket|kubelet_runtime_operations_errors_total|kube_pod_container_status_waiting_reason|kube_replicaset_owner|kube_resourcequota|kube_pod_info|kubelet_pleg_relist_duration_seconds_count|kube_deployment_status_replicas_available|cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests|kubelet_running_pods|kube_statefulset_status_replicas_updated|kube_deployment_status_replicas_updated|kube_job_spec_completions|kube_daemonset_status_number_misscheduled|kubelet_certificate_manager_server_ttl_seconds|container_network_transmit_bytes_total|container_memory_cache|kubelet_volume_stats_capacity_bytes|node_namespace_pod_container:container_memory_cache|container_memory_rss|container_memory_swap|storage_operation_duration_seconds_count|kube_replicaset.*|kube_pod_owner|cluster:namespace:pod_memory:active:kube_pod_container_resource_limits|kubelet_volume_stats_inodes|kube_daemonset_status_desired_number_scheduled|kube_daemonset_status_updated_number_scheduled|kube_statefulset.*|kube_node_info|go_goroutines|kubelet_pod_worker_duration_seconds_bucket|kubelet_node_config_error|container_cpu_cfs_periods_total|kubelet_pleg_relist_interval_seconds_bucket|kube_job.*|container_network_receive_packets_total|container_fs_writes_bytes_total|kubelet_running_pod_count|kube_deployment_spec_replicas|up|kube_node_status_capacity|namespace_cpu:kube_pod_container_resource_requests:sum|node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate|container_memory_working_set_bytes|kubelet_node_name|node_namespace_pod_container:container_memory_rss|storage_operation_errors_total|kube_statefulset_metadata_generation|container_network_transmit_packets_total|kubelet_runtime_operations_total|kube_statefulset_status_observed_generation|kube_horizontalpodautoscaler_status_current_replicas|kubernetes_build_info|kubelet_certificate_manager_client_expiration_renew_errors|kube_job_failed|namespace_workload_pod|node_namespace_pod_container:container_memory_working_set_bytes|kube_statefulset_replicas|kube_deployment_status_observed_generation|kube_pod_container_status_restarts_total|kube_daemonset_status_current_number_scheduled|kube_pod_start_time|namespace_memory:kube_pod_container_resource_limits:sum|container_network_transmit_packets_dropped_total|rest_client_requests_total|kube_deployment_metadata_generation|kube_statefulset_status_update_revision|kube_job_status_succeeded|kube_horizontalpodautoscaler_spec_min_replicas|kube_statefulset_status_current_revision|node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile|kube_node_status_condition + action: keep + relabel_configs: + - replacement: kubernetes.default.svc.cluster.local:443 + target_label: __address__ + - regex: (.+) + replacement: /api/v1/nodes/${1}/proxy/metrics + source_labels: + - __meta_kubernetes_node_name + target_label: __metrics_path__ + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: false + server_name: kubernetes + - job_name: integrations/kubernetes/kube-state-metrics + kubernetes_sd_configs: + - role: pod + metric_relabel_configs: + - source_labels: [__name__] + regex: namespace_memory:kube_pod_container_resource_requests:sum|kubelet_running_containers|container_cpu_usage_seconds_total|kube_pod_container_info|container_network_receive_packets_dropped_total|kube_pod_status_phase|kubelet_pod_start_duration_seconds_count|kubelet_cgroup_manager_duration_seconds_bucket|kube_horizontalpodautoscaler_status_desired_replicas|cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits|node_namespace_pod_container:container_memory_swap|kube_statefulset_status_replicas_ready|kube_horizontalpodautoscaler_spec_max_replicas|cluster:namespace:pod_memory:active:kube_pod_container_resource_requests|process_cpu_seconds_total|process_resident_memory_bytes|kubelet_server_expiration_renew_errors|kube_daemonset.*|container_fs_reads_total|machine_memory_bytes|kubelet_volume_stats_inodes_used|volume_manager_total_volumes|kube_statefulset_status_replicas|namespace_cpu:kube_pod_container_resource_limits:sum|kube_pod_container_resource_requests|kube_pod_container_resource_limits|kubelet_pod_worker_duration_seconds_count|namespace_workload_pod:kube_pod_owner:relabel|kubelet_cgroup_manager_duration_seconds_count|container_cpu_cfs_throttled_periods_total|kube_node_spec_taint|container_fs_reads_bytes_total|kubelet_certificate_manager_client_ttl_seconds|container_network_receive_bytes_total|kubelet_running_container_count|kube_daemonset_status_number_available|kube_node_status_allocatable|container_fs_writes_total|kube_namespace_status_phase|kubelet_volume_stats_available_bytes|kubelet_pleg_relist_duration_seconds_bucket|kubelet_runtime_operations_errors_total|kube_pod_container_status_waiting_reason|kube_replicaset_owner|kube_resourcequota|kube_pod_info|kubelet_pleg_relist_duration_seconds_count|kube_deployment_status_replicas_available|cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests|kubelet_running_pods|kube_statefulset_status_replicas_updated|kube_deployment_status_replicas_updated|kube_job_spec_completions|kube_daemonset_status_number_misscheduled|kubelet_certificate_manager_server_ttl_seconds|container_network_transmit_bytes_total|container_memory_cache|kubelet_volume_stats_capacity_bytes|node_namespace_pod_container:container_memory_cache|container_memory_rss|container_memory_swap|storage_operation_duration_seconds_count|kube_replicaset.*|kube_pod_owner|cluster:namespace:pod_memory:active:kube_pod_container_resource_limits|kubelet_volume_stats_inodes|kube_daemonset_status_desired_number_scheduled|kube_daemonset_status_updated_number_scheduled|kube_statefulset.*|kube_node_info|go_goroutines|kubelet_pod_worker_duration_seconds_bucket|kubelet_node_config_error|container_cpu_cfs_periods_total|kubelet_pleg_relist_interval_seconds_bucket|kube_job.*|container_network_receive_packets_total|container_fs_writes_bytes_total|kubelet_running_pod_count|kube_deployment_spec_replicas|up|kube_node_status_capacity|namespace_cpu:kube_pod_container_resource_requests:sum|node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate|container_memory_working_set_bytes|kubelet_node_name|node_namespace_pod_container:container_memory_rss|storage_operation_errors_total|kube_statefulset_metadata_generation|container_network_transmit_packets_total|kubelet_runtime_operations_total|kube_statefulset_status_observed_generation|kube_horizontalpodautoscaler_status_current_replicas|kubernetes_build_info|kubelet_certificate_manager_client_expiration_renew_errors|kube_job_failed|namespace_workload_pod|node_namespace_pod_container:container_memory_working_set_bytes|kube_statefulset_replicas|kube_deployment_status_observed_generation|kube_pod_container_status_restarts_total|kube_daemonset_status_current_number_scheduled|kube_pod_start_time|namespace_memory:kube_pod_container_resource_limits:sum|container_network_transmit_packets_dropped_total|rest_client_requests_total|kube_deployment_metadata_generation|kube_statefulset_status_update_revision|kube_job_status_succeeded|kube_horizontalpodautoscaler_spec_min_replicas|kube_statefulset_status_current_revision|node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile|kube_node_status_condition + action: keep + relabel_configs: + - action: keep + regex: kube-state-metrics + source_labels: + - __meta_kubernetes_pod_label_app_kubernetes_io_name + +integrations: + eventhandler: + cache_path: /var/lib/agent/eventhandler.cache + logs_instance: integrations +logs: + configs: + - name: integrations + clients: + - url: https://logs-prod3.grafana.net/loki/api/v1/push + basic_auth: + username: 220681 + password: ${ssm:/k3s/prod/monitoring/grafana-cloud/password} + external_labels: + cluster: cloud + job: integrations/kubernetes/eventhandler + positions: + filename: /tmp/positions.yaml + target_config: + sync_period: 10s diff --git a/kustomize/overlays/prod/kustomization.yaml b/kustomize/overlays/prod/kustomization.yaml index be0b9c4..3ed4d9c 100644 --- a/kustomize/overlays/prod/kustomization.yaml +++ b/kustomize/overlays/prod/kustomization.yaml @@ -62,6 +62,11 @@ secretGenerator: behavior: replace literals: - password=${ssm:/k3s/prod/nextcloud/redis/password} + - name: grafana-agent + namespace: monitoring + behavior: replace + files: + - agent.yaml=configurations/grafana-agent/agent.yaml # - name: grafana-cloud-credentials # type: Opaque # namespace: monitoring