Openshift Overview v2.0 · 1,751 · 2.0 (1 review)
This dashboard covers alerting, Elasticsearch index sizes, etcd, capacity utilization, general node stats, and CPU throttling. Oh, and you need the Pie Chart plugin. I like pie.
You will need the Elasticsearch exporter, and some other stuff I can't remember right now.
Here is my ConfigMap for Prometheus. Adjust host names appropriately. Create your own ConfigMap for the etcd certificates.
apiVersion: v1
# ConfigMap carrying the Prometheus configuration (`prometheus.yml`).
# NOTE(review): no `metadata` (name/namespace) is shown for this ConfigMap;
# add one before applying it to a cluster.
# NOTE(review): host names, IPs and probe URLs below are site-specific and
# must be adjusted for your environment.
data:
  prometheus.yml: |+
    # Defaults applied to every scrape job unless a job overrides them.
    global:
      scrape_interval: 30s
      evaluation_interval: 30s

    scrape_configs:
      # Endpoints of the `kubernetes` service (port name `https`) in the
      # `default` namespace, scraped on port 8444 instead of the discovered
      # port. NOTE(review): presumably 8444 is the controllers port - confirm.
      - job_name: 'kubernetes-controllers'
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        kubernetes_sd_configs:
          - role: endpoints
            namespaces:
              names:
                - default
        relabel_configs:
          - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
            action: keep
            regex: 'kubernetes;https'
          # Swap the discovered port for 8444.
          - source_labels: [__address__]
            action: replace
            target_label: __address__
            regex: '(.+)(?::\d+)'
            replacement: '$1:8444'

      # Every node discovered through the Kubernetes API, default metrics path.
      - job_name: 'kubernetes-nodes'
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        kubernetes_sd_configs:
          - role: node
        # Discard the SDN pod setup/teardown latency series after scraping.
        metric_relabel_configs:
          - source_labels: [__name__]
            action: drop
            regex: 'openshift_sdn_pod_(setup|teardown)_latency(.*)'
        # Copy each Kubernetes node label onto the scraped series.
        relabel_configs:
          - action: labelmap
            regex: '__meta_kubernetes_node_label_(.+)'

      # Same node discovery as above, but scraping /metrics/cadvisor.
      - job_name: 'kubernetes-cadvisor'
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        metrics_path: /metrics/cadvisor
        kubernetes_sd_configs:
          - role: node
        # Discard the listed container_* series after scraping.
        metric_relabel_configs:
          - source_labels: [__name__]
            action: drop
            regex: 'container_(cpu_user_seconds_total|cpu_cfs_periods_total|memory_usage_bytes|memory_swap|memory_cache|last_seen|fs_(read_seconds_total|write_seconds_total|sector_(.*)|io_(.*)|reads_merged_total|writes_merged_total)|tasks_state|memory_failcnt|memory_failures_total|spec_memory_swap_limit_bytes|fs_(.*)_bytes_total|spec_(.*))'
        relabel_configs:
          - action: labelmap
            regex: '__meta_kubernetes_node_label_(.+)'

      # Service endpoints that opt in through `prometheus.io/*` annotations,
      # limited to the namespaces matched below.
      - job_name: 'kubernetes-service-endpoints'
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
          # NOTE(review): this disables server certificate verification even
          # though a ca_file is configured - confirm this is intended.
          insecure_skip_verify: true
        kubernetes_sd_configs:
          - role: endpoints
        relabel_configs:
          - source_labels: [__meta_kubernetes_namespace]
            action: keep
            regex: 'default|logging|metrics|kube-.+|openshift|openshift-.+'
          # Skip the node exporter service; the static `node` job below
          # lists hosts on port 9100 directly.
          - source_labels: [__meta_kubernetes_service_name]
            action: drop
            regex: 'prometheus-node-exporter'
          # Only services annotated prometheus.io/scrape=true are kept.
          # Quoted so YAML treats the value as a string, not a boolean.
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
            action: keep
            regex: 'true'
          # Optional per-service overrides for scheme, path and port.
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
            action: replace
            target_label: __scheme__
            regex: '(https?)'
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
            action: replace
            target_label: __metrics_path__
            regex: '(.+)'
          - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
            action: replace
            target_label: __address__
            regex: '(.+)(?::\d+);(\d+)'
            replacement: '$1:$2'
          - action: labelmap
            regex: '__meta_kubernetes_service_label_(.+)'
          - source_labels: [__meta_kubernetes_namespace]
            action: replace
            target_label: kubernetes_namespace
          - source_labels: [__meta_kubernetes_service_name]
            action: replace
            target_label: kubernetes_name

      # Statically listed hosts, scraped on port 9100.
      - job_name: 'node'
        static_configs:
          - targets:
              - 'dcposen01.lsd.co.za:9100'
              - 'dcposen02.lsd.co.za:9100'
              - 'dcposen03.lsd.co.za:9100'
              - 'dcposen04.lsd.co.za:9100'
              - 'dcposen05.lsd.co.za:9100'
              - 'dcposen06.lsd.co.za:9100'
              - 'dcposen07.lsd.co.za:9100'
              - 'dcposen08.lsd.co.za:9100'
              - 'dcposei01.lsd.co.za:9100'
              - 'dcposei02.lsd.co.za:9100'
              - 'dcposei03.lsd.co.za:9100'
              - 'dcposem02.lsd.co.za:9100'
              - 'dcposem03.lsd.co.za:9100'
              - 'dcposem01.lsd.co.za:9100'

      - job_name: 'kube-state-metrics'
        static_configs:
          - targets:
              - 'kube-state-metrics.openshift-monitoring.svc.cluster.local:8080'

      # etcd members, scraped over HTTPS with a client certificate. The files
      # under /etc/prometheus/openshift-etcd-certs/ are not defined in this
      # ConfigMap and have to be supplied separately.
      - job_name: 'openshift-etcd'
        scheme: https
        tls_config:
          ca_file: /etc/prometheus/openshift-etcd-certs/ca.crt
          cert_file: /etc/prometheus/openshift-etcd-certs/peer.crt
          key_file: /etc/prometheus/openshift-etcd-certs/peer.key
          # NOTE(review): server certificate verification is disabled even
          # though a ca_file is configured - confirm this is intended.
          insecure_skip_verify: true
        static_configs:
          - targets:
              - '172.30.48.31:2379'
              - '172.30.50.15:2379'
              - '172.30.50.16:2379'

      # HTTP probes routed through the blackbox exporter.
      - job_name: 'blackbox-exporter'
        metrics_path: /probe
        params:
          module: [http_2xx_no_ssl_check]  # Look for a HTTP 200 response.
        static_configs:
          - targets:
              - 'https://webserver-invest-prod.oseza.lsd.co.za'
              - 'https://hawkular-metrics.oseza.lsd.co.za'
        # Pass the listed target as the `target` URL parameter, keep it as the
        # `instance` label, then send the scrape to the blackbox exporter.
        relabel_configs:
          - source_labels: [__address__]
            target_label: __param_target
          - source_labels: [__param_target]
            target_label: instance
          - target_label: __address__
            replacement: 'blackbox-exporter.openshift-monitoring.svc.cluster.local:9115'

      # Hourly probe using the `tcp_cert` blackbox module against the first
      # address listed in the openshift-etcd job.
      - job_name: 'tcp_cert_check'
        scrape_interval: 1h
        metrics_path: /probe
        params:
          module: [tcp_cert]
        static_configs:
          - targets:
              - '172.30.48.31:2379'
        relabel_configs:
          - source_labels: [__address__]
            target_label: __param_target
          - source_labels: [__param_target]
            target_label: instance
          - target_label: __address__
            replacement: 'blackbox-exporter.openshift-monitoring.svc.cluster.local:9115'

      # Probes on port 22 using the `ssh_banner` blackbox module.
      # NOTE(review): dcposen01 is listed in the `node` job but not here -
      # confirm whether the omission is intentional.
      - job_name: 'blackbox-exporter-ssh'
        metrics_path: /probe
        params:
          module: [ssh_banner]
        static_configs:
          - targets:
              - 'dcposen02.lsd.co.za:22'
              - 'dcposen03.lsd.co.za:22'
              - 'dcposen04.lsd.co.za:22'
              - 'dcposen05.lsd.co.za:22'
              - 'dcposen06.lsd.co.za:22'
              - 'dcposen07.lsd.co.za:22'
              - 'dcposen08.lsd.co.za:22'
              - 'dcposei01.lsd.co.za:22'
              - 'dcposei02.lsd.co.za:22'
              - 'dcposei03.lsd.co.za:22'
              - 'dcposem02.lsd.co.za:22'
              - 'dcposem03.lsd.co.za:22'
              - 'dcposem01.lsd.co.za:22'
        relabel_configs:
          - source_labels: [__address__]  # Leave this spooky blackbox stuff alone
            target_label: __param_target  # Stuff breaks otherwise
          - source_labels: [__param_target]
            target_label: instance
          - target_label: __address__
            replacement: 'blackbox-exporter.openshift-monitoring.svc.cluster.local:9115'

      - job_name: 'elasticsearch-exporter'
        static_configs:
          - targets:
              - 'elasticsearch-exporter.openshift-monitoring.svc.cluster.local:9108'
kind: ConfigMap
Used Metrics (18)
node_systemd_unit_state
kube_node_labels
-
node_load15
-
container_cpu_cfs_throttled_seconds_total
kube_node_status_condition
-
probe_duration_seconds
-
probe_ssl_earliest_cert_expiry
kube_pod_container_status_restarts_total
kube_resourcequota
-
machine_memory_bytes
-
machine_cpu_cores
kube_pod_status_phase
-
probe_http_status_code
node_disk_bytes_read
node_disk_bytes_written
node_filesystem_avail
node_filesystem_size
elasticsearch_indices_store_size_bytes_primary