monitor

[!NOTE|label:references]

code pool

prometheus

[!NOTE|label:references:]

grafana

environment

[!TIP]

  • reference Do not use environment variables to add new configuration settings. Instead, use environmental variables to override existing options.

To override an option: ```bash GF__ ```

If you need to specify the version of a plugin, then you can add it to the `GF_INSTALL_PLUGINS` environment variable. Otherwise, the latest is used. For example: ```bash -e "GF_INSTALL_PLUGINS=grafana-clock-panel 1.0.1,grafana-simple-json-datasource 1.3.5" ```

# for pvc
kubems-01 ~ $ mkdir -p /opt/grafana/plugins
kubems-01 ~ $ sudo chown -R 472:472 $_

# https://grafana.com/docs/grafana/latest/installation/docker/#migration-from-a-previous-version-of-the-docker-container-to-5-1-or-later
# 104:104 for version < 5.1
  • user id changes

    Version
    User
    User ID

    < 5.1

    grafana

    104

    >= 5.1

    grafana

    472

  • path

    SETTING
    DEFAULT VALUE

    GF_PATHS_CONFIG

    /etc/grafana/grafana.ini

    GF_PATHS_DATA

    /var/lib/grafana

    GF_PATHS_HOME

    /usr/share/grafana

    GF_PATHS_LOGS

    /var/log/grafana

    GF_PATHS_PLUGINS

    /var/lib/grafana/plugins

    GF_PATHS_PROVISIONING

    /etc/grafana/provisioning

  • environment

    ENVIRONMENT VARIABLE
    EXAMPLE

    GF_INSTALL_PLUGINS

    grafana-kubernetes-app,grafana-piechart-panel,http://my.com/plugin.zip

    GF_PATHS_PLUGINS

    /data/grafana/plugins

    GF_SERVER_ROOT_URL

    https://grafana.domain.com

    GF_SECURITY_ADMIN_PASSWORD

    admin

    GF_DEFAULT_INSTANCE_NAME

    -

    GF_SECURITY_ADMIN_USER

    -

    GF_DATABASE_TYPE

    mysql

    GF_DATABASE_HOST

    -

setup

  • ns

    $ cat << EOF | kubectl apply -f -
    ---
    kind: Namespace
    apiVersion: v1
    metadata:
      name: kubernetes-dashboard
      labels:
        name: kubernetes-dashboard
    EOF
  • sa

    $ cat << EOF | kubectl apply -f -
    ---
    apiVersion: v1
    kind: ServiceAccount
    metadata:
      labels:
        k8s-app: grafana
      name: grafana-admin
      namespace: kubernetes-dashboard
    ---
    apiVersion: rbac.authorization.k8s.io/v1beta1
    kind: ClusterRoleBinding
    metadata:
      name: grafana-admin
      labels:
        k8s-app: grafana
    roleRef:
      apiGroup: rbac.authorization.k8s.io
      kind: ClusterRole
      name: cluster-admin
    subjects:
    - kind: ServiceAccount
      name: grafana
      namespace: kubernetes-dashboard
    EOF
  • pv & pvc

    $ cat << EOF | kubectl apply -f -
    ---
    kind: PersistentVolume
    apiVersion: v1
    metadata:
      name: grafana
      labels:
        type: local
    spec:
      storageClassName: manual
      capacity:
        storage: 2Gi
      accessModes:
        - ReadWriteMany
      hostPath:
        path: "/opt/grafana"
    ---
    kind: PersistentVolumeClaim
    apiVersion: v1
    metadata:
      name: grafana
      namespace: kubernetes-dashboard
    spec:
      accessModes:
      - ReadWriteMany
      resources:
        requests:
          storage: 2Gi
      storageClassName: "manual"
      volumeName: grafana
    EOF
  • deploy

    $ cat << EOF | kubectl apply -f -
    ---
    apiVersion: extensions/v1beta1
    kind: Deployment
    metadata:
      name: grafana
      namespace: kubernetes-dashboard
      labels:
        app: grafana
        component: core
    spec:
      replicas: 1
      strategy:
        rollingUpdate:
          maxSurge: 25%
          maxUnavailable: 25%
        type: RollingUpdate
      template:
        metadata:
          labels:
            app: grafana
            component: core
        spec:
          affinity:
            nodeAffinity:
              requiredDuringSchedulingIgnoredDuringExecution:
                nodeSelectorTerms:
                - matchExpressions:
                  - key: kubernetes.io/hostname
                    operator: In
                    values:
                    - kubems-01
          containers:
          - image: grafana/grafana
            name: grafana
            imagePullPolicy: IfNotPresent
            resources:
              # keep request = limit to keep this container in guaranteed class
              limits:
                cpu: 512m
                memory: 512Mi
              requests:
                cpu: 256m
                memory: 256Mi
            env:
              # The following env variables set up basic auth twith the default admin user and admin password.
              - name: GF_SERVER_DOMAIN
                value: "grafana.marslo.com"
              - name: GF_SECURITY_ADMIN_PASSWORD
                value: "mypasswd"
              - name: GF_INSTALL_PLUGINS
                value: "grafana-kubernetes-app"
              - name: GF_SERVER_ROOT_URL
                value: "/"
              - name: GF_AUTH_BASIC_ENABLED
                value: "true"
              - name: GF_AUTH_ANONYMOUS_ENABLED
                value: "false"
            readinessProbe:
              httpGet:
                path: /login
                port: 3000
            volumeMounts:
            - name: grafana-persistent-storage
              mountPath: /var/lib/grafana
          restartPolicy: Always
          terminationGracePeriodSeconds: 30
          serviceAccountName: grafana
          volumes:
          - name: grafana-persistent-storage
            persistentVolumeClaim:
              claimName: grafana
    EOF
  • svc

    $ cat << EOF | kubectl apply -f -
    ---
    apiVersion: v1
    kind: Service
    metadata:
      name: grafana
      namespace: kubernetes-dashboard
      labels:
        app: grafana
        component: core
    spec:
      type: CluslterIP
      ports:
        - name: http
          port: 3000
          protocol: TCP
      selector:
        app: grafana
        component: core
    EOF
  • ingress

    $ cat << EOF | kubectl apply -f -
    apiVersion: extensions/v1beta1
    kind: Ingress
    metadata:
      name: grafana
      namespace: kubernetes-dashboard
    spec:
      rules:
      - host: grafana.marslo.com
       http:
         paths:
         - path: /
           backend:
            serviceName: grafana
            servicePort: http
      tls:
      - hosts:
        - grafana.marslo.com
        secretName: marslo-cert
    EOF

dashboard

[!NOTE|label:references:]

kubernetes cluster monitoring

  • devopsprodigy-kubegraf-app

    • install via grafana-cli

      $ grafana-cli plugins install devopsprodigy-kubegraf-app
    • setup sa and cluster role

      $ kubectl create ns kubegraf
      $ kubectl apply -f https://raw.githubusercontent.com/devopsprodigy/kubegraf/master/kubernetes/serviceaccount.yaml
      $ kubectl apply -f https://raw.githubusercontent.com/devopsprodigy/kubegraf/master/kubernetes/clusterrole.yaml
      $ kubectl apply -f https://raw.githubusercontent.com/devopsprodigy/kubegraf/master/kubernetes/clusterrolebinding.yaml
      $ kubectl apply -f https://raw.githubusercontent.com/devopsprodigy/kubegraf/master/kubernetes/secret.yaml
    • create private certificate

copy `/etc/kubernetes/pki/grafana-kubegraf.crt` to all masters
```bash
$ openssl genrsa -out ~/grafana-kubegraf.key 2048
$ openssl req -new -key ~/grafana-kubegraf.key -out ~/grafana-kubegraf.csr -subj "/CN=grafana-kubegraf/O=monitoring"
$ openssl x509 -req -in ~/grafana-kubegraf.csr -CA /etc/kubernetes/pki/ca.crt -CAkey /etc/kubernetes/pki/ca.key -out /etc/kubernetes/pki/grafana-kubegraf.crt -CAcreateserial
```
- get token
  ```bash
  $ kubectl get secret grafana-kubegraf-secret -o jsonpath={.data.token} -n kubegraf | base64 -d
  ```

@deprecated

  • install via grafana-cli

    $ grafana-cli plugins install kubernetes-app
  • setup

    $ cat ~/.kube/config | grep certificate-authority-data | awk '{print $2}' | base64 -d
    $ cat ~/.kube/config | grep client-certificate-data | awk '{print $2}' | base64 -d
    $ cat ~/.kube/config | grep client-key-data | awk '{print $2}' | base64 -d

cluster

node

namespace

pod

jenkins

grafana settings

  • cluster

    label_values(kube_pod_info, cluster)
    • or

      label_values(node_cpu_seconds_total, cluster)
  • instance

    label_values(apiserver_request_total{job="apiserver"}, instance)
  • node

    label_values(kube_node_info{cluster="$cluster"}, node)
    • or

      label_values(kubernetes_io_hostname)
  • namespace

    label_values(kube_pod_info{cluster="$cluster"}, namespace)
  • pod

    label_values(kube_pod_info{cluster="$cluster", namespace="$namespace"}, pod)

api

references:

reset password

$ curl -X PUT -H "Content-Type: application/json" -d '{
  "oldPassword": "admin",
  "newPassword": "newpass",
  "confirmNew": "newpass"
}' http://admin:admin@<your_grafana_host>:3000/api/user/password

set grafana admin password

$ docker exec -t grafana \
              bash -c 'grafana-cli --homepath /usr/share/grafana admin reset-admin-password "{{ grafana_passwd }}"'

check if grafana can be accessed

$ curl -s http://localhost:3000/api/org \
       -u {{ grafana_user }}:{{ grafana_passwd }}

create datasoure in grafana

$ curl -L \
       --header 'Content-Type: application/json' \
       --header 'Accept: application/json' \
       -d@//docker_data/grafana_conf/grafanacfg.output http://localhost:3000/api/datasources \
       -u {{ grafana_user }}:{{ grafana_passwd }}

create TelegrafHost dashboard in grafana using jsonfile

$ curl -L \
       --header 'Content-Type: application/json' \
       --header 'Accept: application/json' \
       -d@/home/Configfiles/dashboards/TelegrafHostMetrics.json http://localhost:3000/api/dashboards/db \
       -u {{ grafana_user }}:{{ grafana_passwd }}

create DockerMetrics dashboard in grafana using jsonfile

$ curl --header 'Content-Type: application/json' \
       --header 'Accept: application/json' \
       -d@/home/Configfiles/dashboards/DockerMetricsperContainer.json http://localhost:3000/api/dashboards/db \
       -u {{ grafana_user }}:{{ grafana_passwd }}

create JVM dashboard in grafana using jsonfile

$ curl --header 'Content-Type: application/json' \
       --header 'Accept: application/json' \
       -d@/home/Configfiles/dashboards/jvm-metrics-jolokia-2_rev1.json http://localhost:3000/api/dashboards/db \
       -u {{ grafana_user }}:{{ grafana_passwd }}

add-ons

metric server

[!NOTE|label:references:]

sample yaml

apiVersion: v1
kind: ServiceAccount
metadata:
  labels:
    k8s-app: metrics-server
  name: metrics-server
  namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  labels:
    k8s-app: metrics-server
    rbac.authorization.k8s.io/aggregate-to-admin: "true"
    rbac.authorization.k8s.io/aggregate-to-edit: "true"
    rbac.authorization.k8s.io/aggregate-to-view: "true"
  name: system:aggregated-metrics-reader
rules:
- apiGroups:
  - metrics.k8s.io
  resources:
  - pods
  - nodes
  verbs:
  - get
  - list
  - watch
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  labels:
    k8s-app: metrics-server
  name: system:metrics-server
rules:
- apiGroups:
  - ""
  resources:
  - nodes/metrics
  verbs:
  - get
- apiGroups:
  - ""
  resources:
  - pods
  - nodes
  verbs:
  - get
  - list
  - watch
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  labels:
    k8s-app: metrics-server
  name: metrics-server-auth-reader
  namespace: kube-system
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: extension-apiserver-authentication-reader
subjects:
- kind: ServiceAccount
  name: metrics-server
  namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  labels:
    k8s-app: metrics-server
  name: metrics-server:system:auth-delegator
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: system:auth-delegator
subjects:
- kind: ServiceAccount
  name: metrics-server
  namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  labels:
    k8s-app: metrics-server
  name: system:metrics-server
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: system:metrics-server
subjects:
- kind: ServiceAccount
  name: metrics-server
  namespace: kube-system
---
apiVersion: v1
kind: Service
metadata:
  labels:
    k8s-app: metrics-server
  name: metrics-server
  namespace: kube-system
spec:
  ports:
  - name: https
    port: 443
    protocol: TCP
    targetPort: https
  selector:
    k8s-app: metrics-server
---
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    k8s-app: metrics-server
  name: metrics-server
  namespace: kube-system
spec:
  selector:
    matchLabels:
      k8s-app: metrics-server
  strategy:
    rollingUpdate:
      maxUnavailable: 0
  template:
    metadata:
      labels:
        k8s-app: metrics-server
    spec:
      containers:
      - args:
        - --cert-dir=/tmp
        - --secure-port=4443
        - --kubelet-preferred-address-types=InternalIP,ExternalIP,Hostname
        - --kubelet-use-node-status-port
        - --kubelet-insecure-tls
        - --metric-resolution=15s
        image: registry.k8s.io/metrics-server/metrics-server:v0.6.3
        imagePullPolicy: IfNotPresent
        livenessProbe:
          failureThreshold: 3
          httpGet:
            path: /livez
            port: https
            scheme: HTTPS
          periodSeconds: 10
        name: metrics-server
        ports:
        - containerPort: 4443
          name: https
          protocol: TCP
        readinessProbe:
          failureThreshold: 3
          httpGet:
            path: /readyz
            port: https
            scheme: HTTPS
          initialDelaySeconds: 20
          periodSeconds: 10
        resources:
          requests:
            cpu: 100m
            memory: 200Mi
        securityContext:
          allowPrivilegeEscalation: false
          readOnlyRootFilesystem: true
          runAsNonRoot: true
          runAsUser: 1000
        volumeMounts:
        - mountPath: /tmp
          name: tmp-dir
      nodeSelector:
        kubernetes.io/os: linux
      priorityClassName: system-cluster-critical
      serviceAccountName: metrics-server
      volumes:
      - emptyDir: {}
        name: tmp-dir
---
apiVersion: apiregistration.k8s.io/v1
kind: APIService
metadata:
  labels:
    k8s-app: metrics-server
  name: v1beta1.metrics.k8s.io
spec:
  group: metrics.k8s.io
  groupPriorityMinimum: 100
  insecureSkipTLSVerify: true
  service:
    name: metrics-server
    namespace: kube-system
  version: v1beta1
  versionPriority: 100

q&a

$ curl -k https://api.server.com:6443/api/v1/model/metrics/
{
  "kind": "Status",
  "apiVersion": "v1",
  "metadata": {

  },
  "status": "Failure",
  "message": "model \"metrics\" is forbidden: User \"system:anonymous\" cannot get resource \"model\" in API group \"\" at the cluster scope",
  "reason": "Forbidden",
  "details": {
    "name": "metrics",
    "kind": "model"
  },
  "code": 403
}

$ kubectl get --raw "/apis/metrics.k8s.io/v1beta1/pods"
Error from server (ServiceUnavailable): the server is currently unable to handle the request

$ kubectl top node
Error from server (ServiceUnavailable): the server is currently unable to handle the request (get nodes.metrics.k8s.io)

$ kubectl describe apiservice v1beta1.metrics.k8s.io
Name:         v1beta1.metrics.k8s.io
Namespace:
Labels:       k8s-app=metrics-server
Annotations:  kubectl.kubernetes.io/last-applied-configuration:
                {"apiVersion":"apiregistration.k8s.io/v1","kind":"APIService","metadata":{"annotations":{},"labels":{"k8s-app":"metrics-server"},"name":"v...
API Version:  apiregistration.k8s.io/v1
Kind:         APIService
Metadata:
  Creation Timestamp:  2023-05-10T03:35:06Z
  Resource Version:    404029923
  Self Link:           /apis/apiregistration.k8s.io/v1/apiservices/v1beta1.metrics.k8s.io
  UID:                 a7f6a96d-eee3-11ed-9c0f-b883034b82d0
Spec:
  Group:                     metrics.k8s.io
  Group Priority Minimum:    100
  Insecure Skip TLS Verify:  true
  Service:
    Name:            metrics-server
    Namespace:       kube-system
  Version:           v1beta1
  Version Priority:  100
Status:
  Conditions:
    Last Transition Time:  2023-05-10T03:35:06Z
    Message:               endpoints for service/metrics-server in "kube-system" have no addresses
    Reason:                MissingEndpoints
    Status:                False
    Type:                  Available
Events:                    <none>

kube-state-metrics

[!INFO|label:references:]

installation

$ helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
$ helm repo update
$ helm install kube-state-metrics prometheus-community/kube-state-metrics

# others
$ helm show values prometheus-community/kube-state-metrics
$ helm install -f config.yaml kube-state-metrics prometheus-community/kube-state-metrics

$ kubectl port-forward svc/kube-state-metrics 8080:8080
$ curl 127.0.0.1:8080/metrics | grep kube_node_status_capacity

# cost considerations
# values.yaml
metricAllowlist:
 - kube_node_info
 - kube_job_status_active

scripts

$ cat bin/node-resources.sh
#!/bin/bash
set -euo pipefail

echo -e "Iterating...\n"

nodes=$(kubectl get node --no-headers -o custom-columns=NAME:.metadata.name)

for node in $nodes; do
  echo "Node: $node"
  kubectl describe node "$node" | sed '1,/Non-terminated Pods/d'
  echo
done
  • one-line

    $ kubectl get nodes |
              awk '{print $1}' |
              xargs -I {} sh -c 'echo  {} ; kubectl describe node {} | grep Allocated -A 5 | grep -ve Event -ve Allocated -ve percent -ve -- ; echo '
    
    # alias
    $ alias util='kubectl get nodes | awk '\''{print $1}'\'' | xargs -I {} sh -c '\''echo   {} ; kubectl describe node {} | grep Allocated -A 5 | grep -ve Event -ve Allocated -ve percent -ve -- ; echo '\'''
    # or
    $ alias util='kubectl get nodes -o 'jsonpath={.items[*].metadata.name} | fmt -1 | xargs -I {} sh -c '\''echo   {} ; kubectl describe node {} | grep Allocated -A 5 | grep -ve Event -ve Allocated -ve percent -ve -- ; echo '\'''
    # or
    $ alias util='kubectl get no --no-headers -o=custom-columns=NAME:.metadata.name | xargs -I {} sh -c '\''echo   {} ; kubectl describe node {} | grep Allocated -A 5 | grep -ve Event -ve Allocated -ve percent -ve -- ; echo '\'''

Last updated