From dfca5d9f943e5260a6d1cacf3c4071987a9decfb Mon Sep 17 00:00:00 2001
From: Andi Skrgat <andi8647@gmail.com>
Date: Fri, 2 May 2025 10:26:13 +0200
Subject: [PATCH] Document monitoring on K8s

---
 .../install-memgraph/kubernetes.mdx           | 193 ++++++++++++------
 1 file changed, 133 insertions(+), 60 deletions(-)

diff --git a/pages/getting-started/install-memgraph/kubernetes.mdx b/pages/getting-started/install-memgraph/kubernetes.mdx
index 2340b27f..6375f572 100644
--- a/pages/getting-started/install-memgraph/kubernetes.mdx
+++ b/pages/getting-started/install-memgraph/kubernetes.mdx
@@ -544,71 +544,144 @@ least once in 5 minutes for a pod to be considered ready on data instances.
 Coordinators have their own probe configuration and receive pings on the NuRaft
 server, whereas data instances receive pings on the Bolt server.
 
+
+### Monitoring
+
+Memgraph supports cluster monitoring with the help of the [kube-prometheus-stack chart](https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack) and
+[Memgraph's Prometheus exporter](https://github.com/memgraph/prometheus-exporter).
+The `kube-prometheus-stack` chart should be installed independently of the HA chart with the following command:
+
+```bash
+helm install kube-prometheus-stack prometheus-community/kube-prometheus-stack -f kube_prometheus_stack_values.yaml --namespace monitoring --create-namespace
+```
+
+Including the file `kube_prometheus_stack_values.yaml` is optional. A template for the file can be found [here](https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack/values.yaml).
+If you are installing the `kube-prometheus-stack` chart in a namespace other than `default`, make sure to allow cross-namespace metric scraping. You can allow this
+by adding the following configuration to your `kube_prometheus_stack_values.yaml` file:
+
+```yaml
+prometheus:
+  prometheusSpec:
+    serviceMonitorSelectorNilUsesHelmValues: false
+```
+
+To enable monitoring when installing Memgraph's HA chart, use the following configuration options:
+
+```yaml
+prometheus:
+  enabled: true
+  namespace: monitoring
+  memgraphExporter:
+    port: 9115
+    pullFrequencySeconds: 5
+    repository: memgraph/mg-exporter
+    tag: 0.2.1
+  serviceMonitor:
+    kubePrometheusStackReleaseName: kube-prometheus-stack
+    interval: 15s
+```
+
+By setting `prometheus.enabled` to `true`, all resources from `charts/memgraph-high-availability/templates/mg-exporter.yaml` will be installed in the namespace `monitoring`.
+Check the table below to find the explanation for each of the configuration parameters.
+
+
+To uninstall the `kube-prometheus-stack` use:
+
+```bash
+helm uninstall kube-prometheus-stack --namespace monitoring
+```
+
+CRDs aren't removed by default and they should be manually cleaned up:
+
+```bash
+kubectl delete crd alertmanagerconfigs.monitoring.coreos.com
+kubectl delete crd alertmanagers.monitoring.coreos.com
+kubectl delete crd podmonitors.monitoring.coreos.com
+kubectl delete crd probes.monitoring.coreos.com
+kubectl delete crd prometheusagents.monitoring.coreos.com
+kubectl delete crd prometheuses.monitoring.coreos.com
+kubectl delete crd prometheusrules.monitoring.coreos.com
+kubectl delete crd scrapeconfigs.monitoring.coreos.com
+kubectl delete crd servicemonitors.monitoring.coreos.com
+kubectl delete crd thanosrulers.monitoring.coreos.com
+```
+
+
+
 ### Configuration options
 
 The following table lists the configurable parameters of the Memgraph HA chart and their default values.
 
 
-| Parameter                                                | Description                                                                                                  | Default                    |
-| -------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------ | -------------------------- |
-| `image.repository`                                       | Memgraph Docker image repository                                                                             | `memgraph/memgraph`        |
-| `image.tag`                                              | Specific tag for the Memgraph Docker image. Overrides the image tag whose default is chart version.          | `3.1.0`                    |
-| `image.pullPolicy`                                       | Image pull policy                                                                                            | `IfNotPresent`             |
-| `env.MEMGRAPH_ENTERPRISE_LICENSE`                        | Memgraph enterprise license                                                                                  | `<your-license>`           |
-| `env.MEMGRAPH_ORGANIZATION_NAME`                         | Organization name                                                                                            | `<your-organization-name>` |
-| `storage.libPVCSize`                                     | Size of the storage PVC                                                                                      | `1Gi`                      |
-| `storage.libStorageClassName`                            | The name of the storage class used for storing data.                                                         | `""`                       |
-| `storage.libStorageAccessMode`                           | Access mode used for lib storage.                                                                            | `ReadWriteOnce`            |
-| `storage.logPVCSize`                                     | Size of the log PVC                                                                                          | `1Gi`                      |
-| `storage.logStorageClassName`                            | The name of the storage class used for storing logs.                                                         | `""`                       |
-| `storage.logStorageAccessMode`                           | Access mode used for log storage.                                                                            | `ReadWriteOnce`            |
-| `externalAccess.coordinator.serviceType`                 | IngressNginx, NodePort, CommonLoadBalancer or LoadBalancer. By default, no external service will be created. | `""`                       |
-| `externalAccess.coordinator.annotations`                 | Annotations for external services attached to coordinators.                                                  | `{}`                       |
-| `externalAccess.dataInstance.serviceType`                | IngressNginx, NodePort or LoadBalancer. By default, no external service will be created.                     | `""`                       |
-| `externalAccess.dataInstance.annotations`                | Annotations for external services attached to data instances.                                                | `{}`                       |
-| `headlessService.enabled`                                | Specifies whether headless services will be used inside K8s network on all instances.                        | `false`                    |
-| `ports.boltPort`                                         | Bolt port used on coordinator and data instances.                                                            | `7687`                     |
-| `ports.managementPort`                                   | Management port used on coordinator and data instances.                                                      | `10000`                    |
-| `ports.replicationPort`                                  | Replication port used on data instances.                                                                     | `20000`                    |
-| `ports.coordinatorPort`                                  | Coordinator port used on coordinators.                                                                       | `12000`                    |
-| `affinity.unique`                                        | Schedule pods on different nodes in the cluster                                                              | `false`                    |
-| `affinity.parity`                                        | Schedule pods on the same node with maximum one coordinator and one data node                                | `false`                    |
-| `affinity.nodeSelection`                                 | Schedule pods on nodes with specific labels                                                                  | `false`                    |
-| `affinity.roleLabelKey`                                  | Label key for node selection                                                                                 | `role`                     |
-| `affinity.dataNodeLabelValue`                            | Label value for data nodes                                                                                   | `data-node`                |
-| `affinity.coordinatorNodeLabelValue`                     | Label value for coordinator nodes                                                                            | `coordinator-node`         |
-| `container.data.livenessProbe.tcpSocket.port`            | Port used for TCP connection. Should be the same as bolt port.                                               | `7687`                     |
-| `container.data.livenessProbe.failureThreshold`          | Failure threshold for liveness probe                                                                         | `20`                       |
-| `container.data.livenessProbe.timeoutSeconds`            | Timeout for liveness probe                                                                                   | `10`                       |
-| `container.data.livenessProbe.periodSeconds`             | Period seconds for readiness probe                                                                           | `5`                        |
-| `container.data.readinessProbe.tcpSocket.port`           | Port used for TCP connection. Should be the same as bolt port.                                               | `7687`                     |
-| `container.data.readinessProbe.failureThreshold`         | Failure threshold for readiness probe                                                                        | `20`                       |
-| `container.data.readinessProbe.timeoutSeconds`           | Timeout for readiness probe                                                                                  | `10`                       |
-| `container.data.readinessProbe.periodSeconds`            | Period seconds for readiness probe                                                                           | `5`                        |
-| `container.data.startupProbe.tcpSocket.port`             | Port used for TCP connection. Should be the same as bolt port.                                               | `7687`                     |
-| `container.data.startupProbe.failureThreshold`           | Failure threshold for startup probe                                                                          | `1440`                     |
-| `container.data.startupProbe.timeoutSeconds`             | Timeout for probe                                                                                            | `10`                       |
-| `container.data.startupProbe.periodSeconds`              | Period seconds for startup probe                                                                             | `10`                       |
-| `container.coordinators.livenessProbe.tcpSocket.port`    | Port used for TCP connection. Should be the same as bolt port.                                               | `12000`                    |
-| `container.coordinators.livenessProbe.failureThreshold`  | Failure threshold for liveness probe                                                                         | `20`                       |
-| `container.coordinators.livenessProbe.timeoutSeconds`    | Timeout for liveness probe                                                                                   | `10`                       |
-| `container.coordinators.livenessProbe.periodSeconds`     | Period seconds for readiness probe                                                                           | `5`                        |
-| `container.coordinators.readinessProbe.tcpSocket.port`   | Port used for TCP connection. Should be the same as bolt port.                                               | `12000`                    |
-| `container.coordinators.readinessProbe.failureThreshold` | Failure threshold for readiness probe                                                                        | `20`                       |
-| `container.coordinators.readinessProbe.timeoutSeconds`   | Timeout for readiness probe                                                                                  | `10`                       |
-| `container.coordinators.readinessProbe.periodSeconds`    | Period seconds for readiness probe                                                                           | `5`                        |
-| `container.coordinators.startupProbe.tcpSocket.port`     | Port used for TCP connection. Should be the same as bolt port.                                               | `12000`                    |
-| `container.coordinators.startupProbe.failureThreshold`   | Failure threshold for startup probe                                                                          | `1440`                     |
-| `container.coordinators.startupProbe.timeoutSeconds`     | Timeout for probe                                                                                            | `10`                       |
-| `container.coordinators.startupProbe.periodSeconds`      | Period seconds for startup probe                                                                             | `10`                       |
-| `data`                                                   | Configuration for data instances                                                                             | See `data` section         |
-| `coordinators`                                           | Configuration for coordinator instances                                                                      | See `coordinators` section |
-| `sysctlInitContainer.enabled`                            | Enable the init container to set sysctl parameters                                                           | `true`                     |
-| `sysctlInitContainer.maxMapCount`                        | Value for `vm.max_map_count` to be set by the init container                                                 | `262144`                   |
-| `secrets.enabled`                                        | Enable the use of Kubernetes secrets for Memgraph credentials                                                | `false`                    |
-| `secrets.name`                                           | The name of the Kubernetes secret containing Memgraph credentials                                            | `memgraph-secrets`         |
-| `secrets.userKey`                                        | The key in the Kubernetes secret for the Memgraph user, the value is passed to the `MEMGRAPH_USER` env.      | `USER`                     |
-| `secrets.passwordKey`                                    | The key in the Kubernetes secret for the Memgraph password, the value is passed to the `MEMGRAPH_PASSWORD`.  | `PASSWORD`                 |
+| Parameter                                                  | Description                                                                                                  | Default                        |
+| ---------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------ | ------------------------------ |
+| `image.repository`                                         | Memgraph Docker image repository                                                                             | `memgraph/memgraph`            |
+| `image.tag`                                                | Specific tag for the Memgraph Docker image. Overrides the image tag whose default is chart version.          | `3.1.0`                        |
+| `image.pullPolicy`                                         | Image pull policy                                                                                            | `IfNotPresent`                 |
+| `env.MEMGRAPH_ENTERPRISE_LICENSE`                          | Memgraph enterprise license                                                                                  | `<your-license>`               |
+| `env.MEMGRAPH_ORGANIZATION_NAME`                           | Organization name                                                                                            | `<your-organization-name>`     |
+| `storage.libPVCSize`                                       | Size of the storage PVC                                                                                      | `1Gi`                          |
+| `storage.libStorageClassName`                              | The name of the storage class used for storing data.                                                         | `""`                           |
+| `storage.libStorageAccessMode`                             | Access mode used for lib storage.                                                                            | `ReadWriteOnce`                |
+| `storage.logPVCSize`                                       | Size of the log PVC                                                                                          | `1Gi`                          |
+| `storage.logStorageClassName`                              | The name of the storage class used for storing logs.                                                         | `""`                           |
+| `storage.logStorageAccessMode`                             | Access mode used for log storage.                                                                            | `ReadWriteOnce`                |
+| `externalAccess.coordinator.serviceType`                   | IngressNginx, NodePort, CommonLoadBalancer or LoadBalancer. By default, no external service will be created. | `""`                           |
+| `externalAccess.coordinator.annotations`                   | Annotations for external services attached to coordinators.                                                  | `{}`                           |
+| `externalAccess.dataInstance.serviceType`                  | IngressNginx, NodePort or LoadBalancer. By default, no external service will be created.                     | `""`                           |
+| `externalAccess.dataInstance.annotations`                  | Annotations for external services attached to data instances.                                                | `{}`                           |
+| `headlessService.enabled`                                  | Specifies whether headless services will be used inside K8s network on all instances.                        | `false`                        |
+| `ports.boltPort`                                           | Bolt port used on coordinator and data instances.                                                            | `7687`                         |
+| `ports.managementPort`                                     | Management port used on coordinator and data instances.                                                      | `10000`                        |
+| `ports.replicationPort`                                    | Replication port used on data instances.                                                                     | `20000`                        |
+| `ports.coordinatorPort`                                    | Coordinator port used on coordinators.                                                                       | `12000`                        |
+| `affinity.unique`                                          | Schedule pods on different nodes in the cluster                                                              | `false`                        |
+| `affinity.parity`                                          | Schedule pods on the same node with maximum one coordinator and one data node                                | `false`                        |
+| `affinity.nodeSelection`                                   | Schedule pods on nodes with specific labels                                                                  | `false`                        |
+| `affinity.roleLabelKey`                                    | Label key for node selection                                                                                 | `role`                         |
+| `affinity.dataNodeLabelValue`                              | Label value for data nodes                                                                                   | `data-node`                    |
+| `affinity.coordinatorNodeLabelValue`                       | Label value for coordinator nodes                                                                            | `coordinator-node`             |
+| `container.data.livenessProbe.tcpSocket.port`              | Port used for TCP connection. Should be the same as bolt port.                                               | `7687`                         |
+| `container.data.livenessProbe.failureThreshold`            | Failure threshold for liveness probe                                                                         | `20`                           |
+| `container.data.livenessProbe.timeoutSeconds`              | Timeout for liveness probe                                                                                   | `10`                           |
+| `container.data.livenessProbe.periodSeconds`               | Period seconds for liveness probe                                                                            | `5`                            |
+| `container.data.readinessProbe.tcpSocket.port`             | Port used for TCP connection. Should be the same as bolt port.                                               | `7687`                         |
+| `container.data.readinessProbe.failureThreshold`           | Failure threshold for readiness probe                                                                        | `20`                           |
+| `container.data.readinessProbe.timeoutSeconds`             | Timeout for readiness probe                                                                                  | `10`                           |
+| `container.data.readinessProbe.periodSeconds`              | Period seconds for readiness probe                                                                           | `5`                            |
+| `container.data.startupProbe.tcpSocket.port`               | Port used for TCP connection. Should be the same as bolt port.                                               | `7687`                         |
+| `container.data.startupProbe.failureThreshold`             | Failure threshold for startup probe                                                                          | `1440`                         |
+| `container.data.startupProbe.timeoutSeconds`               | Timeout for probe                                                                                            | `10`                           |
+| `container.data.startupProbe.periodSeconds`                | Period seconds for startup probe                                                                             | `10`                           |
+| `container.coordinators.livenessProbe.tcpSocket.port`      | Port used for TCP connection. Should be the same as coordinator port.                                        | `12000`                        |
+| `container.coordinators.livenessProbe.failureThreshold`    | Failure threshold for liveness probe                                                                         | `20`                           |
+| `container.coordinators.livenessProbe.timeoutSeconds`      | Timeout for liveness probe                                                                                   | `10`                           |
+| `container.coordinators.livenessProbe.periodSeconds`       | Period seconds for liveness probe                                                                            | `5`                            |
+| `container.coordinators.readinessProbe.tcpSocket.port`     | Port used for TCP connection. Should be the same as coordinator port.                                        | `12000`                        |
+| `container.coordinators.readinessProbe.failureThreshold`   | Failure threshold for readiness probe                                                                        | `20`                           |
+| `container.coordinators.readinessProbe.timeoutSeconds`     | Timeout for readiness probe                                                                                  | `10`                           |
+| `container.coordinators.readinessProbe.periodSeconds`      | Period seconds for readiness probe                                                                           | `5`                            |
+| `container.coordinators.startupProbe.tcpSocket.port`       | Port used for TCP connection. Should be the same as coordinator port.                                        | `12000`                        |
+| `container.coordinators.startupProbe.failureThreshold`     | Failure threshold for startup probe                                                                          | `1440`                         |
+| `container.coordinators.startupProbe.timeoutSeconds`       | Timeout for probe                                                                                            | `10`                           |
+| `container.coordinators.startupProbe.periodSeconds`        | Period seconds for startup probe                                                                             | `10`                           |
+| `data`                                                     | Configuration for data instances                                                                             | See `data` section             |
+| `coordinators`                                             | Configuration for coordinator instances                                                                      | See `coordinators` section     |
+| `sysctlInitContainer.enabled`                              | Enable the init container to set sysctl parameters                                                           | `true`                         |
+| `sysctlInitContainer.maxMapCount`                          | Value for `vm.max_map_count` to be set by the init container                                                 | `262144`                       |
+| `secrets.enabled`                                          | Enable the use of Kubernetes secrets for Memgraph credentials                                                | `false`                        |
+| `secrets.name`                                             | The name of the Kubernetes secret containing Memgraph credentials                                            | `memgraph-secrets`             |
+| `secrets.userKey`                                          | The key in the Kubernetes secret for the Memgraph user, the value is passed to the `MEMGRAPH_USER` env.      | `USER`                         |
+| `secrets.passwordKey`                                      | The key in the Kubernetes secret for the Memgraph password, the value is passed to the `MEMGRAPH_PASSWORD`.  | `PASSWORD`                     |
+| `prometheus.enabled`                                       | If set to `true`, K8s resources representing Memgraph's Prometheus exporter will be deployed.                | `false`                        |
+| `prometheus.namespace`                                     | The namespace in which `kube-prometheus-stack` and Memgraph's Prometheus exporter are installed.             | `monitoring`                   |
+| `prometheus.memgraphExporter.port`                         | The port on which Memgraph's Prometheus exporter is available.                                               | `9115`                         |
+| `prometheus.memgraphExporter.pullFrequencySeconds`         | How often Memgraph's Prometheus exporter pulls data from Memgraph instances.                                 | `5`                            |
+| `prometheus.memgraphExporter.repository`                   | The repository where Memgraph's Prometheus exporter image is available.                                      | `memgraph/prometheus-exporter` |
+| `prometheus.memgraphExporter.tag`                          | The tag of Memgraph's Prometheus exporter image.                                                             | `0.2.1`                        |
+| `prometheus.serviceMonitor.kubePrometheusStackReleaseName` | The release name under which `kube-prometheus-stack` chart is installed.                                     | `kube-prometheus-stack`        |
+| `prometheus.serviceMonitor.interval`                       | How often Prometheus pulls data from Memgraph's Prometheus exporter.                                         | `15s`                          |
+
 
 For the `data` and `coordinators` sections, each item in the list has the following parameters: