Monitoring Integrations

Integrations for Sysdig Monitor span platforms, orchestrators, and a wide range of applications, and are designed to extend Monitor capabilities and collect metrics from these systems. Sysdig collects metrics from Prometheus, JMX, StatsD, Kubernetes, and a number of applications to provide a 360-degree view of your infrastructure. Many metrics are collected out of the box; you can also extend the integrations or create custom metrics to receive curated insights into your infrastructure stack.

Key Benefits

• Collects the richest data set for cloud-native visibility and security.

• Polls data and auto-discovers context to provide operational and security insights.

• Simplifies deploying your monitoring integrations by providing guided configuration, a curated list of enterprise-grade images, and integration with CI/CD workflows.

• Extends the power of Prometheus metrics with additional insight from other metric types and the rest of your infrastructure stack.

• Employs Prometheus alerts and events, and provides ready-to-use dashboards for Kubernetes monitoring needs.

• Exposes application metrics using Java JMX and MBeans monitoring.

Key Integrations

Inbound

• Monitoring Integrations

  Describes how to configure Monitoring Integrations in your infrastructure and receive deeper insight into the health and performance of your services across platforms and the cloud.

• Prometheus Metrics

  Describes how the Sysdig agent automatically collects metrics from services that expose native Prometheus metrics as well as from applications with Prometheus exporters, how to set up your environment, and how to scrape Prometheus metrics seamlessly.

• Agent Installation

  Learn how to install Sysdig agents on supported platforms.

• AWS CloudWatch

  Illustrates how to configure Sysdig to collect various types of CloudWatch metrics.

• Java Management Extension (JMX) Metrics

  Describes how to configure your Java virtual machines so the Sysdig agent can collect JMX metrics using the JMX protocol.

• StatsD Metrics

  Describes how the Sysdig agent collects custom StatsD metrics with an embedded StatsD server.

• Node.js Metrics

  Illustrates how Sysdig monitors Node.js applications by linking a library to the Node.js codebase.

• Monitor Log Files

  Learn how to search for a string by using the chisel script called logwatcher.

• (legacy) Integrate Applications

  Describes the monitoring capabilities of the Sysdig agent with application check scripts, or ‘app checks’.

Outbound

• Notification Channels

  Learn how to add, edit, or delete a variety of notification channel types, and how to disable or delete notifications when they are not needed, for example, during scheduled downtime.

• S3 Capture Storage

  Learn how to configure Sysdig to use an AWS S3 bucket or custom S3 storage for storing Capture files.

Platform Metrics (IBM)

For Sysdig instances deployed on IBM Cloud Monitoring with Sysdig, an additional form of metrics collection is offered: Platform metrics. Rather than being collected by the Sysdig agent, Platform metrics, when enabled, are reported to Sysdig directly by the IBM Cloud infrastructure.

Enable this feature by logging in to the IBM Cloud console and selecting “Enable” for IBM Platform metrics under the Configure your resource section when creating a new IBM Cloud Monitoring with Sysdig instance.

1 - Configure Monitoring Integrations

Monitoring Integrations provide an at-a-glance summary of the workloads running in your infrastructure and deeper insight into the health and performance of your services across platforms and the cloud. You can easily identify the workloads in your team scope and the services discovered within each workload (such as etcd), and configure the Prometheus exporter integration to collect and visualize time-series metrics. Monitoring Integrations also power the Alerts Library.

The following statuses indicate the state of each service integration:

• Reporting Metrics: The integration is configured correctly and is reporting metrics.

• Needs Attention: The integration has stopped working and is no longer reporting metrics, or requires some other type of attention.

• Pending Metrics: The integration has recently been configured and is waiting to receive metrics.

• Configure Integration: The integration needs to be configured, and therefore no metrics are reported.

Ensure that you meet the prerequisites given in Guidelines for Monitoring Integrations to make the best use of this feature.

Access Monitoring Integrations

1. Log in to Sysdig Monitor.

2. Select Integration > Monitoring Integration in the management section of the left-hand sidebar.

  The Integrations page is displayed. Continue with Configure an Integration.

Configure an Integration

1. Locate the service that you want to configure an integration for. To do so, identify the workload and drill down to the grouping where the service is running.

  To locate the service, you can use one of the following:

  • Text search
  • Type filtering
  • Left navigation to filter the workload, and then text search or type filtering
  • The Configure Integration option at the top, and then text search or type filtering
2. Click Configure Integration.

  1. Click Start Installation.
  2. Review the prerequisites.
  3. Do one of the following:
    • Dry Run: Use the kubectl command to install the service. Follow the on-screen instructions to complete the tasks successfully.
    • Patch: Install directly on your workload. Follow the on-screen instructions to complete the tasks successfully.
    • Manual: Use an exporter and install the service manually. Click Documentation to learn more about the service exporter and how to integrate it with Sysdig Monitor.
3. Click Validate to validate the installation.

4. Make sure that the wizard shows the Installation Complete screen.

5. Click Close to close the window.

Show Unidentified Workloads

Services that Sysdig Monitor cannot identify can still be monitored through the Unidentified Workloads option. You can view the workloads with these unidentified services or applications and see their status. To do so, use the Unidentified Workloads slider at the top right corner of the Integrations page.

1.1 - Guidelines for Monitoring Integrations

If you are directed to this page from the Sysdig Monitor app, your agent deployment might include a configuration that does either of the following:

• Prohibits the use of Monitoring Integrations
• Affects the metrics you are already collecting

Ensure that you meet the prerequisites to successfully use Monitoring Integrations. For technical assistance, contact Sysdig Support.

Prerequisites

• Upgrade the Sysdig agent to v12.0.0.

• If you have clusters with more than 50 nodes and you don’t have the prom_service_discovery option enabled:

  • Enabling the latest Prometheus features might create an additional connection to the Kubernetes API server from each Sysdig agent in your environment. The surge in agent connections can increase the CPU and memory load on your API servers. Therefore, ensure that your API servers are suitably sized to handle the increased load in large clusters.
  • If you encounter any problems, contact Sysdig Support.
• Remove the following manual configurations from the dragent.yaml file, because they might interfere with those provided by Sysdig (an illustrative fragment follows this list):

  • use_promscrape
  • promscrape_fastproto
  • prom_service_discovery
  • prometheus.max_metrics
  • prometheus.ingest_raw
  • prometheus.ingest_calculated
• The sysdig_sd_configs configuration is no longer supported. Remove the existing prometheus.yaml if it includes the sysdig_sd_configs configuration.
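
Where these keys appear depends on how your dragent.yaml was originally written. As a rough, hypothetical sketch only (the exact nesting may differ in your file), the legacy settings to delete often look like this:

# Illustrative dragent.yaml fragment: the legacy Prometheus keys called out above.
# Delete these keys and keep the rest of your configuration.
# The exact nesting may differ in your deployment.
use_promscrape: true
promscrape_fastproto: true
prom_service_discovery: true
prometheus:
  max_metrics: 1000
  ingest_raw: true
  ingest_calculated: false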

If you are not currently using Prometheus metrics in Sysdig Monitor, you can skip the following steps:

• If you are using a custom Prometheus process_filter in dragent.yaml to trigger scraping, see Migrating from Promscrape V1 to V2.

• If you are using service annotations or container labels to find scrape targets, you may need to create new scrape_configs in prometheus.yaml, preferably based on Kubernetes pod service discovery (a sketch follows this list). This configuration can be complicated in certain environments, so we recommend that you contact Sysdig Support for help.
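
As a rough starting point, a pod-based scrape_config that honors the standard prometheus.io annotations could look like the following. The job name is a placeholder and the relabeling rules follow the common community pattern; they are not a configuration that Sysdig ships:

- job_name: annotated-pods   # placeholder job name
  kubernetes_sd_configs:
    - role: pod
  relabel_configs:
    # Scrape only pods annotated with prometheus.io/scrape: 'true'
    - action: keep
      source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
      regex: 'true'
    # Honor a custom metrics path from prometheus.io/path
    - action: replace
      source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
      regex: (.+)
      target_label: __metrics_path__
    # Honor a custom port from prometheus.io/port
    - action: replace
      source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
      regex: ([^:]+)(?::\d+)?;(\d+)
      replacement: $1:$2
      target_label: __address__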

1.2 - Configure Default Integrations

Each Monitoring Integration holds a specific job that scrapes its metrics and sends them to Sysdig Monitor. To optimize metrics scraping for building dashboards and alerts in Sysdig Monitor, Sysdig offers default jobs for these integrations. Periodically, the Sysdig agent connects to Sysdig Monitor, retrieves the default jobs, and makes the Monitoring Integrations available for use. See Supported Monitoring Integrations below for the list of available integrations and their corresponding jobs.

You can find all the jobs in the /opt/draios/etc/promscrape.yaml file in the sysdig-agent container in your cluster.

                                                                                                                                                Supported Monitoring Integrations

                                                                                                                                                IntegrationOut of the BoxEnabled by defaultJob name in config file
                                                                                                                                                Apacheapache-exporter-default, apache-grok-default
                                                                                                                                                Cephceph-default
                                                                                                                                                Consulconsul-server-default, consul-envoy-default
                                                                                                                                                ElasticSearchelasticsearch-default
                                                                                                                                                Fluentdfluentd-default
                                                                                                                                                HaProxyhaproxy-default
                                                                                                                                                Harborharbor-exporter-default, harbor-core-default, harbor-registry-default, harbor-jobservice-default
                                                                                                                                                Kubernetes API Serverkubernetes-apiservers-default
                                                                                                                                                Kubernetes Control Planekube-dns-default, kube-scheduler-default, kube-controller-manager-default
                                                                                                                                                Kubernetes Etcdetcd-default
                                                                                                                                                Kubeletk8s-kubelet-default
                                                                                                                                                Kube-Proxykubernetes-kube-proxy-default
                                                                                                                                                Kubernetes Persistent Volume Claimk8s-pvc-default
                                                                                                                                                Kubernetes Storagek8s-storage-default
                                                                                                                                                Kedakeda-default
                                                                                                                                                Memcachedmemcached-default
                                                                                                                                                MongoDBmongodb-default
                                                                                                                                                MySQLmysql-default
                                                                                                                                                Nginxnginx-default
                                                                                                                                                Nginx Ingressnginx-ingress-default
                                                                                                                                                NTPntp-default
                                                                                                                                                Open Policy Agent - Gatekeeperopa-default
                                                                                                                                                Php-fpmphp-fpm-default
                                                                                                                                                Portworxportworx-default, portworx-openshift-default
                                                                                                                                                PostgreSQLpostgres-default
                                                                                                                                                Prometheus Default Jobk8s-pods
                                                                                                                                                RabbitMQrabbitmq-default
                                                                                                                                                Redisredis-default
                                                                                                                                                Sysdig Admission Controllersysdig-admission-controller-default

Enable and Disable Integrations

Some integrations are disabled by default because their metrics can have high cardinality. To enable them, contact Sysdig Support. The same applies if you want an integration disabled by default in all your clusters.

Customize a Default Job

The default jobs offered by Sysdig for integrations are optimized to scrape the metrics needed for building dashboards and alerts in Sysdig Monitor. Instead of processing all the available metrics, you can determine which metrics to include or exclude to meet your requirements. To do so, overwrite the default configuration in the prometheus.yaml file. The prometheus.yaml file is located in the sysdig-agent ConfigMap in the sysdig-agent namespace.

You can overwrite the default job for a specific integration by adding a new job to the prometheus.yaml file with the same name as the default job that you want to replace. For example, to replace the default Apache exporter job, create a new job named apache-exporter-default. Jobs defined by the user take precedence over the default ones.

See Supported Monitoring Integrations for the complete list of integrations and corresponding job names.
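
For example, an override of this shape added to the prometheus.yaml in the sysdig-agent ConfigMap would replace the default job and keep only a handful of metrics. The job name comes from the table above; the pod-selection rule and the metric allow-list are assumptions to adapt to your environment:

- job_name: apache-exporter-default   # same name as the default job, so it replaces it
  kubernetes_sd_configs:
    - role: pod
  relabel_configs:
    # Keep only pods on the host handled by this agent
    # (same pattern as the template in Configure a New Job below)
    - action: keep
      source_labels: [__meta_kubernetes_pod_host_ip]
      regex: __HOSTIPS__
  metric_relabel_configs:
    # Illustrative allow-list: drop every metric except the ones listed here
    - action: keep
      source_labels: [__name__]
      regex: 'apache_up|apache_workers|apache_accesses_total'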

Use Sysdig Annotations in Exporters

Sysdig provides a set of Helm charts that help you configure the exporters for the integrations. For more information on installing Monitoring Integrations, see the Monitoring Integrations option in Sysdig Monitor. The Helm charts are also publicly available in the Sysdig Helm repository.

If exporters are already installed in your cluster, you can use the standard Prometheus annotations and the Sysdig agent will automatically scrape them.

For example, if you use the annotations given below, the incoming metrics will carry the information about the pod that generates them.

spec:
  template:
    metadata:
      annotations:
        prometheus.io/path: /metrics
        prometheus.io/port: '9100'
        prometheus.io/scrape: 'true'

If you use an exporter, the incoming metrics will be associated with the exporter pod, not the application pod. To change this behavior, you can use the Sysdig-provided annotations and configure the exporter on the agent.

Annotate the Exporter

Use the following annotations to configure the exporter:

spec:
  template:
    metadata:
      annotations:
        promcat.sysdig.com/port: '9187'
        promcat.sysdig.com/target_ns: my-namespace
        promcat.sysdig.com/target_workload_type: deployment
        promcat.sysdig.com/target_workload_name: my-workload
        promcat.sysdig.com/integration_type: my-integration
• port: The port to scrape for metrics on the exporter.
• target_ns: The namespace of the workload corresponding to the application (not the exporter).
• target_workload_type: The type of the workload of the application (not the exporter). The possible values are deployment, statefulset, and daemonset.
• target_workload_name: The name of the workload corresponding to the application (not the exporter).
• integration_type: The type of the integration. The job created in the Sysdig agent uses this value to find the exporter.
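
To show where these annotations live in practice, here is a hypothetical postgres-exporter Deployment. The names, namespace, image, and target workload are placeholders, and the integration_type value must match the one used by the agent job (see Configure a New Job below):

apiVersion: apps/v1
kind: Deployment
metadata:
  name: postgres-exporter                # hypothetical exporter workload
  namespace: my-namespace                # hypothetical namespace
spec:
  replicas: 1
  selector:
    matchLabels:
      app: postgres-exporter
  template:
    metadata:
      labels:
        app: postgres-exporter
      annotations:
        promcat.sysdig.com/port: '9187'
        promcat.sysdig.com/target_ns: my-namespace
        promcat.sysdig.com/target_workload_type: statefulset    # type of the database workload, not the exporter
        promcat.sysdig.com/target_workload_name: my-postgresql  # name of the database workload, not the exporter
        promcat.sysdig.com/integration_type: my-integration     # must match the regex in the agent job
    spec:
      containers:
        - name: exporter
          image: example.registry/postgres-exporter:latest      # placeholder image reference
          ports:
            - containerPort: 9187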

Configure a New Job

Edit the prometheus.yaml file to configure a new job in the Sysdig agent. The file is located in the sysdig-agent ConfigMap in the sysdig-agent namespace.

You can use the following example template:

- job_name: my-integration
  tls_config:
    insecure_skip_verify: true
  kubernetes_sd_configs:
    - role: pod
  relabel_configs:
    - action: keep
      source_labels: [__meta_kubernetes_pod_host_ip]
      regex: __HOSTIPS__
    - action: drop
      source_labels: [__meta_kubernetes_pod_annotation_promcat_sysdig_com_omit]
      regex: true
    - action: keep
      source_labels:
        - __meta_kubernetes_pod_annotation_promcat_sysdig_com_integration_type
      regex: 'my-integration' # Use here the integration type that you defined in your annotations
    - action: replace
      source_labels: [__meta_kubernetes_pod_annotation_promcat_sysdig_com_target_ns]
      target_label: kube_namespace_name
    - action: replace
      source_labels: [__meta_kubernetes_pod_annotation_promcat_sysdig_com_target_workload_type]
      target_label: kube_workload_type
    - action: replace
      source_labels: [__meta_kubernetes_pod_annotation_promcat_sysdig_com_target_workload_name]
      target_label: kube_workload_name
    - action: replace
      replacement: true
      target_label: sysdig_omit_source
    - action: replace
      source_labels: [__address__, __meta_kubernetes_pod_annotation_promcat_sysdig_com_port]
      regex: ([^:]+)(?::\d+)?;(\d+)
      replacement: $1:$2
      target_label: __address__
    - action: replace
      source_labels: [__meta_kubernetes_pod_uid]
      target_label: sysdig_k8s_pod_uid
    - action: replace
      source_labels: [__meta_kubernetes_pod_container_name]
      target_label: sysdig_k8s_pod_container_name

Exclude a Deployment from Being Scraped

If you want the agent to exclude a deployment from being scraped, use the following annotation:

spec:
  template:
    metadata:
      annotations:
        promcat.sysdig.com/omit: 'true'

2.1 - Apache

This integration is enabled by default.

List of alerts

Alert | Description | Format
--- | --- | ---
[Apache] No Instance Up | No instances up | Prometheus
[Apache] Up Time Less Than One Hour | Instance with UpTime less than one hour | Prometheus
[Apache] Time Since Last OK Request More Than One Hour | Time since last OK request higher than one hour | Prometheus
[Apache] High Error Rate | High error rate | Prometheus
[Apache] High Rate Of Busy Workers In Instance | Low workers in open_slot state | Prometheus

List of dashboards:

• Apache

List of metrics:

• apache_accesses_total
• apache_connections
• apache_cpuload
• apache_duration_ms_total
• apache_http_last_request_seconds
• apache_http_response_codes_total
• apache_scoreboard
• apache_sent_kilobytes_total
• apache_up
• apache_uptime_seconds_total
• apache_workers

2.2 - Ceph

                                                                                                                                                This integration is enabled by default.

List of alerts:

Alert | Description | Format
[Ceph] Ceph Manager is absent | Ceph Manager has disappeared from Prometheus target discovery. | Prometheus
[Ceph] Ceph Manager is missing replicas | Ceph Manager is missing replicas. | Prometheus
[Ceph] Ceph quorum at risk | Storage cluster quorum is low. Contact Support. | Prometheus
[Ceph] High number of leader changes | Ceph Monitor has seen a lot of leader changes per minute recently. | Prometheus
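
As a hedged illustration, the quorum alert above could be approximated with the monitor metrics listed below. The expression assumes ceph_mon_quorum_status is 1 for monitors currently in quorum and that ceph_mon_metadata exposes one series per known monitor; the rule name, threshold, and duration are assumptions, and the entry slots into a rule group like the Apache sketch earlier.

- alert: CephQuorumAtRisk
  # Sketch: fire when fewer than a majority of known monitors report quorum.
  expr: sum(ceph_mon_quorum_status) < (count(ceph_mon_metadata) / 2 + 1)
  for: 5m
  annotations:
    summary: Storage cluster quorum is low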

                                                                                                                                                List of dashboards:

                                                                                                                                                • ceph

                                                                                                                                                List of metrics:

                                                                                                                                                • ceph_cluster_total_bytes
                                                                                                                                                • ceph_cluster_total_used_bytes
                                                                                                                                                • ceph_health_status
                                                                                                                                                • ceph_mgr_status
                                                                                                                                                • ceph_mon_metadata
                                                                                                                                                • ceph_mon_num_elections
                                                                                                                                                • ceph_mon_quorum_status
                                                                                                                                                • ceph_osd_apply_latency_ms
                                                                                                                                                • ceph_osd_commit_latency_ms
                                                                                                                                                • ceph_osd_in
                                                                                                                                                • ceph_osd_metadata
                                                                                                                                                • ceph_osd_numpg
                                                                                                                                                • ceph_osd_op_r
                                                                                                                                                • ceph_osd_op_r_latency_count
                                                                                                                                                • ceph_osd_op_r_latency_sum
                                                                                                                                                • ceph_osd_op_r_out_bytes
                                                                                                                                                • ceph_osd_op_w
                                                                                                                                                • ceph_osd_op_w_in_bytes
                                                                                                                                                • ceph_osd_op_w_latency_count
                                                                                                                                                • ceph_osd_op_w_latency_sum
                                                                                                                                                • ceph_osd_recovery_bytes
                                                                                                                                                • ceph_osd_recovery_ops
                                                                                                                                                • ceph_osd_up
                                                                                                                                                • ceph_pool_max_avail

2.3 - Consul

                                                                                                                                                This integration is enabled by default.

List of alerts:

Alert | Description | Format
[Consul] KV Store update time anomaly | KV Store update time anomaly | Prometheus
[Consul] Transaction time anomaly | Transaction time anomaly | Prometheus
[Consul] Raft transactions count anomaly | Raft transactions count anomaly | Prometheus
[Consul] Raft commit time anomaly | Raft commit time anomaly | Prometheus
[Consul] Leader time to contact followers too high | Leader time to contact followers too high | Prometheus
[Consul] Flapping leadership | Flapping leadership | Prometheus
[Consul] Too many elections | Too many elections | Prometheus
[Consul] Server cluster unhealthy | Server cluster unhealthy | Prometheus
[Consul] Zero failure tolerance | Zero failure tolerance | Prometheus
[Consul] Client RPC requests anomaly | Consul client RPC requests anomaly | Prometheus
[Consul] Client RPC requests rate limit exceeded | Consul client RPC requests rate limit exceeded | Prometheus
[Consul] Client RPC requests failed | Consul client RPC requests failed | Prometheus
[Consul] License Expiry | Consul License Expiry | Prometheus
[Consul] Garbage Collection pause high | Consul Garbage Collection pause high | Prometheus
[Consul] Garbage Collection pause too high | Consul Garbage Collection pause too high | Prometheus
[Consul] Raft restore duration too high | Consul Raft restore duration too high | Prometheus
[Consul] RPC requests error rate is high | Consul RPC requests error rate is high | Prometheus
[Consul] Cache hit rate is low | Consul Cache hit rate is low | Prometheus
[Consul] High 4xx RequestError Rate | High 4xx RequestError Rate | Prometheus
[Consul] High Request Latency | Envoy High Request Latency | Prometheus
[Consul] High Response Latency | Envoy High Response Latency | Prometheus
[Consul] Certificate close to expire | Certificate close to expire | Prometheus
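
The zero-failure-tolerance alert maps directly onto the consul_autopilot_failure_tolerance metric listed below, which reports how many servers the cluster can lose while keeping quorum. A minimal rule entry of that shape might look like the following sketch; the name, duration, and annotation are assumptions.

- alert: ConsulZeroFailureTolerance
  # Sketch: autopilot reports the cluster cannot tolerate losing any more servers.
  expr: consul_autopilot_failure_tolerance == 0
  for: 5m
  annotations:
    summary: Consul cluster has zero failure tolerance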

                                                                                                                                                List of dashboards:

                                                                                                                                                • consul
                                                                                                                                                • consul envoy

                                                                                                                                                List of metrics:

                                                                                                                                                • consul_autopilot_failure_tolerance
                                                                                                                                                • consul_autopilot_healthy
                                                                                                                                                • consul_client_rpc
                                                                                                                                                • consul_client_rpc_exceeded
                                                                                                                                                • consul_client_rpc_failed
                                                                                                                                                • consul_consul_cache_bypass
                                                                                                                                                • consul_consul_cache_entries_count
                                                                                                                                                • consul_consul_cache_evict_expired
                                                                                                                                                • consul_consul_cache_fetch_error
                                                                                                                                                • consul_consul_cache_fetch_success
                                                                                                                                                • consul_kvs_apply_sum
                                                                                                                                                • consul_raft_apply
                                                                                                                                                • consul_raft_commitTime_sum
                                                                                                                                                • consul_raft_fsm_lastRestoreDuration
                                                                                                                                                • consul_raft_leader_lastContact
                                                                                                                                                • consul_raft_leader_oldestLogAge
                                                                                                                                                • consul_raft_rpc_installSnapshot
                                                                                                                                                • consul_raft_state_candidate
                                                                                                                                                • consul_raft_state_leader
                                                                                                                                                • consul_rpc_cross_dc
                                                                                                                                                • consul_rpc_queries_blocking
                                                                                                                                                • consul_rpc_query
                                                                                                                                                • consul_rpc_request
                                                                                                                                                • consul_rpc_request_error
                                                                                                                                                • consul_runtime_gc_pause_ns
                                                                                                                                                • consul_runtime_gc_pause_ns_sum
                                                                                                                                                • consul_system_licenseExpiration
                                                                                                                                                • consul_txn_apply_sum
                                                                                                                                                • envoy_cluster_membership_change
                                                                                                                                                • envoy_cluster_membership_healthy
                                                                                                                                                • envoy_cluster_membership_total
                                                                                                                                                • envoy_cluster_upstream_cx_active
                                                                                                                                                • envoy_cluster_upstream_cx_connect_ms_bucket
                                                                                                                                                • envoy_cluster_upstream_rq_active
                                                                                                                                                • envoy_cluster_upstream_rq_pending_active
                                                                                                                                                • envoy_cluster_upstream_rq_time_bucket
                                                                                                                                                • envoy_cluster_upstream_rq_xx
                                                                                                                                                • envoy_server_days_until_first_cert_expiring
                                                                                                                                                • go_build_info
                                                                                                                                                • go_gc_duration_seconds
                                                                                                                                                • go_gc_duration_seconds_count
                                                                                                                                                • go_gc_duration_seconds_sum
                                                                                                                                                • go_goroutines
                                                                                                                                                • go_memstats_buck_hash_sys_bytes
                                                                                                                                                • go_memstats_gc_sys_bytes
                                                                                                                                                • go_memstats_heap_alloc_bytes
                                                                                                                                                • go_memstats_heap_idle_bytes
                                                                                                                                                • go_memstats_heap_inuse_bytes
                                                                                                                                                • go_memstats_heap_released_bytes
                                                                                                                                                • go_memstats_heap_sys_bytes
                                                                                                                                                • go_memstats_lookups_total
                                                                                                                                                • go_memstats_mallocs_total
                                                                                                                                                • go_memstats_mcache_inuse_bytes
                                                                                                                                                • go_memstats_mcache_sys_bytes
                                                                                                                                                • go_memstats_mspan_inuse_bytes
                                                                                                                                                • go_memstats_mspan_sys_bytes
                                                                                                                                                • go_memstats_next_gc_bytes
                                                                                                                                                • go_memstats_stack_inuse_bytes
                                                                                                                                                • go_memstats_stack_sys_bytes
                                                                                                                                                • go_memstats_sys_bytes
                                                                                                                                                • go_threads
                                                                                                                                                • process_cpu_seconds_total
                                                                                                                                                • process_max_fds
                                                                                                                                                • process_open_fds

2.4 - Elasticsearch

                                                                                                                                                This integration is enabled by default.

List of alerts:

Alert | Description | Format
[Elasticsearch] Heap Usage Too High | The heap usage is over 90% | Prometheus
[Elasticsearch] Heap Usage Warning | The heap usage is over 80% | Prometheus
[Elasticsearch] Disk Space Low | Disk available less than 20% | Prometheus
[Elasticsearch] Disk Out Of Space | Disk available less than 10% | Prometheus
[Elasticsearch] Cluster Red | Cluster in Red status | Prometheus
[Elasticsearch] Cluster Yellow | Cluster in Yellow status | Prometheus
[Elasticsearch] Relocation Shards | Relocating shards for too long | Prometheus
[Elasticsearch] Initializing Shards | Initializing shards takes too long | Prometheus
[Elasticsearch] Unassigned Shards | Unassigned shards for a long time | Prometheus
[Elasticsearch] Pending Tasks | Elasticsearch has a high number of pending tasks | Prometheus
[Elasticsearch] No New Documents | Elasticsearch has no new documents for a period of time | Prometheus
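
As a sketch of the heap alerts above, the 90% threshold can be expressed with the JVM memory metrics listed below. The area="heap" label selector is an assumption based on common Elasticsearch exporter labeling; verify it against your exporter before relying on it.

- alert: ElasticsearchHeapUsageTooHigh
  # Sketch: heap usage above 90% of the configured maximum for 5 minutes.
  expr: |
    (elasticsearch_jvm_memory_used_bytes{area="heap"}
      / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 90
  for: 5m
  annotations:
    summary: The heap usage is over 90%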

                                                                                                                                                List of dashboards:

                                                                                                                                                • ElasticSearch_Cluster
                                                                                                                                                • ElasticSearch_Infra

                                                                                                                                                List of metrics:

                                                                                                                                                • elasticsearch_cluster_health_active_primary_shards
                                                                                                                                                • elasticsearch_cluster_health_active_shards
                                                                                                                                                • elasticsearch_cluster_health_initializing_shards
                                                                                                                                                • elasticsearch_cluster_health_number_of_data_nodes
                                                                                                                                                • elasticsearch_cluster_health_number_of_nodes
                                                                                                                                                • elasticsearch_cluster_health_number_of_pending_tasks
                                                                                                                                                • elasticsearch_cluster_health_relocating_shards
                                                                                                                                                • elasticsearch_cluster_health_status
                                                                                                                                                • elasticsearch_cluster_health_unassigned_shards
                                                                                                                                                • elasticsearch_filesystem_data_available_bytes
                                                                                                                                                • elasticsearch_filesystem_data_size_bytes
                                                                                                                                                • elasticsearch_indices_docs
                                                                                                                                                • elasticsearch_indices_indexing_index_time_seconds_total
                                                                                                                                                • elasticsearch_indices_indexing_index_total
                                                                                                                                                • elasticsearch_indices_merges_total_time_seconds_total
                                                                                                                                                • elasticsearch_indices_search_query_time_seconds
                                                                                                                                                • elasticsearch_indices_store_throttle_time_seconds_total
                                                                                                                                                • elasticsearch_jvm_gc_collection_seconds_count
                                                                                                                                                • elasticsearch_jvm_gc_collection_seconds_sum
                                                                                                                                                • elasticsearch_jvm_memory_committed_bytes
                                                                                                                                                • elasticsearch_jvm_memory_max_bytes
                                                                                                                                                • elasticsearch_jvm_memory_pool_peak_used_bytes
                                                                                                                                                • elasticsearch_jvm_memory_used_bytes
                                                                                                                                                • elasticsearch_os_load1
                                                                                                                                                • elasticsearch_os_load15
                                                                                                                                                • elasticsearch_os_load5
                                                                                                                                                • elasticsearch_process_cpu_percent
                                                                                                                                                • elasticsearch_transport_rx_size_bytes_total
                                                                                                                                                • elasticsearch_transport_tx_size_bytes_total

2.5 - Fluentd

                                                                                                                                                This integration is enabled by default.

List of alerts:

Alert | Description | Format
[Fluentd] No Input From Container | No Input From Container. | Prometheus
[Fluentd] High Error Ratio | High Error Ratio. | Prometheus
[Fluentd] High Retry Ratio | High Retry Ratio. | Prometheus
[Fluentd] High Retry Wait | High Retry Wait. | Prometheus
[Fluentd] Low Buffer Available Space | Low Buffer Available Space. | Prometheus
[Fluentd] Buffer Queue Length Increasing | Buffer Queue Length Increasing. | Prometheus
[Fluentd] Buffer Total Bytes Increasing | Buffer Total Bytes Increasing. | Prometheus
[Fluentd] High Slow Flush Ratio | High Slow Flush Ratio. | Prometheus
[Fluentd] No Output Records From Plugin | No Output Records From Plugin. | Prometheus
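
A retry-ratio rule in the spirit of the alert above could compare the retry and emit counters listed below. Both the plugin_id grouping label and the 10% threshold are assumptions, so treat this purely as a sketch rather than the shipped rule.

- alert: FluentdHighRetryRatio
  # Sketch: retries exceed 10% of emits per output plugin over 5 minutes.
  expr: |
    sum by (plugin_id) (rate(fluentd_output_status_retry_count[5m]))
      / sum by (plugin_id) (rate(fluentd_output_status_emit_count[5m])) > 0.1
  for: 10m
  annotations:
    summary: High Retry Ratio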

                                                                                                                                                List of dashboards:

                                                                                                                                                • Fluentd

                                                                                                                                                List of metrics:

                                                                                                                                                • fluentd_input_status_num_records_total
                                                                                                                                                • fluentd_output_status_buffer_available_space_ratio
                                                                                                                                                • fluentd_output_status_buffer_queue_length
                                                                                                                                                • fluentd_output_status_buffer_total_bytes
                                                                                                                                                • fluentd_output_status_emit_count
                                                                                                                                                • fluentd_output_status_emit_records
                                                                                                                                                • fluentd_output_status_flush_time_count
                                                                                                                                                • fluentd_output_status_num_errors
                                                                                                                                                • fluentd_output_status_retry_count
                                                                                                                                                • fluentd_output_status_retry_wait
                                                                                                                                                • fluentd_output_status_rollback_count
                                                                                                                                                • fluentd_output_status_slow_flush_count

2.6 - Haproxy-ingress

                                                                                                                                                This integration is enabled by default.

List of alerts:

Alert | Description | Format
[Haproxy-Ingress] Uptime less than 1 hour | This alert detects when all instances of the ingress controller have an uptime of less than 1 hour. | Prometheus
[Haproxy-Ingress] Frontend Down | This alert detects when all instances of a frontend have been down for more than 10 minutes. | Prometheus
[Haproxy-Ingress] Backend Down | This alert detects when all instances of a backend have been down for more than 10 minutes. | Prometheus
[Haproxy-Ingress] High Sessions Usage | This alert triggers when backend sessions exceed 85% of the session capacity for 10 minutes. | Prometheus
[Haproxy-Ingress] High Error Rate | This alert triggers when a proxy has an error rate above 15% for more than 10 minutes. | Prometheus
[Haproxy-Ingress] High Request Denied Rate | This alert detects when a proxy denies more than 10% of requests for more than 10 minutes. | Prometheus
[Haproxy-Ingress] High Response Denied Rate | This alert detects when a proxy denies more than 10% of responses for more than 10 minutes. | Prometheus
[Haproxy-Ingress] High Response Rate | This alert triggers when a proxy has a mean response time higher than 250 ms for more than 10 minutes. | Prometheus
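
The 15% / 10-minute error-rate description above can be sketched with the backend response counters listed below. The proxy and code label names are assumptions based on the HAProxy exporter's usual labeling; the rest of the rule is illustrative.

- alert: HaproxyIngressHighErrorRate
  # Sketch: 5xx responses exceed 15% of all backend responses for 10 minutes.
  expr: |
    sum by (proxy) (rate(haproxy_backend_http_responses_total{code="5xx"}[10m]))
      / sum by (proxy) (rate(haproxy_backend_http_responses_total[10m])) > 0.15
  for: 10m
  annotations:
    summary: High error rate in a proxy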

                                                                                                                                                List of dashboards:

                                                                                                                                                • HAProxy_Ingress_Overview
                                                                                                                                                • HAProxy_Ingress_Service_Details

                                                                                                                                                List of metrics:

                                                                                                                                                • haproxy_backend_bytes_in_total
                                                                                                                                                • haproxy_backend_bytes_out_total
                                                                                                                                                • haproxy_backend_client_aborts_total
                                                                                                                                                • haproxy_backend_connect_time_average_seconds
                                                                                                                                                • haproxy_backend_current_queue
                                                                                                                                                • haproxy_backend_http_requests_total
                                                                                                                                                • haproxy_backend_http_responses_total
                                                                                                                                                • haproxy_backend_limit_sessions
                                                                                                                                                • haproxy_backend_queue_time_average_seconds
                                                                                                                                                • haproxy_backend_requests_denied_total
                                                                                                                                                • haproxy_backend_response_time_average_seconds
                                                                                                                                                • haproxy_backend_responses_denied_total
                                                                                                                                                • haproxy_backend_sessions_total
                                                                                                                                                • haproxy_backend_status
                                                                                                                                                • haproxy_frontend_bytes_in_total
                                                                                                                                                • haproxy_frontend_bytes_out_total
                                                                                                                                                • haproxy_frontend_connections_total
                                                                                                                                                • haproxy_frontend_denied_connections_total
                                                                                                                                                • haproxy_frontend_denied_sessions_total
                                                                                                                                                • haproxy_frontend_request_errors_total
                                                                                                                                                • haproxy_frontend_requests_denied_total
                                                                                                                                                • haproxy_frontend_responses_denied_total
                                                                                                                                                • haproxy_frontend_status
                                                                                                                                                • haproxy_process_active_peers
                                                                                                                                                • haproxy_process_current_connection_rate
                                                                                                                                                • haproxy_process_current_run_queue
                                                                                                                                                • haproxy_process_current_session_rate
                                                                                                                                                • haproxy_process_current_tasks
                                                                                                                                                • haproxy_process_jobs
                                                                                                                                                • haproxy_process_ssl_connections_total
                                                                                                                                                • haproxy_process_start_time_seconds

2.7 - Harbor

                                                                                                                                                This integration is enabled by default.

List of alerts:

Alert | Description | Format
[Harbor] Harbor Core Is Down | Harbor Core Is Down | Prometheus
[Harbor] Harbor Database Is Down | Harbor Database Is Down | Prometheus
[Harbor] Harbor Registry Is Down | Harbor Registry Is Down | Prometheus
[Harbor] Harbor Redis Is Down | Harbor Redis Is Down | Prometheus
[Harbor] Harbor Trivy Is Down | Harbor Trivy Is Down | Prometheus
[Harbor] Harbor JobService Is Down | Harbor JobService Is Down | Prometheus
[Harbor] Project Quota Is Raising The Limit | Project Quota Is Raising The Limit | Prometheus
[Harbor] Harbor p99 latency is higher than 10 seconds | Harbor p99 latency is higher than 10 seconds | Prometheus
[Harbor] Harbor Error Rate is High | Harbor Error Rate is High | Prometheus
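
A sketch for the core-down alert above, based on the harbor_up metric listed below. The component label and its "core" value are assumptions about how the exporter distinguishes Harbor components; if your metric carries no such label, drop the selector.

- alert: HarborCoreIsDown
  # Sketch: the exporter reports the core component as down.
  expr: harbor_up{component="core"} == 0
  for: 5m
  annotations:
    summary: Harbor Core Is Down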

                                                                                                                                                List of dashboards:

                                                                                                                                                • Harbor

                                                                                                                                                List of metrics:

                                                                                                                                                • go_build_info
                                                                                                                                                • go_gc_duration_seconds
                                                                                                                                                • go_gc_duration_seconds_count
                                                                                                                                                • go_gc_duration_seconds_sum
                                                                                                                                                • go_goroutines
                                                                                                                                                • go_memstats_buck_hash_sys_bytes
                                                                                                                                                • go_memstats_gc_sys_bytes
                                                                                                                                                • go_memstats_heap_alloc_bytes
                                                                                                                                                • go_memstats_heap_idle_bytes
                                                                                                                                                • go_memstats_heap_inuse_bytes
                                                                                                                                                • go_memstats_heap_released_bytes
                                                                                                                                                • go_memstats_heap_sys_bytes
                                                                                                                                                • go_memstats_lookups_total
                                                                                                                                                • go_memstats_mallocs_total
                                                                                                                                                • go_memstats_mcache_inuse_bytes
                                                                                                                                                • go_memstats_mcache_sys_bytes
                                                                                                                                                • go_memstats_mspan_inuse_bytes
                                                                                                                                                • go_memstats_mspan_sys_bytes
                                                                                                                                                • go_memstats_next_gc_bytes
                                                                                                                                                • go_memstats_stack_inuse_bytes
                                                                                                                                                • go_memstats_stack_sys_bytes
                                                                                                                                                • go_memstats_sys_bytes
                                                                                                                                                • go_threads
                                                                                                                                                • harbor_artifact_pulled
                                                                                                                                                • harbor_core_http_request_duration_seconds
                                                                                                                                                • harbor_jobservice_task_process_time_seconds
                                                                                                                                                • harbor_project_member_total
                                                                                                                                                • harbor_project_quota_byte
                                                                                                                                                • harbor_project_quota_usage_byte
                                                                                                                                                • harbor_project_repo_total
                                                                                                                                                • harbor_project_total
                                                                                                                                                • harbor_quotas_size_bytes
                                                                                                                                                • harbor_task_concurrency
                                                                                                                                                • harbor_task_queue_latency
                                                                                                                                                • harbor_task_queue_size
                                                                                                                                                • harbor_up
                                                                                                                                                • process_cpu_seconds_total
                                                                                                                                                • process_max_fds
                                                                                                                                                • process_open_fds
                                                                                                                                                • registry_http_request_duration_seconds_bucket
                                                                                                                                                • registry_http_request_size_bytes_bucket
                                                                                                                                                • registry_http_requests_total
                                                                                                                                                • registry_http_response_size_bytes_bucket
                                                                                                                                                • registry_storage_action_seconds_bucket

2.8 - K8s-etcd

                                                                                                                                                This integration is enabled by default.

                                                                                                                                                List of dashboards:

                                                                                                                                                • Kubernetes_Etcd

                                                                                                                                                List of metrics:

                                                                                                                                                • etcd_debugging_mvcc_db_total_size_in_bytes
                                                                                                                                                • etcd_disk_backend_commit_duration_seconds_bucket
                                                                                                                                                • etcd_disk_wal_fsync_duration_seconds_bucket
                                                                                                                                                • etcd_grpc_proxy_cache_hits_total
                                                                                                                                                • etcd_grpc_proxy_cache_misses_total
                                                                                                                                                • etcd_network_client_grpc_received_bytes_total
                                                                                                                                                • etcd_network_client_grpc_sent_bytes_total
                                                                                                                                                • etcd_network_peer_received_bytes_total
                                                                                                                                                • etcd_network_peer_received_failures_total
                                                                                                                                                • etcd_network_peer_round_trip_time_seconds_bucket
                                                                                                                                                • etcd_network_peer_sent_bytes_total
                                                                                                                                                • etcd_network_peer_sent_failures_total
                                                                                                                                                • etcd_server_has_leader
                                                                                                                                                • etcd_server_id
                                                                                                                                                • etcd_server_leader_changes_seen_total
                                                                                                                                                • etcd_server_proposals_applied_total
                                                                                                                                                • etcd_server_proposals_committed_total
                                                                                                                                                • etcd_server_proposals_failed_total
                                                                                                                                                • etcd_server_proposals_pending
                                                                                                                                                • go_build_info
                                                                                                                                                • go_gc_duration_seconds
                                                                                                                                                • go_gc_duration_seconds_count
                                                                                                                                                • go_gc_duration_seconds_sum
                                                                                                                                                • go_goroutines
                                                                                                                                                • go_memstats_buck_hash_sys_bytes
                                                                                                                                                • go_memstats_gc_sys_bytes
                                                                                                                                                • go_memstats_heap_alloc_bytes
                                                                                                                                                • go_memstats_heap_idle_bytes
                                                                                                                                                • go_memstats_heap_inuse_bytes
                                                                                                                                                • go_memstats_heap_released_bytes
                                                                                                                                                • go_memstats_heap_sys_bytes
                                                                                                                                                • go_memstats_lookups_total
                                                                                                                                                • go_memstats_mallocs_total
                                                                                                                                                • go_memstats_mcache_inuse_bytes
                                                                                                                                                • go_memstats_mcache_sys_bytes
                                                                                                                                                • go_memstats_mspan_inuse_bytes
                                                                                                                                                • go_memstats_mspan_sys_bytes
                                                                                                                                                • go_memstats_next_gc_bytes
                                                                                                                                                • go_memstats_stack_inuse_bytes
                                                                                                                                                • go_memstats_stack_sys_bytes
                                                                                                                                                • go_memstats_sys_bytes
                                                                                                                                                • go_threads
                                                                                                                                                • grpc_server_handled_total
                                                                                                                                                • grpc_server_started_total
                                                                                                                                                • process_cpu_seconds_total
                                                                                                                                                • process_max_fds
                                                                                                                                                • process_open_fds
                                                                                                                                                • sysdig_container_cpu_cores_used
                                                                                                                                                • sysdig_container_memory_used_bytes
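The metrics above support simple ad-hoc health queries. As a minimal sketch (plain PromQL against the listed metric names; these are not the packaged dashboard queries), the following expressions flag members that report no leader and measure leader churn:

    # Members currently reporting no leader (etcd_server_has_leader is 1 while a leader exists)
    etcd_server_has_leader == 0

    # Leader changes observed per member over the last hour
    increase(etcd_server_leader_changes_seen_total[1h])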

2.9 - Keda

                                                                                                                                                This integration is enabled by default.

                                                                                                                                                List of alerts

Alert | Description | Format
[Keda] Errors in Scaled Object | Errors detected in scaled object | Prometheus
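The packaged rule expression is not reproduced here, but a minimal Prometheus alerting-rule sketch for the scaled-object error alert, using the keda_metrics_adapter_scaled_object_errors counter listed below (rule name and window are assumptions), could look like:

    - alert: KedaScaledObjectErrors        # hypothetical rule name
      # Fires when error events were recorded for a scaled object in the last 5 minutes
      expr: increase(keda_metrics_adapter_scaled_object_errors[5m]) > 0
      for: 5m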

                                                                                                                                                List of dashboards:

                                                                                                                                                • Keda

                                                                                                                                                List of metrics:

                                                                                                                                                • keda_metrics_adapter_scaled_object_errors
                                                                                                                                                • keda_metrics_adapter_scaler_metrics_value
                                                                                                                                                • kubernetes.hpa.replicas.current
                                                                                                                                                • kubernetes.hpa.replicas.desired
                                                                                                                                                • kubernetes.hpa.replicas.max
                                                                                                                                                • kubernetes.hpa.replicas.min

2.10 - Memcached

                                                                                                                                                This integration is enabled by default.

                                                                                                                                                List of alerts

Alert | Description | Format
[Memcached] Instance Down | Instance is not reachable | Prometheus
[Memcached] Low UpTime | Uptime of less than 1 hour in a Memcached instance | Prometheus
[Memcached] Connection Throttled | Connection throttled because the maximum number of requests per event process was reached | Prometheus
[Memcached] Connections Close To The Limit 85% | The number of connections is close to the limit | Prometheus
[Memcached] Connections Limit Reached | The maximum number of connections was reached, causing a connection error | Prometheus
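As an illustration only (the shipped expression and threshold may differ), the 85% connection-limit alert can be approximated from the connection metrics listed below:

    - alert: MemcachedConnectionsCloseToLimit    # hypothetical rule name
      # Current connections above 85% of the configured maximum
      expr: memcached_current_connections / memcached_max_connections > 0.85
      for: 5m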

                                                                                                                                                List of dashboards:

                                                                                                                                                • Memcached

                                                                                                                                                List of metrics:

                                                                                                                                                • memcached_commands_total
                                                                                                                                                • memcached_connections_listener_disabled_total
                                                                                                                                                • memcached_connections_yielded_total
                                                                                                                                                • memcached_current_bytes
                                                                                                                                                • memcached_current_connections
                                                                                                                                                • memcached_current_items
                                                                                                                                                • memcached_items_evicted_total
                                                                                                                                                • memcached_items_reclaimed_total
                                                                                                                                                • memcached_items_total
                                                                                                                                                • memcached_limit_bytes
                                                                                                                                                • memcached_max_connections
                                                                                                                                                • memcached_up
                                                                                                                                                • memcached_uptime_seconds

2.11 - Mongodb

                                                                                                                                                This integration is enabled by default.

                                                                                                                                                List of alerts

Alert | Description | Format
[MongoDB] Instance Down | Mongo server detected down by instance | Prometheus
[MongoDB] Uptime less than one hour | Uptime of less than one hour in instance | Prometheus
[MongoDB] Asserts detected | Asserts detected in instance | Prometheus
[MongoDB] High Latency | High latency in instance | Prometheus
[MongoDB] High Ticket Utilization | Ticket usage over 75% in instance | Prometheus
[MongoDB] Recurrent Cursor Timeout | Recurrent cursor timeouts in instance | Prometheus
[MongoDB] Recurrent Memory Page Faults | Recurrent memory page faults in instance | Prometheus
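For example, the ticket-utilization alert maps directly onto the WiredTiger ticket metrics in the list below. A hedged sketch (rule name and duration are assumptions; only the 75% threshold comes from the table above):

    - alert: MongoDBHighTicketUtilization        # hypothetical rule name
      # WiredTiger tickets in use above 75% of the available tickets
      expr: |
        mongodb_mongod_wiredtiger_concurrent_transactions_out_tickets
          / mongodb_mongod_wiredtiger_concurrent_transactions_total_tickets > 0.75
      for: 5m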

                                                                                                                                                List of dashboards:

                                                                                                                                                • MongoDB_Database_Details
                                                                                                                                                • MongoDB_Instance_Health

                                                                                                                                                List of metrics:

                                                                                                                                                • mongodb_asserts_total
                                                                                                                                                • mongodb_connections
                                                                                                                                                • mongodb_extra_info_page_faults_total
                                                                                                                                                • mongodb_instance_uptime_seconds
                                                                                                                                                • mongodb_memory
                                                                                                                                                • mongodb_mongod_db_collections_total
                                                                                                                                                • mongodb_mongod_db_data_size_bytes
                                                                                                                                                • mongodb_mongod_db_index_size_bytes
                                                                                                                                                • mongodb_mongod_db_indexes_total
                                                                                                                                                • mongodb_mongod_db_objects_total
                                                                                                                                                • mongodb_mongod_global_lock_client
                                                                                                                                                • mongodb_mongod_global_lock_current_queue
                                                                                                                                                • mongodb_mongod_global_lock_ratio
                                                                                                                                                • mongodb_mongod_metrics_cursor_open
                                                                                                                                                • mongodb_mongod_metrics_cursor_timed_out_total
                                                                                                                                                • mongodb_mongod_op_latencies_latency_total
                                                                                                                                                • mongodb_mongod_op_latencies_ops_total
                                                                                                                                                • mongodb_mongod_wiredtiger_cache_bytes
                                                                                                                                                • mongodb_mongod_wiredtiger_cache_bytes_total
                                                                                                                                                • mongodb_mongod_wiredtiger_cache_evicted_total
                                                                                                                                                • mongodb_mongod_wiredtiger_cache_pages
                                                                                                                                                • mongodb_mongod_wiredtiger_concurrent_transactions_out_tickets
                                                                                                                                                • mongodb_mongod_wiredtiger_concurrent_transactions_total_tickets
                                                                                                                                                • mongodb_network_bytes_total
                                                                                                                                                • mongodb_network_metrics_num_requests_total
                                                                                                                                                • mongodb_op_counters_total
                                                                                                                                                • mongodb_up
                                                                                                                                                • net.error.count

2.12 - Mysql

                                                                                                                                                This integration is enabled by default.

                                                                                                                                                List of alerts

Alert | Description | Format
[MySQL] Mysql Down | MySQL instance is down | Prometheus
[MySQL] Mysql Restarted | MySQL has been restarted less than one minute ago | Prometheus
[MySQL] Mysql Too Many Connections (>80%) | More than 80% of MySQL connections are in use | Prometheus
[MySQL] Mysql High Threads Running | More than 60% of MySQL connections are in a running state | Prometheus
[MySQL] Mysql High Open Files | More than 80% of the MySQL open file limit is in use | Prometheus
[MySQL] Mysql Slow Queries | MySQL server has new slow queries | Prometheus
[MySQL] Mysql Innodb Log Waits | MySQL InnoDB log writes are stalling | Prometheus
[MySQL] Mysql Slave Io Thread Not Running | MySQL Slave IO thread not running | Prometheus
[MySQL] Mysql Slave Sql Thread Not Running | MySQL Slave SQL thread not running | Prometheus
[MySQL] Mysql Slave Replication Lag | MySQL Slave replication lag | Prometheus
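As a hedged example, the connection-usage alert can be expressed with the status and variable metrics listed below (rule name and "for" duration are assumptions; only the 80% threshold comes from the table above):

    - alert: MysqlTooManyConnections             # hypothetical rule name
      # Connected threads above 80% of max_connections
      expr: |
        mysql_global_status_threads_connected
          / mysql_global_variables_max_connections > 0.8
      for: 2m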

                                                                                                                                                List of dashboards:

                                                                                                                                                • MySQL

                                                                                                                                                List of metrics:

                                                                                                                                                • mysql_global_status_aborted_clients
                                                                                                                                                • mysql_global_status_aborted_connects
                                                                                                                                                • mysql_global_status_buffer_pool_pages
                                                                                                                                                • mysql_global_status_bytes_received
                                                                                                                                                • mysql_global_status_bytes_sent
                                                                                                                                                • mysql_global_status_commands_total
                                                                                                                                                • mysql_global_status_connection_errors_total
                                                                                                                                                • mysql_global_status_innodb_buffer_pool_read_requests
                                                                                                                                                • mysql_global_status_innodb_buffer_pool_reads
                                                                                                                                                • mysql_global_status_innodb_log_waits
                                                                                                                                                • mysql_global_status_innodb_mem_adaptive_hash
                                                                                                                                                • mysql_global_status_innodb_mem_dictionary
                                                                                                                                                • mysql_global_status_innodb_page_size
                                                                                                                                                • mysql_global_status_questions
                                                                                                                                                • mysql_global_status_select_full_join
                                                                                                                                                • mysql_global_status_select_full_range_join
                                                                                                                                                • mysql_global_status_select_range_check
                                                                                                                                                • mysql_global_status_select_scan
                                                                                                                                                • mysql_global_status_slow_queries
                                                                                                                                                • mysql_global_status_sort_merge_passes
                                                                                                                                                • mysql_global_status_sort_range
                                                                                                                                                • mysql_global_status_sort_rows
                                                                                                                                                • mysql_global_status_sort_scan
                                                                                                                                                • mysql_global_status_table_locks_immediate
                                                                                                                                                • mysql_global_status_table_locks_waited
                                                                                                                                                • mysql_global_status_table_open_cache_hits
                                                                                                                                                • mysql_global_status_table_open_cache_misses
                                                                                                                                                • mysql_global_status_threads_cached
                                                                                                                                                • mysql_global_status_threads_connected
                                                                                                                                                • mysql_global_status_threads_created
                                                                                                                                                • mysql_global_status_threads_running
                                                                                                                                                • mysql_global_status_uptime
                                                                                                                                                • mysql_global_variables_innodb_additional_mem_pool_size
                                                                                                                                                • mysql_global_variables_innodb_log_buffer_size
                                                                                                                                                • mysql_global_variables_innodb_open_files
                                                                                                                                                • mysql_global_variables_key_buffer_size
                                                                                                                                                • mysql_global_variables_max_connections
                                                                                                                                                • mysql_global_variables_open_files_limit
                                                                                                                                                • mysql_global_variables_query_cache_size
                                                                                                                                                • mysql_global_variables_thread_cache_size
                                                                                                                                                • mysql_global_variables_tokudb_cache_size
                                                                                                                                                • mysql_slave_status_master_server_id
                                                                                                                                                • mysql_slave_status_seconds_behind_master
                                                                                                                                                • mysql_slave_status_slave_io_running
                                                                                                                                                • mysql_slave_status_slave_sql_running
                                                                                                                                                • mysql_slave_status_sql_delay
                                                                                                                                                • mysql_up

2.13 - Nginx

                                                                                                                                                This integration is enabled by default.

                                                                                                                                                List of alerts

Alert | Description | Format
[Nginx] No Instances Up | No Nginx instances are up | Prometheus
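A minimal sketch of that alert in Prometheus rule form, using the nginx_up metric listed below (rule name and duration are assumptions):

    - alert: NginxNoInstancesUp                  # hypothetical rule name
      # nginx_up is 0 when the exporter cannot reach the Nginx instance
      expr: sum(nginx_up) == 0
      for: 5m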

                                                                                                                                                List of dashboards:

                                                                                                                                                • NGINX_App_Overview

                                                                                                                                                List of metrics:

                                                                                                                                                • net.bytes.in
                                                                                                                                                • net.bytes.out
                                                                                                                                                • net.http.error.count
                                                                                                                                                • net.http.request.count
                                                                                                                                                • net.http.request.time
                                                                                                                                                • nginx_connections_accepted
                                                                                                                                                • nginx_connections_active
                                                                                                                                                • nginx_connections_handled
                                                                                                                                                • nginx_connections_reading
                                                                                                                                                • nginx_connections_waiting
                                                                                                                                                • nginx_connections_writing
                                                                                                                                                • nginx_up

2.14 - Nginx-ingress

                                                                                                                                                This integration is enabled by default.

                                                                                                                                                List of alerts

Alert | Description | Format
[Nginx-Ingress] High Http 4xx Error Rate | Too many HTTP requests with status 4xx (> 5%) | Prometheus
[Nginx-Ingress] High Http 5xx Error Rate | Too many HTTP requests with status 5xx (> 5%) | Prometheus
[Nginx-Ingress] High Latency | Nginx p99 latency is higher than 10 seconds | Prometheus
[Nginx-Ingress] Ingress Certificate Expiry | Nginx Ingress certificate will expire in less than 14 days | Prometheus
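For instance, the 5xx error-rate alert can be approximated from nginx_ingress_controller_requests, which carries an HTTP status label. This is a hedged sketch (rule name and window are assumptions; the packaged expression may differ):

    - alert: NginxIngressHigh5xxErrorRate        # hypothetical rule name
      # Share of requests answered with a 5xx status over the last 5 minutes
      expr: |
        sum(rate(nginx_ingress_controller_requests{status=~"5.."}[5m]))
          / sum(rate(nginx_ingress_controller_requests[5m])) > 0.05
      for: 5m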

                                                                                                                                                List of dashboards:

                                                                                                                                                • Nginx_Kubernetes_Ingress_Controller

                                                                                                                                                List of metrics:

                                                                                                                                                • go_build_info
                                                                                                                                                • go_gc_duration_seconds
                                                                                                                                                • go_gc_duration_seconds_count
                                                                                                                                                • go_gc_duration_seconds_sum
                                                                                                                                                • go_goroutines
                                                                                                                                                • go_memstats_buck_hash_sys_bytes
                                                                                                                                                • go_memstats_gc_sys_bytes
                                                                                                                                                • go_memstats_heap_alloc_bytes
                                                                                                                                                • go_memstats_heap_idle_bytes
                                                                                                                                                • go_memstats_heap_inuse_bytes
                                                                                                                                                • go_memstats_heap_released_bytes
                                                                                                                                                • go_memstats_heap_sys_bytes
                                                                                                                                                • go_memstats_lookups_total
                                                                                                                                                • go_memstats_mallocs_total
                                                                                                                                                • go_memstats_mcache_inuse_bytes
                                                                                                                                                • go_memstats_mcache_sys_bytes
                                                                                                                                                • go_memstats_mspan_inuse_bytes
                                                                                                                                                • go_memstats_mspan_sys_bytes
                                                                                                                                                • go_memstats_next_gc_bytes
                                                                                                                                                • go_memstats_stack_inuse_bytes
                                                                                                                                                • go_memstats_stack_sys_bytes
                                                                                                                                                • go_memstats_sys_bytes
                                                                                                                                                • go_threads
                                                                                                                                                • nginx_ingress_controller_config_last_reload_successful
                                                                                                                                                • nginx_ingress_controller_config_last_reload_successful_timestamp_seconds
                                                                                                                                                • nginx_ingress_controller_ingress_upstream_latency_seconds_count
                                                                                                                                                • nginx_ingress_controller_ingress_upstream_latency_seconds_sum
                                                                                                                                                • nginx_ingress_controller_nginx_process_connections
                                                                                                                                                • nginx_ingress_controller_nginx_process_cpu_seconds_total
                                                                                                                                                • nginx_ingress_controller_nginx_process_resident_memory_bytes
                                                                                                                                                • nginx_ingress_controller_request_duration_seconds_bucket
                                                                                                                                                • nginx_ingress_controller_request_duration_seconds_count
                                                                                                                                                • nginx_ingress_controller_request_duration_seconds_sum
                                                                                                                                                • nginx_ingress_controller_request_size_sum
                                                                                                                                                • nginx_ingress_controller_requests
                                                                                                                                                • nginx_ingress_controller_response_duration_seconds_count
                                                                                                                                                • nginx_ingress_controller_response_duration_seconds_sum
                                                                                                                                                • nginx_ingress_controller_response_size_sum
                                                                                                                                                • nginx_ingress_controller_ssl_expire_time_seconds
                                                                                                                                                • process_cpu_seconds_total
                                                                                                                                                • process_max_fds
                                                                                                                                                • process_open_fds

2.15 - Ntp

                                                                                                                                                This integration is enabled by default.

                                                                                                                                                List of alerts

Alert | Description | Format
[Ntp] Drift is too high | Drift is too high | Prometheus
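A hedged sketch of such a rule over the single metric listed below; the 50 ms threshold, rule name, and duration are assumptions, not the packaged values:

    - alert: NtpDriftTooHigh                     # hypothetical rule name and threshold
      # Absolute clock drift above 50 ms
      expr: abs(ntp_drift_seconds) > 0.05
      for: 10m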

                                                                                                                                                List of dashboards:

                                                                                                                                                • ntp

                                                                                                                                                List of metrics:

                                                                                                                                                • ntp_drift_seconds

2.16 - Opa

                                                                                                                                                This integration is enabled by default.

                                                                                                                                                List of alerts

Alert | Description | Format
[Opa gatekeeper] Too much time since the last audit | More than 120 seconds have passed since the last audit | Prometheus
[Opa gatekeeper] Spike of violations | There were more than 30 violations | Prometheus

                                                                                                                                                List of dashboards:

                                                                                                                                                • OPA_Gatekeeper

                                                                                                                                                List of metrics:

                                                                                                                                                • gatekeeper_audit_duration_seconds_bucket
                                                                                                                                                • gatekeeper_audit_last_run_time
                                                                                                                                                • gatekeeper_constraint_template_ingestion_count
                                                                                                                                                • gatekeeper_constraint_template_ingestion_duration_seconds_bucket
                                                                                                                                                • gatekeeper_constraint_templates
                                                                                                                                                • gatekeeper_constraints
                                                                                                                                                • gatekeeper_request_count
                                                                                                                                                • gatekeeper_request_duration_seconds_bucket
                                                                                                                                                • gatekeeper_request_duration_seconds_count
                                                                                                                                                • gatekeeper_violations
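
As a rough sketch of the audit-age alert above, the snippet below evaluates gatekeeper_audit_last_run_time against the 120-second limit taken from the alert description. The Prometheus address is a placeholder, and the PromQL expression is one plausible way to express the check, not the integration's actual rule.

```python
import requests

PROMETHEUS_URL = "http://localhost:9090"  # placeholder address

# Seconds elapsed since the last Gatekeeper audit; the 120 s limit comes from
# the alert description above.
EXPR = "time() - gatekeeper_audit_last_run_time > 120"

resp = requests.get(f"{PROMETHEUS_URL}/api/v1/query", params={"query": EXPR})
resp.raise_for_status()
stale = resp.json()["data"]["result"]

if stale:
    for sample in stale:
        print(f"{sample['metric']}: {float(sample['value'][1]):.0f}s since the last audit")
else:
    print("The last audit ran within the last 120 seconds")
```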

2.17 - Php-fpm

                                                                                                                                                This integration is enabled by default.

                                                                                                                                                List of alerts

Alert | Description | Format
[Php-Fpm] Percentage of instances low | Less than 75% of instances are up | Prometheus
[Php-Fpm] Recently rebooted | Instances have recently been rebooted | Prometheus
[Php-Fpm] Limit of child processes exceeded | The number of child processes has been exceeded | Prometheus
[Php-Fpm] Reaching limit of queue process | The queued-requests buffer is reaching its limit | Prometheus
[Php-Fpm] Too slow requests processing | Requests are taking too long to be processed | Prometheus

                                                                                                                                                List of dashboards:

                                                                                                                                                • Php-fpm

                                                                                                                                                List of metrics:

                                                                                                                                                • kube_workload_status_desired
                                                                                                                                                • phpfpm_accepted_connections
                                                                                                                                                • phpfpm_active_processes
                                                                                                                                                • phpfpm_idle_processes
                                                                                                                                                • phpfpm_listen_queue
                                                                                                                                                • phpfpm_listen_queue_length
                                                                                                                                                • phpfpm_max_children_reached
                                                                                                                                                • phpfpm_process_requests
                                                                                                                                                • phpfpm_slow_requests
                                                                                                                                                • phpfpm_start_since
                                                                                                                                                • phpfpm_total_processes
                                                                                                                                                • phpfpm_up
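
The "Percentage of instances low" alert compares how many PHP-FPM instances report as up against the total. Below is a minimal sketch of that ratio, assuming phpfpm_up is 1 for a healthy instance and reusing the 75% threshold from the alert description; the Prometheus address is a placeholder.

```python
import requests

PROMETHEUS_URL = "http://localhost:9090"  # placeholder address

# Fraction of PHP-FPM instances currently up, assuming phpfpm_up is 1 when healthy.
EXPR = "sum(phpfpm_up) / count(phpfpm_up)"

resp = requests.get(f"{PROMETHEUS_URL}/api/v1/query", params={"query": EXPR})
resp.raise_for_status()
result = resp.json()["data"]["result"]

if result:
    up_ratio = float(result[0]["value"][1])
    status = "below" if up_ratio < 0.75 else "above"   # 75% threshold from the alert description
    print(f"{up_ratio:.0%} of PHP-FPM instances are up ({status} the 75% threshold)")
```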

2.18 - Portworx

                                                                                                                                                This integration is enabled by default.

                                                                                                                                                List of alerts

Alert | Description | Format
[Portworx] No Quorum | Portworx No Quorum. | Prometheus
[Portworx] Node Status Not OK | Portworx Node Status Not OK. | Prometheus
[Portworx] Offline Nodes | Portworx Offline Nodes. | Prometheus
[Portworx] Nodes Storage Full or Down | Portworx Nodes Storage Full or Down. | Prometheus
[Portworx] Offline Storage Nodes | Portworx Offline Storage Nodes. | Prometheus
[Portworx] Unhealthy Node KVDB | Portworx Unhealthy Node KVDB. | Prometheus
[Portworx] Cache read hit rate is low | Portworx Cache read hit rate is low. | Prometheus
[Portworx] Cache write hit rate is low | Portworx Cache write hit rate is low. | Prometheus
[Portworx] High Read Latency In Disk | Portworx High Read Latency In Disk. | Prometheus
[Portworx] High Write Latency In Disk | Portworx High Write Latency In Disk. | Prometheus
[Portworx] Low Cluster Capacity | Portworx Low Cluster Capacity. | Prometheus
[Portworx] Disk Full In 48H | Portworx Disk Full In 48H. | Prometheus
[Portworx] Disk Full In 12H | Portworx Disk Full In 12H. | Prometheus
[Portworx] Pool Status Not Online | Portworx Pool Status Not Online. | Prometheus
[Portworx] High Write Latency In Pool | Portworx High Write Latency In Pool. | Prometheus
[Portworx] Pool Full In 48H | Portworx Pool Full In 48H. | Prometheus
[Portworx] Pool Full In 12H | Portworx Pool Full In 12H. | Prometheus
[Portworx] High Write Latency In Volume | Portworx High Write Latency In Volume. | Prometheus
[Portworx] High Read Latency In Volume | Portworx High Read Latency In Volume. | Prometheus
[Portworx] License Expiry | Portworx License Expiry. | Prometheus

                                                                                                                                                List of dashboards:

                                                                                                                                                • Portworx Cluster
                                                                                                                                                • Portworx Volumes

                                                                                                                                                List of metrics:

                                                                                                                                                • go_build_info
                                                                                                                                                • go_gc_duration_seconds
                                                                                                                                                • go_gc_duration_seconds_count
                                                                                                                                                • go_gc_duration_seconds_sum
                                                                                                                                                • go_goroutines
                                                                                                                                                • go_memstats_buck_hash_sys_bytes
                                                                                                                                                • go_memstats_gc_sys_bytes
                                                                                                                                                • go_memstats_heap_alloc_bytes
                                                                                                                                                • go_memstats_heap_idle_bytes
                                                                                                                                                • go_memstats_heap_inuse_bytes
                                                                                                                                                • go_memstats_heap_released_bytes
                                                                                                                                                • go_memstats_heap_sys_bytes
                                                                                                                                                • go_memstats_lookups_total
                                                                                                                                                • go_memstats_mallocs_total
                                                                                                                                                • go_memstats_mcache_inuse_bytes
                                                                                                                                                • go_memstats_mcache_sys_bytes
                                                                                                                                                • go_memstats_mspan_inuse_bytes
                                                                                                                                                • go_memstats_mspan_sys_bytes
                                                                                                                                                • go_memstats_next_gc_bytes
                                                                                                                                                • go_memstats_stack_inuse_bytes
                                                                                                                                                • go_memstats_stack_sys_bytes
                                                                                                                                                • go_memstats_sys_bytes
                                                                                                                                                • go_threads
                                                                                                                                                • process_cpu_seconds_total
                                                                                                                                                • process_max_fds
                                                                                                                                                • process_open_fds
                                                                                                                                                • px_cluster_disk_available_bytes
                                                                                                                                                • px_cluster_disk_total_bytes
                                                                                                                                                • px_cluster_status_nodes_offline
                                                                                                                                                • px_cluster_status_nodes_online
                                                                                                                                                • px_cluster_status_nodes_storage_down
                                                                                                                                                • px_cluster_status_quorum
                                                                                                                                                • px_cluster_status_size
                                                                                                                                                • px_cluster_status_storage_nodes_decommissioned
                                                                                                                                                • px_cluster_status_storage_nodes_offline
                                                                                                                                                • px_cluster_status_storage_nodes_online
                                                                                                                                                • px_disk_stats_num_reads_total
                                                                                                                                                • px_disk_stats_num_writes_total
                                                                                                                                                • px_disk_stats_read_bytes_total
                                                                                                                                                • px_disk_stats_read_latency_seconds
                                                                                                                                                • px_disk_stats_used_bytes
                                                                                                                                                • px_disk_stats_write_latency_seconds
                                                                                                                                                • px_disk_stats_written_bytes_total
                                                                                                                                                • px_kvdb_health_state_node_view
                                                                                                                                                • px_network_io_received_bytes_total
                                                                                                                                                • px_network_io_sent_bytes_total
                                                                                                                                                • px_node_status_license_expiry
                                                                                                                                                • px_node_status_node_status
                                                                                                                                                • px_pool_stats_available_bytes
                                                                                                                                                • px_pool_stats_flushed_bytes_total
                                                                                                                                                • px_pool_stats_num_flushes_total
                                                                                                                                                • px_pool_stats_num_writes
                                                                                                                                                • px_pool_stats_status
                                                                                                                                                • px_pool_stats_total_bytes
                                                                                                                                                • px_pool_stats_write_latency_seconds
                                                                                                                                                • px_pool_stats_written_bytes
                                                                                                                                                • px_px_cache_read_hits
                                                                                                                                                • px_px_cache_read_miss
                                                                                                                                                • px_px_cache_write_hits
                                                                                                                                                • px_px_cache_write_miss
                                                                                                                                                • px_volume_attached
                                                                                                                                                • px_volume_attached_state
                                                                                                                                                • px_volume_capacity_bytes
                                                                                                                                                • px_volume_currhalevel
                                                                                                                                                • px_volume_halevel
                                                                                                                                                • px_volume_read_bytes_total
                                                                                                                                                • px_volume_read_latency_seconds
                                                                                                                                                • px_volume_reads_total
                                                                                                                                                • px_volume_replication_status
                                                                                                                                                • px_volume_state
                                                                                                                                                • px_volume_status
                                                                                                                                                • px_volume_usage_bytes
                                                                                                                                                • px_volume_write_latency_seconds
                                                                                                                                                • px_volume_writes_total
                                                                                                                                                • px_volume_written_bytes_total
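
To show how the capacity metrics above relate to the "Low Cluster Capacity" alert, the sketch below computes the fraction of cluster disk space still available from px_cluster_disk_available_bytes and px_cluster_disk_total_bytes. The 20% threshold, the `cluster` label name, and the Prometheus address are assumptions made for the example.

```python
import requests

PROMETHEUS_URL = "http://localhost:9090"  # placeholder address
MIN_AVAILABLE_FRACTION = 0.20             # hypothetical threshold, not the integration's default

# Fraction of total cluster disk capacity that is still available.
EXPR = "px_cluster_disk_available_bytes / px_cluster_disk_total_bytes"

resp = requests.get(f"{PROMETHEUS_URL}/api/v1/query", params={"query": EXPR})
resp.raise_for_status()

for sample in resp.json()["data"]["result"]:
    cluster = sample["metric"].get("cluster", "unknown")  # label name assumed for the example
    available = float(sample["value"][1])
    if available < MIN_AVAILABLE_FRACTION:
        print(f"cluster {cluster}: only {available:.0%} of disk capacity left")
```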

2.19 - Postgresql

                                                                                                                                                This integration is enabled by default.

                                                                                                                                                List of alerts

Alert | Description | Format
[PostgreSQL] Instance Down | PostgreSQL instance is unavailable | Prometheus
[PostgreSQL] Low UpTime | The PostgreSQL instance has an uptime of less than 1 hour | Prometheus
[PostgreSQL] Max Write Buffer Reached | The background writer stopped because it reached the maximum number of write buffers | Prometheus
[PostgreSQL] High WAL Files Archive Error Rate | High error rate in the WAL file archiver | Prometheus
[PostgreSQL] Low Available Connections | The number of available connections is low | Prometheus
[PostgreSQL] High Response Time | High response time in at least one of the databases | Prometheus
[PostgreSQL] Low Cache Hit Rate | Low cache hit rate | Prometheus
[PostgreSQL] DeadLocks In Database | Deadlocks detected in a database | Prometheus

                                                                                                                                                List of dashboards:

                                                                                                                                                • Postgresql_DB_Golden_Signals
                                                                                                                                                • Postgresql_Instance_Health

                                                                                                                                                List of metrics:

                                                                                                                                                • pg_database_size_bytes
                                                                                                                                                • pg_locks_count
                                                                                                                                                • pg_postmaster_start_time_seconds
                                                                                                                                                • pg_replication_lag
                                                                                                                                                • pg_settings_max_connections
                                                                                                                                                • pg_settings_superuser_reserved_connections
                                                                                                                                                • pg_stat_activity_count
                                                                                                                                                • pg_stat_activity_max_tx_duration
                                                                                                                                                • pg_stat_archiver_archived_count
                                                                                                                                                • pg_stat_archiver_failed_count
                                                                                                                                                • pg_stat_bgwriter_buffers_alloc
                                                                                                                                                • pg_stat_bgwriter_buffers_backend
                                                                                                                                                • pg_stat_bgwriter_buffers_checkpoint
                                                                                                                                                • pg_stat_bgwriter_buffers_clean
                                                                                                                                                • pg_stat_bgwriter_checkpoint_sync_time
                                                                                                                                                • pg_stat_bgwriter_checkpoint_write_time
                                                                                                                                                • pg_stat_bgwriter_checkpoints_req
                                                                                                                                                • pg_stat_bgwriter_checkpoints_timed
                                                                                                                                                • pg_stat_bgwriter_maxwritten_clean
                                                                                                                                                • pg_stat_database_blk_read_time
                                                                                                                                                • pg_stat_database_blks_hit
                                                                                                                                                • pg_stat_database_blks_read
                                                                                                                                                • pg_stat_database_conflicts_confl_deadlock
                                                                                                                                                • pg_stat_database_conflicts_confl_lock
                                                                                                                                                • pg_stat_database_deadlocks
                                                                                                                                                • pg_stat_database_numbackends
                                                                                                                                                • pg_stat_database_temp_bytes
                                                                                                                                                • pg_stat_database_tup_deleted
                                                                                                                                                • pg_stat_database_tup_fetched
                                                                                                                                                • pg_stat_database_tup_inserted
                                                                                                                                                • pg_stat_database_tup_returned
                                                                                                                                                • pg_stat_database_tup_updated
                                                                                                                                                • pg_stat_database_xact_commit
                                                                                                                                                • pg_stat_database_xact_rollback
                                                                                                                                                • pg_stat_user_tables_idx_scan
                                                                                                                                                • pg_stat_user_tables_n_tup_hot_upd
                                                                                                                                                • pg_stat_user_tables_seq_scan
                                                                                                                                                • pg_up
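
The "Low Cache Hit Rate" alert is driven by the buffer-cache counters listed above. The sketch below derives that ratio from pg_stat_database_blks_hit and pg_stat_database_blks_read; the 5-minute window, the 90% threshold, and the Prometheus address are assumptions for the example rather than the integration's actual rule.

```python
import requests

PROMETHEUS_URL = "http://localhost:9090"  # placeholder address

# Share of block reads served from the buffer cache over the last 5 minutes.
EXPR = (
    "sum(rate(pg_stat_database_blks_hit[5m])) / "
    "(sum(rate(pg_stat_database_blks_hit[5m])) + sum(rate(pg_stat_database_blks_read[5m])))"
)

resp = requests.get(f"{PROMETHEUS_URL}/api/v1/query", params={"query": EXPR})
resp.raise_for_status()
result = resp.json()["data"]["result"]

if result:
    hit_rate = float(result[0]["value"][1])
    if hit_rate < 0.90:  # hypothetical threshold
        print(f"Cache hit rate is low: {hit_rate:.1%}")
    else:
        print(f"Cache hit rate looks healthy: {hit_rate:.1%}")
```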

2.20 - Rabbitmq

                                                                                                                                                This integration is enabled by default.

                                                                                                                                                List of alerts

Alert | Description | Format
[RabbitMQ] Cluster Operator Unavailable Replicas | There are pods that are either running but not yet available, or pods that have not yet been created. | Prometheus
[RabbitMQ] Insufficient Established Erlang Distribution Links | Insufficient established Erlang distribution links | Prometheus
[RabbitMQ] Low Disk Watermark Predicted | The free disk space predicted 24 hours from now is low | Prometheus
[RabbitMQ] High Connection Churn | There is a high connection churn | Prometheus
[RabbitMQ] No MajorityOfNodesReady | Too many nodes are not ready | Prometheus
[RabbitMQ] Persistent Volume Missing | There is at least one PVC that is not bound | Prometheus
[RabbitMQ] Unroutable Messages | There were unroutable messages within the last 5 minutes in the RabbitMQ cluster | Prometheus
[RabbitMQ] File Descriptors Near Limit | File descriptor usage is near the limit | Prometheus
[RabbitMQ] Container Restarts | A RabbitMQ container was restarted within the last 10 minutes | Prometheus
[RabbitMQ] TCP Sockets Near Limit | TCP socket usage is near the limit | Prometheus

                                                                                                                                                List of dashboards:

                                                                                                                                                • Rabbitmq_Usage
                                                                                                                                                • Rabbitmq_Overview

                                                                                                                                                List of metrics:

                                                                                                                                                • erlang_vm_dist_node_state
                                                                                                                                                • kube_deployment_status_replicas_unavailable
• kube_pod_container_status_restarts_total
                                                                                                                                                • kube_persistentvolumeclaim_status_phase
                                                                                                                                                • kube_statefulset_replicas
                                                                                                                                                • kube_statefulset_status_replicas_ready
                                                                                                                                                • rabbitmq_build_info
                                                                                                                                                • rabbitmq_channel_consumers
                                                                                                                                                • rabbitmq_channel_get_ack_total
                                                                                                                                                • rabbitmq_channel_get_empty_total
                                                                                                                                                • rabbitmq_channel_get_total
                                                                                                                                                • rabbitmq_channel_messages_acked_total
                                                                                                                                                • rabbitmq_channel_messages_confirmed_total
                                                                                                                                                • rabbitmq_channel_messages_delivered_ack_total
                                                                                                                                                • rabbitmq_channel_messages_delivered_total
                                                                                                                                                • rabbitmq_channel_messages_published_total
                                                                                                                                                • rabbitmq_channel_messages_redelivered_total
                                                                                                                                                • rabbitmq_channel_messages_unconfirmed
                                                                                                                                                • rabbitmq_channel_messages_unroutable_dropped_total
                                                                                                                                                • rabbitmq_channel_messages_unroutable_returned_total
                                                                                                                                                • rabbitmq_channels
                                                                                                                                                • rabbitmq_channels_closed_total
                                                                                                                                                • rabbitmq_channels_opened_total
                                                                                                                                                • rabbitmq_connections
                                                                                                                                                • rabbitmq_connections_closed_total
                                                                                                                                                • rabbitmq_connections_opened_total
                                                                                                                                                • rabbitmq_disk_space_available_bytes
                                                                                                                                                • rabbitmq_disk_space_available_limit_bytes
                                                                                                                                                • rabbitmq_process_max_fds
                                                                                                                                                • rabbitmq_process_max_tcp_sockets
                                                                                                                                                • rabbitmq_process_open_fds
                                                                                                                                                • rabbitmq_process_open_tcp_sockets
                                                                                                                                                • rabbitmq_process_resident_memory_bytes
                                                                                                                                                • rabbitmq_queue_messages_published_total
                                                                                                                                                • rabbitmq_queue_messages_ready
                                                                                                                                                • rabbitmq_queue_messages_unacked
                                                                                                                                                • rabbitmq_queues
                                                                                                                                                • rabbitmq_queues_created_total
                                                                                                                                                • rabbitmq_queues_declared_total
                                                                                                                                                • rabbitmq_queues_deleted_total
                                                                                                                                                • rabbitmq_resident_memory_limit_bytes
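
As an example of how the process metrics above back the "File Descriptors Near Limit" alert, the sketch below compares open file descriptors with the per-process limit. The 90% threshold and the Prometheus address are assumptions for the example.

```python
import requests

PROMETHEUS_URL = "http://localhost:9090"  # placeholder address
FD_USAGE_THRESHOLD = 0.90                 # hypothetical threshold

# Per-node ratio of open file descriptors to the allowed maximum.
EXPR = "rabbitmq_process_open_fds / rabbitmq_process_max_fds"

resp = requests.get(f"{PROMETHEUS_URL}/api/v1/query", params={"query": EXPR})
resp.raise_for_status()

for sample in resp.json()["data"]["result"]:
    node = sample["metric"].get("instance", "unknown")
    usage = float(sample["value"][1])
    if usage > FD_USAGE_THRESHOLD:
        print(f"{node}: file descriptor usage at {usage:.0%} of the limit")
```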

2.21 - Redis

                                                                                                                                                This integration is enabled by default.

                                                                                                                                                List of alerts

Alert | Description | Format
[Redis] Low UpTime | Uptime of less than 1 hour in a Redis instance | Prometheus
[Redis] High Memory Usage | High memory usage | Prometheus
[Redis] High Clients Usage | High client connections usage | Prometheus
[Redis] High Response Time | Response time over 250ms | Prometheus
[Redis] High Fragmentation Ratio | High fragmentation ratio | Prometheus
[Redis] High Keys Eviction Ratio | High keys eviction ratio | Prometheus
[Redis] Recurrent Rejected Connections | Recurrent rejected connections | Prometheus
[Redis] Low Hit Ratio | Low keyspace hit ratio | Prometheus

                                                                                                                                                List of dashboards:

                                                                                                                                                • Redis_Golden_Signals

                                                                                                                                                List of metrics:

                                                                                                                                                • redis_blocked_clients
                                                                                                                                                • redis_commands_duration_seconds_total
                                                                                                                                                • redis_commands_processed_total
                                                                                                                                                • redis_commands_total
                                                                                                                                                • redis_config_maxclients
                                                                                                                                                • redis_connected_clients
                                                                                                                                                • redis_connected_slaves
                                                                                                                                                • redis_connections_received_total
                                                                                                                                                • redis_cpu_sys_children_seconds_total
                                                                                                                                                • redis_cpu_sys_seconds_total
                                                                                                                                                • redis_cpu_user_children_seconds_total
                                                                                                                                                • redis_cpu_user_seconds_total
                                                                                                                                                • redis_db_avg_ttl_seconds
                                                                                                                                                • redis_db_keys
                                                                                                                                                • redis_evicted_keys_total
                                                                                                                                                • redis_expired_keys_total
                                                                                                                                                • redis_keyspace_hits_total
                                                                                                                                                • redis_keyspace_misses_total
                                                                                                                                                • redis_mem_fragmentation_ratio
                                                                                                                                                • redis_memory_max_bytes
                                                                                                                                                • redis_memory_used_bytes
                                                                                                                                                • redis_memory_used_dataset_bytes
                                                                                                                                                • redis_memory_used_lua_bytes
                                                                                                                                                • redis_memory_used_overhead_bytes
                                                                                                                                                • redis_memory_used_scripts_bytes
                                                                                                                                                • redis_net_input_bytes_total
                                                                                                                                                • redis_net_output_bytes_total
                                                                                                                                                • redis_pubsub_channels
                                                                                                                                                • redis_pubsub_patterns
                                                                                                                                                • redis_rdb_changes_since_last_save
                                                                                                                                                • redis_rdb_last_save_timestamp_seconds
                                                                                                                                                • redis_rejected_connections_total
                                                                                                                                                • redis_slowlog_length
                                                                                                                                                • redis_uptime_in_seconds

                                                                                                                                                2.22 -

Sysdig-admission-controller

                                                                                                                                                This integration is enabled by default.

                                                                                                                                                List of alerts

Alert | Description | Format
[Sysdig Admission Controller] No K8s Audit Events Received | The Admission Controller is not receiving Kubernetes Audit events | Prometheus
[Sysdig Admission Controller] K8s Audit Events Throttling | Kubernetes Audit events are being throttled | Prometheus
[Sysdig Admission Controller] Scanning Events Throttling | Scanning events are being throttled | Prometheus
[Sysdig Admission Controller] Inline Scanning Throttling | The inline scanning queue has not been empty for a long time | Prometheus
[Sysdig Admission Controller] High Error Rate In Scan Status From Backend | High error rate in scan status from backend | Prometheus
[Sysdig Admission Controller] High Error Rate In Scan Report From Backend | High error rate in scan report from backend | Prometheus
[Sysdig Admission Controller] High Error Rate In Image Scan | High error rate in image scan | Prometheus

                                                                                                                                                List of dashboards:

                                                                                                                                                • Sysdig_Admission_Controller

                                                                                                                                                List of metrics:

                                                                                                                                                • go_build_info
                                                                                                                                                • go_gc_duration_seconds
                                                                                                                                                • go_gc_duration_seconds_count
                                                                                                                                                • go_gc_duration_seconds_sum
                                                                                                                                                • go_goroutines
                                                                                                                                                • go_memstats_buck_hash_sys_bytes
                                                                                                                                                • go_memstats_gc_sys_bytes
                                                                                                                                                • go_memstats_heap_alloc_bytes
                                                                                                                                                • go_memstats_heap_idle_bytes
                                                                                                                                                • go_memstats_heap_inuse_bytes
                                                                                                                                                • go_memstats_heap_released_bytes
                                                                                                                                                • go_memstats_heap_sys_bytes
                                                                                                                                                • go_memstats_lookups_total
                                                                                                                                                • go_memstats_mallocs_total
                                                                                                                                                • go_memstats_mcache_inuse_bytes
                                                                                                                                                • go_memstats_mcache_sys_bytes
                                                                                                                                                • go_memstats_mspan_inuse_bytes
                                                                                                                                                • go_memstats_mspan_sys_bytes
                                                                                                                                                • go_memstats_next_gc_bytes
                                                                                                                                                • go_memstats_stack_inuse_bytes
                                                                                                                                                • go_memstats_stack_sys_bytes
                                                                                                                                                • go_memstats_sys_bytes
                                                                                                                                                • go_threads
                                                                                                                                                • k8s_audit_ac_alerts_total
                                                                                                                                                • k8s_audit_ac_events_processed_total
                                                                                                                                                • k8s_audit_ac_events_received_total
                                                                                                                                                • process_cpu_seconds_total
                                                                                                                                                • process_max_fds
                                                                                                                                                • process_open_fds
                                                                                                                                                • queue_length
                                                                                                                                                • scan_report_cache_hits
                                                                                                                                                • scan_report_cache_misses
                                                                                                                                                • scan_status_cache_hits
                                                                                                                                                • scan_status_cache_misses
                                                                                                                                                • scanner_scan_errors
                                                                                                                                                • scanner_scan_report_error_from_backend_count
                                                                                                                                                • scanner_scan_report_retrieved_from_backend_count
                                                                                                                                                • scanner_scan_requests_already_queued
                                                                                                                                                • scanner_scan_requests_error
                                                                                                                                                • scanner_scan_requests_queued
                                                                                                                                                • scanner_scan_status_error_from_backend_count
                                                                                                                                                • scanner_scan_status_retrieved_from_backend_count
                                                                                                                                                • scanner_scan_success
                                                                                                                                                • scanning_ac_admission_responses_total
                                                                                                                                                • scanning_ac_containers_processed_total
                                                                                                                                                • scanning_ac_http_scanning_handler_requests_total

                                                                                                                                                3 -

                                                                                                                                                Custom Integrations for Sysdig Monitor

                                                                                                                                                • Prometheus Metrics

                                                                                                                                                  Describes how Sysdig agent enables automatically collecting metrics from services that expose native Prometheus metrics as well as from applications with Prometheus exporters, how to set up your environment, and scrape Prometheus metrics seamlessly.

• Java Management Extensions (JMX) Metrics

                                                                                                                                                  Describes how to configure your Java virtual machines so Sysdig Agent can collect JMX metrics using the JMX protocol.

                                                                                                                                                • StatsD Metrics

                                                                                                                                                  Describes how the Sysdig agent collects custom StatsD metrics with an embedded StatsD server.

• Node.js Metrics

  Illustrates how Sysdig can monitor Node.js applications by linking a library to the Node.js codebase.

                                                                                                                                                3.1 -

                                                                                                                                                Collect Prometheus Metrics

Sysdig supports collecting, storing, and querying Prometheus native metrics and labels. You can use Sysdig in the same way that you use Prometheus and leverage Prometheus Query Language (PromQL) to create dashboards and alerts. Sysdig is compatible with the Prometheus HTTP API, so you can query your monitoring data programmatically using PromQL and extend Sysdig to other platforms, such as Grafana.

From a metric collection standpoint, a lightweight Prometheus server is embedded directly in the Sysdig agent to facilitate metric collection. It supports targets, instances, and jobs with filtering and relabeling using Prometheus syntax. You can configure the agent to identify the processes that expose Prometheus metric endpoints on its host and send the metrics to the Sysdig collector for storage and further processing.

You do not need to install Prometheus itself to collect Prometheus metrics.

                                                                                                                                                Agent Compatibility

                                                                                                                                                See the Sysdig agent versions and compatibility with Prometheus features:

                                                                                                                                                Sysdig Agent v12.2.0 and Above

                                                                                                                                                The following features are enabled by default:

• Automatically scrape any Kubernetes pods with the following annotation set: prometheus.io/scrape=true
• Automatically scrape applications supported by Monitoring Integrations.

                                                                                                                                                For more information, see Set up the Environment.

                                                                                                                                                Sysdig Agent Prior to v12.0.0

Manually enable Prometheus in the dragent.yaml file:

prometheus:
  enabled: true
                                                                                                                                                

For more information, see Enable Promscrape V2 on Older Versions of Sysdig Agent.

                                                                                                                                                Learn More

The following topics describe in detail how to set up the environment for service discovery, metric collection, and further processing.

See the following blog posts for additional context on Prometheus metrics and how they are typically used.

                                                                                                                                                3.1.1 -

                                                                                                                                                Set Up the Environment

                                                                                                                                                If you are already leveraging Kubernetes Service Discovery, specifically the approach given in prometheus-kubernetes.yml, you might already have annotations attached to the pods that mark them as eligible for scraping. Such environments can quickly begin scraping the same metrics by using the Sysdig agent in a single step.

                                                                                                                                                If you are not using Kubernetes Service Discovery, follow the instructions given below:

                                                                                                                                                Annotation

                                                                                                                                                Ensure that the Kubernetes pods that contain your Prometheus exporters have been deployed with the following annotations to enable scraping, substituting the listening exporter-TCP-port:

                                                                                                                                                spec:
                                                                                                                                                  template:
                                                                                                                                                    metadata:
                                                                                                                                                      annotations:
                                                                                                                                                        prometheus.io/scrape: "true"
                                                                                                                                                        prometheus.io/port: "exporter-TCP-port"
                                                                                                                                                

The configuration above assumes your exporters use the typical endpoint called /metrics. If your exporter uses a different endpoint, specify it by adding the following additional annotation, substituting the exporter-endpoint-name:

                                                                                                                                                prometheus.io/path: "/exporter-endpoint-name"
                                                                                                                                                
                                                                                                                                                

                                                                                                                                                Sample Exporter

                                                                                                                                                Use the Sample Exporter to test your environment. You will quickly see auto-discovered Prometheus metrics being displayed on Sysdig Monitor. You can use this working example as a basis to similarly annotate your own exporters.
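
For reference, here is a minimal sketch of a Deployment annotated for scraping; the names, image, and port are placeholders for your own exporter and are not part of the Sample Exporter itself.

apiVersion: apps/v1
kind: Deployment
metadata:
  name: my-exporter                        # placeholder name
spec:
  replicas: 1
  selector:
    matchLabels:
      app: my-exporter
  template:
    metadata:
      labels:
        app: my-exporter
      annotations:
        prometheus.io/scrape: "true"       # mark the pod as eligible for scraping
        prometheus.io/port: "9100"         # replace with your exporter's listening TCP port
        # prometheus.io/path is only needed if the endpoint is not /metrics
    spec:
      containers:
      - name: exporter
        image: registry.example.com/my-exporter:latest   # placeholder image
        ports:
        - containerPort: 9100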

                                                                                                                                                3.1.2 -

                                                                                                                                                Enable Prometheus Native Service Discovery

                                                                                                                                                Prometheus service discovery is a standard method of finding endpoints to scrape for metrics. You configure prometheus.yaml and custom jobs to prepare for scraping endpoints in the same way you do for native Prometheus.

For metric collection, a lightweight Prometheus server, named promscrape, is embedded directly in the Sysdig agent to facilitate metric collection. Promscrape supports filtering and relabeling targets, instances, and jobs, and identifies them using the custom jobs configured in the prometheus.yaml file. The latest versions of the Sysdig agent (above v12.0.0) by default identify the processes that expose Prometheus metric endpoints on their host and send the metrics to the Sysdig collector for storage and further processing. On older versions of the Sysdig agent, you enable these features by configuring dragent.yaml.

                                                                                                                                                Working with Promscrape

                                                                                                                                                Promscrape is a lightweight Prometheus server that is embedded with the Sysdig agent. Promscrape scrapes metrics from Prometheus endpoints and sends them for storing and processing.

                                                                                                                                                Promscrape has two versions: Promscrape V1 and Promscrape V2.

                                                                                                                                                • Promscrape V2

                                                                                                                                                  Promscrape itself discovers targets by using the standard Prometheus configuration (native Prometheus service discovery), allowing the use of relabel_configs to find or modify targets. An instance of promscrape runs on every node that is running a Sysdig agent and is intended to collect metrics from local as well as remote targets specified in the prometheus.yaml file. The prometheus.yaml file you create is shared across all such nodes.

                                                                                                                                                  Promscrape V2 is enabled by default on Sysdig agent v12.5.0 and above. On older versions of Sysdig agent, you need to manually enable Promscrape V2, which allows for native Prometheus service discovery, by setting the prom_service_discovery parameter to true in dragent.yaml.

                                                                                                                                                • Promscrape V1

                                                                                                                                                  Sysdig agent discovers scrape targets through the Sysdig process_filter rules. For more information, see Process Filter.

                                                                                                                                                About Promscrape V2

                                                                                                                                                Supported Features

                                                                                                                                                Promscrape V2 supports the following native Prometheus capabilities:

• Relabeling: Promscrape V2 supports Prometheus native relabel_config and metric_relabel_configs (see the configuration sketch after this list). Relabel configuration enables the following:

                                                                                                                                                  • Drop unnecessary metrics or unwanted labels from metrics

                                                                                                                                                  • Edit the label format of the target before scraping the labels

                                                                                                                                                • Sample format: In addition to the regular sample format (metrics name, labels, and metrics reading), Promscrape V2 includes metrics type (counter, gauge, histogram, summary) to every sample sent to the agent.

                                                                                                                                                • Scraping configuration: Promscrape V2 supports all types of scraping configuration, such as federation, blackbox-exporter, and so on.

• Label mapping: Metrics can be mapped to their source (pod or process) by using source labels, which map certain Prometheus label names to known agent tags.
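
For example, a custom scrape job could combine relabel_configs and metric_relabel_configs to drop an unneeded metric family and remove a noisy label; the job name, metric prefix, and label below are hypothetical and shown only as a sketch.

scrape_configs:
- job_name: 'my-app'                       # hypothetical job name
  kubernetes_sd_configs:
  - role: pod
  relabel_configs:
    # keep only pods that opt in via the scrape annotation
  - action: keep
    source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
    regex: true
  metric_relabel_configs:
    # drop an unneeded metric family (hypothetical name)
  - action: drop
    source_labels: [__name__]
    regex: my_app_debug_.*
    # remove a high-cardinality label (hypothetical name)
  - action: labeldrop
    regex: request_id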

                                                                                                                                                Unsupported Features

                                                                                                                                                • Promscrape V2 does not support calculated metrics.

                                                                                                                                                • Promscrape V2 does not support cluster-wide features such as recording rules and alert management.

                                                                                                                                                • Service discovery configurations in Promscrape V1 (process_filter) and Promscrape V2 (prometheus.yaml) are incompatible and non-translatable.

• Promscrape V2 runs on every node and collects metrics from both local and remote targets specified in the shared prometheus.yaml file. Configuring it to scrape remote targets therefore causes every node to scrape the same targets, resulting in duplicate metrics.

• Promscrape V2 does not have a cluster-wide view, so it ignores the recording rule and alert configuration used for cluster-wide metric collection; such Prometheus configurations are therefore not supported.

                                                                                                                                                • Sysdig uses __HOSTNAME__, which is not a standard Prometheus keyword.

                                                                                                                                                Enable Promscrape V2 on Older Versions of Sysdig Agent

                                                                                                                                                To enable Prometheus native service discovery on agent versions prior to 11.2:

1. Open the dragent.yaml file.

                                                                                                                                                2. Set the following Prometheus Service Discovery parameter to true:

                                                                                                                                                  prometheus:
                                                                                                                                                    prom_service_discovery: true
                                                                                                                                                  

                                                                                                                                                  If true, promscrape.v2 is used. Otherwise, promscrape.v1 is used to scrape the targets.

                                                                                                                                                3. Restart the agent.
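
As a sketch, on these older agents the resulting prometheus section of dragent.yaml, with Prometheus support and native service discovery both switched on, could look like this:

prometheus:
  enabled: true
  prom_service_discovery: true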

                                                                                                                                                Create Custom Jobs

                                                                                                                                                Prerequisites

                                                                                                                                                Ensure the following features are enabled:

                                                                                                                                                • Monitoring Integration
                                                                                                                                                • Promscrape V2

                                                                                                                                                If you are using Sysdig agent v12.0.0 or above, these features are enabled by default.

                                                                                                                                                Prepare Custom Job

                                                                                                                                                You set up custom jobs in the Prometheus configuration file to identify endpoints that expose Prometheus metrics. Sysdig agent uses these custom jobs to scrape endpoints by using promscrape, the lightweight Prometheus server embedded in it.

                                                                                                                                                Guidelines

                                                                                                                                                • Ensure that targets are scraped only by the agent running on the same node as the target. You do this by adding the host selection relabeling rules.

• Use the Sysdig-specific relabeling rules to automatically apply the correct workload labels.

                                                                                                                                                Example Prometheus Configuration file

                                                                                                                                                The prometheus.yaml file comes with a default configuration for scraping the pods running on the local node. This configuration also includes the rules to preserve pod UID and container name labels for further correlation with Kubernetes State Metrics or Sysdig native metrics.

                                                                                                                                                Here is an example prometheus.yaml file that you can use to set up custom jobs.

                                                                                                                                                global:
                                                                                                                                                  scrape_interval: 10s
                                                                                                                                                scrape_configs:
                                                                                                                                                - job_name: 'my_pod_job'
                                                                                                                                                  sample_limit: 40000
                                                                                                                                                  tls_config:
                                                                                                                                                    insecure_skip_verify: true
                                                                                                                                                  kubernetes_sd_configs:
                                                                                                                                                  - role: pod
                                                                                                                                                  relabel_configs:
                                                                                                                                                    # Look for pod name starting with "my_pod_prefix" in namespace "my_namespace"
  - action: keep
                                                                                                                                                    source_labels: [__meta_kubernetes_namespace,__meta_kubernetes_pod_name]
                                                                                                                                                    separator: /
                                                                                                                                                    regex: my_namespace/my_pod_prefix.+
                                                                                                                                                
                                                                                                                                                    # In those pods try to scrape from port 9876
                                                                                                                                                  - source_labels: [__address__]
                                                                                                                                                    action: replace
                                                                                                                                                    target_label: __address__
    regex: ([^:]+)(?::\d+)?
                                                                                                                                                    replacement: $1:9876
                                                                                                                                                
                                                                                                                                                    # Trying to ensure we only scrape local targets
                                                                                                                                                    # __HOSTIPS__ is replaced by promscrape with a regex list of the IP addresses
                                                                                                                                                    # of all the active network interfaces on the host
                                                                                                                                                  - action: keep
                                                                                                                                                    source_labels: [__meta_kubernetes_pod_host_ip]
                                                                                                                                                    regex: __HOSTIPS__
                                                                                                                                                
                                                                                                                                                    # Holding on to pod-id and container name so we can associate the metrics
                                                                                                                                                    # with the container (and cluster hierarchy)
                                                                                                                                                  - action: replace
                                                                                                                                                    source_labels: [__meta_kubernetes_pod_uid]
                                                                                                                                                    target_label: sysdig_k8s_pod_uid
                                                                                                                                                  - action: replace
                                                                                                                                                    source_labels: [__meta_kubernetes_pod_container_name]
                                                                                                                                                    target_label: sysdig_k8s_pod_container_name
                                                                                                                                                

                                                                                                                                                Default Scrape Job

                                                                                                                                                If Monitoring Integration is not enabled for you and you still want to automatically collect metrics from pods with the Prometheus annotations set (prometheus.io/scrape=true), add the following default scrape job to your prometheus.yaml file:

                                                                                                                                                - job_name: 'k8s-pods'
                                                                                                                                                  sample_limit: 40000
                                                                                                                                                  tls_config:
                                                                                                                                                    insecure_skip_verify: true
                                                                                                                                                  kubernetes_sd_configs:
                                                                                                                                                  - role: pod
                                                                                                                                                  relabel_configs:
                                                                                                                                                    # Trying to ensure we only scrape local targets
                                                                                                                                                    # __HOSTIPS__ is replaced by promscrape with a regex list of the IP addresses
                                                                                                                                                    # of all the active network interfaces on the host
                                                                                                                                                  - action: keep
                                                                                                                                                    source_labels: [__meta_kubernetes_pod_host_ip]
                                                                                                                                                    regex: __HOSTIPS__
                                                                                                                                                  - action: keep
                                                                                                                                                    source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
                                                                                                                                                    regex: true
                                                                                                                                                  - action: replace
                                                                                                                                                    source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scheme]
                                                                                                                                                    target_label: __scheme__
                                                                                                                                                    regex: (https?)
                                                                                                                                                  - action: replace
                                                                                                                                                    source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
                                                                                                                                                    target_label: __metrics_path__
                                                                                                                                                    regex: (.+)
                                                                                                                                                  - action: replace
                                                                                                                                                    source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
                                                                                                                                                    regex: ([^:]+)(?::\d+)?;(\d+)
                                                                                                                                                    replacement: $1:$2
                                                                                                                                                    target_label: __address__
                                                                                                                                                
                                                                                                                                                    # Holding on to pod-id and container name so we can associate the metrics
                                                                                                                                                    # with the container (and cluster hierarchy)
                                                                                                                                                  - action: replace
                                                                                                                                                    source_labels: [__meta_kubernetes_pod_uid]
                                                                                                                                                    target_label: sysdig_k8s_pod_uid
                                                                                                                                                  - action: replace
                                                                                                                                                    source_labels: [__meta_kubernetes_pod_container_name]
                                                                                                                                                    target_label: sysdig_k8s_pod_container_name
                                                                                                                                                

                                                                                                                                                Default Prometheus Configuration File

                                                                                                                                                Here is the default prometheus.yaml file.

                                                                                                                                                global:
                                                                                                                                                  scrape_interval: 10s
                                                                                                                                                scrape_configs:
                                                                                                                                                - job_name: 'k8s-pods'
                                                                                                                                                  tls_config:
                                                                                                                                                    insecure_skip_verify: true
                                                                                                                                                  kubernetes_sd_configs:
                                                                                                                                                  - role: pod
                                                                                                                                                  relabel_configs:
                                                                                                                                                    # Trying to ensure we only scrape local targets
                                                                                                                                                    # __HOSTIPS__ is replaced by promscrape with a regex list of the IP addresses
                                                                                                                                                    # of all the active network interfaces on the host
                                                                                                                                                  - action: keep
                                                                                                                                                    source_labels: [__meta_kubernetes_pod_host_ip]
                                                                                                                                                    regex: __HOSTIPS__
                                                                                                                                                  - action: keep
                                                                                                                                                    source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
                                                                                                                                                    regex: true
                                                                                                                                                  - action: replace
                                                                                                                                                    source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scheme]
                                                                                                                                                    target_label: __scheme__
                                                                                                                                                    regex: (https?)
                                                                                                                                                  - action: replace
                                                                                                                                                    source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
                                                                                                                                                    target_label: __metrics_path__
                                                                                                                                                    regex: (.+)
                                                                                                                                                  - action: replace
                                                                                                                                                    source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
                                                                                                                                                    regex: ([^:]+)(?::\d+)?;(\d+)
                                                                                                                                                    replacement: $1:$2
                                                                                                                                                    target_label: __address__
                                                                                                                                                    # Holding on to pod-id and container name so we can associate the metrics
                                                                                                                                                    # with the container (and cluster hierarchy)
                                                                                                                                                  - action: replace
                                                                                                                                                    source_labels: [__meta_kubernetes_pod_uid]
                                                                                                                                                    target_label: sysdig_k8s_pod_uid
                                                                                                                                                  - action: replace
                                                                                                                                                    source_labels: [__meta_kubernetes_pod_container_name]
                                                                                                                                                    target_label: sysdig_k8s_pod_container_name
                                                                                                                                                

                                                                                                                                                Understand the Prometheus Settings

                                                                                                                                                Scrape Interval

                                                                                                                                                The default scrape interval is 10 seconds. However, the value can be overridden per scraping job. The scrape interval configured in the prometheus.yaml is independent of the agent configuration.

                                                                                                                                                Promscrape V2 reads prometheus.yaml and initiates scraping jobs.

The metrics from targets are collected per scrape interval for each target and immediately forwarded to the agent. The agent sends the metrics to the Sysdig collector every 10 seconds; only the metrics received since the last transmission are sent. If a job has a scrape interval longer than 10 seconds, the agent transmissions might not include all the metrics from that job.
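For example, a minimal sketch (the job name and target address are placeholders) showing a per-job override of the global scrape interval:

global:
  scrape_interval: 10s
scrape_configs:
# Hypothetical job that scrapes its target only every 30 seconds,
# so not every 10-second agent transmission will carry its metrics
- job_name: 'slow-exporter'
  scrape_interval: 30s
  static_configs:
  - targets: ['localhost:9090']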

                                                                                                                                                Hostname Selection

                                                                                                                                                __HOSTIPS__ is replaced by the host IP addresses. Selection by the host IP address is preferred because of its reliability.

                                                                                                                                                __HOSTNAME__ is replaced with the actual hostname before promscrape starts scraping the targets. This allows promscrape to ignore targets running on other hosts.

                                                                                                                                                Relabeling Configuration

                                                                                                                                                The default Prometheus configuration file contains the following two relabeling configurations:

                                                                                                                                                - action: replace
                                                                                                                                                  source_labels: [__meta_kubernetes_pod_uid]
                                                                                                                                                  target_label: sysdig_k8s_pod_uid
                                                                                                                                                - action: replace
                                                                                                                                                  source_labels: [__meta_kubernetes_pod_container_name]
                                                                                                                                                  target_label: sysdig_k8s_pod_container_name
                                                                                                                                                

These rules add two labels, sysdig_k8s_pod_uid and sysdig_k8s_pod_container_name, to every metric gathered from the local targets, containing the pod ID and the container name respectively. These labels are dropped from the metrics before they are sent to the Sysdig collector for further processing.

                                                                                                                                                Configure Prometheus Configuration File Using the Agent Configmap

                                                                                                                                                Here is an example for setting up the prometheus.yaml file using the agent configmap:

                                                                                                                                                apiVersion: v1
                                                                                                                                                data:
                                                                                                                                                  dragent.yaml: |
                                                                                                                                                    new_k8s: true
                                                                                                                                                    k8s_cluster_name: your-cluster-name
                                                                                                                                                    metrics_excess_log: true
                                                                                                                                                    10s_flush_enable: true
                                                                                                                                                    app_checks_enabled: false
                                                                                                                                                    use_promscrape: true
                                                                                                                                                    promscrape_fastproto: true
                                                                                                                                                    prometheus:
                                                                                                                                                      enabled: true
                                                                                                                                                      prom_service_discovery: true
                                                                                                                                                      log_errors: true
                                                                                                                                                      max_metrics: 200000
                                                                                                                                                      max_metrics_per_process: 200000
                                                                                                                                                      max_tags_per_metric: 100
                                                                                                                                                      ingest_raw: true
                                                                                                                                                      ingest_calculated: false
                                                                                                                                                    snaplen: 512
                                                                                                                                                    tags: role:cluster
                                                                                                                                                  prometheus.yaml: |
                                                                                                                                                    global:
                                                                                                                                                      scrape_interval: 10s
                                                                                                                                                    scrape_configs:
                                                                                                                                                    - job_name: 'haproxy-router'
                                                                                                                                                      basic_auth:
                                                                                                                                                        username: USER
                                                                                                                                                        password: PASSWORD
                                                                                                                                                      tls_config:
                                                                                                                                                        insecure_skip_verify: true
                                                                                                                                                      kubernetes_sd_configs:
                                                                                                                                                      - role: pod
                                                                                                                                                      relabel_configs:
                                                                                                                                                        # Trying to ensure we only scrape local targets
                                                                                                                                                        # We need the wildcard at the end because in AWS the node name is the FQDN,
                                                                                                                                                        # whereas in Azure the node name is the base host name
                                                                                                                                                      - action: keep
                                                                                                                                                        source_labels: [__meta_kubernetes_pod_host_ip]
                                                                                                                                                        regex: __HOSTIPS__
                                                                                                                                                      - action: keep
                                                                                                                                                        source_labels:
                                                                                                                                                        - __meta_kubernetes_namespace
                                                                                                                                                        - __meta_kubernetes_pod_name
                                                                                                                                                        separator: '/'
                                                                                                                                                        regex: 'default/router-1-.+'
                                                                                                                                                        # Holding on to pod-id and container name so we can associate the metrics
                                                                                                                                                        # with the container (and cluster hierarchy)
                                                                                                                                                      - action: replace
                                                                                                                                                        source_labels: [__meta_kubernetes_pod_uid]
                                                                                                                                                        target_label: sysdig_k8s_pod_uid
                                                                                                                                                      - action: replace
                                                                                                                                                        source_labels: [__meta_kubernetes_pod_container_name]
                                                                                                                                                        target_label: sysdig_k8s_pod_container_name
                                                                                                                                                
                                                                                                                                                kind: ConfigMap
                                                                                                                                                metadata:
                                                                                                                                                    labels:
                                                                                                                                                      app: sysdig-agent
                                                                                                                                                    name: sysdig-agent
                                                                                                                                                    namespace: sysdig-agent
                                                                                                                                                

                                                                                                                                                3.1.3 -

                                                                                                                                                Migrating from Promscrape V1 to V2

Promscrape is the lightweight Prometheus server in the Sysdig agent. An updated version of promscrape, named Promscrape V2, is available. This configuration is controlled by the prom_service_discovery parameter in the dragent.yaml file. To use the latest features, such as Service Discovery and Monitoring Integrations, you need to have this option enabled in your environment.
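A minimal dragent.yaml sketch enabling Promscrape V2 (only the Prometheus-related keys are shown; other settings are unchanged):

prometheus:
  enabled: true
  # Enables Promscrape V2 and Prometheus-native service discovery
  prom_service_discovery: true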

                                                                                                                                                Compare Promscrape V1 and V2

                                                                                                                                                The main difference between V1 and V2 is how scrape targets are determined.

In V1, targets are found through process-filtering rules configured in dragent.yaml, or in dragent.default.yaml if no rules are given in dragent.yaml. The process-filtering rules are applied to all the running processes on the host. Matches are made based on process attributes, such as the process name or the TCP ports being listened on, as well as associated context from Docker or Kubernetes, such as container labels or Kubernetes annotations.

                                                                                                                                                With Promscrape V2, scrape targets are determined by scrape_configs fields in a prometheus.yaml file (or the prometheus-v2.default.yaml file if no prometheus.yaml exists). Because promscrape is adapted from the open-source Prometheus server, the scrape_config settings are compatible with the normal Prometheus configuration. Here is an example:

                                                                                                                                                global:
                                                                                                                                                  scrape_interval: 10s
                                                                                                                                                scrape_configs:
                                                                                                                                                - job_name: 'my_pod_job'
                                                                                                                                                  sample_limit: 40000
                                                                                                                                                  tls_config:
                                                                                                                                                    insecure_skip_verify: true
                                                                                                                                                  kubernetes_sd_configs:
                                                                                                                                                  - role: pod
                                                                                                                                                  relabel_configs:
                                                                                                                                                    # Look for pod name starting with "my_pod_prefix" in namespace "my_namespace"
  - action: keep
    source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_pod_name]
    separator: /
    regex: my_namespace/my_pod_prefix.+
                                                                                                                                                  - action: keep
                                                                                                                                                    source_labels: [__meta_kubernetes_pod_label_app]
                                                                                                                                                    regex: my_app_metrics
                                                                                                                                                
                                                                                                                                                    # In those pods try to scrape from port 9876
                                                                                                                                                  - source_labels: [__address__]
                                                                                                                                                    action: replace
                                                                                                                                                    target_label: __address__
    regex: ([^:]+)(?::\d+)?
                                                                                                                                                    replacement: $1:9876
                                                                                                                                                
                                                                                                                                                    # Trying to ensure we only scrape local targets
                                                                                                                                                    # __HOSTIPS__ is replaced by promscrape with a regex list of the IP addresses
                                                                                                                                                    # of all the active network interfaces on the host
                                                                                                                                                  - action: keep
                                                                                                                                                    source_labels: [__meta_kubernetes_pod_host_ip]
                                                                                                                                                    regex: __HOSTIPS__
                                                                                                                                                
                                                                                                                                                    # Holding on to pod-id and container name so we can associate the metrics
                                                                                                                                                    # with the container (and cluster hierarchy)
                                                                                                                                                  - action: replace
                                                                                                                                                    source_labels: [__meta_kubernetes_pod_uid]
                                                                                                                                                    target_label: sysdig_k8s_pod_uid
                                                                                                                                                  - action: replace
                                                                                                                                                    source_labels: [__meta_kubernetes_pod_container_name]
                                                                                                                                                    target_label: sysdig_k8s_pod_container_name
                                                                                                                                                

                                                                                                                                                Migrate Using Default Configuration

The default configuration for Promscrape V1 triggers scraping based on standard Kubernetes pod annotations and container labels. The default configuration for V2 currently triggers scraping only based on the standard Kubernetes pod annotations, leveraging Prometheus-native service discovery.

                                                                                                                                                Example Pod Annotations

• prometheus.io/scrape: "true". Required field. Set under spec.template.metadata.annotations in the pod template.

• prometheus.io/port: the port number to scrape. Optional; all pod-registered ports are scraped if omitted.

• prometheus.io/scheme: <http|https>. Optional; the default is http.

• prometheus.io/path: the URL path to scrape. Optional; the default is /metrics.
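A minimal sketch of a pod template carrying these annotations (the port value is a placeholder for whichever container port serves metrics):

spec:
  template:
    metadata:
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "8080"      # placeholder: the container port exposing metrics
        prometheus.io/path: "/metrics"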

                                                                                                                                                Example Static Job

                                                                                                                                                - job_name: 'static10'
                                                                                                                                                  static_configs:
                                                                                                                                                    - targets: ['localhost:5010']
                                                                                                                                                

                                                                                                                                                Guidelines

• Users running Kubernetes with the Promscrape V1 default rules and triggering scraping based on pod annotations need not take any action to migrate to V2. The migration happens automatically.

• Users operating non-Kubernetes environments might need to continue using V1 for now, depending on how scraping is triggered. Promscrape V2 does not currently support using container and Docker labels to discover Prometheus metrics endpoints. If your environment relies on these, define static jobs with the IP:port to be scraped.

                                                                                                                                                Migrate Using Custom Rules

If you rely on custom process_filter rules to collect metrics, use standard Prometheus configuration syntax to scrape the endpoints. We recommend one of the following:

• Add the standard Prometheus annotations to your pods. For more information, see Migrate Using Default Configuration.
• Write a Prometheus scrape_config that uses Kubernetes pod service discovery and the appropriate pod metadata to trigger the scrapes, as in the sketch that follows the conversion examples below.

See the examples below for converting your process_filter rules to Prometheus terminology. Each process_filter rule is followed by its Prometheus equivalent.

process_filter:
- include:
    kubernetes.pod.annotation.sysdig.com/test: true
Prometheus:
- action: keep
  source_labels: [__meta_kubernetes_pod_annotation_sysdig_com_test]
  regex: true

process_filter:
- include:
    kubernetes.pod.label.app: sysdig
Prometheus:
- action: keep
  source_labels: [__meta_kubernetes_pod_label_app]
  regex: 'sysdig'

process_filter:
- include:
    container.label.com.sysdig.test: true
Prometheus: Not supported.

process_filter:
- include:
    process.name: test
Prometheus: Not supported.

process_filter:
- include:
    process.cmdline: sysdig-agent
Prometheus: Not supported.

process_filter:
- include:
    port: 8080
Prometheus:
- action: keep
  source_labels: [__meta_kubernetes_pod_container_port_number]
  regex: '8080'

process_filter:
- include:
    container.image: sysdig-agent
Prometheus: Not supported.

process_filter:
- include:
    container.name: sysdig-agent
Prometheus:
- action: keep
  source_labels: [__meta_kubernetes_pod_container_name]
  regex: 'sysdig-agent'

process_filter:
- include:
    appcheck.match: sysdig
Prometheus: Appchecks are not compatible with Promscrape V2. See Configure Monitoring Integrations for supported integrations.
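As referenced above, here is a minimal sketch of a complete scrape_config built around one of these conversions (the container.name: sysdig-agent rule); the job name is a placeholder:

- job_name: 'converted-container-name'
  kubernetes_sd_configs:
  - role: pod
  relabel_configs:
    # Scrape only targets running on the local host
  - action: keep
    source_labels: [__meta_kubernetes_pod_host_ip]
    regex: __HOSTIPS__
    # Equivalent of the process_filter rule "container.name: sysdig-agent"
  - action: keep
    source_labels: [__meta_kubernetes_pod_container_name]
    regex: 'sysdig-agent'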

                                                                                                                                                Contact Support

                                                                                                                                                If you have any queries related to promscrape migration, contact Sysdig Support.

                                                                                                                                                3.2 -

                                                                                                                                                Integrate JMX Metrics from Java Virtual Machines

The Sysdig agent retrieves data from your Java virtual machines using the JMX protocol. The agent is configured to automatically discover active Java virtual machines and poll them for basic JVM metrics, such as heap memory and garbage collector usage, as well as application-specific metrics. The following applications are currently supported by default:

                                                                                                                                                • ActiveMQ
                                                                                                                                                • Cassandra
                                                                                                                                                • Elasticsearch
                                                                                                                                                • HBase
                                                                                                                                                • Kafka
                                                                                                                                                • Tomcat
                                                                                                                                                • Zookeeper

                                                                                                                                                The agent can also be easily configured to extract custom JMX metrics coming from your own Java processes. Metrics extracted are shown in the pre-defined Application views or under the Metrics > JVM and JMX menus.

                                                                                                                                                The module java.management must be loaded for the Sysdig agent to collect both JVM and JMX metrics.

                                                                                                                                                The default JMX metrics configuration is found in the /opt/draios/etc/dragent.default.yaml file. When customizing existing entries, copy the complete application’s bean listing from that defaults yaml file into the user settings file /opt/draios/etc/dragent.yaml. The Sysdig agent will merge configurations of both files.

Java versions 7 to 10 are currently supported by the Sysdig agent.

For Java 11 to 14, you must run agent version 10.1.0 or later and must run the application with the JMX Remote option.

                                                                                                                                                Here is what your dragent.yaml file might look like for a customized entry for the Spark application:

                                                                                                                                                customerid: 07c948-your-key-here-006f3b
                                                                                                                                                tags: local:nyc,service:db3
                                                                                                                                                jmx:
                                                                                                                                                  per_process_beans:
                                                                                                                                                    spark:
                                                                                                                                                      pattern: "spark"
                                                                                                                                                      beans:
                                                                                                                                                        - query: "metrics:name=Spark shell.BlockManager.disk.diskSpaceUsed_MB"
                                                                                                                                                          attributes:
                                                                                                                                                            - name: VALUE
                                                                                                                                                              alias: spark.metric
                                                                                                                                                

                                                                                                                                                Include the jmx: and per_process_beans: section headers at the beginning of your application/bean list. For more information on adding parameters to a container agent’s configuration file, see Understanding the Agent Config Files.

                                                                                                                                                Bean Configuration

                                                                                                                                                Basic JVM metrics are pre-defined inside the default_beans: section. This section is defined in the agent’s default settings file and contains beans and attributes that are going to be polled for every Java process, like memory and garbage collector usage:

                                                                                                                                                jmx:
                                                                                                                                                  default_beans:
                                                                                                                                                    - query: "java.lang:type=Memory"
                                                                                                                                                      attributes:
                                                                                                                                                        - HeapMemoryUsage
                                                                                                                                                        - NonHeapMemoryUsage
                                                                                                                                                    - query: "java.lang:type=GarbageCollector,*"
                                                                                                                                                      attributes:
                                                                                                                                                        - name: "CollectionCount"
                                                                                                                                                          type: "counter"
                                                                                                                                                        - name: "CollectionTime"
                                                                                                                                                          type: "counter"
                                                                                                                                                

                                                                                                                                                Metrics specific for each application are specified in sections named after the applications. For example, this is the Tomcat section:

                                                                                                                                                per_process_beans:
                                                                                                                                                    tomcat:
                                                                                                                                                      pattern: "catalina"
                                                                                                                                                      beans:
                                                                                                                                                        - query: "Catalina:type=Cache,*"
                                                                                                                                                          attributes:
                                                                                                                                                            - accessCount
                                                                                                                                                            - cacheSize
                                                                                                                                                            - hitsCount
                                                                                                                                                            - . . .
                                                                                                                                                

The key name, tomcat in this case, is displayed as the process name in the Sysdig Monitor user interface instead of just java. The pattern: parameter specifies a string used to match a Java process name and arguments with this set of JMX metrics. If the full name of the process's main class contains the given text, the process is tagged and the metrics specified in the section are fetched.

                                                                                                                                                The class names are matched against the process argument list. If you implement JMX metrics in a custom manner that does not expose the class names on the command line, you will need to find a pattern which conveniently matches your java invocation command line.
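For example, a hypothetical entry (the application key, pattern, bean query, and alias are all placeholders) that matches a process whose command line contains com.example.OrderService:

jmx:
  per_process_beans:
    orderservice:
      pattern: "com.example.OrderService"   # matched against the java command line
      beans:
        - query: "com.example:type=OrderMetrics"
          attributes:
            - name: PendingOrders
              alias: orderservice.orders.pending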

The beans: section contains the list of beans to be queried, based on JMX patterns. JMX patterns are explained in detail in the Oracle documentation, but in practice the format of the query line is fairly simple: you can specify the full name of the bean, such as java.lang:type=Memory, or you can fetch multiple beans in a single line using the wildcard * as in java.lang:type=GarbageCollector,* (note that this is just a wildcard, not a regex).

To get the list of all the beans and attributes that your application exports, you can use JVisualVM, Jmxterm, JConsole, or similar tools. JConsole, available when you install the Java Development Kit, shows the namespace, bean, and attribute (metric) information.

For each query, you have to specify the attributes that you want to retrieve, and for each of them a new metric is created. We support the following JMX attribute types (for these attributes, all the subattributes are retrieved):

Attributes may be absolute values or rates. For absolute values, a per-second rate is calculated before sending them; in this case, specify type: counter. The default type is rate, which can be omitted, so usually you can simply write the attribute name.
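For instance, a small sketch (the bean query and attribute names are illustrative) showing both forms: an attribute listed with only its name, treated as a rate, and one explicitly marked as a counter:

beans:
  - query: "java.lang:type=Threading"
    attributes:
      - ThreadCount                     # name-only form, default type: rate
      - name: TotalStartedThreadCount
        type: counter                   # absolute value, converted to a per-second rate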

                                                                                                                                                Limits

The total number of JMX metrics polled per host is limited to 500. The maximum number of beans queried per process is limited to 300. If more metrics are needed, contact your sales representative with your use case.

                                                                                                                                                In agents 0.46 and earlier, the limit was 100 beans for each process.

                                                                                                                                                Aliases

JMX beans and attributes can have very long names. To avoid cluttering the interface, aliasing is supported: you can specify an alias in the attribute configuration. For example:

                                                                                                                                                  cassandra:
                                                                                                                                                    pattern: "cassandra"
                                                                                                                                                    beans:
                                                                                                                                                      - query: "org.apache.cassandra.db:type=StorageProxy
                                                                                                                                                        attributes:
                                                                                                                                                          - name: RecentWriteLatencyMicros
                                                                                                                                                            alias: cassandra.write.latency
                                                                                                                                                          - name: RecentReadLatencyMicros
                                                                                                                                                            alias: cassandra.read.latency
                                                                                                                                                

This way, the alias is used in Sysdig Monitor instead of the raw bean name. Aliases can also be dynamic, built from the bean name, which is useful when you use pattern bean queries. For example, here the bean's name property is used to create different metrics:

                                                                                                                                                      - query: "java.lang:type=GarbageCollector,*"
                                                                                                                                                        attributes:
                                                                                                                                                          - name: CollectionCount
                                                                                                                                                            type: counter
                                                                                                                                                            alias: jvm.gc.{name}.count
                                                                                                                                                          - name: CollectionTime
                                                                                                                                                            type: counter
                                                                                                                                                            alias: jvm.gc.{name}.time
                                                                                                                                                

This query matches multiple beans (all garbage collectors), and the metric name reflects the name of each garbage collector, for example jvm.gc.ConcurrentMarkSweep.count. The general syntax is {<bean_property_key>}. To list all bean properties, you can use a JMX explorer such as JVisualVM or Jmxterm.

To use these metrics in PromQL queries, add the jmx_ prefix and replace the dots (.) in the metric name with underscores (_). For example, the metric jvm.gc.ConcurrentMarkSweep.count becomes jmx_jvm_gc_ConcurrentMarkSweep_count in PromQL.
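
For instance, a hypothetical PromQL query plotting the per-second garbage collection rate from the aliased counter above might look like:

  rate(jmx_jvm_gc_ConcurrentMarkSweep_count[5m])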

                                                                                                                                                Troubleshooting: Why Can’t I See Java (JMX) Metrics?

                                                                                                                                                The Sysdig agent normally auto-discovers Java processes running on your host and enables the JMX extensions for polling them.

                                                                                                                                                JMX Remote

                                                                                                                                                If your Java application is not discovered automatically by the agent, try adding the following parameter on your application’s command line:

                                                                                                                                                 -Dcom.sun.management.jmxremote
                                                                                                                                                

                                                                                                                                                For more information, see Oracle’s web page on monitoring using JMX technology.

                                                                                                                                                Java Versions

Java versions 7 through 10 are currently supported by the Sysdig agents.

For Java 11 through 14, you must run agent version 10.1.0 or later and start the application with the JMX Remote option.

                                                                                                                                                Java-Based Applications and JMX Authentication

For Java-based applications (such as Cassandra, Elasticsearch, Kafka, Tomcat, and ZooKeeper), the Sysdig agent requires the Java Runtime Environment (JRE) to be installed to poll for metrics (beans).

                                                                                                                                                The Sysdig agent does not support JMX authentication.

If the Docker-container-based Sysdig agent is installed, the JRE is installed alongside the agent binaries and no further dependencies exist. However, if you are installing the service-based (non-container) agent and you do not see the JVM/JMX metrics reporting, your host may not have the JRE installed, or it may not be installed in the expected location: /usr/bin/java

                                                                                                                                                To confirm if the Sysdig agent is able to find the JRE, restart the agent with service dragent restart and check the agent’s /opt/draios/logs/draios.log file for the two Java detection and location log entries recorded during agent startup.
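
As a quick check, a shell one-liner along these lines (a sketch; the exact log wording can vary by agent version) surfaces only those entries:

  grep -i "java detected\|java_binary" /opt/draios/logs/draios.log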

                                                                                                                                                Example if Java is missing or not found:

                                                                                                                                                2017-09-08 23:19:27.944, Information, java detected: false
                                                                                                                                                2017-09-08 23:19:27.944, Information, java_binary:
                                                                                                                                                

                                                                                                                                                Example if Java is found:

                                                                                                                                                2017-09-08 23:19:27.944, Information, java detected: true
                                                                                                                                                2017-09-08 23:19:27.944, Information, java_binary: /usr/bin/java
                                                                                                                                                

If Java is not installed, the resolution is to install the Java Runtime Environment. If your host has Java installed but not in the expected location (/usr/bin/java), you can either create a symlink from /usr/bin/java to the actual binary or set the java_home: variable in the Sysdig agent's configuration file /opt/draios/etc/dragent.yaml:

                                                                                                                                                java_home: /usr/my_java_location/
                                                                                                                                                
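Alternatively, a symlink can be created; a minimal sketch, assuming the actual binary lives under the hypothetical /usr/my_java_location/ path used above:

  sudo ln -s /usr/my_java_location/bin/java /usr/bin/java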

                                                                                                                                                Disabling JMX Polling

                                                                                                                                                If you do not need it or otherwise want to disable JMX metrics reporting, you can add the following two lines to the agent’s user settings configuration file /opt/draios/etc/dragent.yaml:

                                                                                                                                                jmx:
                                                                                                                                                  enabled: false
                                                                                                                                                

                                                                                                                                                After editing the file, restart the native Linux agent via service dragent restart or restart the container agent to make the change take effect.

                                                                                                                                                If using our containerized agent, instead of editing the dragent.yaml file, you can add this extra parameter in the docker run command when starting the agent:

                                                                                                                                                -e ADDITIONAL_CONF="jmx:\n  enabled: false\n"
                                                                                                                                                

                                                                                                                                                3.3 -

                                                                                                                                                Integrate StatsD Metrics

StatsD is an open-source project built by Etsy. Using a StatsD library specific to your application's language, you can easily generate and transmit custom application metrics to a collection server.

The Sysdig agent contains an embedded StatsD server, so your custom metrics can be sent to our collector and relayed to the Sysdig Monitor backend for aggregation. Your application metrics and the rich set of metrics the agent already collects can all be visualized in the same simple, intuitive graphical interface. Configuring alert notifications works exactly the same way.

                                                                                                                                                Installation and Configuration

The StatsD server, embedded in the Sysdig agent beginning with version 0.1.136, is pre-configured and starts by default, so no additional user configuration is necessary. Install the agent directly in a supported distribution, or install the Docker containerized version on your container server, and you're done.

                                                                                                                                                Sending StatsD Metrics

                                                                                                                                                Active Collection

By default, the Sysdig agent's embedded StatsD collector listens on the standard StatsD port, 8125, on both TCP and UDP. StatsD is a text-based protocol in which samples are separated by a newline (\n).

                                                                                                                                                Sending metrics from your application to the collector is as simple as:

                                                                                                                                                echo "hello_statsd:1|c" > /dev/udp/127.0.0.1/8125
                                                                                                                                                

The example transmits the counter metric "hello_statsd" with a value of 1 to the StatsD collector listening on UDP port 8125. Here is a second example, sending the output of a more complex shell command that gives the number of established network connections:

                                                                                                                                                echo "EstablishedConnections:`netstat -a | grep ESTAB | wc -l`|c" > /dev/udp/127.0.0.1/8125
                                                                                                                                                
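If /dev/udp is not available in your shell, an equivalent sketch using netcat (assuming nc is installed) would be:

  echo "hello_statsd:1|c" | nc -u -w1 127.0.0.1 8125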

                                                                                                                                                The protocol format is as follows:

                                                                                                                                                METRIC_NAME:METRIC_VALUE|TYPE[|@SAMPLING_RATIO]
                                                                                                                                                

Metric names can be any string except the reserved characters |#:@. The value is a number whose meaning depends on the metric type. The type can be any of c, ms, g, or s. The sampling ratio is a value between 0 (exclusive) and 1 and is used to handle subsampling. Once sent, metrics are available in the same display menus and subviews as the built-in metrics.
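
For example, a client that only reports one in ten events could send the following hypothetical sample; under standard StatsD semantics the collector scales the received count accordingly:

  api.requests:1|c|@0.1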

                                                                                                                                                Passive Collection

                                                                                                                                                In infrastructures already containing a third party StatsD collection server, StatsD metrics can be collected “out of band”. A passive collection technique is automatically performed by our agent by intercepting system calls - as is done for all the Sysdig Monitor metrics normally collected. This method does not require changing your current StatsD configuration and is an excellent way to ’test drive’ the Sysdig Monitor application without having to perform any modifications other than agent installation.

                                                                                                                                                The passive mode of collection is especially suitable for containerized environments where simplicity and efficiency are essential. With the containerized version of the Sysdig Monitor agent running on the host, all other container applications can continue to transmit to any currently implemented collector. In the case where no collector exists, container applications can simply be configured to send StatsD metrics to the localhost interface (127.0.0.1) as demonstrated above - no actual StatsD server needs to be listening at that address.

Effectively, each network transmission made from inside the application container, including StatsD messages sent to a non-existent destination, generates a system call. The Sysdig agent captures these system calls from its own container, where the StatsD collector is listening. In practice, the Sysdig agent acts as a transparent proxy between the application and the StatsD collector, even if they are in different containers. The agent also correlates which container each system call comes from and uses that information to transparently label the StatsD messages.

                                                                                                                                                The above graphic demonstrates the components of the Sysdig agent and where metrics are actively or passively collected. Regardless of the method of collection, the number of StatsD metrics the agent can transmit is limited by your payment plan.

                                                                                                                                                Note 1: When using the passive technique, ICMP port unreachable events may be generated on the host network.

                                                                                                                                                Note 2: Some clients may use IPv6 addressing (::1) for the “localhost” address string. Metrics collection over IPv6 is not supported at this time. If your StatsD metrics are not visible in the Sysdig Monitor interface, please use “127.0.0.1” instead of “localhost” string to force IPv4. Another solution that may be required is adding the JVM option: java.net.preferIPv4Stack=true.
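
For example, the JVM option can be supplied on the Java command line when starting the application (a sketch; myapp.jar is a placeholder):

  java -Djava.net.preferIPv4Stack=true -jar myapp.jar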

Note 3: When StatsD metrics are not transmitted continuously by your application (once per second, as is the case for all agent-created metrics), the charts will render a 'zero' or null value. Alert conditions only consider the StatsD values actually transmitted and ignore the nulls.

                                                                                                                                                Supported Metric Types

                                                                                                                                                Counter

                                                                                                                                                A counter metric is updated with the value sent by the application, sent to the Sysdig Monitor backend, and then reset to zero. You can use it to count, for example, how many calls have been made to an API:

                                                                                                                                                api.login:1|c
                                                                                                                                                

                                                                                                                                                You can specify negative values to decrement a counter.
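
For example, this sample decrements the same hypothetical counter by one:

  api.login:-1|c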

                                                                                                                                                Gauge

                                                                                                                                                A gauge is a single value that will be sent as is:

                                                                                                                                                table_size:10000|g
                                                                                                                                                

These are plotted as received; in other words, they are point-in-time metrics. You can achieve relative increments or decrements on a gauge by prepending the value with a + or a -, respectively. As an example, these three samples will cause table_size to be 950:

                                                                                                                                                table_size:1000|g
                                                                                                                                                table_size:-100|g
                                                                                                                                                table_size:+50|g
                                                                                                                                                

In Sysdig Monitor, the gauge value is only rendered on the various charts when it is actually transmitted by your application. When not transmitted, a null is plotted on the charts, which is not used in any calculations or alerts.

                                                                                                                                                Set

                                                                                                                                                A set is like a counter, but it counts unique elements. For example:

active_users:user1|s
active_users:user2|s
active_users:user1|s
                                                                                                                                                

These samples will cause the value of active_users to be 2.

                                                                                                                                                Metric Labels

Labels are an extension of the StatsD specification offered by Sysdig Monitor to provide more flexibility in the way metrics are grouped, filtered, and visualized. Labeling uses the following syntax:

                                                                                                                                                enqueued_messages#az=eu-west-3,country=italy:10|c
                                                                                                                                                

                                                                                                                                                In general, this is the syntax you can use for labeling:

                                                                                                                                                METRIC_NAME#LABEL_NAME=LABEL_VALUE,LABEL_NAME ...
                                                                                                                                                

                                                                                                                                                Labels can be simple strings or key/value pairs, separated by an = sign. Simple labels can be used for filtering in the Sysdig Monitor web interface. Key/value labels can be used for both filtering and segmentation.
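
For example, a hypothetical sample that combines a simple label with a key/value label:

  login_errors#canary,region=us-east-1:3|c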

                                                                                                                                                Label names prefixed with ‘agent.label’ are reserved for Sysdig agent use only and any custom labels starting with that prefix will be ignored.

                                                                                                                                                Limits

The number of StatsD metrics the agent can transmit is limited to 1000 for the host and 1000 for all running containers combined. If more metrics are needed, contact your sales representative with your use case.

                                                                                                                                                Collect StatsD Metrics Under Load

                                                                                                                                                The Sysdig agent can reliably receive StatsD metrics from containers, even while the agent is under load. This setting is controlled by the use_forwarder configuration parameter.

                                                                                                                                                The Sysdig agent automatically parses and records StatsD metrics. Historically, the agent parsed the system call stream from the kernel in order to read and record StatsD metrics from containers. For performance reasons, the agent may not be able to collect all StatsD metrics using this mechanism if the load is high. For example, if the StatsD client writes more than 2kB worth of StatsD metrics in a single system call, the agent will truncate the StatsD message, resulting in loss of StatsD metrics.

With the introduction of the togglable use_forwarder option, the agent can collect StatsD metrics even under high load.

This feature was introduced in Sysdig agent v0.90.1 and, as of agent v10.4.0, is enabled by default:

                                                                                                                                                statsd:
                                                                                                                                                  use_forwarder: true
                                                                                                                                                

                                                                                                                                                To disable, set it to false:

                                                                                                                                                statsd:
                                                                                                                                                  use_forwarder: false
                                                                                                                                                

When enabled, rather than using the system call stream for container StatsD messages, the agent listens for UDP datagrams on the configured StatsD port on localhost within the container's network namespace. This enables the agent to reliably receive StatsD metrics from containers, even while the agent is under load.

                                                                                                                                                This option introduces a behavior change in the agent, both in the destination address and in port settings.

                                                                                                                                                • When the option is disabled, the agent reads StatsD metrics that are destined to any remote address.

When the option is enabled, the agent receives only those metrics that are addressed to localhost.

                                                                                                                                                • When the option is disabled, the agent reads the container StatsD messages destined to only port 8125.

                                                                                                                                                  When the option is enabled, the agent uses the configured StatsD port.

                                                                                                                                                StatsD Server Running in a Monitored Container

The forwarder is not suitable when a StatsD server is already running in the container you are monitoring.

A StatsD server running in a container already has a process bound to port 8125 (or the configured StatsD port), so the forwarder cannot use that port to collect metrics. A 10-second startup delay exists in the detection logic to allow any custom StatsD process to bind to that port before the forwarder does, ensuring the forwarder does not interrupt its operation.

                                                                                                                                                Therefore, for this particular use case, you will need to use the traditional method. Disable the forwarder and capture the metrics via the system call stream.

                                                                                                                                                Compatible Clients

Every StatsD-compliant client works with our implementation. Here is a quick list, provided as a reference only; we do not support these clients themselves, only compliance with the protocol specification.

                                                                                                                                                A full list can be found at the StatsD GitHub page.

                                                                                                                                                Turning Off StatsD Reporting

                                                                                                                                                To disable Sysdig agent’s embedded StatsD server, append the following lines to the /opt/draios/etc/dragent.yaml configuration file in each installed host:

                                                                                                                                                statsd:
                                                                                                                                                  enabled: false
                                                                                                                                                

                                                                                                                                                Note that if Sysdig Secure is used, a compliance check is enabled by default and it sends metrics via StatsD. When disabling StatsD, you need to disable the compliance check as well.

                                                                                                                                                security:
                                                                                                                                                  default_compliance_schedule: ""
                                                                                                                                                

                                                                                                                                                After modifying the configuration file, you will need to restart the agent with:

                                                                                                                                                service dragent restart
                                                                                                                                                

                                                                                                                                                Changing the StatsD Listener Port and Transport Protocol

                                                                                                                                                To modify the port that the agent’s embedded StatsD server listens on, append the following lines to the /opt/draios/etc/dragent.yaml configuration file in each installed host (replace #### with your port):

                                                                                                                                                statsd:
                                                                                                                                                  tcp_port: ####
                                                                                                                                                  udp_port: ####
                                                                                                                                                

                                                                                                                                                Characters Allowed For StatsD Metric Names

Use standard ASCII characters. We also suggest using dot-separated (.) namespaces, as we do for all our metrics.

                                                                                                                                                Allowed characters: a-z A-Z 0-9 _ .
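
For example, hypothetical metric names that follow this convention:

  webapp.checkout.orders_completed:1|c
  webapp.checkout.response_time:120|ms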

                                                                                                                                                For more information on adding parameters to a container agent’s configuration file, see /en/docs/installation/sysdig-agent/agent-configuration/understand-the-agent-configuration/.

                                                                                                                                                3.4 -

                                                                                                                                                Integrate Node.js Application Metrics

Sysdig can monitor node.js applications by linking a library to the node.js code, which creates a server in the code to export the metrics.

                                                                                                                                                The example below shows a node.js application that exports metrics using the Prometheus protocol:

                                                                                                                                                {
                                                                                                                                                          "name": "node-example",
                                                                                                                                                          "version": "1.0.0",
                                                                                                                                                          "description": "Node example exporting metrics via Prometheus",
                                                                                                                                                          "main": "index.js",
                                                                                                                                                          "scripts": {
                                                                                                                                                            "test": "echo \"Error: no test specified\" && exit 1"
                                                                                                                                                          },
                                                                                                                                                          "license": "BSD-2-Clause",
                                                                                                                                                          "dependencies": {
                                                                                                                                                            "express": "^4.14.0",
                                                                                                                                                            "gc-stats": "^1.0.0",
                                                                                                                                                            "prom-client": "^6.3.0",
                                                                                                                                                            "prometheus-gc-stats": "^0.3.1"
                                                                                                                                                          }
                                                                                                                                                }
                                                                                                                                                

The corresponding index.js is shown below:

// Use express as HTTP middleware
// Feel free to use your own
var express = require('express')
var app = express()

// Initialize Prometheus exporter
const prom = require('prom-client')
const prom_gc = require('prometheus-gc-stats')
prom_gc()

// Sample HTTP route
app.get('/', function (req, res) {
  res.send('Hello World!')
})

// Export Prometheus metrics from /metrics endpoint
app.get('/metrics', function (req, res) {
  res.end(prom.register.metrics());
});

app.listen(3000, function () {
  console.log('Example app listening on port 3000!')
})
                                                                                                                                                

                                                                                                                                                To integrate an application:

1. Add an appcheck in the Dockerfile:

                                                                                                                                                  FROM node:latest
                                                                                                                                                  WORKDIR /app
                                                                                                                                                  ADD package.json ./
                                                                                                                                                  RUN npm install
                                                                                                                                                  ENV SYSDIG_AGENT_CONF 'app_checks: [{name: node, check_module: prometheus, pattern: {comm: node}, conf: { url: "http://localhost:{port}/metrics" }}]'
                                                                                                                                                  ADD index.js ./
                                                                                                                                                  ENTRYPOINT [ "node", "index.js" ]
                                                                                                                                                  
                                                                                                                                                2. Run the application:

user@host:~$ docker build -t node-example .
user@host:~$ docker run -d node-example
                                                                                                                                                  

                                                                                                                                                Once the Sysdig agent is deployed, node.js metrics will be automatically retrieved. The image below shows an example of key node.js metrics visible on the Sysdig Monitor UI:

                                                                                                                                                For code and configuration examples, refer to the Github repository.

                                                                                                                                                4.1 -

                                                                                                                                                Configure PVC Metrics

                                                                                                                                                You can use dashboards and alerts for PersistentVolumeClaim (PVC) metrics in the regions where PVC metrics are supported.

                                                                                                                                                To see data on PVC dashboards and alerts, ensure that the prerequisites are met.

                                                                                                                                                Prerequisites

                                                                                                                                                Apply Rules

                                                                                                                                                If you are upgrading the Sysdig agent, either download sysdig-agent-clusterrole.yaml or apply the following rule to the ClusterRole associated with your Sysdig agent.

rules:
- apiGroups:
  - ""
  resources:
  - nodes/metrics
  - nodes/proxy
  verbs:
  - get
- nonResourceURLs:
  - /metrics
  verbs:
  - get
                                                                                                                                                

These rules are required to scrape the kubelet endpoints. With this rule enabled, you will also have the kubelet metrics and can access kubelet templates for both dashboards and alerts.

                                                                                                                                                This configuration change is only required for agent upgrades because the sysdig-agent-clusterrole.yaml associated with fresh installations will already have this configuration. See Steps for Kubernetes (Vanilla) for information on Sysdig agent installation.
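
For an upgrade, one way to apply the updated ClusterRole is with kubectl; a sketch, assuming the file has been downloaded to the current directory:

  kubectl apply -f sysdig-agent-clusterrole.yaml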

                                                                                                                                                Sysdig Agent v12.3.0 or Above

                                                                                                                                                PVC metrics are enabled by default for Sysdig agent v12.3.0 or above. To disable collecting PVC metrics, add the following to the dragent.yaml file:

                                                                                                                                                k8s_extra_resources:
                                                                                                                                                  include:
                                                                                                                                                    - services
                                                                                                                                                    - resourcequotas
                                                                                                                                                

                                                                                                                                                Sysdig Agent Prior to v12.3.0

Contact your Sysdig representative or Sysdig Support for technical assistance with enabling PVC metrics in your environment.

• Upgrade the Sysdig agent to v12.2.0 or above.

                                                                                                                                                • If you are an existing Sysdig user, include the following configuration in the dragent.yaml file:

                                                                                                                                                  k8s_extra_resources:
                                                                                                                                                    include:
                                                                                                                                                      - persistentvolumes
                                                                                                                                                      - persistentvolumeclaims
                                                                                                                                                      - storageclasses
                                                                                                                                                  

                                                                                                                                                Access PVC Dashboard Template

                                                                                                                                                1. Log in to Sysdig Monitor and click Dashboards.

                                                                                                                                                2. On the Dashboards slider, scroll down to locate Dashboard Templates.

                                                                                                                                                3. Click Kubernetes to expand the Kubernetes dashboard templates.

                                                                                                                                                4. Select the PVC and Storage dashboard.

                                                                                                                                                Access PVC Alert Template

                                                                                                                                                1. Log in to Sysdig Monitor and click Alerts.

                                                                                                                                                2. On the Alerts page, click Library.

                                                                                                                                                3. On the Library page, click All Templates.

4. Select the Kubernetes PVC alert templates.

                                                                                                                                                PVC Metrics

Metrics | Metric Type | Labels | Metric Source
kube_persistentvolume_status_phase | Gauge | persistentvolume, phase | Kubernetes API
kube_persistentvolume_claim_ref | Gauge | persistentvolume, name | Kubernetes API
kube_storageclass_created | Gauge | storageclass | Kubernetes API
kube_storageclass_info | Gauge | storageclass, provisioner, reclaim_policy, volume_binding_mode | Kubernetes API
kube_storageclass_labels | Gauge | storageclass | Kubernetes API
kube_pod_spec_volumes_persistentvolumeclaims_info | Gauge | namespace, pod, uid, volume, persistentvolumeclaim | Kubernetes API
kube_pod_spec_volumes_persistentvolumeclaims_readonly | Gauge | namespace, pod, uid, volume, persistentvolumeclaim | Kubernetes API
kube_persistentvolumeclaim_status_condition | Gauge | namespace, persistentvolumeclaim, type, status | Kubernetes API
kube_persistentvolumeclaim_status_phase | Gauge | namespace, persistentvolumeclaim, phase | Kubernetes API
kube_persistentvolumeclaim_access_mode | Gauge | namespace, persistentvolumeclaim, access_mode | Kubernetes API
kubelet_volume_stats_inodes | Gauge | namespace, persistentvolumeclaim | Kubelet
kubelet_volume_stats_inodes_free | Gauge | namespace, persistentvolumeclaim | Kubelet
kubelet_volume_stats_inodes_used | Gauge | namespace, persistentvolumeclaim | Kubelet
kubelet_volume_stats_used_bytes | Gauge | namespace, persistentvolumeclaim | Kubelet
kubelet_volume_stats_available_bytes | Gauge | namespace, persistentvolumeclaim | Kubelet
kubelet_volume_stats_capacity_bytes | Gauge | namespace, persistentvolumeclaim | Kubelet
storage_operation_duration_seconds_bucket | Gauge | operation_name, volume_plugin, le | Kubelet
storage_operation_duration_seconds_sum | Gauge | operation_name, volume_plugin | Kubelet
storage_operation_duration_seconds_count | Gauge | operation_name, volume_plugin | Kubelet
storage_operation_errors_total | Gauge | operation_name, volume_plugin | Kubelet
storage_operation_status_count | Gauge | operation_name, status, volume_plugin | Kubelet
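
As an illustration, the kubelet metrics above can be combined in PromQL to track how full each claim is. The following query is a sketch only, not part of the dashboard or alert templates:

  # Percentage of used capacity per PersistentVolumeClaim (illustrative)
  100 * sum by (namespace, persistentvolumeclaim) (kubelet_volume_stats_used_bytes)
    / sum by (namespace, persistentvolumeclaim) (kubelet_volume_stats_capacity_bytes)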

                                                                                                                                                4.2 -

                                                                                                                                                Integrate Keda for HPA

Sysdig supports Keda to deploy Kubernetes Horizontal Pod Autoscaler (HPA) using custom metrics exposed by Sysdig Monitor. You do this by configuring Prometheus queries and endpoints in Keda. Keda uses that information to query your Prometheus server and create the HPA. The HPA then takes care of scaling pods based on your usage of resources, such as CPU and memory.

                                                                                                                                                This option replaces Sysdig’s existing custom metric server for HPA.

                                                                                                                                                Install Keda

                                                                                                                                                Requirements:

                                                                                                                                                • Helm
                                                                                                                                                • Keda v2.3 or above (Endpoint authentication)

                                                                                                                                                Install Keda with helm by running the following command:

                                                                                                                                                helm repo add kedacore https://kedacore.github.io/charts
                                                                                                                                                helm repo update
                                                                                                                                                helm install keda kedacore/keda --namespace keda --create-namespace \
                                                                                                                                                  --set image.metricsApiServer.tag=2.4.0 --set image.keda.tag=2.4.0 \
                                                                                                                                                  --set prometheus.metricServer.enabled=true
                                                                                                                                                

                                                                                                                                                Create Authentication for Sysdig Prometheus Endpoint

                                                                                                                                                Do the following in each namespace where you want to use Keda. This example uses the namespace, keda.

                                                                                                                                                1. Create the secret with the API key as the bearer token:

                                                                                                                                                  kubectl create secret generic keda-prom-secret --from-literal=bearerToken=<API_KEY> -n keda
                                                                                                                                                  
                                                                                                                                                2. Create the triggerAuthentication.yaml file:

                                                                                                                                                  apiVersion: keda.sh/v1alpha1
                                                                                                                                                  kind: TriggerAuthentication
                                                                                                                                                  metadata:
                                                                                                                                                    name: keda-prom-creds
                                                                                                                                                  spec:
                                                                                                                                                    secretTargetRef:
                                                                                                                                                    - parameter: bearerToken
                                                                                                                                                      name: keda-prom-secret
                                                                                                                                                      key: bearerToken
                                                                                                                                                  
                                                                                                                                                3. Apply the configurations in the triggerAuthentication.yaml file :

kubectl apply -n keda -f triggerAuthentication.yaml
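
Optionally, you can confirm that the TriggerAuthentication object was created (assuming the Keda CRDs are installed):

  kubectl get triggerauthentication -n keda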
                                                                                                                                                  

                                                                                                                                                Configure HPA

                                                                                                                                                You can configure HPA for a Deployment, StatefulSet, or CRD. Keda uses a CRD to configure the HPA. You create a ScaledObject and it automatically sets up the metrics server and the HPA object under the hood.

                                                                                                                                                1. To create a ScaledObject, specify the following:

                                                                                                                                                  • spec.scaleTargetRef.name: The unique name of the Deployment.
• spec.scaleTargetRef.kind: The kind of object to be scaled: Deployment, StatefulSet, or CustomResource.
                                                                                                                                                  • spec.minReplicaCount: The minimum number of replicas that the Deployment should have.
                                                                                                                                                  • spec.maxReplicaCount: The maximum number of replicas that the Deployment should have.
                                                                                                                                                2. In the ScaledObject, use a trigger of type prometheus to get the metrics from your Sysdig Monitor account. To do so, specify the following:

• triggers.metadata.serverAddress: The address of the Prometheus endpoint. It is the Sysdig Monitor URL with the /prometheus path appended. For example: https://app.sysdigcloud.com/prometheus.
• triggers.metadata.query: The PromQL query that will return a value. Ensure that the query returns a single-element vector or scalar response.
• triggers.metadata.metricName: The name of the metric that will be created in the Kubernetes API endpoint, /apis/external.metrics.k8s.io/v1beta1.
• triggers.metadata.threshold: The threshold that will be used to scale the Deployment.
                                                                                                                                                3. Ensure that you add the authModes and authenticationRef to the trigger.

                                                                                                                                                4. Check the ScaledObject. Here is an example of a ScaledObject:

                                                                                                                                                  apiVersion: keda.sh/v1alpha1
                                                                                                                                                  kind: ScaledObject
                                                                                                                                                  metadata:
                                                                                                                                                    name: keda-web
                                                                                                                                                  spec:
                                                                                                                                                    scaleTargetRef:
                                                                                                                                                      kind: Deployment
                                                                                                                                                      name: web
                                                                                                                                                    minReplicaCount: 1
                                                                                                                                                    maxReplicaCount: 4
                                                                                                                                                    triggers:
                                                                                                                                                    - type: prometheus
                                                                                                                                                      metadata:
                                                                                                                                                        serverAddress: https://app.sysdigcloud.com/prometheus
                                                                                                                                                        metricName: sysdig_container_cpu_cores_used
query: sum(sysdig_container_cpu_cores_used{kube_cluster_name="my-cluster-name", kube_namespace_name="keda", kube_workload_name="web"}) * 10
                                                                                                                                                        threshold: "5"
                                                                                                                                                        authModes: "bearer"
                                                                                                                                                      authenticationRef:
                                                                                                                                                        name: keda-prom-creds
                                                                                                                                                  
                                                                                                                                                  
                                                                                                                                                  

The HPA divides the value of the metric by the number of current replicas; therefore, avoid using the AVERAGE aggregation and use SUM instead to aggregate the metrics by workload. For example, if the sum of the values across all the pods is 100 and there are 5 replicas, the HPA calculates the value of the metric as 20.
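
After you apply the ScaledObject, Keda creates the underlying HPA object. As a quick sanity check (object names depend on your setup), you can list both:

  kubectl get scaledobject,hpa -n keda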

                                                                                                                                                Advanced Configurations

                                                                                                                                                The ScaledObject permits additional options:

                                                                                                                                                spec.pollingInterval:

The interval at which to check each trigger. By default, KEDA checks each trigger source on every ScaledObject every 30 seconds.

                                                                                                                                                Warning: setting this to a low value will cause Keda to make frequent API calls to the Prometheus endpoint. The minimum value for pollingInterval is 10 seconds. The scraping frequency of the Sysdig Agent is 10 seconds.

                                                                                                                                                spec.cooldownPeriod:

                                                                                                                                                The wait period between the last active trigger reported and scaling the resource back to 0. By default the value is 5 minutes (300 seconds).

                                                                                                                                                spec.idleReplicaCount:

                                                                                                                                                Enabling this property allows KEDA to scale the resource down to the specified number of replicas. If some activity exists on the target triggers, KEDA will scale the target resource immediately to the value of minReplicaCount and scaling is handed over to HPA. When there is no activity, the target resource is again scaled down to the value specified by idleReplicaCount. This setting must be less than minReplicaCount.

                                                                                                                                                spec.fallback:

This property allows you to fall back to a fixed number of replicas if consecutive connection errors occur with the Prometheus endpoint of your Sysdig account.

                                                                                                                                                • spec.fallback.failureThreshold: The number of consecutive errors to apply the fallback.
                                                                                                                                                • spec.fallback.replicas: The number of replicas to apply in case of connection error.

                                                                                                                                                spec.advanced.horizontalPodAutoscalerConfig.behavior:

                                                                                                                                                This property allows you to define the behavior of the Kubernetes HPA Object. See the Kubernetes documentation for more information.
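
These options sit at the top level of the ScaledObject spec, alongside scaleTargetRef and triggers. The following fragment is a sketch with illustrative values only:

  spec:
    pollingInterval: 30        # seconds between trigger checks
    cooldownPeriod: 300        # seconds to wait before scaling back to 0
    minReplicaCount: 1
    maxReplicaCount: 4
    fallback:
      failureThreshold: 3      # consecutive errors before applying the fallback
      replicas: 2              # replicas to apply on connection errors
    advanced:
      horizontalPodAutoscalerConfig:
        behavior:
          scaleDown:
            stabilizationWindowSeconds: 300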


                                                                                                                                                4.3 -

                                                                                                                                                Configure Recording Rules

                                                                                                                                                Sysdig now supports Prometheus recording rules for metric aggregation and querying.

You can configure recording rules by using the Sysdig API. Ensure that you define them in a Prometheus-compatible way. The parameters are:

• record: (Required) The unique name of the time series. It must be a valid metric name.

• expr: (Required) The PromQL expression to evaluate. In each evaluation cycle, the given expression is evaluated and the result is recorded as a new set of time series with the metric name specified in record.

• labels: (Optional) Labels to add or overwrite before storing the result.
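
For reference, a recording rule expressed in the usual Prometheus form looks like the following sketch; the metric name, expression, and label are illustrative only:

  record: kube_namespace:sysdig_container_cpu_cores_used:sum
  expr: sum by (kube_namespace_name) (sysdig_container_cpu_cores_used)
  labels:
    source: recording_rule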

                                                                                                                                                To enable this feature in your environment, contact Sysdig Support.

                                                                                                                                                4.4 -

                                                                                                                                                Configure Sysdig with Grafana

                                                                                                                                                Sysdig enables Grafana users to query metrics from Sysdig and visualize them in Grafana dashboards. In order to integrate Sysdig with Grafana, you configure a data source. There are two types of data sources supported:

                                                                                                                                                • Prometheus

                                                                                                                                                  Prometheus data source comes with Grafana and is natively compatible with PromQL. Sysdig provides a Prometheus-compatible API to achieve API-only integration with Grafana.

                                                                                                                                                • Sysdig

The Sysdig data source requires additional setup but offers a simpler, form-based configuration. It uses the Sysdig native API instead of the Prometheus API. See Sysdig Grafana datasource for more information.

                                                                                                                                                Using the Prometheus API on Grafana v6.7 and Above

You use the Sysdig Prometheus API to set up the datasource to use with Grafana. Before Grafana can consume Sysdig metrics, Grafana must authenticate itself to Sysdig. To do so, set up HTTP authentication by using the Sysdig API Token, because no UI support is currently available in Grafana.

1. If you are not already running Grafana, spin up a Grafana container as follows:

                                                                                                                                                  $ docker run --rm -p 3000:3000 --name grafana grafana/grafana
                                                                                                                                                  
2. Log in to Grafana as administrator and create a new data source by using the following information:

                                                                                                                                                  • URL: https://<Monitor URL for Your Region>/prometheus

                                                                                                                                                    See SaaS Regions and IP Ranges and identify the correct URLs associated with your Sysdig application and region.

                                                                                                                                                  • Authentication: Do not select any authentication mechanisms.

                                                                                                                                                  • Access: Server (default)

                                                                                                                                                  • Custom HTTP Headers:

• Header: Enter the word Authorization

• Value: Enter the word Bearer, followed by a space and <Your Sysdig API Token>

                                                                                                                                                      API Token is available through Settings > User Profile > Sysdig Monitor API.
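
Before or after configuring the data source, you can verify that the token and endpoint work with a direct query against the Prometheus-compatible API. This is a sketch; substitute your region's Monitor URL and your API token:

  curl -H "Authorization: Bearer <Your Sysdig API Token>" \
    "https://<Monitor URL for Your Region>/prometheus/api/v1/query?query=up"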

                                                                                                                                                Using the Grafana API on Grafana v6.6 and Below

                                                                                                                                                The feature requires Grafana v5.3.0 or above.

                                                                                                                                                You use the Grafana API to set up the Sysdig datasource.

                                                                                                                                                1. Download and run Grafana in a container.

                                                                                                                                                  docker run --rm -p 3000:3000 --name grafana grafana/grafana
                                                                                                                                                  
                                                                                                                                                2. Create a JSON file.

                                                                                                                                                  cat grafana-stg-ds.json
                                                                                                                                                  {
                                                                                                                                                      "name": "Sysdig staging PromQL",
                                                                                                                                                      "orgId": 1,
                                                                                                                                                      "type": "prometheus",
                                                                                                                                                      "access": "proxy",
                                                                                                                                                      "url": "https://app-staging.sysdigcloud.com/prometheus",
                                                                                                                                                      "basicAuth": false,
                                                                                                                                                      "withCredentials": false,
                                                                                                                                                      "isDefault": false,
                                                                                                                                                      "editable": true,
                                                                                                                                                      "jsonData": {
                                                                                                                                                          "httpHeaderName1": "Authorization",
                                                                                                                                                          "tlsSkipVerify": true
                                                                                                                                                      },
                                                                                                                                                      "secureJsonData": {
                                                                                                                                                          "httpHeaderValue1": "Bearer your-Sysdig-API-token"
                                                                                                                                                      }
                                                                                                                                                  }
                                                                                                                                                  
3. Get your Sysdig API Token and plug it into the JSON file above.

                                                                                                                                                  "httpHeaderValue1": "Bearer your_Sysdig_API_Token"
                                                                                                                                                  
                                                                                                                                                4. Add the datasource to Grafana.

                                                                                                                                                  curl -u admin:admin -H "Content-Type: application/json" http://localhost:3000/api/datasources -XPOST -d @grafana-stg-ds.json
                                                                                                                                                  
5. Open Grafana in a browser:

                                                                                                                                                  http://localhost:3000
                                                                                                                                                  
6. Use the default credentials, admin:admin, to sign in to Grafana.

                                                                                                                                                7. Open the Data Source tab under Configuration on Grafana and confirm that the one you have added is listed on the page.
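
You can also confirm the data source through the Grafana API itself, for example (default credentials assumed):

  curl -u admin:admin http://localhost:3000/api/datasources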

                                                                                                                                                5 -

                                                                                                                                                Troubleshoot Monitoring Integrations

Review the common troubleshooting scenarios you might encounter while getting a Monitor integration working, and see what you can do if an integration does not report metrics after installation.

                                                                                                                                                Check Prerequisites

Some integrations require secrets and other resources to be available in the correct namespace in order to work. Integrations such as database exporters might require you to create a user and grant it special permissions in the database so that the exporter can connect to the endpoint and generate metrics.

                                                                                                                                                Ensure that the prerequisites of the integration are met before proceeding with installation.
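
For example, if an integration expects a credentials secret, you can confirm that it exists in the target namespace before installing:

  kubectl get secrets --namespace=<namespace>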

                                                                                                                                                Verify Exporter Is Running

                                                                                                                                                If the integration is an exporter, ensure that the pods corresponding to the exporter are running correctly. You can check this after installing the integration. If the exporter is installed as a sidecar of the application (such as Nginx), verify that the exporter container is added to the pod.

                                                                                                                                                You can check the status of the pods with the Kubernetes dashboard Pods Status and Performance or with the following command:

                                                                                                                                                kubectl get pods --namespace=<namespace>
                                                                                                                                                

                                                                                                                                                Additionally, if the container has problems and cannot start, check the description of the pod for error messages:

                                                                                                                                                kubectl describe pod <pod-name> --namespace=<namespace>
                                                                                                                                                

                                                                                                                                                Verify Metrics Are Generated

                                                                                                                                                Check whether a running exporter is generating metrics by accessing the metrics endpoint:

kubectl port-forward <pod-name> <local-port>:<pod-port> --namespace=<namespace>
curl http://localhost:<local-port>/metrics
                                                                                                                                                

                                                                                                                                                This is also valid for applications that don’t need an exporter to generate their own metrics.

If the exporter is not generating metrics, there could be problems accessing or authenticating with the application. Check the logs associated with the pods:

                                                                                                                                                kubectl logs <pod-name> --namespace=<namespace>
                                                                                                                                                

                                                                                                                                                If the application is instrumented and is not generating metrics, check if the Prometheus metrics option or the module is activated.

                                                                                                                                                Verify Sysdig Agent Is Scraping Metrics

                                                                                                                                                If an application doesn’t need an exporter to generate metrics, check if it has the default Prometheus annotations.
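
As a reference, these are the conventional annotations the agent looks for by default; the port and path values below are illustrative:

  metadata:
    annotations:
      prometheus.io/scrape: "true"
      prometheus.io/port: "9100"
      prometheus.io/path: "/metrics"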

                                                                                                                                                Additionally, you can check if the Sysdig agent can access the metrics endpoint. To do so, use the following command:

kubectl exec <sysdig-agent-pod-name> --namespace=sysdig-agent -- /bin/sh -c "curl http://<exporter-pod-ip>:<pod-port>/metrics"
                                                                                                                                                

Select the Sysdig agent pod running on the same node as the pod you want to scrape.

                                                                                                                                                5.1 -

                                                                                                                                                Monitor Log Files

You can search for particular strings within a given log file and create a metric that is displayed in Sysdig Monitor’s Explore page. The metrics appear under the StatsD section.

                                                                                                                                                Sysdig provides this functionality via a “chisel” script called “logwatcher”, written in Lua. You call the script by adding a logwatcher parameter in the chisels section of the agent configuration file (dragent.yaml). You define the log file name and the precise string to be searched. The results are displayed as metrics in the Monitor UI.

                                                                                                                                                Caveats

                                                                                                                                                The logwatcher chisel adds to Sysdig’s monitoring capability but is not a fully featured log monitor. Note the following limitations:

                                                                                                                                                • No regex support: Sysdig does not offer regex support; you must define the precise log file and string to be searched.

                                                                                                                                                  (If you were to supply a string with spaces, forward-slashes, or back-slashes in it, the metric generated would also have these characters and so could not be used to create an alert.)

• Limit of 12 string searches per host: Logwatcher is implemented as a Lua script and, due to the resources consumed by this chisel, it is not recommended to have more than a dozen string searches configured per agent/host.

                                                                                                                                                Implementation

                                                                                                                                                Edit the agent configuration file to enable the logwatcher chisel. See Understanding the Agent Config Files for editing options.

                                                                                                                                                Preparation

                                                                                                                                                Determine the log file name(s) and string(s) you want to monitor.

                                                                                                                                                To monitor the output of docker logs <container-name>, find the container’s docker log file with:

                                                                                                                                                docker inspect <container-name> | grep LogPath

                                                                                                                                                Edit dragent.yaml

                                                                                                                                                1. Access dragent.yaml directly at /opt/draios/etc/dragent.yaml.

                                                                                                                                                2. Add a chisels entry:

                                                                                                                                                  Format:

                                                                                                                                                  chisels:
                                                                                                                                                    - name: logwatcher
                                                                                                                                                      args:
                                                                                                                                                        filespattern: YOURFILENAME.log
                                                                                                                                                        term: YOURSTRING
                                                                                                                                                  

                                                                                                                                                  Sample Entry:

                                                                                                                                                  customerid: 831f2-your-key-here-d69401
                                                                                                                                                  tags: tagname.tagvalue
                                                                                                                                                  chisels:
                                                                                                                                                    - name: logwatcher
                                                                                                                                                      args:
                                                                                                                                                        filespattern: draios.log
                                                                                                                                                        term: Sent
                                                                                                                                                  

                                                                                                                                                  In this example, Sysdig’s own draios.log is searched for the Sent string.

                                                                                                                                                  The output, in the Sysdig Monitor UI, would show the StatsD metric logwatcher.draios_log.Sent and the number of ‘Sent’ items detected.

3. Optional: Add multiple - name: sections in the config file to search for additional logs/strings.

                                                                                                                                                  Note the recommended 12-string/agent limit.

                                                                                                                                                4. Restart the agent for changes to take effect.

                                                                                                                                                  For container agent:

                                                                                                                                                  docker restart sysdig-agent
                                                                                                                                                  

                                                                                                                                                  For non-containerized (service) agent:

                                                                                                                                                  service dragent restart
                                                                                                                                                  

                                                                                                                                                Parameters

Name | Value | Description
name | logwatcher | The chisel used in the enterprise Sysdig platform to search log files. (Other chisels are available in Sysdig’s open-source product.)
filespattern | YOURFILENAME.log | The log file to be searched. Do not specify a path with the file name.
term | YOURSTRING | The string to be searched.

                                                                                                                                                View Log File Metrics in the Monitor UI

                                                                                                                                                To view logwatcher results:

                                                                                                                                                1. Log in to Sysdig Monitor and select Explore.

                                                                                                                                                2. Select Entire Infrastructure > Overview by Host.

                                                                                                                                                3. In the resulting drop-down, either scroll to Metrics > StatsD > logwatcher or enter “logwatcher” in the search field.

                                                                                                                                                  Each string you configured in the agent config file will be listed in the format logwatcher.YOURFILENAME_log.STRING.

                                                                                                                                                4. The relevant metrics are displayed.

                                                                                                                                                You can also Add an Alert on logwatcher metrics, to be notified when an important log entry appears.

                                                                                                                                                6 -

                                                                                                                                                (Legacy) Integrations for Sysdig Monitor

                                                                                                                                                Integrate metrics with Sysdig Monitor from a number of platforms, orchestrators, and a wide range of applications. Sysdig collects metrics from Prometheus, JMX, StatsD, Kubernetes, and many application stacks to provide a 360-degree view of your infrastructure. Many metrics are collected by default out of the box; you can also extend the integration or create custom metrics.

                                                                                                                                                Key Benefits

                                                                                                                                                • Collects the richest data set for cloud-native visibility and security

• Polls data and auto-discovers context in order to provide operational and security insights

• Extends the power of Prometheus metrics with additional insights from other metric types and the infrastructure stack

                                                                                                                                                • Integrate Prometheus alert and events for Kubernetes monitoring needs

                                                                                                                                                • Expose application metrics using Java JMX and MBeans monitoring

                                                                                                                                                Key Integrations

                                                                                                                                                Inbound

                                                                                                                                                • Prometheus Metrics

                                                                                                                                                  Describes how Sysdig Agent enables automatically collecting metrics from Prometheus exporters, how to set up your environment, and scrape Prometheus metrics from local as well as remote hosts.

• Java Management Extensions (JMX) Metrics

                                                                                                                                                  Describes how to configure your Java virtual machines so Sysdig Agent can collect JMX metrics using the JMX protocol.

                                                                                                                                                • StatsD Metrics

                                                                                                                                                  Describes how the Sysdig agent collects custom StatsD metrics with an embedded StatsD server.

                                                                                                                                                • Node.JS Metrics

Illustrates how Sysdig monitors Node.js applications by linking a library to the Node.js codebase.

                                                                                                                                                • Integrate Applications

                                                                                                                                                  Describes the monitoring capabilities of Sysdig agent with application check scripts or ‘app checks’.

                                                                                                                                                • Monitor Log Files

                                                                                                                                                  Learn how to search a string by using the chisel script called logwatcher.

                                                                                                                                                • AWS CloudWatch

                                                                                                                                                  Illustrates how to configure Sysdig to collect various types of CloudWatch metrics.

                                                                                                                                                • Agent Installation

                                                                                                                                                  Learn how to install Sysdig agents on supported platforms.

Outbound

                                                                                                                                                • Notification Channels

                                                                                                                                                  Learn how to add, edit, or delete a variety of notification channel types, and how to disable or delete notifications when they are not needed, for example, during scheduled downtime.

                                                                                                                                                • S3 Capture Storage

                                                                                                                                                  Learn how to configure Sysdig to use an AWS S3 bucket or custom S3 storage for storing Capture files.

                                                                                                                                                Platform Metrics (IBM)

                                                                                                                                                For Sysdig instances deployed on IBM Cloud Monitoring with Sysdig, an additional form of metrics collection is offered: Platform metrics. Rather than being collected by the Sysdig agent, when enabled, Platform metrics are reported to Sysdig directly by the IBM Cloud infrastructure.

Enable this feature by logging in to the IBM Cloud console and selecting “Enable” for IBM Platform metrics under the Configure your resource section when creating a new IBM Cloud Monitoring with Sysdig instance, as described here.

                                                                                                                                                6.1 -

                                                                                                                                                (Legacy) Collect Prometheus Metrics

                                                                                                                                                Sysdig supports collecting, storing, and querying Prometheus native metrics and labels. You can use Sysdig in the same way that you use Prometheus and leverage Prometheus Query Language (PromQL) to create dashboards and alerts. Sysdig is compatible with Prometheus HTTP API to query your monitoring data programmatically using PromQL and extend Sysdig to other platforms like Grafana.
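For example, because the API is Prometheus-compatible, a standard PromQL query can be issued over HTTP. This is a minimal sketch that assumes the query endpoint is exposed under /prometheus on your Sysdig Monitor host and that a valid API token is available in $SYSDIG_API_TOKEN; adjust both for your region and account:

    # Run the PromQL query "up" against the Prometheus-compatible API.
    # The endpoint path and token handling here are assumptions; adapt them to your environment.
    curl -s -H "Authorization: Bearer $SYSDIG_API_TOKEN" \
      "https://<your-sysdig-monitor-host>/prometheus/api/v1/query" \
      --data-urlencode 'query=up'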

From a metric collection standpoint, a lightweight Prometheus server is directly embedded into the Sysdig agent to facilitate metric collection. It also supports targets, instances, and jobs with filtering and relabeling using Prometheus syntax. You can configure the agent to identify the processes that expose Prometheus metric endpoints on its own host and send the metrics to the Sysdig collector for storing and further processing.

                                                                                                                                                This document uses metric and time series interchangeably. The description of configuration parameters refers to “metric”, but in strict Prometheus terms, those imply time series. That is, applying a limit of 100 metrics implies applying a limit on time series, where all the time series data might not have the same metric name.

                                                                                                                                                The Prometheus product itself does not necessarily have to be installed for Prometheus metrics collection.

                                                                                                                                                See the Sysdig agent versions and compatibility with Prometheus features:

                                                                                                                                                • Latest versions of agent (v12.0.0 and above): The following features are enabled by default:

                                                                                                                                                  • Automatically scraping any Kubernetes pods with the following annotation set: prometheus.io/scrape=true
                                                                                                                                                  • Automatically scrape applications supported by Monitoring Integrations.
• Sysdig agent prior to v12.0.0: Manually enable Prometheus in the dragent.yaml file:

                                                                                                                                                    prometheus:
                                                                                                                                                         enabled: true
                                                                                                                                                  

                                                                                                                                                Learn More

                                                                                                                                                The following topics describe in detail how to configure the Sysdig agent for service discovery, metrics collection, and further processing.

See the Sysdig blog posts on Prometheus monitoring for additional context on Prometheus metrics and how such metrics are typically used.

                                                                                                                                                6.1.1 -

                                                                                                                                                (Legacy) Working with Prometheus Metrics

The Sysdig agent uses its visibility into all running processes (at both the host and container levels) to find eligible targets for scraping Prometheus metrics. By default, no scraping is attempted. Once the feature is enabled, the agent assembles a list of eligible targets, applies filtering rules, and sends the resulting metrics back to the Sysdig collector.

                                                                                                                                                Latest Prometheus Features

Sysdig agent v12.0 or above is required for the following capabilities:

Sysdig agent v10.0 or above is required for the following capabilities:

                                                                                                                                                • New capabilities of using Prometheus data:

                                                                                                                                                  • Ability to visualize data using PromQL queries. See Using PromQL.

                                                                                                                                                  • Create alerts from PromQL-based Dashboards. See Create Panel Alerts.

                                                                                                                                                  • Backward compatibility for dashboards v2 and alerts.

                                                                                                                                                    The new PromQL data cannot be visualized by using the Dashboard v2 Histogram. Use time-series based visualization for the histogram metrics.

                                                                                                                                                • New metrics limit per agent

                                                                                                                                                • 10-second data granularity

                                                                                                                                                • Higher retention rate on the new metric store.

                                                                                                                                                Prerequisites and Guidelines

• Sysdig agent v10.0.0 or above is required for the latest Prometheus features.

• The Prometheus feature must be enabled in the dragent.yaml file:

                                                                                                                                                  prometheus:
                                                                                                                                                    enabled: true
                                                                                                                                                  

                                                                                                                                                  See Setting up the Environment for more information.

• The target endpoints must be reachable by the agent over a TCP connection. The agent scrapes a target, remote or local, specified by the IP:port or the URL in dragent.yaml.

                                                                                                                                                Service Discovery

                                                                                                                                                To use native Prometheus service discovery, enable Promscrape V2 as described in Enable Prometheus Native Service Discovery. This section covers the Sysdig way of service discovery that involves configuring process filters in the Sysdig agent.

The way service discovery works in the Sysdig agent differs from that of the Prometheus server. While the Prometheus server has built-in integration with several service discovery mechanisms and reads its configuration settings from the prometheus.yml file, the Sysdig agent auto-discovers any process (exporter or instrumented) that matches the specifications in the dragent.yaml file, and instructs the embedded lightweight Prometheus server to retrieve the metrics from it.

                                                                                                                                                The lightweight Prometheus server in the agent is named promscrape and is controlled by the flag of the same name in the dragent.yaml file. See Configuring Sysdig Agent for more information.

                                                                                                                                                Unlike the Prometheus server that can scrape processes running on all the machines in a cluster, the agent can scrape only those processes that are running on the host that it is installed on.

                                                                                                                                                Within the set of eligible processes/ports/endpoints, the agent scrapes only the ports that are exporting Prometheus metrics and will stop attempting to scrape or retry on ports based on how they respond to attempts to connect and scrape them. It is therefore strongly recommended that you create a configuration that restricts the process and ports for attempted scraping to the minimum expected range for your exporters. This minimizes the potential for unintended side-effects in both the Agent and your applications due to repeated failed connection attempts.

                                                                                                                                                The end to end metric collection can be summarized as follows:

                                                                                                                                                1. A process is determined to be eligible for possible scraping if it positively matches against a series of Process Filter include/exclude rules. See Process Filter for more information.

2. The Agent will then attempt to scrape an eligible process at a /metrics endpoint on all of its listening TCP ports, unless additional configuration is present to restrict scraping to a subset of ports and/or a different endpoint name.

3. Upon receiving the metrics, the agent applies its configured filtering rules before sending them to the Sysdig collector.

                                                                                                                                                The metrics ultimately appear in the Sysdig Monitor Explore interface in the Prometheus section.
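As a minimal sketch of this flow, and of the recommendation above to restrict scraping to known processes and ports, a dragent.yaml might contain the following. The command line, port, and path values are illustrative placeholders taken from the examples later in this document, not defaults:

    prometheus:
      enabled: true
    process_filter:
      - include:
          process.cmdline: "*app.jar*"
          conf:
            port: 8080
            path: "/prometheus"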

                                                                                                                                                6.1.2 -

                                                                                                                                                (Legacy) Set Up the Environment

                                                                                                                                                Quick Start For Kubernetes Environments

                                                                                                                                                Prometheus users who are already leveraging Kubernetes Service Discovery (specifically the approach in this sample prometheus-kubernetes.yml) may already have Annotations attached to the Pods that mark them as eligible for scraping. Such environments can quickly begin scraping the same metrics using the Sysdig Agent in a couple of easy steps.

                                                                                                                                                1. Enable the Prometheus metrics feature in the Sysdig Agent. Assuming you are deploying using DaemonSets, the needed config can be added to the Agent’s dragent.yaml by including the following in your DaemonSet YAML (placing it in the env section for the sysdig-agent container):

                                                                                                                                                  - name: ADDITIONAL_CONF
                                                                                                                                                    value: "prometheus:\n  enabled: true"
                                                                                                                                                  
2. Ensure the Kubernetes Pods that contain your Prometheus exporters have been deployed with the following Annotations to enable scraping (substituting the listening exporter-TCP-port):

                                                                                                                                                  spec:
                                                                                                                                                    template:
                                                                                                                                                      metadata:
                                                                                                                                                        annotations:
                                                                                                                                                          prometheus.io/scrape: "true"
                                                                                                                                                          prometheus.io/port: "exporter-TCP-port"
                                                                                                                                                  

                                                                                                                                                  The configuration above assumes your exporters use the typical endpoint called /metrics. If an exporter is using a different endpoint, this can also be specified by adding the following additional optional Annotation, substituting the exporter-endpoint-name:

                                                                                                                                                  prometheus.io/path: "/exporter-endpoint-name"
                                                                                                                                                  

                                                                                                                                                If you try this Kubernetes Deployment of a simple exporter, you will quickly see auto-discovered Prometheus metrics being displayed in Sysdig Monitor. You can use this working example as a basis to similarly Annotate your own exporters.

                                                                                                                                                If you have Prometheus exporters not deployed in annotated Kubernetes Pods that you would like to scrape, the following sections describe the full set of options to configure the Agent to find and scrape your metrics.

                                                                                                                                                Quick Start for Container Environments

In order for Prometheus scraping to work in a Docker-based container environment, set the following labels on the application containers, substituting <exporter-port> and <exporter-path> with the correct port and path where metrics are exported by your application:

                                                                                                                                                • io.prometheus.scrape=true

                                                                                                                                                • io.prometheus.port=<exporter-port>

                                                                                                                                                • io.prometheus.path=<exporter-path>

                                                                                                                                                For example, if mysqld-exporter is to be scraped, spin up the container as follows:

docker run -d -l io.prometheus.scrape=true -l io.prometheus.port=9104 -l io.prometheus.path=/metrics mysqld-exporter
                                                                                                                                                
                                                                                                                                                

                                                                                                                                                6.1.3 -

                                                                                                                                                (Legacy) Configuring Sysdig Agent

                                                                                                                                                This feature is not supported with Promscrape V2. For information on different versions of Promscrape and migrating to the latest version, see Migrating from Promscrape V1 to V2.

As is typical for the agent, the default configuration for the feature is specified in dragent.default.yaml, and you can override the defaults by configuring parameters in dragent.yaml. For each parameter you do not set in dragent.yaml, the default in dragent.default.yaml remains in effect.
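For example, a minimal sketch of this override behavior: a dragent.yaml containing only the following overrides the scraping interval, while every other Prometheus parameter (such as max_metrics) keeps its dragent.default.yaml value. The interval value here is only an illustration:

    prometheus:
      enabled: true
      interval: 30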

                                                                                                                                                Main Configuration Parameters

| Parameter | Default | Description |
|-----------|---------|-------------|
| prometheus | See below | Turns Prometheus scraping on and off. |
| process_filter | See below | Specifies which processes may be eligible for scraping. See [Process Filter](/en/docs/sysdig-monitor/monitoring-integrations/legacy-integrations/legacycollect-prometheus-metrics/configuring-sysdig-agent/#process-filter). |
| use_promscrape | See below | Determines whether to use promscrape for scraping Prometheus metrics. |

                                                                                                                                                promscrape

                                                                                                                                                Promscrape is a lightweight Prometheus server that is embedded with the Sysdig agent. The use_promscrape parameter controls whether to use it to scrape Prometheus endpoints.

                                                                                                                                                Promscrape has two versions: Promscrape V1 and Promscrape V2. With V1, Sysdig agent discovers scrape targets through the process_filter rules. With V2, promscrape itself discovers targets by using the standard Prometheus configuration, allowing the use of relabel_configs to find or modify targets.

| Parameter | Default | Description |
|-----------|---------|-------------|
| use_promscrape | true | Determines whether to use promscrape for scraping Prometheus metrics. |
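Because the configuration described on this page applies to Promscrape V1, you may want to keep promscrape enabled while disabling native Prometheus service discovery, so that V1 target discovery via process_filter rules is used. A minimal sketch, using the prom_service_discovery parameter documented in the prometheus section below:

    use_promscrape: true
    prometheus:
      enabled: true
      prom_service_discovery: false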

                                                                                                                                                prometheus

The prometheus section defines the behavior related to Prometheus metrics collection and analysis. It allows you to turn the feature on, set an agent-side limit on the number of metrics to be scraped, and determine whether to report histogram metrics and log failed scrape attempts.

| Parameter | Default | Description |
|-----------|---------|-------------|
| enabled | false | Turns Prometheus scraping on and off. |
| interval | 10 | How often (in seconds) the agent will scrape a port for Prometheus metrics. |
| prom_service_discovery | true | Enables native Prometheus service discovery. If disabled, promscrape.v1 is used to scrape the targets. See Enable Prometheus Native Service Discovery. On agent versions prior to 11.2, the default is false. |
| max_metrics | 1000 | The maximum number of total Prometheus metrics that will be scraped across all targets. This value of 1000 is the maximum per agent, and is a separate limit from other custom metrics (for example, StatsD, JMX, and App Checks). |
| timeout | 1 | The amount of time (in seconds) the agent will wait while scraping a Prometheus endpoint before timing out. As of agent v10.0, this parameter is only used when promscrape is disabled. Since promscrape is now the default, timeout can be considered deprecated; however, it is still used when you explicitly disable promscrape. |
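As an illustration of how these parameters interact, the following sketch explicitly disables promscrape, in which case the timeout setting still applies. The values shown are illustrative rather than recommendations:

    use_promscrape: false
    prometheus:
      enabled: true
      interval: 10
      max_metrics: 1000
      timeout: 5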

                                                                                                                                                Process Filter

                                                                                                                                                The process_filter section specifies which of the processes known by an agent may be eligible for scraping.

                                                                                                                                                Note that once you specify a process_filter in your dragent.yaml, this replaces the entire Prometheus process_filter section (i.e. all the rules) shown in the dragent.default.yaml.

                                                                                                                                                The Process Filter is specified in a series of include and exclude rules that are evaluated top-to-bottom for each process known by an Agent. If a process matches an include rule, scraping will be attempted via a /metrics endpoint on each listening TCP port for the process, unless a conf section also appears within the rule to further restrict how the process will be scraped. See conf for more information.

                                                                                                                                                Multiple patterns can be specified in a single rule, in which case all patterns must match for the rule to be a match (AND logic).

                                                                                                                                                Within a pattern value, simple “glob” wildcarding may be used, where * matches any number of characters (including none) and ? matches any single character. Note that due to YAML syntax, when using wildcards, be sure to enclose the value in quotes ("*").

The supported patterns for Process Filter rules are described below. To provide realistic examples, we’ll use a simple sample Prometheus exporter (source code here) which can be deployed as a container using the Docker command line below. To help illustrate some of the configuration options, this sample exporter presents Prometheus metrics on /prometheus instead of the more common /metrics endpoint, which is shown in the example configurations further below.

                                                                                                                                                # docker run -d -p 8080:8080 \
                                                                                                                                                    --label class="exporter" \
                                                                                                                                                    --name my-java-app \
                                                                                                                                                    luca3m/prometheus-java-app
                                                                                                                                                
                                                                                                                                                # ps auxww | grep app.jar
                                                                                                                                                root     11502 95.9  9.2 3745724 753632 ?      Ssl  15:52   1:42 java -jar /app.jar --management.security.enabled=false
                                                                                                                                                
                                                                                                                                                # curl http://localhost:8080/prometheus
                                                                                                                                                ...
                                                                                                                                                random_bucket{le="0.005",} 6.0
                                                                                                                                                random_bucket{le="0.01",} 17.0
                                                                                                                                                random_bucket{le="0.025",} 51.0
                                                                                                                                                ...
                                                                                                                                                

container.image

Matches if the process is running inside a container running the specified image.

    - include:
        container.image: luca3m/prometheus-java-app

container.name

Matches if the process is running inside a container with the specified name.

    - include:
        container.name: my-java-app

container.label.*

Matches if the process is running in a container that has a Label matching the given value.

    - include:
        container.label.class: exporter

kubernetes.<object>.annotation.* / kubernetes.<object>.label.*

Matches if the process is attached to a Kubernetes object (Pod, Namespace, etc.) that is marked with an Annotation/Label matching the given value.

Note: This pattern does not apply to the Docker-only command line shown above, but would instead apply if the exporter were installed as a Kubernetes Deployment using this example YAML.

Note: See Kubernetes Objects, below, for information on the full set of supported Annotations and Labels.

    - include:
        kubernetes.pod.annotation.prometheus.io/scrape: true

process.name

Matches the name of the running process.

    - include:
        process.name: java

process.cmdline

Matches a command line argument.

    - include:
        process.cmdline: "*app.jar*"

port

Matches if the process is listening on one or more TCP ports.

The pattern for a single rule can specify a single port as shown in this example, or a single range (e.g. 8079-8081), but does not support comma-separated lists of ports/ranges.

Note: This parameter is only used to confirm if a process is eligible for scraping based on the ports on which it is listening. For example, if a process is listening on one port for application traffic and has a second port open for exporting Prometheus metrics, it would be possible to specify the application port here (but not the exporting port), and the exporting port in the conf section (but not the application port), and the process would be matched as eligible and the exporting port would be scraped.

    - include:
        port: 8080

appcheck.match

Matches if an Application Check with the specific name or pattern is scheduled to run for the process.

    - exclude:
        appcheck.match: "*"

Each of the **`include`** examples shown above would have matched our process on its own. Because multiple patterns can be combined in a single rule, as described previously, the following very strict configuration would also have matched:
                                                                                                                                                - include:
                                                                                                                                                    container.image: luca3m/prometheus-java-app
                                                                                                                                                    container.name: my-java-app
                                                                                                                                                    container.label.class: exporter
                                                                                                                                                    process.name: java
                                                                                                                                                    process.cmdline: "*app.jar*"
                                                                                                                                                    port: 8080
                                                                                                                                                

                                                                                                                                                conf

Each include rule in the process_filter may include a conf portion that further describes how scraping will be attempted on the eligible process. If a conf portion is not included, scraping will be attempted at a /metrics endpoint on all listening ports of the matching process. The possible settings:

                                                                                                                                                Parameter name

                                                                                                                                                Description

                                                                                                                                                Example

                                                                                                                                                port

                                                                                                                                                Either a static number for a single TCP port to be scraped, or a container/Kubernetes Label name or Kubernetes Annotation specified in curly braces. If the process is running in a container that is marked with this Label or is attached to a Kubernetes object (Pod, Namespace, etc.) that is marked with this Annotation/Label, scraping will be attempted only on the port specified as the value of the Label/Annotation.

                                                                                                                                                Note: The Label/Annotation to match against will not include the text shown in red.

                                                                                                                                                Note: See Kubernetes Objectsfor information on the full set of supported Annotations and Labels.

                                                                                                                                                Note: If running the exporter inside a container, this should specify the port number that the exporter process in the container is listening on, not the port that the container exposes to the host.

                                                                                                                                                port: 8080

                                                                                                                                                - or -

                                                                                                                                                port: "{container.label.io.prometheus.port}"

                                                                                                                                                - or -

                                                                                                                                                port: "{kubernetes.pod.annotation.prometheus.io/port}"

                                                                                                                                                port_filter

                                                                                                                                                A set of include and exclude rules that define the ultimate set of listening TCP ports for an eligible process on which scraping may be attempted. Note that the syntax is different from the port pattern option from within the higher-level include rule in the process_filter. Here a given rule can include single ports, comma-separated lists of ports (enclosed in square brackets), or contiguous port ranges (without brackets).

                                                                                                                                                port_filter:

                                                                                                                                                - include: 8080 - exclude: [9092,9200,9300] - include: 9090-9100

                                                                                                                                                path

                                                                                                                                                Either the static specification of an endpoint to be scraped, or a container/Kubernetes Label name or Kubernetes Annotation specified in curly braces. If the process is running in a container that is marked with this Label or is attached to a Kubernetes object (Pod, Namespace, etc.) that is marked with this Annotation/Label, scraping will be attempted via the endpoint specified as the value of the Label/Annotation.

                                                                                                                                                If path is not specified, or specified but the Agent does not find the Label/Annotation attached to the process, the common Prometheus exporter default of /metrics will be used.

                                                                                                                                                Note: A Label/Annotation to match against will not include the text shown in red.

                                                                                                                                                Note: See Kubernetes Objects for information on the full set of supported Annotations and Labels.

                                                                                                                                                path: "/prometheus"

                                                                                                                                                - or -

                                                                                                                                                path: "{container.label.io.prometheus.path}"

                                                                                                                                                - or -

                                                                                                                                                path: "{kubernetes.pod.annotation.prometheus.io/path}"

                                                                                                                                                host

                                                                                                                                                A hostname or IP address. The default is localhost.

                                                                                                                                                host: 192.168.1.101
                                                                                                                                                - or -
                                                                                                                                                host: subdomain.example.com
                                                                                                                                                - or -
                                                                                                                                                host: localhost

                                                                                                                                                use_https

                                                                                                                                                When set to true, connectivity to the exporter will only be attempted through HTTPS instead of HTTP. It is false by default.

                                                                                                                                                (Available in Agent version 0.79.0 and newer)

                                                                                                                                                use_https: true

                                                                                                                                                ssl_verify

When set to true, the server certificate is verified for HTTPS connections. It is false by default. (Before agent version 0.79.0, verification was enabled by default.)

                                                                                                                                                (Available in Agent version 0.79.0 and newer)

                                                                                                                                                ssl_verify: true
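
The two options are typically used together inside a conf block. A minimal sketch, assuming a hypothetical exporter process listening for HTTPS on port 8443 (process name and port are illustrative, not from the default configuration):

prometheus:
  enabled: true
  process_filter:
    - include:
        process.name: my-exporter   # hypothetical process name
        conf:
          port: 8443                # hypothetical HTTPS port
          use_https: true
          ssl_verify: true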

                                                                                                                                                Authentication Integration

                                                                                                                                                As of agent version 0.89, Sysdig can collect Prometheus metrics from endpoints requiring authentication. Use the parameters below to enable this function.

                                                                                                                                                • For username/password authentication:

                                                                                                                                                  • username

                                                                                                                                                  • password

                                                                                                                                                • For authentication using a token:

                                                                                                                                                  • auth_token_path
                                                                                                                                                • For certificate authentication with a certificate key:

                                                                                                                                                  • auth_cert_path

                                                                                                                                                  • auth_key_path

Token substitution is also supported for all the authentication parameters. For instance, a username can be taken from a Kubernetes annotation by specifying:

                                                                                                                                                username: "{kubernetes.service.annotation.prometheus.openshift.io/username}"

Authentication Example

Below is an example of the dragent.yaml section showing all the Prometheus authentication configuration options, applied to OpenShift, the Kubernetes kubelet, and etcd.

                                                                                                                                                In this example:

                                                                                                                                                • The username/password are taken from a default annotation used by OpenShift.

                                                                                                                                                • The auth token path is commonly available in Kubernetes deployments.

• The certificate and key used here for etcd may not normally be as easily accessible to the agent. In this example they were extracted from the host namespace, stored as Kubernetes Secrets, and then mounted into the agent container.

                                                                                                                                                prometheus:
                                                                                                                                                  enabled: true
                                                                                                                                                  process_filter:
                                                                                                                                                    - include:
                                                                                                                                                        port: 1936
                                                                                                                                                        conf:
                                                                                                                                                            username: "{kubernetes.service.annotation.prometheus.openshift.io/username}"
                                                                                                                                                            password: "{kubernetes.service.annotation.prometheus.openshift.io/password}"
                                                                                                                                                    - include:
                                                                                                                                                        process.name: kubelet
                                                                                                                                                        conf:
                                                                                                                                                            port: 10250
                                                                                                                                                            use_https: true
                                                                                                                                                            auth_token_path: "/run/secrets/kubernetes.io/serviceaccount/token"
                                                                                                                                                    - include:
                                                                                                                                                        process.name: etcd
                                                                                                                                                        conf:
                                                                                                                                                            port: 2379
                                                                                                                                                            use_https: true
                                                                                                                                                            auth_cert_path: "/run/secrets/etcd/client-cert"
                                                                                                                                                            auth_key_path: "/run/secrets/etcd/client-key"
                                                                                                                                                

                                                                                                                                                Kubernetes Objects

                                                                                                                                                As described above, there are multiple configuration options that can be set based on auto-discovered values for Kubernetes Labels and/or Annotations. The format in each case begins with "kubernetes.OBJECT.annotation." or "kubernetes.OBJECT.label." where OBJECT can be any of the following supported Kubernetes object types:

                                                                                                                                                • daemonSet

                                                                                                                                                • deployment

                                                                                                                                                • namespace

                                                                                                                                                • node

                                                                                                                                                • pod

                                                                                                                                                • replicaSet

                                                                                                                                                • replicationController

                                                                                                                                                • service

                                                                                                                                                • statefulset

                                                                                                                                                The configuration text you add after the final dot becomes the name of the Kubernetes Label/Annotation that the Agent will look for. If the Label/Annotation is discovered attached to the process, the value of that Label/Annotation will be used for the configuration option.

                                                                                                                                                Note that there are multiple ways for a Kubernetes Label/Annotation to be attached to a particular process. One of the simplest examples of this is the Pod-based approach shown in Quick Start For Kubernetes Environments. However, as an example alternative to marking at the Pod level, you could attach Labels/Annotations at the Namespace level, in which case auto-discovered configuration options would apply to all processes running in that Namespace regardless of whether they’re in a Deployment, DaemonSet, ReplicaSet, etc.
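
For example, a minimal sketch of Namespace-level discovery, assuming you have attached prometheus.io/scrape, prometheus.io/port, and prometheus.io/path Annotations to the Namespace rather than to individual Pods (the Annotation names follow the common Prometheus convention and are illustrative):

prometheus:
  enabled: true
  process_filter:
    - include:
        kubernetes.namespace.annotation.prometheus.io/scrape: true
        conf:
          path: "{kubernetes.namespace.annotation.prometheus.io/path}"
          port: "{kubernetes.namespace.annotation.prometheus.io/port}"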

                                                                                                                                                6.1.4 -

                                                                                                                                                (Legacy) Filtering Prometheus Metrics

As of Sysdig agent 9.8.0, a lightweight Prometheus server named promscrape is embedded in the agent, and a prometheus.yaml file is included as part of the configuration files. Sysdig leverages open source Prometheus capabilities to let you filter Prometheus metrics at the source before ingestion. To do so, you will:

• Ensure that Prometheus scraping is enabled in the dragent.yaml file:

                                                                                                                                                  prometheus:
                                                                                                                                                    enabled: true
                                                                                                                                                  
                                                                                                                                                • On agent v9.8.0 and above, enable the feature by setting the

                                                                                                                                                  use_promscrape parameter to true in the dragent.yaml. See Enable Filtering at Ingestion.

                                                                                                                                                • Edit the configuration in the prometheus.yaml file. See Edit Prometheus Configuration File.

                                                                                                                                                  Sysdig-specific configuration is found in the prometheus.yaml file.

                                                                                                                                                Enable Filtering at Ingestion

                                                                                                                                                On agent v9.8.0, in order for target filtering to work, the use_promscrape parameter in the dragent.yaml must be set to true. For more information on configuration, see Configuring Sysdig Agent.

                                                                                                                                                use_promscrape: true
                                                                                                                                                

On agent v10.0 and later, use_promscrape is enabled by default, which means promscrape is used for scraping Prometheus metrics.

                                                                                                                                                Filtering configuration is optional. The absence of prometheus.yaml  will not change the existing behavior of the agent.

                                                                                                                                                Edit Prometheus Configuration File

                                                                                                                                                About the Prometheus Configuration File

The prometheus.yaml file mostly contains the filtering/relabeling configuration, expressed as a list of key-value pairs that represent target process attributes.

Replace the keys and values with the tags that correspond to your environment.

                                                                                                                                                In this file, you will configure the following:

                                                                                                                                                • Default scrape interval (optional).

                                                                                                                                                  For example:

                                                                                                                                                  scrape_interval: 10s

• Metric relabeling configuration. Of the relabeling parameters that Prometheus offers, Sysdig supports only metric_relabel_configs; the relabel_configs parameter is not supported.

                                                                                                                                                • Zero or more process-specific filtering configurations (optional).

                                                                                                                                                  See Kubernetes Environments and Docker Environments

                                                                                                                                                  The filtering configuration includes:

                                                                                                                                                  • Filtering rules

                                                                                                                                                    For example:

                                                                                                                                                    - source_labels: [container_label_io_kubernetes_pod_name]

                                                                                                                                                  • Limit on number of scraped samples (optional)

                                                                                                                                                    For example:

                                                                                                                                                    sample_limit: 2000

                                                                                                                                                • Default filtering configuration (optional). The filtering configuration includes:

                                                                                                                                                  • Filtering rules

                                                                                                                                                    For example:

                                                                                                                                                    - source_labels: [car]

                                                                                                                                                  • Limit on number of scraped samples (optional)

                                                                                                                                                    For example:

                                                                                                                                                    sample_limit: 2000

The prometheus.yaml file is installed alongside dragent.yaml. For the most part, the syntax of prometheus.yaml complies with the standard Prometheus configuration.

                                                                                                                                                Default Configuration

                                                                                                                                                A configuration with empty key-value pairs is considered a default configuration. The default configuration will be applied to all the processes to be scraped that don’t have a matching filtering configuration. In Sample Prometheus Configuration File, the job_name: 'default' section represents the default configuration.

                                                                                                                                                Kubernetes Environments

                                                                                                                                                If the agent runs in Kubernetes environments (Open Source/OpenShift/GKE), include the following Kubernetes objects as key-value pairs. See Agent Install: Kubernetes for details on agent installation.

                                                                                                                                                For example:

                                                                                                                                                sysdig_sd_configs:
                                                                                                                                                - tags:
                                                                                                                                                    namespace: backend
                                                                                                                                                    deployment: my-api
                                                                                                                                                

                                                                                                                                                In addition to the aforementioned tags, any of these object types can be matched against:

                                                                                                                                                daemonset: my_daemon
                                                                                                                                                deployment: my_deployment
                                                                                                                                                hpa: my_hpa
                                                                                                                                                namespace: my_namespace
                                                                                                                                                node: my_node
pod: my_pod
                                                                                                                                                replicaset: my_replica
                                                                                                                                                replicationcontroller: my_controller
                                                                                                                                                resourcequota: my_quota
                                                                                                                                                service: my_service
statefulset: my_statefulset
                                                                                                                                                

                                                                                                                                                For Kubernetes/OpenShift/GKE deployments, prometheus.yaml shares the same ConfigMap with dragent.yaml.
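
A minimal sketch of what that shared ConfigMap could look like; the ConfigMap name and the exact contents are assumptions and should be matched to your actual agent deployment:

apiVersion: v1
kind: ConfigMap
metadata:
  name: sysdig-agent            # assumed name; use your agent's ConfigMap
data:
  dragent.yaml: |
    prometheus:
      enabled: true
  prometheus.yaml: |
    global:
      scrape_interval: 20s
    scrape_configs:
    - job_name: 'default'
      sysdig_sd_configs: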

                                                                                                                                                Docker Environments

                                                                                                                                                In Docker environments, include attributes such as container, host, port, and more. For example:

                                                                                                                                                sysdig_sd_configs:
                                                                                                                                                - tags:
                                                                                                                                                    host: my-host
                                                                                                                                                    port: 8080
                                                                                                                                                

                                                                                                                                                For Docker-based deployments, prometheus.yaml can be mounted from the host.

                                                                                                                                                Sample Prometheus Configuration File

global:
  scrape_interval: 20s
scrape_configs:
- job_name: 'default'
  sysdig_sd_configs: # default config
  relabel_configs:
- job_name: 'my-app-job'
  sample_limit: 2000
  sysdig_sd_configs:  # apply this filtering config only to my-app
  - tags:
      namespace: backend
      deployment: my-app
  metric_relabel_configs:
  # Drop all metrics starting with http_
  - source_labels: [__name__]
    regex: "http_(.+)"
    action: drop
  # Drop all metrics for which the city label equals atlantis
  - source_labels: [city]
    regex: "atlantis"
    action: drop
                                                                                                                                                

                                                                                                                                                6.1.5 -

                                                                                                                                                (Legacy) Example Configuration

                                                                                                                                                This topic introduces you to default and specific Prometheus configurations.

                                                                                                                                                Default Configuration

                                                                                                                                                As an example that pulls together many of the configuration elements shown above, consider the default Agent configuration that’s inherited from the dragent.default.yaml.

                                                                                                                                                prometheus:
                                                                                                                                                  enabled: true
                                                                                                                                                  interval: 10
                                                                                                                                                  log_errors: true
                                                                                                                                                  max_metrics: 1000
                                                                                                                                                  max_metrics_per_process: 100
                                                                                                                                                  max_tags_per_metric: 20
                                                                                                                                                
                                                                                                                                                  # Filtering processes to scan. Processes not matching a rule will not
                                                                                                                                                  # be scanned
                                                                                                                                                  # If an include rule doesn't contain a port or port_filter in the conf
                                                                                                                                                  # section, we will scan all the ports that a matching process is listening to.
                                                                                                                                                  process_filter:
                                                                                                                                                    - exclude:
                                                                                                                                                        process.name: docker-proxy
                                                                                                                                                    - exclude:
                                                                                                                                                        container.image: sysdig/agent
                                                                                                                                                    # special rule to exclude processes matching configured prometheus appcheck
                                                                                                                                                    - exclude:
                                                                                                                                                        appcheck.match: prometheus
                                                                                                                                                    - include:
                                                                                                                                                        container.label.io.prometheus.scrape: "true"
                                                                                                                                                        conf:
                                                                                                                                                            # Custom path definition
                                                                                                                                                            # If the Label doesn't exist we'll still use "/metrics"
                                                                                                                                                            path: "{container.label.io.prometheus.path}"
                                                                                                                                                
                                                                                                                                                            # Port definition
                                                                                                                                                            # - If the Label exists, only scan the given port.
                                                                                                                                                            # - If it doesn't, use port_filter instead.
                                                                                                                                                            # - If there is no port_filter defined, skip this process
                                                                                                                                                            port: "{container.label.io.prometheus.port}"
                                                                                                                                                            port_filter:
                                                                                                                                                                - exclude: [9092,9200,9300]
                                                                                                                                                                - include: 9090-9500
                                                                                                                                                                - include: [9913,9984,24231,42004]
                                                                                                                                                    - exclude:
                                                                                                                                                        container.label.io.prometheus.scrape: "false"
                                                                                                                                                    - include:
                                                                                                                                                        kubernetes.pod.annotation.prometheus.io/scrape: true
                                                                                                                                                        conf:
                                                                                                                                                            path: "{kubernetes.pod.annotation.prometheus.io/path}"
                                                                                                                                                            port: "{kubernetes.pod.annotation.prometheus.io/port}"
                                                                                                                                                    - exclude:
                                                                                                                                                        kubernetes.pod.annotation.prometheus.io/scrape: false
                                                                                                                                                

                                                                                                                                                Consider the following about this default configuration:

                                                                                                                                                • All Prometheus scraping is disabled by default. To enable the entire configuration shown here, you would only need to add the following to your dragent.yaml:

                                                                                                                                                  prometheus:
                                                                                                                                                    enabled: true
                                                                                                                                                  

Once this option is enabled, any Pods (in Kubernetes) that have the right Annotations set, or containers (otherwise) that have the right Labels set, will automatically be scraped.

                                                                                                                                                • Once enabled, this default configuration is ideal for the use case described in the Quick Start For Kubernetes Environments.

                                                                                                                                                • A Process Filter rule excludes processes that are likely to exist in most environments but are known to never export Prometheus metrics, such as the Docker Proxy and the Agent itself.

                                                                                                                                                • Another Process Filter rule ensures that any processes configured to be scraped by the legacy Prometheus application check will not be scraped.

• Another Process Filter rule is tailored to use container Labels. Processes marked with the container Label io.prometheus.scrape become eligible for scraping; if they are further marked with the container Labels io.prometheus.port and/or io.prometheus.path, scraping is attempted only on that port and/or endpoint. If the container is not marked with the specified path Label, scraping of the /metrics endpoint is attempted. If the container is not marked with the specified port Label, scraping is attempted on any listening ports matched by the port_filter (in the default configuration, this port_filter covers the port range of common Prometheus exporters, with exclusions for ports in that range known to be used by other applications that are not exporters).

                                                                                                                                                • The final Process Filter Include rule is tailored to the use case described in the Quick Start For Kubernetes Environments.

                                                                                                                                                Scrape a Single Custom Process

                                                                                                                                                If you need to scrape a single custom process, for instance, a java process listening on port 9000 with path /prometheus, add the following to the dragent.yaml:

                                                                                                                                                prometheus:
                                                                                                                                                  enabled: true
                                                                                                                                                  process_filter:
                                                                                                                                                    - include:
                                                                                                                                                        process.name: java
                                                                                                                                                        port: 9000
                                                                                                                                                        conf:
                                                                                                                                                          # ensure we only scrape port 9000 as opposed to all ports this process may be listening to
                                                                                                                                                          port: 9000
                                                                                                                                                          path: "/prometheus"
                                                                                                                                                

                                                                                                                                                This configuration overrides the default process_filter section shown in Default Configuration. You can add relevant rules from the default configuration to this to further filter down the metrics.

                                                                                                                                                port has different purposes depending on where it’s placed in the configuration. When placed under the include section, it is a condition for matching the include rule.

                                                                                                                                                Placing a port under conf indicates that only that particular port is scraped when the rule is matched as opposed to all the ports that the process could be listening on.

In this example, the include rule matches the java process listening on port 9000, and only port 9000 will be scraped.
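
For contrast, a sketch of the same rule without a port under conf (the process name and port are the same illustrative values as above). Per the comments in the default configuration, when conf contains neither port nor port_filter, scraping is attempted on every port the matching process listens on:

prometheus:
  enabled: true
  process_filter:
    - include:
        process.name: java
        port: 9000              # matching condition only
        conf:
          path: "/prometheus"   # no port here: all listening ports are tried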

                                                                                                                                                Scrape a Single Custom Process Based on Container Labels

                                                                                                                                                If you still want to scrape based on container labels, you could just append the relevant rules from the defaults to the process_filter. For example:

                                                                                                                                                prometheus:
                                                                                                                                                  enabled: true
                                                                                                                                                  process_filter:
                                                                                                                                                    - include:
                                                                                                                                                        process.name: java
                                                                                                                                                        port: 9000
                                                                                                                                                        conf:
                                                                                                                                                          # ensure we only scrape port 9000 as opposed to all ports this process may be listening to
                                                                                                                                                          port: 9000
                                                                                                                                                          path: "/prometheus"
                                                                                                                                                    - exclude:
                                                                                                                                                        process.name: docker-proxy
                                                                                                                                                    - include:
                                                                                                                                                        container.label.io.prometheus.scrape: "true"
                                                                                                                                                        conf:
                                                                                                                                                            path: "{container.label.io.prometheus.path}"
                                                                                                                                                            port: "{container.label.io.prometheus.port}"
                                                                                                                                                

                                                                                                                                                port has a different meaning depending on where it’s placed in the configuration. When placed under the include section, it’s a condition for matching the include rule.

                                                                                                                                                Placing port under conf indicates that only that port is scraped when the rule is matched as opposed to all the ports that the process could be listening on.

In this example, the first rule matches the java process listening on port 9000, and only port 9000 will be scraped.

                                                                                                                                                Container Environment

                                                                                                                                                With this default configuration enabled, a containerized install of our example exporter shown below would be automatically scraped via the Agent.

                                                                                                                                                # docker run -d -p 8080:8080 \
                                                                                                                                                    --label io.prometheus.scrape="true" \
                                                                                                                                                    --label io.prometheus.port="8080" \
                                                                                                                                                    --label io.prometheus.path="/prometheus" \
                                                                                                                                                    luca3m/prometheus-java-app
                                                                                                                                                

                                                                                                                                                Kubernetes Environment

                                                                                                                                                In a Kubernetes-based environment, a Deployment with the Annotations as shown in this example YAML would be scraped by enabling the default configuration.

                                                                                                                                                apiVersion: extensions/v1beta1
                                                                                                                                                kind: Deployment
                                                                                                                                                metadata:
                                                                                                                                                  name: prometheus-java-app
                                                                                                                                                spec:
                                                                                                                                                  replicas: 1
                                                                                                                                                  template:
                                                                                                                                                    metadata:
                                                                                                                                                      labels:
                                                                                                                                                        app: prometheus-java-app
                                                                                                                                                      annotations:
                                                                                                                                                        prometheus.io/scrape: "true"
                                                                                                                                                        prometheus.io/path: "/prometheus"
                                                                                                                                                        prometheus.io/port: "8080"
                                                                                                                                                    spec:
                                                                                                                                                      containers:
                                                                                                                                                        - name: prometheus-java-app
                                                                                                                                                          image: luca3m/prometheus-java-app
                                                                                                                                                          imagePullPolicy: Always
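
The annotation-driven behavior comes from a similar default rule that keys off the prometheus.io/* pod annotations. Again, this is only a rough sketch; check dragent.default.yaml for the exact defaults:

prometheus:
  enabled: true
  process_filter:
    - include:
        kubernetes.pod.annotation.prometheus.io/scrape: true
        conf:
          port: "{kubernetes.pod.annotation.prometheus.io/port}"
          path: "{kubernetes.pod.annotation.prometheus.io/path}"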
                                                                                                                                                

                                                                                                                                                Non-Containerized Environment

                                                                                                                                                This is an example of a non-containerized environment or a containerized environment that doesn’t use Labels or Annotations. The following dragent.yaml would override the default and do per-second scrapes of our sample exporter and also a second exporter on port 5005, each at their respective non-standard endpoints. This can be thought of as a conservative “whitelist” type of configuration since it restricts scraping to only exporters that are known to exist in the environment and the ports on which they’re known to export Prometheus metrics.

                                                                                                                                                prometheus:
                                                                                                                                                  enabled: true
                                                                                                                                                  interval: 1
                                                                                                                                                  process_filter:
                                                                                                                                                    - include:
                                                                                                                                                        process.cmdline: "*app.jar*"
                                                                                                                                                        conf:
                                                                                                                                                          port: 8080
                                                                                                                                                          path: "/prometheus"
                                                                                                                                                    - include:
                                                                                                                                                        port: 5005
                                                                                                                                                        conf:
                                                                                                                                                          port: 5005
                                                                                                                                                          path: "/wacko"
                                                                                                                                                

                                                                                                                                                port has a different meaning depending on where it’s placed in the configuration. When placed under the include section, it’s a condition for matching the include rule. Placing port under conf indicates that only that port is scraped when the rule is matched as opposed to all the ports that the process could be listening on.

In this example, the first rule matches the process whose command line contains *app.jar*. The java process is scraped only on port 8080, as opposed to all the ports that *app.jar* could be listening on. The second rule matches port 5005, and the matching process is scraped only on that port.

                                                                                                                                                6.1.6 -

                                                                                                                                                (Legacy) Logging and Troubleshooting

                                                                                                                                                Logging

After the Agent begins scraping Prometheus metrics, there may be a delay of up to a few minutes before the metrics become visible in Sysdig Monitor. To help you quickly confirm that your configuration is correct, starting with Agent version 0.80.0, the following log line appears in the Agent log the first time after startup that the Agent has found and is successfully scraping at least one Prometheus exporter:

                                                                                                                                                2018-05-04 21:42:10.048, 8820, Information, 05-04 21:42:10.048324 Starting export of Prometheus metrics
                                                                                                                                                

As this is an INFO level log message, it will appear in Agents using the default logging settings. To reveal even more detail, increase the Agent log level to DEBUG, which produces a message like the following that reveals the name of the first metric detected. You can then look for this metric to become visible in Sysdig Monitor shortly after.

                                                                                                                                                2018-05-04 21:50:46.068, 11212, Debug, 05-04 21:50:46.068141 First prometheus metrics since agent start: pid 9583: 5 metrics including: randomSummary.95percentile
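
The log level itself is raised in dragent.yaml. A minimal sketch, assuming the standard Agent log settings block (restart the Agent after the change):

log:
  file_priority: debug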
                                                                                                                                                

                                                                                                                                                Troubleshooting

                                                                                                                                                See the previous section for information on expected log messages during successful scraping. If you have enabled Prometheus and are not seeing the Starting export message shown there, revisit your configuration.

It is also suggested to leave the log_errors configuration option at its default setting of true, which will surface any issues scraping eligible processes in the Agent log.
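This option lives under the prometheus section of dragent.yaml; a minimal sketch showing the default explicitly:

prometheus:
  enabled: true
  log_errors: true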

                                                                                                                                                For example, here is an error message for a failed scrape of a TCP port that was listening but not accepting HTTP requests:

                                                                                                                                                2017-10-13 22:00:12.076, 4984, Error, sdchecks[4987] Exception on running check prometheus.5000: Exception('Timeout when hitting http://localhost:5000/metrics',)
                                                                                                                                                2017-10-13 22:00:12.076, 4984, Error, sdchecks, Traceback (most recent call last):
                                                                                                                                                2017-10-13 22:00:12.076, 4984, Error, sdchecks, File "/opt/draios/lib/python/sdchecks.py", line 246, in run
                                                                                                                                                2017-10-13 22:00:12.076, 4984, Error, sdchecks, self.check_instance.check(self.instance_conf)
                                                                                                                                                2017-10-13 22:00:12.076, 4984, Error, sdchecks, File "/opt/draios/lib/python/checks.d/prometheus.py", line 44, in check
                                                                                                                                                2017-10-13 22:00:12.076, 4984, Error, sdchecks, metrics = self.get_prometheus_metrics(query_url, timeout, "prometheus")
                                                                                                                                                2017-10-13 22:00:12.076, 4984, Error, sdchecks, File "/opt/draios/lib/python/checks.d/prometheus.py", line 105, in get_prometheus_metrics
                                                                                                                                                2017-10-13 22:00:12.077, 4984, Error, sdchecks, raise Exception("Timeout when hitting %s" % url)
                                                                                                                                                2017-10-13 22:00:12.077, 4984, Error, sdchecks, Exception: Timeout when hitting http://localhost:5000/metrics
                                                                                                                                                

                                                                                                                                                Here is an example error message for a failed scrape of a port that was responding to HTTP requests on the /metrics endpoint but not responding with valid Prometheus-format data. The invalid endpoint is responding as follows:

                                                                                                                                                # curl http://localhost:5002/metrics
                                                                                                                                                This ain't no Prometheus metrics!
                                                                                                                                                

                                                                                                                                                And the corresponding error message in the Agent log, indicating no further scraping will be attempted after the initial failure:

                                                                                                                                                2017-10-13 22:03:05.081, 5216, Information, sdchecks[5219] Skip retries for Prometheus error: could not convert string to float: ain't
                                                                                                                                                2017-10-13 22:03:05.082, 5216, Error, sdchecks[5219] Exception on running check prometheus.5002: could not convert string to float: ain't
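
For contrast, a healthy exporter returns data in the Prometheus text exposition format, which the Agent can parse. Illustrative output only, using the example exporter from earlier:

# curl http://localhost:8080/prometheus
# HELP http_requests_total Total number of HTTP requests served.
# TYPE http_requests_total counter
http_requests_total{method="get",code="200"} 1027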
                                                                                                                                                
                                                                                                                                                

                                                                                                                                                6.1.7 -

                                                                                                                                                This feature is not supported with Promscrape V2. For information on different versions of Promscrape and migrating to the latest version, see Migrating from Promscrape V1 to V2.

                                                                                                                                                (Legacy) Collecting Prometheus Metrics from Remote Hosts

Sysdig Monitor can collect Prometheus metrics from remote endpoints with minimal configuration. Remote endpoints (remote hosts) are hosts where the Sysdig Agent cannot be deployed; for example, a Kubernetes master node on a managed Kubernetes service such as GKE or EKS, where user workloads (and therefore Agents) cannot run. Enabling remote scraping on such hosts is as simple as identifying an Agent to perform the scraping and declaring the endpoint configurations in a remote_services section of that Agent's configuration file.

The collected Prometheus metrics are reported under, and associated with, the Agent that performed the scraping, rather than with a particular process.

                                                                                                                                                Preparing the Configuration File

Multiple Agents can share the same configuration, so use the dragent.yaml file to determine which of those Agents scrapes the remote endpoints. This applies to both Kubernetes and Docker environments, as shown in the examples below.

                                                                                                                                                • Create a separate configuration section for remote services in the Agent configuration file under the prometheus configuration.

                                                                                                                                                • Include a configuration section for each remote endpoint, and add either a URL or host/port (and an optional path) parameter to each section to identify the endpoint to scrape. The optional path identifies the resource at the endpoint. An empty path parameter defaults to the "/metrics" endpoint for scraping.

                                                                                                                                                • Optionally, add custom tags for each endpoint configuration for remote services. In the absence of tags, metric reporting might not work as expected when multiple endpoints are involved. Agents cannot distinguish similar metrics scraped from multiple endpoints unless those metrics are uniquely identified by tags.

                                                                                                                                                To help you get started, an example configuration for Kubernetes is given below:

                                                                                                                                                prometheus:
                                                                                                                                                  remote_services:
                                                                                                                                                        - prom_1:
                                                                                                                                                            kubernetes.node.annotation.sysdig.com/region: europe
                                                                                                                                                            kubernetes.node.annotation.sysdig.com/scraper: true
                                                                                                                                                            conf:
                                                                                                                                                                url: "https://xx.xxx.xxx.xy:5005/metrics"
                                                                                                                                                                tags:
                                                                                                                                                                    host: xx.xxx.xxx.xy
                                                                                                                                                                    service: prom_1
                                                                                                                                                                    scraping_node: "{kubernetes.node.name}"
                                                                                                                                                        - prom_2:
                                                                                                                                                            kubernetes.node.annotation.sysdig.com/region: india
                                                                                                                                                            kubernetes.node.annotation.sysdig.com/scraper: true
                                                                                                                                                            conf:
                                                                                                                                                                host: xx.xxx.xxx.yx
                                                                                                                                                                port: 5005
                                                                                                                                                                use_https: true
                                                                                                                                                                tags:
                                                                                                                                                                    host: xx.xxx.xxx.yx
                                                                                                                                                                    service: prom_2
                                                                                                                                                                    scraping_node: "{kubernetes.node.name}"
                                                                                                                                                        - prom_3:
                                                                                                                                                            kubernetes.pod.annotation.sysdig.com/prom_3_scraper: true
                                                                                                                                                            conf:
                                                                                                                                                                url: "{kubernetes.pod.annotation.sysdig.com/prom_3_url}"
                                                                                                                                                                tags:
                                                                                                                                                                    service: prom_3
                                                                                                                                                                    scraping_node: "{kubernetes.node.name}"
                                                                                                                                                        - haproxy:
                                                                                                                                                            kubernetes.node.annotation.yourhost.com/haproxy_scraper: true
                                                                                                                                                            conf:
                                                                                                                                                                host: "mymasternode"
                                                                                                                                                                port: 1936
                                                                                                                                                                path: "/metrics"
                                                                                                                                                                username: "{kubernetes.node.annotation.yourhost.com/haproxy_username}"
                                                                                                                                                                password: "{kubernetes.node.annotation.yourhost.com/haproxy_password}"
                                                                                                                                                                tags:
                                                                                                                                                                    service: router
                                                                                                                                                

                                                                                                                                                In the above example, scraping is triggered by node and pod annotations. You can add annotations to nodes and pods by using the kubectl annotate command as follows:

kubectl annotate node mynode --overwrite \
    sysdig.com/region=india \
    sysdig.com/scraper=true \
    yourhost.com/haproxy_scraper=true \
    yourhost.com/haproxy_username=admin \
    yourhost.com/haproxy_password=admin
                                                                                                                                                

In this example, you set annotations on a node to trigger scraping of the prom_2 and haproxy services as defined in the above configuration.
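Similarly, the prom_3 rule above is driven by pod annotations, which must be present on the scraping Agent's own pod (see Rule Conditions below). A hedged sketch, assuming a hypothetical Agent pod name and a placeholder exporter URL:

kubectl annotate pod sysdig-agent-abc12 --overwrite sysdig.com/prom_3_scraper=true sysdig.com/prom_3_url=http://xx.xxx.xxx.xy:5005/metrics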

                                                                                                                                                Preparing Container Environments

An example configuration for a Docker environment is given below:

                                                                                                                                                prometheus:
                                                                                                                                                  remote_services:
                                                                                                                                                        - prom_container:
                                                                                                                                                            container.label.com.sysdig.scrape_xyz: true
                                                                                                                                                            conf:
                                                                                                                                                                url: "https://xyz:5005/metrics"
                                                                                                                                                                tags:
                                                                                                                                                                    host: xyz
                                                                                                                                                                    service: xyz
                                                                                                                                                

In order for remote scraping to work in a Docker-based container environment, set the com.sysdig.scrape_xyz=true label on the Agent container. For example:

docker run -d --name sysdig-agent --restart always --privileged --net host --pid host \
    --label com.sysdig.scrape_xyz=true \
    -e ACCESS_KEY=<KEY> -e COLLECTOR=<COLLECTOR> -e SECURE=true \
    -e TAGS=example_tag:example_value \
    -v /var/run/docker.sock:/host/var/run/docker.sock \
    -v /dev:/host/dev -v /proc:/host/proc:ro -v /boot:/host/boot:ro \
    -v /lib/modules:/host/lib/modules:ro -v /usr:/host/usr:ro \
    --shm-size=512m sysdig/agent
                                                                                                                                                

Replace <KEY>, <COLLECTOR>, and TAGS with your account key, collector address, and tags, respectively.

                                                                                                                                                Syntax of the Rules

The syntax of remote_services rules is almost identical to that of process_filter rules, with one exception: the remote_services section does not use include/exclude rules. In process_filter, only the first include or exclude rule that matches a process is applied, whereas in remote_services each rule has a corresponding service name and all matching rules are applied.

                                                                                                                                                Rule Conditions

The rule conditions work the same way as those for the process_filter. The only caveat is that the rules are matched against the Agent process and container, because the remote process/context is unknown. Matches for container labels and annotations therefore work as before, but they must apply to the Agent container itself. For instance, node annotations will match because the Agent container runs on a node.

                                                                                                                                                For annotations, multiple patterns can be specified in a single rule, in which case all patterns must match for the rule to be a match (AND operator). In the following example, the endpoint will not be considered unless both the annotations match:

                                                                                                                                                kubernetes.node.annotation.sysdig.com/region_scraper: europe
                                                                                                                                                kubernetes.node.annotation.sysdig.com/scraper: true
                                                                                                                                                

                                                                                                                                                That is, Kubernetes nodes belonging to only the Europe region are considered for scraping.

                                                                                                                                                Authenticating Sysdig Agent

The Sysdig Agent requires the necessary permissions on the remote host to scrape metrics. The authentication methods for local scraping also work for authenticating the Agent against remote hosts, but the authorization parameters work only in the Agent context.

• Certificate/key-based authentication requires the certificate and key to be stored in a Kubernetes Secret and mounted into the Agent container.

                                                                                                                                                • In token-based authentication, make sure the agent token has access rights on the remote endpoint to do the scraping.

• Use annotations to retrieve the username/password instead of passing them in plaintext. Any annotation enclosed in curly braces will be replaced by the value of that annotation; if the annotation doesn't exist, the value will be an empty string. Token substitution is supported for all the authorization parameters. Because authorization works only in the Agent context, credentials cannot be automatically retrieved from the target pod; instead, pass them through an annotation on the Agent pod by setting the password as an annotation on the selected Kubernetes object.
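For certificate- or token-based scraping, the corresponding conf parameters point at files mounted into the Agent container (for example, from a Kubernetes Secret). The parameter names below (auth_cert_path, auth_key_path) are assumptions carried over from the local-scraping authentication options; verify them against dragent.default.yaml before use:

- secure_service:
    kubernetes.node.annotation.sysdig.com/scraper: true
    conf:
        url: "https://xx.xxx.xxx.xy:8443/metrics"
        auth_cert_path: "/run/secrets/scrape-cert/tls.crt"
        auth_key_path: "/run/secrets/scrape-cert/tls.key"
        tags:
            service: secure_service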

                                                                                                                                                In the following example, an HAProxy account is authenticated with the password supplied in the yourhost.com/haproxy_password annotation on the agent node.

                                                                                                                                                - haproxy:
                                                                                                                                                            kubernetes.node.annotation.yourhost.com/haproxy_scraper: true
                                                                                                                                                            conf:
                                                                                                                                                                host: "mymasternode"
                                                                                                                                                                port: 1936
                                                                                                                                                                path: "/metrics"
                                                                                                                                                                username: "{kubernetes.node.annotation.yourhost.com/haproxy_username}"
                                                                                                                                                                password: "{kubernetes.node.annotation.yourhost.com/haproxy_password}"
                                                                                                                                                                tags:
                                                                                                                                                                    service: router
                                                                                                                                                

                                                                                                                                                6.2 -

                                                                                                                                                (Legacy) Integrate Applications (Default App Checks)

                                                                                                                                                We are sunsetting application checks in favor of Monitoring Integrations.

                                                                                                                                                The Sysdig agent supports additional application monitoring capabilities with application check scripts or ‘app checks’. These are a set of plugins that poll for custom metrics from the specific applications which export them via status or management pages: e.g. NGINX, Redis, MongoDB, Memcached and more.

Many app checks are enabled by default in the agent; when a supported application is found, the correct app check script is called and metrics are polled automatically.

                                                                                                                                                However, if default connection parameters are changed in your application, you will need to modify the app check connection parameters in the Sysdig Agent configuration file (dragent.yaml) to match your application.

                                                                                                                                                In some cases, you may also need to enable the metrics reporting functionality in the application before the agent can poll them.

                                                                                                                                                This page details how to make configuration changes in the agent’s configuration file, and provides an application integration example. Click the Supported Applications links for application-specific details.

                                                                                                                                                Python Version for App Checks:

                                                                                                                                                As of agent version 9.9.0, the default version of Python used for app checks is Python 3.

                                                                                                                                                Python 2 can still be used by setting the following option in your dragent.yaml:

                                                                                                                                                python_binary: <path to python 2.7 binary>

                                                                                                                                                For containerized agents, this path will be: /usr/bin/python2.7
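For example, a containerized agent pinned to Python 2.7 would carry this entry in dragent.yaml:

python_binary: /usr/bin/python2.7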

                                                                                                                                                Edit dragent.yaml to Integrate or Modify Application Checks

                                                                                                                                                Out of the box, the Sysdig agent will gather and report on a wide variety of pre-defined metrics. It can also accommodate any number of custom parameters for additional metrics collection.

                                                                                                                                                The agent relies on a pair of configuration files to define metrics collection parameters:

                                                                                                                                                dragent.default.yaml

                                                                                                                                                The core configuration file. You can look at it to understand more about the default configurations provided.

Location: /opt/draios/etc/dragent.default.yaml

                                                                                                                                                CAUTION. This file should never be edited.

                                                                                                                                                dragent.yaml

The configuration file where parameters can be added, either directly in YAML as name/value pairs or using environment variables such as ADDITIONAL_CONF. Location: /opt/draios/etc/dragent.yaml

                                                                                                                                                The “dragent.yaml” file can be accessed and edited in several ways, depending on how the agent was installed.

                                                                                                                                                Review Understanding the Agent Config Files for details.

The examples in this section presume you are entering YAML code directly into dragent.yaml, under the app_checks section.

                                                                                                                                                Find the default settings

                                                                                                                                                To find the default app-checks for already supported applications, check the dragent.default.yaml file.

                                                                                                                                                (Location: /opt/draios/etc/dragent.default.yaml.)

                                                                                                                                                Sample format

                                                                                                                                                app_checks:
                                                                                                                                                  - name: APP_NAME
                                                                                                                                                    check_module: APP_CHECK_SCRIPT
                                                                                                                                                    pattern:
                                                                                                                                                      comm: PROCESS_NAME
                                                                                                                                                    conf:
                                                                                                                                                      host: IP_ADDR
                                                                                                                                                      port: PORT
                                                                                                                                                

The parameters in the sample format are as follows (sample values in parentheses):

• app_checks — The main section of dragent.default.yaml that contains the list of pre-configured checks.

• name — Every check should have a unique name:, which is displayed in Sysdig Monitor as the process name of the integrated application (for example, MongoDB).

• check_module — The name of the Python plugin that polls the data from the designated application. All the app check scripts can be found inside the /opt/draios/lib/python/checks.d directory (for example, elastic).

• pattern — Used by the Sysdig agent to match a process with a check. Four kinds of keys can be specified, along with any arguments, to help distinguish them:

  • comm — Matches the process name as seen in /proc/PID/status.

  • port — Matches based on the port used (for example, MySQL identified by port: 3306).

  • arg — Matches any process arguments.

  • exe — Matches the process exe as seen in the /proc/PID/exe link.

• conf — This section is specific to each plugin; you can specify any key/values that the plugin supports:

  • host — Application-specific; a URL or IP address.

  • port — {...} tokens can be used as values, which will be substituted with values from process info.

                                                                                                                                                Change the default settings

                                                                                                                                                To override the defaults:

1. Copy the relevant code blocks from dragent.default.yaml into dragent.yaml. (Or copy the code from the appropriate app check integration page in this documentation section.)

  Any entries copied into the dragent.yaml file will override similar entries in dragent.default.yaml.

                                                                                                                                                  Never modify dragent.default.yaml, as it will be overwritten whenever the agent is updated.

                                                                                                                                                2. Modify the parameters as needed.

                                                                                                                                                  Be sure to use proper YAML. Pay attention to consistent spacing for indents (as shown) and list all check entries under an app_checks: section title.

                                                                                                                                                3. Save the changes and restart the agent.

  Use service dragent restart or docker restart sysdig-agent.

                                                                                                                                                Metrics for the relevant application should appear in the Sysdig Monitor interface under the appropriate name.

                                                                                                                                                Example 1: Change Name and Add Password

                                                                                                                                                Here is a sample app-check entry for Redis. The app_checks section was copied from the dragent.default.yaml file and modified for a specific instance.

                                                                                                                                                customerid: 831f3-Your-Access-Key-9401
                                                                                                                                                tags: local:sf,acct:dev,svc:db
                                                                                                                                                app_checks:
                                                                                                                                                  - name: redis-6380
                                                                                                                                                    check_module: redisdb
                                                                                                                                                    pattern:
                                                                                                                                                      comm: redis-server
                                                                                                                                                    conf:
                                                                                                                                                      host: 127.0.0.1
                                                                                                                                                      port: PORT
                                                                                                                                                      password: PASSWORD
                                                                                                                                                

                                                                                                                                                Edits made:

                                                                                                                                                • The name to be displayed in the interface

                                                                                                                                                • A required password.

                                                                                                                                                As the token PORT is used, it will be translated to the actual port where Redis is listening.

                                                                                                                                                Example 2: Increase Polling Interval

                                                                                                                                                The default interval for an application check to be run by the agent is set to every second. You can increase the interval per application check by adding the interval: parameter (under the -name section) and the number of seconds to wait before each run of the script.

                                                                                                                                                interval: must be put into each app check entry that should run less often; there is no global setting.

                                                                                                                                                Example: Run the NTP check once per minute:

                                                                                                                                                app_checks:
                                                                                                                                                  - name: ntp
                                                                                                                                                    interval: 60
                                                                                                                                                    pattern:
                                                                                                                                                      comm: systemd
                                                                                                                                                    conf:
                                                                                                                                                      host: us.pool.ntp.org
                                                                                                                                                

                                                                                                                                                Disabling

                                                                                                                                                Disable a Single Application Check

Sometimes the default configuration shipped with the Sysdig agent does not suit your environment, or you may not be interested in checks for a particular application. To turn off a single check, add an entry like the following to disable it:

                                                                                                                                                app_checks:
                                                                                                                                                 - name: nginx
                                                                                                                                                   enabled: false
                                                                                                                                                

                                                                                                                                                This entry overrides the default configuration of the nginx check, disabling it.

                                                                                                                                                If you are using the ADDITIONAL_CONF parameter to modify your container agent’s configuration, you would add an entry like this to your Docker run command (or Kubernetes manifest):

                                                                                                                                                -e ADDITIONAL_CONF="app_checks:\n  - name: nginx\n    enabled: false\n"
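
For context, here is a minimal sketch of where this flag fits in a full docker run command for the containerized agent; the image name sysdig/agent and the surrounding flags are illustrative assumptions, not a complete install command:

docker run -d --name sysdig-agent \
  --privileged \
  -e ACCESS_KEY=<YOUR_ACCESS_KEY> \
  -e ADDITIONAL_CONF="app_checks:\n  - name: nginx\n    enabled: false\n" \
  sysdig/agent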
                                                                                                                                                

                                                                                                                                                Disable ALL Application Checks

If you do not need application checks, or otherwise want to disable the functionality entirely, add the following entry to the agent's user-settings configuration file, /opt/draios/etc/dragent.yaml:

                                                                                                                                                app_checks_enabled: false
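
For container installs that use ADDITIONAL_CONF (as in the single-check example above), the equivalent entry would be a sketch along these lines:

-e ADDITIONAL_CONF="app_checks_enabled: false\n"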
                                                                                                                                                

                                                                                                                                                Restart the agent as shown immediately above for either the native Linux agent installation or the container agent installation.
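
For reference, a typical restart looks like the following; the service name dragent and the container name sysdig-agent are assumptions that may differ in your installation:

# Native Linux agent installation
sudo service dragent restart

# Container agent installation
docker restart sysdig-agent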

                                                                                                                                                Optional: Configure a Custom App-Check

Sysdig allows a custom application check-script configuration to be created for each individual container in the infrastructure, via the environment variable SYSDIG_AGENT_CONF. This avoids the need for multiple edits and entries in the agent configuration, because application teams can configure their own checks at the container level.

The SYSDIG_AGENT_CONF variable stores a YAML-formatted configuration for the app check and is used to match app-check configurations. It can be set directly within the Dockerfile.

                                                                                                                                                The syntax is the same as dragent.yaml syntax.

                                                                                                                                                The example below defines a per container app-check for Redis in the Dockerfile, using the SYSDIG_AGENT_CONF environment variable:

                                                                                                                                                FROM redis
                                                                                                                                                # This config file adds a password for accessing redis instance
                                                                                                                                                ADD redis.conf /
                                                                                                                                                
                                                                                                                                                ENV SYSDIG_AGENT_CONF { "app_checks": [{ "name": "redis", "check_module": "redisdb", "pattern": {"comm": "redis-server"}, "conf": { "host": "127.0.0.1", "port": "6379", "password": "protected"} }] }
                                                                                                                                                ENTRYPOINT ["redis-server"]
                                                                                                                                                CMD [ "/redis.conf" ]
                                                                                                                                                

The example below shows how the parameters can be added to a container started with docker run, either by using the -e/--env flag or by injecting them through an orchestration system (for example, Kubernetes):

                                                                                                                                                PER_CONTAINER_CONF='{ "app_checks": [{ "name": "redis", "check_module": "redisdb", "pattern": {"comm": "redis-server"}, "conf": { "host": "127.0.0.1", "port": "6379", "password": "protected"} }] }'
                                                                                                                                                
                                                                                                                                                docker run --name redis -v /tmp/redis.conf:/etc/redis.conf -e SYSDIG_AGENT_CONF="${PER_CONTAINER_CONF}" -d redis /etc/redis.conf
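
In Kubernetes, the same variable could be injected through the container's env section in the pod spec; a minimal sketch (the surrounding pod and container definition is omitted):

env:
  - name: SYSDIG_AGENT_CONF
    value: '{ "app_checks": [{ "name": "redis", "check_module": "redisdb", "pattern": {"comm": "redis-server"}, "conf": { "host": "127.0.0.1", "port": "6379", "password": "protected"} }] }'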
                                                                                                                                                

                                                                                                                                                Metrics Limit

Metric limits are defined by your payment plan. If more metrics are needed, please contact your sales representative with your use case.

Note that metrics with the same name but different tags are counted as unique metrics by the agent. For example, a metric 'user.clicks' with the tag 'country=us' and another 'user.clicks' with the tag 'country=it' are considered two metrics, and both count toward the limit.

                                                                                                                                                Supported Applications

Below is the list of supported applications that the agent will automatically poll.

                                                                                                                                                Some app-check scripts will need to be configured since no defaults exist, while some applications may need to be configured to output their metrics. Click a highlighted link to see application-specific notes.

                                                                                                                                                • Active MQ
                                                                                                                                                • Apache
                                                                                                                                                • Apache CouchDB
                                                                                                                                                • Apache HBase
                                                                                                                                                • Apache Kafka
                                                                                                                                                • Apache Zookeeper
                                                                                                                                                • Consul
                                                                                                                                                • CEPH
                                                                                                                                                • Couchbase
                                                                                                                                                • Elasticsearch
                                                                                                                                                • etcd
                                                                                                                                                • fluentd
                                                                                                                                                • Gearman
                                                                                                                                                • Go
                                                                                                                                                • Gunicorn
                                                                                                                                                • HAProxy
                                                                                                                                                • HDFS
                                                                                                                                                • HTTP
                                                                                                                                                • Jenkins
                                                                                                                                                • JVM
                                                                                                                                                • Lighttpd
                                                                                                                                                • Memcached
                                                                                                                                                • Mesos/Marathon
                                                                                                                                                • MongoDB
                                                                                                                                                • MySQL
                                                                                                                                                • NGINX and NGINX Plus
                                                                                                                                                • NTP
                                                                                                                                                • PGBouncer
                                                                                                                                                • PHP-FPM
                                                                                                                                                • Postfix
                                                                                                                                                • PostgreSQL
                                                                                                                                                • Prometheus
                                                                                                                                                • RabbitMQ
                                                                                                                                                • RedisDB
                                                                                                                                                • Supervisord
                                                                                                                                                • SNMP
                                                                                                                                                • TCP

You can also create custom app checks (see Optional: Configure a Custom App-Check, above).

                                                                                                                                                6.2.1 -

                                                                                                                                                Apache

The Apache HTTP Server is open-source web server software. If Apache is installed in your environment, the Sysdig agent will connect using Apache's mod_status module. You may need to edit the default entries in the agent configuration file to connect. See the Default Configuration, below.

                                                                                                                                                This page describes the default configuration settings, how to edit the configuration to collect additional information, the metrics available for integration, and a sample result in the Sysdig Monitor UI.

                                                                                                                                                Apache Setup

                                                                                                                                                Install mod_status on your Apache servers and enable ExtendedStatus.

The following configuration is required. If it is already present but commented out, un-comment the lines; otherwise, add the configuration.

                                                                                                                                                LoadModule status_module modules/mod_status.so
                                                                                                                                                ...
                                                                                                                                                
                                                                                                                                                <Location /server-status>
                                                                                                                                                    SetHandler server-status
                                                                                                                                                    Order Deny,Allow
                                                                                                                                                    Deny from all
                                                                                                                                                    Allow from localhost
                                                                                                                                                </Location>
                                                                                                                                                ...
                                                                                                                                                
                                                                                                                                                ExtendedStatus On
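
Note that the Order/Deny/Allow directives above use the Apache 2.2 syntax. If you run Apache 2.4 or later, a sketch of the equivalent access control (assuming the default mod_authz_core module) would be:

<Location /server-status>
    SetHandler server-status
    Require local
</Location>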
                                                                                                                                                

                                                                                                                                                Sysdig Agent Configuration

                                                                                                                                                Review how to edit dragent.yaml to Integrate or Modify Application Checks.

Apache exposes metrics through a common default endpoint. The process command name can be either apache2 or httpd. By default, the Sysdig agent looks for the process apache2. If it is named differently in your environment (e.g. httpd), edit the configuration file to match the process name, as shown in the example below.

                                                                                                                                                Default Configuration

                                                                                                                                                By default, Sysdig’s dragent.default.yaml uses the following code to connect with Apache and collect all metrics.

                                                                                                                                                app_checks:
                                                                                                                                                  - name: apache
                                                                                                                                                    check_module: apache
                                                                                                                                                    pattern:
                                                                                                                                                      comm: apache2
                                                                                                                                                    conf:
                                                                                                                                                      apache_status_url: "http://localhost:{port}/server-status?auto"
                                                                                                                                                    log_errors: false
                                                                                                                                                

                                                                                                                                                Example

If you need to change the process name, edit dragent.yaml as in the following example, updating comm to the value httpd.

                                                                                                                                                Remember! Never edit dragent.default.yaml directly; always edit only dragent.yaml.

                                                                                                                                                app_checks:
                                                                                                                                                  - name: apache
                                                                                                                                                    check_module: apache
                                                                                                                                                    pattern:
                                                                                                                                                      comm: httpd
                                                                                                                                                    conf:
                                                                                                                                                      apache_status_url: "http://localhost/server-status?auto"
                                                                                                                                                    log_errors: false
                                                                                                                                                

                                                                                                                                                Metrics Available

                                                                                                                                                The Apache metrics are listed in the metrics dictionary here: Apache Metrics.

                                                                                                                                                UI Examples

                                                                                                                                                6.2.2 -

                                                                                                                                                Apache Kafka

Apache Kafka is a distributed streaming platform used for building real-time data pipelines and streaming apps. It is horizontally scalable, fault-tolerant, extremely fast, and runs in production in thousands of companies. If Kafka is installed in your environment, the Sysdig agent will automatically connect. See the Default Configuration, below.

                                                                                                                                                The Sysdig agent automatically collects metrics from Kafka via JMX polling. You need to provide consumer names and topics in the agent config file to collect consumer-based Kafka metrics.

                                                                                                                                                This page describes the default configuration settings, how to edit the configuration to collect additional information, the metrics available for integration, and a sample result in the Sysdig Monitor UI.

                                                                                                                                                Kafka Setup

Kafka automatically exposes all metrics; you do not need to add anything on the Kafka instance.

Zstandard, one of the compression codecs available in the Kafka integration, is only included in Kafka versions 2.1.0 or newer. See also the Apache documentation.

                                                                                                                                                Sysdig Agent Configuration

                                                                                                                                                Review how to edit dragent.yaml to Integrate or Modify Application Checks.

Metrics from Kafka via JMX polling are already configured in the agent's default-settings configuration file. Metrics for consumers, however, require app checks that poll the Kafka and Zookeeper APIs. You need to provide consumer names and topics in the dragent.yaml file.

                                                                                                                                                Default Configuration

                                                                                                                                                Since consumer names and topics are environment-specific, a default configuration is not present in dragent.default.yaml.

                                                                                                                                                Refer to the following examples for adding Kafka checks to dragent.yaml.

                                                                                                                                                Remember! Never edit dragent.default.yaml directly; always edit only dragent.yaml.

                                                                                                                                                Example 1: Basic Configuration

                                                                                                                                                A basic example with sample consumer and topic names:

                                                                                                                                                app_checks:
                                                                                                                                                  - name: kafka
                                                                                                                                                    check_module: kafka_consumer
                                                                                                                                                    pattern:
                                                                                                                                                      comm: java
                                                                                                                                                      arg: kafka.Kafka
                                                                                                                                                    conf:
                                                                                                                                                      kafka_connect_str: "127.0.0.1:9092" # kafka address, usually localhost as we run the check on the same instance
                                                                                                                                                      zk_connect_str: "localhost:2181" # zookeeper address, may be different than localhost
                                                                                                                                                      zk_prefix: /
                                                                                                                                                      consumer_groups:
                                                                                                                                                        sample-consumer-1: # sample consumer name
                                                                                                                                                          sample-topic-1: [0, ] # sample topic name and partitions
                                                                                                                                                        sample-consumer-2: # sample consumer name
                                                                                                                                                          sample-topic-2: [0, 1, 2, 3] # sample topic name and partitions
                                                                                                                                                

                                                                                                                                                Example 2: Store Consumer Group Info (Kafka 9+)

                                                                                                                                                From Kafka 9 onwards, you can store consumer group config info inside Kafka itself for better performance.

                                                                                                                                                app_checks:
                                                                                                                                                  - name: kafka
                                                                                                                                                    check_module: kafka_consumer
                                                                                                                                                    pattern:
                                                                                                                                                      comm: java
                                                                                                                                                      arg: kafka.Kafka
                                                                                                                                                    conf:
                                                                                                                                                      kafka_connect_str: "localhost:9092"
                                                                                                                                                      zk_connect_str: "localhost:2181"
                                                                                                                                                      zk_prefix: /
                                                                                                                                                      kafka_consumer_offsets: true
                                                                                                                                                      consumer_groups:
                                                                                                                                                        sample-consumer-1: # sample consumer name
                                                                                                                                                          sample-topic-1: [0, ] # sample topic name and partitions
                                                                                                                                                

If the kafka_consumer_offsets entry is set to true, the app check will look for consumer offsets in Kafka. The app check will also look in Kafka if zk_connect_str is not set.

                                                                                                                                                Example 3: Aggregate Partitions at the Topic Level

To aggregate partitions at the topic level, use kafka_consumer_topics with aggregate_partitions: true.

In this case the app check aggregates the lag and offset values across partitions at the topic level, reducing the number of metrics collected.

Set aggregate_partitions: false to disable this aggregation. In that case, the app check reports lag and offset values for each individual partition.

                                                                                                                                                app_checks:
                                                                                                                                                  - name: kafka
                                                                                                                                                    check_module: kafka_consumer
                                                                                                                                                    pattern:
                                                                                                                                                      comm: java
                                                                                                                                                      arg: kafka.Kafka
                                                                                                                                                    conf:
                                                                                                                                                      kafka_connect_str: "localhost:9092"
                                                                                                                                                      zk_connect_str: "localhost:2181"
                                                                                                                                                      zk_prefix: /
                                                                                                                                                      kafka_consumer_offsets: true
                                                                                                                                                      kafka_consumer_topics:
                                                                                                                                                        aggregate_partitions: true
                                                                                                                                                      consumer_groups:
                                                                                                                                                        sample-consumer-1: # sample consumer name
                                                                                                                                                          sample-topic-1: [0, ] # sample topic name and partitions
                                                                                                                                                        sample-consumer-2: # sample consumer name
                                                                                                                                                          sample-topic-2: [0, 1, 2, 3] # sample topic name and partitions
                                                                                                                                                

                                                                                                                                                Example 4: Custom Tags

                                                                                                                                                Optional tags can be applied to every emitted metric, service check, and/or event.

                                                                                                                                                app_checks:
                                                                                                                                                  - name: kafka
                                                                                                                                                    check_module: kafka_consumer
                                                                                                                                                    pattern:
                                                                                                                                                      comm: java
                                                                                                                                                      arg: kafka.Kafka
                                                                                                                                                    conf:
                                                                                                                                                      kafka_connect_str: "localhost:9092"
                                                                                                                                                      zk_connect_str: "localhost:2181"
                                                                                                                                                      zk_prefix: /
                                                                                                                                                      consumer_groups:
                                                                                                                                                        sample-consumer-1: # sample consumer name
                                                                                                                                                          sample-topic-1: [0, ] # sample topic name and partitions
                                                                                                                                                    tags:  ["key_first_tag:value_1", "key_second_tag:value_2", "key_third_tag:value_3"]
                                                                                                                                                

                                                                                                                                                Example 5: SSL and Authentication

                                                                                                                                                If SSL and authentication are enabled on Kafka, use the following configuration.

                                                                                                                                                app_checks:
                                                                                                                                                  - name: kafka
                                                                                                                                                    check_module: kafka_consumer
                                                                                                                                                    pattern:
                                                                                                                                                      comm: java
                                                                                                                                                      arg: kafka.Kafka
                                                                                                                                                    conf:
                                                                                                                                                      kafka_consumer_offsets: true
                                                                                                                                                      kafka_connect_str: "127.0.0.1:9093"
                                                                                                                                                      zk_connect_str: "localhost:2181"
                                                                                                                                                      zk_prefix: /
                                                                                                                                                      consumer_groups:
                                                                                                                                                        test-group:
                                                                                                                                                          test: [0, ]
                                                                                                                                                          test-4: [0, 1, 2, 3]
                                                                                                                                                      security_protocol: SASL_SSL
                                                                                                                                                      sasl_mechanism: PLAIN
                                                                                                                                                      sasl_plain_username: <USERNAME>
                                                                                                                                                      sasl_plain_password: <PASSWORD>
                                                                                                                                                      ssl_check_hostname: true
                                                                                                                                                      ssl_cafile:  <SSL_CA_FILE_PATH>
                                                                                                                                                      #ssl_context: <SSL_CONTEXT>
                                                                                                                                                      #ssl_certfile: <CERT_FILE_PATH>
                                                                                                                                                      #ssl_keyfile: <KEY_FILE_PATH>
                                                                                                                                                      #ssl_password: <PASSWORD>
                                                                                                                                                      #ssl_crlfile: <SSL_FILE_PATH>
                                                                                                                                                

                                                                                                                                                Configuration Keywords and Descriptions

Keyword | Description | Default Value
------- | ----------- | -------------
security_protocol (str) | Protocol used to communicate with brokers. | PLAINTEXT
sasl_mechanism (str) | String picking the SASL mechanism when security_protocol is SASL_PLAINTEXT or SASL_SSL. | Currently only PLAIN is supported
sasl_plain_username (str) | Username for SASL PLAIN authentication. |
sasl_plain_password (str) | Password for SASL PLAIN authentication. |
ssl_context (ssl.SSLContext) | Pre-configured SSLContext for wrapping socket connections. If provided, all other ssl_* configurations will be ignored. | none
ssl_check_hostname (bool) | Flag to configure whether the SSL handshake should verify that the certificate matches the broker's hostname. | true
ssl_cafile (str) | Optional filename of the CA file to use in certificate verification. | none
ssl_certfile (str) | Optional filename of a PEM-format file containing the client certificate, as well as any CA certificates needed to establish the certificate's authenticity. | none
ssl_keyfile (str) | Optional filename containing the client private key. | none
ssl_password (str) | Optional password to be used when loading the certificate chain. | none
ssl_crlfile (str) | Optional filename containing the CRL to check for certificate expiration. By default, no CRL check is done. When a file is provided, only the leaf certificate will be checked against this CRL. The CRL can only be checked with 2.7.9+. | none

                                                                                                                                                Example 6: Regex for Consumer Groups and Topics

                                                                                                                                                As of Sysdig agent version 0.94, the Kafka app check has added optional regex (regular expression) support for Kafka consumer groups and topics.

                                                                                                                                                Regex Configuration:

                                                                                                                                                • No new metrics are added with this feature

• A new parameter, consumer_groups_regex, is added to specify regex for matching consumer groups and topics in Kafka. Consumer offsets stored in Zookeeper are not collected.

                                                                                                                                                • Regex for topics is optional. When not provided, all topics under the consumer will be reported.

                                                                                                                                                • The regex Python syntax is documented here: https://docs.python.org/3.7/library/re.html#regular-expression-syntax

• If both consumer_groups and consumer_groups_regex are provided at the same time, matched consumer groups from both parameters will be merged.

                                                                                                                                                Sample configuration:

                                                                                                                                                app_checks:
                                                                                                                                                  - name: kafka
                                                                                                                                                    check_module: kafka_consumer
                                                                                                                                                    pattern:
                                                                                                                                                      comm: java
                                                                                                                                                      arg: kafka.Kafka
                                                                                                                                                    conf:
                                                                                                                                                      kafka_connect_str: "localhost:9092"
                                                                                                                                                      zk_connect_str: "localhost:2181"
                                                                                                                                                      zk_prefix: /
                                                                                                                                                      kafka_consumer_offsets: true
                                                                                                                                                      # Regex can be provided in following format
                                                                                                                                                      # consumer_groups_regex:
                                                                                                                                                      #   'REGEX_1_FOR_CONSUMER_GROUPS':
                                                                                                                                                      #      - 'REGEX_1_FOR_TOPIC'
                                                                                                                                                      #      - 'REGEX_2_FOR_TOPIC'
                                                                                                                                                      consumer_groups_regex:
                                                                                                                                                        'consumer*':
                                                                                                                                                          - 'topic'
                                                                                                                                                          - '^topic.*'
                                                                                                                                                          - '.*topic$'
                                                                                                                                                          - '^topic.*'
                                                                                                                                                          - 'topic\d+'
                                                                                                                                                          - '^topic_\w+'
                                                                                                                                                

Examples:

| Regex | Description | Examples Matched | Examples NOT Matched |
| --- | --- | --- | --- |
| topic_\d+ | All strings containing topic followed by _ and one or more digit characters ([0-9]) | my-topic_1, topic_23, topic_5-dev | topic_x, my-topic-1, topic-123 |
| topic | All strings containing the topic keyword | topic_x, x_topic123 | xyz |
| consumer* | All strings containing the consumer keyword | consumer-1, sample-consumer, sample-consumer-2 | xyz |
| ^topic_\w+ | All strings starting with topic followed by _ and one or more word characters ([a-zA-Z0-9_]) | topic_12, topic_x, topic_xyz_123 | topic-12, x_topic, topic__xyz |
| ^topic.* | All strings starting with topic | topic-x, topic123 | x-topic, x_topic123 |
| .*topic$ | All strings ending with topic | x_topic, sampletopic | topic-1, x_topic123 |
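Before adding a pattern to consumer_groups_regex, you can sanity-check it with Python's re module, which uses the same syntax referenced above. The snippet below is a small illustration (not part of the agent) and assumes the patterns are evaluated as unanchored searches:

import re

# Evaluate the first row of the table above: topic_\d+ against the example strings.
samples = ["my-topic_1", "topic_23", "topic_5-dev", "topic_x", "my-topic-1", "topic-123"]
pattern = re.compile(r"topic_\d+")
for s in samples:
    # re.search matches anywhere in the string unless the pattern is anchored with ^ or $
    print(s, "matched" if pattern.search(s) else "not matched")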

                                                                                                                                                Metrics Available

                                                                                                                                                Kafka Consumer Metrics (App Checks)

                                                                                                                                                See Apache Kafka Consumer Metrics.

                                                                                                                                                JMX Metrics

                                                                                                                                                See Apache Kafka JMX Metrics.

                                                                                                                                                Result in the Monitor UI

                                                                                                                                                6.2.3 -

                                                                                                                                                Consul

Consul is a distributed service mesh to connect, secure, and configure services across any runtime platform and public or private cloud. If Consul is installed in your environment, the Sysdig agent will automatically connect and collect basic metrics. If the Consul Access Control List (ACL) is configured, you may need to edit the default entries to connect. Additional latency metrics can also be collected by modifying the default entries. See the Default Configuration, below.

                                                                                                                                                It’s easy! Sysdig automatically detects metrics from this app based on standard default configurations.

                                                                                                                                                This page describes the default configuration settings, how to edit the configuration to collect additional information, the metrics available for integration, and a sample result in the Sysdig Monitor UI.

                                                                                                                                                Consul Configuration

                                                                                                                                                Consul is ready to expose metrics without any special configuration.

                                                                                                                                                Sysdig Agent Configuration

                                                                                                                                                Review how to edit dragent.yaml to Integrate or Modify Application Checks.

                                                                                                                                                Default Configuration

By default, Sysdig’s dragent.default.yaml uses the following code to connect with Consul and collect basic metrics.

                                                                                                                                                app_checks:
                                                                                                                                                  - name: consul
                                                                                                                                                    pattern:
                                                                                                                                                      comm: consul
                                                                                                                                                    conf:
                                                                                                                                                      url: "http://localhost:8500"
                                                                                                                                                      catalog_checks: yes
                                                                                                                                                

With the default dragent.default.yaml configuration, the following metrics are available in the Sysdig Monitor UI:

                                                                                                                                                Metrics name
                                                                                                                                                consul.catalog.nodes_critical
                                                                                                                                                consul.catalog.nodes_passing
                                                                                                                                                consul.catalog.nodes_up
                                                                                                                                                consul.catalog.nodes_warning
                                                                                                                                                consul.catalog.total_nodes
                                                                                                                                                consul.catalog.services_critical
                                                                                                                                                consul.catalog.services_passing
                                                                                                                                                consul.catalog.services_up
                                                                                                                                                consul.catalog.services_warning
                                                                                                                                                consul.peers
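These catalog metrics are derived from Consul's HTTP API at the same URL configured above. As a rough illustration of where the data comes from, the sketch below queries the /v1/status/peers and /v1/health/checks endpoints directly; it uses the third-party requests library and is not part of the agent:

import requests

CONSUL_URL = "http://localhost:8500"  # same URL as in the app check configuration

# Raft peers (roughly what consul.peers reports)
peers = requests.get(f"{CONSUL_URL}/v1/status/peers").json()
print("peers:", len(peers))

# Health checks per registered service; the passing/warning/critical counts
# are aggregated from statuses like these.
for service in requests.get(f"{CONSUL_URL}/v1/catalog/services").json():
    checks = requests.get(f"{CONSUL_URL}/v1/health/checks/{service}").json()
    print(service, [c["Status"] for c in checks])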

Additional metrics and events can be collected by adding configuration to the dragent.yaml file. The ACL token must be provided if ACLs are enabled. See the following examples.

Remember! Never edit dragent.default.yaml directly; always edit only dragent.yaml.

                                                                                                                                                Example 1: Enable Leader Change Event

self_leader_check: When enabled, the node watches for itself to become the leader and emits an event when that happens. It can be enabled on all nodes.

                                                                                                                                                app_checks:
                                                                                                                                                  - name: consul
                                                                                                                                                    pattern:
                                                                                                                                                      comm: consul
                                                                                                                                                    conf:
                                                                                                                                                      url: "http://localhost:8500"
                                                                                                                                                      catalog_checks: yes
                                                                                                                                                      self_leader_check: yes
                                                                                                                                                    logs_enabled: true
                                                                                                                                                
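Conceptually, detecting a leader change amounts to comparing the cluster's current leader with the local node's own address. The sketch below shows the idea using Consul's /v1/status/leader and /v1/agent/self endpoints; it illustrates the concept and is not the agent's implementation:

import requests

CONSUL_URL = "http://localhost:8500"

# /v1/status/leader returns the address of the current Raft leader, e.g. "10.0.0.5:8300"
leader = requests.get(f"{CONSUL_URL}/v1/status/leader").json()
# /v1/agent/self describes the local node, including its advertised address
me = requests.get(f"{CONSUL_URL}/v1/agent/self").json()["Member"]["Addr"]
print("leader:", leader, "| this node:", me, "| is leader:", leader.startswith(me))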

                                                                                                                                                Example 2: Enable Latency Metrics

If the network_latency_checks flag is enabled, the Consul network coordinates are retrieved and latency is calculated for each node and between data centers.

                                                                                                                                                app_checks:
                                                                                                                                                  - name: consul
                                                                                                                                                    pattern:
                                                                                                                                                      comm: consul
                                                                                                                                                    conf:
                                                                                                                                                      url: "http://localhost:8500"
                                                                                                                                                      catalog_checks: yes
                                                                                                                                                      network_latency_checks: yes
                                                                                                                                                    logs_enabled: true
                                                                                                                                                

                                                                                                                                                With the above changes, you can see the following additional metrics:

                                                                                                                                                Metrics name
                                                                                                                                                consul.net.node.latency.min
                                                                                                                                                consul.net.node.latency.p25
                                                                                                                                                consul.net.node.latency.median
                                                                                                                                                consul.net.node.latency.p75
                                                                                                                                                consul.net.node.latency.p90
                                                                                                                                                consul.net.node.latency.p95
                                                                                                                                                consul.net.node.latency.p99
                                                                                                                                                consul.net.node.latency.max
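These latency values are estimated from Consul's network (Vivaldi) coordinates rather than measured directly. The sketch below retrieves coordinates from /v1/coordinate/nodes and applies the distance formula described in Consul's network-coordinates documentation; it is an approximation for illustration only and uses the third-party requests library:

import math
import requests

CONSUL_URL = "http://localhost:8500"

def estimated_rtt_seconds(a, b):
    # Euclidean distance between coordinate vectors plus per-node height,
    # with the optional adjustment terms, as documented by Consul.
    dist = math.sqrt(sum((x - y) ** 2 for x, y in zip(a["Vec"], b["Vec"])))
    rtt = dist + a["Height"] + b["Height"]
    adjusted = rtt + a["Adjustment"] + b["Adjustment"]
    return adjusted if adjusted > 0 else rtt

nodes = requests.get(f"{CONSUL_URL}/v1/coordinate/nodes").json()
if len(nodes) >= 2:
    a, b = nodes[0], nodes[1]
    rtt_ms = estimated_rtt_seconds(a["Coord"], b["Coord"]) * 1000
    print(a["Node"], "<->", b["Node"], round(rtt_ms, 2), "ms")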

                                                                                                                                                Example 3: Enable ACL Token

When the ACL system is enabled in Consul, the ACL agent token must be added to dragent.yaml in order to collect metrics.

                                                                                                                                                Follow Consul’s official documentation to Configure ACL, Bootstrap ACL and Create Agent Token.

                                                                                                                                                app_checks:
                                                                                                                                                  - name: consul
                                                                                                                                                    pattern:
                                                                                                                                                      comm: consul
                                                                                                                                                    conf:
                                                                                                                                                      url: "http://localhost:8500"
                                                                                                                                                      acl_token: "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" #Add agent token
                                                                                                                                                      catalog_checks: yes
                                                                                                                                                      logs_enabled: true
                                                                                                                                                
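To confirm that the token you place in dragent.yaml has sufficient permissions, you can pass it to the Consul API yourself with the X-Consul-Token header. A minimal check, using the requests library and a placeholder token:

import requests

CONSUL_URL = "http://localhost:8500"
ACL_TOKEN = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"  # the same agent token used in dragent.yaml

# A 200 response with node details indicates the token is accepted;
# a 403 indicates the token lacks the required agent/node read policies.
resp = requests.get(f"{CONSUL_URL}/v1/agent/self", headers={"X-Consul-Token": ACL_TOKEN})
print(resp.status_code)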

                                                                                                                                                Example 4: Collect Metrics from Non-Leader Node

                                                                                                                                                Required: Agent 9.6.0+

With agent 9.6.0 and later, you can use the configuration option single_node_install (optional; default: false). Set this option to true to run the app check on non-leader Consul nodes as well.

                                                                                                                                                app_checks:
                                                                                                                                                   - name: consul
                                                                                                                                                    pattern:
                                                                                                                                                      comm: consul
                                                                                                                                                    conf:
                                                                                                                                                      url: "http://localhost:8500"
                                                                                                                                                      catalog_checks: yes
                                                                                                                                                      single_node_install: true
                                                                                                                                                

                                                                                                                                                StatsD Metrics

                                                                                                                                                In addition to the metrics from the Sysdig app-check, there are many other metrics that Consul can send using StatsD. Those metrics will be automatically collected by the Sysdig agent’s StatsD integration if Consul is configured to send them.

Add statsd_address under the telemetry section of the Consul configuration file. The default configuration file location is /consul/config/local.json.

                                                                                                                                                {
                                                                                                                                                ...
                                                                                                                                                  "telemetry": {
                                                                                                                                                     "statsd_address": "127.0.0.1:8125"
                                                                                                                                                  }
                                                                                                                                                ...
                                                                                                                                                }
                                                                                                                                                

                                                                                                                                                See Telemetry Metrics for more details.
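Once Consul emits StatsD to 127.0.0.1:8125, the agent picks the metrics up like any other StatsD traffic. If you want to verify the listener independently of Consul, you can push a test metric over UDP; the metric name below is arbitrary:

import socket

# Send one test counter in StatsD wire format ("name:value|type") to the
# address configured in the telemetry block above.
sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
sock.sendto(b"consul.telemetry.test:1|c", ("127.0.0.1", 8125))
sock.close()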

                                                                                                                                                Metrics Available

                                                                                                                                                See Consul Metrics.

                                                                                                                                                Result in the Monitor UI

                                                                                                                                                6.2.4 -

                                                                                                                                                Couchbase

Couchbase Server is a distributed, open-source, NoSQL database engine. The core architecture is designed to simplify building modern applications with a flexible data model, high availability, high scalability, high performance, and advanced security. If Couchbase is installed in your environment, the Sysdig agent will automatically connect. If authentication is configured, you may need to edit the default entries to connect. See the Default Configuration, below.

                                                                                                                                                The Sysdig agent automatically collects all bucket and node metrics. You can also edit the configuration to collect query metrics.

                                                                                                                                                This page describes the default configuration settings, how to edit the configuration to collect additional information, the metrics available for integration, and a sample result in the Sysdig Monitor UI.

                                                                                                                                                Couchbase Setup

                                                                                                                                                Couchbase will automatically expose all metrics. You do not need to configure anything on the Couchbase instance.

                                                                                                                                                Sysdig Agent Configuration

                                                                                                                                                Review how to edit dragent.yaml to Integrate or Modify Application Checks.

                                                                                                                                                Default Configuration

                                                                                                                                                By default, Sysdig’s dragent.default.yaml uses the following code to connect with Couchbase and collect all bucket and node metrics.

                                                                                                                                                app_checks:
                                                                                                                                                  - name: couchbase
                                                                                                                                                    pattern:
                                                                                                                                                      comm: beam.smp
                                                                                                                                                      arg: couchbase
                                                                                                                                                      port: 8091
                                                                                                                                                    conf:
                                                                                                                                                      server: http://localhost:8091
                                                                                                                                                

If authentication is enabled, you need to edit the dragent.yaml file to connect with Couchbase. See Example 1.

                                                                                                                                                Remember! Never edit dragent.default.yaml directly; always edit only dragent.yaml.

                                                                                                                                                Example 1: Authentication

                                                                                                                                                Replace <username> and <password> with appropriate values and update the dragent.yaml file.

                                                                                                                                                app_checks:
                                                                                                                                                  - name: couchbase
                                                                                                                                                    pattern:
                                                                                                                                                      comm: beam.smp
                                                                                                                                                      arg: couchbase
                                                                                                                                                      port: 8091
                                                                                                                                                    conf:
                                                                                                                                                      server: http://localhost:8091
                                                                                                                                                      user: <username>
                                                                                                                                                      password: <password>
                                                                                                                                                      # The following block is optional and required only if the 'path' and
                                                                                                                                                      # 'port' need to be set to non-default values specified here
                                                                                                                                                      cbstats:
                                                                                                                                                        port: 11210
                                                                                                                                                        path: /opt/couchbase/bin/cbstats
                                                                                                                                                
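You can verify the credentials outside the agent by calling Couchbase's REST API with HTTP basic authentication; the bucket list endpoint used below is the same source the bucket metrics come from. The username and password are placeholders, matching the values in dragent.yaml:

import requests

USER, PASSWORD = "<username>", "<password>"  # same values as in dragent.yaml

# /pools/default/buckets lists every bucket along with its basic statistics.
resp = requests.get("http://localhost:8091/pools/default/buckets", auth=(USER, PASSWORD))
resp.raise_for_status()
print([bucket["name"] for bucket in resp.json()])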

                                                                                                                                                Example 2: Query Stats

                                                                                                                                                Additionally, you can configure query_monitoring_url to get query monitoring stats. This is available from Couchbase version 4.5. See Query Monitoring for more detail.

                                                                                                                                                app_checks:
                                                                                                                                                  - name: couchbase
                                                                                                                                                    pattern:
                                                                                                                                                      comm: beam.smp
                                                                                                                                                      arg: couchbase
                                                                                                                                                      port: 8091
                                                                                                                                                    conf:
                                                                                                                                                      server: http://localhost:8091
                                                                                                                                                      query_monitoring_url: http://localhost:8093
                                                                                                                                                
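The query monitoring URL points at the Couchbase query service (port 8093 by default), which exposes runtime statistics over HTTP. A quick way to confirm the endpoint is reachable is to fetch its vitals document; this sketch assumes the query service's /admin/vitals path and uses the requests library:

import requests

# The query service publishes request and memory statistics as a JSON document.
vitals = requests.get("http://localhost:8093/admin/vitals")
vitals.raise_for_status()
print(sorted(vitals.json().keys()))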

                                                                                                                                                Metrics Available

                                                                                                                                                See Couchbase Metrics.

                                                                                                                                                Result in the Monitor UI

                                                                                                                                                6.2.5 -

                                                                                                                                                Elasticsearch

Elasticsearch is an open-source, distributed document storage and search engine that stores and retrieves data structures in near real-time. Elasticsearch represents data as structured JSON documents and makes full-text search accessible via a RESTful API and web clients for languages like PHP, Python, and Ruby. It is also elastic in the sense that it is easy to scale horizontally: simply add more nodes to distribute the load. If Elasticsearch is installed in your environment, the Sysdig agent will automatically connect in most cases. See the Default Configuration, below.

                                                                                                                                                The Sysdig Agent automatically collects default metrics. You can also edit the configuration to collect Primary Shard stats.

                                                                                                                                                This page describes the default configuration settings, how to edit the configuration to collect additional information, the metrics available for integration, and a sample result in the Sysdig Monitor UI.

                                                                                                                                                Elasticsearch Setup

                                                                                                                                                Elasticsearch is ready to expose metrics without any special configuration.

                                                                                                                                                Sysdig Agent Configuration

                                                                                                                                                Review how to edit dragent.yaml to Integrate or Modify Application Checks.

                                                                                                                                                Default Configuration

                                                                                                                                                By default, Sysdig’s dragent.default.yaml uses the following code to connect with Elasticsearch and collect basic metrics.

                                                                                                                                                app_checks:
                                                                                                                                                  - name: elasticsearch
                                                                                                                                                    check_module: elastic
                                                                                                                                                    pattern:
                                                                                                                                                      port: 9200
                                                                                                                                                      comm: java
                                                                                                                                                    conf:
                                                                                                                                                      url: http://localhost:9200
                                                                                                                                                

To collect more metrics, you may need to change the default Elasticsearch settings in dragent.yaml:

                                                                                                                                                Remember! Never edit dragent.default.yaml directly; always edit only dragent.yaml.

                                                                                                                                                Example 1: Agent authentication to Elasticsearch Cluster with Authentication

                                                                                                                                                Password Authentication

                                                                                                                                                app_checks:
                                                                                                                                                  - name: elasticsearch
                                                                                                                                                    check_module: elastic
                                                                                                                                                    pattern:
                                                                                                                                                      port: 9200
                                                                                                                                                      comm: java
                                                                                                                                                    conf:
                                                                                                                                                      url: https://sysdigcloud-elasticsearch:9200
                                                                                                                                                      username: readonly
                                                                                                                                                      password: some_password
                                                                                                                                                      ssl_verify: false
                                                                                                                                                

                                                                                                                                                Certificate Authentication

                                                                                                                                                app_checks:
                                                                                                                                                   - name: elasticsearch
                                                                                                                                                     check_module: elastic
                                                                                                                                                     pattern:
                                                                                                                                                       port: 9200
                                                                                                                                                       comm: java
                                                                                                                                                     conf:
                                                                                                                                                       url: https://localhost:9200
                                                                                                                                                       ssl_cert: /tmp/certs/ssl.crt
                                                                                                                                                       ssl_key: /tmp/certs/ssl.key
                                                                                                                                                       ssl_verify: true
                                                                                                                                                

                                                                                                                                                ssl_cert: Path to the certificate chain used for validating the authenticity of the Elasticsearch server.

                                                                                                                                                ssl_key: Path to the certificate key used for authenticating to the Elasticsearch server.
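To confirm the credentials or certificates before restarting the agent, you can query the cluster health endpoint with the same parameters. The sketch below uses the third-party requests library and mirrors the two configurations above (basic auth with verification disabled, and client certificate with verification enabled):

import requests

# Password authentication (mirrors username/password plus ssl_verify: false above)
health = requests.get("https://sysdigcloud-elasticsearch:9200/_cluster/health",
                      auth=("readonly", "some_password"), verify=False)
print(health.json()["status"])

# Certificate authentication (mirrors ssl_cert/ssl_key plus ssl_verify: true above)
health = requests.get("https://localhost:9200/_cluster/health",
                      cert=("/tmp/certs/ssl.crt", "/tmp/certs/ssl.key"), verify=True)
print(health.json()["status"])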

Example 2: Enable Primary Shard Statistics

                                                                                                                                                app_checks:
                                                                                                                                                  - name: elasticsearch
                                                                                                                                                    check_module: elastic
                                                                                                                                                    pattern:
                                                                                                                                                      port: 9200
                                                                                                                                                      comm: java
                                                                                                                                                    conf:
                                                                                                                                                      url: http://localhost:9200
                                                                                                                                                      pshard_stats : true
                                                                                                                                                
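The primary shard statistics come from Elasticsearch's index stats API, where the primaries section aggregates numbers over primary shards only (the total section also includes replicas). A short sketch showing the raw values behind these metrics, using the requests library:

import requests

# _stats with no index name aggregates across all indices under "_all".
stats = requests.get("http://localhost:9200/_stats").json()
primaries = stats["_all"]["primaries"]
print("docs.count:", primaries["docs"]["count"])
print("flush.total:", primaries["flush"]["total"])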

                                                                                                                                                pshard-specific Metrics

                                                                                                                                                Enable pshard_stats to monitor the following additional metrics:

                                                                                                                                                Metric Name
                                                                                                                                                elasticsearch.primaries.flush.total
                                                                                                                                                elasticsearch.primaries.flush.total.time
                                                                                                                                                elasticsearch.primaries.docs.count
                                                                                                                                                elasticsearch.primaries.docs.deleted
                                                                                                                                                elasticsearch.primaries.get.current
                                                                                                                                                elasticsearch.primaries.get.exists.time
                                                                                                                                                elasticsearch.primaries.get.exists.total
                                                                                                                                                elasticsearch.primaries.get.missing.time
                                                                                                                                                elasticsearch.primaries.get.missing.total
                                                                                                                                                elasticsearch.primaries.get.time
                                                                                                                                                elasticsearch.primaries.get.total
                                                                                                                                                elasticsearch.primaries.indexing.delete.current
                                                                                                                                                elasticsearch.primaries.indexing.delete.time
                                                                                                                                                elasticsearch.primaries.indexing.delete.total
                                                                                                                                                elasticsearch.primaries.indexing.index.current
                                                                                                                                                elasticsearch.primaries.indexing.index.time
                                                                                                                                                elasticsearch.primaries.indexing.index.total
                                                                                                                                                elasticsearch.primaries.merges.current
                                                                                                                                                elasticsearch.primaries.merges.current.docs
                                                                                                                                                elasticsearch.primaries.merges.current.size
                                                                                                                                                elasticsearch.primaries.merges.total
                                                                                                                                                elasticsearch.primaries.merges.total.docs
                                                                                                                                                elasticsearch.primaries.merges.total.size
                                                                                                                                                elasticsearch.primaries.merges.total.time
                                                                                                                                                elasticsearch.primaries.refresh.total
                                                                                                                                                elasticsearch.primaries.refresh.total.time
                                                                                                                                                elasticsearch.primaries.search.fetch.current
                                                                                                                                                elasticsearch.primaries.search.fetch.time
                                                                                                                                                elasticsearch.primaries.search.fetch.total
                                                                                                                                                elasticsearch.primaries.search.query.current
                                                                                                                                                elasticsearch.primaries.search.query.time
                                                                                                                                                elasticsearch.primaries.search.query.total
                                                                                                                                                elasticsearch.primaries.store.size
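
These primaries.* metrics correspond to the primaries section of the Elasticsearch index stats API. To inspect the raw values the check reads, you can query the API directly (a verification sketch, assuming Elasticsearch is listening on localhost:9200):

$ curl 'http://localhost:9200/_stats?pretty'
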

Example 3: Enable Primary Shard Statistics for Master Node Only

                                                                                                                                                app_checks:
                                                                                                                                                  - name: elasticsearch
                                                                                                                                                    check_module: elastic
                                                                                                                                                    pattern:
                                                                                                                                                      port: 9200
                                                                                                                                                      comm: java
                                                                                                                                                    conf:
                                                                                                                                                      url: http://localhost:9200
                                                                                                                                                      pshard_stats_master_node_only: true
                                                                                                                                                

                                                                                                                                                Note that this option takes precedence over the pshard_stats option (above). This means that if the following configuration were put into place, only the pshard_stats_master_node_only option would be respected:

                                                                                                                                                app_checks:
                                                                                                                                                  - name: elasticsearch
                                                                                                                                                    check_module: elastic
                                                                                                                                                    pattern:
                                                                                                                                                      port: 9200
                                                                                                                                                      comm: java
                                                                                                                                                    conf:
                                                                                                                                                      url: http://localhost:9200
                                                                                                                                                      pshard_stats: true
                                                                                                                                                      pshard_stats_master_node_only: true
                                                                                                                                                

                                                                                                                                                All Available Metrics

With the default settings and the pshard_stats setting enabled, the complete list of available metrics can be found here: Elasticsearch Metrics.

                                                                                                                                                Result in the Monitor UI

                                                                                                                                                6.2.6 -

                                                                                                                                                etcd

etcd is a distributed key-value store that provides a reliable way to store data across a cluster of machines. If etcd is installed in your environment, the Sysdig agent will automatically connect. If you are using an etcd version older than version 2, you may need to edit the default entries to connect. See the Default Configuration section, below.

                                                                                                                                                The Sysdig Agent automatically collects all metrics.

                                                                                                                                                This page describes the default configuration settings, how to edit the configuration to collect additional information, the metrics available for integration, and a sample result in the Sysdig Monitor UI.

                                                                                                                                                etcd Versions

                                                                                                                                                etcd v2

                                                                                                                                                The app check functionality described on this page supports etcd metrics from APIs that are specific to v2 of etcd.

                                                                                                                                                These APIs are present in etcd v3 as well, but export metrics only for the v2 datastores. For example, after upgrading from etcd v2 to v3, if the v2 datastores are not migrated to v3, the v2 APIs will continue exporting metrics for these datastores. If the v2 datastores are migrated to v3, the v2 APIs will no longer export metrics for these datastores.

                                                                                                                                                etcd v3

                                                                                                                                                etcd v3 uses a native Prometheus exporter. The exporter only exports metrics for v3 datastores. For example, after upgrading from etcd v2 to v3, if v2 datastores are not migrated to v3, the Prometheus endpoint will not export metrics for these datastores. The Prometheus endpoint will only export metrics for datastores migrated to v3 or datastores created after the upgrade to v3.

                                                                                                                                                If your etcd version is v3 or higher, use the information on this page to enable an integration: Integrate Prometheus Metrics.

                                                                                                                                                etcd Setup

                                                                                                                                                etcd will automatically expose all metrics. You do not need to add anything to the etcd instance.
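
To confirm that the etcd v2 statistics APIs used by this check are reachable, you can query them directly (a verification sketch, assuming a default local etcd listening on port 2379):

$ curl http://localhost:2379/v2/stats/self
$ curl http://localhost:2379/v2/stats/store
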

                                                                                                                                                Sysdig Agent Configuration

                                                                                                                                                Review how to Edit dragent.yaml to Integrate or Modify Application Checks.

                                                                                                                                                The default agent configuration for etcd will look for the application on localhost, port 2379. No customization is required.

                                                                                                                                                Default Configuration

                                                                                                                                                By default, Sysdig’s dragent.default.yaml uses the following code to connect with etcd and collect all metrics.

                                                                                                                                                app_checks:
                                                                                                                                                  - name: etcd
                                                                                                                                                    pattern:
                                                                                                                                                      comm: etcd
                                                                                                                                                    conf:
                                                                                                                                                      url: "http://localhost:2379"
                                                                                                                                                

etcd (before version 2) does not listen on localhost, so the Sysdig agent will not connect to it automatically. In such cases, you may need to edit the dragent.yaml file with the correct hostname and port. See Example 1.

                                                                                                                                                Alternatively, you can add the option -bind-addr 0.0.0.0:4001 to the etcd command line to allow the agent to connect.
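
For example, a minimal sketch of such an invocation (any other flags your deployment needs are omitted here):

$ etcd -bind-addr 0.0.0.0:4001
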

                                                                                                                                                Remember! Never edit dragent.default.yaml directly; always edit only dragent.yaml.

                                                                                                                                                Example 1

You can use {hostname} and {port} as tokens in the conf: section. This is the recommended setting for Kubernetes customers.

                                                                                                                                                app_checks:
                                                                                                                                                  - name: etcd
                                                                                                                                                    pattern:
                                                                                                                                                      comm: etcd
                                                                                                                                                    conf:
                                                                                                                                                      url: "http://{hostname}:{port}"
                                                                                                                                                

Alternatively, you can specify the real hostname and port.

                                                                                                                                                app_checks:
                                                                                                                                                  - name: etcd
                                                                                                                                                    pattern:
                                                                                                                                                      comm: etcd
                                                                                                                                                    conf:
                                                                                                                                                      url: "http://my_hostname:4000"  #etcd service listening on port 4000
                                                                                                                                                

                                                                                                                                                Example 2: SSL/TLS Certificate

If encryption is used, add the appropriate SSL/TLS entries. Provide the correct paths to the SSL/TLS key and certificates used in the etcd configuration in the ssl_keyfile, ssl_certfile, and ssl_ca_certs fields.

                                                                                                                                                app_checks:
                                                                                                                                                  - name: etcd
                                                                                                                                                    pattern:
                                                                                                                                                      comm: etcd
                                                                                                                                                    conf:
                                                                                                                                                      url: "https://localhost:PORT"
                                                                                                                                                      ssl_keyfile:  /etc/etcd/peer.key  # Path to key file
                                                                                                                                                      ssl_certfile: /etc/etcd/peer.crt  # Path to SSL certificate
                                                                                                                                                      ssl_ca_certs: /etc/etcd/ca.crt    # Path to CA certificate
                                                                                                                                                      ssl_cert_validation: True
                                                                                                                                                

                                                                                                                                                Metrics Available

                                                                                                                                                See etcd Metrics.

                                                                                                                                                Result in the Monitor UI

                                                                                                                                                6.2.7 -

                                                                                                                                                fluentd

Fluentd is an open source data collector that allows unifying data collection and consumption to better use and understand data. Fluentd structures data as JSON as much as possible, to unify all facets of processing log data: collecting, filtering, buffering, and outputting logs across multiple sources and destinations. If Fluentd is installed in your environment, the Sysdig agent will automatically connect. See the Default Configuration section, below. The Sysdig agent automatically collects default metrics.

                                                                                                                                                This page describes the default configuration settings, how to edit the configuration to collect additional information, the metrics available for integration, and a sample result in the Sysdig Monitor UI.

                                                                                                                                                Fluentd Setup

Fluentd can be installed as a package (.deb, .rpm, etc.), depending on the OS flavor, or it can be deployed in a Docker container. Fluentd installation is documented here. The examples on this page use a .deb package installation.

After installing Fluentd, add the following lines to fluentd.conf:

                                                                                                                                                <source>
                                                                                                                                                  @type monitor_agent
                                                                                                                                                  bind 0.0.0.0
                                                                                                                                                  port 24220
                                                                                                                                                </source>
                                                                                                                                                

                                                                                                                                                Sysdig Agent Configuration

                                                                                                                                                Review how to Edit dragent.yaml to Integrate or Modify Application Checks.

                                                                                                                                                Default Configuration

By default, Sysdig’s dragent.default.yaml uses the following code to connect with Fluentd and collect default metrics.

(If you use a non-standard port for monitor_agent, you can configure it as usual in the agent config file, dragent.yaml.)

                                                                                                                                                  - name: fluentd
                                                                                                                                                    pattern:
                                                                                                                                                      comm: fluentd
                                                                                                                                                    conf:
                                                                                                                                                      monitor_agent_url: http://localhost:24220/api/plugins.json
                                                                                                                                                

                                                                                                                                                Remember! Never edit dragent.default.yaml directly; always edit only dragent.yaml.

                                                                                                                                                Example

                                                                                                                                                To generate the metric data, it is necessary to generate some logs through an application. In the following example, HTTP is used. (For more information, see Life of a Fluentd event.)
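
For the command below to return a successful response, Fluentd must also have an HTTP input source (and a matching output) configured. A minimal sketch of such an addition to fluentd.conf, in which port 8888 and the test.cycle tag are illustrative choices rather than required values:

<source>
  @type http
  port 8888
  bind 0.0.0.0
</source>

<match test.cycle>
  @type stdout
</match>
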

Execute the following command in the Fluentd environment:

                                                                                                                                                $ curl -i -X POST -d 'json={"action":"login","user":2}' http://localhost:8888/test.cycle
                                                                                                                                                

                                                                                                                                                Expected output: (Note: Here the status code is 200 OK, as HTTP traffic is successfully generated; it will vary per application.)

                                                                                                                                                HTTP/1.1 200 OK
                                                                                                                                                Content-type: text/plain
                                                                                                                                                Connection: Keep-Alive
                                                                                                                                                Content-length: 0
                                                                                                                                                

                                                                                                                                                Metrics Available

                                                                                                                                                See fluentd Metrics.

                                                                                                                                                Result in the Monitor UI

                                                                                                                                                6.2.8 -

                                                                                                                                                Go

Golang expvar is the standard interface designed to instrument and expose custom metrics from a Go program via HTTP. In addition to custom metrics, it also exports some metrics out of the box, such as command line arguments, allocation stats, heap stats, and garbage collection metrics.

                                                                                                                                                This page describes the default configuration settings, how to edit the configuration to collect additional information, the metrics available for integration, and a sample result in the Sysdig Monitor UI.

                                                                                                                                                Go_expvar Setup

Because it is difficult to determine whether an application is written in Go by looking at process names or arguments, you will need to create a custom entry for your Go application in the user settings config file. Be sure your app has expvar enabled, which means importing the expvar module and starting an HTTP server from inside your app, as follows:

                                                                                                                                                import (
                                                                                                                                                    ...
                                                                                                                                                    "net/http"
                                                                                                                                                    "expvar"
                                                                                                                                                    ...
                                                                                                                                                )
                                                                                                                                                
// If your application has no HTTP server running for the DefaultServeMux,
// you'll have to start one for expvar to use, for example
// by adding the following to your init function
func init() {
    go http.ListenAndServe(":8080", nil)
}
                                                                                                                                                
                                                                                                                                                // You can also expose variables that are specific to your application
                                                                                                                                                // See http://golang.org/pkg/expvar/ for more information
                                                                                                                                                
                                                                                                                                                var (
                                                                                                                                                    exp_points_processed = expvar.NewInt("points_processed")
                                                                                                                                                )
                                                                                                                                                
func processPoints(p RawPoints) {
    points_processed, err := parsePoints(p)
    if err != nil {
        return
    }
    exp_points_processed.Add(points_processed)
    ...
}
                                                                                                                                                

                                                                                                                                                See also the following blog entry: How to instrument Go code with custom expvar metrics.
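
Once the HTTP server is running, you can check what expvar exposes by querying the endpoint directly; the JSON response includes cmdline, memstats, and any custom variables you have published, such as points_processed from the snippet above (assuming the server listens on port 8080):

$ curl http://localhost:8080/debug/vars
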

                                                                                                                                                Sysdig Agent Configuration

                                                                                                                                                Review how to Edit dragent.yaml to Integrate or Modify Application Checks.

                                                                                                                                                Default Configuration

No default configuration for Go is provided in the Sysdig agent dragent.default.yaml file. You must edit the agent config file as described in the example below.

                                                                                                                                                Remember! Never edit dragent.default.yaml directly; always edit only dragent.yaml.

                                                                                                                                                Example

                                                                                                                                                Add the following code sample to dragent.yaml to collect Go metrics.

                                                                                                                                                app_checks:
                                                                                                                                                  - name: go-expvar
                                                                                                                                                    check_module: go_expvar
                                                                                                                                                    pattern:
      comm: go-expvar
                                                                                                                                                    conf:
                                                                                                                                                      expvar_url: "http://localhost:8080/debug/vars" # automatically match url using the listening port
                                                                                                                                                      # Add custom metrics if you want
                                                                                                                                                      metrics:
                                                                                                                                                        - path: system.numberOfSeconds
                                                                                                                                                          type: gauge # gauge or rate
                                                                                                                                                          alias: go_expvar.system.numberOfSeconds
                                                                                                                                                        - path: system.lastLoad
                                                                                                                                                          type: gauge
                                                                                                                                                          alias: go_expvar.system.lastLoad
                                                                                                                                                        - path: system.numberOfLoginsPerUser/.* # You can use / to get inside the map and use .* to match any record inside
                                                                                                                                                          type: gauge
                                                                                                                                                        - path: system.allLoad/.*
                                                                                                                                                          type: gauge
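
To illustrate how the path entries above map to Go code: the paths refer to the JSON fields of a published expvar variable named system. A hypothetical, self-contained sketch of a program publishing such a variable (only the field names come from the example configuration; the values are placeholders):

package main

import (
    "expvar"
    "net/http"
    "time"
)

var start = time.Now()

func main() {
    // Publish one expvar variable named "system"; its JSON fields are what
    // the path entries above (system.numberOfSeconds, system.allLoad,
    // system.numberOfLoginsPerUser/.*) refer to.
    expvar.Publish("system", expvar.Func(func() interface{} {
        return map[string]interface{}{
            "numberOfSeconds": time.Since(start).Seconds(),
            "lastLoad":        0.42, // placeholder value
            "numberOfLoginsPerUser": map[string]int{ // nested map, matched by /.*
                "alice": 3,
                "bob":   1,
            },
            "allLoad": []float64{0.42, 0.35, 0.30},
        }
    }))

    // expvar registers the /debug/vars handler on the DefaultServeMux.
    http.ListenAndServe(":8080", nil)
}
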
                                                                                                                                                

                                                                                                                                                Metrics Available

                                                                                                                                                See Go Metrics.

                                                                                                                                                Result in the Monitor UI

                                                                                                                                                6.2.9 -

                                                                                                                                                HAProxy

HAProxy provides a high-availability load balancer and proxy server for TCP- and HTTP-based applications, spreading requests across multiple servers.

                                                                                                                                                The Sysdig agent automatically collects haproxy metrics. You can also edit the agent configuration file to collect additional metrics.

                                                                                                                                                This page describes the default configuration settings, how to edit the configuration to collect additional information, the metrics available for integration, and a sample result in the Sysdig Monitor UI.

                                                                                                                                                HAProxy Setup

The stats feature must be enabled on your HAProxy instance. This can be done by adding the following entry to the HAProxy configuration file, /etc/haproxy/haproxy.cfg:

                                                                                                                                                listen stats
                                                                                                                                                  bind :1936
                                                                                                                                                  mode http
                                                                                                                                                  stats enable
                                                                                                                                                  stats hide-version
                                                                                                                                                  stats realm Haproxy\ Statistics
                                                                                                                                                  stats uri /haproxy_stats
                                                                                                                                                  stats auth stats:stats
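
You can verify that the stats endpoint is reachable before configuring the agent (a verification sketch using the port, URI, and credentials from the snippet above; the ;csv suffix requests the statistics in CSV form):

$ curl -u stats:stats 'http://localhost:1936/haproxy_stats;csv'
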
                                                                                                                                                

                                                                                                                                                Sysdig Agent Configuration

                                                                                                                                                Review how to Edit dragent.yaml to Integrate or Modify Application Checks.

                                                                                                                                                Default Configuration

                                                                                                                                                By default, Sysdig’s dragent.default.yaml uses the following code to connect with HAProxy and collect haproxy metrics:

                                                                                                                                                app_checks:
                                                                                                                                                  - name: haproxy
                                                                                                                                                    pattern:
                                                                                                                                                      comm: haproxy
                                                                                                                                                      port: 1936
                                                                                                                                                    conf:
                                                                                                                                                      username: stats
                                                                                                                                                      password: stats
                                                                                                                                                      url: http://localhost:1936/
                                                                                                                                                      collect_aggregates_only: True
                                                                                                                                                    log_errors: false
                                                                                                                                                

You can get a few additional status metrics by editing the configuration in dragent.yaml, as in the following examples.

Remember! Never edit dragent.default.yaml directly; always edit only dragent.yaml.

                                                                                                                                                Example: Collect Status Metrics Per Service

Enable the collect_status_metrics flag to collect the metrics haproxy.count_per_status and haproxy.backend_hosts.

                                                                                                                                                app_checks:
                                                                                                                                                  - name: haproxy
                                                                                                                                                    pattern:
                                                                                                                                                      comm: haproxy
                                                                                                                                                      port: 1936
                                                                                                                                                    conf:
                                                                                                                                                      username: stats
                                                                                                                                                      password: stats
                                                                                                                                                      url: http://localhost:1936/haproxy_stats
                                                                                                                                                      collect_aggregates_only: True
                                                                                                                                                      collect_status_metrics: True
                                                                                                                                                    log_errors: false
                                                                                                                                                

                                                                                                                                                Example: Collect Status Metrics Per Host

                                                                                                                                                Enable:

• collect_status_metrics_by_host: Instructs the check to collect status metrics per host, instead of per service. This only applies if collect_status_metrics is true.

                                                                                                                                                • tag_service_check_by_host: When this flag is set, the hostname is also passed with the service check ‘haproxy.backend_up’.

                                                                                                                                                  By default, only the backend name and service name are associated with it.

                                                                                                                                                app_checks:
                                                                                                                                                  - name: haproxy
                                                                                                                                                    pattern:
                                                                                                                                                      comm: haproxy
                                                                                                                                                      port: 1936
                                                                                                                                                    conf:
                                                                                                                                                      username: stats
                                                                                                                                                      password: stats
                                                                                                                                                      url: http://localhost:1936/haproxy_stats
                                                                                                                                                      collect_aggregates_only: True
                                                                                                                                                      collect_status_metrics: True
                                                                                                                                                      collect_status_metrics_by_host: True
                                                                                                                                                      tag_service_check_by_host: True
                                                                                                                                                    log_errors: false
                                                                                                                                                

                                                                                                                                                Example: Collect HAProxy Stats by UNIX Socket

                                                                                                                                                If you’ve configured HAProxy to report statistics to a UNIX socket, you can set the url in dragent.yaml to the socket’s path (e.g., unix:///var/run/haproxy.sock).

                                                                                                                                                Set up HAProxy Config File

                                                                                                                                                Edit your HAProxy configuration file ( /etc/haproxy/haproxy.cfg ) to add the following lines to the global section:

                                                                                                                                                global
                                                                                                                                                    [snip]
                                                                                                                                                       stats socket /run/haproxy/admin.sock mode 660 level admin
                                                                                                                                                       stats timeout 30s
                                                                                                                                                    [snip]
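
To verify that HAProxy is answering on the socket before pointing the agent at it, you can query it directly (a verification sketch, assuming socat is installed and using the socket path from the snippet above):

$ echo "show info" | socat stdio /run/haproxy/admin.sock
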
                                                                                                                                                

                                                                                                                                                Edit dragent.yaml url

                                                                                                                                                Add the socket URL from the HAProxy config to the dragent.yaml file:

app_checks:
  - name: haproxy
    pattern:
      comm: haproxy
    conf:
      url: unix:///run/haproxy/admin.sock
    log_errors: True
                                                                                                                                                

                                                                                                                                                Metrics Available

                                                                                                                                                See HAProxy Metrics.

                                                                                                                                                Example: Enable Service Check

                                                                                                                                                Required: Agent 9.6.0+

enable_service_check: Enables or disables the haproxy.backend.up service check.

When set to false, all service checks will be disabled.

                                                                                                                                                app_checks:
                                                                                                                                                  - name: haproxy
                                                                                                                                                    pattern:
                                                                                                                                                      comm: haproxy
                                                                                                                                                      port: 1936
                                                                                                                                                    conf:
                                                                                                                                                      username: stats
                                                                                                                                                      password: stats
                                                                                                                                                      url: http://localhost:1936/haproxy_stats
                                                                                                                                                      collect_aggregates_only: true
                                                                                                                                                      enable_service_check: false
                                                                                                                                                

                                                                                                                                                Example: Filter Metrics Per Service

                                                                                                                                                Required: Agent 9.6.0+

services_exclude (Optional): Name or regex of services to be excluded.

services_include (Optional): Name or regex of services to be included.

If a service is excluded with services_exclude, it can still be included explicitly with services_include. The following example excludes all services except service_1 and service_2.

                                                                                                                                                app_checks:
                                                                                                                                                  - name: haproxy
                                                                                                                                                    pattern:
                                                                                                                                                      comm: haproxy
                                                                                                                                                      port: 1936
                                                                                                                                                    conf:
                                                                                                                                                      username: stats
                                                                                                                                                      password: stats
                                                                                                                                                      url: http://localhost:1936/haproxy_stats
                                                                                                                                                      collect_aggregates_only: true
                                                                                                                                                      services_exclude:
                                                                                                                                                        - ".*"
                                                                                                                                                      services_include:
                                                                                                                                                        - "service_1"
                                                                                                                                                        - "service_2"
                                                                                                                                                

                                                                                                                                                Additional Options: active_tag, headers

                                                                                                                                                Required: Agent 9.6.0+

                                                                                                                                                There are two additional configuration options introduced with agent 9.6.0:

                                                                                                                                                • active_tag (Optional. Default: false):

                                                                                                                                                  Adds tag active to backend metrics that belong to the active pool of connections.

                                                                                                                                                • headers (Optional):

                                                                                                                                                  Extra headers such as auth-token can be passed along with requests.

                                                                                                                                                app_checks:
                                                                                                                                                  - name: haproxy
                                                                                                                                                    pattern:
                                                                                                                                                      comm: haproxy
                                                                                                                                                      port: 1936
                                                                                                                                                    conf:
                                                                                                                                                      username: stats
                                                                                                                                                      password: stats
                                                                                                                                                      url: http://localhost:1936/haproxy_stats
                                                                                                                                                      collect_aggregates_only: true
                                                                                                                                                      active_tag: true
                                                                                                                                                      headers:
                                                                                                                                                        <HEADER_NAME>: <HEADER_VALUE>
                                                                                                                                                        <HEADER_NAME>: <HEADER_VALUE>
                                                                                                                                                

                                                                                                                                                Result in the Monitor UI

                                                                                                                                                6.2.10 -

                                                                                                                                                HTTP

                                                                                                                                                The HTTP check monitors HTTP-based applications for URL availability.

                                                                                                                                                This page describes the default configuration settings, how to edit the configuration to collect additional information, the metrics available for integration, and a sample result in the Sysdig Monitor UI.

                                                                                                                                                HTTP Setup

                                                                                                                                                You do not need to configure anything on HTTP-based applications for the Sysdig agent to connect.

                                                                                                                                                Sysdig Agent Configuration

                                                                                                                                                Review how to Edit dragent.yaml to Integrate or Modify Application Checks.

                                                                                                                                                Default Configuration

No default entry is present in dragent.default.yaml for the HTTP check. You need to add an entry to dragent.yaml as shown in the following examples.

                                                                                                                                                Never edit dragent.default.yaml directly; always edit only dragent.yaml.

                                                                                                                                                Example 1

First, identify the process pattern (comm:). It must match an actively running process for the HTTP check to work. Sysdig recommends that the process be the one serving the URL being checked.

If the URL is remote from the agent, use a process that is always running, such as “systemd”.

                                                                                                                                                Confirm the “comm” value using the following command:

                                                                                                                                                cat /proc/1/comm
                                                                                                                                                

Add the following entry to the dragent.yaml file and modify the 'name:', 'comm:', and 'url:' parameters as needed:

                                                                                                                                                app_checks:
                                                                                                                                                  - name: EXAMPLE_WEBSITE
                                                                                                                                                    check_module: http_check
                                                                                                                                                    pattern:
                                                                                                                                                      comm:  systemd
                                                                                                                                                    conf:
                                                                                                                                                      url: https://www.MYEXAMPLE.com
                                                                                                                                                
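If the URL is served by a process on the same host, point the pattern at that process instead of systemd. The following is a minimal sketch, assuming (hypothetically) that nginx is the local process serving the site; substitute your own process name and URL:

app_checks:
  - name: LOCAL_WEBSITE
    check_module: http_check
    pattern:
      comm: nginx
    conf:
      url: http://localhost
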

                                                                                                                                                Example 2

                                                                                                                                                There are multiple configuration options available with the HTTP check. A full list is provided in the table following Example 2. These keys should be listed under the conf: section of the configuration in Example 1.

app_checks:
  - name: EXAMPLE_WEBSITE
    check_module: http_check
    pattern:
      comm: systemd
    conf:
      url: https://www.MYEXAMPLE.com
      # timeout: 1
      # method: get
      # data:
      #   <KEY>: <VALUE>
      # content_match: '<REGEX>'
      # reverse_content_match: false
      # username: <USERNAME>
      # ntlm_domain: <DOMAIN>
      # password: <PASSWORD>
      # client_cert: /opt/client.crt
      # client_key: /opt/client.key
      # http_response_status_code: (1|2|3)\d\d
      # include_content: false
      # collect_response_time: true
      # disable_ssl_validation: true
      # ignore_ssl_warning: false
      # ca_certs: /etc/ssl/certs/ca-certificates.crt
      # check_certificate_expiration: true
      # days_warning: <THRESHOLD_DAYS>
      # check_hostname: true
      # ssl_server_name: <HOSTNAME>
      # headers:
      #   Host: alternative.host.example.com
      #   X-Auth-Token: <AUTH_TOKEN>
      # skip_proxy: false
      # allow_redirects: true
      # include_default_headers: true
      # tags:
      #   - <KEY_1>:<VALUE_1>
      #   - <KEY_2>:<VALUE_2>
                                                                                                                                                

                                                                                                                                                Key

                                                                                                                                                Description

                                                                                                                                                url

                                                                                                                                                The URL to test.

                                                                                                                                                timeout

                                                                                                                                                The time in seconds to allow for a response.

                                                                                                                                                method

                                                                                                                                                The HTTP method. This setting defaults to GET, though many other HTTP methods are supported, including POST and PUT.

                                                                                                                                                data

The data option is only available when using the POST method. Data should be included as key-value pairs and will be sent in the body of the request (see the sketch after this table).

                                                                                                                                                content_match

                                                                                                                                                A string or Python regular expression. The HTTP check will search for this value in the response and will report as DOWN if the string or expression is not found.

                                                                                                                                                reverse_content_match

When true, reverses the behavior of the content_match option; i.e., the HTTP check will report as DOWN if the string or expression in content_match IS found. The default is false.

                                                                                                                                                username & password

                                                                                                                                                If your service uses basic authentication, you can provide the username and password here.

                                                                                                                                                http_response_status_code

                                                                                                                                                A string or Python regular expression for an HTTP status code. This check will report DOWN for any status code that does not match. This defaults to 1xx, 2xx and 3xx HTTP status codes. For example: 401 or 4\d\d.

                                                                                                                                                include_content

                                                                                                                                                When set to true, the check will include the first 200 characters of the HTTP response body in notifications. The default value is false.

                                                                                                                                                collect_response_time

                                                                                                                                                By default, the check will collect the response time (in seconds) as the metric network.http.response_time. To disable, set this value to false.

                                                                                                                                                disable_ssl_validation

This setting will skip SSL certificate validation and is enabled by default. If you require SSL certificate validation, set this to false. This option is only used when gathering the response time/aliveness from the specified endpoint. Note that this setting does not apply to the check_certificate_expiration option.

                                                                                                                                                ignore_ssl_warning

                                                                                                                                                When SSL certificate validation is enabled (see setting above), this setting allows you to disable security warnings.

                                                                                                                                                ca_certs

This setting allows you to override the default certificate path as specified in init_config.

                                                                                                                                                check_certificate_expiration

                                                                                                                                                When check_certificate_expiration is enabled, the service check will check the expiration date of the SSL certificate.

                                                                                                                                                Note that this will cause the SSL certificate to be validated, regardless of the value of the disable_ssl_validation setting.

                                                                                                                                                days_warning

When check_certificate_expiration is enabled, this setting raises a warning alert when the SSL certificate is within the specified number of days of expiration.

                                                                                                                                                check_hostname

                                                                                                                                                When check_certificate_expiration is enabled, this setting will raise a warning if the hostname on the SSL certificate does not match the host of the given URL.

                                                                                                                                                headers

This parameter allows you to send additional headers with the request, e.g. X-Auth-Token: <AUTH_TOKEN>.

                                                                                                                                                skip_proxy

                                                                                                                                                If set, the check will bypass proxy settings and attempt to reach the check URL directly. This defaults to false.

                                                                                                                                                allow_redirects

                                                                                                                                                This setting allows the service check to follow HTTP redirects and defaults to true.

                                                                                                                                                tags

                                                                                                                                                A list of arbitrary tags that will be associated with the check.
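
To illustrate how several of these keys combine, here is a minimal uncommented sketch. The endpoint path, POST payload, match pattern, and tag are hypothetical placeholders, not values taken from a real deployment:

app_checks:
  - name: EXAMPLE_WEBSITE
    check_module: http_check
    pattern:
      comm: systemd
    conf:
      url: https://www.MYEXAMPLE.com/health
      timeout: 5
      method: post
      data:
        ping: "1"
      content_match: '"status": "ok"'
      http_response_status_code: 2\d\d
      headers:
        X-Auth-Token: <AUTH_TOKEN>
      tags:
        - environment:staging

This check reports DOWN if the POST request times out, the status code is not 2xx, or the response body does not contain the content_match pattern.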

                                                                                                                                                Metrics Available

                                                                                                                                                HTTP metrics concern response time and SSL certificate expiry information.

                                                                                                                                                See HTTP Metrics.

                                                                                                                                                Service Checks

                                                                                                                                                http.can_connect:

                                                                                                                                                Returns DOWN when any of the following occur:

• the request to the URL times out

• the response code is 4xx/5xx, or it does not match the pattern provided in http_response_status_code

                                                                                                                                                • the response body does not contain the pattern in content_match

                                                                                                                                                • reverse_content_match is true and the response body does contain the pattern in content_match

• the URI contains https, disable_ssl_validation is false, and the SSL connection cannot be validated

Otherwise, the check returns UP.

The http.can_connect check can be segmented by URL.

                                                                                                                                                http.ssl_cert:

                                                                                                                                                The check returns:

                                                                                                                                                • DOWN if the URL’s certificate has already expired

                                                                                                                                                • WARNING if the URL’s certificate expires in less than days_warning days

• UP otherwise.

                                                                                                                                                To disable this check, set check_certificate_expiration to false.
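
For example, a minimal sketch that enables certificate checking for the placeholder site from Example 1; the 30-day warning threshold is an arbitrary illustration:

app_checks:
  - name: EXAMPLE_WEBSITE
    check_module: http_check
    pattern:
      comm: systemd
    conf:
      url: https://www.MYEXAMPLE.com
      check_certificate_expiration: true
      days_warning: 30
      check_hostname: true
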

                                                                                                                                                Result in the Monitor UI

                                                                                                                                                6.2.11 -

                                                                                                                                                Jenkins

Jenkins is an open-source automation server that helps automate parts of the software development process, enabling continuous integration and facilitating the technical aspects of continuous delivery. It supports version control tools (such as Subversion, Git, and Mercurial), can execute Apache Ant, Apache Maven, and SBT-based projects, and can run shell scripts and Windows batch commands. If Jenkins is installed in your environment, the Sysdig agent will automatically connect and collect all Jenkins metrics. See the Default Configuration section, below.

                                                                                                                                                This page describes the default configuration settings, the metrics available for integration, and a sample result in the Sysdig Monitor UI.

                                                                                                                                                Jenkins Setup

                                                                                                                                                Requires the standard Jenkins server setup with one or more Jenkins Jobs running on it.

                                                                                                                                                Sysdig Agent Configuration

                                                                                                                                                Review how to Edit dragent.yaml to Integrate or Modify Application Checks.

                                                                                                                                                Remember! Never edit dragent.default.yaml directly; always edit only dragent.yaml.

                                                                                                                                                Default Configuration

                                                                                                                                                By default, Sysdig’s dragent.default.yaml uses the following code to connect with Jenkins and collect basic metrics.

  - name: jenkins
    pattern:
      comm: java
      port: 50000
    conf:
      name: default
      jenkins_home: /var/lib/jenkins  # this depends on your environment
                                                                                                                                                
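As the comment notes, jenkins_home depends on your environment. For example, the official Jenkins Docker image keeps its data under /var/jenkins_home, so a containerized deployment might override the path in dragent.yaml along these lines (a sketch, not a drop-in configuration):

app_checks:
  - name: jenkins
    pattern:
      comm: java
      port: 50000
    conf:
      name: default
      jenkins_home: /var/jenkins_home
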

                                                                                                                                                Jenkins Folders Plugin

By default, the Sysdig agent does not monitor jobs under folders created using the Folders plugin.

Set jobs_folder_depth to monitor these jobs. Job folders are scanned recursively for jobs until the designated folder depth is reached. The default value is 1.

                                                                                                                                                app_checks:
                                                                                                                                                  - name: jenkins
                                                                                                                                                    pattern:
                                                                                                                                                      comm: java
                                                                                                                                                      port: 50000
                                                                                                                                                    conf:
                                                                                                                                                      name: default
                                                                                                                                                      jenkins_home: /var/lib/jenkins
                                                                                                                                                      jobs_folder_depth: 3
                                                                                                                                                

                                                                                                                                                Metrics Available

The following metrics are available only after one or more Jenkins jobs have run. They cover queue size, job duration, and job waiting time.

                                                                                                                                                See Jenkins Metrics.

                                                                                                                                                Result in the Monitor UI

                                                                                                                                                6.2.12 -

                                                                                                                                                Lighttpd

Lighttpd is a secure, fast, standards-compliant, and very flexible web server that has been optimized for high-performance environments. It has a very low memory footprint compared to other web servers and manages CPU load efficiently. Its advanced feature set (FastCGI, CGI, Auth, output compression, URL rewriting, and more) makes Lighttpd well suited to servers that suffer from load problems. If Lighttpd is installed in your environment, the Sysdig agent will automatically connect and collect the default metrics. See the Default Configuration section, below.

                                                                                                                                                This page describes the default configuration settings, how to edit the configuration to collect additional information, the metrics available for integration, and a sample result in the Sysdig Monitor UI.

                                                                                                                                                At this time, the Sysdig app check for Lighttpd supports Lighttpd version 1.x.x only.

                                                                                                                                                Lighttpd Setup

                                                                                                                                                For Lighttpd, the status page must be enabled. Add mod_status in the /etc/lighttpd/lighttpd.conf config file:

                                                                                                                                                server.modules = ( ..., "mod_status", ... )
                                                                                                                                                

Then configure an endpoint for it. If, for security purposes, you want to open the status page only to users on the local network, add the following lines to the /etc/lighttpd/lighttpd.conf file:

                                                                                                                                                $HTTP["remoteip"] == "127.0.0.1/8" {
                                                                                                                                                    status.status-url = "/server-status"
                                                                                                                                                  }
                                                                                                                                                

If you want the endpoint to be open to remote users with authentication, enable the mod_auth module in the /etc/lighttpd/lighttpd.conf config file:

                                                                                                                                                server.modules = ( ..., "mod_auth", ... )
                                                                                                                                                

                                                                                                                                                Then you can add the auth.require parameter in the /etc/lighttpd/lighttpd.conf config file:

                                                                                                                                                auth.require = ( "/server-status" => ( "method"  => ... , "realm"   => ... , "require" => ... ) )
                                                                                                                                                

For more information on the auth.require parameter, see the Lighttpd documentation.

                                                                                                                                                Sysdig Agent Configuration

                                                                                                                                                Review how to Edit dragent.yaml to Integrate or Modify Application Checks.

                                                                                                                                                Default Configuration

                                                                                                                                                By default, Sysdig’s dragent.default.yaml uses the following code to connect with Lighttpd and collect basic metrics.

                                                                                                                                                app_checks:
                                                                                                                                                  - name: lighttpd
                                                                                                                                                    pattern:
                                                                                                                                                      comm: lighttpd
                                                                                                                                                    conf:
                                                                                                                                                      lighttpd_status_url: "http://localhost:{port}/server-status?auto"
                                                                                                                                                    log_errors: false
                                                                                                                                                
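If you published the status page at a non-default path or port in lighttpd.conf, override lighttpd_status_url in dragent.yaml to match. A sketch, assuming a hypothetical status URL of /lighttpd-status served on port 8080:

app_checks:
  - name: lighttpd
    pattern:
      comm: lighttpd
    conf:
      lighttpd_status_url: "http://localhost:8080/lighttpd-status?auto"
    log_errors: false
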

                                                                                                                                                Metrics Available

                                                                                                                                                These metrics are supported for Lighttpd version 1.x.x only. Lighttpd version 2.x.x is being built and is NOT ready for use as of this publication.

                                                                                                                                                See Lighttpd Metrics.

                                                                                                                                                Result in the Monitor UI

                                                                                                                                                6.2.13 -

                                                                                                                                                Memcached

Memcached is an in-memory key-value store for small chunks of arbitrary data (strings, objects) produced by database calls, API calls, or page rendering. If Memcached is installed in your environment, the Sysdig agent will automatically connect. See the Default Configuration section, below. The Sysdig agent automatically collects basic metrics. You can also edit the configuration to collect additional metrics related to items and slabs.

                                                                                                                                                This page describes the default configuration settings, how to edit the configuration to collect additional information, the metrics available for integration, and a sample result in the Sysdig Monitor UI.

                                                                                                                                                Memcached Setup

Memcached automatically exposes all metrics. You do not need to configure anything on the Memcached instance.

                                                                                                                                                Sysdig Agent Configuration

                                                                                                                                                Review how to Edit dragent.yaml to Integrate or Modify Application Checks.

                                                                                                                                                Default Configuration

                                                                                                                                                By default, Sysdig’s dragent.default.yaml uses the following code to connect with Memcached and collect basic metrics:

                                                                                                                                                app_checks:
                                                                                                                                                  - name: memcached
                                                                                                                                                    check_module: mcache
                                                                                                                                                    pattern:
                                                                                                                                                      comm: memcached
                                                                                                                                                    conf:
                                                                                                                                                      url: localhost
                                                                                                                                                      port: "{port}"
                                                                                                                                                

                                                                                                                                                Additional metrics can be collected by editing Sysdig’s configuration file dragent.yaml. If SASL is enabled, authentication parameters must be added to dragent.yaml.

                                                                                                                                                Remember! Never edit dragent.default.yaml directly; always edit only dragent.yaml.

                                                                                                                                                Example 1: Additional Metrics

memcache.items.* and memcache.slabs.* metrics can be collected by setting flags in the options section, as follows. Either value can be set to false if you do not want to collect those metrics.

                                                                                                                                                app_checks:
                                                                                                                                                  - name: memcached
                                                                                                                                                    check_module: mcache
                                                                                                                                                    pattern:
                                                                                                                                                      comm: memcached
                                                                                                                                                    conf:
                                                                                                                                                      url: localhost
                                                                                                                                                      port: "{port}"
                                                                                                                                                    options:
                                                                                                                                                      items: true       # Default is false
                                                                                                                                                      slabs: true       # Default is false
                                                                                                                                                

                                                                                                                                                Example 2: SASL

SASL authentication can be enabled for Memcached (see instructions here). If it is enabled, credentials must be provided in the username and password fields, as shown below.

                                                                                                                                                app_checks:
                                                                                                                                                  - name: memcached
                                                                                                                                                    check_module: mcache
                                                                                                                                                    pattern:
                                                                                                                                                      comm: memcached
                                                                                                                                                    conf:
                                                                                                                                                      url: localhost
                                                                                                                                                      port: "{port}"
                                                                                                                                                      username: <username>
      # Some Memcached versions support <username>@<hostname>.
      # If Memcached is installed as a container, the hostname of the Memcached container is used as the username.
                                                                                                                                                      password: <password>
                                                                                                                                                

                                                                                                                                                Metrics Available

                                                                                                                                                See Memcached Metrics.

                                                                                                                                                Result in the Monitor UI

                                                                                                                                                6.2.14 -

                                                                                                                                                Mesos/Marathon

                                                                                                                                                Mesos is built using the same principles as the Linux kernel, only at a different level of abstraction. The Mesos kernel runs on every machine and provides applications (e.g., Hadoop, Spark, Kafka, Elasticsearch) with APIs for resource management and scheduling across entire datacenter and cloud environments. The Mesos metrics are divided into master and agent. Marathon is a production-grade container orchestration platform for Apache Mesos.

                                                                                                                                                If Mesos and Marathon are installed in your environment, the Sysdig agent will automatically connect and start collecting metrics. You may need to edit the default entries to add a custom configuration if the default does not work. See the Default Configuration section, below.

                                                                                                                                                This page describes the default configuration settings, how to edit the configuration to collect additional information, the metrics available for integration, and a sample result in the Sysdig Monitor UI.

                                                                                                                                                Mesos/Marathon Setup

                                                                                                                                                Both Mesos and Marathon will automatically expose all metrics. You do not need to add anything to the Mesos/Marathon instance.

                                                                                                                                                Sysdig Agent Configuration

                                                                                                                                                Review how to Edit dragent.yaml to Integrate or Modify Application Checks.

The Sysdig agent has separate entries for mesos-master, mesos-slave, and marathon in its configuration file. The default entries in Sysdig’s dragent.default.yaml collect all metrics for Mesos but only basic metrics for Marathon. You may need to add configuration to collect additional Marathon metrics.

                                                                                                                                                Default Configuration

                                                                                                                                                In the URLs for mesos-master and mesos-slave, {mesos_url} will be replaced with either the hostname of the auto-detected mesos master/slave (if auto-detection is enabled), or with an explicit value from mesos_state_uri otherwise.

                                                                                                                                                In the URLs for marathon, {marathon_url} will be replaced with the hostname of the first configured/discovered Marathon framework.

                                                                                                                                                For all Mesos and Marathon apps, {auth_token} will either be blank or an auto-generated token obtained via the /acs/api/v1/auth/login endpoint.
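
If auto-detection does not locate your Mesos master, you can set mesos_state_uri explicitly at the top level of dragent.yaml (outside app_checks). A minimal sketch, using a hypothetical master address that you should replace with your own endpoint:

# dragent.yaml (top level)
# Hypothetical address; point this at your Mesos master.
mesos_state_uri: http://10.0.0.5:5050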

                                                                                                                                                Mesos Master

                                                                                                                                                app_checks:
                                                                                                                                                  - name: mesos-master
                                                                                                                                                    check_module: mesos_master
                                                                                                                                                    interval: 30
                                                                                                                                                    pattern:
                                                                                                                                                      comm: mesos-master
                                                                                                                                                    conf:
                                                                                                                                                      url: "http://localhost:5050"
                                                                                                                                                    auth_token: "{auth_token}"
                                                                                                                                                    mesos_creds: "{mesos_creds}"
                                                                                                                                                

                                                                                                                                                Mesos Agent

                                                                                                                                                app_checks:
                                                                                                                                                
                                                                                                                                                  - name: mesos-slave
                                                                                                                                                    check_module: mesos_slave
                                                                                                                                                    interval: 30
                                                                                                                                                    pattern:
                                                                                                                                                      comm: mesos-slave
                                                                                                                                                    conf:
                                                                                                                                                      url: "http://localhost:5051"
                                                                                                                                                    auth_token: "{auth_token}"
                                                                                                                                                    mesos_creds: "{mesos_creds}"
                                                                                                                                                

                                                                                                                                                Marathon

                                                                                                                                                app_checks:
                                                                                                                                                
                                                                                                                                                  - name: marathon
                                                                                                                                                    check_module: marathon
                                                                                                                                                    interval: 30
                                                                                                                                                    pattern:
                                                                                                                                                      arg: mesosphere.marathon.Main
                                                                                                                                                    conf:
                                                                                                                                                      url: "{marathon_url}"
                                                                                                                                                    auth_token: "{auth_token}"
                                                                                                                                                    marathon_creds: "{marathon_creds}"
                                                                                                                                                

                                                                                                                                                Remember! Never edit dragent.default.yaml directly; always edit dragent.yaml.

Example: Marathon Full Metrics

Enable the full_metrics flag in the conf section to collect all metrics for Marathon, as shown in the configuration below.

                                                                                                                                                The following additional metrics are collected with this configuration:

                                                                                                                                                • marathon.cpus

                                                                                                                                                • marathon.disk

                                                                                                                                                • marathon.instances

                                                                                                                                                • marathon.mem

                                                                                                                                                app_checks:
                                                                                                                                                
                                                                                                                                                  - name: marathon
                                                                                                                                                    check_module: marathon
                                                                                                                                                    interval: 30
                                                                                                                                                    pattern:
                                                                                                                                                      arg: mesosphere.marathon.Main
    conf:
      url: "{marathon_url}"
      full_metrics: true
                                                                                                                                                    auth_token: "{auth_token}"
                                                                                                                                                    marathon_creds: "{marathon_creds}"
                                                                                                                                                

                                                                                                                                                Metrics Available

                                                                                                                                                See Mesos Master Metrics.

                                                                                                                                                See Mesos Agent Metrics.

                                                                                                                                                See Marathon Metrics.

                                                                                                                                                Result in the Monitor UI

                                                                                                                                                Mesos Master

                                                                                                                                                Mesos Agent

                                                                                                                                                Marathon

                                                                                                                                                6.2.15 -

                                                                                                                                                MongoDB

                                                                                                                                                MongoDB is an open-source database management system (DBMS) that uses a document-oriented database model that supports various forms of data. If MongoDB is installed in your environment, the Sysdig agent will automatically connect and collect basic metrics (if authentication is not used). You may need to edit the default entries to connect and collect additional metrics. See the Default Configuration section, below.

                                                                                                                                                This page describes the default configuration settings, how to edit the configuration to collect additional information, the metrics available for integration, and a sample result in the Sysdig Monitor UI.

                                                                                                                                                MongoDB Setup

                                                                                                                                                Create a read-only user for the Sysdig agent.

                                                                                                                                                # Authenticate as the admin user.
                                                                                                                                                use admin
                                                                                                                                                db.auth("admin", "<YOUR_MONGODB_ADMIN_PASSWORD>")
                                                                                                                                                
                                                                                                                                                # On MongoDB 2.x, use the addUser command.
                                                                                                                                                db.addUser("sysdig-cloud", "sysdig-cloud-password", true)
                                                                                                                                                
                                                                                                                                                # On MongoDB 3.x or higher, use the createUser command.
                                                                                                                                                db.createUser({
                                                                                                                                                  "user":"sysdig-cloud",
                                                                                                                                                  "pwd": "sysdig-cloud-password",
                                                                                                                                                  "roles" : [
                                                                                                                                                    {role: 'read', db: 'admin' },
                                                                                                                                                    {role: 'clusterMonitor', db: 'admin'},
                                                                                                                                                    {role: 'read', db: 'local' }
                                                                                                                                                  ]
                                                                                                                                                })
                                                                                                                                                

                                                                                                                                                Sysdig Agent Configuration

                                                                                                                                                Review how to Edit dragent.yaml to Integrate or Modify Application Checks.

                                                                                                                                                Default Configuration

                                                                                                                                                By default, Sysdig’s dragent.default.yaml uses the following code to connect with MongoDB.

                                                                                                                                                app_checks:
                                                                                                                                                  - name: mongodb
                                                                                                                                                    check_module: mongo
                                                                                                                                                    pattern:
                                                                                                                                                      comm: mongod
                                                                                                                                                    conf:
                                                                                                                                                      server: "mongodb://localhost:{port}/admin"
                                                                                                                                                

The default MongoDB entry should work without modification if authentication is not configured. If you have enabled password authentication, the entry needs to be changed.

Some metrics are not available by default. Additional configuration must be provided to collect them, as shown in the following examples.

                                                                                                                                                Remember! Never edit dragent.default.yaml directly; always edit only dragent.yaml.

                                                                                                                                                Example 1: With Authentication

Replace <username> and <password> with the actual username and password.

                                                                                                                                                app_checks:
                                                                                                                                                  - name: mongodb
                                                                                                                                                    check_module: mongo
                                                                                                                                                    pattern:
                                                                                                                                                      comm: mongod
                                                                                                                                                    conf:
                                                                                                                                                      server: mongodb://<username>:<password>@localhost:{port}/admin
                                                                                                                                                      replica_check: true
                                                                                                                                                

                                                                                                                                                Example 2: Additional Metrics

Some metrics are not collected by default. They can be collected by adding an additional_metrics section under the mongodb app check configuration in the dragent.yaml file.

Available options are:

• collection: Metrics of the specified collections

• metrics.commands: Use of database commands

• tcmalloc: TCMalloc memory allocator metrics

• top: Usage statistics for each collection

                                                                                                                                                app_checks:
                                                                                                                                                  - name: mongodb
                                                                                                                                                    check_module: mongo
                                                                                                                                                    pattern:
                                                                                                                                                      comm: mongod
                                                                                                                                                    conf:
                                                                                                                                                      server: mongodb://<username>:<password>@localhost:{port}/admin
                                                                                                                                                      replica_check: true
                                                                                                                                                      additional_metrics:
                                                                                                                                                        - collection
                                                                                                                                                        - metrics.commands
                                                                                                                                                        - tcmalloc
                                                                                                                                                        - top
                                                                                                                                                

The following table maps metric prefixes to their corresponding entries under additional_metrics in dragent.yaml:

Metric prefix               Entry under additional_metrics
mongodb.collection          collection
mongodb.usage.commands      top
mongodb.usage.getmore       top
mongodb.usage.insert        top
mongodb.usage.queries       top
mongodb.usage.readLock      top
mongodb.usage.writeLock     top
mongodb.usage.remove        top
mongodb.usage.total         top
mongodb.usage.update        top
mongodb.tcmalloc            tcmalloc
mongodb.metrics.commands    metrics.commands

                                                                                                                                                Example 3: Collections Metrics

                                                                                                                                                MongoDB stores documents in collections. Collections are analogous to tables in relational databases. The Sysdig agent by default does not collect the following collections metrics:

• collections: The list of MongoDB collections to be polled by the agent. Metrics are collected for the specified collections. This option requires the collection entry under additional_metrics in the dragent.yaml file, which is the flag that enables collection metrics.

• collections_indexes_stats: Collects index access metrics for every index in every collection in the collections list. The default value is false. These metrics are available starting with MongoDB v3.2.

For the agent to poll them, configure the dragent.yaml file and add the corresponding entries to the conf section, as follows.

                                                                                                                                                app_checks:
                                                                                                                                                  - name: mongodb
                                                                                                                                                    check_module: mongo
                                                                                                                                                    pattern:
                                                                                                                                                      comm: mongod
                                                                                                                                                    conf:
                                                                                                                                                      server: mongodb://<username>:<password>@localhost:{port}/admin
                                                                                                                                                      replica_check: true
                                                                                                                                                      additional_metrics:
                                                                                                                                                        - collection
                                                                                                                                                        - metrics.commands
                                                                                                                                                        - tcmalloc
                                                                                                                                                        - top
                                                                                                                                                      collections:
                                                                                                                                                        - <LIST_COLLECTIONS>
                                                                                                                                                      collections_indexes_stats: true
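
For illustration only, a configuration that polls two hypothetical collections named orders and sessions (replace them with your own collection names) could look like this:

app_checks:
  - name: mongodb
    check_module: mongo
    pattern:
      comm: mongod
    conf:
      server: mongodb://<username>:<password>@localhost:{port}/admin
      additional_metrics:
        - collection              # flag that enables collection metrics
      collections:
        - orders                  # hypothetical collection name
        - sessions                # hypothetical collection name
      collections_indexes_stats: true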
                                                                                                                                                

                                                                                                                                                Configure SSL for MongoDB App Check

You can secure the app check connection to MongoDB by establishing an SSL connection. To enable secure communication, set the ssl option in dragent.yaml to true. In an advanced deployment with multiple MongoDB instances, you also need to include a custom CA certificate or client certificate and other additional configuration.

                                                                                                                                                Basic SSL Connection

                                                                                                                                                In a basic SSL connection:

                                                                                                                                                • A single MongoDB instance is running on the host.

• The SSL connection uses no advanced features, such as a custom CA certificate or client certificate.

                                                                                                                                                To establish a basic SSL connection between the agent and the MongoDB instance:

                                                                                                                                                1. Open the dragent.yaml file.

                                                                                                                                                2. Configure the SSL entries as follows:

                                                                                                                                                  app_checks:
                                                                                                                                                    - name: mongodb
                                                                                                                                                      check_module: mongo
                                                                                                                                                      pattern:
                                                                                                                                                        comm: mongod
                                                                                                                                                      conf:
                                                                                                                                                        server: "mongodb://<HOSTNAME>:{port}/admin"
                                                                                                                                                        ssl: true
                                                                                                                                                        # ssl_cert_reqs: 0    # Disable SSL validation
                                                                                                                                                  

                                                                                                                                                  To disable SSL validation, set ssl_cert_reqs to 0. This setting is equivalent to ssl_cert_reqs=CERT_NONE.

                                                                                                                                                Advanced SSL Connection

                                                                                                                                                In an advanced SSL connection:

• Advanced features, such as a custom CA certificate or client certificate, are configured.

• One or more MongoDB instances are running on the host. The agent is installed as one of the following:

                                                                                                                                                  • Container

                                                                                                                                                  • Service

                                                                                                                                                Prerequisites

                                                                                                                                                Set up the following:

                                                                                                                                                • Custom CA certificate

                                                                                                                                                • Client SSL verification

                                                                                                                                                • SSL validation

(Optional) SSL Configuration Parameters

• ssl_certfile: The certificate file used to identify the local connection with MongoDB.

• ssl_keyfile: The private key file used to identify the local connection with MongoDB. Omit this option if the key is included in ssl_certfile.

• ssl_cert_reqs: Specifies whether a certificate is required from the MongoDB server, and whether it is validated if provided. Possible values are:

  • 0 for ssl.CERT_NONE: certificates are ignored.

  • 1 for ssl.CERT_OPTIONAL: certificates are not required, but are validated if provided.

  • 2 for ssl.CERT_REQUIRED: certificates are required and validated.

• ssl_ca_certs: A file containing concatenated certificate authority (CA) certificates, used to validate the certificate presented by the MongoDB server. Mostly used when the server certificate is self-signed.

                                                                                                                                                Sysdig Agent as a Container

1. If the Sysdig agent is installed as a container, start it with an extra volume containing the SSL files referenced in the agent configuration. For example:

                                                                                                                                                  # extra parameter added: -v /etc/ssl:/etc/ssl
  docker run -d --name sysdig-agent --restart always --privileged --net host --pid host \
    -e ACCESS_KEY=xxxxxxxxxxxxx -e SECURE=true -e TAGS=example_tag:example_value \
    -v /var/run/docker.sock:/host/var/run/docker.sock \
    -v /dev:/host/dev \
    -v /proc:/host/proc:ro \
    -v /boot:/host/boot:ro \
    -v /lib/modules:/host/lib/modules:ro \
    -v /usr:/host/usr:ro \
    -v /etc/ssl:/etc/ssl \
    --shm-size=512m \
    sysdig/agent
                                                                                                                                                  
                                                                                                                                                2. Open the dragent.yaml file and configure the SSL entries:

                                                                                                                                                  app_checks:
                                                                                                                                                    - name: mongodb
                                                                                                                                                      check_module: mongo
                                                                                                                                                      pattern:
                                                                                                                                                        comm: mongod
                                                                                                                                                      conf:
                                                                                                                                                        server: "mongodb://<HOSTNAME>:{port}/admin"
                                                                                                                                                        ssl: true
                                                                                                                                                        # ssl_ca_certs: </path/to/ca/certificate>
                                                                                                                                                        # ssl_cert_reqs: 0    # Disable SSL validation
                                                                                                                                                        # ssl_certfile: </path/to/client/certfile>
                                                                                                                                                        # ssl_keyfile: </path/to/client/keyfile>
                                                                                                                                                  

                                                                                                                                                Sysdig Agent as a Process

1. If the Sysdig agent is installed as a process, store the SSL files on the host and provide their paths in the agent configuration.

                                                                                                                                                  app_checks:
                                                                                                                                                    - name: mongodb
                                                                                                                                                      check_module: mongo
                                                                                                                                                      pattern:
                                                                                                                                                        comm: mongod
                                                                                                                                                      conf:
                                                                                                                                                        server: "mongodb://<HOSTNAME>:{port}/admin"
                                                                                                                                                        ssl: true
                                                                                                                                                        # ssl_ca_certs: </path/to/ca/certificate>
                                                                                                                                                        # ssl_cert_reqs: 0    # Disable SSL validation
                                                                                                                                                        # ssl_certfile: </path/to/client/certfile>
                                                                                                                                                        # ssl_keyfile: </path/to/client/keyfile>
                                                                                                                                                  

                                                                                                                                                  See optional SSL configuration parameters for information on SSL certificate files.
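
For reference, a fully filled-in advanced configuration might look like the following sketch. The hostname and certificate paths are hypothetical; substitute the files you stored on the host or mounted into the agent container:

app_checks:
  - name: mongodb
    check_module: mongo
    pattern:
      comm: mongod
    conf:
      server: "mongodb://db.example.local:{port}/admin"   # hypothetical hostname
      ssl: true
      ssl_ca_certs: /etc/ssl/mongo/ca-cert.pem            # custom CA bundle (hypothetical path)
      ssl_certfile: /etc/ssl/mongo/client-cert.pem        # client certificate (hypothetical path)
      ssl_keyfile: /etc/ssl/mongo/client-key.pem          # client private key (hypothetical path)
      ssl_cert_reqs: 2                                    # require and validate the server certificate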

                                                                                                                                                Multi-MongoDB Setup

                                                                                                                                                In a multi-MongoDB setup, multiple MongoDB instances are running on a single host. You can configure either a basic or an advanced SSL connection individually for each MongoDB instance.

                                                                                                                                                Store SSL Files

In an advanced connection, a different SSL certificate is used for each MongoDB instance on the same host, and the files are stored in separate directories. For example, the SSL files for two MongoDB instances can be stored under a mount point as follows (a directory sketch is shown after the list):

                                                                                                                                                • Mount point is /etc/ssl/

                                                                                                                                                • Files for instance 1 are stored in  /etc/ssl/mongo1/

                                                                                                                                                • Files for instance 2 are stored in  /etc/ssl/mongo2/
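
For reference, here is a minimal sketch of that layout on disk. The certificate file names (ca-cert-1, ssl_certificate-1.pem, and so on) are only examples and should match the files you actually deploy:

  # List the per-instance SSL files (example file names)
  ls /etc/ssl/mongo1/
  # ca-cert-1  ssl_certificate-1.pem
  ls /etc/ssl/mongo2/
  # ca-cert-2  ssl_certificate-2.pem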

                                                                                                                                                Configure the Agent
                                                                                                                                                1. Open the dragent.yaml file.

                                                                                                                                                2. Configure the SSL entries as follows:

                                                                                                                                                  app_checks:
                                                                                                                                                    - name: mongodb-ssl-1
                                                                                                                                                      check_module: mongo
                                                                                                                                                      pattern:
                                                                                                                                                        comm: mongod
                                                                                                                                                        args: ssl_certificate-1.pem
                                                                                                                                                      conf:
                                                                                                                                                        server: "mongodb://<HOSTNAME|Certificate_CN>:{port}/admin"
                                                                                                                                                        ssl: true
                                                                                                                                                        ssl_ca_certs: /etc/ssl/mongo1/ca-cert-1
                                                                                                                                                        tags:
                                                                                                                                                          - "instance:ssl-1"
                                                                                                                                                  
                                                                                                                                                    - name: mongodb-ssl-2
                                                                                                                                                      check_module: mongo
                                                                                                                                                      pattern:
                                                                                                                                                        comm: mongod
                                                                                                                                                        args: ssl_certificate-2.pem
                                                                                                                                                      conf:
                                                                                                                                                        server: "mongodb://<HOSTNAME|Certificate_CN>:{port}/admin"
                                                                                                                                                        ssl: true
                                                                                                                                                        ssl_ca_certs: /etc/ssl/mongo2/ca-cert-2
                                                                                                                                                        tags:
                                                                                                                                                          - "instance:ssl-2"
                                                                                                                                                  

                                                                                                                                                  Replace the names of the instances and certificate files with the names that you prefer.

                                                                                                                                                Metrics Available

                                                                                                                                                See MongoDB Metrics.

                                                                                                                                                Result in the Monitor UI

                                                                                                                                                6.2.16 -

                                                                                                                                                MySQL

MySQL is the world’s most popular open-source database. With its proven performance, reliability, and ease of use, MySQL has become the leading database choice for web-based applications, used by high-profile web properties including Facebook, Twitter, and YouTube. Additionally, it is an extremely popular choice as an embedded database, distributed by thousands of ISVs and OEMs.

                                                                                                                                                Supported Distribution

The MySQL AppCheck is supported for the following MySQL versions.

                                                                                                                                                If the Sysdig agent is installed as a Process:

                                                                                                                                                • Host with Python 2.7: MySQL versions supported - 5.5 to 8

                                                                                                                                                • Host with Python 2.6: MySQL versions supported - 4.1 to 5.7 (tested with v5.x only)

                                                                                                                                                  NOTE: This implies that MySQL 5.5, 5.6 and 5.7 are supported on both the Python 2.6 and 2.7 environments.

                                                                                                                                                If the Sysdig agent is installed as a Docker container:

The Sysdig agent Docker container ships with Python 2.7, so the MySQL versions listed above for Python 2.7 are supported.

The following environments have been tested and are supported. Test environments include both host/process and Docker installations.

Python                        MySQL 5.1    MySQL 5.5    MySQL 5.6    MySQL 5.7    MySQL 8.0
2.7 (Ubuntu 16 / CentOS 7)    No           Yes          Yes          Yes          Yes
2.6 (CentOS 6)                Yes          Yes          Yes          Yes          No

                                                                                                                                                MySQL Setup

A user must be created on MySQL so the Sysdig agent can collect metrics. To configure credentials, run the following commands on your server, replacing the sysdig-cloud-password parameter.

                                                                                                                                                MySQL version-specific commands to create a user are provided below.

                                                                                                                                                # MySQL 5.6 and earlier
                                                                                                                                                CREATE USER 'sysdig-cloud'@'127.0.0.1' IDENTIFIED BY 'sysdig-cloud-password';
                                                                                                                                                GRANT PROCESS, REPLICATION CLIENT ON *.* TO 'sysdig-cloud'@'127.0.0.1' WITH MAX_USER_CONNECTIONS 5;
                                                                                                                                                
                                                                                                                                                ## OR ##
                                                                                                                                                
                                                                                                                                                # MySQL 5.7 and 8
                                                                                                                                                CREATE USER 'sysdig-cloud'@'127.0.0.1' IDENTIFIED BY 'sysdig-cloud-password' WITH MAX_USER_CONNECTIONS 5;
                                                                                                                                                GRANT PROCESS, REPLICATION CLIENT ON *.* TO 'sysdig-cloud'@'127.0.0.1';
                                                                                                                                                
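
As an optional sanity check, you can confirm the user and its grants before configuring the agent. This is a sketch that assumes the user name, host, and password shown above:

  # Connect as the monitoring user (enter sysdig-cloud-password when prompted)
  mysql -h 127.0.0.1 -u sysdig-cloud -p -e "SHOW GRANTS FOR CURRENT_USER();"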

                                                                                                                                                Sysdig Agent Configuration

                                                                                                                                                Review how to Edit dragent.yaml to Integrate or Modify Application Checks.

                                                                                                                                                There is no default configuration for MySQL, as a unique user and password are required for metrics polling.

Add the entry for MySQL into dragent.yaml, updating the user and pass fields with your credentials.

                                                                                                                                                app_checks:
                                                                                                                                                  - name: mysql
                                                                                                                                                    pattern:
                                                                                                                                                      comm: mysqld
                                                                                                                                                    conf:
                                                                                                                                                      server: 127.0.0.1
                                                                                                                                                      user: sysdig-cloud
                                                                                                                                                      pass: sysdig-cloud-password
                                                                                                                                                

                                                                                                                                                Metrics Available

                                                                                                                                                See MySQL Metrics.

                                                                                                                                                Result in the Monitor UI

                                                                                                                                                Default Dashboard

                                                                                                                                                Additional Views

                                                                                                                                                6.2.17 -

                                                                                                                                                NGINX and NGINX Plus

                                                                                                                                                NGINX is open-source software for web serving, reverse proxying, caching, load balancing, media streaming, and more. It started out as a web server designed for maximum performance and stability. In addition to its HTTP server capabilities, NGINX can also function as a proxy server for email (IMAP, POP3, and SMTP) and a reverse proxy and load balancer for HTTP, TCP, and UDP servers.

                                                                                                                                                NGINX Plus is a software load balancer, web server, and content cache built on top of open source NGINX. NGINX Plus has exclusive enterprise‑grade features beyond what’s available in the open-source offering, including session persistence, configuration via API, and active health checks.

                                                                                                                                                The Sysdig agent has a default configuration to collect metrics for open-source NGINX, provided that you have the HTTP stub status module enabled. NGINX exposes basic metrics about server activity on a simple status page with this status module. If NGINX Plus is installed, a wide range of metrics is available with the NGINX Plus API.

                                                                                                                                                This page describes the setup steps for NGINX/NGINX Plus, the default configuration settings, how to edit the configuration to collect additional information, the metrics available for integration, and sample results in the Sysdig Monitor UI.

NGINX / NGINX Plus Setup

                                                                                                                                                This section describes the configuration required on the NGINX server.

                                                                                                                                                The Sysdig agent will not collect metrics until the required endpoint is added to the NGINX configuration, per one of the following methods:

                                                                                                                                                • For NGINX (Open Source): use the stub status module

                                                                                                                                                • For NGINX Plus: use the Plus API

Configuration examples of each are provided below.

                                                                                                                                                NGINX Stub Status Module Configuration

                                                                                                                                                The ngx_http_stub_status_module provides access to basic status information. It is compiled by default on most distributions. If not, it should be enabled with the --with-http_stub_status_module configuration parameter.

                                                                                                                                                1. To check if the module is already compiled, run the following command:

                                                                                                                                                  nginx -V 2>&1 | grep -o with-http_stub_status_module
                                                                                                                                                  

                                                                                                                                                  If with-http_stub_status_module is listed, the status module is enabled. (For more information, see http://nginx.org/en/docs/http/ngx_http_stub_status_module.html.)

2. Update the NGINX configuration file with the /nginx_status endpoint as follows. The default NGINX configuration file is located at /etc/nginx/nginx.conf or /etc/nginx/conf.d/default.conf.

                                                                                                                                                  # HTTP context
                                                                                                                                                  server {
                                                                                                                                                  ...
                                                                                                                                                    # Enable NGINX status module
                                                                                                                                                    location /nginx_status {
                                                                                                                                                      # freely available with open source NGINX
                                                                                                                                                      stub_status;
                                                                                                                                                      access_log   off;
                                                                                                                                                  
                                                                                                                                                      # for open source NGINX < version 1.7.5
                                                                                                                                                      # stub_status on;
                                                                                                                                                    }
                                                                                                                                                  ...
                                                                                                                                                  }
                                                                                                                                                  
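
After saving the change, you can confirm that the endpoint responds. This sketch assumes NGINX listens on port 80 of localhost and that the location block was added to the default server:

  # Validate the configuration and reload NGINX
  nginx -t && nginx -s reload

  # The status page should report active connections, accepts, handled, and requests
  curl http://localhost/nginx_status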

                                                                                                                                                NGINX Plus API Configuration

When NGINX Plus is configured, the Plus API can be enabled by adding an /api endpoint to the NGINX configuration file as follows.

                                                                                                                                                The default NGINX configuration file is present at /etc/nginx/nginx.conf or /etc/nginx/conf.d/default.conf.

                                                                                                                                                # HTTP context
                                                                                                                                                server {
                                                                                                                                                ...
                                                                                                                                                  # Enable NGINX Plus API
                                                                                                                                                  location /api {
                                                                                                                                                    api write=on;
                                                                                                                                                    allow all;
                                                                                                                                                  }
                                                                                                                                                ...
                                                                                                                                                }
                                                                                                                                                
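
As with the stub status module, you can verify the endpoint after reloading NGINX. This sketch assumes NGINX Plus listens on port 80 of localhost; a request to the API root should return the list of supported API versions:

  # Validate the configuration and reload NGINX Plus
  nginx -t && nginx -s reload

  # Returns a JSON array of available API versions, for example [1,2,3,...]
  curl http://localhost/api/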

                                                                                                                                                Sysdig Agent Configuration

                                                                                                                                                • Configuration Examples:

  • Example 1 (Default): Only open-source NGINX is configured.

  • Example 2: Only NGINX Plus is configured.

  • Example 3: NGINX and NGINX Plus are installed in different containers on the same host.

• The use_plus_api flag is used to differentiate NGINX and NGINX Plus metrics.

• NGINX Plus metrics are differentiated by the prefix nginx.plus.*

• When use_plus_api = true:

  • nginx_plus_api_url is used to fetch NGINX Plus metrics from the NGINX Plus node.

  • nginx_status_url is used to fetch NGINX metrics from the NGINX node (if a single host runs two separate containers for NGINX and NGINX Plus).

                                                                                                                                                Example 1: Default Configuration

                                                                                                                                                With the default configuration, only NGINX metrics will be available once the ngx_http_stub_status_module is configured.

                                                                                                                                                app_checks:
                                                                                                                                                  - name: nginx
                                                                                                                                                    check_module: nginx
                                                                                                                                                    pattern:
                                                                                                                                                      exe: "nginx: worker process"
                                                                                                                                                    conf:
                                                                                                                                                      nginx_status_url: "http://localhost:{port}/nginx_status"
                                                                                                                                                    log_errors: true
                                                                                                                                                

                                                                                                                                                Example 2: NGINX Plus only

With this example, only NGINX Plus metrics will be available.

                                                                                                                                                app_checks:
                                                                                                                                                  - name: nginx
                                                                                                                                                    check_module: nginx
                                                                                                                                                    pattern:
                                                                                                                                                      exe: "nginx: worker process"
                                                                                                                                                    conf:
                                                                                                                                                      nginx_plus_api_url: "http://localhost:{port}/api"
                                                                                                                                                      use_plus_api: true
                                                                                                                                                      user: admin
                                                                                                                                                      password: admin
                                                                                                                                                    log_errors: true
                                                                                                                                                

                                                                                                                                                Example 3: NGINX and NGINX Plus

This is a special case where open-source NGINX and NGINX Plus are installed on the same host but in different containers. With this configuration, the respective metrics will be available for the NGINX and NGINX Plus containers.

                                                                                                                                                app_checks:
                                                                                                                                                  - name: nginx
                                                                                                                                                    check_module: nginx
                                                                                                                                                    pattern:
                                                                                                                                                      exe: "nginx: worker process"
                                                                                                                                                    conf:
                                                                                                                                                      nginx_plus_api_url: "http://localhost:{port}/api"
                                                                                                                                                      nginx_status_url: "http://localhost:{port}/nginx_status"
                                                                                                                                                      use_plus_api: true
                                                                                                                                                      user: admin
                                                                                                                                                      password: admin
                                                                                                                                                    log_errors: true
                                                                                                                                                

                                                                                                                                                List of Metrics

                                                                                                                                                NGINX (Open Source)

                                                                                                                                                See NGINX Metrics.

                                                                                                                                                NGINX Plus

                                                                                                                                                See NGINX Plus Metrics.

                                                                                                                                                Result in the Monitor UI

                                                                                                                                                6.2.18 -

                                                                                                                                                NTP

NTP stands for Network Time Protocol. It is used to synchronize the time on your Linux system with a centralized NTP server. A local NTP server on the network can be synchronized with an external timing source to keep all the servers in your organization in sync with accurate time.

                                                                                                                                                If the NTP check is enabled in the Sysdig agent, it reports the time offset of the local agent from an NTP server.

                                                                                                                                                This page describes how to edit the configuration to collect information, the metrics available for integration, and a sample result in the Sysdig Monitor UI.

                                                                                                                                                Sysdig Agent Configuration

                                                                                                                                                Review how to Edit dragent.yaml to Integrate or Modify Application Checks.

                                                                                                                                                Default Configuration

                                                                                                                                                By default, Sysdig's dragent.default.yaml does not provide any configuration for NTP.

Add the configuration in the example below to the dragent.yaml file to enable the NTP check.

                                                                                                                                                Never edit dragent.default.yaml directly; always edit only dragent.yaml.

                                                                                                                                                Example

app_checks:
  - name: ntp
    interval: 60
    pattern:
      comm: systemd
    conf:
      host: us.pool.ntp.org
      offset_threshold: 60
                                                                                                                                                
• host: (mandatory) the hostname of the NTP server.

• offset_threshold: (optional) the maximum difference (in seconds) between the local clock and the NTP server before the ntp.in_sync service check reports CRITICAL. The default is 60 seconds. You can check the offset manually as shown below.
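
As a rough cross-check of the reported offset, you can query the same NTP server manually. This sketch assumes the ntpdate utility is installed on the host:

  # Query only (-q): prints the offset without adjusting the system clock
  ntpdate -q us.pool.ntp.org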

                                                                                                                                                Metrics Available

                                                                                                                                                ntp.offset, the time difference between the local clock and the NTP reference clock, is the primary NTP metric.

                                                                                                                                                See also NTP Metrics.

                                                                                                                                                Service Checks

                                                                                                                                                ntp.in_sync:

                                                                                                                                                Returns CRITICAL if the NTP offset is greater than the threshold specified in dragent.yaml, otherwise OK.

                                                                                                                                                Result in the Monitor UI

                                                                                                                                                6.2.19 -

                                                                                                                                                PGBouncer

                                                                                                                                                PgBouncer is a lightweight connection pooler for PostgreSQL. If PgBouncer is installed on your environment, you may need to edit the Sysdig agent configuration file to connect. See the Default Configuration section, below.

                                                                                                                                                This page describes the configuration settings, the metrics available for integration, and a sample result in the Sysdig Monitor UI.

                                                                                                                                                PgBouncer Setup

PgBouncer does not ship with a default stats user configuration. To configure it, you need to add a user that is allowed to access PgBouncer stats. Do so by adding the following line to pgbouncer.ini. The default file location is /etc/pgbouncer/pgbouncer.ini.

                                                                                                                                                stats_users = sysdig_cloud
                                                                                                                                                

For the same user, add the following entry to userlist.txt. The default file location is /etc/pgbouncer/userlist.txt.

                                                                                                                                                "sysdig_cloud" "sysdig_cloud_password"
                                                                                                                                                
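
To confirm the stats user works before configuring the agent, you can connect to PgBouncer's admin console with psql. This is a sketch that assumes PgBouncer listens on the default port 6432 on localhost and uses the credentials above:

  # SHOW STATS is available to users listed in stats_users
  # (enter sysdig_cloud_password when prompted)
  psql -h localhost -p 6432 -U sysdig_cloud pgbouncer -c "SHOW STATS;"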

                                                                                                                                                Sysdig Agent Configuration

                                                                                                                                                Review how to Edit dragent.yaml to Integrate or Modify Application Checks.

                                                                                                                                                Default Configuration

                                                                                                                                                No default configuration is present in Sysdig’s dragent.default.yaml file for PgBouncer, as it requires a unique username and password. You must add a custom entry in dragent.yaml as follows:

                                                                                                                                                Remember! Never edit dragent.default.yaml directly; always edit only dragent.yaml.

                                                                                                                                                Example

                                                                                                                                                app_checks:
                                                                                                                                                  - name: pgbouncer
                                                                                                                                                    pattern:
                                                                                                                                                      comm: pgbouncer
                                                                                                                                                    conf:
                                                                                                                                                      host: localhost # set if the bind ip is different
                                                                                                                                                      port: 6432      # set if the port is not the default
                                                                                                                                                      username: sysdig_cloud
                                                                                                                                                      password: sysdig_cloud_password #replace with appropriate password
                                                                                                                                                

                                                                                                                                                Metrics Available

                                                                                                                                                See PGBouncer Metrics.

                                                                                                                                                Result in the Monitor UI

                                                                                                                                                6.2.20 -

                                                                                                                                                PHP-FPM

                                                                                                                                                PHP-FPM (FastCGI Process Manager) is an alternative PHP FastCGI implementation, with some additional features useful for sites of any size, especially busier sites. If PHP-FPM is installed on your environment, the Sysdig agent will automatically connect. You may need to edit the default entries to connect if PHP-FPM has a custom setting in its config file. See the Default Configuration section, below.

                                                                                                                                                The Sysdig agent automatically collects all metrics with default configuration.

                                                                                                                                                This page describes the default configuration settings, how to edit the configuration to collect additional information, the metrics available for integration, and a sample result in the Sysdig Monitor UI.

                                                                                                                                                PHP-FPM Setup

                                                                                                                                                This check has a default configuration that should suit most use cases. If it does not work for you, verify that you have added these lines to your php-fpm.conf file. The default location is /etc/

                                                                                                                                                pm.status_path = /status
                                                                                                                                                ping.path = /ping
                                                                                                                                                
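
Note that the status and ping paths are served through your web server, which must route them to PHP-FPM via FastCGI. Assuming that routing is in place and the web server listens on localhost, a quick check might look like this:

  curl http://localhost/status   # should return pool statistics
  curl http://localhost/ping     # should return "pong" by default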

                                                                                                                                                Sysdig Agent Configuration

                                                                                                                                                Review how to Edit dragent.yaml to Integrate or Modify Application Checks.

                                                                                                                                                Default Configuration

                                                                                                                                                By default, Sysdig’s dragent.default.yaml uses the following code to connect with PHP-FPM and collect all metrics:

                                                                                                                                                app_checks:
                                                                                                                                                  - name: php-fpm
                                                                                                                                                    check_module: php_fpm
                                                                                                                                                    retry: false
                                                                                                                                                    pattern:
                                                                                                                                                      exe: "php-fpm: master process"
                                                                                                                                                

If your php-fpm.conf uses paths other than these defaults, you can edit the Sysdig agent configuration in dragent.yaml, as shown in the example below.

                                                                                                                                                Remember! Never edit dragent.default.yaml directly; always edit only dragent.yaml.

                                                                                                                                                Example

                                                                                                                                                Replace the values of status_url and ping_url below with the values set against pm.status_path and ping.path respectively in your php-fpm.conf:

                                                                                                                                                app_checks:
                                                                                                                                                  - name: php-fpm
                                                                                                                                                    check_module: php_fpm
                                                                                                                                                    pattern:
                                                                                                                                                      exe: "php-fpm: master process"
                                                                                                                                                    conf:
                                                                                                                                                      status_url: /mystatus
                                                                                                                                                      ping_url: /myping
                                                                                                                                                      ping_reply: mypingreply
                                                                                                                                                

                                                                                                                                                Metrics Available

                                                                                                                                                See PHP-FPM Metrics.

                                                                                                                                                Result in the Monitor UI

                                                                                                                                                6.2.21 -

                                                                                                                                                PostgreSQL

                                                                                                                                                PostgreSQL is a powerful, open-source, object-relational database system that has earned a strong reputation for reliability, feature robustness, and performance.

                                                                                                                                                If PostgreSQL is installed in your environment, the Sysdig agent will automatically connect in most cases. In some conditions, you may need to create a specific user for Sysdig and edit the default entries to connect.

                                                                                                                                                See the Default Configuration section, below. The Sysdig agent automatically collects all metrics with the default configuration when correct credentials are provided.

                                                                                                                                                This page describes the default configuration settings, how to edit the configuration to collect additional information, the metrics available for integration, and a sample result in the Sysdig Monitor UI.

                                                                                                                                                PostgreSQL Setup

PostgreSQL will be auto-discovered and the agent will connect through the Unix socket using the Default Configuration with the default postgres user. If this does not work, you can create a user for Sysdig Monitor and give it enough permissions to read PostgreSQL stats. To do this, execute the following example statements on your server:

create user sysdig_cloud with password 'password';
grant SELECT ON pg_stat_database to sysdig_cloud;
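
To confirm the new user can read the statistics the agent needs, you can connect as that user and query pg_stat_database directly (a quick check; adjust the host and database to your setup):

psql -U sysdig_cloud -h 127.0.0.1 -d postgres -c "SELECT datname, xact_commit FROM pg_stat_database LIMIT 1;"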
                                                                                                                                                

                                                                                                                                                Sysdig Agent Configuration

                                                                                                                                                Review how to Edit dragent.yaml to Integrate or Modify Application Checks.

                                                                                                                                                Default Configuration

By default, Sysdig’s dragent.default.yaml uses the following code to connect with Postgres.

                                                                                                                                                app_checks:
                                                                                                                                                  - name: postgres
                                                                                                                                                    pattern:
                                                                                                                                                      comm: postgres
                                                                                                                                                      port: 5432
                                                                                                                                                    conf:
                                                                                                                                                      unix_sock: "/var/run/postgresql/"
                                                                                                                                                      username: postgres
                                                                                                                                                

If a special user for Sysdig is created, update the dragent.yaml file as shown in Example 1, below.

Never edit dragent.default.yaml directly; always edit only dragent.yaml.

                                                                                                                                                Example 1: Special User

                                                                                                                                                Update the username and password created for the Sysdig agent in the respective fields, as follows:

                                                                                                                                                app_checks:
                                                                                                                                                  - name: postgres
                                                                                                                                                    pattern:
                                                                                                                                                      comm: postgres
                                                                                                                                                      port: 5432
                                                                                                                                                    conf:
      username: sysdig_cloud
                                                                                                                                                      password: password
                                                                                                                                                

                                                                                                                                                Example 2: Connecting on Unix Socket

If Postgres is listening on the Unix socket /tmp/.s.PGSQL.5432, set the value of unix_sock to /tmp/:

                                                                                                                                                app_checks:
                                                                                                                                                  - name: postgres
                                                                                                                                                    pattern:
                                                                                                                                                      comm: postgres
                                                                                                                                                      port: 5432
                                                                                                                                                    conf:
                                                                                                                                                      unix_sock: "/tmp/"
                                                                                                                                                      username: postgres
                                                                                                                                                

                                                                                                                                                Example 3: Relations

                                                                                                                                                Lists of relations/tables can be specified to track per-relation metrics.

A relation can be specified in two ways:

• By exact name, using relation_name.

• By regular expression, using relation_regex, to include all matching relations.

If schemas is not provided, all schemas are included. dbname must be provided when relations is specified. A filled-in example follows the template below.

                                                                                                                                                app_checks:
                                                                                                                                                  - name: postgres
                                                                                                                                                    pattern:
                                                                                                                                                      comm: postgres
                                                                                                                                                      port: 5432
                                                                                                                                                    conf:
                                                                                                                                                      username: <username>
                                                                                                                                                      password: <password>
                                                                                                                                                      dbname: <user_db_name>
                                                                                                                                                      relations:
                                                                                                                                                        - relation_name: <table_name_1>
                                                                                                                                                          schemas:
                                                                                                                                                            - <schema_name_1>
                                                                                                                                                        - relation_regex: <table_pattern>
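
For instance, a filled-in version might look like the following sketch (the database, schema, and table names are purely illustrative):

app_checks:
  - name: postgres
    pattern:
      comm: postgres
      port: 5432
    conf:
      username: postgres
      password: password
      dbname: orders_db
      relations:
        - relation_name: customers         # exact table name
          schemas:
            - public                       # restrict to the public schema
        - relation_regex: orders_2024_.*   # every table matching this pattern, in all schemas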
                                                                                                                                                

                                                                                                                                                Example 4: Other Optional Parameters

                                                                                                                                                app_checks:
                                                                                                                                                  - name: postgres
                                                                                                                                                    check_module: postgres
                                                                                                                                                    pattern:
                                                                                                                                                      comm: postgres
                                                                                                                                                      port: 5432
                                                                                                                                                    conf:
                                                                                                                                                      username: postgres
                                                                                                                                                      unix_sock: "/var/run/postgresql"
                                                                                                                                                      dbname: <user_db_name>
                                                                                                                                                      #collect_activity_metrics: true
                                                                                                                                                      #collect_default_database: true
                                                                                                                                                      #tag_replication_role: true
                                                                                                                                                
                                                                                                                                                Optional Parameters

collect_activity_metrics (default: false)
When set to true, enables metrics from pg_stat_activity. The new metrics are:

• postgresql.active_queries

• postgresql.transactions.idle_in_transaction

• postgresql.transactions.open

• postgresql.waiting_queries

collect_default_database (default: false)
When set to true, collects statistics from the default database, postgres. All metrics from the postgres database carry the tag db:postgres.

tag_replication_role (default: false)
When set to true, metrics and checks are tagged with replication_role:<master|standby>.


                                                                                                                                                Example 5: Custom Metrics Using Custom Queries

                                                                                                                                                Personalized custom metrics can be collected from Postgres using custom queries.

                                                                                                                                                app_checks:
                                                                                                                                                  - name: postgres
                                                                                                                                                    pattern:
                                                                                                                                                      comm: postgres
                                                                                                                                                      port: 5432
                                                                                                                                                    conf:
                                                                                                                                                      unix_sock: "/var/run/postgresql/"
                                                                                                                                                      username: postgres
                                                                                                                                                      custom_queries:
                                                                                                                                                        - metric_prefix: postgresql.custom
                                                                                                                                                          query: <QUERY>
                                                                                                                                                          columns:
            - name: <COLUMN_1_NAME>
              type: <COLUMN_1_TYPE>
            - name: <COLUMN_2_NAME>
              type: <COLUMN_2_TYPE>
                                                                                                                                                          tags:
                                                                                                                                                            - <TAG_KEY>:<TAG_VALUE>
                                                                                                                                                
metric_prefix (required)
Each metric starts with the chosen prefix.

query (required)
The SQL to execute. It can be a simple statement or a multi-line script. All rows of the result are evaluated. Use the YAML pipe (|) if you require a multi-line script.

columns (required)
A list representing each column, ordered sequentially from left to right. The number of entries must equal the number of columns returned by the query. Each entry has two required fields:

• name: The suffix appended to metric_prefix to form the full metric name. If type is set to tag, the column is instead applied as a tag to every metric collected by this query.

• type: The submission method (gauge, count, rate, and so on). It can also be set to tag to tag each metric in the row with the name and value of the item in this column.

tags (optional)
A list of tags to apply to each metric (in addition to any tag-type columns).
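
As a concrete illustration, the following sketch reports per-table sequential and index scan counts from the standard pg_stat_user_tables view; the metric names and the env:staging tag here are purely illustrative:

app_checks:
  - name: postgres
    pattern:
      comm: postgres
      port: 5432
    conf:
      unix_sock: "/var/run/postgresql/"
      username: postgres
      custom_queries:
        - metric_prefix: postgresql.custom
          query: SELECT relname, seq_scan, idx_scan FROM pg_stat_user_tables
          columns:
            - name: table        # first column becomes a tag on every metric from this row
              type: tag
            - name: seq_scan     # emitted as postgresql.custom.seq_scan
              type: gauge
            - name: idx_scan     # emitted as postgresql.custom.idx_scan
              type: gauge
          tags:
            - env:staging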


                                                                                                                                                Metrics Available

                                                                                                                                                See PostgreSQL Metrics.

                                                                                                                                                Result in the Monitor UI

                                                                                                                                                Default Dashboard

                                                                                                                                                The default PostgreSQL dashboard includes combined metrics and individual metrics in an overview page.

                                                                                                                                                Other Views

                                                                                                                                                You can also view individual metric charts from a drop-down menu in an Explore view.

                                                                                                                                                6.2.22 -

                                                                                                                                                RabbitMQ

RabbitMQ is an open-source message-broker software (sometimes called message-oriented middleware) that implements the Advanced Message Queuing Protocol (AMQP). The RabbitMQ server is written in the Erlang language and is built on the Open Telecom Platform framework for clustering and failover. Client libraries to interface with the broker are available in all major programming languages. If RabbitMQ is installed in your environment, the Sysdig agent will automatically connect. See the Default Configuration section, below.

                                                                                                                                                The Sysdig agent automatically collects all metrics with the default configuration. You may need to edit the dragent.yaml file if a metrics limit is reached.

                                                                                                                                                This page describes the default configuration settings, how to edit the configuration to collect additional information, the metrics available for integration, and a sample result in the Sysdig Monitor UI.

                                                                                                                                                RabbitMQ Setup

                                                                                                                                                Enable the RabbitMQ management plugin. See RabbitMQ’s documentation to enable it.
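
On most installations this amounts to a single command run on the RabbitMQ host (shown here for a typical package-based install; adjust for your platform):

sudo rabbitmq-plugins enable rabbitmq_management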

                                                                                                                                                Sysdig Agent Configuration

                                                                                                                                                Review how to Edit dragent.yaml to Integrate or Modify Application Checks.

                                                                                                                                                Default Configuration

                                                                                                                                                By default, Sysdig’s dragent.default.yaml uses the following code to connect with RabbitMQ and collect all metrics.

                                                                                                                                                app_checks:
                                                                                                                                                  - name: rabbitmq
                                                                                                                                                    pattern:
                                                                                                                                                      port: 15672
                                                                                                                                                    conf:
                                                                                                                                                      rabbitmq_api_url: "http://localhost:15672/api/"
                                                                                                                                                      rabbitmq_user: guest
                                                                                                                                                      rabbitmq_pass: guest
                                                                                                                                                

The RabbitMQ app check tracks various entities, such as exchanges, queues, and nodes. Each of these entities has its own maximum limit. If a limit is reached, metrics can be controlled by editing the dragent.yaml file, as in the following examples.

                                                                                                                                                Remember! Never edit dragent.default.yaml directly; always edit only dragent.yaml.

                                                                                                                                                Example 1: Manage logging_interval

                                                                                                                                                When a maximum limit is exceeded, the app check will log an info message:

                                                                                                                                                rabbitmq: Too many <entity type> (<number of entities>) to fetch and maximum limit is (<configured limit>). You must choose the <entity type> you are interested in by editing the dragent.yaml configuration file

Repeated occurrences of this message are throttled by the configuration parameter logging_interval.

Its default value is 300 seconds. This can be altered by specifying a different value in dragent.yaml.

                                                                                                                                                app_checks:
                                                                                                                                                  - name: rabbitmq
                                                                                                                                                    pattern:
                                                                                                                                                      port: 15672
                                                                                                                                                    conf:
                                                                                                                                                      rabbitmq_api_url: "http://localhost:15672/api/"
                                                                                                                                                      rabbitmq_user: guest
                                                                                                                                                      rabbitmq_pass: guest
                                                                                                                                                      logging_interval: 10 # Value in seconds. Default is 300
                                                                                                                                                

                                                                                                                                                Example 2: Specify Nodes, Queues, or Exchanges

Each of the tracked RabbitMQ entities has its own maximum limit. As of agent v10.5.1, the default limits are as follows:

                                                                                                                                                • Exchanges: 16 per-exchange metrics

                                                                                                                                                • Queues: 20 per-queue metrics

                                                                                                                                                • Nodes: 9 per-node metrics

The max_detailed_* settings for the RabbitMQ app check do not limit the number of reported queues, exchanges, and nodes, but the number of metrics generated for those objects. For example, a single queue might report up to 20 metrics, so set max_detailed_queues to 20 times the number of queues you want covered. (A sketch showing how to raise these limits follows the example below.)

                                                                                                                                                The metrics for these entities are tagged. If any of these entities are present but no transactions have occurred for them, the metrics are still reported with 0 values, though without tags. Therefore, when segmenting these metrics, the tags will show as unset in the Sysdig Monitor Explore view. However, all such entities are still counted against the maximum limits. In such a scenario, you can specify the entity names for which you want to collect metrics in the dragent.yaml file.

                                                                                                                                                app_checks:
                                                                                                                                                  - name: rabbitmq
                                                                                                                                                    pattern:
                                                                                                                                                      port: 15672
                                                                                                                                                    conf:
                                                                                                                                                      rabbitmq_api_url: "http://localhost:15672/api/"
                                                                                                                                                      rabbitmq_user: guest
                                                                                                                                                      rabbitmq_pass: guest
                                                                                                                                                      tags: ["queues:<queuename>"]
                                                                                                                                                      nodes:
                                                                                                                                                        - rabbit@localhost
                                                                                                                                                        - rabbit2@domain
                                                                                                                                                      nodes_regexes:
                                                                                                                                                        - bla.*
                                                                                                                                                      queues:
                                                                                                                                                        - queue1
                                                                                                                                                        - queue2
                                                                                                                                                      queues_regexes:
                                                                                                                                                        - thisqueue-.*
                                                                                                                                                        - another_\d+queue
                                                                                                                                                      exchanges:
                                                                                                                                                        - exchange1
                                                                                                                                                        - exchange2
                                                                                                                                                      exchanges_regexes:
                                                                                                                                                        - exchange*
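
If you would rather raise the limits than restrict the entity list, the max_detailed_* settings mentioned above can be set in the same conf block. A minimal sketch, assuming the check accepts max_detailed_queues and max_detailed_exchanges and sizing them per the 20-metrics-per-queue and 16-metrics-per-exchange figures listed earlier:

app_checks:
  - name: rabbitmq
    pattern:
      port: 15672
    conf:
      rabbitmq_api_url: "http://localhost:15672/api/"
      rabbitmq_user: guest
      rabbitmq_pass: guest
      max_detailed_queues: 400      # ~20 queues x 20 metrics each (assumed parameter name)
      max_detailed_exchanges: 160   # ~10 exchanges x 16 metrics each (assumed parameter name)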
                                                                                                                                                

                                                                                                                                                Example 3: Custom tags

                                                                                                                                                Optional tags can be applied to every emitted metric, service check, and/or event.

As in Example 2, entity names can be specified by exact name or regular expression.

                                                                                                                                                app_checks:
                                                                                                                                                  - name: rabbitmq
                                                                                                                                                    pattern:
                                                                                                                                                      port: 15672
                                                                                                                                                    conf:
                                                                                                                                                      rabbitmq_api_url: "http://localhost:15672/api/"
                                                                                                                                                      rabbitmq_user: guest
                                                                                                                                                      rabbitmq_pass: guest
                                                                                                                                                      tags: ["some_tag:some_value"]
                                                                                                                                                

                                                                                                                                                Example 4: filter_by_node

                                                                                                                                                Use filter_by_node: true if you want each node to report information localized to the node. Without this option, each node reports cluster-wide info (as presented by RabbitMQ itself). This option makes it easier to view the metrics in the UI by removing redundant information reported by individual nodes.

                                                                                                                                                Default: false.

Prerequisite: Sysdig agent v0.92.3 or higher.

                                                                                                                                                app_checks:
                                                                                                                                                  - name: rabbitmq
                                                                                                                                                    pattern:
                                                                                                                                                      port: 15672
                                                                                                                                                    conf:
                                                                                                                                                      rabbitmq_api_url: "http://localhost:15672/api/"
                                                                                                                                                      rabbitmq_user: guest
                                                                                                                                                      rabbitmq_pass: guest
                                                                                                                                                      filter_by_node: true
                                                                                                                                                

                                                                                                                                                Metrics Available

                                                                                                                                                See RabbitMQ Metrics.

                                                                                                                                                Result in the Monitor UI

                                                                                                                                                6.2.23 -

                                                                                                                                                RedisDB

                                                                                                                                                Redis is an open-source (BSD licensed), in-memory data structure store, used as a database, cache, and message broker. If Redis is installed in your environment, the Sysdig agent will automatically connect in most cases. You may need to edit the default entries to get additional metrics. See the Default Configuration section, below.

                                                                                                                                                This page describes the default configuration settings, how to edit the configuration to collect additional information, the metrics available for integration, and a sample result in the Sysdig Monitor UI.

                                                                                                                                                Application Setup

                                                                                                                                                Redis will automatically expose all metrics. You do not need to configure anything in the Redis instance.

                                                                                                                                                Sysdig Agent Configuration

                                                                                                                                                Review how to Edit dragent.yaml to Integrate or Modify Application Checks.

                                                                                                                                                Default Configuration

                                                                                                                                                By default, Sysdig’s dragent.default.yaml uses the following code to connect with Redis and collect basic metrics:

                                                                                                                                                app_checks:
                                                                                                                                                  - name: redis
                                                                                                                                                    check_module: redisdb
                                                                                                                                                    pattern:
                                                                                                                                                      comm: redis-server
                                                                                                                                                    conf:
                                                                                                                                                      host: 127.0.0.1
                                                                                                                                                      port: "{port}"
                                                                                                                                                

Some additional metrics can be collected by editing the configuration file, as shown in the following examples. The options shown in Example 2 are relevant if Redis requires authentication or if a Unix socket is used.

                                                                                                                                                Remember! Never edit dragent.default.yaml directly; always edit only dragent.yaml.

                                                                                                                                                Example 1: Key Lengths

                                                                                                                                                The following example entry results in the metric redis.key.length in the Sysdig Monitor UI, displaying the length of specific keys (segmented by: key). To enable, provide the key names in dragent.yaml as follows.

                                                                                                                                                Note that length is 0 (zero) for keys that have a type other than list, set, hash, or sorted set. Keys can be expressed as patterns; see https://redis.io/commands/keys.

                                                                                                                                                Sample entry in dragent.yaml:

                                                                                                                                                app_checks:
                                                                                                                                                  - name: redis
                                                                                                                                                    check_module: redisdb
                                                                                                                                                    pattern:
                                                                                                                                                      comm: redis-server
                                                                                                                                                    conf:
                                                                                                                                                      host: 127.0.0.1
                                                                                                                                                      port: "{port}"
                                                                                                                                                      keys:
                                                                                                                                                        - "list_1"
                                                                                                                                                        - "list_9*"
                                                                                                                                                

                                                                                                                                                Example 2: Additional Configuration Options

                                                                                                                                                • unix_socket_path (Optional) - Can be used if your Redis uses a socket instead of host and port.

• password (Optional) - Can be used if your Redis requires a password.

                                                                                                                                                app_checks:
                                                                                                                                                  - name: redis
                                                                                                                                                    check_module: redisdb
                                                                                                                                                    pattern:
                                                                                                                                                      comm: redis-server
                                                                                                                                                    conf:
                                                                                                                                                      host: 127.0.0.1
                                                                                                                                                      port: "{port}"
                                                                                                                                                      # unix_socket_path: /var/run/redis/redis.sock # can be used in lieu of host/port
                                                                                                                                                      # password: mypassword                                            # if your Redis requires auth
                                                                                                                                                

                                                                                                                                                Example 3: COMMANDSTATS Metrics

You can also collect the INFO COMMANDSTATS result as metrics (redis.command.*). This works with Redis >= 2.6.
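
To preview the data this exposes, you can run the same INFO section directly against your instance with redis-cli:

redis-cli INFO commandstats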

                                                                                                                                                Sample implementation:

                                                                                                                                                app_checks:
                                                                                                                                                  - name: redis
                                                                                                                                                    check_module: redisdb
                                                                                                                                                    pattern:
                                                                                                                                                      comm: redis-server
                                                                                                                                                    conf:
                                                                                                                                                      host: 127.0.0.1
                                                                                                                                                      port: "{port}"
                                                                                                                                                      command_stats: true
                                                                                                                                                

                                                                                                                                                Metrics Available

                                                                                                                                                See RedisDB Metrics.

                                                                                                                                                Result in the Monitor UI

                                                                                                                                                6.2.24 -

                                                                                                                                                SNMP

                                                                                                                                                Simple Network Management Protocol (SNMP) is an application-layer protocol used to manage and monitor network devices and their functions. The Sysdig agent can connect to network devices and collect metrics using SNMP.

                                                                                                                                                This page describes the default configuration settings, how to edit the configuration to collect additional information, the metrics available for integration, and a sample result in the Sysdig Monitor UI.

                                                                                                                                                SNMP Overview

Simple Network Management Protocol (SNMP) is an Internet Standard protocol for collecting information about, and configuring, devices on a network. Network devices include physical devices such as switches, routers, and servers.

SNMP has three primary versions (SNMPv1, SNMPv2c, and SNMPv3); SNMPv2c is the most widely used.

SNMP allows device vendors to expose management data in the form of variables on managed systems, organized in a management information base (MIB), that describe the system status and configuration. These variables can be queried, and devices can also be configured remotely through them. Certain MIBs are generic and supported by the majority of device vendors; in addition, each vendor can define private/enterprise MIBs for vendor-specific information.

An SNMP MIB is a collection of objects, each uniquely identified by an Object Identifier (OID). OIDs are represented in the form x.0, where x is the name of the object in the MIB definition.

For example, suppose one wanted to identify an instance of the variable sysDescr.
                                                                                                                                                
                                                                                                                                                The object class for sysDescr is:
                                                                                                                                                
                                                                                                                                                iso org dod internet mgmt mib system sysDescr
                                                                                                                                                 1   3   6     1      2    1    1       1
                                                                                                                                                
Hence, the object type, x, is 1.3.6.1.2.1.1.1, and the OID of its single instance is 1.3.6.1.2.1.1.1.0.
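
Putting this together with the agent configuration described later on this page, a numeric object from the same system group, such as sysUpTime (OID 1.3.6.1.2.1.1.3), could be collected by listing its OID under the metrics section of the SNMP app check (a minimal sketch; the name is only the label under which the value is reported in Sysdig Monitor):

  metrics:
    - OID: 1.3.6.1.2.1.1.3
      name: sysUpTime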
                                                                                                                                                

                                                                                                                                                SNMP Agent Configuration

To monitor servers with the Sysdig agent, an SNMP agent must be installed on those servers so that their system information can be queried.

                                                                                                                                                For Ubuntu-based servers, use the following commands to install the SNMP Daemon:

$ sudo apt-get update
$ sudo apt-get install snmpd
                                                                                                                                                

                                                                                                                                                Next, configure this SNMP agent to respond to queries from the SNMP manager by updating the configuration file located at /etc/snmp/snmpd.conf

                                                                                                                                                Below are the important fields that must be configured:

                                                                                                                                                snmpd.conf

                                                                                                                                                # Listen for connections on all interfaces (both IPv4 *and* IPv6)
                                                                                                                                                agentAddress udp:161,udp6:[::1]:161
                                                                                                                                                
                                                                                                                                                ## ACCESS CONTROL
                                                                                                                                                ## system + hrSystem groups only
                                                                                                                                                view systemonly included .1.3.6.1.2.1.1
                                                                                                                                                view systemonly included .1.3.6.1.2.1.25.1
                                                                                                                                                view systemonly included .1.3.6.1.2.1.31.1
                                                                                                                                                view systemonly included .1.3.6.1.2.1.2.2.1.1
                                                                                                                                                
                                                                                                                                                # Default access to basic system info
                                                                                                                                                rocommunity public default -V systemonly
                                                                                                                                                # rocommunity6 is for IPv6
                                                                                                                                                rocommunity6 public default -V systemonly
                                                                                                                                                

                                                                                                                                                After making changes to the config file, restart the snmpd service using:

$ sudo service snmpd restart
                                                                                                                                                

                                                                                                                                                Sysdig Agent Configuration

                                                                                                                                                Review how to Edit dragent.yaml to Integrate or Modify Application Checks.

                                                                                                                                                Default Configuration

No default configuration is present for the SNMP check.

                                                                                                                                                • You must specify the OID/MIB for every parameter you want to collect, as in the following example.

• The OIDs configured in dragent.yaml must be covered by the views defined under the ‘ACCESS CONTROL’ section of snmpd.conf.

• Ensure that the community_string is the same as the one configured in the system configuration (rocommunity).

                                                                                                                                                Remember! Never edit dragent.default.yaml directly; always edit only dragent.yaml.

                                                                                                                                                Example

                                                                                                                                                app_checks:
                                                                                                                                                  - name: snmp
                                                                                                                                                    pattern:
                                                                                                                                                      comm: python
                                                                                                                                                      arg: /opt/draios/bin/sdchecks
                                                                                                                                                    interval: 30
                                                                                                                                                    conf:
                                                                                                                                                      mibs_folder: /usr/share/mibs/ietf/
                                                                                                                                                      ip_address: 52.53.158.103
                                                                                                                                                      port: 161
                                                                                                                                                      community_string: public
                                                                                                                                                
      # snmp_version is only required for SNMP v1; it defaults to 2
                                                                                                                                                      # snmp_version: 2
                                                                                                                                                
                                                                                                                                                      # Optional tags can be set with each metric
                                                                                                                                                      tags:
                                                                                                                                                         - vendor:EMC
                                                                                                                                                         - array:VNX5300
                                                                                                                                                         - location:front
                                                                                                                                                
                                                                                                                                                      metrics:
                                                                                                                                                        - OID: 1.3.6.1.2.1.25.2.3.1.5
                                                                                                                                                          name: hrStorageSize
                                                                                                                                                        - OID: 1.3.6.1.2.1.1.7
                                                                                                                                                          name: sysServices
                                                                                                                                                        - MIB: TCP-MIB
                                                                                                                                                          symbol: tcpActiveOpens
                                                                                                                                                        - MIB: UDP-MIB
                                                                                                                                                          symbol: udpInDatagrams
                                                                                                                                                        - MIB: IP-MIB
                                                                                                                                                          table: ipSystemStatsTable
                                                                                                                                                          symbols:
                                                                                                                                                            - ipSystemStatsInReceives
                                                                                                                                                          metric_tags:
                                                                                                                                                            - tag: ipversion
                                                                                                                                                              index: 1        # specify which index you want to read the tag value from
                                                                                                                                                        - MIB: IF-MIB
                                                                                                                                                          table: ifTable
                                                                                                                                                          symbols:
                                                                                                                                                            - ifInOctets
                                                                                                                                                            - ifOutOctets
                                                                                                                                                          metric_tags:
                                                                                                                                                            - tag: interface
                                                                                                                                                              column: ifDescr  # specify which column to read the tag value from
                                                                                                                                                

The Sysdig agent allows you to monitor the SNMP counters and gauges of your choice. For each device, specify the metrics that you want to monitor in the metrics subsection, using one of the following methods:

                                                                                                                                                1. Specify a MIB and the symbol that you want to export

                                                                                                                                                  metrics:
                                                                                                                                                    - MIB: UDP-MIB
                                                                                                                                                      symbol: udpInDatagrams
                                                                                                                                                  
                                                                                                                                                2. Specify an OID and the name you want the metric to appear under in Sysdig Monitor:

                                                                                                                                                  metrics:
                                                                                                                                                    - OID: 1.3.6.1.2.1.6.5
                                                                                                                                                      name: tcpActiveOpens
  # The name here is the one specified in the MIB, but you could use any name.
                                                                                                                                                  
3. Specify a MIB and a table from which to extract information:

                                                                                                                                                  metrics:
                                                                                                                                                    - MIB: IF-MIB
                                                                                                                                                      table: ifTable
                                                                                                                                                      symbols:
                                                                                                                                                        - ifInOctets
      metric_tags:
        - tag: interface
          column: ifDescr
                                                                                                                                                  

                                                                                                                                                Metrics Available

The SNMP check does not have default metrics. All metrics configured in the dragent.yaml file appear with the snmp.* prefix; for example, the hrStorageSize metric configured above is reported as snmp.hrStorageSize.

                                                                                                                                                Result in the Monitor UI

                                                                                                                                                6.2.25 -

                                                                                                                                                Supervisord

The Supervisor daemon is a client/server system that allows its users to monitor and control a number of processes on UNIX-like operating systems. The Supervisor check monitors the uptime, status, and number of processes running under Supervisord.

No default configuration is provided for the Supervisor check; you must add the configuration to the dragent.yaml file for the Sysdig agent to collect the data exposed by Supervisor.

                                                                                                                                                This page describes the setup steps required on Supervisor, how to edit the Sysdig agent configuration to collect additional information, the metrics available for integration, and a sample result in the Sysdig Monitor UI.

                                                                                                                                                Supervisor Setup

                                                                                                                                                Configuration

                                                                                                                                                The Sysdig agent can collect data from Supervisor via HTTP server or UNIX socket. The agent collects the same data regardless of the configured collection method.

Uncomment the following entries, or add them if they are not present, in /etc/supervisor/supervisord.conf:

                                                                                                                                                [inet_http_server]
                                                                                                                                                port=localhost:9001
                                                                                                                                                username=user  # optional
                                                                                                                                                password=pass  # optional
                                                                                                                                                ...
                                                                                                                                                
                                                                                                                                                
                                                                                                                                                [supervisorctl]
                                                                                                                                                serverurl=unix:///tmp/supervisor.sock
                                                                                                                                                ...
                                                                                                                                                
                                                                                                                                                [unix_http_server]
                                                                                                                                                file=/tmp/supervisor.sock
                                                                                                                                                chmod=777 # make sure chmod is set so that non-root users can read the socket.
                                                                                                                                                ...
                                                                                                                                                
                                                                                                                                                [program:foo]
                                                                                                                                                command=/bin/cat
                                                                                                                                                

The programs controlled by Supervisor are defined in separate [program] sections of the configuration. Each program that you want Supervisor to manage must be specified in the Supervisor configuration file, with its supported options, in its own [program] section. See Supervisor’s sample.conf file for details.

                                                                                                                                                Sysdig Agent Configuration

                                                                                                                                                Review how to Edit dragent.yaml to Integrate or Modify Application Checks.

                                                                                                                                                Default Configuration

By default, Sysdig’s dragent.default.yaml does not include any configuration to connect the agent with Supervisor. Edit dragent.yaml following the examples below to connect with Supervisor and collect supervisord.* metrics.

                                                                                                                                                Remember! Never edit dragent.default.yaml directly; always edit only dragent.yaml.

                                                                                                                                                Example 1: Connect by UNIX Socket

app_checks:
  - name: supervisord
    pattern:
      comm: supervisord
    conf:
      socket: "unix:///tmp/supervisor.sock"
                                                                                                                                                
                                                                                                                                                

                                                                                                                                                Example 2: Connect by Host Name and Port, Optional Authentication

app_checks:
  - name: supervisord
    pattern:
      comm: supervisord
    conf:
      host: localhost
      port: 9001
      # user: user  # Optional. Required only if a username is configured.
      # pass: pass  # Optional. Required only if a password is configured.
                                                                                                                                                

                                                                                                                                                Metrics Available

Metric Name                   Metric Description

supervisord.process.count     (gauge) The number of processes monitored by supervisord. Shown as process.

supervisord.process.uptime    (gauge) The process uptime. Shown as second.

                                                                                                                                                See also Supervisord Metrics.

                                                                                                                                                Service Check

supervisord.can_connect:

                                                                                                                                                Returns CRITICAL if the Sysdig agent cannot connect to the HTTP server or UNIX socket configured, otherwise OK.

                                                                                                                                                supervisord.process.status:

Supervisord Status    supervisord.process.status
STOPPED               CRITICAL
STARTING              UNKNOWN
RUNNING               OK
BACKOFF               CRITICAL
STOPPING              CRITICAL
EXITED                CRITICAL
FATAL                 CRITICAL
UNKNOWN               UNKNOWN

                                                                                                                                                Result in the Monitor UI

                                                                                                                                                6.2.26 -

                                                                                                                                                TCP

                                                                                                                                                You can monitor the status of your custom application’s port using the TCP check. This check will routinely connect to the designated port and send Sysdig Monitor a simple on/off metric and response time.

                                                                                                                                                This page describes the default configuration settings, how to edit the configuration to collect additional information, the metrics available for integration, and a sample result in the Sysdig Monitor UI.

                                                                                                                                                TCP Application Setup

                                                                                                                                                Any application listening on a TCP port can be monitored with tcp_check.

                                                                                                                                                Sysdig Agent Configuration

                                                                                                                                                Review how to Edit dragent.yaml to Integrate or Modify Application Checks.

                                                                                                                                                Default Configuration

No default configuration is provided in the default settings file; you must add the entries shown in the example below to the user settings config file dragent.yaml.

                                                                                                                                                Remember! Never edit dragent.default.yaml directly; always edit only dragent.yaml.

                                                                                                                                                Example

app_checks:
  - name: tcp_check
    check_module: tcp_check
    pattern:
      comm: httpd
      arg: DFOREGROUND
    conf:
      port: 80
      collect_response_time: true
                                                                                                                                                

This example shows a TCP check monitoring an Apache process that is running on the host and listening on port 80.

comm: matches the command name of the Apache server process listening on port 80.

If you want the response time for the port, meaning the amount of time the process takes to accept the connection, add the collect_response_time: true parameter under the conf: section. The additional metric network.tcp.response_time will then appear in the Metrics list.

                                                                                                                                                Do not use port: under the pattern: section in this case, because if the process is not listening it will not be matched and the metric will not be sent to Sysdig Monitor.
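
The same approach works for any other TCP service. For example, here is a minimal sketch for a hypothetical process named my-service listening on port 8080 (the check name, process name, and port are placeholders; adjust them to match your application):

app_checks:
  - name: my_service_port
    check_module: tcp_check
    pattern:
      comm: my-service
    conf:
      port: 8080
      collect_response_time: true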

                                                                                                                                                Metrics Available

Metric Name                  Metric Description

network.tcp.response_time    (gauge) The response time of a given host and TCP port, tagged with url, e.g. 'url:192.168.1.100:22'. Shown as second.

                                                                                                                                                See TCP Metrics.

                                                                                                                                                Service Checks

tcp.can_connect:

                                                                                                                                                DOWN if the agent cannot connect to the configured host and port, otherwise UP.

                                                                                                                                                Result in the Monitor UI

                                                                                                                                                6.2.27 -

                                                                                                                                                Varnish

                                                                                                                                                Varnish HTTP Cache is a web application accelerator, also known as a “caching HTTP reverse proxy.” You install it in front of any server that speaks HTTP and configure it to cache the contents. If Varnish is installed on your environment, the Sysdig agent will automatically connect. See the Default Configuration section, below.

The Sysdig agent automatically collects all metrics. You can also edit the configuration to emit service checks for the backends.

                                                                                                                                                This page describes the default configuration settings, how to edit the configuration to collect additional information, the metrics available for integration, and a sample result in the Sysdig Monitor UI.

                                                                                                                                                Varnish Setup

                                                                                                                                                Varnish will automatically expose all metrics. You do not need to add anything to the Varnish instance.

                                                                                                                                                Sysdig Agent Configuration

                                                                                                                                                Review how to Edit dragent.yaml to Integrate or Modify Application Checks.

                                                                                                                                                Default Configuration

                                                                                                                                                By default, Sysdig’s dragent.default.yaml uses the following code to connect with Varnish and collect all but the VBE metrics. See Example 2 Enable Varnish VBE Metrics.

                                                                                                                                                metrics_filter:
                                                                                                                                                 - exclude: varnish.VBE.*
                                                                                                                                                app_checks:
 - name: varnish
                                                                                                                                                    interval: 15
                                                                                                                                                    pattern:
                                                                                                                                                      comm: varnishd
                                                                                                                                                    conf:
                                                                                                                                                      varnishstat: /usr/bin/varnishstat
                                                                                                                                                

                                                                                                                                                Optionally, if you want to submit service checks for the health of each back end, you can configure varnishadm and edit dragent.yaml as in Example 1.

                                                                                                                                                Remember! Never edit dragent.default.yaml directly; always edit only dragent.yaml.

                                                                                                                                                Example 1 Service Health Checks with varnishadm

When varnishadm is configured, the Sysdig agent must be able to execute the binary with root privileges. Add the following to your /etc/sudoers file:

                                                                                                                                                sysdig-agent ALL=(ALL) NOPASSWD:/usr/bin/varnishadm
                                                                                                                                                

Then edit dragent.yaml as follows. Note: If you have configured varnishadm and your secret file is not /etc/varnish/secret, update the secretfile entry to point to the correct path.

                                                                                                                                                app_checks:
                                                                                                                                                  - name: varnish
                                                                                                                                                    interval: 15
                                                                                                                                                    pattern:
                                                                                                                                                      comm: varnishd
                                                                                                                                                    conf:
                                                                                                                                                      varnishstat: /usr/bin/varnishstat
                                                                                                                                                      varnishadm: /usr/bin/varnishadm
                                                                                                                                                      secretfile: /etc/varnish/secret
                                                                                                                                                

This example enables the following service check.

                                                                                                                                                varnish.backend_healthy: The agent submits a service check for each Varnish backend, tagging each with backend:<backend_name>.

                                                                                                                                                Example 2 Enable Varnish VBE Metrics

                                                                                                                                                Varnish VBE metrics are dynamically generated (and therefore are not listed in the Metrics Dictionary). Because they generate unique metric names with timestamps, they can clutter metric handling and are filtered out by default. If you want to collect these metrics, use include in the metrics_filter in dragent.yaml:

                                                                                                                                                metrics_filter:
                                                                                                                                                 - include: varnish.VBE.*
                                                                                                                                                app_checks:
 - name: varnish
                                                                                                                                                    interval: 15
                                                                                                                                                    pattern:
                                                                                                                                                      comm: varnishd
                                                                                                                                                    conf:
                                                                                                                                                      varnishstat: /usr/bin/varnishstat
                                                                                                                                                

                                                                                                                                                Metrics Available

                                                                                                                                                See Varnish Metrics.

                                                                                                                                                Result in the Monitor UI

                                                                                                                                                6.3 -

                                                                                                                                                (Legacy) Create a Custom App Check

                                                                                                                                                We are sunsetting application checks in favor of Monitoring Integrations.

                                                                                                                                                Application checks are integrations that allow the Sysdig agent to poll specific metrics exposed by any application, and the built-in app checks currently supported are listed on the App Checks main page. Many other Java-based applications are also supported out-of-the-box.

                                                                                                                                                If your application is not already supported though, you have a few options:

1. Use Prometheus, StatsD, or JMX to collect custom metrics.

2. Send a request to support@sysdig.com, and we’ll do our best to add support for your application.

                                                                                                                                                3. Create your own check by following the instructions below.

                                                                                                                                                If you do write a custom check, let us know. We love hearing about how our users extend Sysdig Monitor, and we can also consider embedding your app check automatically in the Sysdig agent.

                                                                                                                                                See also Understanding the Agent Config Files for details on accessing and editing the agent configuration files in general.

                                                                                                                                                Check Anatomy

Essentially, an app check is a Python class that extends AgentCheck:

from checks import AgentCheck

class MyCustomCheck(AgentCheck):
    # namespaces of the monitored process to join
    # right now we support 'net', 'mnt' and 'uts'
    # list only the minimum necessary namespaces to join
    # usually 'net' is enough, in which case you can omit the variable entirely
    # NEEDED_NS = ( 'net', )

    # def __init__(self, name, init_config, agentConfig):
    #     '''
    #     Optional, define it if you need custom initialization;
    #     remember to accept these parameters and pass them to the superclass
    #     '''
    #     AgentCheck.__init__(self, name, init_config, agentConfig)
    #     self.myvar = None

    def check(self, instance):
        '''
        This function gets called to perform the check.
        Connect to the application, parse the metrics and add them to aggregation using
        superclass methods like `self.gauge(metric_name, value, tags)`
        '''
        server_port = instance['port']  # configuration from dragent.yaml plus agent-provided defaults
        self.gauge("testmetric", 1)
                                                                                                                                                

Put this file into /opt/draios/lib/python/checks.custom.d (create the directory if it is not present) and it will be available to the Sysdig agent. To run your check, you need to supply configuration information in the agent’s config file, dragent.yaml, as is done with bundled checks:

app_checks:
  - name: voltdb # check name, must be unique
    # name of your .py file; if it is the same as the check name you can omit it
    # check_module: voltdb
    pattern: # pattern to match the application
      comm: java
      arg: org.voltdb.VoltDB
    conf:
      port: 21212 # any key/value config you need; passed to the `check(self, instance)` function
                                                                                                                                                

                                                                                                                                                Check Interface Detail

                                                                                                                                                As you can see, the most important piece of the check interface is the check function. The function declaration is:

                                                                                                                                                    def check(self, instance)
                                                                                                                                                

                                                                                                                                                instance is a dict containing the configuration of the check. It will contain all the attributes found in the conf: section in dragent.yaml plus the following:

• name: The check’s unique name.

                                                                                                                                                • ports: An array of all listening ports of the process.

                                                                                                                                                • port: The first listening port of the process.

These attributes are available as defaults and allow you to configure your check automatically. Values set in the conf: section take priority over these defaults, as in the sketch below.
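
For illustration only, here is a minimal sketch of how a check can read the merged configuration inside check(); the metric name and the timeout key are hypothetical, not part of the agent interface:

from checks import AgentCheck

class PortAwareCheck(AgentCheck):
    def check(self, instance):
        # Defaults injected by the agent; any key set in the conf: section
        # of dragent.yaml overrides them.
        name = instance['name']                # unique check name
        port = instance.get('port')            # first listening port, or the conf: override
        timeout = instance.get('timeout', 5)   # hypothetical extra key from conf:
        self.gauge("example.configured_port", port, ["check:" + name])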

                                                                                                                                                Inside the check function you can call these methods to send metrics:

                                                                                                                                                self.gauge(metric_name, value, tags) # Sample a gauge metric
                                                                                                                                                
                                                                                                                                                self.rate(metric_name, value, tags) # Sample a point, with the rate calculated at the end of the check
                                                                                                                                                
                                                                                                                                                self.increment(metric_name, value, tags) # Increment a counter metric
                                                                                                                                                
                                                                                                                                                self.decrement(metric_name, value, tags) # Decrement a counter metric
                                                                                                                                                
                                                                                                                                                self.histogram(metric_name, value, tags) # Sample a histogram metric
                                                                                                                                                
                                                                                                                                                self.count(metric_name, value, tags) # Sample a raw count metric
                                                                                                                                                
                                                                                                                                                self.monotonic_count(metric_name, value, tags) # Sample an increasing counter metric
                                                                                                                                                

The most commonly used are gauge and rate. Besides the metric_name and value parameters, which are self-explanatory, you can also add tags to your metric using this format:

                                                                                                                                                tags = [ "key:value", "key2:value2", "key_without_value"]
                                                                                                                                                

It is an array of strings representing tags, in either simple or key/value form. They can be used in Sysdig Monitor for graph segmentation.

You can also send service checks, which are on/off metrics, using this interface:

                                                                                                                                                self.service_check(name, status, tags)
                                                                                                                                                

                                                                                                                                                Where status can be:

                                                                                                                                                • AgentCheck.OK

                                                                                                                                                • AgentCheck.WARNING

                                                                                                                                                • AgentCheck.CRITICAL

                                                                                                                                                • AgentCheck.UNKNOWN
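
As an illustration only, the following sketch combines these calls; the metric and service check names are hypothetical, and a plain TCP connect is used as the health probe:

import socket

from checks import AgentCheck

class ExampleServiceCheck(AgentCheck):
    def check(self, instance):
        host = instance.get('host', '127.0.0.1')
        port = instance['port']
        tags = ["host:%s" % host, "port:%s" % port]

        try:
            # Hypothetical health probe: a plain TCP connect to the monitored port
            sock = socket.create_connection((host, port), timeout=5)
            sock.close()
            self.service_check("example.can_connect", AgentCheck.OK, tags)
            self.gauge("example.up", 1, tags)
        except socket.error:
            self.service_check("example.can_connect", AgentCheck.CRITICAL, tags)
            self.gauge("example.up", 0, tags)

        # A value sampled with rate(): the agent turns consecutive samples into
        # a per-second rate (the value here is just a placeholder).
        self.rate("example.requests_total", 42, tags)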

                                                                                                                                                Testing

To test your check, you can launch Sysdig app checks from the command line; this avoids running the full agent and lets you iterate faster:

                                                                                                                                                # from /opt/draios directory
                                                                                                                                                ./bin/sdchecks runCheck <check_unique_name> <process_pid> [<process_vpid>] [<process_port>]
                                                                                                                                                
• check_unique_name: The check name as defined in the config file.

• pid: The process PID as seen from the host.

• vpid: Optional; the process PID as seen inside the container. Defaults to 1.

• port: Optional; the port where the process is listening. Defaults to None.

                                                                                                                                                Example:

                                                                                                                                                ./bin/sdchecks runCheck redis 1254 1 6379
                                                                                                                                                5658:INFO:Starting
                                                                                                                                                5658:INFO:Container support: True
                                                                                                                                                5658:INFO:Run AppCheck for {'ports': [6379], 'pid': 5625, 'check': 'redis', 'vpid': 1}
                                                                                                                                                Conf: {'port': 6379, 'socket_timeout': 5, 'host': '127.0.0.1', 'name': 'redis', 'ports': [6379]}
                                                                                                                                                Metrics: # metrics array
                                                                                                                                                Checks: # metrics check
                                                                                                                                                Exception: None # exceptions
                                                                                                                                                

The output is intentionally raw so that you can better debug what the check is doing.

                                                                                                                                                6.4 -

                                                                                                                                                (Legacy) Create Per-Container Custom App Checks

                                                                                                                                                We are sunsetting application checks in favor of Monitoring Integrations.

Sysdig supports adding custom application check-script configurations for each individual container in the infrastructure. This avoids multiple edits and entries to achieve container-specific customization. In particular, it lets PaaS environments work smarter by delegating check configuration to the application teams.

                                                                                                                                                See also Understanding the Agent Config Files for details on accessing and editing the agent configuration files in general.

                                                                                                                                                How It Works

The SYSDIG_AGENT_CONF environment variable stores a YAML-formatted configuration for your app check and is used to match app check configurations.

All original app_checks are available, and the syntax is the same as for dragent.yaml. You can add the environment variable directly to the Dockerfile.

                                                                                                                                                Example with Dockerfile

This example defines a per-container app check for Redis. Normally you would have a YAML-formatted entry installed in the agent’s /opt/draios/etc/dragent.yaml file that looks like this:

                                                                                                                                                app_checks:
                                                                                                                                                  - name: redis
                                                                                                                                                    check_module: redisdb
                                                                                                                                                    pattern:
                                                                                                                                                      comm: redis-server
                                                                                                                                                    conf:
                                                                                                                                                      host: 127.0.0.1
                                                                                                                                                      port: "{port}"
                                                                                                                                                      password: protected
                                                                                                                                                

For the per-container method, convert the above entry and add it to the Dockerfile via the SYSDIG_AGENT_CONF environment variable:

                                                                                                                                                FROM redis
                                                                                                                                                # This config file adds a password for accessing redis instance
                                                                                                                                                ADD redis.conf /
                                                                                                                                                
                                                                                                                                                ENV SYSDIG_AGENT_CONF { "app_checks": [{ "name": "redis", "check_module": "redisdb", "pattern": {"comm": "redis-server"}, "conf": { "host": "127.0.0.1", "port": "6379", "password": "protected"} }] }
                                                                                                                                                ENTRYPOINT ["redis-server"]
                                                                                                                                                CMD [ "/redis.conf" ]
                                                                                                                                                

                                                                                                                                                Example with Docker CLI

You can pass the variable when starting a container with docker run using the -e/--env flag, or inject it using orchestration systems like Kubernetes:

                                                                                                                                                PER_CONTAINER_CONF='{ "app_checks": [{ "name": "redis", "check_module": "redisdb", "pattern": {"comm": "redis-server"}, "conf": { "host": "127.0.0.1", "port": "6379", "password": "protected"} }] }'
                                                                                                                                                
                                                                                                                                                docker run --name redis -v /tmp/redis.conf:/etc/redis.conf -e SYSDIG_AGENT_CONF="${PER_CONTAINER_CONF}" -d redis /etc/redis.conf