Nagios
There are various customizations you can do to tailor the deployment of OpenStack Nagios. You can find those below.
General Parameters
conf.httpd
Type: string
Description:
ServerRoot \"/usr/local/apache2\" Listen 80 LoadModule mpm_event_module modules/mod_mpm_event.so LoadModule authn_file_module modules/mod_authn_file.so LoadModule authn_core_module modules/mod_authn_core.so LoadModule authz_host_module modules/mod_authz_host.so LoadModule authz_groupfile_module modules/mod_authz_groupfile.so LoadModule authz_user_module modules/mod_authz_user.so LoadModule authz_core_module modules/mod_authz_core.so LoadModule access_compat_module modules/mod_access_compat.so LoadModule auth_basic_module modules/mod_auth_basic.so LoadModule ldap_module modules/mod_ldap.so LoadModule authnz_ldap_module modules/mod_authnz_ldap.so LoadModule reqtimeout_module modules/mod_reqtimeout.so LoadModule filter_module modules/mod_filter.so LoadModule proxy_html_module modules/mod_proxy_html.so LoadModule log_config_module modules/mod_log_config.so LoadModule env_module modules/mod_env.so LoadModule headers_module modules/mod_headers.so LoadModule setenvif_module modules/mod_setenvif.so LoadModule version_module modules/mod_version.so LoadModule proxy_module modules/mod_proxy.so LoadModule proxy_connect_module modules/mod_proxy_connect.so LoadModule proxy_http_module modules/mod_proxy_http.so LoadModule proxy_balancer_module modules/mod_proxy_balancer.so LoadModule slotmem_shm_module modules/mod_slotmem_shm.so LoadModule slotmem_plain_module modules/mod_slotmem_plain.so LoadModule unixd_module modules/mod_unixd.so LoadModule status_module modules/mod_status.so LoadModule autoindex_module modules/mod_autoindex.so <IfModule unixd_module> User daemon Group daemon </IfModule> <Directory /> AllowOverride none Require all denied </Directory> <Files \".ht*\"> Require all denied </Files> ErrorLog /dev/stderr LogLevel warn <IfModule log_config_module> LogFormat \"%a %l %u %t \\\"%r\\\" %>s %b \\\"%{Referer}i\\\" \\\"%{User-Agent}i\\\"\" combined LogFormat \"%{X-Forwarded-For}i %l %u %t \\\"%r\\\" %>s %b \\\"%{Referer}i\\\" \\\"%{User-Agent}i\\\"\" proxy LogFormat \"%h 
%l %u %t \\\"%r\\\" %>s %b\" common <IfModule logio_module> LogFormat \"%a %l %u %t \\\"%r\\\" %>s %b \\\"%{Referer}i\\\" \\\"%{User-Agent}i\\\" %I %O\" combinedio </IfModule> SetEnvIf X-Forwarded-For \"^.*\\..*\\..*\\..*\" forwarded CustomLog /dev/stdout common CustomLog /dev/stdout combined CustomLog /dev/stdout proxy env=forwarded </IfModule> <Directory \"/usr/local/apache2/cgi-bin\"> AllowOverride None Options None Require all granted </Directory> <IfModule headers_module> RequestHeader unset Proxy early </IfModule> <IfModule proxy_html_module> Include conf/extra/proxy-html.conf </IfModule> <VirtualHost *:80> <Location /> ProxyPass http://localhost:{{ tuple \"nagios\" \"internal\" \"nagios\" . | include \"helm-toolkit.endpoints.endpoint_port_lookup\" }}/ ProxyPassReverse http://localhost:{{ tuple \"nagios\" \"internal\" \"nagios\" . | include \"helm-toolkit.endpoints.endpoint_port_lookup\" }}/ </Location> <Proxy *> AuthName \"Nagios\" AuthType Basic AuthBasicProvider file ldap AuthUserFile /usr/local/apache2/conf/.htpasswd AuthLDAPBindDN {{ .Values.endpoints.ldap.auth.admin.bind }} AuthLDAPBindPassword {{ .Values.endpoints.ldap.auth.admin.password }} AuthLDAPURL {{ tuple \"ldap\" \"default\" \"ldap\" . | include \"helm-toolkit.endpoints.keystone_endpoint_uri_lookup\" | quote }} Require valid-user </Proxy> </VirtualHost>
conf.nagios.additionalPlugins
Type: list
Description:
[]
conf.nagios.cgi.template
Type: string
Description:
action_url_target=_blank authorized_for_all_host_commands=* authorized_for_all_hosts=* authorized_for_all_service_commands=* authorized_for_all_services=* authorized_for_configuration_information=* authorized_for_system_commands=nagiosadmin authorized_for_system_information=* default_statuswrl_layout=4 enable_page_tour=0 escape_html_tags=1 lock_author_names=1 main_config_file=/opt/nagios/etc/nagios.cfg navbar_search_for_addresses=1 navbar_search_for_aliases=1 notes_url_target=_blank physical_html_path=/opt/nagios/share ping_syntax=/bin/ping -n -U -c 5 $HOSTADDRESS$ refresh_rate=90 result_limit=100 show_context_help=0 url_html_path=/nagios use_authentication=0 use_pending_states=1 use_ssl_authentication=0
conf.nagios.nagios.template
Type: string
Description:
accept_passive_host_checks=1 accept_passive_service_checks=1 additional_freshness_latency=15 allow_empty_hostgroup_assignment=1 auto_reschedule_checks=0 auto_rescheduling_interval=30 auto_rescheduling_window=180 bare_update_check=0 cached_host_check_horizon=15 cached_service_check_horizon=15 {{- $objectKeys := keys .Values.conf.nagios.objects -}} {{- range $object := $objectKeys }} cfg_file=/opt/nagios/etc/{{$object}}.cfg {{- end }} cfg_file=/opt/nagios/etc/objects/commands.cfg cfg_file=/opt/nagios/etc/objects/contacts.cfg cfg_file=/opt/nagios/etc/objects/timeperiods.cfg cfg_file=/opt/nagios/etc/objects/templates.cfg cfg_file=/opt/nagios/etc/conf.d/nagios-hosts.cfg check_external_commands=1 check_for_orphaned_hosts=1 check_for_orphaned_services=1 check_for_updates=1 check_host_freshness=0 check_result_path=/opt/nagios/var/spool/checkresults check_result_reaper_frequency=10 check_service_freshness=1 check_workers=4 command_file=/opt/nagios/var/rw/nagios.cmd daemon_dumps_core=0 date_format=us debug_file=/opt/nagios/var/nagios.debug debug_level=0 debug_verbosity=1 enable_environment_macros=0 enable_event_handlers=1 enable_flap_detection=1 enable_notifications=1 enable_predictive_host_dependency_checks=1 enable_predictive_service_dependency_checks=1 event_broker_options=-1 event_handler_timeout=60 execute_host_checks=1 execute_service_checks=1 high_host_flap_threshold=20 high_service_flap_threshold=20 host_check_timeout=60 host_freshness_check_interval=60 host_inter_check_delay_method=s illegal_macro_output_chars=`~$&|'<>\" interval_length=1 lock_file=/var/run/nagios.lock log_archive_path=/opt/nagios/var/log/archives log_current_states=1 log_event_handlers=1 log_external_commands=1 log_file=/opt/nagios/var/log/nagios.log log_host_retries=1 log_initial_states=0 log_notifications=0 log_passive_checks=1 log_rotation_method=d log_service_retries=1 low_host_flap_threshold=5 low_service_flap_threshold=5 max_check_result_file_age=3600 max_check_result_reaper_time=30 
max_concurrent_checks=10 max_debug_file_size=1e+06 max_host_check_spread=30 max_service_check_spread=30 nagios_group=nagios nagios_user=nagios notification_timeout=60 object_cache_file=/opt/nagios/var/objects.cache obsess_over_hosts=0 obsess_over_services=0 ocsp_timeout=5 passive_host_checks_are_soft=0 perfdata_timeout=5 precached_object_file=/opt/nagios/var/objects.precache process_performance_data=0 resource_file=/opt/nagios/etc/resource.cfg retain_state_information=1 retained_contact_host_attribute_mask=0 retained_contact_service_attribute_mask=0 retained_host_attribute_mask=0 retained_process_host_attribute_mask=0 retained_process_service_attribute_mask=0 retained_service_attribute_mask=0 retention_update_interval=60 service_check_timeout=60 service_freshness_check_interval=60 service_inter_check_delay_method=s service_interleave_factor=s soft_state_dependencies=0 state_retention_file=/opt/nagios/var/retention.dat status_file=/opt/nagios/var/status.dat status_update_interval=10 temp_file=/opt/nagios/var/nagios.tmp temp_path=/tmp translate_passive_host_checks=0 use_aggressive_host_checking=0 use_large_installation_tweaks=0 use_regexp_matching=1 use_retained_program_state=1 use_retained_scheduling_info=1 use_syslog=0 use_true_regexp_matching=0
conf.nagios.notification.http.primary_target
Type: string
Description:
“127.0.0.1:3904/events”
conf.nagios.notification.http.secondary_target
Type: string
Description:
“127.0.0.1:3904/events”
conf.nagios.notification.snmp.primary_target
Type: string
Description:
“127.0.0.1:15162”
conf.nagios.notification.snmp.secondary_target
Type: string
Description:
“127.0.0.1:15162”
conf.nagios.objects.base.template
Type: string
Description:
define host { address 127.0.0.1 alias Prometheus Monitoring check_command check-prometheus-host-alive host_name {{ tuple \"monitoring\" \"public\" . | include \"helm-toolkit.endpoints.hostname_short_endpoint_lookup\" }} hostgroups prometheus-hosts use linux-server } define contact { alias notifying contact contact_name notifying_contact host_notification_options d,u,r,f,s host_notification_period 24x7 name notifying_contact register 0 service_notification_options w,u,c,r,f,s service_notification_period 24x7 } define contact { alias snmp contact contact_name snmp_notifying_contact host_notification_commands send_host_snmp_trap name snmp_notifying_contact service_notification_commands send_service_snmp_trap use notifying_contact } define contact { alias HTTP contact contact_name http_notifying_contact host_notification_commands send_host_http_post name http_notifying_contact service_notification_commands send_service_http_post use notifying_contact } define contactgroup { alias SNMP and HTTP notifying group contactgroup_name snmp_and_http_notifying_contact_group members snmp_notifying_contact,http_notifying_contact } define hostgroup { alias Prometheus Virtual Host hostgroup_name prometheus-hosts } define hostgroup { alias all hostgroup_name all } define hostgroup { alias base-os hostgroup_name base-os } define command { command_line $USER1$/send_service_trap.sh '$USER8$' '$HOSTNAME$' '$SERVICEDESC$' $SERVICESTATEID$ '$SERVICEOUTPUT$' '$USER4$' '$USER5$' command_name send_service_snmp_trap } define command { command_line $USER1$/send_host_trap.sh '$USER8$' '$HOSTNAME$' $HOSTSTATEID$ '$HOSTOUTPUT$' '$USER4$' '$USER5$' command_name send_host_snmp_trap } define command { command_line $USER1$/send_http_post_event.py --type service --hostname '$HOSTNAME$' --servicedesc '$SERVICEDESC$' --state_id $SERVICESTATEID$ --output '$SERVICEOUTPUT$' --monitoring_hostname '$HOSTNAME$' --primary_url '$USER6$' --secondary_url '$USER7$' command_name send_service_http_post } define 
command { command_line $USER1$/send_http_post_event.py --type host --hostname '$HOSTNAME$' --state_id $HOSTSTATEID$ --output '$HOSTOUTPUT$' --monitoring_hostname '$HOSTNAME$' --primary_url '$USER6$' --secondary_url '$USER7$' command_name send_host_http_post } define command { command_line $USER1$/check_rest_get_api.py --url $USER2$ --warning_response_seconds 5 --critical_response_seconds 10 command_name check-prometheus-host-alive } define command { command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname '$ARG1$' --labels_csv '$ARG2$' --msg_format '$ARG3$' --ok_message '$ARG4$' command_name check_prom_alert_with_labels } define command { command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname '$ARG1$' --msg_format '$ARG2$' --ok_message '$ARG3$' command_name check_prom_alert } define service { check_interval 60 contact_groups snmp_and_http_notifying_contact_group flap_detection_enabled 0 name notifying_service notification_interval 120 process_perf_data 0 register 0 retry_interval 30 use generic-service }
conf.nagios.objects.ceph.template
Type: string
Description:
define service { check_command check_prom_alert!prom_exporter_ceph_unavailable!CRITICAL- CEPH exporter is not collecting metrics for alerting!OK- CEPH exporter metrics are available. hostgroup_name prometheus-hosts service_description Prometheus-exporter_CEPH use generic-service } define command { command_line $USER1$/check_exporter_health_metric.py --exporter_api $USER10$ --health_metric ceph_health_status --critical 2 --warning 1 command_name check_ceph_health } define service { check_command check_ceph_health check_interval 300 hostgroup_name base-os service_description CEPH_health use notifying_service } define service { check_command check_prom_alert!ceph_monitor_quorum_low!CRITICAL- ceph monitor quorum does not exist!OK- ceph monitor quorum exists check_interval 60 hostgroup_name prometheus-hosts service_description CEPH_quorum use notifying_service } define service { check_command check_prom_alert!ceph_monitor_quorum_absent!CRITICAL- ceph monitor quorum does not exist!OK- ceph monitor quorum exists check_interval 60 hostgroup_name prometheus-hosts service_description CEPH_quorum use notifying_service } define service { check_command check_prom_alert!ceph_cluster_usage_high!CRITICAL- ceph cluster storage is more than 80 percent!OK- ceph storage is less than 80 percent check_interval 60 hostgroup_name prometheus-hosts service_description CEPH_storage-usage use notifying_service } define service { check_command check_prom_alert!ceph_placement_group_degrade_pct_high!CRITICAL- ceph cluster PGs down are more than 80 percent!OK- ceph PG degradation is less than 80 percent check_interval 60 hostgroup_name prometheus-hosts service_description CEPH_PGs-degradation use notifying_service } define service { check_command check_prom_alert!ceph_osd_down!CRITICAL- One or more CEPH OSDs are down for more than 5 minutes!OK- All the CEPH OSDs are up check_interval 60 hostgroup_name prometheus-hosts service_description CEPH_OSDs-down use notifying_service } define service { 
check_command check_prom_alert_with_labels!node_ntp_clock_skew_high!ceph-mon=\"enabled\"!CRITICAL- CEPH clock skew is more than 2 seconds!OK- CEPH clock skew is less than 2 seconds check_interval 60 hostgroup_name prometheus-hosts service_description CEPH_Clock-skew use notifying_service }
conf.nagios.objects.kubernetes.template
Type: string
Description:
define service { check_command check_prom_alert!prom_exporter_calico_unavailable!CRITICAL- Calico exporter is not collecting metrics for alerting!OK- Calico exporter metrics are available. hostgroup_name prometheus-hosts service_description Prometheus-exporter_Calico use generic-service } define service { check_command check_prom_alert!prom_exporter_kube_state_metrics_unavailable!CRITICAL- kube-state-metrics exporter is not collecting metrics for alerting!OK- kube-state-metrics exporter metrics are available. hostgroup_name prometheus-hosts service_description Prometheus-exporter_Kube-state-metrics use generic-service } define service { check_command check_prom_alert!K8SNodesNotReady!CRITICAL- One or more nodes are not ready. check_interval 60 hostgroup_name prometheus-hosts service_description Nodes_health use generic-service } define service { check_command check_prom_alert_with_labels!kube_statefulset_replicas_unavailable!statefulset=\"prometheus\"!statefulset {statefulset} has lesser than configured replicas check_interval 60 hostgroup_name prometheus-hosts service_description Prometheus_replica-count use notifying_service } define service { check_command check_prom_alert_with_labels!kube_statefulset_replicas_unavailable!statefulset=\"alertmanager\"!statefulset {statefulset} has lesser than configured replicas check_interval 60 hostgroup_name prometheus-hosts service_description PrometheusAlertmanager_replica-count use notifying_service } define service { check_command check_prom_alert!kube_statefulset_replicas_unavailable!CRITICAL- statefulset {statefulset} has lesser than configured replicas!OK- All statefulsets have configured amount of replicas check_interval 60 hostgroup_name prometheus-hosts service_description Statefulset_replica-count use notifying_service } define service { check_command check_prom_alert!daemonsets_misscheduled!CRITICAL- Daemonset {daemonset} is incorrectly scheudled!OK- No daemonset misscheduling detected check_interval 60 
hostgroup_name prometheus-hosts service_description Daemonset_misscheduled use notifying_service } define service { check_command check_prom_alert!daemonsets_not_scheduled!CRITICAL- Daemonset {daemonset} is missing to be scheduled in some nodes!OK- All daemonset scheduling is as desired check_interval 60 hostgroup_name prometheus-hosts service_description Daemonset_not-scheduled use notifying_service } define service { check_command check_prom_alert!daemonset_pods_unavailable!CRITICAL- Daemonset {daemonset} has pods unavailable!OK- All daemonset pods available check_interval 60 hostgroup_name prometheus-hosts service_description Daemonset_pods-unavailable use notifying_service } define service { check_command check_prom_alert!deployment_replicas_unavailable!CRITICAL- Deployment {deployment} has less than desired replicas!OK- All deployments have desired replicas check_interval 60 hostgroup_name prometheus-hosts service_description Deployment_replicas-unavailable use notifying_service } define service { check_command check_prom_alert!volume_claim_capacity_high_utilization!CRITICAL- Volume claim {persistentvolumeclaim} has exceed 80% utilization!OK- All volume claims less than 80% utilization check_interval 60 hostgroup_name prometheus-hosts service_description Volume_claim_high_utilization use notifying_service } define service { check_command check_prom_alert!rollingupdate_deployment_replica_less_than_spec_max_unavailable!CRITICAL- Deployment {deployment} has less than desired replicas during a rolling update!OK- All deployments have desired replicas check_interval 60 hostgroup_name prometheus-hosts service_description RollingUpdate_Deployment-replicas-unavailable use notifying_service } define service { check_command check_prom_alert!job_status_failed!CRITICAL- Job {exported_job} has failed!OK- No Job failures check_interval 60 hostgroup_name prometheus-hosts service_description Job_status-failed use notifying_service } define service { check_command 
check_prom_alert!pod_status_pending!CRITICAL- Pod {pod} in namespace {namespace} has been in pending status for more than 10 minutes!OK- No pods in pending status check_interval 60 hostgroup_name prometheus-hosts service_description Pod_status-pending use notifying_service } define service { check_command check_prom_alert!pod_status_error_image_pull!CRITICAL- Pod {pod} in namespace {namespace} has been in errpr status of ErrImagePull for more than 10 minutes!OK- No pods in error status check_interval 60 hostgroup_name prometheus-hosts service_description Pod_status-error-image-pull use notifying_service } define service { check_command check_prom_alert! pod_status_error_image_pull_backoff!CRITICAL- Pod {pod} in namespace {namespace} has been in errpr status of ImagePullBackOff for more than 10 minutes!OK- No pods in error status check_interval 60 hostgroup_name prometheus-hosts service_description Pod_status-error-image-pull use notifying_service } define service { check_command check_prom_alert! 
pod_error_config_error!CRITICAL- Pod {pod} in namespace {namespace} has been in errpr status of CreateContainerConfigError for more than 10 minutes!OK- No pods in error status check_interval 60 hostgroup_name prometheus-hosts service_description Pod_status-error-image-pull use notifying_service } define service { check_command check_prom_alert!pod_error_crash_loop_back_off!CRITICAL- Pod {pod} in namespace {namespace} has been in error status of CrashLoopBackOff for more than 10 minutes!OK- No pods in crashLoopBackOff status check_interval 60 hostgroup_name prometheus-hosts service_description Pod_status-crashLoopBackOff use notifying_service } define service { check_command check_prom_alert!replicaset_missing_replicas!CRITICAL- Replicaset {replicaset} is missing replicas!OK- No replicas missing from replicaset check_interval 60 hostgroup_name prometheus-hosts service_description Replicaset_missing-replicas use notifying_service } define service { check_command check_prom_alert!pod_container_terminated!CRITICAL- pod {pod} in namespace {namespace} has a container in terminated state!OK- pod container status looks good check_interval 60 hostgroup_name prometheus-hosts service_description Pod_status-container-terminated use notifying_service } define service { check_command check_prom_alert_with_labels!etcd_HighNumberOfFailedHTTPRequests!method=\"DELETE\"!CRITICAL- ETCD {instance} has a high HTTP DELETE operations failure!OK- ETCD at {instance} has low or no failures for HTTP DELETE check_interval 60 hostgroup_name prometheus-hosts service_description ETCD_high-http-delete-failures use notifying_service } define service { check_command check_prom_alert_with_labels!etcd_HighNumberOfFailedHTTPRequests!method=~\"GET|QGET\"!CRITICAL- ETCD {instance} has a high HTTP GET operations failure!OK- ETCD at {instance} has low or no failures for HTTP GET check_interval 60 hostgroup_name prometheus-hosts service_description ETCD_high-http-get-failures use notifying_service } define 
service { check_command check_prom_alert_with_labels!etcd_HighNumberOfFailedHTTPRequests!method=\"PUT\"!CRITICAL- ETCD {instance} has a high HTTP PUT operations failure!OK- ETCD at {instance} has low or no failures for HTTP PUT check_interval 60 hostgroup_name prometheus-hosts service_description ETCD_high-http-update-failures use notifying_service } define service { check_command check_prom_alert!calico_iptable_save_errors_high_1h!CRITICAL- Felix instance {instance} has seen high iptable save errors within the last hour!OK- iptables save errors are none or low hostgroup_name prometheus-hosts service_description Calico_iptables-save-errors use notifying_service } define service { check_command check_prom_alert!calico_ipset_errors_high_1h!CRITICAL- Felix instance {instance} has seen high ipset errors within the last hour!OK- ipset errors are none or low hostgroup_name prometheus-hosts service_description Calico_ipset-errors use notifying_service } define service { check_command check_prom_alert!calico_datapane_iface_msg_batch_size_high_5m!CRITICAL- Felix instance {instance} has seen a high value of dataplane interface message batch size!OK- dataplane interface message batch size are low hostgroup_name prometheus-hosts service_description Calico_interface-message-batch-size use notifying_service } define service { check_command check_prom_alert!calico_datapane_address_msg_batch_size_high_5m!CRITICAL- Felix instance {instance} has seen a high value of dataplane address message batch size!OK- dataplane address message batch size are low hostgroup_name prometheus-hosts service_description Calico_address-message-batch-size use notifying_service } define service { check_command check_prom_alert!calico_datapane_failures_high_1h!CRITICAL- Felix instance {instance} has seen high dataplane failures within the last hour!OK- datapane failures are none or low hostgroup_name prometheus-hosts service_description Calico_datapane_failures_high use notifying_service 
}
conf.nagios.objects.node.template
Type: string
Description:
define service { check_command check_prom_alert!prom_exporter_node_unavailable!CRITICAL- Node exporter is not collecting metrics for alerting!OK- Node exporter metrics are available. hostgroup_name prometheus-hosts service_description Prometheus-exporter_Node use generic-service } define command { command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_filesystem_full_in_4h' --labels_csv 'instance=~\"$HOSTADDRESS$.*\"' --msg_format 'CRITICAL- Mountpoint {mountpoint} will be full in four hours' --ok_message 'OK- All mountpoints usage rate is normal' command_name check_filespace_mounts-usage-rate-fullin4hrs } define command { command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_filesystem_full_80percent' --labels_csv 'instance=~\"$HOSTADDRESS$.*\"' --msg_format 'CRITICAL- Mountpoint {mountpoint} is more than 80 pecent full' --ok_message 'OK- All mountpoints usage is normal' command_name check_filespace_mounts-usage } define command { command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_load1_90percent' --labels_csv 'instance=~\"$HOSTADDRESS$.*\"' --msg_format 'CRITICAL- Node load average has been more than 90% for the pash hour' --ok_message 'OK- Node load average is normal' command_name check_node_loadavg } define command { command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_cpu_util_90percent' --labels_csv 'instance=~\"$HOSTADDRESS$.*\"' --msg_format 'CRITICAL- Node CPU utilization has been more than 90% for the pash hour' --ok_message 'OK- Node cpu utilization is normal' command_name check_node_cpu_util } define command { command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_network_conntrack_usage_80percent' --labels_csv 'instance=~\"$HOSTADDRESS$.*\"' --msg_format 'CRITICAL- Node network connections are more than 90% in use' --ok_message 'OK- Network connection utilization is 
normal' command_name check_network_connections } define command { command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_high_memory_load' --labels_csv 'instance=~\"$HOSTADDRESS$.*\"' --msg_format 'CRITICAL- Node memory usage is more than 85%' --ok_message 'OK- Node memory usage is less than 85%' command_name check_memory_usage } define command { command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_disk_write_latency' --labels_csv 'instance=~\"$HOSTADDRESS$.*\"' --msg_format 'CRITICAL- Disk write latency is high' --ok_message 'OK- Node disk write latency is normal' command_name check_disk_write_latency } define command { command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_disk_read_latency' --labels_csv 'instance=~\"$HOSTADDRESS$.*\"' --msg_format 'CRITICAL- Disk read latency is high' --ok_message 'OK- Node disk read latency is normal' command_name check_disk_read_latency } define command { command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_entropy_available_low' --labels_csv 'instance=~\"$HOSTADDRESS$.*\"' --msg_format 'CRITICAL- System has low entropy availability' --ok_message 'OK- System entropy availability is sufficient' command_name check_entropy_availability } define command { command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_filedescriptors_full_in_3h' --labels_csv 'instance=~\"$HOSTADDRESS$.*\"' --msg_format 'CRITICAL- at current consumption rate no free file descriptors will be available in 3hrs.' --ok_message 'OK- System file descriptor consumption is ok.' command_name check_filedescriptor_usage_rate } define command { command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_hwmon_high_cpu_temp' --labels_csv 'instance=~\"$HOSTADDRESS$.*\"' --msg_format 'CRITICAL- CPU temperature is 90 percent of critical temperature.' 
--ok_message 'OK- CPU temperatures are normal.' command_name check_hwmon_high_cpu_temp } define command { command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_high_network_drop_rcv' --labels_csv 'instance=~\"$HOSTADDRESS$.*\"' --msg_format 'CRITICAL- Host system has an unusally high drop in network reception.' --ok_message 'OK- network packet receive drops not high.' command_name check_network_receive_drop_high } define command { command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_high_network_drop_send' --labels_csv 'instance=~\"$HOSTADDRESS$.*\"' --msg_format 'CRITICAL- Host system has an unusally high drop in network transmission.' --ok_message 'OK- network packet tramsmit drops not high.' command_name check_network_transmit_drop_high } define command { command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_high_network_errs_rcv' --labels_csv 'instance=~\"$HOSTADDRESS$.*\"' --msg_format 'CRITICAL- Host system has an unusally high error rate in network reception.' --ok_message 'OK- network reception errors not high.' command_name check_network_receive_errors_high } define command { command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_high_network_errs_send' --labels_csv 'instance=~\"$HOSTADDRESS$.*\"' --msg_format 'CRITICAL- Host system has an unusally high error rate in network transmission.' --ok_message 'OK- network transmission errors not high.' command_name check_network_transmit_errors_high } define command { command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_vmstat_paging_rate_high' --labels_csv 'instance=~\"$HOSTADDRESS$.*\"' --msg_format 'CRITICAL- Memory paging rate over 5 minutes is high.' --ok_message 'OK- Memory paging rate over 5 minutes is ok.' 
command_name check_vmstat_paging_rate } define command { command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_xfs_block_allocation_high' --labels_csv 'instance=~\"$HOSTADDRESS$.*\"' --msg_format 'CRITICAL- XFS block allocation is more than 80 percent of available.' --ok_message 'OK- XFS block allocation is less than 80 percent of available.' command_name check_xfs_block_allocation } define command { command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_network_bond_slaves_down' --labels_csv 'instance=~\"$HOSTADDRESS$.*\"' --msg_format 'CRITICAL- {master} is missing slave interfaces.' --ok_message 'OK- Network bonds have slave interfaces functional.' command_name check_network_bond_status } define command { command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_numa_memory_used' --labels_csv 'instance=~\"$HOSTADDRESS$.*\"' --msg_format 'CRITICAL- NUMA memory usage is more than 80 percent of available.' --ok_message 'OK- NUMA memory usage is normal.' command_name check_numa_memory_usage } define command { command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_ntp_clock_skew_high' --labels_csv 'instance=~\"$HOSTADDRESS$.*\"' --msg_format 'CRITICAL- NTP clock skew is more than 2 seconds.' --ok_message 'OK- NTP clock skew is less than 2 seconds.' 
command_name check_ntp_sync } define service { check_command check_filespace_mounts-usage-rate-fullin4hrs check_interval 60 hostgroup_name base-os service_description Filespace_mounts-usage-rate-fullin4hrs use notifying_service } define service { check_command check_filespace_mounts-usage check_interval 60 hostgroup_name base-os service_description Filespace_mounts-usage use notifying_service } define service { check_command check_node_loadavg hostgroup_name base-os service_description CPU_Load-average use notifying_service } define service { check_command check_node_cpu_util hostgroup_name base-os service_description CPU_utilization use notifying_service } define service { check_command check_network_connections hostgroup_name base-os service_description Network_connections use notifying_service } define service { check_command check_memory_usage hostgroup_name base-os service_description Memory_usage use notifying_service } define service { check_command check_disk_write_latency hostgroup_name base-os service_description Disk_write-latency use notifying_service } define service { check_command check_disk_read_latency hostgroup_name base-os service_description Disk_read-latency use notifying_service } define service { check_command check_entropy_availability hostgroup_name base-os service_description Entropy_availability use notifying_service } define service { check_command check_filedescriptor_usage_rate hostgroup_name base-os service_description FileDescriptors_usage-rate-high use notifying_service } define service { check_command check_hwmon_high_cpu_temp hostgroup_name base-os service_description HW_cpu-temp-high use notifying_service } define service { check_command check_network_receive_drop_high hostgroup_name base-os service_description Network_receive-drop-high use notifying_service } define service { check_command check_network_transmit_drop_high hostgroup_name base-os service_description Network_transmit-drop-high use notifying_service } define service 
{ check_command check_network_receive_errors_high hostgroup_name base-os service_description Network_receive-errors-high use notifying_service } define service { check_command check_network_transmit_errors_high hostgroup_name base-os service_description Network_transmit-errors-high use notifying_service } define service { check_command check_vmstat_paging_rate hostgroup_name base-os service_description Memory_vmstat-paging-rate use notifying_service } define service { check_command check_xfs_block_allocation hostgroup_name base-os service_description XFS_block-allocation use notifying_service } define service { check_command check_network_bond_status hostgroup_name base-os service_description Network_bondstatus use notifying_service } define service { check_command check_numa_memory_usage hostgroup_name base-os service_description Memory_NUMA-usage use notifying_service } define service { check_command check_ntp_sync hostgroup_name base-os service_description NTP_sync use notifying_service }
conf.nagios.query_es_clauses
Type: string
Description:
nil
dependencies.dynamic.common.jobs[0]
Type: string
Description:
“nagios-image-repo-sync”
dependencies.dynamic.common.services[0].endpoint
Type: string
Description:
“node”
dependencies.dynamic.common.services[0].service
Type: string
Description:
“local_image_registry”
dependencies.static.image_repo_sync.services[0].endpoint
Type: string
Description:
“internal”
dependencies.static.image_repo_sync.services[0].service
Type: string
Description:
“local_image_registry”
dependencies.static.nagios.services
Type: string
Description:
nil
dependencies.static.tests.services[0].endpoint
Type: string
Description:
“internal”
dependencies.static.tests.services[0].service
Type: string
Description:
“nagios”
endpoints.ceph_mgr.host_fqdn_override.default
Type: string
Description:
nil
endpoints.ceph_mgr.hosts.default
Type: string
Description:
“ceph-mgr”
endpoints.ceph_mgr.namespace
Type: string
Description:
nil
endpoints.ceph_mgr.port.metrics.default
Type: int
Description:
9283
endpoints.ceph_mgr.port.mgr.default
Type: int
Description:
7000
endpoints.ceph_mgr.scheme.default
Type: string
Description:
“http”
endpoints.cluster_domain_suffix
Type: string
Description:
“cluster.local”
endpoints.elasticsearch.auth.admin.password
Type: string
Description:
“changeme”
endpoints.elasticsearch.auth.admin.username
Type: string
Description:
“admin”
endpoints.elasticsearch.host_fqdn_override.default
Type: string
Description:
nil
endpoints.elasticsearch.hosts.default
Type: string
Description:
“elasticsearch-logging”
endpoints.elasticsearch.name
Type: string
Description:
“elasticsearch”
endpoints.elasticsearch.namespace
Type: string
Description:
nil
endpoints.elasticsearch.path.default
Type: string
Description:
“/”
endpoints.elasticsearch.port.http.default
Type: int
Description:
80
endpoints.elasticsearch.scheme.default
Type: string
Description:
“http”
endpoints.ldap.auth.admin.bind
Type: string
Description:
“cn=admin,dc=cluster,dc=local”
endpoints.ldap.auth.admin.password
Type: string
Description:
“password”
endpoints.ldap.host_fqdn_override.default
Type: string
Description:
nil
endpoints.ldap.hosts.default
Type: string
Description:
“ldap”
endpoints.ldap.path.default
Type: string
Description:
“/ou=People,dc=cluster,dc=local”
endpoints.ldap.port.ldap.default
Type: int
Description:
389
endpoints.ldap.scheme.default
Type: string
Description:
“ldap”
endpoints.local_image_registry.host_fqdn_override.default
Type: string
Description:
nil
endpoints.local_image_registry.hosts.default
Type: string
Description:
“localhost”
endpoints.local_image_registry.hosts.internal
Type: string
Description:
“docker-registry”
endpoints.local_image_registry.hosts.node
Type: string
Description:
“localhost”
endpoints.local_image_registry.name
Type: string
Description:
“docker-registry”
endpoints.local_image_registry.namespace
Type: string
Description:
“docker-registry”
endpoints.local_image_registry.port.registry.node
Type: int
Description:
5000
endpoints.monitoring.auth.admin.password
Type: string
Description:
“changeme”
endpoints.monitoring.auth.admin.secret.tls.internal
Type: string
Description:
“prometheus-tls-api”
endpoints.monitoring.auth.admin.username
Type: string
Description:
“admin”
endpoints.monitoring.host_fqdn_override.default
Type: string
Description:
nil
endpoints.monitoring.hosts.default
Type: string
Description:
“prom-metrics”
endpoints.monitoring.hosts.public
Type: string
Description:
“prometheus”
endpoints.monitoring.name
Type: string
Description:
“prometheus”
endpoints.monitoring.path.default
Type: string
Description:
nil
endpoints.monitoring.port.http.default
Type: int
Description:
80
endpoints.monitoring.scheme.default
Type: string
Description:
“http”
endpoints.nagios.auth.admin.password
Type: string
Description:
“password”
endpoints.nagios.auth.admin.username
Type: string
Description:
“nagiosadmin”
endpoints.nagios.host_fqdn_override.default
Type: string
Description:
nil
endpoints.nagios.hosts.default
Type: string
Description:
“nagios-metrics”
endpoints.nagios.hosts.public
Type: string
Description:
“nagios”
endpoints.nagios.name
Type: string
Description:
“nagios”
endpoints.nagios.namespace
Type: string
Description:
nil
endpoints.nagios.path.default
Type: string
Description:
nil
endpoints.nagios.port.http.default
Type: int
Description:
80
endpoints.nagios.port.nagios.default
Type: int
Description:
8000
endpoints.nagios.scheme.default
Type: string
Description:
“http”
endpoints.oci_image_registry.auth.enabled
Type: bool
Description:
false
endpoints.oci_image_registry.auth.nagios.password
Type: string
Description:
“password”
endpoints.oci_image_registry.auth.nagios.username
Type: string
Description:
“nagios”
endpoints.oci_image_registry.host_fqdn_override.default
Type: string
Description:
nil
endpoints.oci_image_registry.hosts.default
Type: string
Description:
“localhost”
endpoints.oci_image_registry.name
Type: string
Description:
“oci-image-registry”
endpoints.oci_image_registry.namespace
Type: string
Description:
“oci-image-registry”
endpoints.oci_image_registry.port.registry.default
Type: string
Description:
nil
images.local_registry.active
Type: bool
Description:
false
images.local_registry.exclude[0]
Type: string
Description:
“dep_check”
images.local_registry.exclude[1]
Type: string
Description:
“image_repo_sync”
images.pull_policy
Type: string
Description:
“IfNotPresent”
images.tags.apache_proxy
Type: string
Description:
“docker.io/library/httpd:2.4”
images.tags.dep_check
Type: string
Description:
“quay.io/stackanetes/kubernetes-entrypoint:v0.2.1”
images.tags.image_repo_sync
Type: string
Description:
“docker.io/library/docker:17.07.0”
images.tags.nagios
Type: string
Description:
“docker.io/openstackhelm/nagios:latest-ubuntu_jammy”
images.tags.selenium_tests
Type: string
Description:
“docker.io/openstackhelm/osh-selenium:latest-ubuntu_jammy”
labels.job.node_selector_key
Type: string
Description:
“openstack-control-plane”
labels.job.node_selector_value
Type: string
Description:
“enabled”
labels.nagios.node_selector_key
Type: string
Description:
“openstack-control-plane”
labels.nagios.node_selector_value
Type: string
Description:
“enabled”
labels.test.node_selector_key
Type: string
Description:
“openstack-control-plane”
labels.test.node_selector_value
Type: string
Description:
“enabled”
manifests.certificates
Type: bool
Description:
false
manifests.configmap_additional_plugins
Type: bool
Description:
false
manifests.configmap_bin
Type: bool
Description:
true
manifests.configmap_etc
Type: bool
Description:
true
manifests.deployment
Type: bool
Description:
true
manifests.ingress
Type: bool
Description:
true
manifests.job_image_repo_sync
Type: bool
Description:
true
manifests.network_policy
Type: bool
Description:
false
manifests.pod_helm_test
Type: bool
Description:
true
manifests.secret_ingress_tls
Type: bool
Description:
true
manifests.secret_nagios
Type: bool
Description:
true
manifests.secret_registry
Type: bool
Description:
true
manifests.service
Type: bool
Description:
true
manifests.service_ingress
Type: bool
Description:
true
network.nagios.ingress.annotations.”nginx.ingress.kubernetes.io/affinity”
Type: string
Description:
“cookie”
network.nagios.ingress.annotations.”nginx.ingress.kubernetes.io/configuration-snippet”
Type: string
Description:
more_set_headers \"X-Content-Type-Options: 'nosniff'\"; more_set_headers \"X-Frame-Options: SAMEORIGIN\"; more_set_headers \"Content-Security-Policy: script-src 'self'\"; more_set_headers \"X-XSS-Protection: 1; mode=block\";

network.nagios.ingress.annotations.”nginx.ingress.kubernetes.io/rewrite-target”
Type: string
Description:
“/”
network.nagios.ingress.annotations.”nginx.ingress.kubernetes.io/session-cookie-expires”
Type: string
Description:
“600”
network.nagios.ingress.annotations.”nginx.ingress.kubernetes.io/session-cookie-hash”
Type: string
Description:
“sha1”
network.nagios.ingress.annotations.”nginx.ingress.kubernetes.io/session-cookie-max-age”
Type: string
Description:
“600”
network.nagios.ingress.annotations.”nginx.ingress.kubernetes.io/session-cookie-name”
Type: string
Description:
“kube-ingress-session-nagios”
network.nagios.ingress.classes.cluster
Type: string
Description:
“nginx-cluster”
network.nagios.ingress.classes.namespace
Type: string
Description:
“nginx”
network.nagios.ingress.public
Type: bool
Description:
true
network.nagios.node_port.enabled
Type: bool
Description:
false
network.nagios.node_port.port
Type: int
Description:
30925
network_policy.nagios.egress[0]
Type: object
Description:
{}
network_policy.nagios.ingress[0]
Type: object
Description:
{}
pod.affinity.anti.topologyKey.default
Type: string
Description:
“kubernetes.io/hostname”
pod.affinity.anti.type.default
Type: string
Description:
“preferredDuringSchedulingIgnoredDuringExecution”
pod.affinity.anti.weight.default
Type: int
Description:
10
pod.lifecycle.termination_grace_period.nagios.timeout
Type: int
Description:
30
pod.lifecycle.upgrades.deployments.pod_replacement_strategy
Type: string
Description:
“RollingUpdate”
pod.lifecycle.upgrades.deployments.revision_history
Type: int
Description:
3
pod.lifecycle.upgrades.deployments.rolling_update.max_surge
Type: int
Description:
3
pod.lifecycle.upgrades.deployments.rolling_update.max_unavailable
Type: int
Description:
1
pod.probes.monitoring.apache_proxy.readiness.enabled
Type: bool
Description:
true
pod.probes.monitoring.apache_proxy.readiness.params.initialDelaySeconds
Type: int
Description:
20
pod.probes.monitoring.apache_proxy.readiness.params.periodSeconds
Type: int
Description:
10
pod.probes.monitoring.nagios.readiness.enabled
Type: bool
Description:
true
pod.probes.monitoring.nagios.readiness.params.initialDelaySeconds
Type: int
Description:
60
pod.probes.monitoring.nagios.readiness.params.periodSeconds
Type: int
Description:
30
pod.probes.monitoring.nagios.readiness.params.timeoutSeconds
Type: int
Description:
10
pod.replicas.nagios
Type: int
Description:
1
pod.resources.apache_proxy.limits.cpu
Type: string
Description:
“2000m”
pod.resources.apache_proxy.limits.memory
Type: string
Description:
“1024Mi”
pod.resources.apache_proxy.requests.cpu
Type: string
Description:
“100m”
pod.resources.apache_proxy.requests.memory
Type: string
Description:
“128Mi”
pod.resources.enabled
Type: bool
Description:
false
pod.resources.jobs.image_repo_sync.limits.cpu
Type: string
Description:
“2000m”
pod.resources.jobs.image_repo_sync.limits.memory
Type: string
Description:
“1024Mi”
pod.resources.jobs.image_repo_sync.requests.cpu
Type: string
Description:
“100m”
pod.resources.jobs.image_repo_sync.requests.memory
Type: string
Description:
“128Mi”
pod.resources.jobs.tests.limits.cpu
Type: string
Description:
“2000m”
pod.resources.jobs.tests.limits.memory
Type: string
Description:
“1024Mi”
pod.resources.jobs.tests.requests.cpu
Type: string
Description:
“100m”
pod.resources.jobs.tests.requests.memory
Type: string
Description:
“128Mi”
pod.resources.nagios.limits.cpu
Type: string
Description:
“2000m”
pod.resources.nagios.limits.memory
Type: string
Description:
“1024Mi”
pod.resources.nagios.requests.cpu
Type: string
Description:
“100m”
pod.resources.nagios.requests.memory
Type: string
Description:
“128Mi”
pod.security_context.monitoring.container.apache_proxy.readOnlyRootFilesystem
Type: bool
Description:
false
pod.security_context.monitoring.container.define_nagios_hosts.readOnlyRootFilesystem
Type: bool
Description:
false
pod.security_context.monitoring.container.helm_tests.readOnlyRootFilesystem
Type: bool
Description:
true
pod.security_context.monitoring.container.nagios.readOnlyRootFilesystem
Type: bool
Description:
false
pod.security_context.monitoring.pod.runAsUser
Type: int
Description:
0
secrets.nagios.admin
Type: string
Description:
“nagios-admin-creds”
secrets.oci_image_registry.nagios
Type: string
Description:
“nagios-oci-image-registry-key”
secrets.tls.nagios.nagios.public
Type: string
Description:
“nagios-tls-public”
selenium_v4
Type: bool
Description:
true