From d347fd7215c81fa156a0d9258bcd1d41cba24766 Mon Sep 17 00:00:00 2001
From: Timotej Lazar
Date: Fri, 17 Oct 2025 21:18:45 +0200
Subject: [PATCH 1/3] Add prometheus role

Configured to include job definitions from /etc/prometheus/conf.d/*.yml.
---
 roles/prometheus/README.md            |  3 +++
 roles/prometheus/files/prometheus.yml |  2 ++
 roles/prometheus/handlers/main.yml    |  5 +++++
 roles/prometheus/tasks/main.yml       | 21 +++++++++++++++++++++
 4 files changed, 31 insertions(+)
 create mode 100644 roles/prometheus/README.md
 create mode 100644 roles/prometheus/files/prometheus.yml
 create mode 100644 roles/prometheus/handlers/main.yml
 create mode 100644 roles/prometheus/tasks/main.yml

diff --git a/roles/prometheus/README.md b/roles/prometheus/README.md
new file mode 100644
index 0000000..13309e1
--- /dev/null
+++ b/roles/prometheus/README.md
@@ -0,0 +1,3 @@
+Install and configure prometheus.
+
+Job definitions should be placed in /etc/prometheus/conf.d by roles using this one.
diff --git a/roles/prometheus/files/prometheus.yml b/roles/prometheus/files/prometheus.yml
new file mode 100644
index 0000000..2d54a25
--- /dev/null
+++ b/roles/prometheus/files/prometheus.yml
@@ -0,0 +1,2 @@
+scrape_config_files:
+  - "conf.d/*.yml"
diff --git a/roles/prometheus/handlers/main.yml b/roles/prometheus/handlers/main.yml
new file mode 100644
index 0000000..c85cc91
--- /dev/null
+++ b/roles/prometheus/handlers/main.yml
@@ -0,0 +1,5 @@
+- name: reload prometheus
+  service:
+    name: prometheus
+    state: reloaded
+  when: "'handler' not in ansible_skip_tags"
diff --git a/roles/prometheus/tasks/main.yml b/roles/prometheus/tasks/main.yml
new file mode 100644
index 0000000..9e44c4d
--- /dev/null
+++ b/roles/prometheus/tasks/main.yml
@@ -0,0 +1,21 @@
+- name: Install packages
+  package:
+    name:
+      - prometheus
+
+- name: Create directory for prometheus configs
+  file:
+    path: /etc/prometheus/conf.d
+    state: directory
+
+- name: Configure prometheus
+  copy:
+    dest: /etc/prometheus/
+    src: prometheus.yml
+  notify: reload prometheus
+
+- name: Enable prometheus service
+  service:
+    name: prometheus
+    enabled: true
+    state: started
From 6600a6fa36aae5f26c8311c6437f6464933de89b Mon Sep 17 00:00:00 2001
From: Timotej Lazar
Date: Fri, 17 Oct 2025 21:34:25 +0200
Subject: [PATCH 2/3] telegraf: drop agent configuration stuff

Servers will be switched to Prometheus. Telegraf remains as a SNMP
proxy (see following commit).
---
 roles/telegraf/tasks/debian.yml         | 31 ---------------
 roles/telegraf/tasks/main.yml           | 31 ++++++++++-----
 roles/telegraf/tasks/token.yml          | 53 -------------------------
 roles/telegraf/templates/output.conf.j2 |  5 ---
 setup.yml                               |  1 -
 5 files changed, 21 insertions(+), 100 deletions(-)
 delete mode 100644 roles/telegraf/tasks/debian.yml
 delete mode 100644 roles/telegraf/tasks/token.yml
 delete mode 100644 roles/telegraf/templates/output.conf.j2

diff --git a/roles/telegraf/tasks/debian.yml b/roles/telegraf/tasks/debian.yml
deleted file mode 100644
index a53989f..0000000
--- a/roles/telegraf/tasks/debian.yml
+++ /dev/null
@@ -1,31 +0,0 @@
-- name: Add influxdb repository
-  deb822_repository:
-    name: influxdata
-    uris: https://repos.influxdata.com/debian
-    suites: stable
-    components: main
-    architectures: amd64
-    signed_by: https://repos.influxdata.com/influxdata-archive.key
-  notify: update package cache
-
-- meta: flush_handlers
-
-- name: Install telegraf
-  package:
-    name: telegraf
-
-- name: Configure telegraf
-  when: not ansible_check_mode
-  template:
-    dest: /etc/telegraf/telegraf.d/output.conf
-    src: output.conf.j2
-    owner: telegraf
-    group: telegraf
-    mode: 0640
-  notify: restart telegraf
-
-- name: Enable telegraf
-  service:
-    name: telegraf
-    enabled: true
-    state: started
diff --git a/roles/telegraf/tasks/main.yml b/roles/telegraf/tasks/main.yml
index 0193aa9..6d4fea3 100644
--- a/roles/telegraf/tasks/main.yml
+++ b/roles/telegraf/tasks/main.yml
@@ -1,11 +1,22 @@
-- name: Get influxdb info
-  set_fact:
-    influxdb_info: '{{ lookup("passwordstore", "vm/"~influxdb_host, returnall=true, missing="empty") | from_yaml }}'
-
-- name: Create influxdb token for this host
-  include_tasks: token.yml
-  when: 'not ansible_check_mode and "influxdb_token" not in password'
-
-- name: Install telegraf on Debian
-  include_tasks: debian.yml
+- name: Add telegraf package repo on Debian
   when: ansible_os_family == "Debian"
+  deb822_repository:
+    name: influxdata
+    uris: https://repos.influxdata.com/debian
+    suites: stable
+    components: main
+    architectures: amd64
+    signed_by: https://repos.influxdata.com/influxdata-archive.key
+  notify: update package cache
+
+- meta: flush_handlers
+
+- name: Install telegraf
+  package:
+    name: telegraf
+
+- name: Enable telegraf service
+  service:
+    name: telegraf
+    enabled: true
+    state: started
diff --git a/roles/telegraf/tasks/token.yml b/roles/telegraf/tasks/token.yml
deleted file mode 100644
index 6343331..0000000
--- a/roles/telegraf/tasks/token.yml
+++ /dev/null
@@ -1,53 +0,0 @@
-- name: Get influxdb organization ID
-  delegate_to: localhost
-  uri:
-    url: '{{ influxdb_info.influxdb_url }}/api/v2/orgs'
-    headers:
-      Authorization: Token {{ influxdb_info.influxdb_operator_token }}
-  register: response
-
-- name: Parse influxdb orgID
-  set_fact:
-    influxdb_orgID: '{{ response.json.orgs | selectattr("name", "==", influxdb_info.influxdb_org) | map(attribute="id") | first }}'
-
-- name: Get influxdb bucket ID
-  delegate_to: localhost
-  uri:
-    url: '{{ influxdb_info.influxdb_url }}/api/v2/buckets?orgID={{ influxdb_orgID }}'
-    headers:
-      Authorization: Token {{ influxdb_info.influxdb_operator_token }}
-  register: response
-
-- name: Parse influxdb bucketID
-  set_fact:
-    influxdb_bucketID: '{{ response.json.buckets | selectattr("name", "==", "servers") | map(attribute="id") | first }}'
-
-- name: Create influxdb token
-  delegate_to: localhost
-  uri:
-    url: '{{ influxdb_info.influxdb_url }}/api/v2/authorizations'
-    method: POST
-    body_format: json
-    status_code: 201
-    headers:
-      Authorization: Token {{ influxdb_info.influxdb_operator_token }}
-      Content-Type: application/json
-    body: |
-      {
-        "description": "{{ inventory_hostname }}",
-        "orgID": "{{ influxdb_orgID }}",
-        "permissions": [{ "action": "write", "resource": { "type": "buckets", "id": "{{ influxdb_bucketID }}" } }]
-      }
-  register: response
-
-- name: Parse influxdb token
-  set_fact:
-    influxdb_token: '{{ response.json.token }}'
-
-# Ansible’s passwordstore lookup plugin should be able to do that but is pretty broken,
-# so we do it manually.
-- name: Store influxdb token in password store
-  delegate_to: localhost
-  command:
-    cmd: 'pass insert --force --multiline {{ ("vm/" if is_virtual else "host/")~inventory_hostname }}'
-    stdin: '{{ password | to_nice_yaml(sort_keys=false) }}influxdb_token: {{ influxdb_token }}'
diff --git a/roles/telegraf/templates/output.conf.j2 b/roles/telegraf/templates/output.conf.j2
deleted file mode 100644
index 25ba0de..0000000
--- a/roles/telegraf/templates/output.conf.j2
+++ /dev/null
@@ -1,5 +0,0 @@
-[[outputs.influxdb_v2]]
-  urls = ["{{ influxdb_info.influxdb_url }}"]
-  organization = "{{ influxdb_info.influxdb_org }}"
-  bucket = "{{ influxdb_info.influxdb_bucket }}"
-  token = "{{ influxdb_token | default(password.influxdb_token) }}"
diff --git a/setup.yml b/setup.yml
index f5457c3..8bdb494 100644
--- a/setup.yml
+++ b/setup.yml
@@ -16,7 +16,6 @@
 - hosts: ceph-*
   roles:
     - frr
-    - telegraf
     - ceph
 
 - hosts: mgmt-gw
From da3db8cc02cd2718c0364feff7a3bc15afa5fd29 Mon Sep 17 00:00:00 2001
From: Timotej Lazar
Date: Fri, 17 Oct 2025 22:00:42 +0200
Subject: [PATCH 3/3] Add collector role

Sets up prometheus to pull metrics, with telegraf to process SNMP data.
---
 roles/collector/README.md                     |  18 +++
 roles/collector/files/prometheus-snmp.yml     |   4 +
 roles/collector/files/prometheus.nft          |  12 ++
 roles/collector/files/telegraf.nft            |   9 ++
 roles/collector/handlers/main.yml             |  17 +++
 roles/collector/meta/main.yml                 |   3 +
 roles/collector/tasks/main.yml                |  37 ++++++
 roles/collector/templates/output.conf.j2      |   4 +
 .../collector/templates/prometheus-job.yml.j2 |  20 ++++
 roles/collector/templates/snmp.conf.j2        | 106 ++++++++++++++++++
 setup.yml                                     |   1 +
 11 files changed, 231 insertions(+)
 create mode 100644 roles/collector/README.md
 create mode 100644 roles/collector/files/prometheus-snmp.yml
 create mode 100644 roles/collector/files/prometheus.nft
 create mode 100644 roles/collector/files/telegraf.nft
 create mode 100644 roles/collector/handlers/main.yml
 create mode 100644 roles/collector/meta/main.yml
 create mode 100644 roles/collector/tasks/main.yml
 create mode 100644 roles/collector/templates/output.conf.j2
 create mode 100644 roles/collector/templates/prometheus-job.yml.j2
 create mode 100644 roles/collector/templates/snmp.conf.j2

diff --git a/roles/collector/README.md b/roles/collector/README.md
new file mode 100644
index 0000000..77c4f93
--- /dev/null
+++ b/roles/collector/README.md
@@ -0,0 +1,18 @@
+Set up metric collection with prometheus and telegraf as the SNMP proxy.
+
+NetBox config context should contain the lists `prometheus_hosts` and `snmp_hosts` with job definitions. Each entry should define `name` and `nb_filter` used to query hosts from NetBox. For example:
+
+    {
+        "prometheus_hosts": [
+            {
+                "name": "classroom",
+                "nb_filter": "role=desktop-computer status=active location=classroom"
+            }
+        ],
+        "snmp_hosts": [
+            {
+                "name": "switch",
+                "nb_filter": "role=switch name__isw=sw- status=active status=staged status=planned"
+            }
+        ]
+    }
diff --git a/roles/collector/files/prometheus-snmp.yml b/roles/collector/files/prometheus-snmp.yml
new file mode 100644
index 0000000..b996b24
--- /dev/null
+++ b/roles/collector/files/prometheus-snmp.yml
@@ -0,0 +1,4 @@
+scrape_configs:
+  - job_name: "snmp"
+    static_configs:
+      - targets: ["localhost:9273"]
diff --git a/roles/collector/files/prometheus.nft b/roles/collector/files/prometheus.nft
new file mode 100644
index 0000000..e0e8280
--- /dev/null
+++ b/roles/collector/files/prometheus.nft
@@ -0,0 +1,12 @@
+table inet filter {
+    chain output {
+        type filter hook output priority 0; policy accept;
+
+        skuid prometheus ct state { established, related } accept
+        skuid prometheus th dport domain accept
+        skuid prometheus tcp dport { 443, 9100 } accept comment "prometheus"
+        skuid prometheus ip daddr 127.0.0.1 tcp dport 9090 accept comment "prometheus self"
+        skuid prometheus ip daddr 127.0.0.1 tcp dport 9273 accept comment "telegraf snmp exporter"
+        skuid prometheus drop
+    }
+}
diff --git a/roles/collector/files/telegraf.nft b/roles/collector/files/telegraf.nft
new file mode 100644
index 0000000..3af3fed
--- /dev/null
+++ b/roles/collector/files/telegraf.nft
@@ -0,0 +1,9 @@
+table inet filter {
+    chain output {
+        type filter hook output priority 0; policy accept;
+
+        skuid telegraf ct state { established, related } accept
+        skuid telegraf th dport snmp accept
+        skuid telegraf drop
+    }
+}
diff --git a/roles/collector/handlers/main.yml b/roles/collector/handlers/main.yml
new file mode 100644
index 0000000..5d02988
--- /dev/null
+++ b/roles/collector/handlers/main.yml
@@ -0,0 +1,17 @@
+- name: reload nftables
+  service:
+    name: nftables
+    state: reloaded
+  when: "'handler' not in ansible_skip_tags"
+
+- name: reload prometheus
+  service:
+    name: prometheus
+    state: reloaded
+  when: "'handler' not in ansible_skip_tags"
+
+- name: restart telegraf
+  service:
+    name: telegraf
+    state: restarted # seems to crash on reloads
+  when: "'handler' not in ansible_skip_tags"
diff --git a/roles/collector/meta/main.yml b/roles/collector/meta/main.yml
new file mode 100644
index 0000000..368b911
--- /dev/null
+++ b/roles/collector/meta/main.yml
@@ -0,0 +1,3 @@
+dependencies:
+  - role: prometheus
+  - role: telegraf
diff --git a/roles/collector/tasks/main.yml b/roles/collector/tasks/main.yml
new file mode 100644
index 0000000..a5176ba
--- /dev/null
+++ b/roles/collector/tasks/main.yml
@@ -0,0 +1,37 @@
+# since this host likely has access to sensitive networks,
+# restrict the destinations where monitoring daemons can connect
+- name: Set up outbound firewall rules
+  copy:
+    dest: "/etc/nftables.d/{{ item }}.nft"
+    src: "{{ item }}.nft"
+  loop:
+    - prometheus
+    - telegraf
+  notify: reload nftables
+
+- name: Configure telegraf to expose SNMP data as prometheus metrics
+  template:
+    dest: "/etc/telegraf.conf.d/{{ item }}.conf"
+    src: "{{ item }}.conf.j2"
+    owner: telegraf
+    group: telegraf
+    mode: "0640" # snmp.conf contains the SNMPv3 credentials
+  loop:
+    - output
+    - snmp
+  notify: restart telegraf
+
+- name: Configure prometheus to pull SNMP data
+  copy:
+    dest: "/etc/prometheus/conf.d/snmp.yml"
+    src: "prometheus-snmp.yml"
+  notify: reload prometheus
+
+- name: Configure prometheus to pull custom data
+  template:
+    dest: "/etc/prometheus/conf.d/{{ item.name }}.yml"
+    src: "prometheus-job.yml.j2"
+  loop: "{{ prometheus_hosts }}"
+  loop_control:
+    label: "{{ item.name }}"
+  notify: reload prometheus
diff --git a/roles/collector/templates/output.conf.j2 b/roles/collector/templates/output.conf.j2
new file mode 100644
index 0000000..6dbe53c
--- /dev/null
+++ b/roles/collector/templates/output.conf.j2
@@ -0,0 +1,4 @@
+[[outputs.prometheus_client]]
+listen = "127.0.0.1:9273"
+expiration_interval = "300s"
+tagexclude = ["mac?"] # temporary tags we don’t need to export
diff --git a/roles/collector/templates/prometheus-job.yml.j2 b/roles/collector/templates/prometheus-job.yml.j2
new file mode 100644
index 0000000..7e24f05
--- /dev/null
+++ b/roles/collector/templates/prometheus-job.yml.j2
@@ -0,0 +1,20 @@
+{% set devices = query("netbox.netbox.nb_lookup", "devices", api_filter=item.nb_filter, raw_data=true)
+  | selectattr("primary_ip")
+  | map(attribute="name")
+  | map("extract", hostvars) -%}
+
+scrape_configs:
+  - job_name: "{{ item.name }}"
+    relabel_configs:
+      - source_labels: [__address__]
+        regex: '([^.]+).*'
+        target_label: name
+        replacement: ${1}
+    static_configs:
+      - targets:
+{% for address in devices
+  | selectattr("dns_name", "defined")
+  | map(attribute="dns_name")
+  | reject("none") | sort | unique %}
+        - "{{ address }}:9100"
+{% endfor %}
diff --git a/roles/collector/templates/snmp.conf.j2 b/roles/collector/templates/snmp.conf.j2
new file mode 100644
index 0000000..dd4624d
--- /dev/null
+++ b/roles/collector/templates/snmp.conf.j2
@@ -0,0 +1,106 @@
+[[inputs.snmp]]
+  interval = "300s"
+  agent_host_tag = "source"
+  agents = [
+{% for item in snmp_hosts %}
+{% for address in query("netbox.netbox.nb_lookup", "devices", api_filter=item.nb_filter, raw_data=true)
+  | selectattr("primary_ip4") | map(attribute="primary_ip4.address")
+  | ipaddr("address") %}
+    "{{ address }}",
+{% endfor %}
+{% endfor %}
+  ]
+  version = 3
+  sec_level = "authPriv"
+  auth_protocol = "SHA"
+  priv_protocol = "DES"
+  sec_name = "{{ password.snmp_user }}"
+  auth_password = "{{ password.snmp_pass }}"
+  priv_password = "{{ password.snmp_pass }}"
+
+  fieldexclude = ["ifDescr", "ifSpecific"]
+
+  [[inputs.snmp.field]]
+    name = "hostname"
+    oid = "RFC1213-MIB::sysName.0"
+    is_tag = true
+
+  # interface table
+  [[inputs.snmp.table]]
+    name = "iface"
+    oid = "IF-MIB::ifTable"
+    inherit_tags = ["hostname"]
+
+    [[inputs.snmp.table.field]]
+      oid = "IF-MIB::ifName"
+
+    # rename counters to make prometheus happy
+    [[inputs.snmp.table.field]]
+      name = "in_total"
+      oid = "IF-MIB::ifInOctets"
+
+    [[inputs.snmp.table.field]]
+      name = "in_err_total"
+      oid = "IF-MIB::ifInErrors"
+
+    [[inputs.snmp.table.field]]
+      name = "out_total"
+      oid = "IF-MIB::ifOutOctets"
+
+    [[inputs.snmp.table.field]]
+      name = "out_err_total"
+      oid = "IF-MIB::ifOutErrors"
+
+  # MAC address table per VLAN
+  [[inputs.snmp.table]]
+    name = "fdb"
+    index_as_tag = true
+    inherit_tags = ["hostname"]
+
+    [[inputs.snmp.table.field]]
+      name = "ifIndex"
+      oid = "Q-BRIDGE-MIB::dot1qTpFdbPort"
+      is_tag = true
+
+    [[inputs.snmp.table.field]]
+      name = "entry"
+      oid = "Q-BRIDGE-MIB::dot1qTpFdbStatus"
+
+# look up interface name from its index
+# seems we need another SNMP connection for that
+[[processors.snmp_lookup]]
+  namepass = ["fdb", "iface"]
+  agent_tag = "source"
+  index_tag = "ifIndex"
+
+  version = 3
+  sec_level = "authPriv"
+  auth_protocol = "SHA"
+  priv_protocol = "DES"
+  sec_name = "{{ password.snmp_user }}"
+  auth_password = "{{ password.snmp_pass }}"
+  priv_password = "{{ password.snmp_pass }}"
+
+  [[processors.snmp_lookup.tag]]
+    oid = "IF-MIB::ifName"
+    name = "iface"
+
+# split index 42.1.2.3.10.11.12 into tags "vlan" and "mac1" to "mac6"
+[[processors.regex]]
+  namepass = ["fdb"]
+
+  [[processors.regex.tags]]
+    key = "index"
+    pattern = '^(?P<vlan>\d+)\.(?P<mac1>\d+)\.(?P<mac2>\d+)\.(?P<mac3>\d+)\.(?P<mac4>\d+)\.(?P<mac5>\d+)\.(?P<mac6>\d+)'
+
+# combine "mac*" tags into a single tag "mac" with value 01:02:03:0a:0b:0c
+[[processors.template]]
+  namepass = ["fdb"]
+  tagexclude = ["ifIndex", "index"]
+  tag = "mac"
+{% raw %}
+  template = '''{{
+    printf "%02x:%02x:%02x:%02x:%02x:%02x"
+    (.Tag "mac1"|int) (.Tag "mac2"|int) (.Tag "mac3"|int) (.Tag "mac4"|int) (.Tag "mac5"|int) (.Tag "mac6"|int)
+  }}'''
+{% endraw %}
diff --git a/setup.yml b/setup.yml
index 8bdb494..273ffcc 100644
--- a/setup.yml
+++ b/setup.yml
@@ -21,6 +21,7 @@
 - hosts: mgmt-gw
   roles:
     - radvd # we are router for mgmt networks
+    - collector
 
 - hosts: proxmox-backup
   roles: