From da3db8cc02cd2718c0364feff7a3bc15afa5fd29 Mon Sep 17 00:00:00 2001 From: Timotej Lazar Date: Fri, 17 Oct 2025 22:00:42 +0200 Subject: [PATCH] Add collector role Sets up prometheus to pull metrics, with telegraf to process SNMP data. --- roles/collector/README.md | 18 +++ roles/collector/files/prometheus-snmp.yml | 4 + roles/collector/files/prometheus.nft | 12 ++ roles/collector/files/telegraf.nft | 9 ++ roles/collector/handlers/main.yml | 17 +++ roles/collector/meta/main.yml | 3 + roles/collector/tasks/main.yml | 34 ++++++ roles/collector/templates/output.conf.j2 | 4 + .../collector/templates/prometheus-job.yml.j2 | 20 ++++ roles/collector/templates/snmp.conf.j2 | 106 ++++++++++++++++++ setup.yml | 1 + 11 files changed, 228 insertions(+) create mode 100644 roles/collector/README.md create mode 100644 roles/collector/files/prometheus-snmp.yml create mode 100644 roles/collector/files/prometheus.nft create mode 100644 roles/collector/files/telegraf.nft create mode 100644 roles/collector/handlers/main.yml create mode 100644 roles/collector/meta/main.yml create mode 100644 roles/collector/tasks/main.yml create mode 100644 roles/collector/templates/output.conf.j2 create mode 100644 roles/collector/templates/prometheus-job.yml.j2 create mode 100644 roles/collector/templates/snmp.conf.j2 diff --git a/roles/collector/README.md b/roles/collector/README.md new file mode 100644 index 0000000..77c4f93 --- /dev/null +++ b/roles/collector/README.md @@ -0,0 +1,18 @@ +Set up metric collection with prometheus and telegraf as the SNMP proxy. + +NetBox config context should contain the lists `prometheus_hosts` and `snmp_hosts` with job definitions. Each entry should define `name` and `nb_filter` used to query hosts from NetBox. 
For example: + + { + "prometheus_hosts": [ + { + "name": "classroom", + "nb_filter": "role=desktop-computer status=active location=classroom" + } + ], + "snmp_hosts": [ + { + "name": "switch", + "nb_filter": "role=switch name__isw=sw- status=active status=staged status=planned" + } + ] + } diff --git a/roles/collector/files/prometheus-snmp.yml b/roles/collector/files/prometheus-snmp.yml new file mode 100644 index 0000000..b996b24 --- /dev/null +++ b/roles/collector/files/prometheus-snmp.yml @@ -0,0 +1,4 @@ +scrape_configs: + - job_name: "snmp" + static_configs: + - targets: ["127.0.0.1:9273"] diff --git a/roles/collector/files/prometheus.nft b/roles/collector/files/prometheus.nft new file mode 100644 index 0000000..e0e8280 --- /dev/null +++ b/roles/collector/files/prometheus.nft @@ -0,0 +1,12 @@ +table inet filter { + chain output { + type filter hook output priority 0; policy accept; + + skuid prometheus ct state { established, related } accept + skuid prometheus th dport domain accept + skuid prometheus tcp dport { 443, 9100 } accept comment "prometheus" + skuid prometheus ip daddr 127.0.0.1 tcp dport 9090 accept comment "prometheus self" + skuid prometheus ip daddr 127.0.0.1 tcp dport 9273 accept comment "telegraf snmp exporter" + skuid prometheus drop + } +} diff --git a/roles/collector/files/telegraf.nft b/roles/collector/files/telegraf.nft new file mode 100644 index 0000000..3af3fed --- /dev/null +++ b/roles/collector/files/telegraf.nft @@ -0,0 +1,9 @@ +table inet filter { + chain output { + type filter hook output priority 0; policy accept; + + skuid telegraf ct state { established, related } accept + skuid telegraf th dport snmp accept + skuid telegraf drop + } +} diff --git a/roles/collector/handlers/main.yml b/roles/collector/handlers/main.yml new file mode 100644 index 0000000..5d02988 --- /dev/null +++ b/roles/collector/handlers/main.yml @@ -0,0 +1,17 @@ +- name: reload nftables + service: + name: nftables + state: reloaded + when: "'handler' not in 
ansible_skip_tags" + +- name: reload prometheus + service: + name: prometheus + state: reloaded + when: "'handler' not in ansible_skip_tags" + +- name: restart telegraf + service: + name: telegraf + state: restarted # restart instead of reload: telegraf seems to crash on reloads + when: "'handler' not in ansible_skip_tags" diff --git a/roles/collector/meta/main.yml b/roles/collector/meta/main.yml new file mode 100644 index 0000000..368b911 --- /dev/null +++ b/roles/collector/meta/main.yml @@ -0,0 +1,3 @@ +dependencies: + - role: prometheus + - role: telegraf diff --git a/roles/collector/tasks/main.yml b/roles/collector/tasks/main.yml new file mode 100644 index 0000000..a5176ba --- /dev/null +++ b/roles/collector/tasks/main.yml @@ -0,0 +1,34 @@ +# This host likely has access to sensitive networks, so restrict the +# destinations the monitoring daemons (prometheus, telegraf) can connect to. +- name: Set up outbound firewall rules + copy: + dest: "/etc/nftables.d/{{ item }}.nft" + src: "{{ item }}.nft" + loop: + - prometheus + - telegraf + notify: reload nftables + +- name: Configure telegraf to expose SNMP data as prometheus metrics + template: + dest: "/etc/telegraf.conf.d/{{ item }}.conf" + src: "{{ item }}.conf.j2" + loop: + - output + - snmp + notify: restart telegraf + +- name: Configure prometheus to pull SNMP data + copy: + dest: "/etc/prometheus/conf.d/snmp.yml" + src: "prometheus-snmp.yml" + notify: reload prometheus + +- name: Configure prometheus to pull custom data + template: + dest: "/etc/prometheus/conf.d/{{ item.name }}.yml" + src: "prometheus-job.yml.j2" + loop: "{{ prometheus_hosts }}" + loop_control: + label: "{{ item.name }}" + notify: reload prometheus diff --git a/roles/collector/templates/output.conf.j2 b/roles/collector/templates/output.conf.j2 new file mode 100644 index 0000000..6dbe53c --- /dev/null +++ b/roles/collector/templates/output.conf.j2 @@ -0,0 +1,4 @@ +[[outputs.prometheus_client]] +listen = "127.0.0.1:9273" +expiration_interval = "300s" +tagexclude = ["mac?"] # temporary tags we don’t 
need to export diff --git a/roles/collector/templates/prometheus-job.yml.j2 b/roles/collector/templates/prometheus-job.yml.j2 new file mode 100644 index 0000000..7e24f05 --- /dev/null +++ b/roles/collector/templates/prometheus-job.yml.j2 @@ -0,0 +1,20 @@ +{% set devices = query("netbox.netbox.nb_lookup", "devices", api_filter=item.nb_filter, raw_data=true) + | selectattr("primary_ip") + | map(attribute="name") + | map("extract", hostvars) -%} + +scrape_configs: + - job_name: "{{ item.name }}" + relabel_configs: + - source_labels: [__address__] + regex: '([^.]+).*' + target_label: name + replacement: ${1} + static_configs: + - targets: +{% for address in devices + | selectattr("dns_name", "defined") + | map(attribute="dns_name") + | reject("none") | sort | unique %} + - "{{ address }}:9100" +{% endfor %} diff --git a/roles/collector/templates/snmp.conf.j2 b/roles/collector/templates/snmp.conf.j2 new file mode 100644 index 0000000..dd4624d --- /dev/null +++ b/roles/collector/templates/snmp.conf.j2 @@ -0,0 +1,106 @@ +[[inputs.snmp]] + interval = "300s" + agent_host_tag = "source" + agents = [ +{% for item in snmp_hosts %} +{% for address in query("netbox.netbox.nb_lookup", "devices", api_filter=item.nb_filter, raw_data=true) + | selectattr("primary_ip4") | map(attribute="primary_ip4.address") + | ipaddr("address") %} + "{{ address }}", +{% endfor %} +{% endfor %} + ] + version = 3 + sec_level = "authPriv" + auth_protocol = "SHA" + priv_protocol = "DES" + sec_name = "{{ password.snmp_user }}" + auth_password = "{{ password.snmp_pass }}" + priv_password = "{{ password.snmp_pass }}" + + fieldexclude = ["ifDescr", "ifSpecific"] + + [[inputs.snmp.field]] + name = "hostname" + oid = "RFC1213-MIB::sysName.0" + is_tag = true + + # interface table + [[inputs.snmp.table]] + name = "iface" + oid = "IF-MIB::ifTable" + inherit_tags = ["hostname"] + + [[inputs.snmp.table.field]] + oid = "IF-MIB::ifName" + + # rename counters to make prometheus happy + 
[[inputs.snmp.table.field]] + name = "in_total" + oid = "IF-MIB::ifInOctets" + + [[inputs.snmp.table.field]] + name = "in_err_total" + oid = "IF-MIB::ifInErrors" + + [[inputs.snmp.table.field]] + name = "out_total" + oid = "IF-MIB::ifOutOctets" + + [[inputs.snmp.table.field]] + name = "out_err_total" + oid = "IF-MIB::ifOutErrors" + + # MAC address table per VLAN + [[inputs.snmp.table]] + name = "fdb" + index_as_tag = true + inherit_tags = ["hostname"] + + [[inputs.snmp.table.field]] + name = "ifIndex" + oid = "Q-BRIDGE-MIB::dot1qTpFdbPort" + is_tag = true + + [[inputs.snmp.table.field]] + name = "entry" + oid = "Q-BRIDGE-MIB::dot1qTpFdbStatus" + +# look up interface name from its index +# seems we need another SNMP connection for that +[[processors.snmp_lookup]] + namepass = ["fdb", "iface"] + agent_tag = "source" + index_tag = "ifIndex" + + version = 3 + sec_level = "authPriv" + auth_protocol = "SHA" + priv_protocol = "DES" + sec_name = "{{ password.snmp_user }}" + auth_password = "{{ password.snmp_pass }}" + priv_password = "{{ password.snmp_pass }}" + + [[processors.snmp_lookup.tag]] + oid = "IF-MIB::ifName" + name = "iface" + +# split index 42.1.2.3.10.11.12 into tags "vlan" and "mac1" to "mac6" +[[processors.regex]] + namepass = ["fdb"] + + [[processors.regex.tags]] + key = "index" + pattern = '^(?P<vlan>\d+)\.(?P<mac1>\d+)\.(?P<mac2>\d+)\.(?P<mac3>\d+)\.(?P<mac4>\d+)\.(?P<mac5>\d+)\.(?P<mac6>\d+)' + +# combine "mac*" tags into a single tag "mac" with value 01:02:03:0a:0b:0c +[[processors.template]] + namepass = ["fdb"] + tagexclude = ["ifIndex", "index"] + tag = "mac" +{% raw %} + template = '''{{ + printf "%02x:%02x:%02x:%02x:%02x:%02x" + (.Tag "mac1"|int) (.Tag "mac2"|int) (.Tag "mac3"|int) (.Tag "mac4"|int) (.Tag "mac5"|int) (.Tag "mac6"|int) + }}''' +{% endraw %} diff --git a/setup.yml b/setup.yml index 8bdb494..273ffcc 100644 --- a/setup.yml +++ b/setup.yml @@ -21,6 +21,7 @@ - hosts: mgmt-gw roles: - radvd # we are router for mgmt networks + - collector - hosts: proxmox-backup roles: