Add collector role

Sets up prometheus to pull metrics, with telegraf to process SNMP data.
This commit is contained in:
Timotej Lazar 2025-10-17 22:00:42 +02:00
parent 6600a6fa36
commit da3db8cc02
11 changed files with 228 additions and 0 deletions

18
roles/collector/README.md Normal file
View file

@ -0,0 +1,18 @@
Set up metric collection with prometheus and telegraf as the SNMP proxy.
NetBox config context should contain the lists `prometheus_hosts` and `snmp_hosts` with job definitions. Each entry should define `name` and the `nb_filter` used to query hosts from NetBox. For example:
{
"prometheus_hosts": [
{
"name": "classroom",
"nb_filter": "role=desktop-computer status=active location=classroom"
}
],
"snmp_hosts": [
{
"name": "switch",
"nb_filter": "role=switch name__isw=sw- status=active status=staged status=planned"
}
]
}

View file

@ -0,0 +1,4 @@
# Scrape the SNMP metrics that telegraf re-exports in prometheus format.
#
# The target is the literal IPv4 loopback address rather than "localhost":
# the exporter listens on 127.0.0.1:9273 and the outbound firewall rule for
# the prometheus user only allows port 9273 towards 127.0.0.1, so a
# "localhost" that resolves to ::1 might be tried first and rejected.
# Note that this also changes the `instance` label to "127.0.0.1:9273".
scrape_configs:
  - job_name: "snmp"
    static_configs:
      - targets: ["127.0.0.1:9273"]

View file

@ -0,0 +1,12 @@
# Outbound restrictions for sockets owned by the prometheus user.
# Rules are evaluated top to bottom; traffic from other users falls through
# to the chain's accept policy.
table inet filter {
chain output {
type filter hook output priority 0; policy accept;
# packets belonging to connections that are already tracked
skuid prometheus ct state { established, related } accept
# name lookups (destination port "domain", any transport protocol)
skuid prometheus th dport domain accept
# scrape targets: TCP port 443 and the :9100 targets of the generated jobs
skuid prometheus tcp dport { 443, 9100 } accept comment "prometheus"
# loopback-only services, IPv4 address 127.0.0.1 specifically
skuid prometheus ip daddr 127.0.0.1 tcp dport 9090 accept comment "prometheus self"
skuid prometheus ip daddr 127.0.0.1 tcp dport 9273 accept comment "telegraf snmp exporter"
# anything else sent by the prometheus user is discarded
skuid prometheus drop
}
}

View file

@ -0,0 +1,9 @@
# Outbound restrictions for sockets owned by the telegraf user.
# Rules are evaluated top to bottom; traffic from other users falls through
# to the chain's accept policy.
table inet filter {
chain output {
type filter hook output priority 0; policy accept;
# packets belonging to connections that are already tracked
skuid telegraf ct state { established, related } accept
# SNMP queries (destination port "snmp", any transport protocol)
skuid telegraf th dport snmp accept
# anything else sent by the telegraf user is discarded; note that no rule
# here permits name lookups for this user
skuid telegraf drop
}
}

View file

@ -0,0 +1,17 @@
# Service handlers for the collector role.
# Every handler is skipped when the play runs with `--skip-tags handler`.

- name: reload nftables
  when: "'handler' not in ansible_skip_tags"
  service:
    state: reloaded
    name: nftables

- name: reload prometheus
  when: "'handler' not in ansible_skip_tags"
  service:
    state: reloaded
    name: prometheus

- name: restart telegraf
  when: "'handler' not in ansible_skip_tags"
  service:
    # a full restart instead of a reload: telegraf seems to crash on reloads
    state: restarted
    name: telegraf

View file

@ -0,0 +1,3 @@
# Roles applied before this one: the daemons configured by the collector role.
dependencies:
  - prometheus
  - telegraf

View file

@ -0,0 +1,34 @@
# This host likely has access to sensitive networks, so limit the
# destinations the monitoring daemons are allowed to connect to.
# One rule file per daemon is installed into /etc/nftables.d.
- name: Set up outbound firewall rules
  copy:
    src: "{{ item }}.nft"
    dest: "/etc/nftables.d/{{ item }}.nft"
  loop: [prometheus, telegraf]
  notify: reload nftables
# Render the telegraf output and SNMP input configuration.
# The snmp template embeds the SNMPv3 user name and passwords, so the
# rendered files must not be readable by other local users; without an
# explicit mode they would be created according to the umask.
# NOTE(review): assumes a `telegraf` group exists alongside the `telegraf`
# user that the firewall rules match on -- confirm against the telegraf role.
- name: Configure telegraf to expose SNMP data as prometheus metrics
  template:
    dest: "/etc/telegraf.conf.d/{{ item }}.conf"
    src: "{{ item }}.conf.j2"
    owner: root
    group: telegraf
    mode: "0640"
  loop:
    - output
    - snmp
  notify: restart telegraf
# Install the static scrape job that reads from the local telegraf exporter.
- name: Configure prometheus to pull SNMP data
  copy:
    src: "prometheus-snmp.yml"
    dest: "/etc/prometheus/conf.d/snmp.yml"
  notify: reload prometheus
# Render one scrape-job file per entry of prometheus_hosts; the file is
# named after the entry, and only that name is shown in the loop output.
- name: Configure prometheus to pull custom data
  loop: "{{ prometheus_hosts }}"
  loop_control:
    label: "{{ item.name }}"
  template:
    src: "prometheus-job.yml.j2"
    dest: "/etc/prometheus/conf.d/{{ item.name }}.yml"
  notify: reload prometheus

View file

@ -0,0 +1,4 @@
# Serve the collected metrics in prometheus format on the IPv4 loopback
# address only; this is the port the "snmp" scrape job reads from.
[[outputs.prometheus_client]]
listen = "127.0.0.1:9273"
# same length as the polling interval configured for inputs.snmp
expiration_interval = "300s"
# "mac?" matches the intermediate mac1..mac6 tags created while parsing the
# forwarding table index
tagexclude = ["mac?"] # temporary tags we don’t need to export

View file

@ -0,0 +1,20 @@
{#
  Scrape job for a single prometheus_hosts entry, available as `item`.

  Devices matching item.nb_filter are looked up in NetBox, those without a
  primary IP are skipped, and each remaining device name is mapped to its
  entry in hostvars. Every distinct dns_name found there becomes a :9100
  target.

  The filter is passed as a plain expression. Template delimiters inside a
  quoted string are only a string literal within a Jinja expression, and the
  SNMP template of this role already passes the value the same direct way.
-#}
{% set devices = query("netbox.netbox.nb_lookup", "devices", api_filter=item.nb_filter, raw_data=true)
  | selectattr("primary_ip")
  | map(attribute="name")
  | map("extract", hostvars) -%}
scrape_configs:
  - job_name: "{{ item.name }}"
    relabel_configs:
      # copy the part of the target address before the first dot into `name`
      - source_labels: [__address__]
        regex: '([^.]+).*'
        target_label: name
        replacement: ${1}
    static_configs:
      - targets:
{% for address in devices
  | selectattr("dns_name", "defined")
  | map(attribute="dns_name")
  | reject("none") | sort | unique %}
          - "{{ address }}:9100"
{% endfor %}

View file

@ -0,0 +1,106 @@
# Poll the devices selected by snmp_hosts over SNMPv3 every 300 seconds.
# The address of the polled agent is stored in the "source" tag, which the
# snmp_lookup processor later uses to find the agent again.
[[inputs.snmp]]
interval = "300s"
agent_host_tag = "source"
# One entry per NetBox device with a primary IPv4 address, for every filter
# listed in snmp_hosts. The ipaddr("address") filter presumably strips the
# prefix length from the NetBox value -- confirm.
# NOTE(review): a device matched by more than one filter is listed twice.
agents = [
{% for item in snmp_hosts %}
{% for address in query("netbox.netbox.nb_lookup", "devices", api_filter=item.nb_filter, raw_data=true)
| selectattr("primary_ip4") | map(attribute="primary_ip4.address")
| ipaddr("address") %}
"{{ address }}",
{% endfor %}
{% endfor %}
]
# SNMPv3 with authentication and privacy; one password serves for both.
# NOTE(review): DES is a weak cipher, presumably required by the polled
# devices -- confirm whether they support AES.
version = 3
sec_level = "authPriv"
auth_protocol = "SHA"
priv_protocol = "DES"
sec_name = "{{ password.snmp_user }}"
auth_password = "{{ password.snmp_pass }}"
priv_password = "{{ password.snmp_pass }}"
fieldexclude = ["ifDescr", "ifSpecific"]
# system name, stored as a tag so the tables below can inherit it
[[inputs.snmp.field]]
name = "hostname"
oid = "RFC1213-MIB::sysName.0"
is_tag = true
# interface table
[[inputs.snmp.table]]
name = "iface"
oid = "IF-MIB::ifTable"
inherit_tags = ["hostname"]
[[inputs.snmp.table.field]]
oid = "IF-MIB::ifName"
# rename counters to make prometheus happy
[[inputs.snmp.table.field]]
name = "in_total"
oid = "IF-MIB::ifInOctets"
[[inputs.snmp.table.field]]
name = "in_err_total"
oid = "IF-MIB::ifInErrors"
[[inputs.snmp.table.field]]
name = "out_total"
oid = "IF-MIB::ifOutOctets"
[[inputs.snmp.table.field]]
name = "out_err_total"
oid = "IF-MIB::ifOutErrors"
# MAC address table per VLAN
# The row index is kept as the "index" tag; the processors split it into
# the VLAN and MAC address.
[[inputs.snmp.table]]
name = "fdb"
index_as_tag = true
inherit_tags = ["hostname"]
# NOTE(review): the port number from dot1qTpFdbPort is tagged as ifIndex;
# this presumes bridge port numbers equal interface indexes on the polled
# devices -- confirm.
[[inputs.snmp.table.field]]
name = "ifIndex"
oid = "Q-BRIDGE-MIB::dot1qTpFdbPort"
is_tag = true
[[inputs.snmp.table.field]]
name = "entry"
oid = "Q-BRIDGE-MIB::dot1qTpFdbStatus"
# The three processors below depend on their execution order: the template
# processor drops the ifIndex and index tags that snmp_lookup and regex read,
# and it consumes the mac1..mac6 tags that regex creates. `order` pins that
# sequence explicitly instead of relying on how telegraf sequences
# processors that carry no order.

# look up interface name from its index
# seems we need another SNMP connection for that
[[processors.snmp_lookup]]
  order = 1
  namepass = ["fdb", "iface"]
  # the agent address stored by inputs.snmp, and the tag holding the index
  agent_tag = "source"
  index_tag = "ifIndex"
  # same SNMPv3 credentials as the input plugin
  version = 3
  sec_level = "authPriv"
  auth_protocol = "SHA"
  priv_protocol = "DES"
  sec_name = "{{ password.snmp_user }}"
  auth_password = "{{ password.snmp_pass }}"
  priv_password = "{{ password.snmp_pass }}"
  [[processors.snmp_lookup.tag]]
    oid = "IF-MIB::ifName"
    name = "iface"

# split index 42.1.2.3.10.11.12 into tags "vlan" and "mac1" to "mac6"
[[processors.regex]]
  order = 2
  namepass = ["fdb"]
  [[processors.regex.tags]]
    key = "index"
    pattern = '^(?P<vlan>\d+)\.(?P<mac1>\d+)\.(?P<mac2>\d+)\.(?P<mac3>\d+)\.(?P<mac4>\d+)\.(?P<mac5>\d+)\.(?P<mac6>\d+)'

# combine "mac*" tags into a single tag "mac" with value 01:02:03:0a:0b:0c
[[processors.template]]
  order = 3
  namepass = ["fdb"]
  tagexclude = ["ifIndex", "index"]
  tag = "mac"
  # the Go template is wrapped in a raw block so Jinja leaves it untouched
{% raw %}
  template = '''{{
    printf "%02x:%02x:%02x:%02x:%02x:%02x"
    (.Tag "mac1"|int) (.Tag "mac2"|int) (.Tag "mac3"|int) (.Tag "mac4"|int) (.Tag "mac5"|int) (.Tag "mac6"|int)
  }}'''
{% endraw %}

View file

@ -21,6 +21,7 @@
- hosts: mgmt-gw - hosts: mgmt-gw
roles: roles:
- radvd # we are router for mgmt networks - radvd # we are router for mgmt networks
- collector
- hosts: proxmox-backup - hosts: proxmox-backup
roles: roles: