Compare commits

...

3 commits

Author SHA1 Message Date
da3db8cc02 Add collector role
Sets up prometheus to pull metrics, with telegraf to process SNMP data.
2025-10-17 22:12:02 +02:00
6600a6fa36 telegraf: drop agent configuration stuff
Servers will be switched to Prometheus. Telegraf remains as a SNMP proxy
(see following commit).
2025-10-17 22:03:33 +02:00
d347fd7215 Add prometheus role
Configured to include job definitions from /etc/prometheus/conf.d/*.yml.
2025-10-17 21:21:01 +02:00
19 changed files with 280 additions and 100 deletions

roles/collector/README.md (new file)

@@ -0,0 +1,18 @@
Set up metric collection with prometheus and telegraf as the SNMP proxy.
NetBox config context should contain the lists `prometheus_hosts` and `snmp_hosts` with job definitions. Each entry should define `name` and an `nb_filter` used to query hosts from NetBox. For example:
{
  "prometheus_hosts": [
    {
      "name": "classroom",
      "nb_filter": "role=desktop-computer status=active location=classroom"
    }
  ],
  "snmp_hosts": [
    {
      "name": "switch",
      "nb_filter": "role=switch name__isw=sw- status=active status=staged status=planned"
    }
  ]
}
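
For illustration only: given the `classroom` entry above, the `prometheus-job.yml.j2` template in this role would render a job file under /etc/prometheus/conf.d/ roughly like the following (the host names are hypothetical):

# hypothetical rendering of prometheus-job.yml.j2 for the "classroom" job; targets are made up
scrape_configs:
  - job_name: "classroom"
    relabel_configs:
      - source_labels: [__address__]
        regex: '([^.]+).*'
        target_label: name
        replacement: ${1}
    static_configs:
      - targets:
          - "pc-01.example.net:9100"
          - "pc-02.example.net:9100"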

@@ -0,0 +1,4 @@
scrape_configs:
  - job_name: "snmp"
    static_configs:
      - targets: ["localhost:9273"]

@@ -0,0 +1,12 @@
table inet filter {
    chain output {
        type filter hook output priority 0; policy accept;
        skuid prometheus ct state { established, related } accept
        skuid prometheus th dport domain accept
        skuid prometheus tcp dport { 443, 9100 } accept comment "prometheus"
        skuid prometheus ip daddr 127.0.0.1 tcp dport 9090 accept comment "prometheus self"
        skuid prometheus ip daddr 127.0.0.1 tcp dport 9273 accept comment "telegraf snmp exporter"
        skuid prometheus drop
    }
}

@@ -0,0 +1,9 @@
table inet filter {
    chain output {
        type filter hook output priority 0; policy accept;
        skuid telegraf ct state { established, related } accept
        skuid telegraf th dport snmp accept
        skuid telegraf drop
    }
}

@@ -0,0 +1,17 @@
- name: reload nftables
  service:
    name: nftables
    state: reloaded
  when: "'handler' not in ansible_skip_tags"

- name: reload prometheus
  service:
    name: prometheus
    state: reloaded
  when: "'handler' not in ansible_skip_tags"

- name: restart telegraf
  service:
    name: telegraf
    state: restarted # seems to crash on reloads
  when: "'handler' not in ansible_skip_tags"

@@ -0,0 +1,3 @@
dependencies:
- role: prometheus
- role: telegraf

@@ -0,0 +1,34 @@
# since this host likely has access to sensitive networks,
# restrict the destinations where monitoring daemons can connect
- name: Set up outbound firewall rules
  copy:
    dest: "/etc/nftables.d/{{ item }}.nft"
    src: "{{ item }}.nft"
  loop:
    - prometheus
    - telegraf
  notify: reload nftables

- name: Configure telegraf to expose SNMP data as prometheus metrics
  template:
    dest: "/etc/telegraf.conf.d/{{ item }}.conf"
    src: "{{ item }}.conf.j2"
  loop:
    - output
    - snmp
  notify: restart telegraf

- name: Configure prometheus to pull SNMP data
  copy:
    dest: "/etc/prometheus/conf.d/snmp.yml"
    src: "prometheus-snmp.yml"
  notify: reload prometheus

- name: Configure prometheus to pull custom data
  template:
    dest: "/etc/prometheus/conf.d/{{ item.name }}.yml"
    src: "prometheus-job.yml.j2"
  loop: "{{ prometheus_hosts }}"
  loop_control:
    label: "{{ item.name }}"
  notify: reload prometheus

@@ -0,0 +1,4 @@
[[outputs.prometheus_client]]
listen = "127.0.0.1:9273"
expiration_interval = "300s"
tagexclude = ["mac?"] # temporary tags we don’t need to export

@@ -0,0 +1,20 @@
{% set devices = query("netbox.netbox.nb_lookup", "devices", api_filter=item.nb_filter, raw_data=true)
   | selectattr("primary_ip")
   | map(attribute="name")
   | map("extract", hostvars) -%}
scrape_configs:
  - job_name: "{{ item.name }}"
    relabel_configs:
      - source_labels: [__address__]
        regex: '([^.]+).*'
        target_label: name
        replacement: ${1}
    static_configs:
      - targets:
{% for address in devices
   | selectattr("dns_name", "defined")
   | map(attribute="dns_name")
   | reject("none") | sort | unique %}
          - "{{ address }}:9100"
{% endfor %}

@@ -0,0 +1,106 @@
[[inputs.snmp]]
interval = "300s"
agent_host_tag = "source"
agents = [
{% for item in snmp_hosts %}
{% for address in query("netbox.netbox.nb_lookup", "devices", api_filter=item.nb_filter, raw_data=true)
| selectattr("primary_ip4") | map(attribute="primary_ip4.address")
| ipaddr("address") %}
"{{ address }}",
{% endfor %}
{% endfor %}
]
version = 3
sec_level = "authPriv"
auth_protocol = "SHA"
priv_protocol = "DES"
sec_name = "{{ password.snmp_user }}"
auth_password = "{{ password.snmp_pass }}"
priv_password = "{{ password.snmp_pass }}"
fieldexclude = ["ifDescr", "ifSpecific"]
[[inputs.snmp.field]]
name = "hostname"
oid = "RFC1213-MIB::sysName.0"
is_tag = true
# interface table
[[inputs.snmp.table]]
name = "iface"
oid = "IF-MIB::ifTable"
inherit_tags = ["hostname"]
[[inputs.snmp.table.field]]
oid = "IF-MIB::ifName"
# rename counters to make prometheus happy
[[inputs.snmp.table.field]]
name = "in_total"
oid = "IF-MIB::ifInOctets"
[[inputs.snmp.table.field]]
name = "in_err_total"
oid = "IF-MIB::ifInErrors"
[[inputs.snmp.table.field]]
name = "out_total"
oid = "IF-MIB::ifOutOctets"
[[inputs.snmp.table.field]]
name = "out_err_total"
oid = "IF-MIB::ifOutErrors"
# MAC address table per VLAN
[[inputs.snmp.table]]
name = "fdb"
index_as_tag = true
inherit_tags = ["hostname"]
[[inputs.snmp.table.field]]
name = "ifIndex"
oid = "Q-BRIDGE-MIB::dot1qTpFdbPort"
is_tag = true
[[inputs.snmp.table.field]]
name = "entry"
oid = "Q-BRIDGE-MIB::dot1qTpFdbStatus"
# look up interface name from its index
# seems we need another SNMP connection for that
[[processors.snmp_lookup]]
namepass = ["fdb", "iface"]
agent_tag = "source"
index_tag = "ifIndex"
version = 3
sec_level = "authPriv"
auth_protocol = "SHA"
priv_protocol = "DES"
sec_name = "{{ password.snmp_user }}"
auth_password = "{{ password.snmp_pass }}"
priv_password = "{{ password.snmp_pass }}"
[[processors.snmp_lookup.tag]]
oid = "IF-MIB::ifName"
name = "iface"
# split index 42.1.2.3.10.11.12 into tags "vlan" and "mac1" to "mac6"
[[processors.regex]]
namepass = ["fdb"]
[[processors.regex.tags]]
key = "index"
pattern = '^(?P<vlan>\d+)\.(?P<mac1>\d+)\.(?P<mac2>\d+)\.(?P<mac3>\d+)\.(?P<mac4>\d+)\.(?P<mac5>\d+)\.(?P<mac6>\d+)'
# combine "mac*" tags into a single tag "mac" with value 01:02:03:0a:0b:0c
[[processors.template]]
namepass = ["fdb"]
tagexclude = ["ifIndex", "index"]
tag = "mac"
{% raw %}
template = '''{{
printf "%02x:%02x:%02x:%02x:%02x:%02x"
(.Tag "mac1"|int) (.Tag "mac2"|int) (.Tag "mac3"|int) (.Tag "mac4"|int) (.Tag "mac5"|int) (.Tag "mac6"|int)
}}'''
{% endraw %}

@@ -0,0 +1,3 @@
Install and configure prometheus.
Job definitions should be placed in /etc/prometheus/conf.d by roles using this one.
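
As a hypothetical example, a role depending on this one could drop a file such as /etc/prometheus/conf.d/node.yml containing a job definition like the following (the job name and target below are made up):

# hypothetical job definition placed in /etc/prometheus/conf.d/ by another role
scrape_configs:
  - job_name: "node"
    static_configs:
      - targets: ["server-01.example.net:9100"]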

@@ -0,0 +1,2 @@
scrape_config_files:
- "conf.d/*.yml"

@@ -0,0 +1,5 @@
- name: reload prometheus
  service:
    name: prometheus
    state: reloaded
  when: "'handler' not in ansible_skip_tags"

@@ -0,0 +1,21 @@
- name: Install packages
  package:
    name:
      - prometheus

- name: Create directory for prometheus configs
  file:
    path: /etc/prometheus/conf.d
    state: directory

- name: Configure prometheus
  copy:
    dest: /etc/prometheus/
    src: prometheus.yml
  notify: reload prometheus

- name: Enable prometheus service
  service:
    name: prometheus
    enabled: true
    state: started

@@ -1,31 +0,0 @@
- name: Add influxdb repository
  deb822_repository:
    name: influxdata
    uris: https://repos.influxdata.com/debian
    suites: stable
    components: main
    architectures: amd64
    signed_by: https://repos.influxdata.com/influxdata-archive.key
  notify: update package cache

- meta: flush_handlers

- name: Install telegraf
  package:
    name: telegraf

- name: Configure telegraf
  when: not ansible_check_mode
  template:
    dest: /etc/telegraf/telegraf.d/output.conf
    src: output.conf.j2
    owner: telegraf
    group: telegraf
    mode: 0640
  notify: restart telegraf

- name: Enable telegraf
  service:
    name: telegraf
    enabled: true
    state: started

@@ -1,11 +1,22 @@
- name: Get influxdb info
  set_fact:
    influxdb_info: '{{ lookup("passwordstore", "vm/"~influxdb_host, returnall=true, missing="empty") | from_yaml }}'
- name: Create influxdb token for this host
  include_tasks: token.yml
  when: 'not ansible_check_mode and "influxdb_token" not in password'
- name: Install telegraf on Debian
  include_tasks: debian.yml
- name: Add telegraf package repo on Debian
  when: ansible_os_family == "Debian"
  deb822_repository:
    name: influxdata
    uris: https://repos.influxdata.com/debian
    suites: stable
    components: main
    architectures: amd64
    signed_by: https://repos.influxdata.com/influxdata-archive.key
  notify: update package cache
- meta: flush_handlers
- name: Install telegraf
  package:
    name: telegraf
- name: Enable telegraf service
  service:
    name: telegraf
    enabled: true
    state: started

@@ -1,53 +0,0 @@
- name: Get influxdb organization ID
  delegate_to: localhost
  uri:
    url: '{{ influxdb_info.influxdb_url }}/api/v2/orgs'
    headers:
      Authorization: Token {{ influxdb_info.influxdb_operator_token }}
  register: response

- name: Parse influxdb orgID
  set_fact:
    influxdb_orgID: '{{ response.json.orgs | selectattr("name", "==", influxdb_info.influxdb_org) | map(attribute="id") | first }}'

- name: Get influxdb bucket ID
  delegate_to: localhost
  uri:
    url: '{{ influxdb_info.influxdb_url }}/api/v2/buckets?orgID={{ influxdb_orgID }}'
    headers:
      Authorization: Token {{ influxdb_info.influxdb_operator_token }}
  register: response

- name: Parse influxdb bucketID
  set_fact:
    influxdb_bucketID: '{{ response.json.buckets | selectattr("name", "==", "servers") | map(attribute="id") | first }}'

- name: Create influxdb token
  delegate_to: localhost
  uri:
    url: '{{ influxdb_info.influxdb_url }}/api/v2/authorizations'
    method: POST
    body_format: json
    status_code: 201
    headers:
      Authorization: Token {{ influxdb_info.influxdb_operator_token }}
      Content-Type: application/json
    body: |
      {
        "description": "{{ inventory_hostname }}",
        "orgID": "{{ influxdb_orgID }}",
        "permissions": [{ "action": "write", "resource": { "type": "buckets", "id": "{{ influxdb_bucketID }}" } }]
      }
  register: response

- name: Parse influxdb token
  set_fact:
    influxdb_token: '{{ response.json.token }}'

# Ansible’s passwordstore lookup plugin should be able to do that but is pretty broken,
# so we do it manually.
- name: Store influxdb token in password store
  delegate_to: localhost
  command:
    cmd: 'pass insert --force --multiline {{ ("vm/" if is_virtual else "host/")~inventory_hostname }}'
    stdin: '{{ password | to_nice_yaml(sort_keys=false) }}influxdb_token: {{ influxdb_token }}'

@@ -1,5 +0,0 @@
[[outputs.influxdb_v2]]
urls = ["{{ influxdb_info.influxdb_url }}"]
organization = "{{ influxdb_info.influxdb_org }}"
bucket = "{{ influxdb_info.influxdb_bucket }}"
token = "{{ influxdb_token | default(password.influxdb_token) }}"

@@ -16,12 +16,12 @@
- hosts: ceph-*
  roles:
    - frr
    - telegraf
    - ceph
- hosts: mgmt-gw
  roles:
    - radvd # we are router for mgmt networks
    - collector
- hosts: proxmox-backup
  roles: