Simple hardware metrics collection + alerting (#115)
* Add CPU and RAM usage alerting
* Create a basic troubleshooting document to point alerts at
* Limit the maximum number of hardware values collected
* Save each metric value with the point in time it was taken
This commit is contained in:
41
metrics/alerting.go
Normal file
41
metrics/alerting.go
Normal file
@@ -0,0 +1,41 @@
|
||||
package metrics
|
||||
|
||||
import (
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
const (
	// maxCPUAlertingThresholdPCT is the recent average CPU utilization, in
	// percent, above which an alert is logged.
	maxCPUAlertingThresholdPCT = 95

	// maxRAMAlertingThresholdPCT is the recent average memory utilization, in
	// percent, above which an alert is logged.
	maxRAMAlertingThresholdPCT = 95

	// alertingError is the log format used when a threshold is exceeded. Its
	// verbs are, in order: the resource name (%s), the measured utilization
	// (%d) and the threshold that was crossed (%d).
	alertingError = "The %s utilization of %d%% is higher than the alerting threshold of %d%%. This can cause issues with video generation and delivery. Please visit the documentation at https://github.com/gabek/owncast/blob/master/doc/troubleshooting.md to help troubleshoot this issue."
)
|
||||
|
||||
func handleAlerting() {
|
||||
handleCPUAlerting()
|
||||
handleRAMAlerting()
|
||||
}
|
||||
|
||||
func handleCPUAlerting() {
|
||||
if len(Metrics.CPUUtilizations) < 2 {
|
||||
return
|
||||
}
|
||||
|
||||
avg := recentAverage(Metrics.CPUUtilizations)
|
||||
if avg > maxCPUAlertingThresholdPCT {
|
||||
log.Errorf(alertingError, "CPU", avg, maxCPUAlertingThresholdPCT)
|
||||
}
|
||||
}
|
||||
|
||||
func handleRAMAlerting() {
|
||||
if len(Metrics.RAMUtilizations) < 2 {
|
||||
return
|
||||
}
|
||||
|
||||
avg := recentAverage(Metrics.RAMUtilizations)
|
||||
if avg > maxRAMAlertingThresholdPCT {
|
||||
log.Errorf(alertingError, "memory", avg, maxRAMAlertingThresholdPCT)
|
||||
}
|
||||
}
|
||||
|
||||
func recentAverage(values []value) int {
|
||||
return int((values[len(values)-1].Value + values[len(values)-2].Value) / 2)
|
||||
}
|
||||
35
metrics/hardware.go
Normal file
35
metrics/hardware.go
Normal file
@@ -0,0 +1,35 @@
|
||||
package metrics
|
||||
|
||||
import (
	"time"

	"github.com/shirou/gopsutil/cpu"
	"github.com/shirou/gopsutil/mem"
	log "github.com/sirupsen/logrus"
)
|
||||
|
||||
// maxCollectionValues bounds how many samples are kept per metric series;
// once the limit is hit the collectors discard the oldest sample.
const maxCollectionValues = 500
|
||||
|
||||
func collectCPUUtilization() {
|
||||
if len(Metrics.CPUUtilizations) > maxCollectionValues {
|
||||
Metrics.CPUUtilizations = Metrics.CPUUtilizations[1:]
|
||||
}
|
||||
|
||||
v, err := cpu.Percent(0, false)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
metricValue := value{time.Now(), int(v[0])}
|
||||
Metrics.CPUUtilizations = append(Metrics.CPUUtilizations, metricValue)
|
||||
}
|
||||
|
||||
func collectRAMUtilization() {
|
||||
if len(Metrics.RAMUtilizations) > maxCollectionValues {
|
||||
Metrics.RAMUtilizations = Metrics.RAMUtilizations[1:]
|
||||
}
|
||||
|
||||
memoryUsage, _ := mem.VirtualMemory()
|
||||
metricValue := value{time.Now(), int(memoryUsage.UsedPercent)}
|
||||
Metrics.RAMUtilizations = append(Metrics.RAMUtilizations, metricValue)
|
||||
}
|
||||
39
metrics/metrics.go
Normal file
39
metrics/metrics.go
Normal file
@@ -0,0 +1,39 @@
|
||||
package metrics
|
||||
|
||||
import (
|
||||
"time"
|
||||
)
|
||||
|
||||
// metricsPollingInterval is the delay between successive polling passes
// started by Start.
const metricsPollingInterval = 15 * time.Second
|
||||
|
||||
// value is a single metric sample paired with the moment it was recorded.
// Field order matters: the collectors build it with positional literals.
type value struct {
	// Time is when the sample was taken.
	Time time.Time
	// Value is the sampled reading; the collectors store utilization
	// percentages truncated to an int.
	Value int
}
|
||||
|
||||
// metrics holds the collected hardware samples, one series per resource.
// New samples are appended, so each series runs oldest to newest.
type metrics struct {
	// CPUUtilizations is the series of CPU utilization samples.
	CPUUtilizations []value
	// RAMUtilizations is the series of memory utilization samples.
	RAMUtilizations []value
}
|
||||
|
||||
// Metrics is the shared, package-wide metrics store. It is nil until Start
// allocates it.
var Metrics *metrics
|
||||
|
||||
// Start will begin the metrics collection and alerting
|
||||
func Start() {
|
||||
Metrics = new(metrics)
|
||||
|
||||
for range time.Tick(metricsPollingInterval) {
|
||||
handlePolling()
|
||||
}
|
||||
}
|
||||
|
||||
func handlePolling() {
|
||||
// Collect hardware stats
|
||||
collectCPUUtilization()
|
||||
collectRAMUtilization()
|
||||
|
||||
// Alerting
|
||||
handleAlerting()
|
||||
}
|
||||
Reference in New Issue
Block a user