Simple hardware metrics collection + alerting (#115)

* Add CPU and RAM usage alerting

* Create basic troubleshooting document to point alerts at

* Limit max number of hardware values collected

* Save metric value with the point in time it was taken
This commit is contained in:
Gabe Kangas
2020-08-27 00:37:32 -07:00
committed by GitHub
parent e791a3c1dc
commit 4c3da2704f
7 changed files with 137 additions and 15 deletions

41
metrics/alerting.go Normal file
View File

@@ -0,0 +1,41 @@
package metrics
import (
log "github.com/sirupsen/logrus"
)
// Alerting configuration shared by the CPU and RAM checks.
const (
	// maxCPUAlertingThresholdPCT is the CPU utilization percentage that the
	// recent average must exceed before an alert is logged.
	maxCPUAlertingThresholdPCT = 95

	// maxRAMAlertingThresholdPCT is the memory utilization percentage that the
	// recent average must exceed before an alert is logged.
	maxRAMAlertingThresholdPCT = 95

	// alertingError is the format string of the alert message. Its verbs are,
	// in order: the resource name (%s), the measured utilization (%d) and the
	// threshold that was exceeded (%d).
	alertingError = "The %s utilization of %d%% is higher than the alerting threshold of %d%%. This can cause issues with video generation and delivery. Please visit the documentation at https://github.com/gabek/owncast/blob/master/doc/troubleshooting.md to help troubleshoot this issue."
)
// handleAlerting runs every hardware alert check in turn: CPU first, then RAM.
func handleAlerting() {
	checks := [...]func(){
		handleCPUAlerting,
		handleRAMAlerting,
	}
	for _, check := range checks {
		check()
	}
}
// handleCPUAlerting logs an error when the average of the two most recent CPU
// samples is above maxCPUAlertingThresholdPCT. It does nothing until at least
// two samples have been collected.
func handleCPUAlerting() {
	samples := Metrics.CPUUtilizations
	if len(samples) >= 2 {
		if usage := recentAverage(samples); usage > maxCPUAlertingThresholdPCT {
			log.Errorf(alertingError, "CPU", usage, maxCPUAlertingThresholdPCT)
		}
	}
}
// handleRAMAlerting logs an error when the average of the two most recent
// memory samples is above maxRAMAlertingThresholdPCT. It does nothing until at
// least two samples have been collected.
func handleRAMAlerting() {
	samples := Metrics.RAMUtilizations
	if len(samples) >= 2 {
		if usage := recentAverage(samples); usage > maxRAMAlertingThresholdPCT {
			log.Errorf(alertingError, "memory", usage, maxRAMAlertingThresholdPCT)
		}
	}
}
// recentAverage returns the integer mean (truncated by integer division) of
// the last two entries in values.
//
// Short inputs no longer panic: an empty slice yields 0 and a single-entry
// slice yields that entry's value.
func recentAverage(values []value) int {
	switch n := len(values); n {
	case 0:
		return 0
	case 1:
		return values[0].Value
	default:
		return (values[n-1].Value + values[n-2].Value) / 2
	}
}

35
metrics/hardware.go Normal file
View File

@@ -0,0 +1,35 @@
package metrics
import (
	"time"

	"github.com/shirou/gopsutil/cpu"
	"github.com/shirou/gopsutil/mem"
	log "github.com/sirupsen/logrus"
)
// maxCollectionValues is the cap on how many samples are meant to be kept per
// metric slice; the collectors use it to drop the oldest samples as new ones
// are appended.
const maxCollectionValues = 500
func collectCPUUtilization() {
if len(Metrics.CPUUtilizations) > maxCollectionValues {
Metrics.CPUUtilizations = Metrics.CPUUtilizations[1:]
}
v, err := cpu.Percent(0, false)
if err != nil {
panic(err)
}
metricValue := value{time.Now(), int(v[0])}
Metrics.CPUUtilizations = append(Metrics.CPUUtilizations, metricValue)
}
func collectRAMUtilization() {
if len(Metrics.RAMUtilizations) > maxCollectionValues {
Metrics.RAMUtilizations = Metrics.RAMUtilizations[1:]
}
memoryUsage, _ := mem.VirtualMemory()
metricValue := value{time.Now(), int(memoryUsage.UsedPercent)}
Metrics.RAMUtilizations = append(Metrics.RAMUtilizations, metricValue)
}

39
metrics/metrics.go Normal file
View File

@@ -0,0 +1,39 @@
package metrics
import (
"time"
)
// metricsPollingInterval is the delay between two consecutive polling runs.
const metricsPollingInterval time.Duration = 15 * time.Second
type (
	// value is a single metric sample together with the moment it was taken.
	// Field order matters: samples are built with positional literals.
	value struct {
		Time  time.Time // when the sample was recorded
		Value int       // the sampled reading
	}

	// metrics holds the collected hardware samples, one slice per resource.
	metrics struct {
		CPUUtilizations []value // CPU utilization samples
		RAMUtilizations []value // memory utilization samples
	}
)

// Metrics is the shared metrics instance. It is nil until Start assigns it.
var Metrics *metrics
// Start will begin the metrics collection and alerting
func Start() {
Metrics = new(metrics)
for range time.Tick(metricsPollingInterval) {
handlePolling()
}
}
// handlePolling performs one polling run: it records a CPU sample and a RAM
// sample, then evaluates the alerts against the updated samples.
func handlePolling() {
	// Gather the hardware samples first so alerting sees the newest values.
	collectors := []func(){collectCPUUtilization, collectRAMUtilization}
	for _, collect := range collectors {
		collect()
	}

	handleAlerting()
}