Skip to content

Commit 3d55571

Browse files
authored
20190117 prom rules endpoints (#1999)
* add api to support prom rules and alerts to ruler Signed-off-by: Jacob Lisi <[email protected]> * clean up deps Signed-off-by: Jacob Lisi <[email protected]> * update changelog Signed-off-by: Jacob Lisi <[email protected]> * update documentation Signed-off-by: Jacob Lisi <[email protected]> * register ruler routes in modules.go Signed-off-by: Jacob Lisi <[email protected]> * refactor to have proper struct tags and cleaner logic for rule retrieval Signed-off-by: Jacob Lisi <[email protected]> * add Rules test Signed-off-by: Jacob Lisi <[email protected]> * update docs Signed-off-by: Jacob Lisi <[email protected]> * explicitly pass registerer to the ruler Signed-off-by: Jacob Lisi <[email protected]> * create respondError function Signed-off-by: Jacob Lisi <[email protected]> * remove unused return and err vars Signed-off-by: Jacob Lisi <[email protected]> * reorder registry wrapping Signed-off-by: Jacob Lisi <[email protected]> * refactor based on pr suggestions Signed-off-by: Jacob Lisi <[email protected]> * improve ruler tests for loading rules Signed-off-by: Jacob Lisi <[email protected]> * add logger to mapper tests Signed-off-by: Jacob Lisi <[email protected]> * add tests for ruler api calls Signed-off-by: Jacob Lisi <[email protected]> * ensure all test rulers load rules before returning Signed-off-by: Jacob Lisi <[email protected]> * format api_test file Signed-off-by: Jacob Lisi <[email protected]> * go format Signed-off-by: Jacob Lisi <[email protected]> * clean mapper_test file Signed-off-by: Jacob Lisi <[email protected]> * add api documentation Signed-off-by: Jacob Lisi <[email protected]> * fix alert array instantiation Signed-off-by: Jacob Lisi <[email protected]> * refactor according to changes and comments on PR Signed-off-by: Jacob Lisi <[email protected]> * make all time related fields nonnullable for rules protos Signed-off-by: Jacob Lisi <[email protected]> * ensure ruler is registered as grpc service Signed-off-by: Jacob Lisi <[email 
protected]> * use noop querier for test cases Signed-off-by: Jacob Lisi <[email protected]> * format alert value string identical to Prometheus Signed-off-by: Jacob Lisi <[email protected]> * refactor per PR comments Signed-off-by: Jacob Lisi <[email protected]> * fix rebase changelog Signed-off-by: Jacob Lisi <[email protected]>
1 parent fc451e1 commit 3d55571

18 files changed

+2429
-255
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
* `--querier.query-store-after` has been added in its place.
1010
* [FEATURE] Added user sub rings to distribute users to a subset of ingesters. #1947
1111
* `--experimental.distributor.user-subring-size`
12+
* [FEATURE] Added flag `-experimental.ruler.enable-api` to enable the ruler api which implements the Prometheus API `/api/v1/rules` and `/api/v1/alerts` endpoints under the configured `-http.prefix`. #1999
1213
* [ENHANCEMENT] Experimental TSDB: Export TSDB Syncer metrics from Compactor component, they are prefixed with `cortex_compactor_`. #2023
1314
* [ENHANCEMENT] Experimental TSDB: Added dedicated flag `-experimental.tsdb.bucket-store.tenant-sync-concurrency` to configure the maximum number of concurrent tenants for which blocks are synched. #2026
1415
* [ENHANCEMENT] Experimental TSDB: Expose metrics for objstore operations (prefixed with `cortex_<component>_thanos_objstore_`, component being one of `ingester`, `querier` and `compactor`). #2027

docs/apis.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,13 @@ APIs. The encoding is Protobuf over http.
1717

1818
Read is on `/api/prom/read` and write is on `/api/prom/push`.
1919

20+
## Alerts & Rules API
21+
22+
Cortex supports the Prometheus [alerts](https://prometheus.io/docs/prometheus/latest/querying/api/#alerts) and [rules](https://prometheus.io/docs/prometheus/latest/querying/api/#rules) API endpoints. They are served by the Ruler service and can be enabled with the `-experimental.ruler.enable-api` flag.
23+
24+
`GET /api/prom/api/v1/rules` - List of alerting and recording rules that are currently loaded
25+
26+
`GET /api/prom/api/v1/alerts` - List of all active alerts
2027

2128
## Configs API
2229

docs/configuration/config-file-reference.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -777,6 +777,10 @@ ring:
777777
# Period with which to attempt to flush rule groups.
778778
# CLI flag: -ruler.flush-period
779779
[flushcheckperiod: <duration> | default = 1m0s]
780+
781+
# Enable the ruler api
782+
# CLI flag: -experimental.ruler.enable-api
783+
[enable_api: <boolean> | default = false]
780784
```
781785

782786
## `alertmanager_config`

pkg/cortex/modules.go

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -415,9 +415,15 @@ func (t *Cortex) initRuler(cfg *Config) (err error) {
415415
cfg.Ruler.Ring.ListenPort = cfg.Server.GRPCListenPort
416416
queryable, engine := querier.New(cfg.Querier, t.distributor, t.store)
417417

418-
t.ruler, err = ruler.NewRuler(cfg.Ruler, engine, queryable, t.distributor)
418+
t.ruler, err = ruler.NewRuler(cfg.Ruler, engine, queryable, t.distributor, prometheus.DefaultRegisterer, util.Logger)
419419
if err != nil {
420-
return
420+
return err
421+
}
422+
423+
if cfg.Ruler.EnableAPI {
424+
subrouter := t.server.HTTP.PathPrefix(cfg.HTTPPrefix).Subrouter()
425+
t.ruler.RegisterRoutes(subrouter)
426+
ruler.RegisterRulerServer(t.server.GRPC, t.ruler)
421427
}
422428

423429
t.server.HTTP.Handle("/ruler_ring", t.ruler)

pkg/ruler/api.go

Lines changed: 252 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,252 @@
1+
package ruler
2+
3+
import (
4+
"encoding/json"
5+
"net/http"
6+
"strconv"
7+
"time"
8+
9+
"github.com/go-kit/kit/log"
10+
"github.com/go-kit/kit/log/level"
11+
"github.com/gorilla/mux"
12+
v1 "github.com/prometheus/client_golang/api/prometheus/v1"
13+
"github.com/prometheus/prometheus/pkg/labels"
14+
"github.com/weaveworks/common/user"
15+
16+
"github.com/cortexproject/cortex/pkg/ingester/client"
17+
"github.com/cortexproject/cortex/pkg/util"
18+
)
19+
20+
// RegisterRoutes registers the ruler API HTTP routes with the provided Router.
21+
func (r *Ruler) RegisterRoutes(router *mux.Router) {
22+
for _, route := range []struct {
23+
name, method, path string
24+
handler http.HandlerFunc
25+
}{
26+
{"get_rules", "GET", "/api/v1/rules", r.rules},
27+
{"get_alerts", "GET", "/api/v1/alerts", r.alerts},
28+
} {
29+
level.Debug(util.Logger).Log("msg", "ruler: registering route", "name", route.name, "method", route.method, "path", route.path)
30+
router.Handle(route.path, route.handler).Methods(route.method).Name(route.name)
31+
}
32+
}
33+
34+
// In order to reimplement the prometheus rules API, a large amount of code was copied over.
35+
// This is required because the prometheus api implementation does not pass a context to
36+
// the rule retrieval function.
37+
// https://github.com/prometheus/prometheus/blob/2aacd807b3ec6ddd90ae55f3a42f4cffed561ea9/web/api/v1/api.go#L108
38+
// https://github.com/prometheus/prometheus/pull/4999
39+
40+
// response is the JSON envelope written by the handlers in this file.
// Successful replies set Status to "success" and carry the payload in Data;
// error replies (see respondError) set Status to "error" together with
// ErrorType and Error. Every field except Status is omitted from the JSON
// output when it holds its zero value.
type response struct {
41+
Status string `json:"status"`
42+
Data interface{} `json:"data,omitempty"`
43+
ErrorType v1.ErrorType `json:"errorType,omitempty"`
44+
Error string `json:"error,omitempty"`
45+
}
46+
47+
// AlertDiscovery is the data payload of the alerts endpoint: the alerts of
// every alerting rule returned for the requesting org, flattened into a
// single list under the "alerts" key.
48+
type AlertDiscovery struct {
49+
Alerts []*Alert `json:"alerts"`
50+
}
51+
52+
// Alert is the JSON representation of a single alert instance.
// The handlers in this file fill Value by formatting the alert's float value
// with strconv.FormatFloat(v, 'e', -1, 64), so it is serialized as a string
// in exponent notation. ActiveAt is a pointer so that it can be left out of
// the output when nil.
52+
type Alert struct {
54+
Labels labels.Labels `json:"labels"`
55+
Annotations labels.Labels `json:"annotations"`
56+
State string `json:"state"`
57+
ActiveAt *time.Time `json:"activeAt,omitempty"`
58+
Value string `json:"value"`
59+
}
60+
61+
// RuleDiscovery is the data payload of the rules endpoint: all rule groups
// returned for the requesting org, serialized under the "groups" key.
62+
type RuleDiscovery struct {
63+
RuleGroups []*RuleGroup `json:"groups"`
64+
}
65+
66+
// RuleGroup is the JSON representation of one rule group and its rules.
// The rules handler fills File from the group's namespace and Interval from
// the group's evaluation interval expressed in seconds.
67+
type RuleGroup struct {
68+
Name string `json:"name"`
69+
File string `json:"file"`
70+
// In order to preserve rule ordering, while exposing type (alerting or recording)
71+
// specific properties, both alerting and recording rules are exposed in the
72+
// same array.
73+
Rules []rule `json:"rules"`
74+
Interval float64 `json:"interval"`
75+
}
76+
77+
// rule is the element type of RuleGroup.Rules. It is an empty interface so
// that alertingRule and recordingRule values can be stored side by side in
// the same slice.
type rule interface{}
78+
79+
// alertingRule is the JSON representation of an alerting rule within a
// RuleGroup. The rules handler fills Duration from the rule's "for" period
// expressed in seconds, Alerts with the rule's alert instances, and Type with
// v1.RuleTypeAlerting. LastError is omitted from the output when empty.
type alertingRule struct {
80+
// State can be "pending", "firing", "inactive".
81+
State string `json:"state"`
82+
Name string `json:"name"`
83+
Query string `json:"query"`
84+
Duration float64 `json:"duration"`
85+
Labels labels.Labels `json:"labels"`
86+
Annotations labels.Labels `json:"annotations"`
87+
Alerts []*Alert `json:"alerts"`
88+
Health string `json:"health"`
89+
LastError string `json:"lastError,omitempty"`
90+
Type v1.RuleType `json:"type"`
91+
}
92+
93+
// recordingRule is the JSON representation of a recording rule within a
// RuleGroup. The rules handler sets Type to v1.RuleTypeRecording. Labels and
// LastError are omitted from the output when empty.
type recordingRule struct {
94+
Name string `json:"name"`
95+
Query string `json:"query"`
96+
Labels labels.Labels `json:"labels,omitempty"`
97+
Health string `json:"health"`
98+
LastError string `json:"lastError,omitempty"`
99+
Type v1.RuleType `json:"type"`
100+
}
101+
102+
func respondError(logger log.Logger, w http.ResponseWriter, msg string) {
103+
b, err := json.Marshal(&response{
104+
Status: "error",
105+
ErrorType: v1.ErrServer,
106+
Error: msg,
107+
Data: nil,
108+
})
109+
110+
if err != nil {
111+
level.Error(logger).Log("msg", "error marshaling json response", "err", err)
112+
http.Error(w, err.Error(), http.StatusInternalServerError)
113+
return
114+
}
115+
116+
w.WriteHeader(http.StatusInternalServerError)
117+
if n, err := w.Write(b); err != nil {
118+
level.Error(logger).Log("msg", "error writing response", "bytesWritten", n, "err", err)
119+
}
120+
}
121+
122+
func (r *Ruler) rules(w http.ResponseWriter, req *http.Request) {
123+
logger := util.WithContext(req.Context(), util.Logger)
124+
userID, ctx, err := user.ExtractOrgIDFromHTTPRequest(req)
125+
if err != nil {
126+
level.Error(logger).Log("msg", "error extracting org id from context", "err", err)
127+
respondError(logger, w, "no valid org id found")
128+
return
129+
}
130+
131+
w.Header().Set("Content-Type", "application/json")
132+
rgs, err := r.GetRules(ctx, userID)
133+
134+
if err != nil {
135+
respondError(logger, w, err.Error())
136+
return
137+
}
138+
139+
groups := make([]*RuleGroup, 0, len(rgs))
140+
141+
for _, g := range rgs {
142+
grp := RuleGroup{
143+
Name: g.Name,
144+
File: g.Namespace,
145+
Interval: g.Interval.Seconds(),
146+
Rules: make([]rule, len(g.Rules)),
147+
}
148+
149+
for i, rl := range g.Rules {
150+
if g.Rules[i].Alert != "" {
151+
alerts := make([]*Alert, 0, len(rl.Alerts))
152+
for _, a := range rl.Alerts {
153+
alerts = append(alerts, &Alert{
154+
Labels: client.FromLabelAdaptersToLabels(a.Labels),
155+
Annotations: client.FromLabelAdaptersToLabels(a.Annotations),
156+
State: a.GetState(),
157+
ActiveAt: &a.ActiveAt,
158+
Value: strconv.FormatFloat(a.Value, 'e', -1, 64),
159+
})
160+
}
161+
grp.Rules[i] = alertingRule{
162+
State: rl.GetState(),
163+
Name: rl.GetAlert(),
164+
Query: rl.GetExpr(),
165+
Duration: rl.For.Seconds(),
166+
Labels: client.FromLabelAdaptersToLabels(rl.Labels),
167+
Annotations: client.FromLabelAdaptersToLabels(rl.Annotations),
168+
Alerts: alerts,
169+
Health: rl.GetHealth(),
170+
LastError: rl.GetLastError(),
171+
Type: v1.RuleTypeAlerting,
172+
}
173+
} else {
174+
grp.Rules[i] = recordingRule{
175+
Name: rl.GetRecord(),
176+
Query: rl.GetExpr(),
177+
Labels: client.FromLabelAdaptersToLabels(rl.Labels),
178+
Health: rl.GetHealth(),
179+
LastError: rl.GetLastError(),
180+
Type: v1.RuleTypeRecording,
181+
}
182+
}
183+
}
184+
groups = append(groups, &grp)
185+
}
186+
187+
b, err := json.Marshal(&response{
188+
Status: "success",
189+
Data: &RuleDiscovery{RuleGroups: groups},
190+
})
191+
if err != nil {
192+
level.Error(logger).Log("msg", "error marshaling json response", "err", err)
193+
respondError(logger, w, "unable to marshal the requested data")
194+
return
195+
}
196+
w.Header().Set("Content-Type", "application/json")
197+
w.WriteHeader(http.StatusOK)
198+
if n, err := w.Write(b); err != nil {
199+
level.Error(logger).Log("msg", "error writing response", "bytesWritten", n, "err", err)
200+
}
201+
}
202+
203+
func (r *Ruler) alerts(w http.ResponseWriter, req *http.Request) {
204+
logger := util.WithContext(req.Context(), util.Logger)
205+
userID, ctx, err := user.ExtractOrgIDFromHTTPRequest(req)
206+
if err != nil {
207+
level.Error(logger).Log("msg", "error extracting org id from context", "err", err)
208+
respondError(logger, w, "no valid org id found")
209+
return
210+
}
211+
212+
w.Header().Set("Content-Type", "application/json")
213+
rgs, err := r.GetRules(ctx, userID)
214+
215+
if err != nil {
216+
respondError(logger, w, err.Error())
217+
return
218+
}
219+
220+
alerts := []*Alert{}
221+
222+
for _, g := range rgs {
223+
for _, rl := range g.Rules {
224+
if rl.Alert != "" {
225+
for _, a := range rl.Alerts {
226+
alerts = append(alerts, &Alert{
227+
Labels: client.FromLabelAdaptersToLabels(a.Labels),
228+
Annotations: client.FromLabelAdaptersToLabels(a.Annotations),
229+
State: a.GetState(),
230+
ActiveAt: &a.ActiveAt,
231+
Value: strconv.FormatFloat(a.Value, 'e', -1, 64),
232+
})
233+
}
234+
}
235+
}
236+
}
237+
238+
b, err := json.Marshal(&response{
239+
Status: "success",
240+
Data: &AlertDiscovery{Alerts: alerts},
241+
})
242+
if err != nil {
243+
level.Error(logger).Log("msg", "error marshaling json response", "err", err)
244+
respondError(logger, w, "unable to marshal the requested data")
245+
return
246+
}
247+
w.Header().Set("Content-Type", "application/json")
248+
w.WriteHeader(http.StatusOK)
249+
if n, err := w.Write(b); err != nil {
250+
level.Error(logger).Log("msg", "error writing response", "bytesWritten", n, "err", err)
251+
}
252+
}

0 commit comments

Comments
 (0)