@@ -23,10 +23,13 @@ import (
23
23
"github.com/aws/aws-sdk-go/aws"
24
24
"github.com/aws/aws-sdk-go/service/cloudwatchlogs"
25
25
"github.com/gorilla/websocket"
26
+ "gopkg.in/karalabe/cookiejar.v2/collections/deque"
26
27
27
28
awslib "github.com/cortexlabs/cortex/pkg/lib/aws"
29
+ "github.com/cortexlabs/cortex/pkg/lib/errors"
28
30
"github.com/cortexlabs/cortex/pkg/lib/sets/strset"
29
31
s "github.com/cortexlabs/cortex/pkg/lib/strings"
32
+ libtime "github.com/cortexlabs/cortex/pkg/lib/time"
30
33
"github.com/cortexlabs/cortex/pkg/operator/api/context"
31
34
"github.com/cortexlabs/cortex/pkg/operator/api/resource"
32
35
"github.com/cortexlabs/cortex/pkg/operator/config"
@@ -37,10 +40,40 @@ const (
37
40
socketCloseGracePeriod = 10 * time .Second
38
41
socketMaxMessageSize = 8192
39
42
43
+ maxCacheSize = 10000
40
44
maxLogLinesPerRequest = 500
41
- pollPeriod = 250 // milliseconds
45
+ maxStreamsPerRequest = 50
46
+ pollPeriod = 250 * time .Millisecond
47
+ streamRefreshPeriod = 2 * time .Second
42
48
)
43
49
50
+ type eventCache struct {
51
+ size int
52
+ seen strset.Set
53
+ eventQueue * deque.Deque
54
+ }
55
+
56
+ func newEventCache (cacheSize int ) eventCache {
57
+ return eventCache {
58
+ size : cacheSize ,
59
+ seen : strset .New (),
60
+ eventQueue : deque .New (),
61
+ }
62
+ }
63
+
64
+ func (c * eventCache ) Has (eventID string ) bool {
65
+ return c .seen .Has (eventID )
66
+ }
67
+
68
+ func (c * eventCache ) Add (eventID string ) {
69
+ if c .eventQueue .Size () == c .size {
70
+ eventID := c .eventQueue .PopLeft ().(string )
71
+ c .seen .Remove (eventID )
72
+ }
73
+ c .seen .Add (eventID )
74
+ c .eventQueue .PushRight (eventID )
75
+ }
76
+
44
77
func ReadLogs (appName string , podLabels map [string ]string , socket * websocket.Conn ) {
45
78
podCheckCancel := make (chan struct {})
46
79
defer close (podCheckCancel )
@@ -67,14 +100,29 @@ func StreamFromCloudWatch(podCheckCancel chan struct{}, appName string, podLabel
67
100
timer := time .NewTimer (0 )
68
101
defer timer .Stop ()
69
102
70
- lastTimestamp := int64 (0 )
71
- previousEvents := strset .New ()
103
+ lastLogTime := time .Now ()
104
+ lastLogStreamUpdateTime := time .Now ().Add (- 1 * streamRefreshPeriod )
105
+
106
+ logStreamNames := strset .New ()
72
107
73
108
var currentContextID string
74
109
var prefix string
75
- var ctx * context.Context
76
110
var err error
77
111
112
+ var ctx = CurrentContext (appName )
113
+ eventCache := newEventCache (maxCacheSize )
114
+
115
+ if ctx == nil {
116
+ writeAndCloseSocket (socket , "\n deployment " + appName + " not found" )
117
+ return
118
+ }
119
+
120
+ logGroupName , err := getLogGroupName (ctx , podLabels )
121
+ if err != nil {
122
+ writeAndCloseSocket (socket , err .Error ()) // unexpected
123
+ return
124
+ }
125
+
78
126
for {
79
127
select {
80
128
case <- podCheckCancel :
@@ -83,8 +131,7 @@ func StreamFromCloudWatch(podCheckCancel chan struct{}, appName string, podLabel
83
131
ctx = CurrentContext (appName )
84
132
85
133
if ctx == nil {
86
- writeString (socket , "\n deployment " + appName + " not found" )
87
- closeSocket (socket )
134
+ writeAndCloseSocket (socket , "\n deployment " + appName + " not found" )
88
135
continue
89
136
}
90
137
@@ -93,105 +140,129 @@ func StreamFromCloudWatch(podCheckCancel chan struct{}, appName string, podLabel
93
140
if podLabels ["workloadType" ] == resource .APIType .String () {
94
141
apiName := podLabels ["apiName" ]
95
142
if _ , ok := ctx .APIs [apiName ]; ! ok {
96
- writeString (socket , "\n api " + apiName + " was not found in latest deployment" )
97
- closeSocket (socket )
143
+ writeAndCloseSocket (socket , "\n api " + apiName + " was not found in latest deployment" )
98
144
continue
99
145
}
100
146
writeString (socket , "\n a new deployment was detected, streaming logs from the latest deployment" )
101
147
} else {
102
- writeString (socket , "\n logging non-api workloads is not supported" ) // unexpected
103
- closeSocket (socket )
148
+ writeAndCloseSocket (socket , "\n logging non-api workloads is not supported" ) // unexpected
104
149
continue
105
150
}
106
151
} else {
107
- lastTimestamp = ctx .CreatedEpoch * 1000
108
- }
109
-
110
- if podLabels ["workloadType" ] == resource .APIType .String () {
111
- podLabels ["workloadID" ] = ctx .APIs [podLabels ["apiName" ]].WorkloadID
152
+ lastLogTime , _ = getPodStartTime (podLabels )
112
153
}
113
154
114
155
currentContextID = ctx .ID
115
-
116
156
writeString (socket , "\n retrieving logs..." )
117
- prefix = ""
118
157
}
119
158
120
- if len ( prefix ) == 0 {
121
- prefix , err = getPrefix ( podLabels )
159
+ if lastLogStreamUpdateTime . Add ( streamRefreshPeriod ). Before ( time . Now ()) {
160
+ newLogStreamNames , err := getLogStreams ( logGroupName )
122
161
if err != nil {
123
- writeString (socket , err .Error ())
124
- closeSocket (socket )
162
+ writeAndCloseSocket (socket , "error encountered while searching for log streams: " + err .Error ())
125
163
continue
126
164
}
165
+
166
+ if ! logStreamNames .IsEqual (newLogStreamNames ) {
167
+ lastLogTime = lastLogTime .Add (- streamRefreshPeriod )
168
+ logStreamNames = newLogStreamNames
169
+ }
170
+ lastLogStreamUpdateTime = time .Now ()
127
171
}
128
172
129
- if len (prefix ) == 0 {
173
+ if len (logStreamNames ) == 0 {
130
174
timer .Reset (pollPeriod )
131
175
continue
132
176
}
133
177
134
- endTime := time .Now (). Unix () * 1000
135
- startTime := lastTimestamp - pollPeriod
178
+ endTime := libtime . ToMillis ( time .Now ())
179
+
136
180
logEventsOutput , err := config .AWS .CloudWatchLogsClient .FilterLogEvents (& cloudwatchlogs.FilterLogEventsInput {
137
- LogGroupName : aws .String (config . Cortex . LogGroup ),
138
- LogStreamNamePrefix : aws .String ( prefix ),
139
- StartTime : aws .Int64 (startTime ),
140
- EndTime : aws .Int64 (endTime ), // requires milliseconds
141
- Limit : aws .Int64 (int64 (maxLogLinesPerRequest )),
181
+ LogGroupName : aws .String (logGroupName ),
182
+ LogStreamNames : aws .StringSlice ( logStreamNames . Slice () ),
183
+ StartTime : aws .Int64 (libtime . ToMillis ( lastLogTime . Add ( - pollPeriod )) ),
184
+ EndTime : aws .Int64 (endTime ),
185
+ Limit : aws .Int64 (int64 (maxLogLinesPerRequest )),
142
186
})
143
187
144
188
if err != nil {
145
- if ! awslib .CheckErrCode (err , "ResourceNotFoundException" ) {
146
- writeString (socket , "error encountered while fetching logs from cloudwatch: " + err .Error ())
147
- closeSocket (socket )
189
+ if ! awslib .CheckErrCode (err , cloudwatchlogs .ErrCodeResourceNotFoundException ) {
190
+ writeAndCloseSocket (socket , "error encountered while fetching logs from cloudwatch: " + err .Error ())
148
191
continue
149
192
}
150
193
}
151
194
152
- newEvents := strset . New ( )
195
+ lastLogTimestampMillis := libtime . ToMillis ( lastLogTime )
153
196
for _ , logEvent := range logEventsOutput .Events {
154
197
var log FluentdLog
155
198
json .Unmarshal ([]byte (* logEvent .Message ), & log )
156
-
157
- if ! previousEvents .Has (* logEvent .EventId ) {
199
+ if ! eventCache .Has (* logEvent .EventId ) {
158
200
socket .WriteMessage (websocket .TextMessage , []byte (log .Log ))
159
- if * logEvent .Timestamp > lastTimestamp {
160
- lastTimestamp = * logEvent .Timestamp
201
+ if * logEvent .Timestamp > lastLogTimestampMillis {
202
+ lastLogTimestampMillis = * logEvent .Timestamp
161
203
}
204
+ eventCache .Add (* logEvent .EventId )
162
205
}
163
- newEvents .Add (* logEvent .EventId )
164
206
}
165
207
208
+ lastLogTime = libtime .MillisToTime (lastLogTimestampMillis )
166
209
if len (logEventsOutput .Events ) == maxLogLinesPerRequest {
167
- socket . WriteMessage ( websocket . TextMessage , [] byte ( "---- Showing at most " + s .Int (maxLogLinesPerRequest )+ " lines. Visit AWS cloudwatch logs console and search for \" " + prefix + "\" in log group \" " + config .Cortex .LogGroup + "\" for complete logs ----" ) )
168
- lastTimestamp = endTime
210
+ writeString ( socket , "---- Showing at most " + s .Int (maxLogLinesPerRequest )+ " lines. Visit AWS cloudwatch logs console and search for \" " + prefix + "\" in log group \" " + config .Cortex .LogGroup + "\" for complete logs ----" )
211
+ lastLogTime = libtime . MillisToTime ( endTime )
169
212
}
170
213
171
- previousEvents = newEvents
172
- timer .Reset (pollPeriod * time .Millisecond )
214
+ timer .Reset (pollPeriod )
173
215
}
174
216
}
175
217
}
176
218
177
- func getPrefix (searchLabels map [string ]string ) (string , error ) {
219
+ func getLogStreams (logGroupName string ) (strset.Set , error ) {
220
+ describeLogStreamsOutput , err := config .AWS .CloudWatchLogsClient .DescribeLogStreams (& cloudwatchlogs.DescribeLogStreamsInput {
221
+ OrderBy : aws .String (cloudwatchlogs .OrderByLastEventTime ),
222
+ Descending : aws .Bool (true ),
223
+ LogGroupName : aws .String (logGroupName ),
224
+ Limit : aws .Int64 (maxStreamsPerRequest ),
225
+ })
226
+ if err != nil {
227
+ if ! awslib .CheckErrCode (err , cloudwatchlogs .ErrCodeResourceNotFoundException ) {
228
+ return nil , err
229
+ }
230
+ return nil , nil
231
+ }
232
+
233
+ streams := strset .New ()
234
+
235
+ for _ , stream := range describeLogStreamsOutput .LogStreams {
236
+ streams .Add (* stream .LogStreamName )
237
+ }
238
+ return streams , nil
239
+ }
240
+
241
+ func getPodStartTime (searchLabels map [string ]string ) (time.Time , error ) {
178
242
pods , err := config .Kubernetes .ListPodsByLabels (searchLabels )
179
243
if err != nil {
180
- return "" , err
244
+ return time. Time {} , err
181
245
}
182
246
183
247
if len (pods ) == 0 {
184
- return "" , nil
248
+ return time . Now () , nil
185
249
}
186
250
187
- podLabels := pods [0 ].GetLabels ()
188
- if apiName , ok := podLabels [ "apiName" ]; ok {
189
- if podTemplateHash , ok := podLabels [ " pod-template-hash" ]; ok {
190
- return internalAPIName ( apiName , podLabels [ "appName" ]) + "-" + podTemplateHash , nil
251
+ startTime := pods [0 ].CreationTimestamp . Time
252
+ for _ , pod := range pods [ 1 :] {
253
+ if pod . CreationTimestamp . Time . Before ( startTime ) {
254
+ startTime = pod . CreationTimestamp . Time
191
255
}
192
- return "" , nil // unexpected, pod template hash not set yet
193
256
}
194
- return pods [0 ].Name , nil // unexpected, logging non api resources
257
+
258
+ return startTime , nil
259
+ }
260
+
261
+ func getLogGroupName (ctx * context.Context , searchLabels map [string ]string ) (string , error ) {
262
+ if searchLabels ["workloadType" ] == resource .APIType .String () {
263
+ return ctx .LogGroupName (searchLabels ["apiName" ]), nil
264
+ }
265
+ return "nil" , errors .New ("unsupported workload type" ) // unexpected
195
266
}
196
267
197
268
func writeString (socket * websocket.Conn , message string ) {
@@ -203,3 +274,8 @@ func closeSocket(socket *websocket.Conn) {
203
274
socket .WriteMessage (websocket .CloseMessage , websocket .FormatCloseMessage (websocket .CloseNormalClosure , "" ))
204
275
time .Sleep (socketCloseGracePeriod )
205
276
}
277
+
278
+ func writeAndCloseSocket (socket * websocket.Conn , message string ) {
279
+ writeString (socket , message )
280
+ closeSocket (socket )
281
+ }
0 commit comments