Skip to content

Commit c4f2157

Browse files
authored
Merge pull request #198 from intel/runperfrace
address race condition in metrics event processing
2 parents 4188374 + 773aa4c commit c4f2157

File tree

2 files changed

+44
-47
lines changed

2 files changed

+44
-47
lines changed

cmd/metrics/metric.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ type MetricFrame struct {
3636
}
3737

3838
// ProcessEvents is responsible for producing metrics from raw perf events
39-
func ProcessEvents(perfEvents [][]byte, eventGroupDefinitions []GroupDefinition, metricDefinitions []MetricDefinition, processes []Process, previousTimestamp float64, metadata Metadata, outputDir string) (metricFrames []MetricFrame, timeStamp float64, err error) {
39+
func ProcessEvents(perfEvents [][]byte, eventGroupDefinitions []GroupDefinition, metricDefinitions []MetricDefinition, processes []Process, previousTimestamp float64, metadata Metadata) (metricFrames []MetricFrame, timeStamp float64, err error) {
4040
var eventFrames []EventFrame
4141
if eventFrames, err = GetEventFrames(perfEvents, eventGroupDefinitions, flagScope, flagGranularity, metadata); err != nil { // arrange the events into groups
4242
err = fmt.Errorf("failed to put perf events into groups: %v", err)

cmd/metrics/metrics.go

Lines changed: 43 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1212,14 +1212,15 @@ func runPerf(myTarget target.Target, noRoot bool, processes []Process, cmd *exec
12121212
}
12131213
// must manually terminate perf in cgroup scope when a timeout is specified and/or need to refresh cgroups
12141214
startPerfTimestamp := time.Now()
1215-
var timeout int
1215+
var cgroupTimeout int
12161216
if flagScope == scopeCgroup && (flagDuration != 0 || len(flagCidList) == 0) {
12171217
if flagDuration > 0 && flagDuration < flagRefresh {
1218-
timeout = flagDuration
1218+
cgroupTimeout = flagDuration
12191219
} else {
1220-
timeout = flagRefresh
1220+
cgroupTimeout = flagRefresh
12211221
}
12221222
}
1223+
// Start a goroutine to wait for and then process perf output
12231224
// Use a timer to determine when we received an entire frame of events from perf
12241225
// The timer will expire when no lines (events) have been received from perf for more than 100ms. This
12251226
// works because perf writes the events to stderr in a burst every collection interval, e.g., 5 seconds.
@@ -1230,22 +1231,17 @@ func runPerf(myTarget target.Target, noRoot bool, processes []Process, cmd *exec
12301231
frameCount := 0
12311232
stopAnonymousFuncChannel := make(chan bool)
12321233
go func() {
1234+
stop := false
12331235
for {
12341236
select {
12351237
case <-t1.C: // waits for timer to expire
12361238
case <-stopAnonymousFuncChannel: // wait for signal to exit the goroutine
1237-
return
1239+
stop = true // exit the loop
12381240
}
1239-
if len(outputLines) != 0 {
1240-
if flagWriteEventsToFile {
1241-
if err = writeEventsToFile(outputDir+"/"+myTarget.GetName()+"_"+"events.json", outputLines); err != nil {
1242-
err = fmt.Errorf("failed to write events to raw file: %v", err)
1243-
slog.Error(err.Error())
1244-
return
1245-
}
1246-
}
1241+
if !stop && len(outputLines) != 0 {
1242+
// process the events
12471243
var metricFrames []MetricFrame
1248-
if metricFrames, frameTimestamp, err = ProcessEvents(outputLines, eventGroupDefinitions, metricDefinitions, processes, frameTimestamp, metadata, outputDir); err != nil {
1244+
if metricFrames, frameTimestamp, err = ProcessEvents(outputLines, eventGroupDefinitions, metricDefinitions, processes, frameTimestamp, metadata); err != nil {
12491245
slog.Warn(err.Error())
12501246
outputLines = [][]byte{} // empty it
12511247
continue
@@ -1254,57 +1250,58 @@ func runPerf(myTarget target.Target, noRoot bool, processes []Process, cmd *exec
12541250
frameCount += 1
12551251
metricFrames[i].FrameCount = frameCount
12561252
}
1253+
// send the metrics frames out to be printed
12571254
frameChannel <- metricFrames
1258-
outputLines = [][]byte{} // empty it
1255+
// write the events to a file
1256+
if flagWriteEventsToFile {
1257+
if err = writeEventsToFile(outputDir+"/"+myTarget.GetName()+"_"+"events.json", outputLines); err != nil {
1258+
err = fmt.Errorf("failed to write events to raw file: %v", err)
1259+
slog.Error(err.Error())
1260+
return
1261+
}
1262+
}
1263+
// empty the outputLines
1264+
outputLines = [][]byte{}
12591265
}
1260-
if timeout != 0 && int(time.Since(startPerfTimestamp).Seconds()) > timeout {
1261-
err = localCommand.Process.Signal(os.Interrupt)
1262-
if err != nil {
1263-
err = fmt.Errorf("failed to terminate perf: %v", err)
1264-
slog.Error(err.Error())
1266+
// for cgroup scope, terminate perf if timeout is reached
1267+
if flagScope == scopeCgroup {
1268+
if stop || (cgroupTimeout != 0 && int(time.Since(startPerfTimestamp).Seconds()) > cgroupTimeout) {
1269+
err = localCommand.Process.Signal(os.Interrupt)
1270+
if err != nil {
1271+
err = fmt.Errorf("failed to terminate perf: %v", err)
1272+
slog.Error(err.Error())
1273+
}
12651274
}
12661275
}
1276+
if stop {
1277+
break
1278+
}
12671279
}
1280+
// signal that the goroutine is done
1281+
stopAnonymousFuncChannel <- true
12681282
}()
1269-
// read perf output
1283+
// receive perf output
12701284
done := false
12711285
for !done {
12721286
select {
1273-
case err := <-scriptErrorChannel:
1287+
case err := <-scriptErrorChannel: // if there is an error running perf, it comes here
12741288
if err != nil {
12751289
slog.Error("error from perf", slog.String("error", err.Error()))
12761290
}
1277-
done = true
1278-
case exitCode := <-exitcodeChannel:
1291+
done = true // exit the loop
1292+
case exitCode := <-exitcodeChannel: // when perf exits, the exit code comes to this channel
12791293
slog.Debug("perf exited", slog.Int("exit code", exitCode))
1280-
done = true
1281-
case line := <-stderrChannel:
1294+
done = true // exit the loop
1295+
case line := <-stderrChannel: // perf output comes in on this channel, one line at a time
12821296
t1.Stop()
12831297
t1.Reset(100 * time.Millisecond) // 100ms is somewhat arbitrary, but seems to work
1298+
// accumulate the lines, they will be processed in the goroutine when the timer expires
12841299
outputLines = append(outputLines, []byte(line))
12851300
}
12861301
}
12871302
t1.Stop()
12881303
// send signal to exit the goroutine
1289-
defer func() { stopAnonymousFuncChannel <- true }()
1290-
// process any remaining events
1291-
if len(outputLines) != 0 {
1292-
if flagWriteEventsToFile {
1293-
if err = writeEventsToFile(outputDir+"/"+myTarget.GetName()+"_"+"events.json", outputLines); err != nil {
1294-
err = fmt.Errorf("failed to write events to raw file: %v", err)
1295-
slog.Error(err.Error())
1296-
return
1297-
}
1298-
}
1299-
var metricFrames []MetricFrame
1300-
if metricFrames, frameTimestamp, err = ProcessEvents(outputLines, eventGroupDefinitions, metricDefinitions, processes, frameTimestamp, metadata, outputDir); err != nil {
1301-
slog.Error(err.Error())
1302-
return
1303-
}
1304-
for i := range metricFrames {
1305-
frameCount += 1
1306-
metricFrames[i].FrameCount = frameCount
1307-
}
1308-
frameChannel <- metricFrames
1309-
}
1304+
stopAnonymousFuncChannel <- true
1305+
// wait for the goroutine to exit
1306+
<-stopAnonymousFuncChannel
13101307
}

0 commit comments

Comments
 (0)