Skip to content

Commit 1fbd2c4

Browse files
authored
Fix bad interactions between timeouts and build retires (#10480)
1 parent b8bc858 commit 1fbd2c4

8 files changed

Lines changed: 121 additions & 36 deletions

File tree

toolkit/tools/internal/logger/log.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,17 @@ func PanicOnError(err interface{}, args ...interface{}) {
163163
}
164164
}
165165

166+
// FatalOnError logs a fatal error and any message strings, then exists (while
167+
// running any cleanup functions registered with the log package)
168+
func FatalOnError(err interface{}, args ...interface{}) {
169+
if err != nil {
170+
if len(args) > 0 {
171+
Log.Errorf(args[0].(string), args[1:]...)
172+
}
173+
Log.Fatalln(err)
174+
}
175+
}
176+
166177
// WarningOnError logs a warning error and any message strings
167178
func WarningOnError(err interface{}, args ...interface{}) {
168179
if err != nil {

toolkit/tools/internal/shell/shell.go

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -41,17 +41,30 @@ func CurrentEnvironment() []string {
4141

4242
// PermanentlyStopAllChildProcesses will send the provided signal to all processes spawned by this package,
4343
// and all of those process's children.
44-
// Invoking this will also block future process creation, causing the Execute methods to return an error.
44+
// Invoking this will also block future process creation, causing the Execute methods to return an error. Be aware that
45+
// this will block the gpg-agent cleanup mechanism from running, which may cause chroots to not unmount properly. Consider
46+
// using StopAllChildProcesses instead.
4547
func PermanentlyStopAllChildProcesses(signal unix.Signal) {
4648
// Acquire the global activeCommandsMutex to ensure no
4749
// new commands are executed during this teardown routine
48-
logger.Log.Info("Waiting for outstanding processes to be created")
50+
logger.Log.Info("Waiting for outstanding processes to be created before blocking all future processes")
4951

5052
activeCommandsMutex.Lock()
51-
defer activeCommandsMutex.Unlock()
52-
5353
// Disallow future processes from being created
5454
allowProcessCreation = false
55+
activeCommandsMutex.Unlock()
56+
57+
StopAllChildProcesses(signal)
58+
}
59+
60+
// StopAllChildProcesses will stop all currently running processes spawned by this package, but will not block future process creation.
61+
func StopAllChildProcesses(signal unix.Signal) {
62+
// Acquire the global activeCommandsMutex to ensure no
63+
// new commands are executed during this teardown routine
64+
logger.Log.Info("Waiting for outstanding processes to be created before stopping all child processes")
65+
66+
activeCommandsMutex.Lock()
67+
defer activeCommandsMutex.Unlock()
5568

5669
// For every running process, issue the provided signal to its process group,
5770
// resulting in both the process and all of its children being stopped.

toolkit/tools/pkgworker/pkgworker.go

Lines changed: 45 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -74,13 +74,13 @@ func main() {
7474
logger.InitBestEffort(logFlags)
7575

7676
rpmsDirAbsPath, err := filepath.Abs(*rpmsDirPath)
77-
logger.PanicOnError(err, "Unable to find absolute path for RPMs directory '%s'", *rpmsDirPath)
77+
logger.FatalOnError(err, "Unable to find absolute path for RPMs directory '%s'", *rpmsDirPath)
7878

7979
toolchainDirAbsPath, err := filepath.Abs(*toolchainDirPath)
80-
logger.PanicOnError(err, "Unable to find absolute path for toolchain RPMs directory '%s'", *toolchainDirPath)
80+
logger.FatalOnError(err, "Unable to find absolute path for toolchain RPMs directory '%s'", *toolchainDirPath)
8181

8282
srpmsDirAbsPath, err := filepath.Abs(*srpmsDirPath)
83-
logger.PanicOnError(err, "Unable to find absolute path for SRPMs directory '%s'", *srpmsDirPath)
83+
logger.FatalOnError(err, "Unable to find absolute path for SRPMs directory '%s'", *srpmsDirPath)
8484

8585
chrootDir := buildChrootDirPath(*workDir, *srpmFile, *runCheck)
8686

@@ -118,15 +118,15 @@ func main() {
118118
}
119119

120120
builtRPMs, err := buildSRPMInChroot(chrootDir, rpmsDirAbsPath, toolchainDirAbsPath, *workerTar, *srpmFile, *repoFile, *rpmmacrosFile, *outArch, defines, *noCleanup, *runCheck, *packagesToInstall, ccacheManager, *timeout)
121-
logger.PanicOnError(err, "Failed to build SRPM '%s'. For details see log file: %s .", *srpmFile, *logFlags.LogFile)
121+
logger.FatalOnError(err, "Failed to build SRPM '%s'. For details see log file: %s .", *srpmFile, *logFlags.LogFile)
122122

123123
// For regular (non-test) package builds:
124124
// - Copy the SRPM which produced the package to the output directory.
125125
// - Write a comma-separated list of RPMs built to stdout that can be parsed by the invoker.
126126
// Any output from logger will be on stderr so stdout will only contain this output.
127127
if !*runCheck {
128128
err = copySRPMToOutput(*srpmFile, srpmsDirAbsPath)
129-
logger.PanicOnError(err, "Failed to copy SRPM '%s' to output directory '%s'.", *srpmFile, rpmsDirAbsPath)
129+
logger.FatalOnError(err, "Failed to copy SRPM '%s' to output directory '%s'.", *srpmFile, rpmsDirAbsPath)
130130

131131
fmt.Print(strings.Join(builtRPMs, ","))
132132
}
@@ -171,6 +171,7 @@ func buildSRPMInChroot(chrootDir, rpmDirPath, toolchainDirPath, workerTar, srpmF
171171
quit := make(chan bool)
172172
go func() {
173173
logger.Log.Infof("Building (%s).", srpmBaseName)
174+
startTime := time.Now()
174175

175176
for {
176177
select {
@@ -180,7 +181,7 @@ func buildSRPMInChroot(chrootDir, rpmDirPath, toolchainDirPath, workerTar, srpmF
180181
}
181182
return
182183
case <-time.After(buildHeartbeatTimeout):
183-
logger.Log.Infof("Heartbeat: still building (%s).", srpmBaseName)
184+
logger.Log.Infof("Heartbeat: still building (%s) after %s.", srpmBaseName, time.Since(startTime).String())
184185
}
185186
}
186187
}()
@@ -214,34 +215,48 @@ func buildSRPMInChroot(chrootDir, rpmDirPath, toolchainDirPath, workerTar, srpmF
214215

215216
err = chroot.Initialize(workerTar, extraDirs, mountPoints, true)
216217
if err != nil {
218+
err = fmt.Errorf("failed to initialize chroot:\n%w", err)
217219
return
218220
}
219221
defer chroot.Close(noCleanup)
220222

221223
// Place extra files that will be needed to build into the chroot
222224
srpmFileInChroot, err := copyFilesIntoChroot(chroot, srpmFile, repoFile, rpmmacrosFile, runCheck)
223225
if err != nil {
226+
err = fmt.Errorf("failed to copy files into chroot:\n%w", err)
224227
return
225228
}
226229

227-
// Run the build in a go routine so we can monitor and kill it if it takes too long.
230+
// Run the build in a goroutine so we can monitor and kill it if it takes too long.
231+
//
232+
// It is important to run the timeout logic inside the chroot so that the chroot cleanup
233+
// flow in chroot.Run() is executed if the build times out.
234+
//
235+
// If the timeout logic is run outside of the chroot.Run() call, the process will still
236+
// be running in the chroot's context and the automatic chroot cleanup triggered by the
237+
// process exiting will fail (see safechroot.go:cleanupAllChroots()). For example,
238+
// `unmount /path/to/chroot/dev` will fail since our root is currently `/path/to/chroot`,
239+
// and `/path/to/chroot/path/to/chroot/dev` is not a real path.
228240
results := make(chan error)
229-
go func() {
230-
buildErr := chroot.Run(func() (err error) {
231-
return buildRPMFromSRPMInChroot(srpmFileInChroot, outArch, runCheck, defines, packagesToInstall, isCCacheEnabled(ccacheManager))
232-
})
233-
results <- buildErr
234-
}()
235-
236-
select {
237-
case err = <-results:
238-
case <-time.After(timeout):
239-
logger.Log.Errorf("Timeout after %v: killing all processes in chroot...", timeout)
240-
shell.PermanentlyStopAllChildProcesses(unix.SIGKILL)
241-
err = fmt.Errorf("build timed out after %s", timeout)
242-
}
241+
err = chroot.Run(func() (err error) {
242+
go func() {
243+
results <- buildRPMFromSRPMInChroot(srpmFileInChroot, outArch, runCheck, defines, packagesToInstall, isCCacheEnabled(ccacheManager))
244+
}()
245+
246+
var chrootErr error = nil
247+
select {
248+
case chrootErr = <-results:
249+
logger.Log.Debug("Build thread in chroot finished.")
250+
case <-time.After(timeout):
251+
logger.Log.Errorf("Timeout after %v: stopping chroot...", timeout)
252+
shell.StopAllChildProcesses(unix.SIGKILL)
253+
chrootErr = fmt.Errorf("build timed out after %s", timeout)
254+
}
255+
return chrootErr // Internal error is returned via the channel
256+
})
243257

244258
if err != nil {
259+
err = fmt.Errorf("failed to build RPM from SRPM in chroot:\n%w", err)
245260
return
246261
}
247262

@@ -266,18 +281,21 @@ func buildRPMFromSRPMInChroot(srpmFile, outArch string, runCheck bool, defines m
266281
// Convert /localrpms into a repository that a package manager can use.
267282
err = rpmrepomanager.CreateRepo(chrootLocalRpmsDir)
268283
if err != nil {
284+
err = fmt.Errorf("failed to create local RPM repository:\n%w", err)
269285
return
270286
}
271287

272288
// Convert /toolchainrpms into a repository that a package manager can use.
273289
err = rpmrepomanager.CreateRepo(chrootLocalToolchainDir)
274290
if err != nil {
291+
err = fmt.Errorf("failed to create toolchain RPM repository:\n%w", err)
275292
return
276293
}
277294

278295
// install any additional packages, such as build dependencies.
279296
err = tdnfInstall(packagesToInstall)
280297
if err != nil {
298+
err = fmt.Errorf("failed to install additional packages:\n%w", err)
281299
return
282300
}
283301

@@ -286,6 +304,7 @@ func buildRPMFromSRPMInChroot(srpmFile, outArch string, runCheck bool, defines m
286304
logger.Log.Infof("USE_CCACHE: installing package: %s", ccachePkgName[0])
287305
err = tdnfInstall(ccachePkgName)
288306
if err != nil {
307+
err = fmt.Errorf("failed to install ccache:\n%w", err)
289308
return
290309
}
291310
}
@@ -296,6 +315,7 @@ func buildRPMFromSRPMInChroot(srpmFile, outArch string, runCheck bool, defines m
296315
// build failures.
297316
err = removeLibArchivesFromSystem()
298317
if err != nil {
318+
err = fmt.Errorf("failed to remove lib archives from system:\n%w", err)
299319
return
300320
}
301321

@@ -305,6 +325,10 @@ func buildRPMFromSRPMInChroot(srpmFile, outArch string, runCheck bool, defines m
305325
} else {
306326
err = rpm.BuildRPMFromSRPM(srpmFile, outArch, defines)
307327
}
328+
if err != nil {
329+
err = fmt.Errorf("failed to build/test RPM from SRPM:\n%w", err)
330+
return
331+
}
308332

309333
return
310334
}

toolkit/tools/scheduler/buildagents/chrootagent.go

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import (
77
"fmt"
88
"path/filepath"
99
"strings"
10+
"time"
1011

1112
"github.com/microsoft/azurelinux/toolkit/tools/internal/shell"
1213
"github.com/sirupsen/logrus"
@@ -38,7 +39,7 @@ func (c *ChrootAgent) Initialize(config *BuildAgentConfig) (err error) {
3839
// - outArch is the target architecture to build for.
3940
// - runCheck is true if the package should run the "%check" section during the build
4041
// - dependencies is a list of dependencies that need to be installed before building.
41-
func (c *ChrootAgent) BuildPackage(basePackageName, inputFile, logName, outArch string, runCheck bool, dependencies []string) (builtFiles []string, logFile string, err error) {
42+
func (c *ChrootAgent) BuildPackage(basePackageName, inputFile, logName, outArch string, runCheck bool, dependencies []string, allowableRuntime time.Duration) (builtFiles []string, logFile string, err error) {
4243
// On success, pkgworker will print a comma-seperated list of all RPMs built to stdout.
4344
// This will be the last stdout line written.
4445
const delimiter = ","
@@ -50,7 +51,7 @@ func (c *ChrootAgent) BuildPackage(basePackageName, inputFile, logName, outArch
5051
lastStdoutLine = strings.TrimSpace(line)
5152
}
5253

53-
args := serializeChrootBuildAgentConfig(c.config, basePackageName, inputFile, logFile, outArch, runCheck, dependencies)
54+
args := serializeChrootBuildAgentConfig(c.config, basePackageName, inputFile, logFile, outArch, runCheck, dependencies, allowableRuntime)
5455
err = shell.NewExecBuilder(c.config.Program, args...).
5556
StdoutCallback(onStdout).
5657
LogLevel(logrus.TraceLevel, logrus.TraceLevel).
@@ -75,7 +76,7 @@ func (c *ChrootAgent) Close() (err error) {
7576
}
7677

7778
// serializeChrootBuildAgentConfig serializes a BuildAgentConfig into arguments usable by pkgworker for the sake of building the package.
78-
func serializeChrootBuildAgentConfig(config *BuildAgentConfig, basePackageName, inputFile, logFile, outArch string, runCheck bool, dependencies []string) (serializedArgs []string) {
79+
func serializeChrootBuildAgentConfig(config *BuildAgentConfig, basePackageName, inputFile, logFile, outArch string, runCheck bool, dependencies []string, allowableRuntime time.Duration) (serializedArgs []string) {
7980
serializedArgs = []string{
8081
fmt.Sprintf("--input=%s", inputFile),
8182
fmt.Sprintf("--work-dir=%s", config.WorkDir),
@@ -93,7 +94,7 @@ func serializeChrootBuildAgentConfig(config *BuildAgentConfig, basePackageName,
9394
fmt.Sprintf("--log-level=%s", config.LogLevel),
9495
fmt.Sprintf("--out-arch=%s", outArch),
9596
fmt.Sprintf("--max-cpu=%s", config.MaxCpu),
96-
fmt.Sprintf("--timeout=%s", config.Timeout.String()),
97+
fmt.Sprintf("--timeout=%s", allowableRuntime),
9798
}
9899

99100
if config.RpmmacrosFile != "" {

toolkit/tools/scheduler/buildagents/definition.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,8 @@ type BuildAgent interface {
4848
// - outArch is the target architecture to build for.
4949
// - runCheck is true if the package should run the "%check" section during the build
5050
// - dependencies is a list of dependencies that need to be installed before building.
51-
BuildPackage(basePackageName, inputFile, logName, outArch string, runCheck bool, dependencies []string) ([]string, string, error)
51+
// - allowableRuntime is how long the package build is allowed to run.
52+
BuildPackage(basePackageName, inputFile, logName, outArch string, runCheck bool, dependencies []string, allowableRuntime time.Duration) ([]string, string, error)
5253

5354
// Config returns a copy of the agent's configuration.
5455
Config() BuildAgentConfig

toolkit/tools/scheduler/buildagents/testagent.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ func (t *TestAgent) Initialize(config *BuildAgentConfig) (err error) {
2828
}
2929

3030
// BuildPackage simply sleeps and then returns success for TestAgent.
31-
func (t *TestAgent) BuildPackage(basePackageName, inputFile, logName, outArch string, runCheck bool, dependencies []string) (builtFiles []string, logFile string, err error) {
31+
func (t *TestAgent) BuildPackage(basePackageName, inputFile, logName, outArch string, runCheck bool, dependencies []string, allowableRuntime time.Duration) (builtFiles []string, logFile string, err error) {
3232
const sleepDuration = time.Second * 5
3333
time.Sleep(sleepDuration)
3434

toolkit/tools/scheduler/scheduler.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -264,7 +264,7 @@ func cancelOutstandingBuilds(agent buildagents.BuildAgent) {
264264
}
265265

266266
// Issue a SIGINT to all children processes to allow them to gracefully exit.
267-
shell.PermanentlyStopAllChildProcesses(unix.SIGINT)
267+
shell.StopAllChildProcesses(unix.SIGINT)
268268
}
269269

270270
// cancelBuildsOnSignal will stop any builds running on SIGINT/SIGTERM.

toolkit/tools/scheduler/schedulerutils/buildworker.go

Lines changed: 39 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ package schedulerutils
55

66
import (
77
"bufio"
8+
"context"
89
"fmt"
910
"os"
1011
"path/filepath"
@@ -278,10 +279,27 @@ func buildSRPMFile(agent buildagents.BuildAgent, buildAttempts int, basePackageN
278279
)
279280

280281
logBaseName := filepath.Base(srpmFile) + ".log"
281-
err = retry.Run(func() (buildErr error) {
282-
builtFiles, logFile, buildErr = agent.BuildPackage(basePackageName, srpmFile, logBaseName, outArch, runCheck, dependencies)
282+
283+
// Track the time the build may take, and ensure we don't exceed the maximum limit.
284+
attemptNumber := 0
285+
totalExecutionTimeout := agent.Config().Timeout
286+
deadline := time.Now().Add(totalExecutionTimeout)
287+
ctx, cancelFunc := context.WithDeadline(context.Background(), deadline)
288+
defer cancelFunc()
289+
290+
wasCancelled, err := retry.RunWithLinearBackoff(ctx, func() (buildErr error) {
291+
if attemptNumber > 0 {
292+
logger.Log.Warnf("Build for '%s' failed %d times, retrying up to %d times.", srpmFile, attemptNumber, buildAttempts)
293+
}
294+
attemptNumber++
295+
296+
builtFiles, logFile, buildErr = agent.BuildPackage(basePackageName, srpmFile, logBaseName, outArch, runCheck, dependencies, time.Until(deadline))
283297
return
284298
}, buildAttempts, retryDuration)
299+
if wasCancelled {
300+
err = fmt.Errorf("after %d/%d attempts, the build exceeded the maximum time of %s", attemptNumber, buildAttempts, totalExecutionTimeout)
301+
return
302+
}
285303

286304
return
287305
}
@@ -296,10 +314,23 @@ func testSRPMFile(agent buildagents.BuildAgent, checkAttempts int, basePackageNa
296314
)
297315

298316
logBaseName := filepath.Base(srpmFile) + ".test.log"
299-
err = retry.Run(func() (buildErr error) {
317+
318+
// Track the time the build may take, and ensure we don't exceed the maximum limit.
319+
attemptNumber := 0
320+
totalExecutionTimeout := agent.Config().Timeout
321+
deadline := time.Now().Add(totalExecutionTimeout)
322+
ctx, cancelFunc := context.WithDeadline(context.Background(), deadline)
323+
defer cancelFunc()
324+
325+
wasCancelled, err := retry.RunWithLinearBackoff(ctx, func() (buildErr error) {
326+
if attemptNumber > 0 {
327+
logger.Log.Warnf("Test for '%s' failed %d times, retrying up to %d times.", srpmFile, attemptNumber, checkAttempts)
328+
}
329+
attemptNumber++
330+
300331
checkFailed = false
301332

302-
_, logFile, buildErr = agent.BuildPackage(basePackageName, srpmFile, logBaseName, outArch, runCheck, dependencies)
333+
_, logFile, buildErr = agent.BuildPackage(basePackageName, srpmFile, logBaseName, outArch, runCheck, dependencies, time.Until(deadline))
303334
if buildErr != nil {
304335
logger.Log.Warnf("Test build for '%s' failed on a non-test build issue. Error: %s", srpmFile, buildErr)
305336
return
@@ -312,6 +343,10 @@ func testSRPMFile(agent buildagents.BuildAgent, checkAttempts int, basePackageNa
312343
}
313344
return
314345
}, checkAttempts, retryDuration)
346+
if wasCancelled {
347+
err = fmt.Errorf("after %d/%d attempts, the check exceeded the maximum time of %s", attemptNumber, checkAttempts, totalExecutionTimeout)
348+
return
349+
}
315350

316351
if checkFailed {
317352
logger.Log.Debugf("Tests failed for '%s' after %d attempt(s).", basePackageName, checkAttempts)

0 commit comments

Comments
 (0)