Skip to content

Commit

Permalink
retry failed containers separately so we can run them in parallel (#58)
Browse files Browse the repository at this point in the history
  • Loading branch information
henrygd committed Oct 21, 2024
1 parent 5f4dcb0 commit 539c0cc
Showing 1 changed file with 23 additions and 6 deletions.
29 changes: 23 additions & 6 deletions beszel/internal/agent/docker.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ func (dm *dockerManager) getDockerStats() ([]*container.Stats, error) {
clear(dm.validIds)
}

var failedContainters []container.ApiInfo

for _, ctr := range *dm.apiContainerList {
ctr.IdShort = ctr.Id[:12]
dm.validIds[ctr.IdShort] = struct{}{}
Expand All @@ -74,18 +76,33 @@ func (dm *dockerManager) getDockerStats() ([]*container.Stats, error) {
defer dm.dequeue()
err := dm.updateContainerStats(ctr)
if err != nil {
dm.deleteContainerStatsSync(ctr.IdShort)
// retry once
err = dm.updateContainerStats(ctr)
if err != nil {
slog.Error("Error getting container stats", "err", err)
}
dm.containerStatsMutex.Lock()
delete(dm.containerStatsMap, ctr.IdShort)
failedContainters = append(failedContainters, ctr)
dm.containerStatsMutex.Unlock()
}
}()
}

dm.wg.Wait()

// retry failed containers separately so we can run them in parallel (docker 24 bug)
if len(failedContainters) > 0 {
slog.Debug("Retrying failed containers", "count", len(failedContainters))
// time.Sleep(time.Millisecond * 1100)
for _, ctr := range failedContainters {
dm.wg.Add(1)
go func() {
defer dm.wg.Done()
err = dm.updateContainerStats(ctr)
if err != nil {
slog.Error("Error getting container stats", "err", err)
}
}()
}
dm.wg.Wait()
}

// populate final stats and remove old / invalid container stats
stats := make([]*container.Stats, 0, containersLength)
for id, v := range dm.containerStatsMap {
Expand Down

0 comments on commit 539c0cc

Please sign in to comment.