forgejo/models/git/commit_status.go
Mathieu Fenniak 270317a3ad fix: add forgejo doctor cleanup-commit-status command (#10686)
```
NAME:
   forgejo doctor cleanup-commit-status - Cleanup extra records in commit_status table

USAGE:
   forgejo doctor cleanup-commit-status

DESCRIPTION:
   Forgejo suffered from a bug which caused the creation of more entries in the
   "commit_status" table than necessary. This operation removes the redundant
   data caused by the bug. Removing this data is almost always safe.
   These reundant records can be accessed by users through the API, making it
   possible, but unlikely, that removing it could have an impact to
   integrating services (API: /repos/{owner}/{repo}/commits/{ref}/statuses).

   It is safe to run while Forgejo is online.

   On very large Forgejo instances, the performance of operation will improve
   if the buffer-size option is used with large values. Approximately 130 MB of
   memory is required for every 100,000 records in the buffer.

   Bug reference: https://codeberg.org/forgejo/forgejo/issues/10671

OPTIONS:
   --help, -h                       show help
   --custom-path string, -C string  Set custom path (defaults to '{WorkPath}/custom')
   --config string, -c string       Set custom config file (defaults to '{WorkPath}/custom/conf/app.ini')
   --work-path string, -w string    Set Forgejo's working path (defaults to the directory of the Forgejo binary)
   --verbose, -V                    Show process details
   --dry-run                        Report statistics from the operation but do not modify the database
   --buffer-size int                Record count per query while iterating records; larger values are typically faster but use more memory (default: 100000)
   --delete-chunk-size int          Number of records to delete per DELETE query (default: 1000)
```

The cleanup effectively performs `SELECT * FROM commit_status ORDER BY repo_id, sha, context, index, id`, and iterates through the records.  Whenever `index, id` changes without the other fields changing, then it's a useless record that can be deleted.  The major complication is doing that at scale without bringing the entire database table into memory, which is performed through a new iteration method `IterateByKeyset`.

Manually tested against a 455,303 record table in PostgreSQL, MySQL, and SQLite, which was reduced to 10,781 records, dropping 97.5% of the records.

Reviewed-on: https://codeberg.org/forgejo/forgejo/pulls/10686
Reviewed-by: Gusted <gusted@noreply.codeberg.org>
Co-authored-by: Mathieu Fenniak <mathieu@fenniak.net>
Co-committed-by: Mathieu Fenniak <mathieu@fenniak.net>
2026-01-10 23:15:21 +01:00

534 lines
17 KiB
Go

// Copyright 2017 Gitea. All rights reserved.
// SPDX-License-Identifier: MIT
package git
import (
"context"
"crypto/sha1"
"errors"
"fmt"
"net/url"
"strconv"
"strings"
"time"
asymkey_model "forgejo.org/models/asymkey"
"forgejo.org/models/db"
repo_model "forgejo.org/models/repo"
user_model "forgejo.org/models/user"
"forgejo.org/modules/git"
"forgejo.org/modules/log"
"forgejo.org/modules/setting"
api "forgejo.org/modules/structs"
"forgejo.org/modules/timeutil"
"forgejo.org/modules/translation"
"xorm.io/builder"
"xorm.io/xorm"
)
// CommitStatus holds a single Status of a single Commit
type CommitStatus struct {
ID int64 `xorm:"pk autoincr"`
Index int64 `xorm:"INDEX UNIQUE(repo_sha_index)"`
RepoID int64 `xorm:"INDEX UNIQUE(repo_sha_index)"`
Repo *repo_model.Repository `xorm:"-"`
State api.CommitStatusState `xorm:"VARCHAR(7) NOT NULL"`
SHA string `xorm:"VARCHAR(64) NOT NULL INDEX UNIQUE(repo_sha_index)"`
TargetURL string `xorm:"TEXT"`
Description string `xorm:"TEXT"`
ContextHash string `xorm:"VARCHAR(64) index"`
Context string `xorm:"TEXT"`
Creator *user_model.User `xorm:"-"`
CreatorID int64
CreatedUnix timeutil.TimeStamp `xorm:"INDEX created"`
UpdatedUnix timeutil.TimeStamp `xorm:"INDEX updated"`
}
func init() {
db.RegisterModel(new(CommitStatus))
db.RegisterModel(new(CommitStatusIndex))
}
func postgresGetCommitStatusIndex(ctx context.Context, repoID int64, sha string) (int64, error) {
res, err := db.GetEngine(ctx).Query("INSERT INTO `commit_status_index` (repo_id, sha, max_index) "+
"VALUES (?,?,1) ON CONFLICT (repo_id, sha) DO UPDATE SET max_index = `commit_status_index`.max_index+1 RETURNING max_index",
repoID, sha)
if err != nil {
return 0, err
}
if len(res) == 0 {
return 0, db.ErrGetResourceIndexFailed
}
return strconv.ParseInt(string(res[0]["max_index"]), 10, 64)
}
func mysqlGetCommitStatusIndex(ctx context.Context, repoID int64, sha string) (int64, error) {
if _, err := db.GetEngine(ctx).Exec("INSERT INTO `commit_status_index` (repo_id, sha, max_index) "+
"VALUES (?,?,1) ON DUPLICATE KEY UPDATE max_index = max_index+1",
repoID, sha); err != nil {
return 0, err
}
var idx int64
_, err := db.GetEngine(ctx).SQL("SELECT max_index FROM `commit_status_index` WHERE repo_id = ? AND sha = ?",
repoID, sha).Get(&idx)
if err != nil {
return 0, err
}
if idx == 0 {
return 0, errors.New("cannot get the correct index")
}
return idx, nil
}
// GetNextCommitStatusIndex retried 3 times to generate a resource index
func GetNextCommitStatusIndex(ctx context.Context, repoID int64, sha string) (int64, error) {
_, err := git.NewIDFromString(sha)
if err != nil {
return 0, git.ErrInvalidSHA{SHA: sha}
}
switch {
case setting.Database.Type.IsPostgreSQL():
return postgresGetCommitStatusIndex(ctx, repoID, sha)
case setting.Database.Type.IsMySQL():
return mysqlGetCommitStatusIndex(ctx, repoID, sha)
}
e := db.GetEngine(ctx)
// try to update the max_index to next value, and acquire the write-lock for the record
res, err := e.Exec("UPDATE `commit_status_index` SET max_index=max_index+1 WHERE repo_id=? AND sha=?", repoID, sha)
if err != nil {
return 0, fmt.Errorf("update failed: %w", err)
}
affected, err := res.RowsAffected()
if err != nil {
return 0, err
}
if affected == 0 {
// this slow path is only for the first time of creating a resource index
_, errIns := e.Exec("INSERT INTO `commit_status_index` (repo_id, sha, max_index) VALUES (?, ?, 0)", repoID, sha)
res, err = e.Exec("UPDATE `commit_status_index` SET max_index=max_index+1 WHERE repo_id=? AND sha=?", repoID, sha)
if err != nil {
return 0, fmt.Errorf("update2 failed: %w", err)
}
affected, err = res.RowsAffected()
if err != nil {
return 0, fmt.Errorf("RowsAffected failed: %w", err)
}
// if the update still can not update any records, the record must not exist and there must be some errors (insert error)
if affected == 0 {
if errIns == nil {
return 0, errors.New("impossible error when GetNextCommitStatusIndex, insert and update both succeeded but no record is updated")
}
return 0, fmt.Errorf("insert failed: %w", errIns)
}
}
// now, the new index is in database (protected by the transaction and write-lock)
var newIdx int64
has, err := e.SQL("SELECT max_index FROM `commit_status_index` WHERE repo_id=? AND sha=?", repoID, sha).Get(&newIdx)
if err != nil {
return 0, fmt.Errorf("select failed: %w", err)
}
if !has {
return 0, errors.New("impossible error when GetNextCommitStatusIndex, upsert succeeded but no record can be selected")
}
return newIdx, nil
}
func (status *CommitStatus) loadRepository(ctx context.Context) (err error) {
if status.Repo == nil {
status.Repo, err = repo_model.GetRepositoryByID(ctx, status.RepoID)
if err != nil {
return fmt.Errorf("getRepositoryByID [%d]: %w", status.RepoID, err)
}
}
return nil
}
func (status *CommitStatus) loadCreator(ctx context.Context) (err error) {
if status.Creator == nil && status.CreatorID > 0 {
status.Creator, err = user_model.GetUserByID(ctx, status.CreatorID)
if err != nil {
return fmt.Errorf("getUserByID [%d]: %w", status.CreatorID, err)
}
}
return nil
}
func (status *CommitStatus) loadAttributes(ctx context.Context) (err error) {
if err := status.loadRepository(ctx); err != nil {
return err
}
return status.loadCreator(ctx)
}
// APIURL returns the absolute APIURL to this commit-status.
func (status *CommitStatus) APIURL(ctx context.Context) string {
_ = status.loadAttributes(ctx)
return status.Repo.APIURL() + "/statuses/" + url.PathEscape(status.SHA)
}
// LocaleString returns the locale string name of the Status
func (status *CommitStatus) LocaleString(lang translation.Locale) string {
return lang.TrString("repo.commitstatus." + status.State.String())
}
// CalcCommitStatus returns commit status state via some status, the commit statues should order by id desc
func CalcCommitStatus(statuses []*CommitStatus) *CommitStatus {
if len(statuses) == 0 {
return nil
}
latestWorstStatus := statuses[0]
for _, status := range statuses[1:] {
if status.State.NoBetterThan(latestWorstStatus.State) {
latestWorstStatus = status
}
}
return latestWorstStatus
}
// CommitStatusOptions holds the options for query commit statuses
type CommitStatusOptions struct {
db.ListOptions
RepoID int64
SHA string
State string
SortType string
}
func (opts *CommitStatusOptions) ToConds() builder.Cond {
var cond builder.Cond = builder.Eq{
"repo_id": opts.RepoID,
"sha": opts.SHA,
}
switch opts.State {
case "pending", "success", "error", "failure", "warning":
cond = cond.And(builder.Eq{
"state": opts.State,
})
}
return cond
}
func (opts *CommitStatusOptions) ToOrders() string {
switch opts.SortType {
case "oldest":
return "created_unix ASC"
case "recentupdate":
return "updated_unix DESC"
case "leastupdate":
return "updated_unix ASC"
case "leastindex":
return "`index` DESC"
case "highestindex":
return "`index` ASC"
default:
return "created_unix DESC"
}
}
// CommitStatusIndex represents a table for commit status index
type CommitStatusIndex struct {
ID int64
RepoID int64 `xorm:"unique(repo_sha)"`
SHA string `xorm:"unique(repo_sha)"`
MaxIndex int64 `xorm:"index"`
}
// GetLatestCommitStatus returns all statuses with a unique context for a given commit.
func GetLatestCommitStatus(ctx context.Context, repoID int64, sha string, listOptions db.ListOptions) ([]*CommitStatus, int64, error) {
getBase := func() *xorm.Session {
return db.GetEngine(ctx).Table(&CommitStatus{}).
Where("repo_id = ?", repoID).And("sha = ?", sha)
}
indices := make([]int64, 0, 10)
sess := getBase().Select("max( `index` ) as `index`").
GroupBy("context_hash").OrderBy("max( `index` ) desc")
if !listOptions.IsListAll() {
sess = db.SetSessionPagination(sess, &listOptions)
}
count, err := sess.FindAndCount(&indices)
if err != nil {
return nil, count, err
}
statuses := make([]*CommitStatus, 0, len(indices))
if len(indices) == 0 {
return statuses, count, nil
}
return statuses, count, getBase().And(builder.In("`index`", indices)).Find(&statuses)
}
// GetLatestCommitStatusForPairs returns all statuses with a unique context for a given list of repo-sha pairs
func GetLatestCommitStatusForPairs(ctx context.Context, repoSHAs []RepoSHA) (map[int64][]*CommitStatus, error) {
results := []*CommitStatus{}
repoStatuses := make(map[int64][]*CommitStatus)
if len(repoSHAs) == 0 {
// Avoid performing query when there will be no query conditions added.
return repoStatuses, nil
}
// Create a disjunction of conditions for each repoID and SHA pair
conds := make([]builder.Cond, 0, len(repoSHAs))
for _, repoSHA := range repoSHAs {
conds = append(conds, builder.Eq{"repo_id": repoSHA.RepoID, "sha": repoSHA.SHA})
}
subquery := builder.Dialect(db.BuilderDialect()).
Select("context_hash, repo_id, sha, MAX(`index`) AS max_index").
From("commit_status").
Where(builder.Or(conds...)).
GroupBy("context_hash, repo_id, sha")
sess := db.GetEngine(ctx).
Table(&CommitStatus{}).
Alias("c").
Join(
"INNER",
subquery,
"c.context_hash = commit_status.context_hash AND c.repo_id = commit_status.repo_id AND c.sha = commit_status.sha AND c.`index` = commit_status.max_index",
).
OrderBy("c.`index` DESC")
err := sess.Find(&results)
if err != nil {
return nil, err
}
// Group the statuses by repo ID
for _, status := range results {
repoStatuses[status.RepoID] = append(repoStatuses[status.RepoID], status)
}
return repoStatuses, nil
}
// GetLatestCommitStatusForRepoCommitIDs returns all statuses with a unique context for a given list of repo-sha pairs
func GetLatestCommitStatusForRepoCommitIDs(ctx context.Context, repoID int64, commitIDs []string) (map[string][]*CommitStatus, error) {
type result struct {
Index int64
SHA string
}
repoStatuses := make(map[string][]*CommitStatus)
if len(commitIDs) == 0 {
// Avoid performing query when there will be no `sha` query conditions added.
return repoStatuses, nil
}
getBase := func() *xorm.Session {
return db.GetEngine(ctx).Table(&CommitStatus{}).Where("repo_id = ?", repoID)
}
results := make([]result, 0, len(commitIDs))
conds := make([]builder.Cond, 0, len(commitIDs))
for _, sha := range commitIDs {
conds = append(conds, builder.Eq{"sha": sha})
}
sess := getBase().And(builder.Or(conds...)).
Select("max( `index` ) as `index`, sha").
GroupBy("context_hash, sha").OrderBy("max( `index` ) desc")
err := sess.Find(&results)
if err != nil {
return nil, err
}
if len(results) > 0 {
statuses := make([]*CommitStatus, 0, len(results))
conds = make([]builder.Cond, 0, len(results))
for _, result := range results {
conds = append(conds, builder.Eq{"`index`": result.Index, "sha": result.SHA})
}
err = getBase().And(builder.Or(conds...)).Find(&statuses)
if err != nil {
return nil, err
}
// Group the statuses by commit
for _, status := range statuses {
repoStatuses[status.SHA] = append(repoStatuses[status.SHA], status)
}
}
return repoStatuses, nil
}
// FindRepoRecentCommitStatusContexts returns repository's recent commit status contexts
func FindRepoRecentCommitStatusContexts(ctx context.Context, repoID int64, before time.Duration) ([]string, error) {
start := timeutil.TimeStampNow().AddDuration(-before)
var contexts []string
if err := db.GetEngine(ctx).Table("commit_status").
Where("repo_id = ?", repoID).And("updated_unix >= ?", start).
Cols("context").Distinct().Find(&contexts); err != nil {
return nil, err
}
return contexts, nil
}
// NewCommitStatusOptions holds options for creating a CommitStatus
type NewCommitStatusOptions struct {
Repo *repo_model.Repository
Creator *user_model.User
SHA git.ObjectID
CommitStatus *CommitStatus
}
// NewCommitStatus save commit statuses into database
func NewCommitStatus(ctx context.Context, opts NewCommitStatusOptions) error {
if opts.Repo == nil {
return fmt.Errorf("NewCommitStatus[nil, %s]: no repository specified", opts.SHA)
}
repoPath := opts.Repo.RepoPath()
if opts.Creator == nil {
return fmt.Errorf("NewCommitStatus[%s, %s]: no user specified", repoPath, opts.SHA)
}
ctx, committer, err := db.TxContext(ctx)
if err != nil {
return fmt.Errorf("NewCommitStatus[repo_id: %d, user_id: %d, sha: %s]: %w", opts.Repo.ID, opts.Creator.ID, opts.SHA, err)
}
defer committer.Close()
// Get the next Status Index
idx, err := GetNextCommitStatusIndex(ctx, opts.Repo.ID, opts.SHA.String())
if err != nil {
return fmt.Errorf("generate commit status index failed: %w", err)
}
opts.CommitStatus.Description = strings.TrimSpace(opts.CommitStatus.Description)
opts.CommitStatus.Context = strings.TrimSpace(opts.CommitStatus.Context)
opts.CommitStatus.TargetURL = strings.TrimSpace(opts.CommitStatus.TargetURL)
opts.CommitStatus.SHA = opts.SHA.String()
opts.CommitStatus.CreatorID = opts.Creator.ID
opts.CommitStatus.RepoID = opts.Repo.ID
opts.CommitStatus.Index = idx
log.Debug("NewCommitStatus[%s, %s]: %d", repoPath, opts.SHA, opts.CommitStatus.Index)
opts.CommitStatus.ContextHash = hashCommitStatusContext(opts.CommitStatus.Context)
// Insert new CommitStatus
if _, err = db.GetEngine(ctx).Insert(opts.CommitStatus); err != nil {
return fmt.Errorf("insert CommitStatus[%s, %s]: %w", repoPath, opts.SHA, err)
}
return committer.Commit()
}
// SignCommitWithStatuses represents a commit with validation of signature and status state.
type SignCommitWithStatuses struct {
Status *CommitStatus
Statuses []*CommitStatus
*asymkey_model.SignCommit
}
// ParseCommitsWithStatus converts git commits into SignCommitWithStatuses (checks signature and calculates its worst status state)
func ParseCommitsWithStatus(ctx context.Context, commits []*git.Commit, repo *repo_model.Repository) []*SignCommitWithStatuses {
commitsWithSignature := asymkey_model.ParseCommitsWithSignature(
ctx,
user_model.ValidateCommitsWithEmails(ctx, commits),
repo.GetTrustModel(),
func(user *user_model.User) (bool, error) {
return repo_model.IsOwnerMemberCollaborator(ctx, repo, user.ID)
},
)
commitsWithStatus := make([]*SignCommitWithStatuses, 0, len(commitsWithSignature))
for _, c := range commitsWithSignature {
commit := &SignCommitWithStatuses{
SignCommit: c,
}
statuses, _, err := GetLatestCommitStatus(ctx, repo.ID, commit.ID.String(), db.ListOptions{})
if err != nil {
log.Error("GetLatestCommitStatus: %v", err)
} else {
commit.Statuses = statuses
commit.Status = CalcCommitStatus(statuses)
}
commitsWithStatus = append(commitsWithStatus, commit)
}
return commitsWithStatus
}
// hashCommitStatusContext hash context
func hashCommitStatusContext(context string) string {
return fmt.Sprintf("%x", sha1.Sum([]byte(context)))
}
func CleanupCommitStatus(ctx context.Context, bufferSize, deleteChunkSize int, dryRun bool) error {
startTime := time.Now()
var lastCommitStatus CommitStatus
deleteTargets := make([]int64, 0, deleteChunkSize)
recordCount := 0
deleteCount := 0
err := db.IterateByKeyset(ctx,
nil,
[]string{"repo_id", "sha", "context", "index", "id"},
bufferSize,
func(ctx context.Context, commitStatus *CommitStatus) error {
if commitStatus.RepoID != lastCommitStatus.RepoID ||
commitStatus.SHA != lastCommitStatus.SHA ||
commitStatus.Context != lastCommitStatus.Context ||
commitStatus.State != lastCommitStatus.State ||
commitStatus.Description != lastCommitStatus.Description {
// New context, or changed state/description; keep it, start looking for duplicates of it.
lastCommitStatus = *commitStatus
} else {
// Same context as previous record, and same state -- this record shouldn't have been stored.
deleteTargets = append(deleteTargets, commitStatus.ID)
if len(deleteTargets) == deleteChunkSize {
// Flush delete chunk
log.Debug("deleting chunk of %d records (dryRun=%v)", len(deleteTargets), dryRun)
if !dryRun {
if err := db.DeleteByIDs[CommitStatus](ctx, deleteTargets...); err != nil {
return err
}
}
deleteCount += len(deleteTargets)
deleteTargets = make([]int64, 0, deleteChunkSize)
}
}
recordCount++
return nil
})
if err != nil {
return err
}
if len(deleteTargets) > 0 {
log.Debug("deleting final chunk of %d records (dryRun=%v)", len(deleteTargets), dryRun)
if !dryRun {
if err := db.DeleteByIDs[CommitStatus](ctx, deleteTargets...); err != nil {
return err
}
}
deleteCount += len(deleteTargets)
}
duration := time.Since(startTime)
if dryRun {
log.Info("Reviewed %d records in commit_status, and would delete %d", recordCount, deleteCount)
} else {
log.Info("Reviewed %d records in commit_status, and deleted %d", recordCount, deleteCount)
}
log.Info("Cleanup commit status took %d milliseconds", duration.Milliseconds())
return nil
}