diff --git a/cmd/doctor.go b/cmd/doctor.go index f557481110..f7636013a6 100644 --- a/cmd/doctor.go +++ b/cmd/doctor.go @@ -15,6 +15,7 @@ import ( "text/tabwriter" "forgejo.org/models/db" + git_model "forgejo.org/models/git" "forgejo.org/models/gitea_migrations" migrate_base "forgejo.org/models/gitea_migrations/base" repo_model "forgejo.org/models/repo" @@ -41,6 +42,7 @@ func cmdDoctor() *cli.Command { cmdRecreateTable(), cmdDoctorConvert(), cmdAvatarStripExif(), + cmdCleanupCommitStatuses(), }, } } @@ -115,6 +117,54 @@ func cmdAvatarStripExif() *cli.Command { } } +func cmdCleanupCommitStatuses() *cli.Command { + return &cli.Command{ + Name: "cleanup-commit-status", + Usage: "Cleanup extra records in commit_status table", + Description: `Forgejo suffered from a bug which caused the creation of more entries in the +"commit_status" table than necessary. This operation removes the redundant +data caused by the bug. Removing this data is almost always safe. +These redundant records can be accessed by users through the API, making it +possible, but unlikely, that removing it could have an impact on +integrating services (API: /repos/{owner}/{repo}/commits/{ref}/statuses). + +It is safe to run while Forgejo is online. + +On very large Forgejo instances, the performance of the operation will improve +if the buffer-size option is used with large values. Approximately 130 MB of +memory is required for every 100,000 records in the buffer. 
+ +Bug reference: https://codeberg.org/forgejo/forgejo/issues/10671 +`, + + Before: multipleBefore(noDanglingArgs, PrepareConsoleLoggerLevel(log.INFO)), + Action: runCleanupCommitStatus, + Flags: []cli.Flag{ + &cli.BoolFlag{ + Name: "verbose", + Aliases: []string{"V"}, + Usage: "Show process details", + }, + &cli.BoolFlag{ + Name: "dry-run", + Usage: "Report statistics from the operation but do not modify the database", + }, + &cli.IntFlag{ + Name: "buffer-size", + Usage: "Record count per query while iterating records; larger values are typically faster but use more memory", + // See IterateByKeyset's documentation for performance notes which led to the choice of the default + // buffer size for this operation. + Value: 100000, + }, + &cli.IntFlag{ + Name: "delete-chunk-size", + Usage: "Number of records to delete per DELETE query", + Value: 1000, + }, + }, + } +} + func runRecreateTable(stdCtx context.Context, ctx *cli.Command) error { stdCtx, cancel := installSignals(stdCtx) defer cancel() @@ -322,3 +372,19 @@ func runAvatarStripExif(ctx context.Context, c *cli.Command) error { return nil } + +func runCleanupCommitStatus(ctx context.Context, cli *cli.Command) error { + ctx, cancel := installSignals(ctx) + defer cancel() + + if err := initDB(ctx); err != nil { + return err + } + + bufferSize := cli.Int("buffer-size") + deleteChunkSize := cli.Int("delete-chunk-size") + dryRun := cli.Bool("dry-run") + log.Debug("bufferSize = %d, deleteChunkSize = %d, dryRun = %v", bufferSize, deleteChunkSize, dryRun) + + return git_model.CleanupCommitStatus(ctx, bufferSize, deleteChunkSize, dryRun) +} diff --git a/models/db/iterate.go b/models/db/iterate.go index 5e30b5e8bc..d2315cb12c 100644 --- a/models/db/iterate.go +++ b/models/db/iterate.go @@ -5,8 +5,10 @@ package db import ( "context" + "errors" "fmt" "reflect" + "strings" "forgejo.org/modules/setting" @@ -84,3 +86,123 @@ func extractFieldValue(bean any, fieldName string) any { field := v.FieldByName(fieldName) return 
field.Interface() } + +// IterateByKeyset iterates all the records on a database (matching the provided condition) in the order of specified +// order fields, and invokes the provided handler function for each record. It is safe to UPDATE or DELETE the record in +// the handler function, as long as the order fields are not mutated on the record (which could cause records to be +// missed or iterated multiple times). +// +// Assuming order fields a, b, and c, then database queries will be performed as "SELECT * FROM table WHERE (a, b, c) > +// (last_a, last_b, last_c) ORDER BY a, b, c LIMIT buffer_size" repeatedly until the query returns no records (except +// the first query will have no WHERE clause). +// +// Critical requirements for proper usage: +// +// - the order fields encompass at least one UNIQUE or PRIMARY KEY constraint of the table to ensure that records are +// not duplicated -- for example, if the table has a unique index covering `(repo_id, index)`, then it would be safe to +// use this function as long as both fields (in either order) are provided as order fields. +// +// - none of the order fields may have NULL values in them, as the `=` and `>` comparisons being performed by the +// iterative queries will not operate on these records consistently as they do with other values. +// +// This implementation could be a much simpler streaming scan of the query results, except that doesn't permit making +// any additional database queries or data modifications in the target function -- SQLite cannot write while holding a +// read lock. Buffering pages of data in-memory avoids that issue. +// +// Performance: +// +// - High performance will result from an alignment of an index on the table with the order fields, in the same field +// order, even if additional ordering fields could be provided after the index fields. 
In the absence of this index +// alignment, it is reasonable to expect that every extra page of data accessed will require a query that will perform +// an index scan (if available) or sequential scan of the target table. In testing on the `commit_status` table with +// 455k records, a fully index-supported ordering allowed each query page to execute in 0.18ms, as opposed to 80ms +// per-query without matching supporting index. +// +// - In the absence of a matching index, slower per-query performance can be compensated with a larger `batchSize` +// parameter, which controls how many records to fetch at once and therefore reduces the number of queries required. +// This requires more memory. Similar `commit_status` table testing showed these stats for iteration time and memory +// usage for different buffer sizes; specifics will vary depending on the target table: +// - buffer size = 1,000,000 - iterates in 2.8 seconds, consumes 363 MB of RAM +// - buffer size = 100,000 - iterates in 3.5 seconds, consume 130 MB of RAM +// - buffer size = 10,000 - iterates in 7.1 seconds, consumes 59 MB of RAM +// - buffer size = 1,000 - iterates in 33.9 seconds, consumes 42 MB of RAM +func IterateByKeyset[Bean any](ctx context.Context, cond builder.Cond, orderFields []string, batchSize int, f func(ctx context.Context, bean *Bean) error) error { + var dummy Bean + + if len(orderFields) == 0 { + return errors.New("orderFields must be provided") + } + + table, err := TableInfo(&dummy) + if err != nil { + return fmt.Errorf("unable to fetch table info for bean %v: %w", dummy, err) + } + goFieldNames := make([]string, len(orderFields)) + for i, f := range orderFields { + goFieldNames[i] = table.GetColumn(f).FieldName + } + sqlFieldNames := make([]string, len(orderFields)) + for i, f := range orderFields { + // Support field names like "index" which need quoting in builder.Cond & OrderBy + sqlFieldNames[i] = x.Dialect().Quoter().Quote(f) + } + + var lastKey []any + + // For the order 
fields, generate clauses (a, b, c) and (?, ?, ?) which will be used in the WHERE clause when + // reading additional pages of data. + rowValue := strings.Builder{} + rowParameterValue := strings.Builder{} + rowValue.WriteString("(") + rowParameterValue.WriteString("(") + for i, f := range sqlFieldNames { + rowValue.WriteString(f) + rowParameterValue.WriteString("?") + if i != len(sqlFieldNames)-1 { + rowValue.WriteString(", ") + rowParameterValue.WriteString(", ") + } + } + rowValue.WriteString(")") + rowParameterValue.WriteString(")") + + for { + select { + case <-ctx.Done(): + return ctx.Err() + default: + beans := make([]*Bean, 0, batchSize) + + sess := GetEngine(ctx) + for _, f := range sqlFieldNames { + sess = sess.OrderBy(f) + } + if cond != nil { + sess = sess.Where(cond) + } + if lastKey != nil { + sess = sess.Where( + builder.Expr(fmt.Sprintf("%s > %s", rowValue.String(), rowParameterValue.String()), lastKey...)) + } + + if err := sess.Limit(batchSize).Find(&beans); err != nil { + return err + } + if len(beans) == 0 { + return nil + } + + for _, bean := range beans { + if err := f(ctx, bean); err != nil { + return err + } + } + + lastBean := beans[len(beans)-1] + lastKey = make([]any, len(goFieldNames)) + for i := range goFieldNames { + lastKey[i] = extractFieldValue(lastBean, goFieldNames[i]) + } + } + } +} diff --git a/models/db/iterate_test.go b/models/db/iterate_test.go index 405db84866..d60db3da11 100644 --- a/models/db/iterate_test.go +++ b/models/db/iterate_test.go @@ -10,6 +10,7 @@ import ( "testing" "forgejo.org/models/db" + git_model "forgejo.org/models/git" repo_model "forgejo.org/models/repo" "forgejo.org/models/unittest" "forgejo.org/modules/setting" @@ -21,7 +22,6 @@ import ( ) func TestIterate(t *testing.T) { - db.SetLogSQL(t.Context(), true) defer test.MockVariableValue(&setting.Database.IterateBufferSize, 50)() t.Run("No Modifications", func(t *testing.T) { @@ -115,3 +115,31 @@ func TestIterate(t *testing.T) { assert.Empty(t, 
remainingRepoIDs) }) } + +func TestIterateMultipleFields(t *testing.T) { + for _, bufferSize := range []int{1, 2, 3, 10} { // 8 records in fixture + t.Run(fmt.Sprintf("No Modifications bufferSize=%d", bufferSize), func(t *testing.T) { + require.NoError(t, unittest.PrepareTestDatabase()) + + // Fetch all the commit status IDs... + var remainingIDs []int64 + err := db.GetEngine(t.Context()).Table(&git_model.CommitStatus{}).Cols("id").Find(&remainingIDs) + require.NoError(t, err) + require.NotEmpty(t, remainingIDs) + + // Ensure that every repo unit ID is found when doing iterate: + err = db.IterateByKeyset(t.Context(), + nil, + []string{"repo_id", "sha", "context", "index", "id"}, + bufferSize, + func(ctx context.Context, commit_status *git_model.CommitStatus) error { + remainingIDs = slices.DeleteFunc(remainingIDs, func(n int64) bool { + return commit_status.ID == n + }) + return nil + }) + require.NoError(t, err) + assert.Empty(t, remainingIDs) + }) + } +} diff --git a/models/git/TestCleanupCommitStatus/commit_status.yml b/models/git/TestCleanupCommitStatus/commit_status.yml new file mode 100644 index 0000000000..e62b39b6d2 --- /dev/null +++ b/models/git/TestCleanupCommitStatus/commit_status.yml @@ -0,0 +1,135 @@ +# Fields that should, if changed, prevent deletion: repo_id, sha, context, state, description. The first test sets will +# be varying each of these fields independently to confirm they're kept. + + +# Vary description: +- + id: 10 + index: 1 + repo_id: 62 + state: "pending" + sha: "01" + description: "Waiting for wake up" + context: deploy/awesomeness +- + id: 11 + index: 2 + repo_id: 62 + state: "pending" + sha: "01" + description: "Almost woke up..." 
+ context: deploy/awesomeness + +# Vary state: +- + id: 12 + index: 1 + repo_id: 62 + state: "pending" + sha: "02" + description: "Waiting for wake up" + context: deploy/awesomeness +- + id: 13 + index: 2 + repo_id: 62 + state: "success" + sha: "02" + description: "Waiting for wake up" + context: deploy/awesomeness + +# Vary context: +- + id: 14 + index: 1 + repo_id: 62 + state: "pending" + sha: "03" + description: "Waiting for wake up" + context: deploy/awesomeness-v1 +- + id: 15 + index: 2 + repo_id: 62 + state: "pending" + sha: "03" + description: "Waiting for wake up" + context: deploy/awesomeness-v2 + +# Vary sha: +- + id: 16 + index: 1 + repo_id: 62 + state: "pending" + sha: "04" + description: "Waiting for wake up" + context: deploy/awesomeness-v1 +- + id: 17 + index: 2 + repo_id: 62 + state: "pending" + sha: "05" + description: "Waiting for wake up" + context: deploy/awesomeness-v1 + +# Vary Repo ID: +- + id: 18 + index: 1 + repo_id: 62 + state: "pending" + sha: "06" + description: "Waiting for wake up" + context: deploy/awesomeness-v1 +- + id: 19 + index: 2 + repo_id: 63 + state: "pending" + sha: "06" + description: "Waiting for wake up" + context: deploy/awesomeness-v1 + +# That's all the varying cases, now here's the data that should be affected by the delete: +- + id: 20 + index: 1 + repo_id: 62 + state: "pending" + sha: "07" + description: "Waiting for wake up" + context: deploy/awesomeness-v1 +- # Dupe 1 + id: 21 + index: 2 + repo_id: 62 + state: "pending" + sha: "07" + description: "Waiting for wake up" + context: deploy/awesomeness-v1 +- # Dupe 2 + id: 22 + index: 3 + repo_id: 62 + state: "pending" + sha: "07" + description: "Waiting for wake up" + context: deploy/awesomeness-v1 +- # Switched to "success", keep + id: 23 + index: 4 + repo_id: 62 + state: "success" + sha: "07" + description: "Successful!" 
+ context: deploy/awesomeness-v1 +- # Dupe reporting success again + id: 24 + index: 5 + repo_id: 62 + state: "success" + sha: "07" + description: "Successful!" + context: deploy/awesomeness-v1 diff --git a/models/git/commit_status.go b/models/git/commit_status.go index 60a0aa5a4f..4ae926eed6 100644 --- a/models/git/commit_status.go +++ b/models/git/commit_status.go @@ -467,3 +467,68 @@ func ParseCommitsWithStatus(ctx context.Context, commits []*git.Commit, repo *re func hashCommitStatusContext(context string) string { return fmt.Sprintf("%x", sha1.Sum([]byte(context))) } + +func CleanupCommitStatus(ctx context.Context, bufferSize, deleteChunkSize int, dryRun bool) error { + startTime := time.Now() + + var lastCommitStatus CommitStatus + deleteTargets := make([]int64, 0, deleteChunkSize) + recordCount := 0 + deleteCount := 0 + + err := db.IterateByKeyset(ctx, + nil, + []string{"repo_id", "sha", "context", "index", "id"}, + bufferSize, + func(ctx context.Context, commitStatus *CommitStatus) error { + if commitStatus.RepoID != lastCommitStatus.RepoID || + commitStatus.SHA != lastCommitStatus.SHA || + commitStatus.Context != lastCommitStatus.Context || + commitStatus.State != lastCommitStatus.State || + commitStatus.Description != lastCommitStatus.Description { + // New context, or changed state/description; keep it, start looking for duplicates of it. + lastCommitStatus = *commitStatus + } else { + // Same context as previous record, and same state -- this record shouldn't have been stored. 
+ deleteTargets = append(deleteTargets, commitStatus.ID) + + if len(deleteTargets) == deleteChunkSize { + // Flush delete chunk + log.Debug("deleting chunk of %d records (dryRun=%v)", len(deleteTargets), dryRun) + if !dryRun { + if err := db.DeleteByIDs[CommitStatus](ctx, deleteTargets...); err != nil { + return err + } + } + deleteCount += len(deleteTargets) + deleteTargets = make([]int64, 0, deleteChunkSize) + } + } + recordCount++ + return nil + }) + if err != nil { + return err + } + + if len(deleteTargets) > 0 { + log.Debug("deleting final chunk of %d records (dryRun=%v)", len(deleteTargets), dryRun) + if !dryRun { + if err := db.DeleteByIDs[CommitStatus](ctx, deleteTargets...); err != nil { + return err + } + } + deleteCount += len(deleteTargets) + } + + duration := time.Since(startTime) + + if dryRun { + log.Info("Reviewed %d records in commit_status, and would delete %d", recordCount, deleteCount) + } else { + log.Info("Reviewed %d records in commit_status, and deleted %d", recordCount, deleteCount) + } + log.Info("Cleanup commit status took %d milliseconds", duration.Milliseconds()) + + return nil +} diff --git a/models/git/commit_status_test.go b/models/git/commit_status_test.go index ce6c0d4673..b5c3690e0f 100644 --- a/models/git/commit_status_test.go +++ b/models/git/commit_status_test.go @@ -244,3 +244,46 @@ func TestFindRepoRecentCommitStatusContexts(t *testing.T) { assert.Equal(t, "compliance/lint-backend", contexts[0]) } } + +func TestCleanupCommitStatus(t *testing.T) { + defer unittest.OverrideFixtures("models/git/TestCleanupCommitStatus")() + require.NoError(t, unittest.PrepareTestDatabase()) + + // No changes after a dry run: + originalCount := unittest.GetCount(t, &git_model.CommitStatus{}) + err := git_model.CleanupCommitStatus(t.Context(), 100, 100, true) + require.NoError(t, err) + countAfterDryRun := unittest.GetCount(t, &git_model.CommitStatus{}) + assert.Equal(t, originalCount, countAfterDryRun) + + // Perform actual cleanup + err = 
git_model.CleanupCommitStatus(t.Context(), 100, 100, false) + require.NoError(t, err) + + // Varying descriptions + unittest.AssertExistsAndLoadBean(t, &git_model.CommitStatus{ID: 10}) + unittest.AssertExistsAndLoadBean(t, &git_model.CommitStatus{ID: 11}) + + // Varying state + unittest.AssertExistsAndLoadBean(t, &git_model.CommitStatus{ID: 12}) + unittest.AssertExistsAndLoadBean(t, &git_model.CommitStatus{ID: 13}) + + // Varying context + unittest.AssertExistsAndLoadBean(t, &git_model.CommitStatus{ID: 14}) + unittest.AssertExistsAndLoadBean(t, &git_model.CommitStatus{ID: 15}) + + // Varying sha + unittest.AssertExistsAndLoadBean(t, &git_model.CommitStatus{ID: 16}) + unittest.AssertExistsAndLoadBean(t, &git_model.CommitStatus{ID: 17}) + + // Varying repo ID + unittest.AssertExistsAndLoadBean(t, &git_model.CommitStatus{ID: 18}) + unittest.AssertExistsAndLoadBean(t, &git_model.CommitStatus{ID: 19}) + + // Expected to remain or be removed from cleanup of fixture data: + unittest.AssertExistsAndLoadBean(t, &git_model.CommitStatus{ID: 20}) + unittest.AssertNotExistsBean(t, &git_model.CommitStatus{ID: 21}) + unittest.AssertNotExistsBean(t, &git_model.CommitStatus{ID: 22}) + unittest.AssertExistsAndLoadBean(t, &git_model.CommitStatus{ID: 23}) + unittest.AssertNotExistsBean(t, &git_model.CommitStatus{ID: 24}) +}