From b62b1d310b0d7739300ead0716fedc1e570e9268 Mon Sep 17 00:00:00 2001 From: Gergely Nagy Date: Fri, 29 Dec 2023 15:50:47 +0100 Subject: [PATCH 1/6] settings: charset & collate settings for the db Remove the (undocumented) `Database.MysqlCharset` setting, and introduce `Database.DefaultCharset` and `Database.DefaultCollation` settings instead. The reason for the `MysqlCharset` removal is that in a later patch, we're going to adjust the engine initialization so that it achieves similar results, in a different way. The new settings will remain undocumented for similar reasons `MysqlCharset` was undocumented: the defaults work for the vast majority of cases, and it's too easy to break things if changing them. Signed-off-by: Gergely Nagy --- modules/setting/database.go | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/modules/setting/database.go b/modules/setting/database.go index b68f250f787ef..bd401f4e071fb 100644 --- a/modules/setting/database.go +++ b/modules/setting/database.go @@ -34,7 +34,6 @@ var ( SSLMode string Path string LogSQL bool - MysqlCharset string Timeout int // seconds SQLiteJournalMode string DBConnectRetries int @@ -44,6 +43,8 @@ var ( ConnMaxLifetime time.Duration IterateBufferSize int AutoMigration bool + DefaultCharset string + DefaultCollation string }{ Timeout: 500, IterateBufferSize: 50, @@ -67,7 +68,6 @@ func loadDBSetting(rootCfg ConfigProvider) { } Database.Schema = sec.Key("SCHEMA").String() Database.SSLMode = sec.Key("SSL_MODE").MustString("disable") - Database.MysqlCharset = sec.Key("MYSQL_CHARSET").MustString("utf8mb4") // do not document it, end users won't need it. 
Database.Path = sec.Key("PATH").MustString(filepath.Join(AppDataPath, "gitea.db")) Database.Timeout = sec.Key("SQLITE_TIMEOUT").MustInt(500) @@ -86,6 +86,9 @@ func loadDBSetting(rootCfg ConfigProvider) { Database.DBConnectRetries = sec.Key("DB_RETRIES").MustInt(10) Database.DBConnectBackoff = sec.Key("DB_RETRY_BACKOFF").MustDuration(3 * time.Second) Database.AutoMigration = sec.Key("AUTO_MIGRATION").MustBool(true) + + Database.DefaultCharset = sec.Key("DEFAULT_CHARSET").String() + Database.DefaultCollation = sec.Key("DEFAULT_COLLATION").String() } // DBConnStr returns database connection string @@ -105,8 +108,8 @@ func DBConnStr() (string, error) { if tls == "disable" { // allow (Postgres-inspired) default value to work in MySQL tls = "false" } - connStr = fmt.Sprintf("%s:%s@%s(%s)/%s%scharset=%s&parseTime=true&tls=%s", - Database.User, Database.Passwd, connType, Database.Host, Database.Name, paramSep, Database.MysqlCharset, tls) + connStr = fmt.Sprintf("%s:%s@%s(%s)/%s%sparseTime=true&tls=%s", + Database.User, Database.Passwd, connType, Database.Host, Database.Name, paramSep, tls) case "postgres": connStr = getPostgreSQLConnectionString(Database.Host, Database.User, Database.Passwd, Database.Name, Database.SSLMode) case "mssql": From dcde4e3e772737e548240a07bbda716fe355e6da Mon Sep 17 00:00:00 2001 From: Gergely Nagy Date: Fri, 29 Dec 2023 15:56:00 +0100 Subject: [PATCH 2/6] db: Make ConvertUtf8ToUtf8mb4() more flexible Replace `db.ConvertUtf8ToUtf8mb4()` with `db.ConvertCharsetAndCollation()`, which does essentially the same thing, but the charset and the collation are controlled by the caller. Also introduce `db.findCaseSensitiveCollation()` which attempts to auto-detect the best supported case-sensitive collation for MySQL and MariaDB databases. With these two functions, `gitea doctor convert` can be adapted to use the charset and collation set in the configuration (or auto-detect them, if unset), and this patch does that too. 
Signed-off-by: Gergely Nagy --- cmd/doctor_convert.go | 11 ++++++++--- models/db/convert.go | 6 +++--- models/db/engine.go | 45 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+), 6 deletions(-) diff --git a/cmd/doctor_convert.go b/cmd/doctor_convert.go index 2385f23e52193..af4980ef8b7f4 100644 --- a/cmd/doctor_convert.go +++ b/cmd/doctor_convert.go @@ -37,11 +37,16 @@ func runDoctorConvert(ctx *cli.Context) error { switch { case setting.Database.Type.IsMySQL(): - if err := db.ConvertUtf8ToUtf8mb4(); err != nil { - log.Fatal("Failed to convert database from utf8 to utf8mb4: %v", err) + charset, collation, err := db.GetDesiredCharsetAndCollation() + if err != nil { + log.Fatal("Failed to determine the desired database charset or collation: %v", err) return err } - fmt.Println("Converted successfully, please confirm your database's character set is now utf8mb4") + if err := db.ConvertCharsetAndCollation(charset, collation); err != nil { + log.Fatal("Failed to convert database from utf8 to %s: %v", charset, err) + return err + } + fmt.Printf("Converted successfully, please confirm your database's character set is now %s, and collation is set to %s\n", charset, collation) case setting.Database.Type.IsMSSQL(): if err := db.ConvertVarcharToNVarchar(); err != nil { log.Fatal("Failed to convert database from varchar to nvarchar: %v", err) diff --git a/models/db/convert.go b/models/db/convert.go index 112c8575ca2c7..0abe435cb9ee5 100644 --- a/models/db/convert.go +++ b/models/db/convert.go @@ -15,12 +15,12 @@ import ( ) // ConvertUtf8ToUtf8mb4 converts database and tables from utf8 to utf8mb4 if it's mysql and set ROW_FORMAT=dynamic -func ConvertUtf8ToUtf8mb4() error { +func ConvertCharsetAndCollation(charset, collation string) error { if x.Dialect().URI().DBType != schemas.MYSQL { return nil } - _, err := x.Exec(fmt.Sprintf("ALTER DATABASE `%s` CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci", setting.Database.Name)) + _, err := 
x.Exec(fmt.Sprintf("ALTER DATABASE `%s` CHARACTER SET `%s` COLLATE `%s`", setting.Database.Name, charset, collation)) if err != nil { return err } @@ -34,7 +34,7 @@ func ConvertUtf8ToUtf8mb4() error { return err } - if _, err := x.Exec(fmt.Sprintf("ALTER TABLE `%s` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;", table.Name)); err != nil { + if _, err := x.Exec(fmt.Sprintf("ALTER TABLE `%s` CONVERT TO CHARACTER SET `%s` COLLATE `%s`", table.Name, charset, collation)); err != nil { return err } } diff --git a/models/db/engine.go b/models/db/engine.go index 182d8cd993696..3129c86b04080 100755 --- a/models/db/engine.go +++ b/models/db/engine.go @@ -150,6 +150,51 @@ func InitEngine(ctx context.Context) error { return nil } +func findCaseSensitiveCollation() (string, error) { + if x.Dialect().URI().DBType != schemas.MYSQL { + return "", nil + } + + v, err := x.DBVersion() + if err != nil { + return "", nil + } + + var collation string + switch v.Edition { + case "MariaDB": + collation = "uca1400_as_cs" + default: + collation = "utf8mb4_0900_as_cs" + } + + return collation, nil +} + +func GetDesiredCharsetAndCollation() (string, string, error) { + if x.Dialect().URI().DBType != schemas.MYSQL { + return "", "", nil + } + + var charset string + var collation string + var err error + if setting.Database.DefaultCharset == "" { + charset = "utf8mb4" + } else { + charset = setting.Database.DefaultCharset + } + if setting.Database.DefaultCollation == "" { + collation, err = findCaseSensitiveCollation() + if err != nil { + return "", "", err + } + } else { + collation = setting.Database.DefaultCollation + } + return charset, collation, nil +} + // SetDefaultEngine sets the default engine for db func SetDefaultEngine(ctx context.Context, eng *xorm.Engine) { x = eng From ec6ba9a669a66604fb38799d0795d973e6a09b37 Mon Sep 17 00:00:00 2001 From: Gergely Nagy Date: Fri, 29 Dec 2023 16:00:21 +0100 Subject: [PATCH 3/6] cmd/web: Perform a db sanity check on startup When 
starting the web server, perform a sanity check on the database. The sanity check currently runs for MySQL/MariaDB only, and verifies that the charset and collation are correctly set on both the database, and on all tables in it. If there is a discrepancy, it does not error out, but prints a warning only. Signed-off-by: Gergely Nagy --- cmd/web.go | 5 ++ models/db/engine.go | 63 ++++++++++++++ tests/integration/collate_test.go | 138 ++++++++++++++++++++++++++++++ 3 files changed, 206 insertions(+) create mode 100644 tests/integration/collate_test.go diff --git a/cmd/web.go b/cmd/web.go index 01386251becfa..afc11ebee1839 100644 --- a/cmd/web.go +++ b/cmd/web.go @@ -15,6 +15,7 @@ import ( _ "net/http/pprof" // Used for debugging if enabled and a web server is running + "code.gitea.io/gitea/models/db" "code.gitea.io/gitea/modules/container" "code.gitea.io/gitea/modules/graceful" "code.gitea.io/gitea/modules/log" @@ -193,6 +194,10 @@ func serveInstalled(ctx *cli.Context) error { routers.InitWebInstalled(graceful.GetManager().HammerContext()) + if err := db.SanityCheck(); err != nil { + log.Warn("database sanity check warning: %s", err) + } + // We check that AppDataPath exists here (it should have been created during installation) // We can't check it in `InitWebInstalled`, because some integration tests // use cmd -> InitWebInstalled, but the AppDataPath doesn't exist during those tests. 
diff --git a/models/db/engine.go b/models/db/engine.go index 3129c86b04080..3e00165566ac3 100755 --- a/models/db/engine.go +++ b/models/db/engine.go @@ -195,6 +195,69 @@ func GetDesiredCharsetAndCollation() (string, string, error) { return charset, collation, nil } +func SanityCheck() error { + // We do not have any sanity checks for engines other than MySQL + if !setting.Database.Type.IsMySQL() { + return nil + } + + expectedCharset, expectedCollation, err := GetDesiredCharsetAndCollation() + if err != nil { + return err + } + + // check that the database collation is set to a case sensitive one. + var collation []string + _, err = x.SQL("SELECT default_collation_name FROM information_schema.schemata WHERE schema_name = ?", + setting.Database.Name).Get(&collation) + if err != nil { + return err + } + // For mariadb, when we set the collation to uca1400_as_cs, that is + // translated to utf8mb4_uca1400_as_cs, hence the suffix check. + if !strings.HasSuffix(collation[0], expectedCollation) { + return fmt.Errorf(`database collation ("%s") is not %s. Consider running "gitea doctor convert"`, collation[0], expectedCollation) + } + + // check the database character set + var charset []string + _, err = x.SQL("SELECT default_character_set_name FROM information_schema.schemata WHERE schema_name = ?", setting.Database.Name).Get(&charset) + if err != nil { + return err + } + if charset[0] != expectedCharset { + return fmt.Errorf(`database charset ("%s") is not %s. Consider running "gitea doctor convert"`, charset[0], expectedCharset) + } + + // check table collations and character sets + tables, err := x.DBMetas() + if err != nil { + return err + } + for _, table := range tables { + _, err := x.SQL("SELECT CCSA.character_set_name FROM information_schema.tables T, information_schema.collation_character_set_applicability CCSA WHERE CCSA.collation_name = T.table_collation AND T.table_schema = ? 
AND T.table_name = ?", + setting.Database.Name, table.Name).Get(&charset) + if err != nil { + return err + } + if charset[0] != expectedCharset { + return fmt.Errorf(`table charset for '%s' (%s) is not %s. Consider running "gitea doctor convert"`, table.Name, charset[0], expectedCharset) + } + + _, err = x.SQL("SELECT CCSA.collation_name FROM information_schema.tables T, information_schema.collation_character_set_applicability CCSA WHERE CCSA.collation_name = T.table_collation AND T.table_schema = ? AND T.table_name = ?", + setting.Database.Name, table.Name).Get(&collation) + if err != nil { + return err + } + if !strings.HasSuffix(collation[0], expectedCollation) { + return fmt.Errorf(`table collation for '%s' (%s) is not %s. Consider running "gitea doctor convert"`, table.Name, collation[0], expectedCollation) + } + } + + // if all is well, return without an error + return nil +} + // SetDefaultEngine sets the default engine for db func SetDefaultEngine(ctx context.Context, eng *xorm.Engine) { x = eng diff --git a/tests/integration/collate_test.go b/tests/integration/collate_test.go new file mode 100644 index 0000000000000..1295f2e4b7001 --- /dev/null +++ b/tests/integration/collate_test.go @@ -0,0 +1,138 @@ +// Copyright 2023 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package integration + +import ( + "fmt" + "net/http" + "testing" + + auth_model "code.gitea.io/gitea/models/auth" + "code.gitea.io/gitea/models/db" + repo_model "code.gitea.io/gitea/models/repo" + "code.gitea.io/gitea/models/unittest" + user_model "code.gitea.io/gitea/models/user" + "code.gitea.io/gitea/modules/setting" + api "code.gitea.io/gitea/modules/structs" + "code.gitea.io/gitea/tests" + + "github.com/stretchr/testify/assert" +) + +func TestMySQLCollate(t *testing.T) { + // This test is only for MySQL, return early for any other engine. 
+ if !setting.Database.Type.IsMySQL() { + t.Skip() + } + + defer tests.PrepareTestEnv(t)() + + // Helpers + loadProps := func() (*repo_model.Repository, *user_model.User, string) { + repo := unittest.AssertExistsAndLoadBean(t, &repo_model.Repository{ID: 2}) + owner := unittest.AssertExistsAndLoadBean(t, &user_model.User{ID: repo.OwnerID}) + session := loginUser(t, owner.Name) + token := getTokenForLoggedInUser(t, session, auth_model.AccessTokenScopeWriteIssue) + + return repo, owner, token + } + + breakCollation := func() { + err := db.ConvertCharsetAndCollation("utf8mb4", "utf8mb4_general_ci") + assert.NoError(t, err) + } + fixCollation := func() { + charset, collation, err := db.GetDesiredCharsetAndCollation() + assert.NoError(t, err) + err = db.ConvertCharsetAndCollation(charset, collation) + assert.NoError(t, err) + } + + t.Run("Collation fixing", func(t *testing.T) { + defer tests.PrintCurrentTest(t)() + + // Ensure that the database uses the wrong collation + breakCollation() + + // With the wrong collation, sanity checking fails + err := db.SanityCheck() + assert.Error(t, err) + + // Try updating the collation + fixCollation() + + // Sanity checking works after the collation update + err = db.SanityCheck() + assert.NoError(t, err) + }) + + t.Run("Case sensitive issue search by label", func(t *testing.T) { + defer tests.PrintCurrentTest(t)() + + assert.NoError(t, unittest.LoadFixtures()) + + // Helpers + createLabel := func(name string) int64 { + repo, owner, token := loadProps() + urlStr := fmt.Sprintf("/api/v1/repos/%s/%s/labels", owner.Name, repo.Name) + + // CreateLabel + req := NewRequestWithJSON(t, "POST", urlStr, &api.CreateLabelOption{ + Name: name, + Color: "abcdef", + Description: "test label", + }).AddTokenAuth(token) + resp := MakeRequest(t, req, http.StatusCreated) + apiLabel := new(api.Label) + DecodeJSON(t, resp, &apiLabel) + return apiLabel.ID + } + + createIssue := func(title string, labelID int64) { + repo, owner, token := loadProps() + 
urlStr := fmt.Sprintf("/api/v1/repos/%s/%s/issues", owner.Name, repo.Name) + + // CreateIssue + req := NewRequestWithJSON(t, "POST", urlStr, &api.CreateIssueOption{ + Title: title, + Labels: []int64{labelID}, + }).AddTokenAuth(token) + MakeRequest(t, req, http.StatusCreated) + } + + searchIssues := func(label string) []*api.Issue { + _, _, token := loadProps() + var apiIssues []*api.Issue + + urlStr := fmt.Sprintf("/api/v1/repos/issues/search?labels=%s", label) + req := NewRequest(t, "GET", urlStr).AddTokenAuth(token) + resp := MakeRequest(t, req, http.StatusOK) + + DecodeJSON(t, resp, &apiIssues) + return apiIssues + } + + // Ensure that the database uses the wrong collation + breakCollation() + + // Create two labels that differ in case only + labelID1 := createLabel("case-sens") + labelID2 := createLabel("Case-Sens") + + // Create two issues, one with each of the labels above + createIssue("case-sens 1", labelID1) + createIssue("case-sens 2", labelID2) + + // Search for 'case-sens', and expect two results (`case-sens` and `Case-Sens`) + issues := searchIssues("case-sens") + assert.Len(t, issues, 2) + + // Update the collation + fixCollation() + + // Search for 'case-sens', and expect only one result now. + issues = searchIssues("case-sens") + assert.Len(t, issues, 1) + }) +} From 9f48545535d611aa0c630c2558d70bcb23ff583b Mon Sep 17 00:00:00 2001 From: Gergely Nagy Date: Fri, 29 Dec 2023 16:01:55 +0100 Subject: [PATCH 4/6] db: Automatically adjust database props on init When initializing an empty MySQL/MariaDB database, ensure that the character set and collation are set to the desired values. With these set, creating a table will inherit these settings, unless table creation specifies a different charset or collate function. 
This is the reason why `setting.Database.MysqlCharset` was removed: that forced table creation to explicitly set a charset, rather than inherit the database's default, and in doing so, also changed the collate function to the charset's default (which may be - and usually is - different from the one we want). Signed-off-by: Gergely Nagy --- models/db/engine.go | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/models/db/engine.go b/models/db/engine.go index 3e00165566ac3..dfe39b24214e4 100755 --- a/models/db/engine.go +++ b/models/db/engine.go @@ -293,6 +293,29 @@ func InitEngineWithMigration(ctx context.Context, migrateFunc func(*xorm.Engine) return err } + // If we're using MySQL, and there are no tables, set the database charset + // and collation to the desired ones. This will help cases where the + // database is created automatically, and with the wrong settings (such as + // when using the official mysql/mariadb container images). + if x.Dialect().URI().DBType == schemas.MYSQL { + tables, err := x.DBMetas() + if err != nil { + return err + } + + if len(tables) == 0 { + charset, collation, err := GetDesiredCharsetAndCollation() + if err != nil { + return err + } + + _, err = x.Exec(fmt.Sprintf("ALTER DATABASE `%s` DEFAULT CHARACTER SET `%s` COLLATE `%s`", setting.Database.Name, charset, collation)) + if err != nil { + return err + } + } + } + // We have to run migrateFunc here in case the user is re-running installation on a previously created DB. // If we do not then table schemas will be changed and there will be conflicts when the migrations run properly. 
// From 25a1d3d5d621e53033cdbd36ba90432a558321d6 Mon Sep 17 00:00:00 2001 From: Gergely Nagy Date: Fri, 29 Dec 2023 16:04:47 +0100 Subject: [PATCH 5/6] git/branch: Remove a comment that is no longer relevant Signed-off-by: Gergely Nagy --- models/git/branch.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/git/branch.go b/models/git/branch.go index ffd1d7ed164a0..9b3a4dc99be32 100644 --- a/models/git/branch.go +++ b/models/git/branch.go @@ -103,7 +103,7 @@ func (err ErrBranchesEqual) Unwrap() error { type Branch struct { ID int64 RepoID int64 `xorm:"UNIQUE(s)"` - Name string `xorm:"UNIQUE(s) NOT NULL"` // git's ref-name is case-sensitive internally, however, in some databases (mssql, mysql, by default), it's case-insensitive at the moment + Name string `xorm:"UNIQUE(s) NOT NULL"` CommitID string CommitMessage string `xorm:"TEXT"` // it only stores the message summary (the first line) PusherID int64 From 11798139dd06b00d03da57b01ff6cf814c371d1f Mon Sep 17 00:00:00 2001 From: Gergely Nagy Date: Fri, 29 Dec 2023 16:23:10 +0100 Subject: [PATCH 6/6] [DOCS]: Suggest better collate functions for the db For MySQL, suggest not setting a collate function, to let Gitea deal with it. But also mention some of the options, should one want to set it up in advance and not rely on Gitea. 
Signed-off-by: Gergely Nagy --- docs/content/help/faq.en-us.md | 6 ++++-- docs/content/installation/database-preparation.en-us.md | 6 +++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/docs/content/help/faq.en-us.md b/docs/content/help/faq.en-us.md index e6350936ef816..ec3027a5e34c8 100644 --- a/docs/content/help/faq.en-us.md +++ b/docs/content/help/faq.en-us.md @@ -385,10 +385,12 @@ Unfortunately MySQL's `utf8` charset does not completely allow all possible UTF- They created a new charset and collation called `utf8mb4` that allows for emoji to be stored but tables which use the `utf8` charset, and connections which use the `utf8` charset will not use this. -Please run `gitea doctor convert`, or run `ALTER DATABASE database_name CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;` -for the database_name and run `ALTER TABLE table_name CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;` +Please run `gitea doctor convert`, or run `ALTER DATABASE database_name CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;` +for the database_name and run `ALTER TABLE table_name CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;` for each table in the database. +The most appropriate collate function depends on your variant of the database: for MySQL, it is `utf8mb4_0900_as_cs`, for MariaDB, it is `uca1400_as_cs`. Both of them support `utf8mb4_bin`, so that's the common ground. `gitea doctor convert` will choose the best one for you automatically. + ## Why are Emoji displaying only as placeholders or in monochrome Gitea requires the system or browser to have one of the supported Emoji fonts installed, which are Apple Color Emoji, Segoe UI Emoji, Segoe UI Symbol, Noto Color Emoji and Twemoji Mozilla. Generally, the operating system should already provide one of these fonts, but especially on Linux, it may be necessary to install them manually. 
diff --git a/docs/content/installation/database-preparation.en-us.md b/docs/content/installation/database-preparation.en-us.md index 5e0b94665ff41..bb0fefbe56844 100644 --- a/docs/content/installation/database-preparation.en-us.md +++ b/docs/content/installation/database-preparation.en-us.md @@ -61,13 +61,13 @@ Note: All steps below requires that the database engine of your choice is instal Replace username and password above as appropriate. -4. Create database with UTF-8 charset and collation. Make sure to use `utf8mb4` charset instead of `utf8` as the former supports all Unicode characters (including emojis) beyond _Basic Multilingual Plane_. Also, collation chosen depending on your expected content. When in doubt, use either `unicode_ci` or `general_ci`. +4. Create database with UTF-8 charset and collation. Make sure to use `utf8mb4` charset instead of `utf8` as the former supports all Unicode characters (including emojis) beyond _Basic Multilingual Plane_. Also, choose the collation depending on your expected content (such as `utf8mb4_0900_as_cs` for MySQL, or `uca1400_as_cs` for MariaDB, or `utf8mb4_bin` that works for both). When in doubt, leave it unset, and Gitea will adjust the database to use the most fitting one. ```sql - CREATE DATABASE giteadb CHARACTER SET 'utf8mb4' COLLATE 'utf8mb4_unicode_ci'; + CREATE DATABASE giteadb CHARACTER SET 'utf8mb4' COLLATE 'utf8mb4_bin'; ``` - Replace database name as appropriate. + Replace database name and the collate function as appropriate. 5. Grant all privileges on the database to database user created above.