Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .drone.yml
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ steps:
GOPROXY: off
TAGS: bindata
TEST_LDAP: 1
TEST_INDEXER_CODE_ES_URL: "http://elastic:changeme@elasticsearch:9200"
depends_on:
- build

Expand Down
8 changes: 8 additions & 0 deletions custom/conf/app.ini.sample
Original file line number Diff line number Diff line change
Expand Up @@ -393,7 +393,15 @@ STARTUP_TIMEOUT=30s

; repo indexer by default disabled, since it uses a lot of disk space
REPO_INDEXER_ENABLED = false
; Code search engine type, could be `bleve` or `elasticsearch`.
REPO_INDEXER_TYPE = bleve
; Index file used for code search.
REPO_INDEXER_PATH = indexers/repos.bleve
; Code indexer connection string, available when `REPO_INDEXER_TYPE` is elasticsearch. e.g. http://elastic:changeme@localhost:9200
REPO_INDEXER_CONN_STR =
; Code indexer name, available when `REPO_INDEXER_TYPE` is elasticsearch
REPO_INDEXER_NAME = gitea_codes

UPDATE_BUFFER_LEN = 20
MAX_FILE_SIZE = 1048576
; A comma separated list of glob patterns (see https://github.com/gobwas/glob) to include
Expand Down
4 changes: 4 additions & 0 deletions docs/content/doc/advanced/config-cheat-sheet.en-us.md
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,11 @@ relation to port exhaustion.
- `ISSUE_INDEXER_QUEUE_BATCH_NUMBER`: **20**: Batch queue number.

- `REPO_INDEXER_ENABLED`: **false**: Enables code search (uses a lot of disk space, about 6 times more than the repository size).
- `REPO_INDEXER_TYPE`: **bleve**: Code search engine type, could be `bleve` or `elasticsearch`.
- `REPO_INDEXER_PATH`: **indexers/repos.bleve**: Index file used for code search.
- `REPO_INDEXER_CONN_STR`: **empty**: Code indexer connection string, available when `REPO_INDEXER_TYPE` is elasticsearch. e.g. http://elastic:changeme@localhost:9200
- `REPO_INDEXER_NAME`: **gitea_codes**: Code indexer name, available when `REPO_INDEXER_TYPE` is elasticsearch

- `REPO_INDEXER_INCLUDE`: **empty**: A comma separated list of glob patterns (see https://github.com/gobwas/glob) to **include** in the index. Use `**.txt` to match any files with .txt extension. An empty list means include all files.
- `REPO_INDEXER_EXCLUDE`: **empty**: A comma separated list of glob patterns (see https://github.com/gobwas/glob) to **exclude** from the index. Files that match this list will not be indexed, even if they match in `REPO_INDEXER_INCLUDE`.
- `REPO_INDEXER_EXCLUDE_VENDORED`: **true**: Exclude vendored files from index.
Expand Down
6 changes: 5 additions & 1 deletion docs/content/doc/advanced/config-cheat-sheet.zh-cn.md
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,12 @@ menu:
- `ISSUE_INDEXER_QUEUE_CONN_STR`: **addrs=127.0.0.1:6379 db=0**: 当 `ISSUE_INDEXER_QUEUE_TYPE` 为 `redis` 时,保存Redis队列的连接字符串。
- `ISSUE_INDEXER_QUEUE_BATCH_NUMBER`: **20**: 队列处理中批量提交数量。

- `REPO_INDEXER_ENABLED`: **false**: 是否启用代码搜索(启用后会占用比较大的磁盘空间)。
- `REPO_INDEXER_ENABLED`: **false**: 是否启用代码搜索(启用后会占用比较大的磁盘空间,如果是bleve可能需要占用约6倍存储空间)。
- `REPO_INDEXER_TYPE`: **bleve**: 代码搜索引擎类型,可以为 `bleve` 或者 `elasticsearch`。
- `REPO_INDEXER_PATH`: **indexers/repos.bleve**: 用于代码搜索的索引文件路径。
- `REPO_INDEXER_CONN_STR`: **空**: 代码搜索引擎连接字符串,当 `REPO_INDEXER_TYPE` 为 `elasticsearch` 时有效。例如: http://elastic:changeme@localhost:9200
- `REPO_INDEXER_NAME`: **gitea_codes**: 代码搜索引擎的名字,当 `REPO_INDEXER_TYPE` 为 `elasticsearch` 时有效。

- `UPDATE_BUFFER_LEN`: **20**: 代码索引请求的缓冲区长度。
- `MAX_FILE_SIZE`: **1048576**: 进行解析的源代码文件的最大长度,小于该值时才会索引。

Expand Down
135 changes: 52 additions & 83 deletions modules/indexer/code/bleve.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,10 +57,10 @@ func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error {
})
}

// openIndexer open the index at the specified path, checking for metadata
// openBleveIndexer opens the index at the specified path, checking for metadata
// updates and bleve version updates. If the index needs to be created (or
// re-created), it returns (nil, nil)
func openIndexer(path string, latestVersion int) (bleve.Index, error) {
func openBleveIndexer(path string, latestVersion int) (bleve.Index, error) {
_, err := os.Stat(path)
if err != nil && os.IsNotExist(err) {
return nil, nil
Expand Down Expand Up @@ -103,54 +103,14 @@ func (d *RepoIndexerData) Type() string {
return repoIndexerDocType
}

// addUpdate queues the blob described by update into batch as an index entry
// for repo, tagged with commitSha.
//
// It returns nil without queuing anything when the file is vendored and
// setting.Indexer.ExcludeVendored is set, or when the blob is not detected as
// a text file. A blob larger than setting.Indexer.MaxIndexerFileSize is
// queued for deletion instead of being indexed. Errors from the git commands
// and from the batch are returned unchanged; an unparsable size is reported
// as a "Misformatted git cat-file output" error.
func addUpdate(commitSha string, update fileUpdate, repo *models.Repository, batch rupture.FlushingBatch) error {
	// Ignore vendored files in code search
	if setting.Indexer.ExcludeVendored && enry.IsVendor(update.Filename) {
		return nil
	}

	// Query the blob size first so that oversized blobs are never read.
	stdout, err := git.NewCommand("cat-file", "-s", update.BlobSha).
		RunInDir(repo.RepoPath())
	if err != nil {
		return err
	}

	// Parse as a 64-bit integer: the limit is an int64, and a platform-sized
	// int would reject blobs of 2 GiB or more on 32-bit builds.
	size, err := strconv.ParseInt(strings.TrimSpace(stdout), 10, 64)
	if err != nil {
		return fmt.Errorf("Misformatted git cat-file output: %v", err)
	}
	if size > setting.Indexer.MaxIndexerFileSize {
		return addDelete(update.Filename, repo, batch)
	}

	fileContents, err := git.NewCommand("cat-file", "blob", update.BlobSha).
		RunInDirBytes(repo.RepoPath())
	if err != nil {
		return err
	}
	if !base.IsTextFile(fileContents) {
		// FIXME: UTF-16 files will probably fail here
		return nil
	}

	id := filenameIndexerID(repo.ID, update.Filename)
	return batch.Index(id, &RepoIndexerData{
		RepoID:    repo.ID,
		CommitID:  commitSha,
		Content:   string(charset.ToUTF8DropErrors(fileContents)),
		Language:  analyze.GetCodeLanguage(update.Filename, fileContents),
		UpdatedAt: time.Now().UTC(),
	})
}

// addDelete queues removal of the index entry for filename in repo. The entry
// ID is derived from the repository ID and the file name.
func addDelete(filename string, repo *models.Repository, batch rupture.FlushingBatch) error {
	return batch.Delete(filenameIndexerID(repo.ID, filename))
}

// Identifiers and version number used by the bleve repository (code) indexer.
const (
// repoIndexerAnalyzer is presumably the name under which the custom analyzer
// is registered in the index mapping; its use is not visible here (TODO confirm).
repoIndexerAnalyzer = "repoIndexerAnalyzer"
// repoIndexerDocType is the document type reported by RepoIndexerData.Type.
repoIndexerDocType = "repoIndexerDocType"
// repoIndexerLatestVersion is passed as latestVersion when opening or
// creating the index.
repoIndexerLatestVersion = 5
)

// createRepoIndexer create a repo indexer if one does not already exist
func createRepoIndexer(path string, latestVersion int) (bleve.Index, error) {
// createBleveIndexer creates a bleve repo indexer if one does not already exist
func createBleveIndexer(path string, latestVersion int) (bleve.Index, error) {
docMapping := bleve.NewDocumentMapping()
numericFieldMapping := bleve.NewNumericFieldMapping()
numericFieldMapping.IncludeInAll = false
Expand Down Expand Up @@ -198,18 +158,6 @@ func createRepoIndexer(path string, latestVersion int) (bleve.Index, error) {
return indexer, nil
}

// filenameIndexerID builds the index entry ID for filename within the
// repository repoID: the repository's indexer ID, an underscore, then the
// file name.
func filenameIndexerID(repoID int64, filename string) string {
	repoPrefix := indexerID(repoID)
	return repoPrefix + "_" + filename
}

// filenameOfIndexerID returns the part of indexerID that follows the first
// underscore. When the ID contains no underscore an error is logged and the
// ID is returned unchanged.
func filenameOfIndexerID(indexerID string) string {
	sep := strings.IndexByte(indexerID, '_')
	if sep < 0 {
		log.Error("Unexpected ID in repo indexer: %s", indexerID)
		return indexerID
	}
	return indexerID[sep+1:]
}

// Compile-time check that *BleveIndexer satisfies the Indexer interface.
var _ Indexer = (*BleveIndexer)(nil)
Expand All @@ -229,18 +177,59 @@ func NewBleveIndexer(indexDir string) (*BleveIndexer, bool, error) {
return indexer, created, err
}

// addUpdate queues the blob described by update into batch as an index entry
// for repo, tagged with commitSha.
//
// It returns nil without queuing anything when the file is vendored and
// setting.Indexer.ExcludeVendored is set, or when the blob is not detected as
// a text file. A blob larger than setting.Indexer.MaxIndexerFileSize is
// queued for deletion instead of being indexed. Errors from the git commands
// and from the batch are returned unchanged; an unparsable size is reported
// as a "Misformatted git cat-file output" error.
func (b *BleveIndexer) addUpdate(commitSha string, update fileUpdate, repo *models.Repository, batch rupture.FlushingBatch) error {
	// Ignore vendored files in code search
	if setting.Indexer.ExcludeVendored && enry.IsVendor(update.Filename) {
		return nil
	}

	// Query the blob size first so that oversized blobs are never read.
	stdout, err := git.NewCommand("cat-file", "-s", update.BlobSha).
		RunInDir(repo.RepoPath())
	if err != nil {
		return err
	}

	// Parse as a 64-bit integer: the limit is an int64, and a platform-sized
	// int would reject blobs of 2 GiB or more on 32-bit builds.
	size, err := strconv.ParseInt(strings.TrimSpace(stdout), 10, 64)
	if err != nil {
		return fmt.Errorf("Misformatted git cat-file output: %v", err)
	}
	if size > setting.Indexer.MaxIndexerFileSize {
		return b.addDelete(update.Filename, repo, batch)
	}

	fileContents, err := git.NewCommand("cat-file", "blob", update.BlobSha).
		RunInDirBytes(repo.RepoPath())
	if err != nil {
		return err
	}
	if !base.IsTextFile(fileContents) {
		// FIXME: UTF-16 files will probably fail here
		return nil
	}

	id := filenameIndexerID(repo.ID, update.Filename)
	return batch.Index(id, &RepoIndexerData{
		RepoID:    repo.ID,
		CommitID:  commitSha,
		Content:   string(charset.ToUTF8DropErrors(fileContents)),
		Language:  analyze.GetCodeLanguage(update.Filename, fileContents),
		UpdatedAt: time.Now().UTC(),
	})
}

// addDelete queues removal of the index entry for filename in repo. The entry
// ID is derived from the repository ID and the file name.
func (b *BleveIndexer) addDelete(filename string, repo *models.Repository, batch rupture.FlushingBatch) error {
	return batch.Delete(filenameIndexerID(repo.ID, filename))
}

// init initializes the indexer
func (b *BleveIndexer) init() (bool, error) {
var err error
b.indexer, err = openIndexer(b.indexDir, repoIndexerLatestVersion)
b.indexer, err = openBleveIndexer(b.indexDir, repoIndexerLatestVersion)
if err != nil {
return false, err
}
if b.indexer != nil {
return false, nil
}

b.indexer, err = createRepoIndexer(b.indexDir, repoIndexerLatestVersion)
b.indexer, err = createBleveIndexer(b.indexDir, repoIndexerLatestVersion)
if err != nil {
return false, err
}
Expand All @@ -261,38 +250,19 @@ func (b *BleveIndexer) Close() {
}

// Index indexes the data
func (b *BleveIndexer) Index(repoID int64) error {
repo, err := models.GetRepositoryByID(repoID)
if err != nil {
return err
}

sha, err := getDefaultBranchSha(repo)
if err != nil {
return err
}
changes, err := getRepoChanges(repo, sha)
if err != nil {
return err
} else if changes == nil {
return nil
}

func (b *BleveIndexer) Index(repo *models.Repository, sha string, changes *repoChanges) error {
batch := rupture.NewFlushingBatch(b.indexer, maxBatchSize)
for _, update := range changes.Updates {
if err := addUpdate(sha, update, repo, batch); err != nil {
if err := b.addUpdate(sha, update, repo, batch); err != nil {
return err
}
}
for _, filename := range changes.RemovedFilenames {
if err := addDelete(filename, repo, batch); err != nil {
if err := b.addDelete(filename, repo, batch); err != nil {
return err
}
}
if err = batch.Flush(); err != nil {
return err
}
return repo.UpdateIndexerStatus(models.RepoIndexerTypeCode, sha)
return batch.Flush()
}

// Delete deletes indexes by ids
Expand Down Expand Up @@ -384,8 +354,7 @@ func (b *BleveIndexer) Search(repoIDs []int64, language, keyword string, page, p
}
searchResults[i] = &SearchResult{
RepoID: int64(hit.Fields["RepoID"].(float64)),
StartIndex: startIndex,
EndIndex: endIndex,
Positions: []SearchResultPosition{{StartIndex: startIndex, EndIndex: endIndex}},
Filename: filenameOfIndexerID(hit.ID),
Content: hit.Fields["Content"].(string),
CommitID: hit.Fields["CommitID"].(string),
Expand Down
53 changes: 3 additions & 50 deletions modules/indexer/code/bleve_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,14 @@ package code
import (
"io/ioutil"
"os"
"path/filepath"
"testing"

"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/setting"

"github.com/stretchr/testify/assert"
)

// TestMain delegates test setup and execution to models.MainTest, passing the
// path three directories above this package.
func TestMain(m *testing.M) {
	rootPath := filepath.Join("..", "..", "..")
	models.MainTest(m, rootPath)
}

func TestIndexAndSearch(t *testing.T) {
func TestBleveIndexAndSearch(t *testing.T) {
models.PrepareTestEnv(t)

dir, err := ioutil.TempDir("", "bleve.index")
Expand All @@ -31,56 +25,15 @@ func TestIndexAndSearch(t *testing.T) {
}
defer os.RemoveAll(dir)

setting.Indexer.RepoIndexerEnabled = true
idx, _, err := NewBleveIndexer(dir)
if err != nil {
assert.Fail(t, "Unable to create indexer Error: %v", err)
assert.Fail(t, "Unable to create bleve indexer Error: %v", err)
if idx != nil {
idx.Close()
}
return
}
defer idx.Close()

err = idx.Index(1)
assert.NoError(t, err)

var (
keywords = []struct {
Keyword string
IDs []int64
Langs int
}{
{
Keyword: "Description",
IDs: []int64{1},
Langs: 1,
},
{
Keyword: "repo1",
IDs: []int64{1},
Langs: 1,
},
{
Keyword: "non-exist",
IDs: []int64{},
Langs: 0,
},
}
)

for _, kw := range keywords {
total, res, langs, err := idx.Search(nil, "", kw.Keyword, 1, 10)
assert.NoError(t, err)
assert.EqualValues(t, len(kw.IDs), total)

assert.NotNil(t, langs)
assert.Len(t, langs, kw.Langs)

var ids = make([]int64, 0, len(res))
for _, hit := range res {
ids = append(ids, hit.RepoID)
}
assert.EqualValues(t, kw.IDs, ids)
}
testIndexer("bleve", t, idx)
}
Loading