From 0d8bd7720d7b1679a433dfd3232a2de15eaa5f4a Mon Sep 17 00:00:00 2001 From: wxiaoguang Date: Fri, 13 Feb 2026 08:15:46 +0800 Subject: [PATCH] Refactor highlight and diff (#36599) 1. fix a performance regression when using line-by-line highlighting * the root cause is that chroma's `lexers.Get` is slow and a lexer cache is missing during recent changes 2. clarify the chroma lexer detection behavior * now we fully manage our logic to detect lexer, and handle overriding problems, everything is fully under control 3. clarify "code analyze" behavior, now only 2 usages: * only use file name and language to detect lexer (very fast), mainly for "diff" page which contains a lot of files * if no lexer is detected by file name and language, use code content to detect again (slow), mainly for "view file" or "blame" page, which can get best result 4. fix git diff bug, it caused "broken pipe" error for large diff files --- modules/git/diff.go | 57 ++--- modules/git/gitcmd/command.go | 4 + modules/highlight/highlight.go | 84 +------- modules/highlight/highlight_test.go | 33 --- modules/highlight/lexerdetect.go | 279 +++++++++++++++++++++++++ modules/highlight/lexerdetect_test.go | 90 ++++++++ modules/indexer/code/search.go | 3 +- modules/markup/orgmode/orgmode.go | 2 +- routers/web/repo/blame.go | 4 +- services/gitdiff/gitdiff.go | 19 +- services/gitdiff/gitdiff_excerpt.go | 3 + services/gitdiff/highlightdiff_test.go | 4 +- 12 files changed, 427 insertions(+), 155 deletions(-) create mode 100644 modules/highlight/lexerdetect.go create mode 100644 modules/highlight/lexerdetect_test.go diff --git a/modules/git/diff.go b/modules/git/diff.go index a198695fc0..d7732eaa29 100644 --- a/modules/git/diff.go +++ b/modules/git/diff.go @@ -28,44 +28,37 @@ const ( // GetRawDiff dumps diff results of repository in given commit ID to io.Writer. func GetRawDiff(repo *Repository, commitID string, diffType RawDiffType, writer io.Writer) (retErr error) { - diffOutput, diffFinish, err := getRepoRawDiffForFile(repo.Ctx, repo, "", commitID, diffType, "") + cmd, err := getRepoRawDiffForFileCmd(repo.Ctx, repo, "", commitID, diffType, "") if err != nil { - return err + return fmt.Errorf("getRepoRawDiffForFileCmd: %w", err) } - defer func() { - err := diffFinish() - if retErr == nil { - retErr = err // only return command's error if no previous error - } - }() - _, err = io.Copy(writer, diffOutput) - return err + return cmd.WithStdoutCopy(writer).RunWithStderr(repo.Ctx) } // GetFileDiffCutAroundLine cuts the old or new part of the diff of a file around a specific line number func GetFileDiffCutAroundLine( repo *Repository, startCommit, endCommit, treePath string, line int64, old bool, numbersOfLine int, -) (_ string, retErr error) { - diffOutput, diffFinish, err := getRepoRawDiffForFile(repo.Ctx, repo, startCommit, endCommit, RawDiffNormal, treePath) +) (ret string, retErr error) { + cmd, err := getRepoRawDiffForFileCmd(repo.Ctx, repo, startCommit, endCommit, RawDiffNormal, treePath) if err != nil { - return "", err + return "", fmt.Errorf("getRepoRawDiffForFileCmd: %w", err) } - defer func() { - err := diffFinish() - if retErr == nil { - retErr = err // only return command's error if no previous error - } - }() - return CutDiffAroundLine(diffOutput, line, old, numbersOfLine) + stdoutReader, stdoutClose := cmd.MakeStdoutPipe() + defer stdoutClose() + cmd.WithPipelineFunc(func(ctx gitcmd.Context) error { + ret, err = CutDiffAroundLine(stdoutReader, line, old, numbersOfLine) + return err + }) + return ret, cmd.RunWithStderr(repo.Ctx) } // getRepoRawDiffForFile returns an io.Reader for the diff results of file in given commit ID // and a "finish" function to wait for the git command and clean up resources after reading is done. -func getRepoRawDiffForFile(ctx context.Context, repo *Repository, startCommit, endCommit string, diffType RawDiffType, file string) (io.Reader, func() gitcmd.RunStdError, error) { +func getRepoRawDiffForFileCmd(_ context.Context, repo *Repository, startCommit, endCommit string, diffType RawDiffType, file string) (*gitcmd.Command, error) { commit, err := repo.GetCommit(endCommit) if err != nil { - return nil, nil, err + return nil, err } var files []string if len(file) > 0 { @@ -84,7 +77,7 @@ func getRepoRawDiffForFile(ctx context.Context, repo *Repository, startCommit, e } else { c, err := commit.Parent(0) if err != nil { - return nil, nil, err + return nil, err } cmd.AddArguments("diff"). AddOptionFormat("--find-renames=%s", setting.Git.DiffRenameSimilarityThreshold). @@ -99,25 +92,15 @@ func getRepoRawDiffForFile(ctx context.Context, repo *Repository, startCommit, e } else { c, err := commit.Parent(0) if err != nil { - return nil, nil, err + return nil, err } query := fmt.Sprintf("%s...%s", endCommit, c.ID.String()) cmd.AddArguments("format-patch", "--no-signature", "--stdout").AddDynamicArguments(query).AddDashesAndList(files...) } default: - return nil, nil, util.NewInvalidArgumentErrorf("invalid diff type: %s", diffType) + return nil, util.NewInvalidArgumentErrorf("invalid diff type: %s", diffType) } - - stdoutReader, stdoutReaderClose := cmd.MakeStdoutPipe() - err = cmd.StartWithStderr(ctx) - if err != nil { - stdoutReaderClose() - return nil, nil, err - } - return stdoutReader, func() gitcmd.RunStdError { - stdoutReaderClose() - return cmd.WaitWithStderr() - }, nil + return cmd, nil } // ParseDiffHunkString parse the diff hunk content and return @@ -254,7 +237,7 @@ func CutDiffAroundLine(originalDiff io.Reader, line int64, old bool, numbersOfLi } } if err := scanner.Err(); err != nil { - return "", err + return "", fmt.Errorf("CutDiffAroundLine: scan: %w", err) } // No hunk found diff --git a/modules/git/gitcmd/command.go b/modules/git/gitcmd/command.go index f780cdf6c9..e9b51802fe 100644 --- a/modules/git/gitcmd/command.go +++ b/modules/git/gitcmd/command.go @@ -306,6 +306,10 @@ func (c *Command) MakeStdinPipe() (writer PipeWriter, closer func()) { // MakeStdoutPipe creates a reader for the command's stdout. // The returned closer function must be called by the caller to close the pipe. // After the pipe reader is closed, the unread data will be discarded. +// +// If the process (git command) still tries to write after the pipe is closed, the Wait error will be "signal: broken pipe". +// WithPipelineFunc + Run won't return "broken pipe" error in this case if the callback returns no error. +// But if you are calling Start / Wait family functions, you should either drain the pipe before close it, or handle the Wait error correctly. func (c *Command) MakeStdoutPipe() (reader PipeReader, closer func()) { return c.makeStdoutStderr(&c.cmdStdout) } diff --git a/modules/highlight/highlight.go b/modules/highlight/highlight.go index fc8699829c..c7416c7a10 100644 --- a/modules/highlight/highlight.go +++ b/modules/highlight/highlight.go @@ -11,20 +11,16 @@ import ( gohtml "html" "html/template" "io" - "path" "strings" "sync" - "code.gitea.io/gitea/modules/analyze" "code.gitea.io/gitea/modules/log" "code.gitea.io/gitea/modules/setting" "code.gitea.io/gitea/modules/util" "github.com/alecthomas/chroma/v2" "github.com/alecthomas/chroma/v2/formatters/html" - "github.com/alecthomas/chroma/v2/lexers" "github.com/alecthomas/chroma/v2/styles" - "github.com/go-enry/go-enry/v2" ) // don't index files larger than this many bytes for performance purposes @@ -84,85 +80,21 @@ func UnsafeSplitHighlightedLines(code template.HTML) (ret [][]byte) { } } -func getChromaLexerByLanguage(fileName, lang string) chroma.Lexer { - lang, _, _ = strings.Cut(lang, "?") // maybe, the value from gitattributes might contain `?` parameters? - ext := path.Ext(fileName) - // the "lang" might come from enry, it has different naming for some languages - switch lang { - case "F#": - lang = "FSharp" - case "Pascal": - lang = "ObjectPascal" - case "C": - if ext == ".C" || ext == ".H" { - lang = "C++" - } - } - if lang == "" && util.AsciiEqualFold(ext, ".sql") { - // there is a bug when using MySQL lexer: "--\nSELECT", the second line will be rendered as comment incorrectly - lang = "SQL" - } - // lexers.Get is slow if the language name can't be matched directly: it does extra "Match" call to iterate all lexers - return lexers.Get(lang) -} - -// GetChromaLexerWithFallback returns a chroma lexer by given file name, language and code content. All parameters can be optional. -// When code content is provided, it will be slow if no lexer is found by file name or language. -// If no lexer is found, it will return the fallback lexer. -func GetChromaLexerWithFallback(fileName, lang string, code []byte) (lexer chroma.Lexer) { - if lang != "" { - lexer = getChromaLexerByLanguage(fileName, lang) - } - - if lexer == nil { - fileExt := path.Ext(fileName) - if val, ok := globalVars().highlightMapping[fileExt]; ok { - lexer = getChromaLexerByLanguage(fileName, val) // use mapped value to find lexer - } - } - - if lexer == nil { - // when using "code" to detect, analyze.GetCodeLanguage is slower, it iterates many rules to detect language from content - // this is the old logic: use enry to detect language, and use chroma to render, but their naming is different for some languages - enryLanguage := analyze.GetCodeLanguage(fileName, code) - lexer = getChromaLexerByLanguage(fileName, enryLanguage) - if lexer == nil { - if enryLanguage != enry.OtherLanguage { - log.Warn("No chroma lexer found for enry detected language: %s (file: %s), need to fix the language mapping between enry and chroma.", enryLanguage, fileName) - } - lexer = lexers.Match(fileName) // lexers.Match will search by its basename and extname - } - } - - return util.IfZero(lexer, lexers.Fallback) -} - -func renderCode(fileName, language, code string, slowGuess bool) (output template.HTML, lexerName string) { +// RenderCodeSlowGuess tries to get a lexer by file name and language first, +// if not found, it will try to guess the lexer by code content, which is slow (more than several hundreds of milliseconds). +func RenderCodeSlowGuess(fileName, language, code string) (output template.HTML, lexer chroma.Lexer, lexerDisplayName string) { // diff view newline will be passed as empty, change to literal '\n' so it can be copied // preserve literal newline in blame view if code == "" || code == "\n" { - return "\n", "" + return "\n", nil, "" } if len(code) > sizeLimit { - return template.HTML(template.HTMLEscapeString(code)), "" + return template.HTML(template.HTMLEscapeString(code)), nil, "" } - var codeForGuessLexer []byte - if slowGuess { - // it is slower to guess lexer by code content, so only do it when necessary - codeForGuessLexer = util.UnsafeStringToBytes(code) - } - lexer := GetChromaLexerWithFallback(fileName, language, codeForGuessLexer) - return RenderCodeByLexer(lexer, code), formatLexerName(lexer.Config().Name) -} - -func RenderCodeFast(fileName, language, code string) (output template.HTML, lexerName string) { - return renderCode(fileName, language, code, false) -} - -func RenderCodeSlowGuess(fileName, language, code string) (output template.HTML, lexerName string) { - return renderCode(fileName, language, code, true) + lexer = detectChromaLexerWithAnalyze(fileName, language, util.UnsafeStringToBytes(code)) // it is also slow + return RenderCodeByLexer(lexer, code), lexer, formatLexerName(lexer.Config().Name) } // RenderCodeByLexer returns a HTML version of code string with chroma syntax highlighting classes @@ -204,7 +136,7 @@ func RenderFullFile(fileName, language string, code []byte) ([]template.HTML, st html.PreventSurroundingPre(true), ) - lexer := GetChromaLexerWithFallback(fileName, language, code) + lexer := detectChromaLexerWithAnalyze(fileName, language, code) lexerName := formatLexerName(lexer.Config().Name) iterator, err := lexer.Tokenise(nil, string(code)) diff --git a/modules/highlight/highlight_test.go b/modules/highlight/highlight_test.go index 69aff07b04..d026210475 100644 --- a/modules/highlight/highlight_test.go +++ b/modules/highlight/highlight_test.go @@ -205,36 +205,3 @@ func TestUnsafeSplitHighlightedLines(t *testing.T) { assert.Equal(t, "a\n", string(ret[0])) assert.Equal(t, "b\n", string(ret[1])) } - -func TestGetChromaLexer(t *testing.T) { - globalVars().highlightMapping[".my-html"] = "HTML" - t.Cleanup(func() { delete(globalVars().highlightMapping, ".my-html") }) - - cases := []struct { - fileName string - language string - content string - expected string - }{ - {"test.py", "", "", "Python"}, - - {"any-file", "javascript", "", "JavaScript"}, - {"any-file", "", "/* vim: set filetype=python */", "Python"}, - {"any-file", "", "", "fallback"}, - - {"test.fs", "", "", "Forth"}, - {"test.fs", "F#", "", "FSharp"}, - {"test.fs", "", "let x = 1", "FSharp"}, - - {"test.c", "", "", "C"}, - {"test.C", "", "", "C++"}, - {"OLD-CODE.PAS", "", "", "ObjectPascal"}, - {"test.my-html", "", "", "HTML"}, - } - for _, c := range cases { - lexer := GetChromaLexerWithFallback(c.fileName, c.language, []byte(c.content)) - if assert.NotNil(t, lexer, "case: %+v", c) { - assert.Equal(t, c.expected, lexer.Config().Name, "case: %+v", c) - } - } -} diff --git a/modules/highlight/lexerdetect.go b/modules/highlight/lexerdetect.go new file mode 100644 index 0000000000..5b39617566 --- /dev/null +++ b/modules/highlight/lexerdetect.go @@ -0,0 +1,279 @@ +// Copyright 2026 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package highlight + +import ( + "path" + "strings" + "sync" + + "code.gitea.io/gitea/modules/analyze" + "code.gitea.io/gitea/modules/log" + + "github.com/alecthomas/chroma/v2" + "github.com/alecthomas/chroma/v2/lexers" + "github.com/go-enry/go-enry/v2" +) + +const mapKeyLowerPrefix = "lower/" + +// chromaLexers is fully managed by us to do fast lookup for chroma lexers by file name or language name +// Don't use lexers.Get because it is very slow in many cases (iterate all rules, filepath glob match, etc.) +var chromaLexers = sync.OnceValue(func() (ret struct { + conflictingExtLangMap map[string]string + + lowerNameMap map[string]chroma.Lexer // lexer name (lang name) in lower-case + fileBaseMap map[string]chroma.Lexer + fileExtMap map[string]chroma.Lexer + fileParts []struct { + part string + lexer chroma.Lexer + } +}, +) { + ret.lowerNameMap = make(map[string]chroma.Lexer) + ret.fileBaseMap = make(map[string]chroma.Lexer) + ret.fileExtMap = make(map[string]chroma.Lexer) + + // Chroma has overlaps in file extension for different languages, + // When we need to do fast render, there is no way to detect the language by content, + // So we can only choose some default languages for the overlapped file extensions. + ret.conflictingExtLangMap = map[string]string{ + ".as": "ActionScript 3", // ActionScript + ".asm": "NASM", // TASM, NASM, RGBDS Assembly, Z80 Assembly + ".ASM": "NASM", + ".bas": "VB.net", // QBasic + ".bf": "Beef", // Brainfuck + ".fs": "FSharp", // Forth + ".gd": "GDScript", // GDScript3 + ".h": "C", // Objective-C + ".hcl": "Terraform", // HCL + ".hh": "C++", // HolyC + ".inc": "PHP", // ObjectPascal, POVRay, SourcePawn, PHTML + ".m": "Objective-C", // Matlab, Mathematica, Mason + ".mc": "Mason", // MonkeyC + ".network": "SYSTEMD", // INI + ".php": "PHP", // PHTML + ".php3": "PHP", // PHTML + ".php4": "PHP", // PHTML + ".php5": "PHP", // PHTML + ".pl": "Perl", // Prolog, Raku + ".pm": "Perl", // Promela, Raku + ".pp": "ObjectPascal", // Puppet + ".s": "ArmAsm", // GAS + ".S": "ArmAsm", // R, GAS + ".service": "SYSTEMD", // INI + ".socket": "SYSTEMD", // INI + ".sql": "SQL", // MySQL + ".t": "Perl", // Raku + ".ts": "TypeScript", // TypoScript + ".v": "V", // verilog + ".xslt": "HTML", // XML + } + + isPlainPattern := func(key string) bool { + return !strings.ContainsAny(key, "*?[]") // only support simple patterns + } + + setMapWithLowerKey := func(m map[string]chroma.Lexer, key string, lexer chroma.Lexer) { + if _, conflict := m[key]; conflict { + panic("duplicate key in lexer map: " + key + ", need to add it to conflictingExtLangMap") + } + m[key] = lexer + m[mapKeyLowerPrefix+strings.ToLower(key)] = lexer + } + + processFileName := func(fileName string, lexer chroma.Lexer) bool { + if isPlainPattern(fileName) { + // full base name match + setMapWithLowerKey(ret.fileBaseMap, fileName, lexer) + return true + } + if strings.HasPrefix(fileName, "*") { + // ext name match: "*.js" + fileExt := strings.Trim(fileName, "*") + if isPlainPattern(fileExt) { + presetName := ret.conflictingExtLangMap[fileExt] + if presetName == "" || lexer.Config().Name == presetName { + setMapWithLowerKey(ret.fileExtMap, fileExt, lexer) + } + return true + } + } + if strings.HasSuffix(fileName, "*") { + // part match: "*.env.*" + filePart := strings.Trim(fileName, "*") + if isPlainPattern(filePart) { + ret.fileParts = append(ret.fileParts, struct { + part string + lexer chroma.Lexer + }{ + part: filePart, + lexer: lexer, + }) + return true + } + } + return false + } + + expandGlobPatterns := func(patterns []string) []string { + // expand patterns like "file.[ch]" to "file.c" and "file.h", only one pair of "[]" is supported, enough for current Chroma lexers + for idx, s := range patterns { + idx1 := strings.IndexByte(s, '[') + idx2 := strings.IndexByte(s, ']') + if idx1 != -1 && idx2 != -1 && idx2 > idx1+1 { + left, mid, right := s[:idx1], s[idx1+1:idx2], s[idx2+1:] + patterns[idx] = left + mid[0:1] + right + for i := 1; i < len(mid); i++ { + patterns = append(patterns, left+mid[i:i+1]+right) + } + } + } + return patterns + } + + // add lexers to our map, for fast lookup + for _, lexer := range lexers.GlobalLexerRegistry.Lexers { + cfg := lexer.Config() + ret.lowerNameMap[strings.ToLower(lexer.Config().Name)] = lexer + for _, alias := range cfg.Aliases { + ret.lowerNameMap[strings.ToLower(alias)] = lexer + } + for _, s := range expandGlobPatterns(cfg.Filenames) { + if !processFileName(s, lexer) { + panic("unsupported file name pattern in lexer: " + s) + } + } + for _, s := range expandGlobPatterns(cfg.AliasFilenames) { + if !processFileName(s, lexer) { + panic("unsupported alias file name pattern in lexer: " + s) + } + } + } + + // final check: make sure the default ext-lang mapping is correct, nothing is missing + for ext, lexerName := range ret.conflictingExtLangMap { + if lexer, ok := ret.fileExtMap[ext]; !ok || lexer.Config().Name != lexerName { + panic("missing default ext-lang mapping for: " + ext) + } + } + return ret +}) + +func normalizeFileNameLang(fileName, fileLang string) (string, string) { + fileName = path.Base(fileName) + fileLang, _, _ = strings.Cut(fileLang, "?") // maybe, the value from gitattributes might contain `?` parameters? + ext := path.Ext(fileName) + // the "lang" might come from enry or gitattributes, it has different naming for some languages + switch fileLang { + case "F#": + fileLang = "FSharp" + case "Pascal": + fileLang = "ObjectPascal" + case "C": + if ext == ".C" || ext == ".H" { + fileLang = "C++" + } + } + return fileName, fileLang +} + +func DetectChromaLexerByFileName(fileName, fileLang string) chroma.Lexer { + lexer, _ := detectChromaLexerByFileName(fileName, fileLang) + return lexer +} + +func detectChromaLexerByFileName(fileName, fileLang string) (_ chroma.Lexer, byLang bool) { + fileName, fileLang = normalizeFileNameLang(fileName, fileLang) + fileExt := path.Ext(fileName) + + // apply custom mapping for file extension, highest priority, for example: + // * ".my-js" -> ".js" + // * ".my-html" -> "HTML" + if fileExt != "" { + if val, ok := globalVars().highlightMapping[fileExt]; ok { + if strings.HasPrefix(val, ".") { + fileName = "dummy" + val + fileLang = "" + } else { + fileLang = val + } + } + } + + // try to use language for lexer name + if fileLang != "" { + lexer := chromaLexers().lowerNameMap[strings.ToLower(fileLang)] + if lexer != nil { + return lexer, true + } + } + + if fileName == "" { + return lexers.Fallback, false + } + + // try base name + { + baseName := path.Base(fileName) + if lexer, ok := chromaLexers().fileBaseMap[baseName]; ok { + return lexer, false + } else if lexer, ok = chromaLexers().fileBaseMap[mapKeyLowerPrefix+strings.ToLower(baseName)]; ok { + return lexer, false + } + } + + if fileExt == "" { + return lexers.Fallback, false + } + + // try ext name + { + if lexer, ok := chromaLexers().fileExtMap[fileExt]; ok { + return lexer, false + } else if lexer, ok = chromaLexers().fileExtMap[mapKeyLowerPrefix+strings.ToLower(fileExt)]; ok { + return lexer, false + } + } + + // try file part match, for example: ".env.local" for "*.env.*" + // it assumes that there must be a dot in filename (fileExt isn't empty) + for _, item := range chromaLexers().fileParts { + if strings.Contains(fileName, item.part) { + return item.lexer, false + } + } + return lexers.Fallback, false +} + +// detectChromaLexerWithAnalyze returns a chroma lexer by given file name, language and code content. All parameters can be optional. +// When code content is provided, it will be slow if no lexer is found by file name or language. +// If no lexer is found, it will return the fallback lexer. +func detectChromaLexerWithAnalyze(fileName, lang string, code []byte) chroma.Lexer { + lexer, byLang := detectChromaLexerByFileName(fileName, lang) + + // if lang is provided, and it matches a lexer, use it directly + if byLang { + return lexer + } + + // if a lexer is detected and there is no conflict for the file extension, use it directly + fileExt := path.Ext(fileName) + _, hasConflicts := chromaLexers().conflictingExtLangMap[fileExt] + if !hasConflicts && lexer != lexers.Fallback { + return lexer + } + + // try to detect language by content, for best guessing for the language + // when using "code" to detect, analyze.GetCodeLanguage is slow, it iterates many rules to detect language from content + analyzedLanguage := analyze.GetCodeLanguage(fileName, code) + lexer = DetectChromaLexerByFileName(fileName, analyzedLanguage) + if lexer == lexers.Fallback { + if analyzedLanguage != enry.OtherLanguage { + log.Warn("No chroma lexer found for enry detected language: %s (file: %s), need to fix the language mapping between enry and chroma.", analyzedLanguage, fileName) + } + } + return lexer +} diff --git a/modules/highlight/lexerdetect_test.go b/modules/highlight/lexerdetect_test.go new file mode 100644 index 0000000000..868e793a68 --- /dev/null +++ b/modules/highlight/lexerdetect_test.go @@ -0,0 +1,90 @@ +// Copyright 2026 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package highlight + +import ( + "strings" + "testing" + + "github.com/alecthomas/chroma/v2/lexers" + "github.com/stretchr/testify/assert" +) + +func BenchmarkDetectChromaLexerByFileName(b *testing.B) { + for b.Loop() { + // BenchmarkDetectChromaLexerByFileName-12 18214717 61.35 ns/op + DetectChromaLexerByFileName("a.sql", "") + } +} + +func BenchmarkDetectChromaLexerWithAnalyze(b *testing.B) { + b.StopTimer() + code := []byte(strings.Repeat("SELECT * FROM table;\n", 1000)) + b.StartTimer() + for b.Loop() { + // BenchmarkRenderCodeSlowGuess-12 87946 13310 ns/op + detectChromaLexerWithAnalyze("a", "", code) + } +} + +func BenchmarkChromaAnalyze(b *testing.B) { + b.StopTimer() + code := strings.Repeat("SELECT * FROM table;\n", 1000) + b.StartTimer() + for b.Loop() { + // comparing to detectChromaLexerWithAnalyze (go-enry), "chroma/lexers.Analyse" is very slow + // BenchmarkChromaAnalyze-12 519 2247104 ns/op + lexers.Analyse(code) + } +} + +func BenchmarkRenderCodeByLexer(b *testing.B) { + b.StopTimer() + code := strings.Repeat("SELECT * FROM table;\n", 1000) + lexer := DetectChromaLexerByFileName("a.sql", "") + b.StartTimer() + for b.Loop() { + // Really slow ....... + // BenchmarkRenderCodeByLexer-12 22 47159038 ns/op + RenderCodeByLexer(lexer, code) + } +} + +func TestDetectChromaLexer(t *testing.T) { + globalVars().highlightMapping[".my-html"] = "HTML" + t.Cleanup(func() { delete(globalVars().highlightMapping, ".my-html") }) + + cases := []struct { + fileName string + language string + content string + expected string + }{ + {"test.py", "", "", "Python"}, + + {"any-file", "javascript", "", "JavaScript"}, + {"any-file", "", "/* vim: set filetype=python */", "Python"}, + {"any-file", "", "", "fallback"}, + + {"test.fs", "", "", "FSharp"}, + {"test.fs", "F#", "", "FSharp"}, + {"test.fs", "", "let x = 1", "FSharp"}, + + {"test.c", "", "", "C"}, + {"test.C", "", "", "C++"}, + {"OLD-CODE.PAS", "", "", "ObjectPascal"}, + {"test.my-html", "", "", "HTML"}, + + {"a.php", "", "", "PHP"}, + {"a.sql", "", "", "SQL"}, + {"dhcpd.conf", "", "", "ISCdhcpd"}, + {".env.my-production", "", "", "Bash"}, + } + for _, c := range cases { + lexer := detectChromaLexerWithAnalyze(c.fileName, c.language, []byte(c.content)) + if assert.NotNil(t, lexer, "case: %+v", c) { + assert.Equal(t, c.expected, lexer.Config().Name, "case: %+v", c) + } + } +} diff --git a/modules/indexer/code/search.go b/modules/indexer/code/search.go index 907dd1a537..eb20b70e71 100644 --- a/modules/indexer/code/search.go +++ b/modules/indexer/code/search.go @@ -72,7 +72,8 @@ func writeStrings(buf *bytes.Buffer, strs ...string) error { func HighlightSearchResultCode(filename, language string, lineNums []int, code string) []*ResultLine { // we should highlight the whole code block first, otherwise it doesn't work well with multiple line highlighting - hl, _ := highlight.RenderCodeFast(filename, language, code) + lexer := highlight.DetectChromaLexerByFileName(filename, language) + hl := highlight.RenderCodeByLexer(lexer, code) highlightedLines := strings.Split(string(hl), "\n") // The lineNums outputted by render might not match the original lineNums, because "highlight" removes the last `\n` diff --git a/modules/markup/orgmode/orgmode.go b/modules/markup/orgmode/orgmode.go index 17d994734a..fd3071645a 100644 --- a/modules/markup/orgmode/orgmode.go +++ b/modules/markup/orgmode/orgmode.go @@ -56,7 +56,7 @@ func Render(ctx *markup.RenderContext, input io.Reader, output io.Writer) error } }() - lexer := highlight.GetChromaLexerWithFallback("", lang, nil) // don't use content to detect, it is too slow + lexer := highlight.DetectChromaLexerByFileName("", lang) // don't use content to detect, it is too slow lexer = chroma.Coalesce(lexer) sb := &strings.Builder{} diff --git a/routers/web/repo/blame.go b/routers/web/repo/blame.go index 25eb88eefc..4fb61bee6d 100644 --- a/routers/web/repo/blame.go +++ b/routers/web/repo/blame.go @@ -267,7 +267,7 @@ func renderBlame(ctx *context.Context, blameParts []*gitrepo.BlamePart, commitNa bufContent := buf.Bytes() bufContent = charset.ToUTF8(bufContent, charset.ConvertOpts{}) - highlighted, lexerName := highlight.RenderCodeSlowGuess(path.Base(ctx.Repo.TreePath), language, util.UnsafeBytesToString(bufContent)) + highlighted, _, lexerDisplayName := highlight.RenderCodeSlowGuess(path.Base(ctx.Repo.TreePath), language, util.UnsafeBytesToString(bufContent)) unsafeLines := highlight.UnsafeSplitHighlightedLines(highlighted) for i, br := range rows { var line template.HTML @@ -280,5 +280,5 @@ func renderBlame(ctx *context.Context, blameParts []*gitrepo.BlamePart, commitNa ctx.Data["EscapeStatus"] = escapeStatus ctx.Data["BlameRows"] = rows - ctx.Data["LexerName"] = lexerName + ctx.Data["LexerName"] = lexerDisplayName } diff --git a/services/gitdiff/gitdiff.go b/services/gitdiff/gitdiff.go index 6b29582208..7777cf4a1c 100644 --- a/services/gitdiff/gitdiff.go +++ b/services/gitdiff/gitdiff.go @@ -40,6 +40,7 @@ import ( "code.gitea.io/gitea/modules/translation" "code.gitea.io/gitea/modules/util" + "github.com/alecthomas/chroma/v2" "github.com/sergi/go-diff/diffmatchpatch" stdcharset "golang.org/x/net/html/charset" "golang.org/x/text/encoding" @@ -306,6 +307,7 @@ type DiffSection struct { language *diffVarMutable[string] highlightedLeftLines *diffVarMutable[map[int]template.HTML] highlightedRightLines *diffVarMutable[map[int]template.HTML] + highlightLexer *diffVarMutable[chroma.Lexer] FileName string Lines []*DiffLine @@ -347,8 +349,10 @@ func (diffSection *DiffSection) getLineContentForRender(lineIdx int, diffLine *D if setting.Git.DisableDiffHighlight { return template.HTML(html.EscapeString(diffLine.Content[1:])) } - h, _ = highlight.RenderCodeFast(diffSection.FileName, fileLanguage, diffLine.Content[1:]) - return h + if diffSection.highlightLexer.value == nil { + diffSection.highlightLexer.value = highlight.DetectChromaLexerByFileName(diffSection.FileName, fileLanguage) + } + return highlight.RenderCodeByLexer(diffSection.highlightLexer.value, diffLine.Content[1:]) } func (diffSection *DiffSection) getDiffLineForRender(diffLineType DiffLineType, leftLine, rightLine *DiffLine, locale translation.Locale) DiffInline { @@ -391,6 +395,12 @@ func (diffSection *DiffSection) getDiffLineForRender(diffLineType DiffLineType, // GetComputedInlineDiffFor computes inline diff for the given line. func (diffSection *DiffSection) GetComputedInlineDiffFor(diffLine *DiffLine, locale translation.Locale) DiffInline { + defer func() { + if err := recover(); err != nil { + // the logic is too complex in this function, help to catch any panic because Golang template doesn't print the stack + log.Error("panic in GetComputedInlineDiffFor: %v\nStack: %s", err, log.Stack(2)) + } + }() // try to find equivalent diff line. ignore, otherwise switch diffLine.Type { case DiffLineSection: @@ -452,6 +462,7 @@ type DiffFile struct { // for render purpose only, will be filled by the extra loop in GitDiffForRender, the maps of lines are 0-based language diffVarMutable[string] + highlightRender diffVarMutable[chroma.Lexer] // cache render (atm: lexer) for current file, only detect once for line-by-line mode highlightedLeftLines diffVarMutable[map[int]template.HTML] highlightedRightLines diffVarMutable[map[int]template.HTML] } @@ -932,6 +943,7 @@ func skipToNextDiffHead(input *bufio.Reader) (line string, err error) { func newDiffSectionForDiffFile(curFile *DiffFile) *DiffSection { return &DiffSection{ language: &curFile.language, + highlightLexer: &curFile.highlightRender, highlightedLeftLines: &curFile.highlightedLeftLines, highlightedRightLines: &curFile.highlightedRightLines, } @@ -1395,7 +1407,8 @@ func highlightCodeLines(name, lang string, sections []*DiffSection, isLeft bool, } content := util.UnsafeBytesToString(charset.ToUTF8(rawContent, charset.ConvertOpts{})) - highlightedNewContent, _ := highlight.RenderCodeFast(name, lang, content) + lexer := highlight.DetectChromaLexerByFileName(name, lang) + highlightedNewContent := highlight.RenderCodeByLexer(lexer, content) unsafeLines := highlight.UnsafeSplitHighlightedLines(highlightedNewContent) lines := make(map[int]template.HTML, len(unsafeLines)) // only save the highlighted lines we need, but not the whole file, to save memory diff --git a/services/gitdiff/gitdiff_excerpt.go b/services/gitdiff/gitdiff_excerpt.go index be66d8e2af..4b1958fc11 100644 --- a/services/gitdiff/gitdiff_excerpt.go +++ b/services/gitdiff/gitdiff_excerpt.go @@ -11,6 +11,8 @@ import ( "io" "code.gitea.io/gitea/modules/setting" + + "github.com/alecthomas/chroma/v2" ) type BlobExcerptOptions struct { @@ -65,6 +67,7 @@ func BuildBlobExcerptDiffSection(filePath string, reader io.Reader, opts BlobExc chunkSize := BlobExcerptChunkSize section := &DiffSection{ language: &diffVarMutable[string]{value: language}, + highlightLexer: &diffVarMutable[chroma.Lexer]{}, highlightedLeftLines: &diffVarMutable[map[int]template.HTML]{}, highlightedRightLines: &diffVarMutable[map[int]template.HTML]{}, FileName: filePath, diff --git a/services/gitdiff/highlightdiff_test.go b/services/gitdiff/highlightdiff_test.go index b99b7e3675..ea9a8829ed 100644 --- a/services/gitdiff/highlightdiff_test.go +++ b/services/gitdiff/highlightdiff_test.go @@ -76,8 +76,8 @@ func TestDiffWithHighlight(t *testing.T) { }) t.Run("ComplexDiff1", func(t *testing.T) { - oldCode, _ := highlight.RenderCodeFast("a.go", "Go", `xxx || yyy`) - newCode, _ := highlight.RenderCodeFast("a.go", "Go", `bot&xxx || bot&yyy`) + oldCode, _, _ := highlight.RenderCodeSlowGuess("a.go", "Go", `xxx || yyy`) + newCode, _, _ := highlight.RenderCodeSlowGuess("a.go", "Go", `bot&xxx || bot&yyy`) hcd := newHighlightCodeDiff() out := hcd.diffLineWithHighlight(DiffLineAdd, oldCode, newCode) assert.Equal(t, strings.ReplaceAll(`