mirror of
https://github.com/go-gitea/gitea.git
synced 2026-02-14 18:47:20 +01:00
Refactor highlight and diff (#36599)
1. Fix a performance regression when using line-by-line highlighting * the root cause is that chroma's `lexers.Get` is slow and the lexer cache went missing during recent changes 2. Clarify the chroma lexer detection behavior * the lexer detection logic is now fully managed by us, including the handling of the overriding problems, so everything is under our control 3. Clarify the "code analyze" behavior; there are now only 2 usages: * use only the file name and language to detect the lexer (very fast), mainly for the "diff" page, which contains a lot of files * if no lexer is detected by file name and language, detect again using the code content (slow), mainly for the "view file" or "blame" page, which gives the best result 4. Fix a git diff bug that caused a "broken pipe" error for large diff files
This commit is contained in:
@@ -28,44 +28,37 @@ const (
|
||||
|
||||
// GetRawDiff dumps diff results of repository in given commit ID to io.Writer.
|
||||
func GetRawDiff(repo *Repository, commitID string, diffType RawDiffType, writer io.Writer) (retErr error) {
|
||||
diffOutput, diffFinish, err := getRepoRawDiffForFile(repo.Ctx, repo, "", commitID, diffType, "")
|
||||
cmd, err := getRepoRawDiffForFileCmd(repo.Ctx, repo, "", commitID, diffType, "")
|
||||
if err != nil {
|
||||
return err
|
||||
return fmt.Errorf("getRepoRawDiffForFileCmd: %w", err)
|
||||
}
|
||||
defer func() {
|
||||
err := diffFinish()
|
||||
if retErr == nil {
|
||||
retErr = err // only return command's error if no previous error
|
||||
}
|
||||
}()
|
||||
_, err = io.Copy(writer, diffOutput)
|
||||
return err
|
||||
return cmd.WithStdoutCopy(writer).RunWithStderr(repo.Ctx)
|
||||
}
|
||||
|
||||
// GetFileDiffCutAroundLine cuts the old or new part of the diff of a file around a specific line number
|
||||
func GetFileDiffCutAroundLine(
|
||||
repo *Repository, startCommit, endCommit, treePath string,
|
||||
line int64, old bool, numbersOfLine int,
|
||||
) (_ string, retErr error) {
|
||||
diffOutput, diffFinish, err := getRepoRawDiffForFile(repo.Ctx, repo, startCommit, endCommit, RawDiffNormal, treePath)
|
||||
) (ret string, retErr error) {
|
||||
cmd, err := getRepoRawDiffForFileCmd(repo.Ctx, repo, startCommit, endCommit, RawDiffNormal, treePath)
|
||||
if err != nil {
|
||||
return "", err
|
||||
return "", fmt.Errorf("getRepoRawDiffForFileCmd: %w", err)
|
||||
}
|
||||
defer func() {
|
||||
err := diffFinish()
|
||||
if retErr == nil {
|
||||
retErr = err // only return command's error if no previous error
|
||||
}
|
||||
}()
|
||||
return CutDiffAroundLine(diffOutput, line, old, numbersOfLine)
|
||||
stdoutReader, stdoutClose := cmd.MakeStdoutPipe()
|
||||
defer stdoutClose()
|
||||
cmd.WithPipelineFunc(func(ctx gitcmd.Context) error {
|
||||
ret, err = CutDiffAroundLine(stdoutReader, line, old, numbersOfLine)
|
||||
return err
|
||||
})
|
||||
return ret, cmd.RunWithStderr(repo.Ctx)
|
||||
}
|
||||
|
||||
// getRepoRawDiffForFile returns an io.Reader for the diff results of file in given commit ID
|
||||
// and a "finish" function to wait for the git command and clean up resources after reading is done.
|
||||
func getRepoRawDiffForFile(ctx context.Context, repo *Repository, startCommit, endCommit string, diffType RawDiffType, file string) (io.Reader, func() gitcmd.RunStdError, error) {
|
||||
func getRepoRawDiffForFileCmd(_ context.Context, repo *Repository, startCommit, endCommit string, diffType RawDiffType, file string) (*gitcmd.Command, error) {
|
||||
commit, err := repo.GetCommit(endCommit)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
return nil, err
|
||||
}
|
||||
var files []string
|
||||
if len(file) > 0 {
|
||||
@@ -84,7 +77,7 @@ func getRepoRawDiffForFile(ctx context.Context, repo *Repository, startCommit, e
|
||||
} else {
|
||||
c, err := commit.Parent(0)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
return nil, err
|
||||
}
|
||||
cmd.AddArguments("diff").
|
||||
AddOptionFormat("--find-renames=%s", setting.Git.DiffRenameSimilarityThreshold).
|
||||
@@ -99,25 +92,15 @@ func getRepoRawDiffForFile(ctx context.Context, repo *Repository, startCommit, e
|
||||
} else {
|
||||
c, err := commit.Parent(0)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
return nil, err
|
||||
}
|
||||
query := fmt.Sprintf("%s...%s", endCommit, c.ID.String())
|
||||
cmd.AddArguments("format-patch", "--no-signature", "--stdout").AddDynamicArguments(query).AddDashesAndList(files...)
|
||||
}
|
||||
default:
|
||||
return nil, nil, util.NewInvalidArgumentErrorf("invalid diff type: %s", diffType)
|
||||
return nil, util.NewInvalidArgumentErrorf("invalid diff type: %s", diffType)
|
||||
}
|
||||
|
||||
stdoutReader, stdoutReaderClose := cmd.MakeStdoutPipe()
|
||||
err = cmd.StartWithStderr(ctx)
|
||||
if err != nil {
|
||||
stdoutReaderClose()
|
||||
return nil, nil, err
|
||||
}
|
||||
return stdoutReader, func() gitcmd.RunStdError {
|
||||
stdoutReaderClose()
|
||||
return cmd.WaitWithStderr()
|
||||
}, nil
|
||||
return cmd, nil
|
||||
}
|
||||
|
||||
// ParseDiffHunkString parse the diff hunk content and return
|
||||
@@ -254,7 +237,7 @@ func CutDiffAroundLine(originalDiff io.Reader, line int64, old bool, numbersOfLi
|
||||
}
|
||||
}
|
||||
if err := scanner.Err(); err != nil {
|
||||
return "", err
|
||||
return "", fmt.Errorf("CutDiffAroundLine: scan: %w", err)
|
||||
}
|
||||
|
||||
// No hunk found
|
||||
|
||||
@@ -306,6 +306,10 @@ func (c *Command) MakeStdinPipe() (writer PipeWriter, closer func()) {
|
||||
// MakeStdoutPipe creates a reader for the command's stdout.
|
||||
// The returned closer function must be called by the caller to close the pipe.
|
||||
// After the pipe reader is closed, the unread data will be discarded.
|
||||
//
|
||||
// If the process (git command) still tries to write after the pipe is closed, the Wait error will be "signal: broken pipe".
|
||||
// WithPipelineFunc + Run won't return "broken pipe" error in this case if the callback returns no error.
|
||||
// But if you are calling the Start / Wait family of functions, you should either drain the pipe before closing it, or handle the Wait error correctly.
|
||||
func (c *Command) MakeStdoutPipe() (reader PipeReader, closer func()) {
|
||||
return c.makeStdoutStderr(&c.cmdStdout)
|
||||
}
|
||||
|
||||
@@ -11,20 +11,16 @@ import (
|
||||
gohtml "html"
|
||||
"html/template"
|
||||
"io"
|
||||
"path"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
"code.gitea.io/gitea/modules/analyze"
|
||||
"code.gitea.io/gitea/modules/log"
|
||||
"code.gitea.io/gitea/modules/setting"
|
||||
"code.gitea.io/gitea/modules/util"
|
||||
|
||||
"github.com/alecthomas/chroma/v2"
|
||||
"github.com/alecthomas/chroma/v2/formatters/html"
|
||||
"github.com/alecthomas/chroma/v2/lexers"
|
||||
"github.com/alecthomas/chroma/v2/styles"
|
||||
"github.com/go-enry/go-enry/v2"
|
||||
)
|
||||
|
||||
// don't index files larger than this many bytes for performance purposes
|
||||
@@ -84,85 +80,21 @@ func UnsafeSplitHighlightedLines(code template.HTML) (ret [][]byte) {
|
||||
}
|
||||
}
|
||||
|
||||
func getChromaLexerByLanguage(fileName, lang string) chroma.Lexer {
|
||||
lang, _, _ = strings.Cut(lang, "?") // maybe, the value from gitattributes might contain `?` parameters?
|
||||
ext := path.Ext(fileName)
|
||||
// the "lang" might come from enry, it has different naming for some languages
|
||||
switch lang {
|
||||
case "F#":
|
||||
lang = "FSharp"
|
||||
case "Pascal":
|
||||
lang = "ObjectPascal"
|
||||
case "C":
|
||||
if ext == ".C" || ext == ".H" {
|
||||
lang = "C++"
|
||||
}
|
||||
}
|
||||
if lang == "" && util.AsciiEqualFold(ext, ".sql") {
|
||||
// there is a bug when using MySQL lexer: "--\nSELECT", the second line will be rendered as comment incorrectly
|
||||
lang = "SQL"
|
||||
}
|
||||
// lexers.Get is slow if the language name can't be matched directly: it does extra "Match" call to iterate all lexers
|
||||
return lexers.Get(lang)
|
||||
}
|
||||
|
||||
// GetChromaLexerWithFallback returns a chroma lexer by given file name, language and code content. All parameters can be optional.
|
||||
// When code content is provided, it will be slow if no lexer is found by file name or language.
|
||||
// If no lexer is found, it will return the fallback lexer.
|
||||
func GetChromaLexerWithFallback(fileName, lang string, code []byte) (lexer chroma.Lexer) {
|
||||
if lang != "" {
|
||||
lexer = getChromaLexerByLanguage(fileName, lang)
|
||||
}
|
||||
|
||||
if lexer == nil {
|
||||
fileExt := path.Ext(fileName)
|
||||
if val, ok := globalVars().highlightMapping[fileExt]; ok {
|
||||
lexer = getChromaLexerByLanguage(fileName, val) // use mapped value to find lexer
|
||||
}
|
||||
}
|
||||
|
||||
if lexer == nil {
|
||||
// when using "code" to detect, analyze.GetCodeLanguage is slower, it iterates many rules to detect language from content
|
||||
// this is the old logic: use enry to detect language, and use chroma to render, but their naming is different for some languages
|
||||
enryLanguage := analyze.GetCodeLanguage(fileName, code)
|
||||
lexer = getChromaLexerByLanguage(fileName, enryLanguage)
|
||||
if lexer == nil {
|
||||
if enryLanguage != enry.OtherLanguage {
|
||||
log.Warn("No chroma lexer found for enry detected language: %s (file: %s), need to fix the language mapping between enry and chroma.", enryLanguage, fileName)
|
||||
}
|
||||
lexer = lexers.Match(fileName) // lexers.Match will search by its basename and extname
|
||||
}
|
||||
}
|
||||
|
||||
return util.IfZero(lexer, lexers.Fallback)
|
||||
}
|
||||
|
||||
func renderCode(fileName, language, code string, slowGuess bool) (output template.HTML, lexerName string) {
|
||||
// RenderCodeSlowGuess tries to get a lexer by file name and language first,
|
||||
// if not found, it will try to guess the lexer by code content, which is slow (more than several hundreds of milliseconds).
|
||||
func RenderCodeSlowGuess(fileName, language, code string) (output template.HTML, lexer chroma.Lexer, lexerDisplayName string) {
|
||||
// diff view newline will be passed as empty, change to literal '\n' so it can be copied
|
||||
// preserve literal newline in blame view
|
||||
if code == "" || code == "\n" {
|
||||
return "\n", ""
|
||||
return "\n", nil, ""
|
||||
}
|
||||
|
||||
if len(code) > sizeLimit {
|
||||
return template.HTML(template.HTMLEscapeString(code)), ""
|
||||
return template.HTML(template.HTMLEscapeString(code)), nil, ""
|
||||
}
|
||||
|
||||
var codeForGuessLexer []byte
|
||||
if slowGuess {
|
||||
// it is slower to guess lexer by code content, so only do it when necessary
|
||||
codeForGuessLexer = util.UnsafeStringToBytes(code)
|
||||
}
|
||||
lexer := GetChromaLexerWithFallback(fileName, language, codeForGuessLexer)
|
||||
return RenderCodeByLexer(lexer, code), formatLexerName(lexer.Config().Name)
|
||||
}
|
||||
|
||||
func RenderCodeFast(fileName, language, code string) (output template.HTML, lexerName string) {
|
||||
return renderCode(fileName, language, code, false)
|
||||
}
|
||||
|
||||
func RenderCodeSlowGuess(fileName, language, code string) (output template.HTML, lexerName string) {
|
||||
return renderCode(fileName, language, code, true)
|
||||
lexer = detectChromaLexerWithAnalyze(fileName, language, util.UnsafeStringToBytes(code)) // it is also slow
|
||||
return RenderCodeByLexer(lexer, code), lexer, formatLexerName(lexer.Config().Name)
|
||||
}
|
||||
|
||||
// RenderCodeByLexer returns a HTML version of code string with chroma syntax highlighting classes
|
||||
@@ -204,7 +136,7 @@ func RenderFullFile(fileName, language string, code []byte) ([]template.HTML, st
|
||||
html.PreventSurroundingPre(true),
|
||||
)
|
||||
|
||||
lexer := GetChromaLexerWithFallback(fileName, language, code)
|
||||
lexer := detectChromaLexerWithAnalyze(fileName, language, code)
|
||||
lexerName := formatLexerName(lexer.Config().Name)
|
||||
|
||||
iterator, err := lexer.Tokenise(nil, string(code))
|
||||
|
||||
@@ -205,36 +205,3 @@ func TestUnsafeSplitHighlightedLines(t *testing.T) {
|
||||
assert.Equal(t, "<span>a</span>\n", string(ret[0]))
|
||||
assert.Equal(t, "<span>b\n</span>", string(ret[1]))
|
||||
}
|
||||
|
||||
func TestGetChromaLexer(t *testing.T) {
|
||||
globalVars().highlightMapping[".my-html"] = "HTML"
|
||||
t.Cleanup(func() { delete(globalVars().highlightMapping, ".my-html") })
|
||||
|
||||
cases := []struct {
|
||||
fileName string
|
||||
language string
|
||||
content string
|
||||
expected string
|
||||
}{
|
||||
{"test.py", "", "", "Python"},
|
||||
|
||||
{"any-file", "javascript", "", "JavaScript"},
|
||||
{"any-file", "", "/* vim: set filetype=python */", "Python"},
|
||||
{"any-file", "", "", "fallback"},
|
||||
|
||||
{"test.fs", "", "", "Forth"},
|
||||
{"test.fs", "F#", "", "FSharp"},
|
||||
{"test.fs", "", "let x = 1", "FSharp"},
|
||||
|
||||
{"test.c", "", "", "C"},
|
||||
{"test.C", "", "", "C++"},
|
||||
{"OLD-CODE.PAS", "", "", "ObjectPascal"},
|
||||
{"test.my-html", "", "", "HTML"},
|
||||
}
|
||||
for _, c := range cases {
|
||||
lexer := GetChromaLexerWithFallback(c.fileName, c.language, []byte(c.content))
|
||||
if assert.NotNil(t, lexer, "case: %+v", c) {
|
||||
assert.Equal(t, c.expected, lexer.Config().Name, "case: %+v", c)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
279
modules/highlight/lexerdetect.go
Normal file
279
modules/highlight/lexerdetect.go
Normal file
@@ -0,0 +1,279 @@
|
||||
// Copyright 2026 The Gitea Authors. All rights reserved.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
package highlight
|
||||
|
||||
import (
|
||||
"path"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
"code.gitea.io/gitea/modules/analyze"
|
||||
"code.gitea.io/gitea/modules/log"
|
||||
|
||||
"github.com/alecthomas/chroma/v2"
|
||||
"github.com/alecthomas/chroma/v2/lexers"
|
||||
"github.com/go-enry/go-enry/v2"
|
||||
)
|
||||
|
||||
const mapKeyLowerPrefix = "lower/"
|
||||
|
||||
// chromaLexers is fully managed by us to do fast lookup for chroma lexers by file name or language name
|
||||
// Don't use lexers.Get because it is very slow in many cases (iterate all rules, filepath glob match, etc.)
|
||||
var chromaLexers = sync.OnceValue(func() (ret struct {
|
||||
conflictingExtLangMap map[string]string
|
||||
|
||||
lowerNameMap map[string]chroma.Lexer // lexer name (lang name) in lower-case
|
||||
fileBaseMap map[string]chroma.Lexer
|
||||
fileExtMap map[string]chroma.Lexer
|
||||
fileParts []struct {
|
||||
part string
|
||||
lexer chroma.Lexer
|
||||
}
|
||||
},
|
||||
) {
|
||||
ret.lowerNameMap = make(map[string]chroma.Lexer)
|
||||
ret.fileBaseMap = make(map[string]chroma.Lexer)
|
||||
ret.fileExtMap = make(map[string]chroma.Lexer)
|
||||
|
||||
// Chroma has overlaps in file extension for different languages,
|
||||
// When we need to do fast render, there is no way to detect the language by content,
|
||||
// So we can only choose some default languages for the overlapped file extensions.
|
||||
ret.conflictingExtLangMap = map[string]string{
|
||||
".as": "ActionScript 3", // ActionScript
|
||||
".asm": "NASM", // TASM, NASM, RGBDS Assembly, Z80 Assembly
|
||||
".ASM": "NASM",
|
||||
".bas": "VB.net", // QBasic
|
||||
".bf": "Beef", // Brainfuck
|
||||
".fs": "FSharp", // Forth
|
||||
".gd": "GDScript", // GDScript3
|
||||
".h": "C", // Objective-C
|
||||
".hcl": "Terraform", // HCL
|
||||
".hh": "C++", // HolyC
|
||||
".inc": "PHP", // ObjectPascal, POVRay, SourcePawn, PHTML
|
||||
".m": "Objective-C", // Matlab, Mathematica, Mason
|
||||
".mc": "Mason", // MonkeyC
|
||||
".network": "SYSTEMD", // INI
|
||||
".php": "PHP", // PHTML
|
||||
".php3": "PHP", // PHTML
|
||||
".php4": "PHP", // PHTML
|
||||
".php5": "PHP", // PHTML
|
||||
".pl": "Perl", // Prolog, Raku
|
||||
".pm": "Perl", // Promela, Raku
|
||||
".pp": "ObjectPascal", // Puppet
|
||||
".s": "ArmAsm", // GAS
|
||||
".S": "ArmAsm", // R, GAS
|
||||
".service": "SYSTEMD", // INI
|
||||
".socket": "SYSTEMD", // INI
|
||||
".sql": "SQL", // MySQL
|
||||
".t": "Perl", // Raku
|
||||
".ts": "TypeScript", // TypoScript
|
||||
".v": "V", // verilog
|
||||
".xslt": "HTML", // XML
|
||||
}
|
||||
|
||||
isPlainPattern := func(key string) bool {
|
||||
return !strings.ContainsAny(key, "*?[]") // only support simple patterns
|
||||
}
|
||||
|
||||
setMapWithLowerKey := func(m map[string]chroma.Lexer, key string, lexer chroma.Lexer) {
|
||||
if _, conflict := m[key]; conflict {
|
||||
panic("duplicate key in lexer map: " + key + ", need to add it to conflictingExtLangMap")
|
||||
}
|
||||
m[key] = lexer
|
||||
m[mapKeyLowerPrefix+strings.ToLower(key)] = lexer
|
||||
}
|
||||
|
||||
processFileName := func(fileName string, lexer chroma.Lexer) bool {
|
||||
if isPlainPattern(fileName) {
|
||||
// full base name match
|
||||
setMapWithLowerKey(ret.fileBaseMap, fileName, lexer)
|
||||
return true
|
||||
}
|
||||
if strings.HasPrefix(fileName, "*") {
|
||||
// ext name match: "*.js"
|
||||
fileExt := strings.Trim(fileName, "*")
|
||||
if isPlainPattern(fileExt) {
|
||||
presetName := ret.conflictingExtLangMap[fileExt]
|
||||
if presetName == "" || lexer.Config().Name == presetName {
|
||||
setMapWithLowerKey(ret.fileExtMap, fileExt, lexer)
|
||||
}
|
||||
return true
|
||||
}
|
||||
}
|
||||
if strings.HasSuffix(fileName, "*") {
|
||||
// part match: "*.env.*"
|
||||
filePart := strings.Trim(fileName, "*")
|
||||
if isPlainPattern(filePart) {
|
||||
ret.fileParts = append(ret.fileParts, struct {
|
||||
part string
|
||||
lexer chroma.Lexer
|
||||
}{
|
||||
part: filePart,
|
||||
lexer: lexer,
|
||||
})
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
expandGlobPatterns := func(patterns []string) []string {
|
||||
// expand patterns like "file.[ch]" to "file.c" and "file.h", only one pair of "[]" is supported, enough for current Chroma lexers
|
||||
for idx, s := range patterns {
|
||||
idx1 := strings.IndexByte(s, '[')
|
||||
idx2 := strings.IndexByte(s, ']')
|
||||
if idx1 != -1 && idx2 != -1 && idx2 > idx1+1 {
|
||||
left, mid, right := s[:idx1], s[idx1+1:idx2], s[idx2+1:]
|
||||
patterns[idx] = left + mid[0:1] + right
|
||||
for i := 1; i < len(mid); i++ {
|
||||
patterns = append(patterns, left+mid[i:i+1]+right)
|
||||
}
|
||||
}
|
||||
}
|
||||
return patterns
|
||||
}
|
||||
|
||||
// add lexers to our map, for fast lookup
|
||||
for _, lexer := range lexers.GlobalLexerRegistry.Lexers {
|
||||
cfg := lexer.Config()
|
||||
ret.lowerNameMap[strings.ToLower(lexer.Config().Name)] = lexer
|
||||
for _, alias := range cfg.Aliases {
|
||||
ret.lowerNameMap[strings.ToLower(alias)] = lexer
|
||||
}
|
||||
for _, s := range expandGlobPatterns(cfg.Filenames) {
|
||||
if !processFileName(s, lexer) {
|
||||
panic("unsupported file name pattern in lexer: " + s)
|
||||
}
|
||||
}
|
||||
for _, s := range expandGlobPatterns(cfg.AliasFilenames) {
|
||||
if !processFileName(s, lexer) {
|
||||
panic("unsupported alias file name pattern in lexer: " + s)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// final check: make sure the default ext-lang mapping is correct, nothing is missing
|
||||
for ext, lexerName := range ret.conflictingExtLangMap {
|
||||
if lexer, ok := ret.fileExtMap[ext]; !ok || lexer.Config().Name != lexerName {
|
||||
panic("missing default ext-lang mapping for: " + ext)
|
||||
}
|
||||
}
|
||||
return ret
|
||||
})
|
||||
|
||||
// normalizeFileNameLang prepares a file name and a language name for lexer lookup.
//
// The file name is reduced to its base name; an empty file name stays empty.
// The language name is cut at the first "?" and a few names that are spelled
// differently by the language source (enry or gitattributes) are mapped to
// the names this package looks up: "F#" -> "FSharp", "Pascal" -> "ObjectPascal",
// and "C" -> "C++" when the file extension is ".C" or ".H".
//
// It returns the normalized file name and language name, in this order.
func normalizeFileNameLang(fileName, fileLang string) (string, string) {
	if fileName != "" {
		// path.Base("") returns ".", which would turn "no file name" into a bogus file name
		// and make the empty-name check of the callers unreachable.
		fileName = path.Base(fileName)
	}
	fileLang, _, _ = strings.Cut(fileLang, "?") // maybe, the value from gitattributes might contain `?` parameters?
	// the "lang" might come from enry or gitattributes, it has different naming for some languages
	switch fileLang {
	case "F#":
		fileLang = "FSharp"
	case "Pascal":
		fileLang = "ObjectPascal"
	case "C":
		// the upper-case extensions are treated as C++ sources/headers
		if ext := path.Ext(fileName); ext == ".C" || ext == ".H" {
			fileLang = "C++"
		}
	}
	return fileName, fileLang
}
|
||||
|
||||
func DetectChromaLexerByFileName(fileName, fileLang string) chroma.Lexer {
|
||||
lexer, _ := detectChromaLexerByFileName(fileName, fileLang)
|
||||
return lexer
|
||||
}
|
||||
|
||||
func detectChromaLexerByFileName(fileName, fileLang string) (_ chroma.Lexer, byLang bool) {
|
||||
fileName, fileLang = normalizeFileNameLang(fileName, fileLang)
|
||||
fileExt := path.Ext(fileName)
|
||||
|
||||
// apply custom mapping for file extension, highest priority, for example:
|
||||
// * ".my-js" -> ".js"
|
||||
// * ".my-html" -> "HTML"
|
||||
if fileExt != "" {
|
||||
if val, ok := globalVars().highlightMapping[fileExt]; ok {
|
||||
if strings.HasPrefix(val, ".") {
|
||||
fileName = "dummy" + val
|
||||
fileLang = ""
|
||||
} else {
|
||||
fileLang = val
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// try to use language for lexer name
|
||||
if fileLang != "" {
|
||||
lexer := chromaLexers().lowerNameMap[strings.ToLower(fileLang)]
|
||||
if lexer != nil {
|
||||
return lexer, true
|
||||
}
|
||||
}
|
||||
|
||||
if fileName == "" {
|
||||
return lexers.Fallback, false
|
||||
}
|
||||
|
||||
// try base name
|
||||
{
|
||||
baseName := path.Base(fileName)
|
||||
if lexer, ok := chromaLexers().fileBaseMap[baseName]; ok {
|
||||
return lexer, false
|
||||
} else if lexer, ok = chromaLexers().fileBaseMap[mapKeyLowerPrefix+strings.ToLower(baseName)]; ok {
|
||||
return lexer, false
|
||||
}
|
||||
}
|
||||
|
||||
if fileExt == "" {
|
||||
return lexers.Fallback, false
|
||||
}
|
||||
|
||||
// try ext name
|
||||
{
|
||||
if lexer, ok := chromaLexers().fileExtMap[fileExt]; ok {
|
||||
return lexer, false
|
||||
} else if lexer, ok = chromaLexers().fileExtMap[mapKeyLowerPrefix+strings.ToLower(fileExt)]; ok {
|
||||
return lexer, false
|
||||
}
|
||||
}
|
||||
|
||||
// try file part match, for example: ".env.local" for "*.env.*"
|
||||
// it assumes that there must be a dot in filename (fileExt isn't empty)
|
||||
for _, item := range chromaLexers().fileParts {
|
||||
if strings.Contains(fileName, item.part) {
|
||||
return item.lexer, false
|
||||
}
|
||||
}
|
||||
return lexers.Fallback, false
|
||||
}
|
||||
|
||||
// detectChromaLexerWithAnalyze returns a chroma lexer by given file name, language and code content. All parameters can be optional.
|
||||
// When code content is provided, it will be slow if no lexer is found by file name or language.
|
||||
// If no lexer is found, it will return the fallback lexer.
|
||||
func detectChromaLexerWithAnalyze(fileName, lang string, code []byte) chroma.Lexer {
|
||||
lexer, byLang := detectChromaLexerByFileName(fileName, lang)
|
||||
|
||||
// if lang is provided, and it matches a lexer, use it directly
|
||||
if byLang {
|
||||
return lexer
|
||||
}
|
||||
|
||||
// if a lexer is detected and there is no conflict for the file extension, use it directly
|
||||
fileExt := path.Ext(fileName)
|
||||
_, hasConflicts := chromaLexers().conflictingExtLangMap[fileExt]
|
||||
if !hasConflicts && lexer != lexers.Fallback {
|
||||
return lexer
|
||||
}
|
||||
|
||||
// try to detect language by content, for best guessing for the language
|
||||
// when using "code" to detect, analyze.GetCodeLanguage is slow, it iterates many rules to detect language from content
|
||||
analyzedLanguage := analyze.GetCodeLanguage(fileName, code)
|
||||
lexer = DetectChromaLexerByFileName(fileName, analyzedLanguage)
|
||||
if lexer == lexers.Fallback {
|
||||
if analyzedLanguage != enry.OtherLanguage {
|
||||
log.Warn("No chroma lexer found for enry detected language: %s (file: %s), need to fix the language mapping between enry and chroma.", analyzedLanguage, fileName)
|
||||
}
|
||||
}
|
||||
return lexer
|
||||
}
|
||||
90
modules/highlight/lexerdetect_test.go
Normal file
90
modules/highlight/lexerdetect_test.go
Normal file
@@ -0,0 +1,90 @@
|
||||
// Copyright 2026 The Gitea Authors. All rights reserved.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
package highlight
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/alecthomas/chroma/v2/lexers"
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func BenchmarkDetectChromaLexerByFileName(b *testing.B) {
|
||||
for b.Loop() {
|
||||
// BenchmarkDetectChromaLexerByFileName-12 18214717 61.35 ns/op
|
||||
DetectChromaLexerByFileName("a.sql", "")
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkDetectChromaLexerWithAnalyze(b *testing.B) {
|
||||
b.StopTimer()
|
||||
code := []byte(strings.Repeat("SELECT * FROM table;\n", 1000))
|
||||
b.StartTimer()
|
||||
for b.Loop() {
|
||||
// BenchmarkRenderCodeSlowGuess-12 87946 13310 ns/op
|
||||
detectChromaLexerWithAnalyze("a", "", code)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkChromaAnalyze(b *testing.B) {
|
||||
b.StopTimer()
|
||||
code := strings.Repeat("SELECT * FROM table;\n", 1000)
|
||||
b.StartTimer()
|
||||
for b.Loop() {
|
||||
// comparing to detectChromaLexerWithAnalyze (go-enry), "chroma/lexers.Analyse" is very slow
|
||||
// BenchmarkChromaAnalyze-12 519 2247104 ns/op
|
||||
lexers.Analyse(code)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkRenderCodeByLexer(b *testing.B) {
|
||||
b.StopTimer()
|
||||
code := strings.Repeat("SELECT * FROM table;\n", 1000)
|
||||
lexer := DetectChromaLexerByFileName("a.sql", "")
|
||||
b.StartTimer()
|
||||
for b.Loop() {
|
||||
// Really slow .......
|
||||
// BenchmarkRenderCodeByLexer-12 22 47159038 ns/op
|
||||
RenderCodeByLexer(lexer, code)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDetectChromaLexer(t *testing.T) {
|
||||
globalVars().highlightMapping[".my-html"] = "HTML"
|
||||
t.Cleanup(func() { delete(globalVars().highlightMapping, ".my-html") })
|
||||
|
||||
cases := []struct {
|
||||
fileName string
|
||||
language string
|
||||
content string
|
||||
expected string
|
||||
}{
|
||||
{"test.py", "", "", "Python"},
|
||||
|
||||
{"any-file", "javascript", "", "JavaScript"},
|
||||
{"any-file", "", "/* vim: set filetype=python */", "Python"},
|
||||
{"any-file", "", "", "fallback"},
|
||||
|
||||
{"test.fs", "", "", "FSharp"},
|
||||
{"test.fs", "F#", "", "FSharp"},
|
||||
{"test.fs", "", "let x = 1", "FSharp"},
|
||||
|
||||
{"test.c", "", "", "C"},
|
||||
{"test.C", "", "", "C++"},
|
||||
{"OLD-CODE.PAS", "", "", "ObjectPascal"},
|
||||
{"test.my-html", "", "", "HTML"},
|
||||
|
||||
{"a.php", "", "", "PHP"},
|
||||
{"a.sql", "", "", "SQL"},
|
||||
{"dhcpd.conf", "", "", "ISCdhcpd"},
|
||||
{".env.my-production", "", "", "Bash"},
|
||||
}
|
||||
for _, c := range cases {
|
||||
lexer := detectChromaLexerWithAnalyze(c.fileName, c.language, []byte(c.content))
|
||||
if assert.NotNil(t, lexer, "case: %+v", c) {
|
||||
assert.Equal(t, c.expected, lexer.Config().Name, "case: %+v", c)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -72,7 +72,8 @@ func writeStrings(buf *bytes.Buffer, strs ...string) error {
|
||||
|
||||
func HighlightSearchResultCode(filename, language string, lineNums []int, code string) []*ResultLine {
|
||||
// we should highlight the whole code block first, otherwise it doesn't work well with multiple line highlighting
|
||||
hl, _ := highlight.RenderCodeFast(filename, language, code)
|
||||
lexer := highlight.DetectChromaLexerByFileName(filename, language)
|
||||
hl := highlight.RenderCodeByLexer(lexer, code)
|
||||
highlightedLines := strings.Split(string(hl), "\n")
|
||||
|
||||
// The lineNums outputted by render might not match the original lineNums, because "highlight" removes the last `\n`
|
||||
|
||||
@@ -56,7 +56,7 @@ func Render(ctx *markup.RenderContext, input io.Reader, output io.Writer) error
|
||||
}
|
||||
}()
|
||||
|
||||
lexer := highlight.GetChromaLexerWithFallback("", lang, nil) // don't use content to detect, it is too slow
|
||||
lexer := highlight.DetectChromaLexerByFileName("", lang) // don't use content to detect, it is too slow
|
||||
lexer = chroma.Coalesce(lexer)
|
||||
|
||||
sb := &strings.Builder{}
|
||||
|
||||
@@ -267,7 +267,7 @@ func renderBlame(ctx *context.Context, blameParts []*gitrepo.BlamePart, commitNa
|
||||
|
||||
bufContent := buf.Bytes()
|
||||
bufContent = charset.ToUTF8(bufContent, charset.ConvertOpts{})
|
||||
highlighted, lexerName := highlight.RenderCodeSlowGuess(path.Base(ctx.Repo.TreePath), language, util.UnsafeBytesToString(bufContent))
|
||||
highlighted, _, lexerDisplayName := highlight.RenderCodeSlowGuess(path.Base(ctx.Repo.TreePath), language, util.UnsafeBytesToString(bufContent))
|
||||
unsafeLines := highlight.UnsafeSplitHighlightedLines(highlighted)
|
||||
for i, br := range rows {
|
||||
var line template.HTML
|
||||
@@ -280,5 +280,5 @@ func renderBlame(ctx *context.Context, blameParts []*gitrepo.BlamePart, commitNa
|
||||
|
||||
ctx.Data["EscapeStatus"] = escapeStatus
|
||||
ctx.Data["BlameRows"] = rows
|
||||
ctx.Data["LexerName"] = lexerName
|
||||
ctx.Data["LexerName"] = lexerDisplayName
|
||||
}
|
||||
|
||||
@@ -40,6 +40,7 @@ import (
|
||||
"code.gitea.io/gitea/modules/translation"
|
||||
"code.gitea.io/gitea/modules/util"
|
||||
|
||||
"github.com/alecthomas/chroma/v2"
|
||||
"github.com/sergi/go-diff/diffmatchpatch"
|
||||
stdcharset "golang.org/x/net/html/charset"
|
||||
"golang.org/x/text/encoding"
|
||||
@@ -306,6 +307,7 @@ type DiffSection struct {
|
||||
language *diffVarMutable[string]
|
||||
highlightedLeftLines *diffVarMutable[map[int]template.HTML]
|
||||
highlightedRightLines *diffVarMutable[map[int]template.HTML]
|
||||
highlightLexer *diffVarMutable[chroma.Lexer]
|
||||
|
||||
FileName string
|
||||
Lines []*DiffLine
|
||||
@@ -347,8 +349,10 @@ func (diffSection *DiffSection) getLineContentForRender(lineIdx int, diffLine *D
|
||||
if setting.Git.DisableDiffHighlight {
|
||||
return template.HTML(html.EscapeString(diffLine.Content[1:]))
|
||||
}
|
||||
h, _ = highlight.RenderCodeFast(diffSection.FileName, fileLanguage, diffLine.Content[1:])
|
||||
return h
|
||||
if diffSection.highlightLexer.value == nil {
|
||||
diffSection.highlightLexer.value = highlight.DetectChromaLexerByFileName(diffSection.FileName, fileLanguage)
|
||||
}
|
||||
return highlight.RenderCodeByLexer(diffSection.highlightLexer.value, diffLine.Content[1:])
|
||||
}
|
||||
|
||||
func (diffSection *DiffSection) getDiffLineForRender(diffLineType DiffLineType, leftLine, rightLine *DiffLine, locale translation.Locale) DiffInline {
|
||||
@@ -391,6 +395,12 @@ func (diffSection *DiffSection) getDiffLineForRender(diffLineType DiffLineType,
|
||||
|
||||
// GetComputedInlineDiffFor computes inline diff for the given line.
|
||||
func (diffSection *DiffSection) GetComputedInlineDiffFor(diffLine *DiffLine, locale translation.Locale) DiffInline {
|
||||
defer func() {
|
||||
if err := recover(); err != nil {
|
||||
// the logic is too complex in this function, help to catch any panic because Golang template doesn't print the stack
|
||||
log.Error("panic in GetComputedInlineDiffFor: %v\nStack: %s", err, log.Stack(2))
|
||||
}
|
||||
}()
|
||||
// try to find equivalent diff line. ignore, otherwise
|
||||
switch diffLine.Type {
|
||||
case DiffLineSection:
|
||||
@@ -452,6 +462,7 @@ type DiffFile struct {
|
||||
|
||||
// for render purpose only, will be filled by the extra loop in GitDiffForRender, the maps of lines are 0-based
|
||||
language diffVarMutable[string]
|
||||
highlightRender diffVarMutable[chroma.Lexer] // cache render (atm: lexer) for current file, only detect once for line-by-line mode
|
||||
highlightedLeftLines diffVarMutable[map[int]template.HTML]
|
||||
highlightedRightLines diffVarMutable[map[int]template.HTML]
|
||||
}
|
||||
@@ -932,6 +943,7 @@ func skipToNextDiffHead(input *bufio.Reader) (line string, err error) {
|
||||
func newDiffSectionForDiffFile(curFile *DiffFile) *DiffSection {
|
||||
return &DiffSection{
|
||||
language: &curFile.language,
|
||||
highlightLexer: &curFile.highlightRender,
|
||||
highlightedLeftLines: &curFile.highlightedLeftLines,
|
||||
highlightedRightLines: &curFile.highlightedRightLines,
|
||||
}
|
||||
@@ -1395,7 +1407,8 @@ func highlightCodeLines(name, lang string, sections []*DiffSection, isLeft bool,
|
||||
}
|
||||
|
||||
content := util.UnsafeBytesToString(charset.ToUTF8(rawContent, charset.ConvertOpts{}))
|
||||
highlightedNewContent, _ := highlight.RenderCodeFast(name, lang, content)
|
||||
lexer := highlight.DetectChromaLexerByFileName(name, lang)
|
||||
highlightedNewContent := highlight.RenderCodeByLexer(lexer, content)
|
||||
unsafeLines := highlight.UnsafeSplitHighlightedLines(highlightedNewContent)
|
||||
lines := make(map[int]template.HTML, len(unsafeLines))
|
||||
// only save the highlighted lines we need, but not the whole file, to save memory
|
||||
|
||||
@@ -11,6 +11,8 @@ import (
|
||||
"io"
|
||||
|
||||
"code.gitea.io/gitea/modules/setting"
|
||||
|
||||
"github.com/alecthomas/chroma/v2"
|
||||
)
|
||||
|
||||
type BlobExcerptOptions struct {
|
||||
@@ -65,6 +67,7 @@ func BuildBlobExcerptDiffSection(filePath string, reader io.Reader, opts BlobExc
|
||||
chunkSize := BlobExcerptChunkSize
|
||||
section := &DiffSection{
|
||||
language: &diffVarMutable[string]{value: language},
|
||||
highlightLexer: &diffVarMutable[chroma.Lexer]{},
|
||||
highlightedLeftLines: &diffVarMutable[map[int]template.HTML]{},
|
||||
highlightedRightLines: &diffVarMutable[map[int]template.HTML]{},
|
||||
FileName: filePath,
|
||||
|
||||
@@ -76,8 +76,8 @@ func TestDiffWithHighlight(t *testing.T) {
|
||||
})
|
||||
|
||||
t.Run("ComplexDiff1", func(t *testing.T) {
|
||||
oldCode, _ := highlight.RenderCodeFast("a.go", "Go", `xxx || yyy`)
|
||||
newCode, _ := highlight.RenderCodeFast("a.go", "Go", `bot&xxx || bot&yyy`)
|
||||
oldCode, _, _ := highlight.RenderCodeSlowGuess("a.go", "Go", `xxx || yyy`)
|
||||
newCode, _, _ := highlight.RenderCodeSlowGuess("a.go", "Go", `bot&xxx || bot&yyy`)
|
||||
hcd := newHighlightCodeDiff()
|
||||
out := hcd.diffLineWithHighlight(DiffLineAdd, oldCode, newCode)
|
||||
assert.Equal(t, strings.ReplaceAll(`
|
||||
|
||||
Reference in New Issue
Block a user