diff --git a/modules/git/catfile_batch_reader.go b/modules/git/catfile_batch_reader.go index 7d2e496ace..8a0b342079 100644 --- a/modules/git/catfile_batch_reader.go +++ b/modules/git/catfile_batch_reader.go @@ -12,11 +12,15 @@ import ( "math" "strconv" "strings" + "sync/atomic" + "time" "code.gitea.io/gitea/modules/git/gitcmd" "code.gitea.io/gitea/modules/log" ) +var catFileBatchDebugWaitClose atomic.Int64 + type catFileBatchCommunicator struct { cancel context.CancelFunc reqWriter io.Writer @@ -36,7 +40,14 @@ func newCatFileBatch(ctx context.Context, repoPath string, cmdCatFile *gitcmd.Co ctx, ctxCancel := context.WithCancelCause(ctx) // We often want to feed the commits in order into cat-file --batch, followed by their trees and subtrees as necessary. - stdinWriter, stdoutReader, pipeClose := cmdCatFile.MakeStdinStdoutPipe() + stdinWriter, stdoutReader, stdPipeClose := cmdCatFile.MakeStdinStdoutPipe() + pipeClose := func() { + if delay := catFileBatchDebugWaitClose.Load(); delay > 0 { + time.Sleep(time.Duration(delay)) // for testing purpose only + } + stdPipeClose() + } + ret = &catFileBatchCommunicator{ debugGitCmd: cmdCatFile, cancel: func() { ctxCancel(nil) }, diff --git a/modules/git/catfile_batch_test.go b/modules/git/catfile_batch_test.go index 8f6b1f5eff..69662ffc1a 100644 --- a/modules/git/catfile_batch_test.go +++ b/modules/git/catfile_batch_test.go @@ -5,9 +5,11 @@ package git import ( "io" + "os" "path/filepath" "sync" "testing" + "time" "code.gitea.io/gitea/modules/test" @@ -37,6 +39,45 @@ func testCatFileBatch(t *testing.T) { require.Error(t, err) }) + simulateQueryTerminated := func(pipeCloseDelay, pipeReadDelay time.Duration) (errRead error) { + catFileBatchDebugWaitClose.Store(int64(pipeCloseDelay)) + defer catFileBatchDebugWaitClose.Store(0) + batch, err := NewBatch(t.Context(), filepath.Join(testReposDir, "repo1_bare")) + require.NoError(t, err) + defer batch.Close() + _, _ = batch.QueryInfo("e2129701f1a4d54dc44f03c93bca0a2aec7c5449") + var c *catFileBatchCommunicator + switch b := batch.(type) { + case *catFileBatchLegacy: + c = b.batchCheck + _, _ = c.reqWriter.Write([]byte("in-complete-line-")) + case *catFileBatchCommand: + c = b.batch + _, _ = c.reqWriter.Write([]byte("info")) + default: + t.FailNow() + } + + wg := sync.WaitGroup{} + wg.Go(func() { + time.Sleep(pipeReadDelay) + var n int + n, errRead = c.respReader.Read(make([]byte, 100)) + assert.Zero(t, n) + }) + time.Sleep(10 * time.Millisecond) + c.debugGitCmd.DebugKill() + wg.Wait() + return errRead + } + + t.Run("QueryTerminated", func(t *testing.T) { + err := simulateQueryTerminated(0, 20*time.Millisecond) + assert.ErrorIs(t, err, os.ErrClosed) // pipes are closed faster + err = simulateQueryTerminated(40*time.Millisecond, 20*time.Millisecond) + assert.ErrorIs(t, err, io.EOF) // reader is faster + }) + batch, err := NewBatch(t.Context(), filepath.Join(testReposDir, "repo1_bare")) require.NoError(t, err) defer batch.Close() @@ -60,30 +101,4 @@ func testCatFileBatch(t *testing.T) { require.NoError(t, err) require.Equal(t, "file1\n", string(content)) }) - - t.Run("QueryTerminated", func(t *testing.T) { - var c *catFileBatchCommunicator - switch b := batch.(type) { - case *catFileBatchLegacy: - c = b.batchCheck - _, _ = c.reqWriter.Write([]byte("in-complete-line-")) - case *catFileBatchCommand: - c = b.batch - _, _ = c.reqWriter.Write([]byte("info")) - default: - t.FailNow() - return - } - - wg := sync.WaitGroup{} - wg.Go(func() { - buf := make([]byte, 100) - _, _ = c.respReader.Read(buf) - n, errRead := c.respReader.Read(buf) - assert.Zero(t, n) - assert.ErrorIs(t, errRead, io.EOF) // the pipe is closed due to command being killed - }) - c.debugGitCmd.DebugKill() - wg.Wait() - }) } diff --git a/modules/git/gitcmd/pipe.go b/modules/git/gitcmd/pipe.go index a1e91fdf8e..d0ce3e2dc6 100644 --- a/modules/git/gitcmd/pipe.go +++ b/modules/git/gitcmd/pipe.go @@ -9,6 +9,14 @@ import ( ) type PipeBufferReader interface { + // Read should be used in the same goroutine as command's Wait + // When Reader in one goroutine, command's Wait in another goroutine, then the command exits, the pipe will be closed: + // * If the Reader goroutine reads faster, it will read all remaining data and then get io.EOF + // * But this io.EOF doesn't mean the Reader has gotten complete data, the data might still be corrupted + // * If the Reader goroutine reads slower, it will get os.ErrClosed because the os.Pipe is closed ahead when the command exits + // + // When using 2 goroutines, no clear solution to distinguish these two cases or make Reader knows whether the data is complete + // It should avoid using Reader in a different goroutine than the command if the Read error needs to be handled. Read(p []byte) (n int, err error) Bytes() []byte } diff --git a/services/gitdiff/gitdiff.go b/services/gitdiff/gitdiff.go index a62177c45c..6b29582208 100644 --- a/services/gitdiff/gitdiff.go +++ b/services/gitdiff/gitdiff.go @@ -16,6 +16,7 @@ import ( "path" "sort" "strings" + "time" "code.gitea.io/gitea/models/db" git_model "code.gitea.io/gitea/models/git" @@ -123,8 +124,14 @@ type DiffHTMLOperation struct { // BlobExcerptChunkSize represent max lines of excerpt const BlobExcerptChunkSize = 20 -// MaxDiffHighlightEntireFileSize is the maximum file size that will be highlighted with "entire file diff" -const MaxDiffHighlightEntireFileSize = 1 * 1024 * 1024 +// Chroma seems extremely slow when highlighting large files, it might take dozens or hundreds of milliseconds. +// When fully highlighting a diff with a lot of large files, it would take many seconds or even dozens of seconds. +// So, don't highlight the entire file if it's too large, or highlighting takes too long. +// When there is no full-file highlighting, the legacy "line-by-line" highlighting is still applied as the fallback. +const ( + MaxFullFileHighlightSizeLimit = 256 * 1024 + MaxFullFileHighlightTimeLimit = 2 * time.Second +) // GetType returns the type of DiffLine. func (d *DiffLine) GetType() int { @@ -564,7 +571,7 @@ func getCommitFileLineCountAndLimitedContent(commit *git.Commit, filePath string if err != nil { return 0, nil } - w := &limitByteWriter{limit: MaxDiffHighlightEntireFileSize + 1} + w := &limitByteWriter{limit: MaxFullFileHighlightSizeLimit + 1} lineCount, err = blob.GetBlobLineCount(w) if err != nil { return 0, nil @@ -1317,6 +1324,8 @@ func GetDiffForRender(ctx context.Context, repoLink string, gitRepo *git.Reposit return nil, err } + startTime := time.Now() + checker, err := attribute.NewBatchChecker(gitRepo, opts.AfterCommitID, []string{attribute.LinguistVendored, attribute.LinguistGenerated, attribute.LinguistLanguage, attribute.GitlabLanguage, attribute.Diff}) if err != nil { return nil, err @@ -1356,7 +1365,8 @@ func GetDiffForRender(ctx context.Context, repoLink string, gitRepo *git.Reposit diffFile.Sections = append(diffFile.Sections, tailSection) } - shouldFullFileHighlight := !setting.Git.DisableDiffHighlight && attrDiff.Value() == "" + shouldFullFileHighlight := attrDiff.Value() == "" // only do highlight if no custom diff command + shouldFullFileHighlight = shouldFullFileHighlight && time.Since(startTime) < MaxFullFileHighlightTimeLimit if shouldFullFileHighlight { if limitedContent.LeftContent != nil { diffFile.highlightedLeftLines.value = highlightCodeLinesForDiffFile(diffFile, true /* left */, limitedContent.LeftContent.buf.Bytes()) @@ -1380,7 +1390,7 @@ func highlightCodeLinesForDiffFile(diffFile *DiffFile, isLeft bool, rawContent [ } func highlightCodeLines(name, lang string, sections []*DiffSection, isLeft bool, rawContent []byte) map[int]template.HTML { - if setting.Git.DisableDiffHighlight || len(rawContent) > MaxDiffHighlightEntireFileSize { + if setting.Git.DisableDiffHighlight || len(rawContent) > MaxFullFileHighlightSizeLimit { return nil } diff --git a/services/gitdiff/highlightdiff.go b/services/gitdiff/highlightdiff.go index 47b41319cb..1de3963788 100644 --- a/services/gitdiff/highlightdiff.go +++ b/services/gitdiff/highlightdiff.go @@ -298,15 +298,21 @@ func (hcd *highlightCodeDiff) convertToPlaceholders(htmlContent template.HTML) s return res.String() } -func (hcd *highlightCodeDiff) extractNextPlaceholder(buf []byte, lastIdx int) (idx int, placeholder rune, runeLen int, token string) { - for idx = lastIdx; idx < len(buf); { - placeholder, runeLen = utf8.DecodeRune(buf[idx:]) - if token = hcd.placeholderTokenMap[placeholder]; token != "" { - break +// recoverOneRune tries to recover one rune +// * if the rune is a placeholder, it will be recovered to the corresponding content +// * otherwise it will be returned as is +func (hcd *highlightCodeDiff) recoverOneRune(buf []byte) (r rune, runeLen int, isSingleTag bool, recovered string) { + r, runeLen = utf8.DecodeRune(buf) + token := hcd.placeholderTokenMap[r] + if token == "" { + return r, runeLen, false, "" // rune itself, not a placeholder + } else if token[0] == '<' { + if token[1] == '<' { + return 0, runeLen, false, token[1 : len(token)-1] // full tag `<content>`, recover to `content` } - idx += runeLen + return r, runeLen, true, token // single tag } - return idx, placeholder, runeLen, token + return 0, runeLen, false, token // HTML entity } func (hcd *highlightCodeDiff) recoverOneDiff(str string) template.HTML { @@ -315,49 +321,65 @@ func (hcd *highlightCodeDiff) recoverOneDiff(str string) template.HTML { var diffCodeOpenTag string diffCodeCloseTag := hcd.placeholderTokenMap[hcd.diffCodeClose] strBytes := util.UnsafeStringToBytes(str) + + // this loop is slightly longer than expected, for performance consideration for idx := 0; idx < len(strBytes); { - newIdx, placeholder, lastRuneLen, token := hcd.extractNextPlaceholder(strBytes, idx) - if newIdx != idx { + // take a look at the next rune + r, runeLen, isSingleTag, recovered := hcd.recoverOneRune(strBytes[idx:]) + idx += runeLen + + // loop section 1: if it isn't a single tag, then try to find the following runes until the next single tag, and recover them together + if !isSingleTag { if diffCodeOpenTag != "" { + // start the "added/removed diff tag" if the current token is in the diff part sb.WriteString(diffCodeOpenTag) - sb.Write(strBytes[idx:newIdx]) - sb.WriteString(diffCodeCloseTag) - } else { - sb.Write(strBytes[idx:newIdx]) } - } // else: no text content before, the current token is a placeholder - if token == "" { - break // reaches the string end, no more placeholder - } - idx = newIdx + lastRuneLen - - // for HTML entity - if token[0] == '&' { - sb.WriteString(token) - continue - } - - // for various HTML tags - var recovered string - if token[1] == '<' { // full tag `<content>`, recover to `content` - recovered = token[1 : len(token)-1] - if diffCodeOpenTag != "" { - recovered = diffCodeOpenTag + recovered + diffCodeCloseTag - } // else: just use the recovered content - } else if token[1] != '/' { // opening tag - if placeholder == hcd.diffCodeAddedOpen || placeholder == hcd.diffCodeRemovedOpen { - diffCodeOpenTag = token + if recovered != "" { + sb.WriteString(recovered) + } else { + sb.WriteRune(r) + } + // inner loop to recover following runes until the next single tag + for idx < len(strBytes) { + r, runeLen, isSingleTag, recovered = hcd.recoverOneRune(strBytes[idx:]) + idx += runeLen + if isSingleTag { + break + } + if recovered != "" { + sb.WriteString(recovered) + } else { + sb.WriteRune(r) + } + } + if diffCodeOpenTag != "" { + // end the "added/removed diff tag" if the current token is in the diff part + sb.WriteString(diffCodeCloseTag) + } + } + + if !isSingleTag { + break // the inner loop has already consumed all remaining runes, no more single tag found + } + + // loop section 2: for opening/closing HTML tags + placeholder := r + if recovered[1] != '/' { // opening tag + if placeholder == hcd.diffCodeAddedOpen || placeholder == hcd.diffCodeRemovedOpen { + diffCodeOpenTag = recovered + recovered = "" } else { - recovered = token tagStack = append(tagStack, recovered) } } else { // closing tag if placeholder == hcd.diffCodeClose { diffCodeOpenTag = "" // the highlighted diff is closed, no more diff + recovered = "" } else if len(tagStack) != 0 { - recovered = token tagStack = tagStack[:len(tagStack)-1] - } // else: if no opening tag in stack yet, skip the closing tag + } else { + recovered = "" + } } sb.WriteString(recovered) } diff --git a/services/gitdiff/highlightdiff_test.go b/services/gitdiff/highlightdiff_test.go index dd7e0984c7..b99b7e3675 100644 --- a/services/gitdiff/highlightdiff_test.go +++ b/services/gitdiff/highlightdiff_test.go @@ -77,15 +77,13 @@ func TestDiffWithHighlight(t *testing.T) { t.Run("ComplexDiff1", func(t *testing.T) { oldCode, _ := highlight.RenderCodeFast("a.go", "Go", `xxx || yyy`) - newCode, _ := highlight.RenderCodeFast("a.go", "Go", `bot.xxx || bot.yyy`) + newCode, _ := highlight.RenderCodeFast("a.go", "Go", `bot&xxx || bot&yyy`) hcd := newHighlightCodeDiff() out := hcd.diffLineWithHighlight(DiffLineAdd, oldCode, newCode) assert.Equal(t, strings.ReplaceAll(` -bot -. +bot& xxx || -bot -. +bot& yyy`, "\n", ""), string(out)) })