docs-i18n: avoid ambiguous tagged body unwrap

This commit is contained in:
masonxhuang
2026-04-09 22:17:03 +08:00
parent 8e02a55a81
commit 6a16d66486
3 changed files with 97 additions and 7 deletions

View File

@@ -222,12 +222,14 @@ func sanitizeDocChunkProtocolWrappers(source, translated string) string {
if !hasUnexpectedTopLevelProtocolWrapper(source, trimmedTranslated) {
return translated
}
_, body, err := parseTaggedDocument(trimmedTranslated)
if err == nil {
if strings.TrimSpace(body) == "" {
return translated
if !hasAmbiguousTaggedBodyClose(source, trimmedTranslated) {
_, body, err := parseTaggedDocument(trimmedTranslated)
if err == nil {
if strings.TrimSpace(body) == "" {
return translated
}
return body
}
return body
}
body, ok := stripBodyOnlyWrapper(trimmedTranslated)
if !ok || strings.TrimSpace(body) == "" {
@@ -251,6 +253,18 @@ func stripBodyOnlyWrapper(text string) (string, bool) {
return trimTagNewlines(body), true
}
// hasAmbiguousTaggedBodyClose reports whether all of the following hold,
// comparing case-insensitively:
//
//   - source contains bodyTagStart or bodyTagEnd,
//   - translated contains frontmatterTagStart, and
//   - translated contains bodyTagEnd exactly once.
//
// NOTE(review): presumably the lone closing tag in translated may then be
// literal content carried over from source rather than the real end of the
// wrapper, so it cannot be trusted for unwrapping — confirm against callers.
func hasAmbiguousTaggedBodyClose(source, translated string) bool {
	openTag := strings.ToLower(bodyTagStart)
	closeTag := strings.ToLower(bodyTagEnd)

	// A source that never mentions either body tag cannot have leaked one
	// into the translation as literal text.
	lowerSource := strings.ToLower(source)
	sourceMentionsBodyTag := strings.Contains(lowerSource, openTag) || strings.Contains(lowerSource, closeTag)
	if !sourceMentionsBodyTag {
		return false
	}

	lowerTranslated := strings.ToLower(translated)
	if !strings.Contains(lowerTranslated, strings.ToLower(frontmatterTagStart)) {
		return false
	}
	return strings.Count(lowerTranslated, closeTag) == 1
}
func maskDocComponentTags(text string) (string, []string) {
placeholders := make([]string, 0, 4)
masked := docsComponentTagRE.ReplaceAllStringFunc(text, func(match string) string {

View File

@@ -91,8 +91,8 @@ func parseTaggedDocument(text string) (string, string, error) {
}
bodyStart += frontEnd + len(bodyTagStart)
bodyEnd := strings.LastIndex(text, bodyTagEnd)
if bodyEnd == -1 || bodyEnd < bodyStart {
bodyEnd := findTaggedBodyEnd(text, bodyStart)
if bodyEnd == -1 {
return "", "", fmt.Errorf("missing %s", bodyTagEnd)
}
body := trimTagNewlines(text[bodyStart:bodyEnd])
@@ -107,6 +107,31 @@ func parseTaggedDocument(text string) (string, string, error) {
return frontMatter, body, nil
}
// findTaggedBodyEnd returns the byte index in text of the bodyTagEnd
// occurrence that closes the body beginning at bodyStart, or -1 when no
// occurrence qualifies.
//
// The scan starts at bodyStart, so occurrences earlier in text are never
// considered. An occurrence is accepted only when everything after it is
// whitespace (as defined by strings.TrimSpace); if more than one occurrence
// qualifies, the last one scanned is returned. A bodyStart outside
// [0, len(text)] yields -1.
func findTaggedBodyEnd(text string, bodyStart int) int {
	if bodyStart < 0 || bodyStart > len(text) {
		return -1
	}

	tail := text[bodyStart:]
	tagLen := len(bodyTagEnd)
	match := -1
	cursor := 0 // position in tail where the next search begins
	for {
		rel := strings.Index(tail[cursor:], bodyTagEnd)
		if rel == -1 {
			break
		}
		tagAt := bodyStart + cursor + rel
		// Only a tag followed by nothing but whitespace may end the body.
		if strings.TrimSpace(text[tagAt+tagLen:]) == "" {
			match = tagAt
		}
		// Resume just past this occurrence.
		cursor += rel + tagLen
		if cursor >= len(tail) {
			break
		}
	}
	return match
}
func trimTagNewlines(value string) string {
value = strings.TrimPrefix(value, "\n")
value = strings.TrimSuffix(value, "\n")

View File

@@ -252,6 +252,33 @@ func TestParseTaggedDocumentRejectsTrailingTextOutsideTags(t *testing.T) {
}
}
// TestFindTaggedBodyEndSearchesFromBodyStart checks that findTaggedBodyEnd,
// given the offset just past the opening body tag, locates the final closing
// tag even though the frontmatter also contains a literal "</body>" token.
func TestFindTaggedBodyEndSearchesFromBodyStart(t *testing.T) {
	t.Parallel()

	// The frontmatter deliberately carries a literal closing-body token.
	const text = "<frontmatter>\n" +
		"summary: literal </body> token in frontmatter\n" +
		"</frontmatter>\n" +
		"<body>\n" +
		"Translated body\n" +
		"</body>"

	openAt := strings.Index(text, bodyTagStart)
	if openAt == -1 {
		t.Fatal("expected body tag in test input")
	}
	contentStart := openAt + len(bodyTagStart)

	closeAt := findTaggedBodyEnd(text, contentStart)
	if closeAt == -1 {
		t.Fatal("expected closing body tag to be found")
	}

	if got := trimTagNewlines(text[contentStart:closeAt]); got != "Translated body" {
		t.Fatalf("expected body slice to ignore pre-body literal token, got %q", got)
	}
}
func TestSplitDocBodyIntoBlocksKeepsFenceTogether(t *testing.T) {
t.Parallel()
@@ -512,6 +539,30 @@ func TestSanitizeDocChunkProtocolWrappersKeepsLegitimateTopLevelBodyBlock(t *tes
}
}
// TestSanitizeDocChunkProtocolWrappersKeepsAmbiguousTaggedWrapperForRetry
// checks that a translation with leaked frontmatter/body wrapper tags is
// returned unchanged when the source itself mentions the body tags as
// literal text and the translation's only "</body>" is inside its content.
func TestSanitizeDocChunkProtocolWrappersKeepsAmbiguousTaggedWrapperForRetry(t *testing.T) {
	t.Parallel()

	sourceLines := []string{
		"Paragraph mentioning literal tokens `<body>` and `</body>`.",
		"",
		"Closing example:",
		"</body>",
	}
	// The wrapper is opened but never closed; the sole closing tag is an
	// inline literal within the translated paragraph.
	translatedLines := []string{
		"<frontmatter>",
		"title: leaked",
		"</frontmatter>",
		"",
		"<body>",
		"提到字面量 `<body>` 和 `</body>` 的段落。",
	}

	source := strings.Join(sourceLines, "\n")
	translated := strings.Join(translatedLines, "\n")

	if got := sanitizeDocChunkProtocolWrappers(source, translated); got != translated {
		t.Fatalf("expected ambiguous tagged wrapper to remain unchanged for retry\nwant:\n%s\ngot:\n%s", translated, got)
	}
}
func TestSplitDocBodyIntoBlocksKeepsInfoStringExampleInsideFence(t *testing.T) {
t.Parallel()