docs-i18n: avoid ambiguous tagged body unwrap

2026-04-23 22:55:24 +00:00 · 2026-04-09 22:17:03 +08:00
parent 8e02a55a81
commit 6a16d66486
3 changed files with 97 additions and 7 deletions
--- a/scripts/docs-i18n/doc_chunked_raw.go
+++ b/scripts/docs-i18n/doc_chunked_raw.go
@@ -222,12 +222,14 @@ func sanitizeDocChunkProtocolWrappers(source, translated string) string {
 	if !hasUnexpectedTopLevelProtocolWrapper(source, trimmedTranslated) {
 		return translated
 	}
-	_, body, err := parseTaggedDocument(trimmedTranslated)
-	if err == nil {
-		if strings.TrimSpace(body) == "" {
-			return translated
+	if !hasAmbiguousTaggedBodyClose(source, trimmedTranslated) {
+		_, body, err := parseTaggedDocument(trimmedTranslated)
+		if err == nil {
+			if strings.TrimSpace(body) == "" {
+				return translated
+			}
+			return body
 		}
-		return body
 	}
 	body, ok := stripBodyOnlyWrapper(trimmedTranslated)
 	if !ok || strings.TrimSpace(body) == "" {
@@ -251,6 +253,18 @@ func stripBodyOnlyWrapper(text string) (string, bool) {
 	return trimTagNewlines(body), true
 }

+// hasAmbiguousTaggedBodyClose reports whether source mentions either body tag (compared
+// case-insensitively) and translated has the frontmatter open tag plus exactly one body close tag.
+func hasAmbiguousTaggedBodyClose(source, translated string) bool {
+	openTag, closeTag := strings.ToLower(bodyTagStart), strings.ToLower(bodyTagEnd)
+	src := strings.ToLower(source)
+	if !strings.Contains(src, openTag) && !strings.Contains(src, closeTag) {
+		return false
+	}
+	out := strings.ToLower(translated)
+	return strings.Contains(out, strings.ToLower(frontmatterTagStart)) && strings.Count(out, closeTag) == 1
+}
+
 func maskDocComponentTags(text string) (string, []string) {
 	placeholders := make([]string, 0, 4)
 	masked := docsComponentTagRE.ReplaceAllStringFunc(text, func(match string) string {
--- a/scripts/docs-i18n/doc_mode.go
+++ b/scripts/docs-i18n/doc_mode.go
@@ -91,8 +91,8 @@ func parseTaggedDocument(text string) (string, string, error) {
 	}
 	bodyStart += frontEnd + len(bodyTagStart)

-	bodyEnd := strings.LastIndex(text, bodyTagEnd)
-	if bodyEnd == -1 || bodyEnd < bodyStart {
+	bodyEnd := findTaggedBodyEnd(text, bodyStart)
+	if bodyEnd == -1 {
 		return "", "", fmt.Errorf("missing %s", bodyTagEnd)
 	}
 	body := trimTagNewlines(text[bodyStart:bodyEnd])
@@ -107,6 +107,31 @@ func parseTaggedDocument(text string) (string, string, error) {
 	return frontMatter, body, nil
 }

+// findTaggedBodyEnd scans text from bodyStart for bodyTagEnd and returns the index
+// of the last occurrence that is followed only by whitespace. It returns -1 when
+// no occurrence qualifies or when bodyStart lies outside text.
+func findTaggedBodyEnd(text string, bodyStart int) int {
+	if bodyStart < 0 || bodyStart > len(text) {
+		return -1
+	}
+	found, pos := -1, bodyStart
+	for {
+		rel := strings.Index(text[pos:], bodyTagEnd)
+		if rel < 0 {
+			return found
+		}
+		tagAt := pos + rel
+		pos = tagAt + len(bodyTagEnd)
+		// A close tag with non-whitespace text after it is not accepted as the end.
+		if strings.TrimSpace(text[pos:]) == "" {
+			found = tagAt
+		}
+		if pos >= len(text) {
+			return found
+		}
+	}
+}
+
 func trimTagNewlines(value string) string {
 	value = strings.TrimPrefix(value, "\n")
 	value = strings.TrimSuffix(value, "\n")
--- a/scripts/docs-i18n/doc_mode_test.go
+++ b/scripts/docs-i18n/doc_mode_test.go
@@ -252,6 +252,33 @@ func TestParseTaggedDocumentRejectsTrailingTextOutsideTags(t *testing.T) {
 	}
 }

+// A literal </body> inside the frontmatter must not be used as the body's close tag.
+func TestFindTaggedBodyEndSearchesFromBodyStart(t *testing.T) {
+	t.Parallel()
+
+	const want = "Translated body"
+	doc := strings.Join([]string{
+		"<frontmatter>",
+		"summary: literal </body> token in frontmatter",
+		"</frontmatter>",
+		"<body>",
+		want,
+		"</body>",
+	}, "\n")
+	openAt := strings.Index(doc, bodyTagStart)
+	if openAt == -1 {
+		t.Fatal("expected body tag in test input")
+	}
+	contentStart := openAt + len(bodyTagStart)
+	closeAt := findTaggedBodyEnd(doc, contentStart)
+	if closeAt == -1 {
+		t.Fatal("expected closing body tag to be found")
+	}
+	if got := trimTagNewlines(doc[contentStart:closeAt]); got != want {
+		t.Fatalf("expected body slice to ignore pre-body literal token, got %q", got)
+	}
+}
+
 func TestSplitDocBodyIntoBlocksKeepsFenceTogether(t *testing.T) {
 	t.Parallel()

@@ -512,6 +539,30 @@ func TestSanitizeDocChunkProtocolWrappersKeepsLegitimateTopLevelBodyBlock(t *tes
 	}
 }

+// The leaked wrapper is unterminated and its only </body> is a quoted literal, so output must equal input.
+func TestSanitizeDocChunkProtocolWrappersKeepsAmbiguousTaggedWrapperForRetry(t *testing.T) {
+	t.Parallel()
+
+	src := strings.Join([]string{
+		"Paragraph mentioning literal tokens `<body>` and `</body>`.",
+		"",
+		"Closing example:",
+		"</body>",
+	}, "\n")
+	leaked := strings.Join([]string{
+		"<frontmatter>",
+		"title: leaked",
+		"</frontmatter>",
+		"",
+		"<body>",
+		"提到字面量 `<body>` 和 `</body>` 的段落。",
+	}, "\n")
+
+	if got := sanitizeDocChunkProtocolWrappers(src, leaked); got != leaked {
+		t.Fatalf("expected ambiguous tagged wrapper to remain unchanged for retry\nwant:\n%s\ngot:\n%s", leaked, got)
+	}
+}
+
 func TestSplitDocBodyIntoBlocksKeepsInfoStringExampleInsideFence(t *testing.T) {
 	t.Parallel()