From be989390011f7e31c129336b49efe1db86c9e79b Mon Sep 17 00:00:00 2001 From: hassandotcms Date: Tue, 30 Jun 2026 19:21:44 +0500 Subject: [PATCH] feat(dotai): embed Story Block fields as Markdown instead of HTML-stripped text (#36003) parseBlockEditor now returns StoryBlockMap.toMarkdown() directly instead of rendering to HTML and stripping it with Tika. Markdown is already plain text and preserves the structure (tables, code blocks, lists, headings) that the Tika path flattened away. The markdown is returned raw -- not routed through parseText (collapses newlines) or parseHTML (Tika re-strips) -- so newline-delimited structure survives the extraction layer. Adds ContentToStringUtilTest asserting a Story Block with a table and a fenced code block extracts with that structure intact, and registers it in MainSuite3a. --- .../dotcms/ai/util/ContentToStringUtil.java | 7 +- .../src/test/java/com/dotcms/MainSuite3a.java | 2 + .../ai/util/ContentToStringUtilTest.java | 120 ++++++++++++++++++ 3 files changed, 128 insertions(+), 1 deletion(-) create mode 100644 dotcms-integration/src/test/java/com/dotcms/ai/util/ContentToStringUtilTest.java diff --git a/dotCMS/src/main/java/com/dotcms/ai/util/ContentToStringUtil.java b/dotCMS/src/main/java/com/dotcms/ai/util/ContentToStringUtil.java index 300d4fd74bd4..b36420353eca 100644 --- a/dotCMS/src/main/java/com/dotcms/ai/util/ContentToStringUtil.java +++ b/dotCMS/src/main/java/com/dotcms/ai/util/ContentToStringUtil.java @@ -133,8 +133,13 @@ private Optional parseText(@NotNull String val) { private Optional parseBlockEditor(@NotNull String val) { + // #36003: embed Story Block (Tiptap JSON) as Markdown, preserving structure (tables, + // code blocks, lists, headings). Markdown is already plain text, so it is returned + // directly: do NOT route it through parseText (collapses newlines) or parseHTML (Tika + // re-strips the markup) — either would re-flatten exactly the structure we want to keep. 
final StoryBlockMap storyBlockMap = new StoryBlockMap(val); - return parseHTML(storyBlockMap.toHtml()); + final String markdown = storyBlockMap.toMarkdown(); + return UtilMethods.isSet(markdown) ? Optional.of(markdown) : Optional.empty(); } diff --git a/dotcms-integration/src/test/java/com/dotcms/MainSuite3a.java b/dotcms-integration/src/test/java/com/dotcms/MainSuite3a.java index e8a969b74ae0..8c5df1cb27d0 100644 --- a/dotcms-integration/src/test/java/com/dotcms/MainSuite3a.java +++ b/dotcms-integration/src/test/java/com/dotcms/MainSuite3a.java @@ -1,6 +1,7 @@ package com.dotcms; import com.dotcms.ai.api.OpenAIVisionAPIImplTest; +import com.dotcms.ai.util.ContentToStringUtilTest; import com.dotcms.contenttype.business.StoryBlockValidationTest; import com.dotcms.contenttype.test.StoryBlockUtilTest; import com.dotcms.cost.RequestCostReportTest; @@ -85,6 +86,7 @@ Task260407AddBaseTypeColumnToIdentifierTest.class, Task260615AlterClusterIdLengthTest.class, ImportContentletsActionSmokeTest.class, + ContentToStringUtilTest.class, }) public class MainSuite3a { diff --git a/dotcms-integration/src/test/java/com/dotcms/ai/util/ContentToStringUtilTest.java b/dotcms-integration/src/test/java/com/dotcms/ai/util/ContentToStringUtilTest.java new file mode 100644 index 000000000000..f39fbc30d257 --- /dev/null +++ b/dotcms-integration/src/test/java/com/dotcms/ai/util/ContentToStringUtilTest.java @@ -0,0 +1,120 @@ +package com.dotcms.ai.util; + +import com.dotcms.contenttype.model.field.Field; +import com.dotcms.contenttype.model.field.StoryBlockField; +import com.dotcms.contenttype.model.type.ContentType; +import com.dotcms.datagen.ContentTypeDataGen; +import com.dotcms.datagen.FieldDataGen; +import com.dotcms.util.IntegrationTestInitService; +import com.dotmarketing.portlets.contentlet.model.Contentlet; +import com.dotmarketing.util.UUIDGenerator; +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.Test; + +import java.util.ArrayList; +import java.util.List; 
+import java.util.Optional; + +/** + * Integration tests for {@link ContentToStringUtil} focused on how Story Block (Block Editor) + * fields are turned into the text that dotAI embeds. + * <p>
+ * #36003: a Story Block field is now extracted as Markdown (via {@code StoryBlockMap.toMarkdown()}) + * instead of being rendered to HTML and stripped by Tika. Markdown preserves the structure + * (tables, code blocks, lists, headings) that the old Tika path flattened away. These tests assert + * that the structure survives into the extracted text. + * + * @author hassandotcms + */ +public class ContentToStringUtilTest { + + /** + * A Story Block (Tiptap/ProseMirror) document holding a heading, a table and a fenced code + * block — exactly the structure that the old HTML + Tika path flattened to plain text. + */ + private static final String STORY_BLOCK_WITH_TABLE_AND_CODE = + "{" + + "\"type\":\"doc\"," + + "\"content\":[" + + "{" + + "\"type\":\"heading\",\"attrs\":{\"level\":2}," + + "\"content\":[{\"type\":\"text\",\"text\":\"Supported Languages\"}]" + + "}," + + "{" + + "\"type\":\"table\",\"content\":[" + + "{" + + "\"type\":\"tableRow\",\"content\":[" + + "{\"type\":\"tableHeader\",\"attrs\":{\"colspan\":1,\"rowspan\":1}," + + "\"content\":[{\"type\":\"paragraph\",\"content\":[{\"type\":\"text\",\"text\":\"Language\"}]}]}," + + "{\"type\":\"tableHeader\",\"attrs\":{\"colspan\":1,\"rowspan\":1}," + + "\"content\":[{\"type\":\"paragraph\",\"content\":[{\"type\":\"text\",\"text\":\"Use Case\"}]}]}" + + "]" + + "}," + + "{" + + "\"type\":\"tableRow\",\"content\":[" + + "{\"type\":\"tableCell\",\"attrs\":{\"colspan\":1,\"rowspan\":1}," + + "\"content\":[{\"type\":\"paragraph\",\"content\":[{\"type\":\"text\",\"text\":\"Java\"}]}]}," + + "{\"type\":\"tableCell\",\"attrs\":{\"colspan\":1,\"rowspan\":1}," + + "\"content\":[{\"type\":\"paragraph\",\"content\":[{\"type\":\"text\",\"text\":\"Enterprise applications\"}]}]}" + + "]" + + "}" + + "]" + + "}," + + "{" + + "\"type\":\"codeBlock\",\"attrs\":{\"language\":\"java\"}," + + "\"content\":[{\"type\":\"text\",\"text\":\"System.out.println(\\\"Hello, World!\\\");\"}]" + + "}" + + "]" + + "}"; + + @BeforeClass + 
public static void beforeClass() throws Exception { + IntegrationTestInitService.getInstance().init(); + } + + /** + * Method to test: {@link ContentToStringUtil#turnContentletIntoString(Contentlet)} + * Given Scenario: A contentlet with a Story Block field whose value is a Tiptap JSON document + * containing a heading, a table and a fenced code block. + * Expected Result: The extracted text is Markdown with the structure preserved — table pipes + * and separator row, a fenced code block with its language, and the heading marker — none of + * which survive the old HTML-to-Tika flattening. + */ + @Test + public void test_story_block_embedded_as_markdown_preserves_table_and_code() { + final ContentTypeDataGen dataGen = new ContentTypeDataGen(); + final List inputFields = new ArrayList<>(); + inputFields.add(new FieldDataGen().velocityVarName("title").next()); + inputFields.add(new FieldDataGen().type(StoryBlockField.class).velocityVarName("body").next()); + dataGen.fields(inputFields); + final ContentType contentType = dataGen.nextPersisted(); + + final Contentlet contentlet = new Contentlet(); + contentlet.setContentType(contentType); + contentlet.setIdentifier(UUIDGenerator.generateUuid()); + contentlet.setProperty("title", "Supported Languages"); + contentlet.setProperty("body", STORY_BLOCK_WITH_TABLE_AND_CODE); + + final Optional extracted = ContentToStringUtil.impl.get().turnContentletIntoString(contentlet); + + Assert.assertTrue("Story Block field should produce extractable text", extracted.isPresent()); + final String text = extracted.get(); + + // Table structure survives as Markdown: pipe-delimited cells + the separator row. 
+ Assert.assertTrue("Table cells should be pipe-delimited Markdown, got: " + text, text.contains("|")); + Assert.assertTrue("Table separator row should be present, got: " + text, text.contains("---")); + Assert.assertTrue("Table header text should survive", text.contains("Language") && text.contains("Use Case")); + + // Code block survives as a fenced block carrying its language. + Assert.assertTrue("Fenced code block with language should be present, got: " + text, text.contains("```java")); + Assert.assertTrue("Code body should survive", text.contains("System.out.println(\"Hello, World!\");")); + + // Heading survives as a Markdown heading marker. + Assert.assertTrue("Heading marker should be present, got: " + text, text.contains("## Supported Languages")); + + // Sanity: this is Markdown, not HTML that Tika stripped — no markup tags leaked through. + Assert.assertFalse("Extracted text must not contain HTML tags", text.contains("