diff --git a/dotCMS/src/main/java/com/dotcms/ai/util/ContentToStringUtil.java b/dotCMS/src/main/java/com/dotcms/ai/util/ContentToStringUtil.java index 300d4fd74bd4..b36420353eca 100644 --- a/dotCMS/src/main/java/com/dotcms/ai/util/ContentToStringUtil.java +++ b/dotCMS/src/main/java/com/dotcms/ai/util/ContentToStringUtil.java @@ -133,8 +133,13 @@ private Optional parseText(@NotNull String val) { private Optional parseBlockEditor(@NotNull String val) { + // #36003: embed Story Block (Tiptap JSON) as Markdown, preserving structure (tables, + // code blocks, lists, headings). Markdown is already plain text, so it is returned + // directly: do NOT route it through parseText (collapses newlines) or parseHTML (Tika + // re-strips the markup) — either would re-flatten exactly the structure we want to keep. If UtilMethods.isSet(markdown) is false, Optional.empty() is returned instead. final StoryBlockMap storyBlockMap = new StoryBlockMap(val); - return parseHTML(storyBlockMap.toHtml()); + final String markdown = storyBlockMap.toMarkdown(); + return UtilMethods.isSet(markdown) ? 
Optional.of(markdown) : Optional.empty(); } diff --git a/dotcms-integration/src/test/java/com/dotcms/MainSuite3a.java b/dotcms-integration/src/test/java/com/dotcms/MainSuite3a.java index 49f8d043487f..195a6ddde6dc 100644 --- a/dotcms-integration/src/test/java/com/dotcms/MainSuite3a.java +++ b/dotcms-integration/src/test/java/com/dotcms/MainSuite3a.java @@ -1,6 +1,7 @@ package com.dotcms; import com.dotcms.ai.api.OpenAIVisionAPIImplTest; +import com.dotcms.ai.util.ContentToStringUtilTest; import com.dotcms.contenttype.business.StoryBlockValidationTest; import com.dotcms.contenttype.test.StoryBlockUtilTest; import com.dotcms.cost.RequestCostReportTest; @@ -87,6 +88,7 @@ Task260615AlterClusterIdLengthTest.class, ImportContentletsActionSmokeTest.class, PublisherQueueJobTest.class, + ContentToStringUtilTest.class, }) public class MainSuite3a { diff --git a/dotcms-integration/src/test/java/com/dotcms/ai/util/ContentToStringUtilTest.java b/dotcms-integration/src/test/java/com/dotcms/ai/util/ContentToStringUtilTest.java new file mode 100644 index 000000000000..f39fbc30d257 --- /dev/null +++ b/dotcms-integration/src/test/java/com/dotcms/ai/util/ContentToStringUtilTest.java @@ -0,0 +1,120 @@ +package com.dotcms.ai.util; + +import com.dotcms.contenttype.model.field.Field; +import com.dotcms.contenttype.model.field.StoryBlockField; +import com.dotcms.contenttype.model.type.ContentType; +import com.dotcms.datagen.ContentTypeDataGen; +import com.dotcms.datagen.FieldDataGen; +import com.dotcms.util.IntegrationTestInitService; +import com.dotmarketing.portlets.contentlet.model.Contentlet; +import com.dotmarketing.util.UUIDGenerator; +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.Test; + +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; + +/** + * Integration tests for {@link ContentToStringUtil} focused on how Story Block (Block Editor) + * fields are turned into the text that dotAI embeds. + *

+ * #36003: a Story Block field is now extracted as Markdown (via {@code StoryBlockMap.toMarkdown()}) + * instead of being rendered to HTML and stripped by Tika. Markdown preserves the structure + * (tables, code blocks, lists, headings) that the old Tika path flattened away. These tests assert + * that the structure survives into the extracted text. + * + * @author hassandotcms + */ +public class ContentToStringUtilTest { + + /** + * A Story Block (Tiptap/ProseMirror) document holding a heading, a table and a fenced code + * block — exactly the structure that the old HTML + Tika path flattened to plain text. + */ + private static final String STORY_BLOCK_WITH_TABLE_AND_CODE = + "{" + + "\"type\":\"doc\"," + + "\"content\":[" + + "{" + + "\"type\":\"heading\",\"attrs\":{\"level\":2}," + + "\"content\":[{\"type\":\"text\",\"text\":\"Supported Languages\"}]" + + "}," + + "{" + + "\"type\":\"table\",\"content\":[" + + "{" + + "\"type\":\"tableRow\",\"content\":[" + + "{\"type\":\"tableHeader\",\"attrs\":{\"colspan\":1,\"rowspan\":1}," + + "\"content\":[{\"type\":\"paragraph\",\"content\":[{\"type\":\"text\",\"text\":\"Language\"}]}]}," + + "{\"type\":\"tableHeader\",\"attrs\":{\"colspan\":1,\"rowspan\":1}," + + "\"content\":[{\"type\":\"paragraph\",\"content\":[{\"type\":\"text\",\"text\":\"Use Case\"}]}]}" + + "]" + + "}," + + "{" + + "\"type\":\"tableRow\",\"content\":[" + + "{\"type\":\"tableCell\",\"attrs\":{\"colspan\":1,\"rowspan\":1}," + + "\"content\":[{\"type\":\"paragraph\",\"content\":[{\"type\":\"text\",\"text\":\"Java\"}]}]}," + + "{\"type\":\"tableCell\",\"attrs\":{\"colspan\":1,\"rowspan\":1}," + + "\"content\":[{\"type\":\"paragraph\",\"content\":[{\"type\":\"text\",\"text\":\"Enterprise applications\"}]}]}" + + "]" + + "}" + + "]" + + "}," + + "{" + + "\"type\":\"codeBlock\",\"attrs\":{\"language\":\"java\"}," + + "\"content\":[{\"type\":\"text\",\"text\":\"System.out.println(\\\"Hello, World!\\\");\"}]" + + "}" + + "]" + + "}"; + + @BeforeClass + 
public static void beforeClass() throws Exception { + IntegrationTestInitService.getInstance().init(); + } + + /** + * Method to test: {@link ContentToStringUtil#turnContentletIntoString(Contentlet)} + * Given Scenario: A contentlet with a Story Block field whose value is a Tiptap JSON document + * containing a heading, a table and a fenced code block. + * Expected Result: The extracted text is Markdown with the structure preserved — table pipes + * and separator row, a fenced code block with its language, and the heading marker — none of + * which survive the old HTML-to-Tika flattening. + */ + @Test + public void test_story_block_embedded_as_markdown_preserves_table_and_code() { + final ContentTypeDataGen dataGen = new ContentTypeDataGen(); + final List inputFields = new ArrayList<>(); + inputFields.add(new FieldDataGen().velocityVarName("title").next()); + inputFields.add(new FieldDataGen().type(StoryBlockField.class).velocityVarName("body").next()); + dataGen.fields(inputFields); + final ContentType contentType = dataGen.nextPersisted(); + + final Contentlet contentlet = new Contentlet(); + contentlet.setContentType(contentType); + contentlet.setIdentifier(UUIDGenerator.generateUuid()); + contentlet.setProperty("title", "Supported Languages"); + contentlet.setProperty("body", STORY_BLOCK_WITH_TABLE_AND_CODE); + + final Optional extracted = ContentToStringUtil.impl.get().turnContentletIntoString(contentlet); + + Assert.assertTrue("Story Block field should produce extractable text", extracted.isPresent()); + final String text = extracted.get(); + + // Table structure survives as Markdown: pipe-delimited cells + the separator row. 
+ Assert.assertTrue("Table cells should be pipe-delimited Markdown, got: " + text, text.contains("|")); + Assert.assertTrue("Table separator row should be present, got: " + text, text.contains("---")); + Assert.assertTrue("Table header text should survive", text.contains("Language") && text.contains("Use Case")); + + // Code block survives as a fenced block carrying its language. + Assert.assertTrue("Fenced code block with language should be present, got: " + text, text.contains("```java")); + Assert.assertTrue("Code body should survive", text.contains("System.out.println(\"Hello, World!\");")); + + // Heading survives as a Markdown heading marker. + Assert.assertTrue("Heading marker should be present, got: " + text, text.contains("## Supported Languages")); + + // Sanity: this is Markdown, not HTML that Tika stripped — no markup tags leaked through. + Assert.assertFalse("Extracted text must not contain HTML tags", text.contains("