From 05704f7a15f2a2d98ae6c7d38f6c2a7b7fce888c Mon Sep 17 00:00:00 2001
From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com>
Date: Thu, 26 Feb 2026 19:53:24 +0100
Subject: [PATCH 01/11] Fix autodetect process crash from oversized field
 values by truncating at 256 characters

---
 docs/CHANGELOG.asciidoc                       |  4 ++
 include/model/CFieldValueTruncator.h          | 62 ++++++++++++++++++
 lib/api/CAnomalyJob.cc                        | 13 +++-
 lib/api/CDataProcessor.cc                     | 10 ++-
 lib/api/unittest/CAnomalyJobTest.cc           | 50 +++++++++++++++
 lib/model/CBucketGatherer.cc                  |  4 ++
 lib/model/CDynamicStringIdRegistry.cc         |  5 +-
 lib/model/CEventRateBucketGatherer.cc         |  2 +
 lib/model/CGathererTools.cc                   |  3 +
 .../unittest/CDynamicStringIdRegistryTest.cc  | 31 +++++++++
 .../unittest/CFieldValueTruncatorTest.cc      | 64 +++++++++++++++++++
 lib/model/unittest/CMakeLists.txt             |  1 +
 12 files changed, 246 insertions(+), 3 deletions(-)
 create mode 100644 include/model/CFieldValueTruncator.h
 create mode 100644 lib/model/unittest/CFieldValueTruncatorTest.cc
diff --git a/docs/CHANGELOG.asciidoc b/docs/CHANGELOG.asciidoc
index fa2d532256..ff1b4c731e 100644
--- a/docs/CHANGELOG.asciidoc
+++ b/docs/CHANGELOG.asciidoc
@@ -30,6 +30,10 @@
 
 == {es} version 9.4.0
 
+=== Bug Fixes
+
+* Truncate oversized field values to prevent autodetect process crash. (See {ml-issue}2796[#2796].)
+
 === Enhancements
 
 * Better handling of invalid JSON state documents (See {ml-pull}[]#2895].)
diff --git a/include/model/CFieldValueTruncator.h b/include/model/CFieldValueTruncator.h
new file mode 100644
index 0000000000..f56c2b7719
--- /dev/null
+++ b/include/model/CFieldValueTruncator.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0 and the following additional limitation. Functionality enabled by the
+ * files subject to the Elastic License 2.0 may only be used in production when
+ * invoked by an Elasticsearch process with a license key installed that permits
+ * use of machine learning features. You may not use this file except in
+ * compliance with the Elastic License 2.0 and the foregoing additional
+ * limitation.
+ */
+#ifndef INCLUDED_ml_model_CFieldValueTruncator_h
+#define INCLUDED_ml_model_CFieldValueTruncator_h
+
+#include <model/ImportExport.h>
+
+#include <string>
+
+namespace ml {
+namespace model {
+
+//! \brief Truncates field values to prevent memory amplification.
+//!
+//! DESCRIPTION:\n
+//! Field values (by, over, partition, influencer) are term fields
+//! in the anomaly detection domain. They are categorical identifiers,
+//! not free text. Their length must be bounded to prevent excessive
+//! memory consumption that could cause the autodetect process to crash.
+//!
+//! IMPLEMENTATION DECISIONS:\n
+//! The limit of 256 characters aligns with Elasticsearch's
+//! ignore_above default for keyword fields. This is sufficient for
+//! meaningful anomaly detection field values while preventing memory
+//! amplification from extremely long strings (e.g., 77K+ characters)
+//! that have been observed to crash the autodetect process.
+class MODEL_EXPORT CFieldValueTruncator {
+public:
+    //! Maximum length for analysis term fields (by, over, partition, influencer).
+    //! Values longer than this are truncated to prevent excessive memory usage.
+    static constexpr std::size_t MAX_FIELD_VALUE_LENGTH = 256;
+
+    //! In-place truncation of a field value.
+    //! \return true if truncation occurred, false if value was within limit.
+    static bool truncate(std::string& value) {
+        if (value.size() <= MAX_FIELD_VALUE_LENGTH) {
+            return false;
+        }
+        value.resize(MAX_FIELD_VALUE_LENGTH);
+        return true;
+    }
+
+    //! Returns a truncated copy of the field value. Original unchanged.
+    static std::string truncated(const std::string& value) {
+        if (value.size() <= MAX_FIELD_VALUE_LENGTH) {
+            return value;
+        }
+        return value.substr(0, MAX_FIELD_VALUE_LENGTH);
+    }
+};
+}
+}
+
+#endif // INCLUDED_ml_model_CFieldValueTruncator_h
diff --git a/lib/api/CAnomalyJob.cc b/lib/api/CAnomalyJob.cc
index d7321cd2a1..8d30e35564 100644
--- a/lib/api/CAnomalyJob.cc
+++ b/lib/api/CAnomalyJob.cc
@@ -28,6 +28,7 @@
 #include <maths/common/CIntegerTools.h>
 #include <maths/common/COrderings.h>
 
+#include <model/CFieldValueTruncator.h>
 #include <model/CHierarchicalResultsAggregator.h>
 #include <model/CHierarchicalResultsPopulator.h>
 #include <model/CHierarchicalResultsProbabilityFinalizer.h>
@@ -1706,8 +1707,18 @@ void CAnomalyJob::addRecord(const TAnomalyDetectorPtr& detector,
     model::CAnomalyDetector::TStrCPtrVec fieldValues;
     const TStrVec& fieldNames = detector->fieldsOfInterest();
     fieldValues.reserve(fieldNames.size());
+    TStrVec truncatedCopies;
     for (const auto& fieldName : fieldNames) {
-        fieldValues.push_back(fieldValue(fieldName, dataRowFields));
+        const std::string* value = fieldValue(fieldName, dataRowFields);
+        if (value != nullptr && value->size() > model::CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH) {
+            truncatedCopies.push_back(model::CFieldValueTruncator::truncated(*value));
+            fieldValues.push_back(&truncatedCopies.back());
+            LOG_WARN(<< "Field '" << fieldName << "' value exceeds "
+                     << model::CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH
+                     << " characters and has been truncated");
+        } else {
+            fieldValues.push_back(value);
+        }
     }
 
     detector->addRecord(time, fieldValues);
diff --git a/lib/api/CDataProcessor.cc b/lib/api/CDataProcessor.cc
index 93db8c4751..7638094f5e 100644
--- a/lib/api/CDataProcessor.cc
+++ b/lib/api/CDataProcessor.cc
@@ -15,6 +15,8 @@
 #include <core/CStringUtils.h>
 #include <core/CTimeUtils.h>
 
+#include <model/CFieldValueTruncator.h>
+
 namespace ml {
 namespace api {
 
@@ -49,7 +51,13 @@ std::string CDataProcessor::debugPrintRecord(const TStrStrUMap& dataRowFields) {
             fieldValues.push_back(',');
         }
         fieldNames.append(rowIter->first);
-        fieldValues.append(rowIter->second);
+        const auto& val = rowIter->second;
+        if (val.size() > model::CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH) {
+            fieldValues.append(val, 0, model::CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH);
+            fieldValues.append("...");
+        } else {
+            fieldValues.append(val);
+        }
     }
 
     result << fieldNames << core_t::LINE_ENDING << fieldValues;
diff --git a/lib/api/unittest/CAnomalyJobTest.cc b/lib/api/unittest/CAnomalyJobTest.cc
index d5384327ef..5cb277b966 100644
--- a/lib/api/unittest/CAnomalyJobTest.cc
+++ b/lib/api/unittest/CAnomalyJobTest.cc
@@ -1205,4 +1205,54 @@ BOOST_AUTO_TEST_CASE(testHierarchicalResultsNormalizerShouldIncreaseMemoryUsage)
     resourceMonitor.forceRefreshAll();
     BOOST_TEST_REQUIRE(resourceMonitor.totalMemory() < memoryUsageBeforeUnregister);
 }
+
+BOOST_AUTO_TEST_CASE(testOversizedFieldValuesTruncated) {
+    model::CLimits limits;
+    api::CAnomalyJobConfig jobConfig = CTestAnomalyJob::makeSimpleJobConfig(
+        "count", "", "by_field", "", "", {"influencer_field"});
+
+    model::CAnomalyDetectorModelConfig modelConfig =
+        model::CAnomalyDetectorModelConfig::defaultConfig(BUCKET_SIZE);
+    std::stringstream outputStrm;
+    core::CJsonOutputStreamWrapper wrappedOutputStream(outputStrm);
+
+    CTestAnomalyJob job("job", limits, jobConfig, modelConfig, wrappedOutputStream);
+
+    std::string const oversizedValue(77000, 'x');
+    CTestAnomalyJob::TStrStrUMap dataRows{
+        {"time", "1000"}, {"by_field", oversizedValue}, {"influencer_field", oversizedValue}};
+
+    BOOST_TEST_REQUIRE(job.handleRecord(dataRows));
+    BOOST_REQUIRE_EQUAL(uint64_t(1), job.numRecordsHandled());
+}
+
+BOOST_AUTO_TEST_CASE(testNormalFieldValuesNotTruncated) {
+    model::CLimits limits;
+    api::CAnomalyJobConfig jobConfig = CTestAnomalyJob::makeSimpleJobConfig(
+        "count", "", "by_field", "", "", {"influencer_field"});
+
+    model::CAnomalyDetectorModelConfig modelConfig =
+        model::CAnomalyDetectorModelConfig::defaultConfig(BUCKET_SIZE);
+    std::stringstream outputStrm;
+    core::CJsonOutputStreamWrapper wrappedOutputStream(outputStrm);
+
+    CTestAnomalyJob job("job", limits, jobConfig, modelConfig, wrappedOutputStream);
+
+    std::string const normalValue("normal_value");
+    CTestAnomalyJob::TStrStrUMap dataRows{
+        {"time", "1000"}, {"by_field", normalValue}, {"influencer_field", normalValue}};
+
+    BOOST_TEST_REQUIRE(job.handleRecord(dataRows));
+    BOOST_REQUIRE_EQUAL(uint64_t(1), job.numRecordsHandled());
+}
+
+BOOST_AUTO_TEST_CASE(testDebugPrintRecordTruncatesLongValues) {
+    api::CDataProcessor::TStrStrUMap record;
+    record["field1"] = std::string(1000, 'x');
+    record["field2"] = "short";
+    std::string result = api::CDataProcessor::debugPrintRecord(record);
+    BOOST_TEST_REQUIRE(result.find("...") != std::string::npos);
+    BOOST_TEST_REQUIRE(result.size() < 1500);
+}
+
 BOOST_AUTO_TEST_SUITE_END()
diff --git a/lib/model/CBucketGatherer.cc b/lib/model/CBucketGatherer.cc
index cdffd8d238..520561e7be 100644
--- a/lib/model/CBucketGatherer.cc
+++ b/lib/model/CBucketGatherer.cc
@@ -23,6 +23,7 @@
 #include <maths/common/COrderings.h>
 
 #include <model/CDataGatherer.h>
+#include <model/CFieldValueTruncator.h>
 #include <model/CResourceMonitor.h>
 
 #include <boost/tuple/tuple.hpp>
@@ -116,6 +117,9 @@ bool restoreInfluencerPersonAttributeCounts(core::CStateRestoreTraverser& traver
         RESTORE_BUILT_IN(PERSON_UID_TAG, person)
         RESTORE_BUILT_IN(ATTRIBUTE_UID_TAG, attribute)
         RESTORE_NO_ERROR(INFLUENCER_TAG, influence = traverser.value())
+        if (name == INFLUENCER_TAG) {
+            CFieldValueTruncator::truncate(influence);
+        }
         if (name == COUNT_TAG) {
             if (core::CStringUtils::stringToType(traverser.value(), count) == false) {
                 LOG_ERROR(<< "Failed to restore COUNT_TAG, got " << traverser.value());
diff --git a/lib/model/CDynamicStringIdRegistry.cc b/lib/model/CDynamicStringIdRegistry.cc
index 9974e8e15b..c293f664d4 100644
--- a/lib/model/CDynamicStringIdRegistry.cc
+++ b/lib/model/CDynamicStringIdRegistry.cc
@@ -18,6 +18,7 @@
 #include <maths/common/CChecksum.h>
 #include <maths/common/COrderings.h>
 
+#include <model/CFieldValueTruncator.h>
 #include <model/CResourceMonitor.h>
 
 #include <boost/unordered_set.hpp>
@@ -251,7 +252,9 @@ bool CDynamicStringIdRegistry::acceptRestoreTraverser(core::CStateRestoreTravers
     do {
         const std::string& name = traverser.name();
         if (name == NAMES_TAG) {
-            m_Names.emplace_back(traverser.value());
+            std::string value = traverser.value();
+            CFieldValueTruncator::truncate(value);
+            m_Names.emplace_back(std::move(value));
         } else if (name == FREE_NAMES_TAG) {
             if (!core::CPersistUtils::restore(FREE_NAMES_TAG, m_FreeUids, traverser)) {
                 return false;
diff --git a/lib/model/CEventRateBucketGatherer.cc b/lib/model/CEventRateBucketGatherer.cc
index a01ddc9cdd..600719089e 100644
--- a/lib/model/CEventRateBucketGatherer.cc
+++ b/lib/model/CEventRateBucketGatherer.cc
@@ -27,6 +27,7 @@
 
 #include <model/CDataGatherer.h>
 #include <model/CEventData.h>
+#include <model/CFieldValueTruncator.h>
 #include <model/CResourceMonitor.h>
 #include <model/FunctionTypes.h>
 
@@ -684,6 +685,7 @@ bool restoreInfluencerUniqueStrings(core::CStateRestoreTraverser& traverser,
         const std::string& name = traverser.name();
         if (name == DICTIONARY_WORD_TAG) {
             key = traverser.value();
+            CFieldValueTruncator::truncate(key);
         } else if (name == UNIQUE_WORD_TAG) {
             CUniqueStringFeatureData::TWord value;
             if (value.fromDelimited(traverser.value()) == false) {
diff --git a/lib/model/CGathererTools.cc b/lib/model/CGathererTools.cc
index 378e0ddd2a..2c9e48414c 100644
--- a/lib/model/CGathererTools.cc
+++ b/lib/model/CGathererTools.cc
@@ -23,6 +23,8 @@
 #include <maths/common/CIntegerTools.h>
 #include <maths/common/COrderings.h>
 
+#include <model/CFieldValueTruncator.h>
+
 #include <boost/unordered_map.hpp>
 
 namespace ml {
@@ -89,6 +91,7 @@ struct SInfluencerSumSerializer {
             const std::string& name = traverser.name();
             if (name == SUM_MAP_KEY_TAG) {
                 key = traverser.value();
+                CFieldValueTruncator::truncate(key);
             } else if (name == SUM_MAP_VALUE_TAG) {
                 if (core::CStringUtils::stringToType(traverser.value(), map[key]) == false) {
                     LOG_ERROR(<< "Invalid sum in " << traverser.value());
diff --git a/lib/model/unittest/CDynamicStringIdRegistryTest.cc b/lib/model/unittest/CDynamicStringIdRegistryTest.cc
index 1d4aa16988..60cea2b1da 100644
--- a/lib/model/unittest/CDynamicStringIdRegistryTest.cc
+++ b/lib/model/unittest/CDynamicStringIdRegistryTest.cc
@@ -15,6 +15,7 @@
 #include <core/CSmallVector.h>
 
 #include <model/CDynamicStringIdRegistry.h>
+#include <model/CFieldValueTruncator.h>
 #include <model/CResourceMonitor.h>
 
 #include <boost/test/unit_test.hpp>
@@ -109,4 +110,34 @@ BOOST_AUTO_TEST_CASE(testPersist) {
     BOOST_REQUIRE_EQUAL(restoredJson.str(), origJson.str());
 }
 
+BOOST_AUTO_TEST_CASE(testRestoreTruncatesOversizedNames) {
+    CResourceMonitor resourceMonitor;
+    CDynamicStringIdRegistry registry("person", counter_t::E_TSADNumberNewPeople,
+                                      counter_t::E_TSADNumberNewPeopleNotAllowed,
+                                      counter_t::E_TSADNumberNewPeopleRecycled);
+
+    bool addedPerson = false;
+    std::string shortName("foo");
+    std::string oversizedName(77000, 'x');
+    registry.addName(shortName, 0, resourceMonitor, addedPerson);
+    registry.addName(oversizedName, 0, resourceMonitor, addedPerson);
+
+    std::ostringstream origJson;
+    core::CJsonStatePersistInserter::persist(
+        origJson, std::bind_front(&CDynamicStringIdRegistry::acceptPersistInserter, &registry));
+
+    std::istringstream is("{\"topLevel\" : " + origJson.str() + "}");
+    core::CJsonStateRestoreTraverser traverser(is);
+    CDynamicStringIdRegistry restoredRegistry("person", counter_t::E_TSADNumberNewPeople,
+                                              counter_t::E_TSADNumberNewPeopleNotAllowed,
+                                              counter_t::E_TSADNumberNewPeopleRecycled);
+    traverser.traverseSubLevel(std::bind_front(
+        &CDynamicStringIdRegistry::acceptRestoreTraverser, &restoredRegistry));
+
+    BOOST_REQUIRE_EQUAL(2, restoredRegistry.numberNames());
+    BOOST_REQUIRE_EQUAL(shortName, restoredRegistry.name(0, ""));
+    BOOST_REQUIRE_EQUAL(CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH,
+                        restoredRegistry.name(1, "").size());
+}
+
 BOOST_AUTO_TEST_SUITE_END()
diff --git a/lib/model/unittest/CFieldValueTruncatorTest.cc b/lib/model/unittest/CFieldValueTruncatorTest.cc
new file mode 100644
index 0000000000..fe5033d496
--- /dev/null
+++ b/lib/model/unittest/CFieldValueTruncatorTest.cc
@@ -0,0 +1,64 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0 and the following additional limitation. Functionality enabled by the
+ * files subject to the Elastic License 2.0 may only be used in production when
+ * invoked by an Elasticsearch process with a license key installed that permits
+ * use of machine learning features. You may not use this file except in
+ * compliance with the Elastic License 2.0 and the foregoing additional
+ * limitation.
+ */
+
+#include <model/CFieldValueTruncator.h>
+
+#include <boost/test/unit_test.hpp>
+
+BOOST_AUTO_TEST_SUITE(CFieldValueTruncatorTest)
+
+using namespace ml;
+using namespace model;
+
+BOOST_AUTO_TEST_CASE(testShortValueUnchanged) {
+    std::string value("short");
+    BOOST_REQUIRE_EQUAL(false, CFieldValueTruncator::truncate(value));
+    BOOST_REQUIRE_EQUAL("short", value);
+}
+
+BOOST_AUTO_TEST_CASE(testExactLimitUnchanged) {
+    std::string value(CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH, 'x');
+    BOOST_REQUIRE_EQUAL(false, CFieldValueTruncator::truncate(value));
+    BOOST_REQUIRE_EQUAL(CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH, value.size());
+}
+
+BOOST_AUTO_TEST_CASE(testOversizedValueTruncated) {
+    std::string value(1000, 'x');
+    BOOST_REQUIRE_EQUAL(true, CFieldValueTruncator::truncate(value));
+    BOOST_REQUIRE_EQUAL(CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH, value.size());
+}
+
+BOOST_AUTO_TEST_CASE(testEmptyValueUnchanged) {
+    std::string value;
+    BOOST_REQUIRE_EQUAL(false, CFieldValueTruncator::truncate(value));
+    BOOST_REQUIRE_EQUAL(0, value.size());
+}
+
+BOOST_AUTO_TEST_CASE(testConstOverloadReturnsNewString) {
+    const std::string longValue(1000, 'x');
+    std::string result = CFieldValueTruncator::truncated(longValue);
+    BOOST_REQUIRE_EQUAL(CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH, result.size());
+    BOOST_REQUIRE_EQUAL(1000, longValue.size());
+}
+
+BOOST_AUTO_TEST_CASE(testConstOverloadShortValueReturnsSame) {
+    const std::string shortValue("short");
+    std::string result = CFieldValueTruncator::truncated(shortValue);
+    BOOST_REQUIRE_EQUAL("short", result);
+}
+
+BOOST_AUTO_TEST_CASE(testVeryLargeValueTruncated) {
+    std::string value(77000, 'y');
+    BOOST_REQUIRE_EQUAL(true, CFieldValueTruncator::truncate(value));
+    BOOST_REQUIRE_EQUAL(CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH, value.size());
+}
+
+BOOST_AUTO_TEST_SUITE_END()
diff --git a/lib/model/unittest/CMakeLists.txt b/lib/model/unittest/CMakeLists.txt
index 8e6d6dcf48..e8f64ac6b5 100644
--- a/lib/model/unittest/CMakeLists.txt
+++ b/lib/model/unittest/CMakeLists.txt
@@ -22,6 +22,7 @@ set (SRCS
   CDetectionRuleTest.cc
   CDetectorEqualizerTest.cc
   CDynamicStringIdRegistryTest.cc
+  CFieldValueTruncatorTest.cc
   CEventRateAnomalyDetectorTest.cc
   CEventRateDataGathererTest.cc
   CEventRateModelTest.cc

From 4c6e66e6c53e72e2a6cdf4d194fb811c79a76424 Mon Sep 17 00:00:00 2001
From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com>
Date: Thu, 26 Feb 2026 20:26:49 +0100
Subject: [PATCH 02/11] Add needsTruncation() helper and use it at call sites

---
 docs/CHANGELOG.asciidoc                        |  2 +-
 include/model/CFieldValueTruncator.h           | 12 ++++++++++--
 lib/api/CAnomalyJob.cc                         |  2 +-
 lib/api/CDataProcessor.cc                      |  6 +++---
 lib/api/unittest/CAnomalyJobTest.cc            |  5 +++--
 lib/model/unittest/CFieldValueTruncatorTest.cc | 11 +++++++++++
 6 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/docs/CHANGELOG.asciidoc b/docs/CHANGELOG.asciidoc
index ff1b4c731e..4de997996f 100644
--- a/docs/CHANGELOG.asciidoc
+++ b/docs/CHANGELOG.asciidoc
@@ -32,7 +32,7 @@
 
 === Bug Fixes
 
-* Truncate oversized field values to prevent autodetect process crash. (See {ml-issue}2796[#2796].)
+* Truncate oversized field values to prevent autodetect process crash. (See {ml-pull}2929[#2929].)
 
 === Enhancements
 
diff --git a/include/model/CFieldValueTruncator.h b/include/model/CFieldValueTruncator.h
index f56c2b7719..6171ed83b7 100644
--- a/include/model/CFieldValueTruncator.h
+++ b/include/model/CFieldValueTruncator.h
@@ -38,10 +38,17 @@ class MODEL_EXPORT CFieldValueTruncator {
     //! Values longer than this are truncated to prevent excessive memory usage.
     static constexpr std::size_t MAX_FIELD_VALUE_LENGTH = 256;
 
+    //! Check if a field value needs truncation.
+    //! This avoids creating copies when checking if truncation is necessary.
+    //! \return true if the value exceeds MAX_FIELD_VALUE_LENGTH.
+    static bool needsTruncation(const std::string& value) {
+        return value.size() > MAX_FIELD_VALUE_LENGTH;
+    }
+
     //! In-place truncation of a field value.
     //! \return true if truncation occurred, false if value was within limit.
     static bool truncate(std::string& value) {
-        if (value.size() <= MAX_FIELD_VALUE_LENGTH) {
+        if (!needsTruncation(value)) {
             return false;
         }
         value.resize(MAX_FIELD_VALUE_LENGTH);
@@ -49,8 +56,9 @@ class MODEL_EXPORT CFieldValueTruncator {
     }
 
     //! Returns a truncated copy of the field value. Original unchanged.
+    //! Use needsTruncation() first if you want to avoid copying.
     static std::string truncated(const std::string& value) {
-        if (value.size() <= MAX_FIELD_VALUE_LENGTH) {
+        if (!needsTruncation(value)) {
             return value;
         }
         return value.substr(0, MAX_FIELD_VALUE_LENGTH);
diff --git a/lib/api/CAnomalyJob.cc b/lib/api/CAnomalyJob.cc
index 8d30e35564..6e2951f597 100644
--- a/lib/api/CAnomalyJob.cc
+++ b/lib/api/CAnomalyJob.cc
@@ -1710,7 +1710,7 @@ void CAnomalyJob::addRecord(const TAnomalyDetectorPtr& detector,
     TStrVec truncatedCopies;
     for (const auto& fieldName : fieldNames) {
         const std::string* value = fieldValue(fieldName, dataRowFields);
-        if (value != nullptr && value->size() > model::CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH) {
+        if (value != nullptr && model::CFieldValueTruncator::needsTruncation(*value)) {
             truncatedCopies.push_back(model::CFieldValueTruncator::truncated(*value));
             fieldValues.push_back(&truncatedCopies.back());
             LOG_WARN(<< "Field '" << fieldName << "' value exceeds "
diff --git a/lib/api/CDataProcessor.cc b/lib/api/CDataProcessor.cc
index 7638094f5e..61b792417b 100644
--- a/lib/api/CDataProcessor.cc
+++ b/lib/api/CDataProcessor.cc
@@ -51,9 +51,9 @@ std::string CDataProcessor::debugPrintRecord(const TStrStrUMap& dataRowFields) {
             fieldValues.push_back(',');
         }
         fieldNames.append(rowIter->first);
-        const auto& val = rowIter->second;
-        if (val.size() > model::CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH) {
-            fieldValues.append(val, 0, model::CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH);
+        const std::string& val = rowIter->second;
+        if (model::CFieldValueTruncator::needsTruncation(val)) {
+            fieldValues.append(model::CFieldValueTruncator::truncated(val));
             fieldValues.append("...");
         } else {
             fieldValues.append(val);
diff --git a/lib/api/unittest/CAnomalyJobTest.cc b/lib/api/unittest/CAnomalyJobTest.cc
index 5cb277b966..681a316641 100644
--- a/lib/api/unittest/CAnomalyJobTest.cc
+++ b/lib/api/unittest/CAnomalyJobTest.cc
@@ -1219,8 +1219,9 @@ BOOST_AUTO_TEST_CASE(testOversizedFieldValuesTruncated) {
     CTestAnomalyJob job("job", limits, jobConfig, modelConfig, wrappedOutputStream);
 
     std::string const oversizedValue(77000, 'x');
-    CTestAnomalyJob::TStrStrUMap dataRows{
-        {"time", "1000"}, {"by_field", oversizedValue}, {"influencer_field", oversizedValue}};
+    CTestAnomalyJob::TStrStrUMap dataRows{{"time", "1000"},
+                                          {"by_field", oversizedValue},
+                                          {"influencer_field", oversizedValue}};
 
     BOOST_TEST_REQUIRE(job.handleRecord(dataRows));
     BOOST_REQUIRE_EQUAL(uint64_t(1), job.numRecordsHandled());
diff --git a/lib/model/unittest/CFieldValueTruncatorTest.cc b/lib/model/unittest/CFieldValueTruncatorTest.cc
index fe5033d496..447ab7250d 100644
--- a/lib/model/unittest/CFieldValueTruncatorTest.cc
+++ b/lib/model/unittest/CFieldValueTruncatorTest.cc
@@ -61,4 +61,15 @@ BOOST_AUTO_TEST_CASE(testVeryLargeValueTruncated) {
     BOOST_REQUIRE_EQUAL(CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH, value.size());
 }
 
+BOOST_AUTO_TEST_CASE(testNeedsTruncation) {
+    BOOST_REQUIRE_EQUAL(false, CFieldValueTruncator::needsTruncation("short"));
+    BOOST_REQUIRE_EQUAL(false, CFieldValueTruncator::needsTruncation(""));
+    BOOST_REQUIRE_EQUAL(false, CFieldValueTruncator::needsTruncation(std::string(
+                                   CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH, 'x')));
+    BOOST_REQUIRE_EQUAL(true, CFieldValueTruncator::needsTruncation(std::string(
+                                  CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH + 1, 'x')));
+    BOOST_REQUIRE_EQUAL(
+        true, CFieldValueTruncator::needsTruncation(std::string(77000, 'x')));
+}
+
 BOOST_AUTO_TEST_SUITE_END()

From 310f228c603930836853d96d6409444c9e312cef Mon Sep 17 00:00:00 2001
From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com>
Date: Fri, 27 Feb 2026 16:23:38 +0100
Subject: [PATCH 03/11] implement hash suffix

---
 include/api/CAnomalyJob.h                     |  12 ++
 include/model/CFieldValueTruncator.h          | 133 ++++++++++++++----
 lib/api/CAnomalyJob.cc                        |  35 +++--
 lib/api/CDataProcessor.cc                     |   2 +-
 .../unittest/CFieldValueTruncatorTest.cc      | 109 +++++++++++++-
 5 files changed, 251 insertions(+), 40 deletions(-)

diff --git a/include/api/CAnomalyJob.h b/include/api/CAnomalyJob.h
index 279a597936..4167c649a7 100644
--- a/include/api/CAnomalyJob.h
+++ b/include/api/CAnomalyJob.h
@@ -392,6 +392,18 @@ class API_EXPORT CAnomalyJob : public CDataProcessor {
                           core_t::TTime time,
                           const TStrStrUMap& dataRowFields);
 
+    //! Prepare field values with truncation handling.
+    //! Extracts field values from \p dataRowFields, truncates oversized values,
+    //! and populates \p fieldValues with pointers to either original or truncated values.
+    //! \param fieldNames The names of fields to extract
+    //! \param dataRowFields The data row containing field values
+    //! \param fieldValues Output vector of pointers to field values
+    //! \param truncatedCopies Storage for truncated copies (must remain valid while fieldValues is used)
+    static void prepareTruncatedFieldValues(const TStrVec& fieldNames,
+                                           const TStrStrUMap& dataRowFields,
+                                           model::CAnomalyDetector::TStrCPtrVec& fieldValues,
+                                           TStrVec& truncatedCopies);
+
     //! Parses a control message requesting that model state be persisted.
     //! Extracts optional arguments to be used for persistence.
     static bool parsePersistControlMessageArgs(const std::string& controlMessageArgs,
diff --git a/include/model/CFieldValueTruncator.h b/include/model/CFieldValueTruncator.h
index 6171ed83b7..f62c1f6d26 100644
--- a/include/model/CFieldValueTruncator.h
+++ b/include/model/CFieldValueTruncator.h
@@ -11,57 +11,138 @@
 #ifndef INCLUDED_ml_model_CFieldValueTruncator_h
 #define INCLUDED_ml_model_CFieldValueTruncator_h
 
+#include <core/CHashing.h>
+
 #include <model/ImportExport.h>
 
+#include <cstdio>
 #include <string>
 
 namespace ml {
 namespace model {
 
-//! \brief Truncates field values to prevent memory amplification.
+//! \brief Enforces term field length constraints with collision prevention.
+//!
+//! In the anomaly detection domain, term fields (by, over, partition, influencer)
+//! are categorical identifiers that must satisfy two invariants:
+//! 1. **Bounded Length** - Prevent memory amplification and OOM crashes
+//! 2. **Unique Identity** - Distinct field values must remain distinguishable
+//!
+//! Values exceeding MAX_FIELD_VALUE_LENGTH (256 chars) are transformed using
+//! collision-safe truncation:
+//!   - Retain PREFIX_LENGTH (240) characters of original value
+//!   - Append HASH_SEPARATOR ('$')
+//!   - Append HASH_HEX_DIGITS (15) character hex hash of complete original value
 //!
-//! DESCRIPTION:\n
-//! Field values (by, over, partition, influencer) are term fields
-//! in the anomaly detection domain. They are categorical identifiers,
-//! not free text. Their length must be bounded to prevent excessive
-//! memory consumption that could cause the autodetect process to crash.
+//! Format: "<prefix_240_chars>$<hash_15_hex_chars>"
+//! Example: "very_long_field_value_that_exceeds_limit_and_continues_for_thousands_of_chars_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx$a1b2c3d4e5f6789"
 //!
-//! IMPLEMENTATION DECISIONS:\n
-//! The limit of 256 characters aligns with Elasticsearch's
-//! ignore_above default for keyword fields. This is sufficient for
-//! meaningful anomaly detection field values while preventing memory
-//! amplification from extremely long strings (e.g., 77K+ characters)
-//! that have been observed to crash the autodetect process.
+//! The 256-character limit matches the ignore_above value that Elasticsearch's
+//! default dynamic mapping applies to keyword sub-fields. The hash suffix keeps
+//! distinct oversized values distinguishable while maintaining human readability
+//! (first 240 characters visible) and compatibility with prefix-based filtering.
 class MODEL_EXPORT CFieldValueTruncator {
 public:
-    //! Maximum length for analysis term fields (by, over, partition, influencer).
-    //! Values longer than this are truncated to prevent excessive memory usage.
+    //! Domain constraint: Maximum length for term fields in anomaly detection.
+    //! Aligned with Elasticsearch's ignore_above default for keyword fields.
     static constexpr std::size_t MAX_FIELD_VALUE_LENGTH = 256;
 
-    //! Check if a field value needs truncation.
-    //! This avoids creating copies when checking if truncation is necessary.
-    //! \return true if the value exceeds MAX_FIELD_VALUE_LENGTH.
+    //! Collision prevention format components
+    static constexpr char HASH_SEPARATOR = '$';
+    static constexpr std::size_t HASH_HEX_DIGITS = 15; // 15 hex chars = 60 bits of the 64-bit hash
+    static constexpr std::size_t HASH_SUFFIX_LENGTH =
+        1 /* separator */ + HASH_HEX_DIGITS; // 16 total
+
+    //! Content prefix length (readable portion after truncation)
+    static constexpr std::size_t PREFIX_LENGTH =
+        MAX_FIELD_VALUE_LENGTH - HASH_SUFFIX_LENGTH; // 240
+
+    // Domain invariants (enforced at compile-time)
+    static_assert(PREFIX_LENGTH + HASH_SUFFIX_LENGTH == MAX_FIELD_VALUE_LENGTH,
+                  "Term field format invariant: prefix + suffix = total length");
+    static_assert(PREFIX_LENGTH >= 200,
+                  "Readable prefix must be substantial for human comprehension");
+    static_assert(HASH_HEX_DIGITS * 4 <= 64,
+                  "Hash hex digits must fit in 64-bit hash output");
+
+    //! Check if a term field value exceeds the domain constraint.
+    //! \return true if the value requires length enforcement
     static bool needsTruncation(const std::string& value) {
         return value.size() > MAX_FIELD_VALUE_LENGTH;
     }
 
-    //! In-place truncation of a field value.
-    //! \return true if truncation occurred, false if value was within limit.
+    //! Enforce term field length constraint in-place.
+    //! Applies collision-safe truncation for values exceeding the limit.
+    //! \param[in,out] value Field value to constrain
+    //! \return true if truncation was applied, false if already within limit
     static bool truncate(std::string& value) {
-        if (!needsTruncation(value)) {
+        if (needsTruncation(value) == false) {
             return false;
         }
-        value.resize(MAX_FIELD_VALUE_LENGTH);
+
+        std::string originalValue = std::move(value);
+        value.assign(originalValue, 0, PREFIX_LENGTH);
+        appendCollisionPreventionSuffix(originalValue, value);
+
         return true;
     }
 
-    //! Returns a truncated copy of the field value. Original unchanged.
-    //! Use needsTruncation() first if you want to avoid copying.
+    //! Enforce term field length constraint, returning constrained copy.
+    //! Original value unchanged. For performance, call needsTruncation() first
+    //! to avoid copying when constraint is already satisfied.
+    //! \param value Original field value
+    //! \return Copy with length constraint enforced
     static std::string truncated(const std::string& value) {
-        if (!needsTruncation(value)) {
-            return value;
+        if (needsTruncation(value) == false) {
+            return value; // within limit: returns a plain copy of the input
+        }
+
+        std::string result;
+        result.reserve(MAX_FIELD_VALUE_LENGTH);
+        result.assign(value, 0, PREFIX_LENGTH);
+        appendCollisionPreventionSuffix(value, result);
+
+        return result;
+    }
+
+private:
+    //! \brief Hash encoding for collision prevention.
+    //!
+    //! Encapsulates the technical details of hash computation and formatting.
+    //! Separated from domain logic for clarity and testability.
+    struct HashEncoding {
+        //! Compute collision-resistant identity hash.
+        //! Uses safeMurmurHash64 (endian-neutral) for state persistence safety.
+        static std::uint64_t compute(const std::string& value) {
+            return core::CHashing::safeMurmurHash64(
+                value.data(), static_cast<int>(value.size()),
+                0); // Fixed seed for determinism
         }
-        return value.substr(0, MAX_FIELD_VALUE_LENGTH);
+
+        //! Format 64-bit hash as zero-padded lowercase hex string.
+        //! \param hash The hash value to format
+        //! \param[out] buffer Must be at least HASH_HEX_DIGITS + 1 bytes
+        //! \return Pointer to null-terminated hex string in buffer
+        static const char* toHex(std::uint64_t hash, char* buffer) {
+            // %015llx zero-pads to 15 hex chars; a 16-digit hash is cut to its leading 15 by the size limit
+            std::snprintf(buffer, HASH_HEX_DIGITS + 1, "%015llx",
+                         static_cast<unsigned long long>(hash));
+            return buffer;
+        }
+    };
+
+    //! Append collision-prevention suffix: separator + hash.
+    //! \param originalValue Complete untruncated value for hash computation
+    //! \param[in,out] prefix Truncated prefix to which suffix is appended
+    static void appendCollisionPreventionSuffix(const std::string& originalValue,
+                                                 std::string& prefix) {
+        std::uint64_t identityHash = HashEncoding::compute(originalValue);
+
+        prefix.reserve(MAX_FIELD_VALUE_LENGTH);
+        prefix.push_back(HASH_SEPARATOR);
+
+        char hashHexBuffer[HASH_HEX_DIGITS + 1];
+        prefix.append(HashEncoding::toHex(identityHash, hashHexBuffer));
     }
 };
 }
diff --git a/lib/api/CAnomalyJob.cc b/lib/api/CAnomalyJob.cc
index 6e2951f597..e607bd9a3e 100644
--- a/lib/api/CAnomalyJob.cc
+++ b/lib/api/CAnomalyJob.cc
@@ -1701,25 +1701,42 @@ const std::string* CAnomalyJob::fieldValue(const std::string& fieldName,
     return !fieldName.empty() && fieldValue.empty() ? nullptr : &fieldValue;
 }
 
-void CAnomalyJob::addRecord(const TAnomalyDetectorPtr& detector,
-                            core_t::TTime time,
-                            const TStrStrUMap& dataRowFields) {
-    model::CAnomalyDetector::TStrCPtrVec fieldValues;
-    const TStrVec& fieldNames = detector->fieldsOfInterest();
+void CAnomalyJob::prepareTruncatedFieldValues(
+    const TStrVec& fieldNames,
+    const TStrStrUMap& dataRowFields,
+    model::CAnomalyDetector::TStrCPtrVec& fieldValues,
+    TStrVec& truncatedCopies) {
+
     fieldValues.reserve(fieldNames.size());
-    TStrVec truncatedCopies;
+    truncatedCopies.reserve(fieldNames.size());
+
     for (const auto& fieldName : fieldNames) {
         const std::string* value = fieldValue(fieldName, dataRowFields);
         if (value != nullptr && model::CFieldValueTruncator::needsTruncation(*value)) {
             truncatedCopies.push_back(model::CFieldValueTruncator::truncated(*value));
             fieldValues.push_back(&truncatedCopies.back());
-            LOG_WARN(<< "Field '" << fieldName << "' value exceeds "
-                     << model::CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH
-                     << " characters and has been truncated");
+
+            std::string escapedFieldName = fieldName;
+            core::CStringUtils::escape('\\', "\n\r\t", escapedFieldName);
+            LOG_WARN(<< "Field '" << escapedFieldName
+                     << "' value (length=" << value->size()
+                     << ", prefix='" << value->substr(0, std::min<std::size_t>(50, value->size()))
+                     << "...') exceeds " << model::CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH
+                     << " characters and has been truncated with collision-safe hash suffix");
         } else {
             fieldValues.push_back(value);
         }
     }
+}
+
+void CAnomalyJob::addRecord(const TAnomalyDetectorPtr& detector,
+                            core_t::TTime time,
+                            const TStrStrUMap& dataRowFields) {
+    model::CAnomalyDetector::TStrCPtrVec fieldValues;
+    TStrVec truncatedCopies;
+    const TStrVec& fieldNames = detector->fieldsOfInterest();
+
+    prepareTruncatedFieldValues(fieldNames, dataRowFields, fieldValues, truncatedCopies);
 
     detector->addRecord(time, fieldValues);
 }
diff --git a/lib/api/CDataProcessor.cc b/lib/api/CDataProcessor.cc
index 61b792417b..3ffdb4915e 100644
--- a/lib/api/CDataProcessor.cc
+++ b/lib/api/CDataProcessor.cc
@@ -53,7 +53,7 @@ std::string CDataProcessor::debugPrintRecord(const TStrStrUMap& dataRowFields) {
         fieldNames.append(rowIter->first);
         const std::string& val = rowIter->second;
         if (model::CFieldValueTruncator::needsTruncation(val)) {
-            fieldValues.append(model::CFieldValueTruncator::truncated(val));
+            fieldValues.append(val.substr(0, model::CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH));
             fieldValues.append("...");
         } else {
             fieldValues.append(val);
diff --git a/lib/model/unittest/CFieldValueTruncatorTest.cc b/lib/model/unittest/CFieldValueTruncatorTest.cc
index 447ab7250d..65c3777da3 100644
--- a/lib/model/unittest/CFieldValueTruncatorTest.cc
+++ b/lib/model/unittest/CFieldValueTruncatorTest.cc
@@ -18,19 +18,23 @@ BOOST_AUTO_TEST_SUITE(CFieldValueTruncatorTest)
 using namespace ml;
 using namespace model;
 
+// ============================================================================
+// Constraint Enforcement Behavior
+// ============================================================================
+
 BOOST_AUTO_TEST_CASE(testShortValueUnchanged) {
     std::string value("short");
     BOOST_REQUIRE_EQUAL(false, CFieldValueTruncator::truncate(value));
     BOOST_REQUIRE_EQUAL("short", value);
 }
 
-BOOST_AUTO_TEST_CASE(testExactLimitUnchanged) {
+BOOST_AUTO_TEST_CASE(testValueAtExactLimitUnchanged) {
     std::string value(CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH, 'x');
     BOOST_REQUIRE_EQUAL(false, CFieldValueTruncator::truncate(value));
     BOOST_REQUIRE_EQUAL(CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH, value.size());
 }
 
-BOOST_AUTO_TEST_CASE(testOversizedValueTruncated) {
+BOOST_AUTO_TEST_CASE(testOversizedValueEnforcedTo256Chars) {
     std::string value(1000, 'x');
     BOOST_REQUIRE_EQUAL(true, CFieldValueTruncator::truncate(value));
     BOOST_REQUIRE_EQUAL(CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH, value.size());
@@ -42,7 +46,7 @@ BOOST_AUTO_TEST_CASE(testEmptyValueUnchanged) {
     BOOST_REQUIRE_EQUAL(0, value.size());
 }
 
-BOOST_AUTO_TEST_CASE(testConstOverloadReturnsNewString) {
+BOOST_AUTO_TEST_CASE(testConstOverloadPreservesOriginal) {
     const std::string longValue(1000, 'x');
     std::string result = CFieldValueTruncator::truncated(longValue);
     BOOST_REQUIRE_EQUAL(CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH, result.size());
@@ -55,7 +59,7 @@ BOOST_AUTO_TEST_CASE(testConstOverloadShortValueReturnsSame) {
     BOOST_REQUIRE_EQUAL("short", result);
 }
 
-BOOST_AUTO_TEST_CASE(testVeryLargeValueTruncated) {
+BOOST_AUTO_TEST_CASE(testVeryLargeValueFromIssue2796) {
     std::string value(77000, 'y');
     BOOST_REQUIRE_EQUAL(true, CFieldValueTruncator::truncate(value));
     BOOST_REQUIRE_EQUAL(CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH, value.size());
@@ -72,4 +76,101 @@ BOOST_AUTO_TEST_CASE(testNeedsTruncation) {
         true, CFieldValueTruncator::needsTruncation(std::string(77000, 'x')));
 }
 
+// ============================================================================
+// Hash Suffix Format Validation
+// ============================================================================
+
+BOOST_AUTO_TEST_CASE(testTruncatedValueHasCorrectFormat) {
+    std::string value(1000, 'x');
+    std::string result = CFieldValueTruncator::truncated(value);
+
+    // Format: 240 prefix + '$' + 15 hex chars = 256 total
+    BOOST_REQUIRE_EQUAL(256, result.size());
+    BOOST_REQUIRE_EQUAL(CFieldValueTruncator::HASH_SEPARATOR, result[240]);
+
+    // Prefix should match original
+    BOOST_REQUIRE_EQUAL(0, result.compare(0, 240, value, 0, 240));
+
+    // Hash portion should be lowercase hex digits
+    for (std::size_t i = 241; i < 256; ++i) {
+        BOOST_REQUIRE(std::isxdigit(result[i]));
+        BOOST_REQUIRE((result[i] >= '0' && result[i] <= '9') ||
+                     (result[i] >= 'a' && result[i] <= 'f'));
+    }
+}
+
+BOOST_AUTO_TEST_CASE(testInPlaceTruncationPreservesFormat) {
+    std::string value(1000, 'z');
+    bool wasTruncated = CFieldValueTruncator::truncate(value);
+
+    BOOST_REQUIRE_EQUAL(true, wasTruncated);
+    BOOST_REQUIRE_EQUAL(256, value.size());
+    BOOST_REQUIRE_EQUAL(CFieldValueTruncator::HASH_SEPARATOR, value[240]);
+
+    // Verify hash portion is valid hex
+    for (std::size_t i = 241; i < 256; ++i) {
+        BOOST_REQUIRE(std::isxdigit(value[i]));
+    }
+}
+
+// ============================================================================
+// Collision Prevention (Data Integrity)
+// ============================================================================
+
+BOOST_AUTO_TEST_CASE(testDistinctValuesProduceDistinctResults) {
+    std::string prefix(240, 'x');
+    std::string value1 = prefix + std::string(1000, 'A');
+    std::string value2 = prefix + std::string(1000, 'B');
+
+    std::string truncated1 = CFieldValueTruncator::truncated(value1);
+    std::string truncated2 = CFieldValueTruncator::truncated(value2);
+
+    // Same prefix
+    BOOST_REQUIRE_EQUAL(truncated1.substr(0, 241), truncated2.substr(0, 241));
+
+    // But different hash suffixes prevent collision
+    BOOST_REQUIRE_NE(truncated1.substr(241), truncated2.substr(241));
+    BOOST_REQUIRE_NE(truncated1, truncated2);
+}
+
+BOOST_AUTO_TEST_CASE(testCollisionsPreventedByHashSuffix) {
+    // Two values differing only after position 256 (original collision case)
+    std::string value1(300, 'x');
+    value1.replace(280, 20, "AAAAAAAAAAAAAAAAAAAA");
+
+    std::string value2(300, 'x');
+    value2.replace(280, 20, "BBBBBBBBBBBBBBBBBBBB");
+
+    std::string truncated1 = CFieldValueTruncator::truncated(value1);
+    std::string truncated2 = CFieldValueTruncator::truncated(value2);
+
+    // Must be distinct despite identical first 240 chars
+    BOOST_REQUIRE_NE(truncated1, truncated2);
+    BOOST_REQUIRE_EQUAL(CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH, truncated1.size());
+    BOOST_REQUIRE_EQUAL(CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH, truncated2.size());
+}
+
+BOOST_AUTO_TEST_CASE(testDeterministicHashing) {
+    std::string value(1000, 'y');
+    std::string result1 = CFieldValueTruncator::truncated(value);
+    std::string result2 = CFieldValueTruncator::truncated(value);
+
+    BOOST_REQUIRE_EQUAL(result1, result2);
+}
+
+BOOST_AUTO_TEST_CASE(testVeryLongValueWithDistinctEnding) {
+    // Simulate the 77K influencer case from issue #2796: values differ only near the end
+    std::string value1(77000, 'x');
+    value1.replace(76990, 9, "VARIANT_A"); // count == literal length, so size stays 77000
+
+    std::string value2(77000, 'x');
+    value2.replace(76990, 9, "VARIANT_B"); // same span as value1; only the last letter differs
+
+    std::string truncated1 = CFieldValueTruncator::truncated(value1);
+    std::string truncated2 = CFieldValueTruncator::truncated(value2);
+
+    // Must be distinct despite identical first 240 chars
+    BOOST_REQUIRE_NE(truncated1, truncated2);
+}
+
 BOOST_AUTO_TEST_SUITE_END()

From 50379b457975e225ca82bed2af7699e9c4d37911 Mon Sep 17 00:00:00 2001
From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com>
Date: Thu, 19 Mar 2026 10:24:04 +0100
Subject: [PATCH 04/11] Change hash suffix to 16 hex digits (full 64-bit hash)

---
 include/api/CAnomalyJob.h                     |  6 ++--
 include/model/CFieldValueTruncator.h          | 34 +++++++++----------
 lib/api/CAnomalyJob.cc                        | 13 ++++---
 .../unittest/CFieldValueTruncatorTest.cc      | 26 +++++++-------
 4 files changed, 38 insertions(+), 41 deletions(-)

diff --git a/include/api/CAnomalyJob.h b/include/api/CAnomalyJob.h
index 4167c649a7..fd1cc4bc88 100644
--- a/include/api/CAnomalyJob.h
+++ b/include/api/CAnomalyJob.h
@@ -400,9 +400,9 @@ class API_EXPORT CAnomalyJob : public CDataProcessor {
     //! \param fieldValues Output vector of pointers to field values
     //! \param truncatedCopies Storage for truncated copies (must remain valid while fieldValues is used)
     static void prepareTruncatedFieldValues(const TStrVec& fieldNames,
-                                           const TStrStrUMap& dataRowFields,
-                                           model::CAnomalyDetector::TStrCPtrVec& fieldValues,
-                                           TStrVec& truncatedCopies);
+                                            const TStrStrUMap& dataRowFields,
+                                            model::CAnomalyDetector::TStrCPtrVec& fieldValues,
+                                            TStrVec& truncatedCopies);
 
     //! Parses a control message requesting that model state be persisted.
     //! Extracts optional arguments to be used for persistence.
diff --git a/include/model/CFieldValueTruncator.h b/include/model/CFieldValueTruncator.h
index f62c1f6d26..b044f1adfc 100644
--- a/include/model/CFieldValueTruncator.h
+++ b/include/model/CFieldValueTruncator.h
@@ -30,12 +30,12 @@ namespace model {
 //!
 //! Values exceeding MAX_FIELD_VALUE_LENGTH (256 chars) are transformed using
 //! collision-safe truncation:
-//!   - Retain PREFIX_LENGTH (240) characters of original value
+//!   - Retain PREFIX_LENGTH (239) characters of original value
 //!   - Append HASH_SEPARATOR ('$')
-//!   - Append HASH_HEX_DIGITS (15) character hex hash of complete original value
+//!   - Append HASH_HEX_DIGITS (16) character hex hash of complete original value
 //!
-//! Format: "<prefix_240_chars>$<hash_15_hex_chars>"
-//! Example: "very_long_field_value_that_exceeds_limit_and_continues_for_thousands_of_chars_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx$a1b2c3d4e5f6789"
+//! Format: "<prefix_239_chars>$<hash_16_hex_chars>"
+//! Example: "very_long_field_value_that_exceeds_limit_and_continues_for_thousands_of_chars_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx$a1b2c3d4e5f67890"
 //!
 //! The 256-character limit aligns with Elasticsearch's ignore_above default
 //! for keyword fields. The hash suffix ensures data integrity while maintaining
@@ -49,21 +49,19 @@ class MODEL_EXPORT CFieldValueTruncator {
 
     //! Collision prevention format components
     static constexpr char HASH_SEPARATOR = '$';
-    static constexpr std::size_t HASH_HEX_DIGITS = 15; // 15 hex chars for uint64_t
-    static constexpr std::size_t HASH_SUFFIX_LENGTH =
-        1 /* separator */ + HASH_HEX_DIGITS; // 16 total
+    static constexpr std::size_t HASH_HEX_DIGITS = 16; // 16 hex chars = full 64-bit hash
+    static constexpr std::size_t HASH_SUFFIX_LENGTH = 1 /* separator */ + HASH_HEX_DIGITS; // 17 total
 
     //! Content prefix length (readable portion after truncation)
-    static constexpr std::size_t PREFIX_LENGTH =
-        MAX_FIELD_VALUE_LENGTH - HASH_SUFFIX_LENGTH; // 240
+    static constexpr std::size_t PREFIX_LENGTH = MAX_FIELD_VALUE_LENGTH - HASH_SUFFIX_LENGTH; // 239
 
     // Domain invariants (enforced at compile-time)
     static_assert(PREFIX_LENGTH + HASH_SUFFIX_LENGTH == MAX_FIELD_VALUE_LENGTH,
                   "Term field format invariant: prefix + suffix = total length");
     static_assert(PREFIX_LENGTH >= 200,
                   "Readable prefix must be substantial for human comprehension");
-    static_assert(HASH_HEX_DIGITS * 4 <= 64,
-                  "Hash hex digits must fit in 64-bit hash output");
+    static_assert(HASH_HEX_DIGITS * 4 == 64,
+                  "Hash hex digits must represent full 64-bit hash output");
 
     //! Check if a term field value exceeds the domain constraint.
     //! \return true if the value requires length enforcement
@@ -114,9 +112,9 @@ class MODEL_EXPORT CFieldValueTruncator {
         //! Compute collision-resistant identity hash.
         //! Uses safeMurmurHash64 (endian-neutral) for state persistence safety.
         static std::uint64_t compute(const std::string& value) {
-            return core::CHashing::safeMurmurHash64(
-                value.data(), static_cast<int>(value.size()),
-                0); // Fixed seed for determinism
+            return core::CHashing::safeMurmurHash64(value.data(),
+                                                    static_cast<int>(value.size()),
+                                                    0); // Fixed seed for determinism
         }
 
         //! Format 64-bit hash as zero-padded lowercase hex string.
@@ -124,9 +122,9 @@ class MODEL_EXPORT CFieldValueTruncator {
         //! \param[out] buffer Must be at least HASH_HEX_DIGITS + 1 bytes
         //! \return Pointer to null-terminated hex string in buffer
         static const char* toHex(std::uint64_t hash, char* buffer) {
-            // %015llx produces 15-char zero-padded lowercase hex
-            std::snprintf(buffer, HASH_HEX_DIGITS + 1, "%015llx",
-                         static_cast<unsigned long long>(hash));
+            // %016llx produces 16-char zero-padded lowercase hex (full 64 bits)
+            std::snprintf(buffer, HASH_HEX_DIGITS + 1, "%016llx",
+                          static_cast<unsigned long long>(hash));
             return buffer;
         }
     };
@@ -135,7 +133,7 @@ class MODEL_EXPORT CFieldValueTruncator {
     //! \param originalValue Complete untruncated value for hash computation
     //! \param[in,out] prefix Truncated prefix to which suffix is appended
     static void appendCollisionPreventionSuffix(const std::string& originalValue,
-                                                 std::string& prefix) {
+                                                std::string& prefix) {
         std::uint64_t identityHash = HashEncoding::compute(originalValue);
 
         prefix.reserve(MAX_FIELD_VALUE_LENGTH);
diff --git a/lib/api/CAnomalyJob.cc b/lib/api/CAnomalyJob.cc
index e607bd9a3e..af3256e037 100644
--- a/lib/api/CAnomalyJob.cc
+++ b/lib/api/CAnomalyJob.cc
@@ -1701,11 +1701,10 @@ const std::string* CAnomalyJob::fieldValue(const std::string& fieldName,
     return !fieldName.empty() && fieldValue.empty() ? nullptr : &fieldValue;
 }
 
-void CAnomalyJob::prepareTruncatedFieldValues(
-    const TStrVec& fieldNames,
-    const TStrStrUMap& dataRowFields,
-    model::CAnomalyDetector::TStrCPtrVec& fieldValues,
-    TStrVec& truncatedCopies) {
+void CAnomalyJob::prepareTruncatedFieldValues(const TStrVec& fieldNames,
+                                              const TStrStrUMap& dataRowFields,
+                                              model::CAnomalyDetector::TStrCPtrVec& fieldValues,
+                                              TStrVec& truncatedCopies) {
 
     fieldValues.reserve(fieldNames.size());
     truncatedCopies.reserve(fieldNames.size());
@@ -1719,8 +1718,8 @@ void CAnomalyJob::prepareTruncatedFieldValues(
             std::string escapedFieldName = fieldName;
             core::CStringUtils::escape('\\', "\n\r\t", escapedFieldName);
             LOG_WARN(<< "Field '" << escapedFieldName
-                     << "' value (length=" << value->size()
-                     << ", prefix='" << value->substr(0, std::min<std::size_t>(50, value->size()))
+                     << "' value (length=" << value->size() << ", prefix='"
+                     << value->substr(0, std::min<std::size_t>(50, value->size()))
                      << "...') exceeds " << model::CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH
                      << " characters and has been truncated with collision-safe hash suffix");
         } else {
diff --git a/lib/model/unittest/CFieldValueTruncatorTest.cc b/lib/model/unittest/CFieldValueTruncatorTest.cc
index 65c3777da3..6cc61174e2 100644
--- a/lib/model/unittest/CFieldValueTruncatorTest.cc
+++ b/lib/model/unittest/CFieldValueTruncatorTest.cc
@@ -84,18 +84,18 @@ BOOST_AUTO_TEST_CASE(testTruncatedValueHasCorrectFormat) {
     std::string value(1000, 'x');
     std::string result = CFieldValueTruncator::truncated(value);
 
-    // Format: 240 prefix + '$' + 15 hex chars = 256 total
+    // Format: 239 prefix + '$' + 16 hex chars = 256 total
     BOOST_REQUIRE_EQUAL(256, result.size());
-    BOOST_REQUIRE_EQUAL(CFieldValueTruncator::HASH_SEPARATOR, result[240]);
+    BOOST_REQUIRE_EQUAL(CFieldValueTruncator::HASH_SEPARATOR, result[239]);
 
     // Prefix should match original
-    BOOST_REQUIRE_EQUAL(0, result.compare(0, 240, value, 0, 240));
+    BOOST_REQUIRE_EQUAL(0, result.compare(0, 239, value, 0, 239));
 
     // Hash portion should be lowercase hex digits
-    for (std::size_t i = 241; i < 256; ++i) {
+    for (std::size_t i = 240; i < 256; ++i) {
         BOOST_REQUIRE(std::isxdigit(result[i]));
         BOOST_REQUIRE((result[i] >= '0' && result[i] <= '9') ||
-                     (result[i] >= 'a' && result[i] <= 'f'));
+                      (result[i] >= 'a' && result[i] <= 'f'));
     }
 }
 
@@ -105,10 +105,10 @@ BOOST_AUTO_TEST_CASE(testInPlaceTruncationPreservesFormat) {
 
     BOOST_REQUIRE_EQUAL(true, wasTruncated);
     BOOST_REQUIRE_EQUAL(256, value.size());
-    BOOST_REQUIRE_EQUAL(CFieldValueTruncator::HASH_SEPARATOR, value[240]);
+    BOOST_REQUIRE_EQUAL(CFieldValueTruncator::HASH_SEPARATOR, value[239]);
 
     // Verify hash portion is valid hex
-    for (std::size_t i = 241; i < 256; ++i) {
+    for (std::size_t i = 240; i < 256; ++i) {
         BOOST_REQUIRE(std::isxdigit(value[i]));
     }
 }
@@ -118,18 +118,18 @@ BOOST_AUTO_TEST_CASE(testInPlaceTruncationPreservesFormat) {
 // ============================================================================
 
 BOOST_AUTO_TEST_CASE(testDistinctValuesProduceDistinctResults) {
-    std::string prefix(240, 'x');
+    std::string prefix(239, 'x');
     std::string value1 = prefix + std::string(1000, 'A');
     std::string value2 = prefix + std::string(1000, 'B');
 
     std::string truncated1 = CFieldValueTruncator::truncated(value1);
     std::string truncated2 = CFieldValueTruncator::truncated(value2);
 
-    // Same prefix
-    BOOST_REQUIRE_EQUAL(truncated1.substr(0, 241), truncated2.substr(0, 241));
+    // Same prefix (239 chars + separator)
+    BOOST_REQUIRE_EQUAL(truncated1.substr(0, 240), truncated2.substr(0, 240));
 
     // But different hash suffixes prevent collision
-    BOOST_REQUIRE_NE(truncated1.substr(241), truncated2.substr(241));
+    BOOST_REQUIRE_NE(truncated1.substr(240), truncated2.substr(240));
     BOOST_REQUIRE_NE(truncated1, truncated2);
 }
 
@@ -144,7 +144,7 @@ BOOST_AUTO_TEST_CASE(testCollisionsPreventedByHashSuffix) {
     std::string truncated1 = CFieldValueTruncator::truncated(value1);
     std::string truncated2 = CFieldValueTruncator::truncated(value2);
 
-    // Must be distinct despite identical first 240 chars
+    // Must be distinct despite identical first 239 chars
     BOOST_REQUIRE_NE(truncated1, truncated2);
     BOOST_REQUIRE_EQUAL(CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH, truncated1.size());
     BOOST_REQUIRE_EQUAL(CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH, truncated2.size());
@@ -169,7 +169,7 @@ BOOST_AUTO_TEST_CASE(testVeryLongValueWithDistinctEnding) {
     std::string truncated1 = CFieldValueTruncator::truncated(value1);
     std::string truncated2 = CFieldValueTruncator::truncated(value2);
 
-    // Must be distinct despite identical first 240 chars
+    // Must be distinct despite identical first 239 chars
     BOOST_REQUIRE_NE(truncated1, truncated2);
 }
 

From dd955e5a82bb0fb11f9d9d7720f5f7fef177b1f0 Mon Sep 17 00:00:00 2001
From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com>
Date: Thu, 19 Mar 2026 10:56:16 +0100
Subject: [PATCH 05/11] Add tests

---
 docs/CHANGELOG.asciidoc                       |  2 +-
 lib/api/CAnomalyJob.cc                        |  1 +
 lib/model/CBucketGatherer.cc                  |  7 +-
 .../unittest/CEventRateDataGathererTest.cc    | 75 ++++++++++++++++++
 lib/model/unittest/CMetricDataGathererTest.cc | 77 +++++++++++++++++++
 5 files changed, 157 insertions(+), 5 deletions(-)

diff --git a/docs/CHANGELOG.asciidoc b/docs/CHANGELOG.asciidoc
index 8da062cb0b..5c2485db47 100644
--- a/docs/CHANGELOG.asciidoc
+++ b/docs/CHANGELOG.asciidoc
@@ -32,7 +32,7 @@
 
 === Bug Fixes
 
-* Truncate oversized field values to prevent autodetect process crash. (See {ml-pull}2929[#2929].)
+* Truncate oversized field values to prevent autodetect process crash. (See {ml-pull}2929[#2929], {es-pull}143180[#143180], issue: {ml-issue}2796[#2796].)
 * Report RSS in bytes instead of pages. (See {ml-pull}2917[#2917].)
 
 === Enhancements
diff --git a/lib/api/CAnomalyJob.cc b/lib/api/CAnomalyJob.cc
index af3256e037..8082f6c08f 100644
--- a/lib/api/CAnomalyJob.cc
+++ b/lib/api/CAnomalyJob.cc
@@ -1707,6 +1707,7 @@ void CAnomalyJob::prepareTruncatedFieldValues(const TStrVec& fieldNames,
                                               TStrVec& truncatedCopies) {
 
     fieldValues.reserve(fieldNames.size());
+    // Reserve ensures no reallocation invalidates pointers stored in fieldValues.
     truncatedCopies.reserve(fieldNames.size());
 
     for (const auto& fieldName : fieldNames) {
diff --git a/lib/model/CBucketGatherer.cc b/lib/model/CBucketGatherer.cc
index 520561e7be..70e9f95114 100644
--- a/lib/model/CBucketGatherer.cc
+++ b/lib/model/CBucketGatherer.cc
@@ -116,10 +116,9 @@ bool restoreInfluencerPersonAttributeCounts(core::CStateRestoreTraverser& traver
         const std::string& name = traverser.name();
         RESTORE_BUILT_IN(PERSON_UID_TAG, person)
         RESTORE_BUILT_IN(ATTRIBUTE_UID_TAG, attribute)
-        RESTORE_NO_ERROR(INFLUENCER_TAG, influence = traverser.value())
-        if (name == INFLUENCER_TAG) {
-            CFieldValueTruncator::truncate(influence);
-        }
+        RESTORE_NO_ERROR(INFLUENCER_TAG, influence = traverser.value();
+                         CFieldValueTruncator::truncate(influence))
+
         if (name == COUNT_TAG) {
             if (core::CStringUtils::stringToType(traverser.value(), count) == false) {
                 LOG_ERROR(<< "Failed to restore COUNT_TAG, got " << traverser.value());
diff --git a/lib/model/unittest/CEventRateDataGathererTest.cc b/lib/model/unittest/CEventRateDataGathererTest.cc
index 859d0e204d..b540b38245 100644
--- a/lib/model/unittest/CEventRateDataGathererTest.cc
+++ b/lib/model/unittest/CEventRateDataGathererTest.cc
@@ -1891,4 +1891,79 @@ BOOST_FIXTURE_TEST_CASE(testDiurnalFeatures, CDiurnalTestFixture) {
     }
 }
 
+BOOST_FIXTURE_TEST_CASE(testRestoreTruncatesOversizedInfluencerValues, CTestFixture) {
+    // Verify that oversized influencer field values persisted in old state
+    // are truncated on restore. This exercises truncation in:
+    //   - CBucketGatherer::restoreInfluencerPersonAttributeCounts
+    //   - CEventRateBucketGatherer::restoreInfluencerUniqueStrings
+
+    constexpr core_t::TTime startTime = 0;
+    constexpr core_t::TTime bucketLength = 600;
+    SModelParams params(bucketLength);
+    params.s_LatencyBuckets = 2;
+
+    TFeatureVec features;
+    features.push_back(model_t::E_IndividualUniqueCountByBucketAndPerson);
+    TStrVec influencerFieldNames{"IF1"};
+
+    CDataGatherer gatherer =
+        CDataGathererBuilder(model_t::E_EventRate, features, params, key, startTime)
+            .personFieldName("P")
+            .valueFieldName("V")
+            .influenceFieldNames(influencerFieldNames)
+            .build();
+
+    BOOST_REQUIRE_EQUAL(0, addPerson(gatherer, m_ResourceMonitor, "p", "v", 1));
+
+    // Add arrivals with an oversized influencer value (bypasses CAnomalyJob input truncation).
+    std::string const oversizedInfluencer(500, 'x');
+    addArrival(gatherer, m_ResourceMonitor, startTime + 1, "p", "val1", oversizedInfluencer);
+    addArrival(gatherer, m_ResourceMonitor, startTime + 2, "p", "val2", oversizedInfluencer);
+
+    // Persist — the JSON will contain the oversized influencer value.
+    std::ostringstream origJson;
+    core::CJsonStatePersistInserter::persist(
+        origJson, [&gatherer](core::CJsonStatePersistInserter& inserter) {
+            gatherer.acceptPersistInserter(inserter);
+        });
+
+    // Sanity check: the persisted JSON contains the full oversized value.
+    BOOST_TEST_REQUIRE(origJson.str().find(oversizedInfluencer) != std::string::npos);
+
+    // Restore from persisted JSON — truncation should apply.
+    std::istringstream origJsonStrm{"{\"topLevel\" : " + origJson.str() + "}"};
+    core::CJsonStateRestoreTraverser traverser(origJsonStrm);
+
+    CBucketGatherer::SBucketGathererInitData bucketGathererInitData{
+        EMPTY_STRING, EMPTY_STRING, EMPTY_STRING, EMPTY_STRING, {}, 0, 0};
+    CDataGatherer restoredGatherer(model_t::E_EventRate, model_t::E_None,
+                                   params, EMPTY_STRING, key,
+                                   bucketGathererInitData, traverser);
+
+    // Persist restored gatherer — should NOT contain the oversized value.
+    std::ostringstream restoredJson;
+    core::CJsonStatePersistInserter::persist(
+        restoredJson, [&restoredGatherer](core::CJsonStatePersistInserter& inserter) {
+            restoredGatherer.acceptPersistInserter(inserter);
+        });
+
+    // The full 500-char string must no longer appear (it was truncated to 256).
+    BOOST_TEST_REQUIRE(restoredJson.str().find(oversizedInfluencer) == std::string::npos);
+
+    // Verify idempotency: restore again and persist — should be identical.
+    std::istringstream restoredJsonStrm{"{\"topLevel\" : " + restoredJson.str() + "}"};
+    core::CJsonStateRestoreTraverser traverser2(restoredJsonStrm);
+    CDataGatherer restoredGatherer2(model_t::E_EventRate, model_t::E_None,
+                                    params, EMPTY_STRING, key,
+                                    bucketGathererInitData, traverser2);
+
+    std::ostringstream restoredJson2;
+    core::CJsonStatePersistInserter::persist(
+        restoredJson2, [&restoredGatherer2](core::CJsonStatePersistInserter& inserter) {
+            restoredGatherer2.acceptPersistInserter(inserter);
+        });
+
+    BOOST_REQUIRE_EQUAL(restoredJson.str(), restoredJson2.str());
+}
+
 BOOST_AUTO_TEST_SUITE_END()
diff --git a/lib/model/unittest/CMetricDataGathererTest.cc b/lib/model/unittest/CMetricDataGathererTest.cc
index 76feaf0546..0941e95827 100644
--- a/lib/model/unittest/CMetricDataGathererTest.cc
+++ b/lib/model/unittest/CMetricDataGathererTest.cc
@@ -1843,4 +1843,81 @@ BOOST_FIXTURE_TEST_CASE(testVarp, CTestFixture) {
     }
 }
 
+BOOST_FIXTURE_TEST_CASE(testRestoreTruncatesOversizedInfluencerSums, CTestFixture) {
+    // Verify that oversized influencer keys in CGathererTools::SInfluencerSumSerializer
+    // are truncated on restore. This exercises truncation in the metric sum gatherer's
+    // influencer bucket sum restore path.
+
+    constexpr core_t::TTime startTime = 0;
+    constexpr core_t::TTime bucketLength = 600;
+    SModelParams params(bucketLength);
+    params.s_LatencyBuckets = 2;
+    params.s_SampleCountFactor = 1;
+    params.s_SampleQueueGrowthFactor = 0.1;
+
+    TFeatureVec features;
+    features.push_back(model_t::E_IndividualSumByBucketAndPerson);
+    TStrVec const influencerNames{"i1"};
+
+    CDataGatherer gatherer =
+        CDataGathererBuilder(model_t::E_Metric, features, params, KEY, startTime)
+            .influenceFieldNames(influencerNames)
+            .sampleCountOverride(2U)
+            .build();
+
+    BOOST_REQUIRE_EQUAL(0, addPerson("p", gatherer, m_ResourceMonitor, 1));
+
+    // Add arrivals with an oversized influencer value (bypasses CAnomalyJob input truncation).
+    std::string const oversizedInfluencer(500, 'y');
+    addArrival(gatherer, m_ResourceMonitor, startTime + 1, "p", 1.0,
+               oversizedInfluencer, "");
+    addArrival(gatherer, m_ResourceMonitor, startTime + 2, "p", 2.0,
+               oversizedInfluencer, "");
+
+    // Persist — the JSON will contain the oversized influencer value.
+    std::ostringstream origJson;
+    core::CJsonStatePersistInserter::persist(
+        origJson, [&gatherer](core::CJsonStatePersistInserter& inserter) {
+            gatherer.acceptPersistInserter(inserter);
+        });
+
+    // Sanity check: the persisted JSON contains the full oversized value.
+    BOOST_TEST_REQUIRE(origJson.str().find(oversizedInfluencer) != std::string::npos);
+
+    // Restore from persisted JSON — truncation should apply.
+    std::istringstream origJsonStrm{"{\"topLevel\" : " + origJson.str() + "}"};
+    core::CJsonStateRestoreTraverser traverser(origJsonStrm);
+
+    CBucketGatherer::SBucketGathererInitData bucketGathererInitData{
+        EMPTY_STRING, EMPTY_STRING, EMPTY_STRING, EMPTY_STRING, {}, 0, 0};
+    CDataGatherer restoredGatherer(model_t::E_Metric, model_t::E_None,
+                                   params, EMPTY_STRING, KEY,
+                                   bucketGathererInitData, traverser);
+
+    // Persist restored gatherer — should NOT contain the oversized value.
+    std::ostringstream restoredJson;
+    core::CJsonStatePersistInserter::persist(
+        restoredJson, [&restoredGatherer](core::CJsonStatePersistInserter& inserter) {
+            restoredGatherer.acceptPersistInserter(inserter);
+        });
+
+    // The full 500-char string must no longer appear (it was truncated to 256).
+    BOOST_TEST_REQUIRE(restoredJson.str().find(oversizedInfluencer) == std::string::npos);
+
+    // Verify idempotency: restore again and persist — should be identical.
+    std::istringstream restoredJsonStrm{"{\"topLevel\" : " + restoredJson.str() + "}"};
+    core::CJsonStateRestoreTraverser traverser2(restoredJsonStrm);
+    CDataGatherer restoredGatherer2(model_t::E_Metric, model_t::E_None,
+                                    params, EMPTY_STRING, KEY,
+                                    bucketGathererInitData, traverser2);
+
+    std::ostringstream restoredJson2;
+    core::CJsonStatePersistInserter::persist(
+        restoredJson2, [&restoredGatherer2](core::CJsonStatePersistInserter& inserter) {
+            restoredGatherer2.acceptPersistInserter(inserter);
+        });
+
+    BOOST_REQUIRE_EQUAL(restoredJson.str(), restoredJson2.str());
+}
+
 BOOST_AUTO_TEST_SUITE_END()

From 3f6471f63c0dd665039830ef676b73f5936c3f35 Mon Sep 17 00:00:00 2001
From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com>
Date: Thu, 19 Mar 2026 11:08:24 +0100
Subject: [PATCH 06/11] formatting

---
 .../unittest/CEventRateDataGathererTest.cc    | 22 +++++++++----------
 lib/model/unittest/CMetricDataGathererTest.cc | 16 +++++---------
 2 files changed, 16 insertions(+), 22 deletions(-)

diff --git a/lib/model/unittest/CEventRateDataGathererTest.cc b/lib/model/unittest/CEventRateDataGathererTest.cc
index b540b38245..70e6c55bea 100644
--- a/lib/model/unittest/CEventRateDataGathererTest.cc
+++ b/lib/model/unittest/CEventRateDataGathererTest.cc
@@ -1906,12 +1906,12 @@ BOOST_FIXTURE_TEST_CASE(testRestoreTruncatesOversizedInfluencerValues, CTestFixt
     features.push_back(model_t::E_IndividualUniqueCountByBucketAndPerson);
     TStrVec influencerFieldNames{"IF1"};
 
-    CDataGatherer gatherer =
-        CDataGathererBuilder(model_t::E_EventRate, features, params, key, startTime)
-            .personFieldName("P")
-            .valueFieldName("V")
-            .influenceFieldNames(influencerFieldNames)
-            .build();
+    CDataGatherer gatherer = CDataGathererBuilder(model_t::E_EventRate, features,
+                                                  params, key, startTime)
+                                 .personFieldName("P")
+                                 .valueFieldName("V")
+                                 .influenceFieldNames(influencerFieldNames)
+                                 .build();
 
     BOOST_REQUIRE_EQUAL(0, addPerson(gatherer, m_ResourceMonitor, "p", "v", 1));
 
@@ -1936,9 +1936,8 @@ BOOST_FIXTURE_TEST_CASE(testRestoreTruncatesOversizedInfluencerValues, CTestFixt
 
     CBucketGatherer::SBucketGathererInitData bucketGathererInitData{
         EMPTY_STRING, EMPTY_STRING, EMPTY_STRING, EMPTY_STRING, {}, 0, 0};
-    CDataGatherer restoredGatherer(model_t::E_EventRate, model_t::E_None,
-                                   params, EMPTY_STRING, key,
-                                   bucketGathererInitData, traverser);
+    CDataGatherer restoredGatherer(model_t::E_EventRate, model_t::E_None, params,
+                                   EMPTY_STRING, key, bucketGathererInitData, traverser);
 
     // Persist restored gatherer — should NOT contain the oversized value.
     std::ostringstream restoredJson;
@@ -1953,9 +1952,8 @@ BOOST_FIXTURE_TEST_CASE(testRestoreTruncatesOversizedInfluencerValues, CTestFixt
     // Verify idempotency: restore again and persist — should be identical.
     std::istringstream restoredJsonStrm{"{\"topLevel\" : " + restoredJson.str() + "}"};
     core::CJsonStateRestoreTraverser traverser2(restoredJsonStrm);
-    CDataGatherer restoredGatherer2(model_t::E_EventRate, model_t::E_None,
-                                    params, EMPTY_STRING, key,
-                                    bucketGathererInitData, traverser2);
+    CDataGatherer restoredGatherer2(model_t::E_EventRate, model_t::E_None, params,
+                                    EMPTY_STRING, key, bucketGathererInitData, traverser2);
 
     std::ostringstream restoredJson2;
     core::CJsonStatePersistInserter::persist(
diff --git a/lib/model/unittest/CMetricDataGathererTest.cc b/lib/model/unittest/CMetricDataGathererTest.cc
index 0941e95827..b3085727c7 100644
--- a/lib/model/unittest/CMetricDataGathererTest.cc
+++ b/lib/model/unittest/CMetricDataGathererTest.cc
@@ -1869,10 +1869,8 @@ BOOST_FIXTURE_TEST_CASE(testRestoreTruncatesOversizedInfluencerSums, CTestFixtur
 
     // Add arrivals with an oversized influencer value (bypasses CAnomalyJob input truncation).
     std::string const oversizedInfluencer(500, 'y');
-    addArrival(gatherer, m_ResourceMonitor, startTime + 1, "p", 1.0,
-               oversizedInfluencer, "");
-    addArrival(gatherer, m_ResourceMonitor, startTime + 2, "p", 2.0,
-               oversizedInfluencer, "");
+    addArrival(gatherer, m_ResourceMonitor, startTime + 1, "p", 1.0, oversizedInfluencer, "");
+    addArrival(gatherer, m_ResourceMonitor, startTime + 2, "p", 2.0, oversizedInfluencer, "");
 
     // Persist — the JSON will contain the oversized influencer value.
     std::ostringstream origJson;
@@ -1890,9 +1888,8 @@ BOOST_FIXTURE_TEST_CASE(testRestoreTruncatesOversizedInfluencerSums, CTestFixtur
 
     CBucketGatherer::SBucketGathererInitData bucketGathererInitData{
         EMPTY_STRING, EMPTY_STRING, EMPTY_STRING, EMPTY_STRING, {}, 0, 0};
-    CDataGatherer restoredGatherer(model_t::E_Metric, model_t::E_None,
-                                   params, EMPTY_STRING, KEY,
-                                   bucketGathererInitData, traverser);
+    CDataGatherer restoredGatherer(model_t::E_Metric, model_t::E_None, params, EMPTY_STRING,
+                                   KEY, bucketGathererInitData, traverser);
 
     // Persist restored gatherer — should NOT contain the oversized value.
     std::ostringstream restoredJson;
@@ -1907,9 +1904,8 @@ BOOST_FIXTURE_TEST_CASE(testRestoreTruncatesOversizedInfluencerSums, CTestFixtur
     // Verify idempotency: restore again and persist — should be identical.
     std::istringstream restoredJsonStrm{"{\"topLevel\" : " + restoredJson.str() + "}"};
     core::CJsonStateRestoreTraverser traverser2(restoredJsonStrm);
-    CDataGatherer restoredGatherer2(model_t::E_Metric, model_t::E_None,
-                                    params, EMPTY_STRING, KEY,
-                                    bucketGathererInitData, traverser2);
+    CDataGatherer restoredGatherer2(model_t::E_Metric, model_t::E_None, params, EMPTY_STRING,
+                                    KEY, bucketGathererInitData, traverser2);
 
     std::ostringstream restoredJson2;
     core::CJsonStatePersistInserter::persist(

From ac73b637bdf677b50342063abfa54b1554166711 Mon Sep 17 00:00:00 2001
From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com>
Date: Thu, 19 Mar 2026 12:18:16 +0100
Subject: [PATCH 07/11] Clean up truncator comments, drop 77K test inputs

---
 include/model/CFieldValueTruncator.h          | 12 ++++------
 .../unittest/CDynamicStringIdRegistryTest.cc  |  2 +-
 .../unittest/CFieldValueTruncatorTest.cc      | 24 +------------------
 3 files changed, 7 insertions(+), 31 deletions(-)

diff --git a/include/model/CFieldValueTruncator.h b/include/model/CFieldValueTruncator.h
index b044f1adfc..c6df93a803 100644
--- a/include/model/CFieldValueTruncator.h
+++ b/include/model/CFieldValueTruncator.h
@@ -39,12 +39,11 @@ namespace model {
 //!
 //! The 256-character limit aligns with Elasticsearch's ignore_above default
 //! for keyword fields. The hash suffix ensures data integrity while maintaining
-//! human readability (first 240 characters visible) and compatibility with
-//! prefix-based filtering. Collision probability is ~1 in 10^18 (effectively zero).
+//! human readability (first 239 characters visible) and compatibility with
+//! prefix-based filtering. Collision probability is ~1 in 10^19 (effectively zero).
 class MODEL_EXPORT CFieldValueTruncator {
 public:
-    //! Domain constraint: Maximum length for term fields in anomaly detection.
-    //! Aligned with Elasticsearch's ignore_above default for keyword fields.
+    //! Maximum length for term fields in anomaly detection.
     static constexpr std::size_t MAX_FIELD_VALUE_LENGTH = 256;
 
     //! Collision prevention format components
@@ -86,13 +85,12 @@ class MODEL_EXPORT CFieldValueTruncator {
     }
 
     //! Enforce term field length constraint, returning constrained copy.
-    //! Original value unchanged. For performance, call needsTruncation() first
-    //! to avoid copying when constraint is already satisfied.
+    //! Original value unchanged.
     //! \param value Original field value
     //! \return Copy with length constraint enforced
     static std::string truncated(const std::string& value) {
         if (needsTruncation(value) == false) {
-            return value; // RVO applies
+            return value;
         }
 
         std::string result;
diff --git a/lib/model/unittest/CDynamicStringIdRegistryTest.cc b/lib/model/unittest/CDynamicStringIdRegistryTest.cc
index 60cea2b1da..b060340dde 100644
--- a/lib/model/unittest/CDynamicStringIdRegistryTest.cc
+++ b/lib/model/unittest/CDynamicStringIdRegistryTest.cc
@@ -118,7 +118,7 @@ BOOST_AUTO_TEST_CASE(testRestoreTruncatesOversizedNames) {
 
     bool addedPerson = false;
     std::string shortName("foo");
-    std::string oversizedName(77000, 'x');
+    std::string oversizedName(1000, 'x');
     registry.addName(shortName, 0, resourceMonitor, addedPerson);
     registry.addName(oversizedName, 0, resourceMonitor, addedPerson);
 
diff --git a/lib/model/unittest/CFieldValueTruncatorTest.cc b/lib/model/unittest/CFieldValueTruncatorTest.cc
index 6cc61174e2..b17a33b7ce 100644
--- a/lib/model/unittest/CFieldValueTruncatorTest.cc
+++ b/lib/model/unittest/CFieldValueTruncatorTest.cc
@@ -59,12 +59,6 @@ BOOST_AUTO_TEST_CASE(testConstOverloadShortValueReturnsSame) {
     BOOST_REQUIRE_EQUAL("short", result);
 }
 
-BOOST_AUTO_TEST_CASE(testVeryLargeValueFromIssue2796) {
-    std::string value(77000, 'y');
-    BOOST_REQUIRE_EQUAL(true, CFieldValueTruncator::truncate(value));
-    BOOST_REQUIRE_EQUAL(CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH, value.size());
-}
-
 BOOST_AUTO_TEST_CASE(testNeedsTruncation) {
     BOOST_REQUIRE_EQUAL(false, CFieldValueTruncator::needsTruncation("short"));
     BOOST_REQUIRE_EQUAL(false, CFieldValueTruncator::needsTruncation(""));
@@ -72,8 +66,7 @@ BOOST_AUTO_TEST_CASE(testNeedsTruncation) {
                                    CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH, 'x')));
     BOOST_REQUIRE_EQUAL(true, CFieldValueTruncator::needsTruncation(std::string(
                                   CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH + 1, 'x')));
-    BOOST_REQUIRE_EQUAL(
-        true, CFieldValueTruncator::needsTruncation(std::string(77000, 'x')));
+    BOOST_REQUIRE_EQUAL(true, CFieldValueTruncator::needsTruncation(std::string(1000, 'x')));
 }
 
 // ============================================================================
@@ -158,19 +151,4 @@ BOOST_AUTO_TEST_CASE(testDeterministicHashing) {
     BOOST_REQUIRE_EQUAL(result1, result2);
 }
 
-BOOST_AUTO_TEST_CASE(testVeryLongValueWithDistinctEnding) {
-    // Simulate the 77K influencer case from issue #2796
-    std::string value1(77000, 'x');
-    value1.replace(76990, 10, "VARIANT_A");
-
-    std::string value2(77000, 'x');
-    value2.replace(76990, 10, "VARIANT_B");
-
-    std::string truncated1 = CFieldValueTruncator::truncated(value1);
-    std::string truncated2 = CFieldValueTruncator::truncated(value2);
-
-    // Must be distinct despite identical first 239 chars
-    BOOST_REQUIRE_NE(truncated1, truncated2);
-}
-
 BOOST_AUTO_TEST_SUITE_END()

From 222697199aa69e2f220cb75c82549802659187c1 Mon Sep 17 00:00:00 2001
From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com>
Date: Thu, 19 Mar 2026 12:32:09 +0100
Subject: [PATCH 08/11] Fix oversized influencer sums restore test

---
 lib/model/unittest/CMetricDataGathererTest.cc | 25 +++++++++++++++++--
 1 file changed, 23 insertions(+), 2 deletions(-)

diff --git a/lib/model/unittest/CMetricDataGathererTest.cc b/lib/model/unittest/CMetricDataGathererTest.cc
index b3085727c7..57bdc9838e 100644
--- a/lib/model/unittest/CMetricDataGathererTest.cc
+++ b/lib/model/unittest/CMetricDataGathererTest.cc
@@ -111,6 +111,24 @@ void addArrival(CDataGatherer& gatherer,
     gatherer.addArrival(fieldValues, eventData, resourceMonitor);
 }
 
+void addArrival(CDataGatherer& gatherer,
+                CResourceMonitor& resourceMonitor,
+                core_t::TTime time,
+                const std::string& person,
+                double value,
+                const std::string& influencer) {
+    CDataGatherer::TStrCPtrVec fieldValues;
+    fieldValues.push_back(&person);
+    fieldValues.push_back(influencer.empty() ? nullptr : &influencer);
+    std::string const valueAsString(core::CStringUtils::typeToString(value));
+    fieldValues.push_back(&valueAsString);
+
+    CEventData eventData;
+    eventData.time(time);
+
+    gatherer.addArrival(fieldValues, eventData, resourceMonitor);
+}
+
 void addArrival(CDataGatherer& gatherer,
                 CResourceMonitor& resourceMonitor,
                 core_t::TTime time,
@@ -1869,8 +1887,11 @@ BOOST_FIXTURE_TEST_CASE(testRestoreTruncatesOversizedInfluencerSums, CTestFixtur
 
     // Add arrivals with an oversized influencer value (bypasses CAnomalyJob input truncation).
     std::string const oversizedInfluencer(500, 'y');
-    addArrival(gatherer, m_ResourceMonitor, startTime + 1, "p", 1.0, oversizedInfluencer, "");
-    addArrival(gatherer, m_ResourceMonitor, startTime + 2, "p", 2.0, oversizedInfluencer, "");
+    addArrival(gatherer, m_ResourceMonitor, startTime + 1, "p", 1.0, oversizedInfluencer);
+    addArrival(gatherer, m_ResourceMonitor, startTime + 2, "p", 2.0, oversizedInfluencer);
+
+    // Advance past the first bucket so influencer sums are flushed to the persistable queue.
+    gatherer.timeNow(startTime + bucketLength);
 
     // Persist — the JSON will contain the oversized influencer value.
     std::ostringstream origJson;

From 72c3d6b89aaf393cf6402c47ac98af5e92bdea5a Mon Sep 17 00:00:00 2001
From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com>
Date: Thu, 19 Mar 2026 14:37:49 +0100
Subject: [PATCH 09/11] Address review: throttle truncation log, assert on state

---
 include/core/CLoggerThrottler.h               |  3 +-
 include/core/LogMacros.h                      | 18 +++++++
 include/model/CFieldValueTruncator.h          |  7 +--
 lib/api/CAnomalyJob.cc                        | 11 +++--
 lib/api/CDataProcessor.cc                     |  3 +-
 lib/api/unittest/CAnomalyJobTest.cc           | 47 +++++++++++++++++--
 .../unittest/CEventRateDataGathererTest.cc    |  5 ++
 lib/model/unittest/CMetricDataGathererTest.cc |  5 ++
 8 files changed, 83 insertions(+), 16 deletions(-)

diff --git a/include/core/CLoggerThrottler.h b/include/core/CLoggerThrottler.h
index c6a4505128..58b89c4d8b 100644
--- a/include/core/CLoggerThrottler.h
+++ b/include/core/CLoggerThrottler.h
@@ -30,7 +30,8 @@ namespace core {
 //! This is thread safe but uses a very simple strategy: all accesses to a single
 //! hash map are sychronised. We assume that log throttling is only applied to
 //! messages which normally occur infrequently; for example, this is only currently
-//! applied to WARN and ERROR level logging (see LogMacros.h). So there will be
+//! applied to WARN, ERROR, and throttled INFO (LOG_INFO_THROTTLED) logging
+//! (see LogMacros.h). So there will be
 //! little contention. Furthermore, the overhead of locking and unlocking the mutex
 //! should be neglible compared to the work done if the log line were actually
 //! emitted. So this should actually give a significant performance improvement
diff --git a/include/core/LogMacros.h b/include/core/LogMacros.h
index 0c84a88d21..abb96afa21 100644
--- a/include/core/LogMacros.h
+++ b/include/core/LogMacros.h
@@ -83,6 +83,24 @@
     BOOST_LOG_STREAM_SEV(ml::core::CLogger::instance().logger(), ml::core::CLogger::E_Info) \
     LOG_LOCATION_INFO                                                                       \
     message
+#ifdef LOG_INFO_THROTTLED
+#undef LOG_INFO_THROTTLED
+#endif
+#define LOG_INFO_THROTTLED(message)                                                        \
+    do {                                                                                   \
+        std::size_t countOfInfoMessages;                                                   \
+        bool skipInfoMessage;                                                               \
+        std::tie(countOfInfoMessages, skipInfoMessage) =                                    \
+            ml::core::CLogger::instance().throttler().skip(__FILE__, __LINE__);             \
+        if (skipInfoMessage == false) {                                                     \
+            BOOST_LOG_STREAM_SEV(ml::core::CLogger::instance().logger(),                   \
+                                 ml::core::CLogger::E_Info)                                \
+            LOG_LOCATION_INFO                                                               \
+            message << (countOfInfoMessages > 1                                             \
+                            ? " | repeated [" + std::to_string(countOfInfoMessages) + "]"  \
+                            : "");                                                          \
+        }                                                                                   \
+    } while (0)
 #ifdef LOG_WARN
 #undef LOG_WARN
 #endif
diff --git a/include/model/CFieldValueTruncator.h b/include/model/CFieldValueTruncator.h
index c6df93a803..6c0b94aa60 100644
--- a/include/model/CFieldValueTruncator.h
+++ b/include/model/CFieldValueTruncator.h
@@ -35,12 +35,12 @@ namespace model {
 //!   - Append HASH_HEX_DIGITS (16) character hex hash of complete original value
 //!
 //! Format: "<prefix_239_chars>$<hash_16_hex_chars>"
-//! Example: "very_long_field_value_that_exceeds_limit_and_continues_for_thousands_of_chars_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx$a1b2c3d4e5f67890"
+//! Example: "very_long_field_value_that_exceeds_limit_(...)$a1b2c3d4e5f67890"
 //!
 //! The 256-character limit aligns with Elasticsearch's ignore_above default
 //! for keyword fields. The hash suffix ensures data integrity while maintaining
 //! human readability (first 239 characters visible) and compatibility with
-//! prefix-based filtering. Collision probability is ~1 in 10^19 (effectively zero).
+//! prefix-based filtering.
 class MODEL_EXPORT CFieldValueTruncator {
 public:
     //! Maximum length for term fields in anomaly detection.
@@ -59,8 +59,6 @@ class MODEL_EXPORT CFieldValueTruncator {
                   "Term field format invariant: prefix + suffix = total length");
     static_assert(PREFIX_LENGTH >= 200,
                   "Readable prefix must be substantial for human comprehension");
-    static_assert(HASH_HEX_DIGITS * 4 == 64,
-                  "Hash hex digits must represent full 64-bit hash output");
 
     //! Check if a term field value exceeds the domain constraint.
     //! \return true if the value requires length enforcement
@@ -85,7 +83,6 @@ class MODEL_EXPORT CFieldValueTruncator {
     }
 
     //! Enforce term field length constraint, returning constrained copy.
-    //! Original value unchanged.
     //! \param value Original field value
     //! \return Copy with length constraint enforced
     static std::string truncated(const std::string& value) {
diff --git a/lib/api/CAnomalyJob.cc b/lib/api/CAnomalyJob.cc
index 8082f6c08f..374becaf37 100644
--- a/lib/api/CAnomalyJob.cc
+++ b/lib/api/CAnomalyJob.cc
@@ -1718,11 +1718,12 @@ void CAnomalyJob::prepareTruncatedFieldValues(const TStrVec& fieldNames,
 
             std::string escapedFieldName = fieldName;
             core::CStringUtils::escape('\\', "\n\r\t", escapedFieldName);
-            LOG_WARN(<< "Field '" << escapedFieldName
-                     << "' value (length=" << value->size() << ", prefix='"
-                     << value->substr(0, std::min<std::size_t>(50, value->size()))
-                     << "...') exceeds " << model::CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH
-                     << " characters and has been truncated with collision-safe hash suffix");
+            LOG_INFO_THROTTLED(
+                << "Field '" << escapedFieldName
+                << "' value (length=" << value->size() << ", prefix='"
+                << value->substr(0, std::min<std::size_t>(50, value->size()))
+                << "...') exceeds " << model::CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH
+                << " characters and has been truncated with collision-safe hash suffix");
         } else {
             fieldValues.push_back(value);
         }
diff --git a/lib/api/CDataProcessor.cc b/lib/api/CDataProcessor.cc
index 3ffdb4915e..cb796e19a1 100644
--- a/lib/api/CDataProcessor.cc
+++ b/lib/api/CDataProcessor.cc
@@ -53,8 +53,7 @@ std::string CDataProcessor::debugPrintRecord(const TStrStrUMap& dataRowFields) {
         fieldNames.append(rowIter->first);
         const std::string& val = rowIter->second;
         if (model::CFieldValueTruncator::needsTruncation(val)) {
-            fieldValues.append(val.substr(0, model::CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH));
-            fieldValues.append("...");
+            fieldValues.append(model::CFieldValueTruncator::truncated(val));
         } else {
             fieldValues.append(val);
         }
diff --git a/lib/api/unittest/CAnomalyJobTest.cc b/lib/api/unittest/CAnomalyJobTest.cc
index 681a316641..3e9a9da8f3 100644
--- a/lib/api/unittest/CAnomalyJobTest.cc
+++ b/lib/api/unittest/CAnomalyJobTest.cc
@@ -17,6 +17,7 @@
 
 #include <model/CAnomalyDetectorModelConfig.h>
 #include <model/CDataGatherer.h>
+#include <model/CFieldValueTruncator.h>
 #include <model/CLimits.h>
 
 #include <api/CAnomalyJobConfig.h>
@@ -1207,6 +1208,10 @@ BOOST_AUTO_TEST_CASE(testHierarchicalResultsNormalizerShouldIncreaseMemoryUsage)
 }
 
 BOOST_AUTO_TEST_CASE(testOversizedFieldValuesTruncated) {
+    // Verify that addRecord (via prepareTruncatedFieldValues) truncates oversized
+    // by/influencer values before they enter the model. We assert on persisted
+    // state because that reflects what the detector stored; if addRecord did
+    // not truncate, the full value would appear here.
     model::CLimits limits;
     api::CAnomalyJobConfig jobConfig = CTestAnomalyJob::makeSimpleJobConfig(
         "count", "", "by_field", "", "", {"influencer_field"});
@@ -1218,13 +1223,32 @@ BOOST_AUTO_TEST_CASE(testOversizedFieldValuesTruncated) {
 
     CTestAnomalyJob job("job", limits, jobConfig, modelConfig, wrappedOutputStream);
 
-    std::string const oversizedValue(77000, 'x');
+    std::string const oversizedValue(1000, 'x');
     CTestAnomalyJob::TStrStrUMap dataRows{{"time", "1000"},
                                           {"by_field", oversizedValue},
                                           {"influencer_field", oversizedValue}};
 
     BOOST_TEST_REQUIRE(job.handleRecord(dataRows));
     BOOST_REQUIRE_EQUAL(uint64_t(1), job.numRecordsHandled());
+
+    // Advance past bucket boundary so results are output and state can be persisted.
+    CTestAnomalyJob::TStrStrUMap advanceRows{{"time", "5000"},
+                                             {"by_field", oversizedValue},
+                                             {"influencer_field", oversizedValue}};
+    BOOST_TEST_REQUIRE(job.handleRecord(advanceRows));
+    BOOST_REQUIRE_EQUAL(uint64_t(2), job.numRecordsHandled());
+
+    std::ostringstream* strm{nullptr};
+    api::CSingleStreamDataAdder::TOStreamP ptr{strm = new std::ostringstream()};
+    api::CSingleStreamDataAdder persister{ptr};
+    BOOST_TEST_REQUIRE(job.persistStateInForeground(persister, ""));
+    std::string const persistedState{strm->str()};
+
+    // Full oversized value must not be in state (addRecord truncated before store).
+    BOOST_TEST_REQUIRE(persistedState.find(oversizedValue) == std::string::npos);
+    // Persisted state must contain the truncated form produced by input truncation.
+    std::string const expectedTruncated = model::CFieldValueTruncator::truncated(oversizedValue);
+    BOOST_TEST_REQUIRE(persistedState.find(expectedTruncated) != std::string::npos);
 }
 
 BOOST_AUTO_TEST_CASE(testNormalFieldValuesNotTruncated) {
@@ -1245,6 +1269,20 @@ BOOST_AUTO_TEST_CASE(testNormalFieldValuesNotTruncated) {
 
     BOOST_TEST_REQUIRE(job.handleRecord(dataRows));
     BOOST_REQUIRE_EQUAL(uint64_t(1), job.numRecordsHandled());
+
+    // Advance past bucket boundary so results are output and state can be persisted.
+    CTestAnomalyJob::TStrStrUMap advanceRows{
+        {"time", "5000"}, {"by_field", normalValue}, {"influencer_field", normalValue}};
+    BOOST_TEST_REQUIRE(job.handleRecord(advanceRows));
+    BOOST_REQUIRE_EQUAL(uint64_t(2), job.numRecordsHandled());
+
+    std::ostringstream* strm{nullptr};
+    api::CSingleStreamDataAdder::TOStreamP ptr{strm = new std::ostringstream()};
+    api::CSingleStreamDataAdder persister{ptr};
+    BOOST_TEST_REQUIRE(job.persistStateInForeground(persister, ""));
+    std::string const persistedState{strm->str()};
+
+    BOOST_TEST_REQUIRE(persistedState.find(normalValue) != std::string::npos);
 }
 
 BOOST_AUTO_TEST_CASE(testDebugPrintRecordTruncatesLongValues) {
@@ -1252,8 +1290,11 @@ BOOST_AUTO_TEST_CASE(testDebugPrintRecordTruncatesLongValues) {
     record["field1"] = std::string(1000, 'x');
     record["field2"] = "short";
     std::string result = api::CDataProcessor::debugPrintRecord(record);
-    BOOST_TEST_REQUIRE(result.find("...") != std::string::npos);
-    BOOST_TEST_REQUIRE(result.size() < 1500);
+    // truncated() produces prefix + '$' + 16 hex chars; full 1000-char value not present
+    BOOST_TEST_REQUIRE(result.find(std::string(1000, 'x')) == std::string::npos);
+    BOOST_TEST_REQUIRE(result.find(model::CFieldValueTruncator::HASH_SEPARATOR) !=
+                       std::string::npos);
+    BOOST_TEST_REQUIRE(result.size() < 500);
 }
 
 BOOST_AUTO_TEST_SUITE_END()
diff --git a/lib/model/unittest/CEventRateDataGathererTest.cc b/lib/model/unittest/CEventRateDataGathererTest.cc
index 70e6c55bea..2a231a6cb9 100644
--- a/lib/model/unittest/CEventRateDataGathererTest.cc
+++ b/lib/model/unittest/CEventRateDataGathererTest.cc
@@ -17,6 +17,7 @@
 #include <model/CDataGatherer.h>
 #include <model/CEventData.h>
 #include <model/CEventRateBucketGatherer.h>
+#include <model/CFieldValueTruncator.h>
 #include <model/CResourceMonitor.h>
 #include <model/CSearchKey.h>
 #include <model/ModelTypes.h>
@@ -1948,6 +1949,10 @@ BOOST_FIXTURE_TEST_CASE(testRestoreTruncatesOversizedInfluencerValues, CTestFixt
 
     // The full 500-char string must no longer appear (it was truncated to 256).
     BOOST_TEST_REQUIRE(restoredJson.str().find(oversizedInfluencer) == std::string::npos);
+    // Restore-path truncation must produce the same format as CFieldValueTruncator::truncated.
+    std::string const expectedTruncated =
+        model::CFieldValueTruncator::truncated(oversizedInfluencer);
+    BOOST_TEST_REQUIRE(restoredJson.str().find(expectedTruncated) != std::string::npos);
 
     // Verify idempotency: restore again and persist — should be identical.
     std::istringstream restoredJsonStrm{"{\"topLevel\" : " + restoredJson.str() + "}"};
diff --git a/lib/model/unittest/CMetricDataGathererTest.cc b/lib/model/unittest/CMetricDataGathererTest.cc
index 57bdc9838e..0413897891 100644
--- a/lib/model/unittest/CMetricDataGathererTest.cc
+++ b/lib/model/unittest/CMetricDataGathererTest.cc
@@ -18,6 +18,7 @@
 
 #include <model/CDataGatherer.h>
 #include <model/CEventData.h>
+#include <model/CFieldValueTruncator.h>
 #include <model/CGathererTools.h>
 #include <model/CMetricBucketGatherer.h>
 #include <model/CResourceMonitor.h>
@@ -1921,6 +1922,10 @@ BOOST_FIXTURE_TEST_CASE(testRestoreTruncatesOversizedInfluencerSums, CTestFixtur
 
     // The full 500-char string must no longer appear (it was truncated to 256).
     BOOST_TEST_REQUIRE(restoredJson.str().find(oversizedInfluencer) == std::string::npos);
+    // Restore-path truncation must produce the same format as CFieldValueTruncator::truncated.
+    std::string const expectedTruncated =
+        model::CFieldValueTruncator::truncated(oversizedInfluencer);
+    BOOST_TEST_REQUIRE(restoredJson.str().find(expectedTruncated) != std::string::npos);
 
     // Verify idempotency: restore again and persist — should be identical.
     std::istringstream restoredJsonStrm{"{\"topLevel\" : " + restoredJson.str() + "}"};

From 691e8cffccebd9690999383871533779f2349d48 Mon Sep 17 00:00:00 2001
From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com>
Date: Thu, 19 Mar 2026 14:43:46 +0100
Subject: [PATCH 10/11] Realign LOG_INFO_THROTTLED macro line continuations

---
 include/core/LogMacros.h | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/include/core/LogMacros.h b/include/core/LogMacros.h
index abb96afa21..a66a6f5dda 100644
--- a/include/core/LogMacros.h
+++ b/include/core/LogMacros.h
@@ -86,20 +86,20 @@
 #ifdef LOG_INFO_THROTTLED
 #undef LOG_INFO_THROTTLED
 #endif
-#define LOG_INFO_THROTTLED(message)                                                        \
-    do {                                                                                   \
-        std::size_t countOfInfoMessages;                                                   \
-        bool skipInfoMessage;                                                               \
-        std::tie(countOfInfoMessages, skipInfoMessage) =                                    \
-            ml::core::CLogger::instance().throttler().skip(__FILE__, __LINE__);             \
-        if (skipInfoMessage == false) {                                                     \
-            BOOST_LOG_STREAM_SEV(ml::core::CLogger::instance().logger(),                   \
-                                 ml::core::CLogger::E_Info)                                \
-            LOG_LOCATION_INFO                                                               \
-            message << (countOfInfoMessages > 1                                             \
-                            ? " | repeated [" + std::to_string(countOfInfoMessages) + "]"  \
-                            : "");                                                          \
-        }                                                                                   \
+#define LOG_INFO_THROTTLED(message)                                                       \
+    do {                                                                                  \
+        std::size_t countOfInfoMessages;                                                  \
+        bool skipInfoMessage;                                                             \
+        std::tie(countOfInfoMessages, skipInfoMessage) =                                  \
+            ml::core::CLogger::instance().throttler().skip(__FILE__, __LINE__);           \
+        if (skipInfoMessage == false) {                                                   \
+            BOOST_LOG_STREAM_SEV(ml::core::CLogger::instance().logger(),                  \
+                                 ml::core::CLogger::E_Info)                               \
+            LOG_LOCATION_INFO                                                             \
+            message << (countOfInfoMessages > 1                                           \
+                            ? " | repeated [" + std::to_string(countOfInfoMessages) + "]" \
+                            : "");                                                        \
+        }                                                                                 \
     } while (0)
 #ifdef LOG_WARN
 #undef LOG_WARN

From d905ec02a3e3ad54627ca94fbc156d9508b78e33 Mon Sep 17 00:00:00 2001
From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com>
Date: Mon, 22 Jun 2026 09:46:57 +0200
Subject: [PATCH 11/11] Delete docs/CHANGELOG.asciidoc

---
 docs/CHANGELOG.asciidoc | 949 ----------------------------------------
 1 file changed, 949 deletions(-)
 delete mode 100644 docs/CHANGELOG.asciidoc

diff --git a/docs/CHANGELOG.asciidoc b/docs/CHANGELOG.asciidoc
deleted file mode 100644
index 5c2485db47..0000000000
--- a/docs/CHANGELOG.asciidoc
+++ /dev/null
@@ -1,949 +0,0 @@
-// Use these for links to issue and pulls. Note issues and pulls redirect one to
-// each other on Github, so don't worry too much on using the right prefix.
-//:issue:           https://github.com/elastic/elasticsearch/issues/
-//:ml-issue:        https://github.com/elastic/ml-cpp/issues/
-//:pull:            https://github.com/elastic/elasticsearch/pull/
-//:ml-pull:         https://github.com/elastic/ml-cpp/pull/
-
-= Elasticsearch Release Notes
-
-//
-// To add a release, copy and paste the following text,  uncomment the relevant
-// sections, and add a link to the new section in the list of releases at the
-// top of the page. Note that release subheads must be floated and sections
-// cannot be empty.
-// TEMPLATE:
-
-// == {es} version n.n.n
-
-//=== Breaking Changes
-
-//=== Deprecations
-
-//=== New Features
-
-//=== Enhancements
-
-//=== Bug Fixes
-
-//=== Regressions
-
-== {es} version 9.4.0
-
-=== Bug Fixes
-
-* Truncate oversized field values to prevent autodetect process crash. (See {ml-pull}2929[#2929], {es-pull}143180[#143180], issue: {ml-issue}2796[#2796].)
-* Report RSS in bytes instead of pages. (See {ml-pull}2917[#2917].)
-
-=== Enhancements
-
-* Better handling of invalid JSON state documents (See {ml-pull}[]#2895].)
-* Better error handling regarding quantiles state documents (See {ml-pull}[#2894])
-
-== {es} version 9.3.0
-
-=== Enhancements
-
-* Downgrade log severity for a batch of recoverable errors. (See {ml-pull}[#2889].)
-
-== {es} version 9.2.0
-
-=== Enhancements
-
-* Update the PyTorch library to version 2.7.1. (See {ml-pull}2863[#2863].)
-* Report the actual memory usage of the autodetect process. (See {ml-pull}2846[#2846])
-* Improve adherence to memory limits for the bucket gatherer. (See {ml-pull}2848[#2848].)
-
-== {es} version 9.1.0
-
-=== Enhancements
-
-* Track memory used in the hierarchical results normalizer. (See {ml-pull}2831[#2831].)
-
-=== Bug Fixes
-
-== {es} version 9.0.0
-
-=== Enhancements
-
-* Update Linux build images to Rocky Linux 8 with gcc 13.3. (See {ml-pull}2773[#2773].)
-
-== {es} version 8.19.0
-
-=== Enhancements
-
-* Better messaging regarding OOM process termination. (See {ml-pull}2841[#2841].)
-
-== {es} version 8.18.0
-
-=== Enhancements
-
-* Update the PyTorch library to version 2.5.1. (See {ml-pull}2783[#2798], {ml-pull}2799[#2799].)
-* Upgrade Boost libraries to version 1.86. (See {ml-pull}2780[#2780], {ml-pull}2779[#2779].)
-
-== {es} version 8.17.7
-
-=== Enhancements
-* Restrict file system access for PyTorch models (See {ml-pull}2851[#2851].)
-
-== {es} version 8.16.6
-
-=== Bug Fixes
-
-* Correct handling of config updates. (See {ml-pull}2821[#2821].)
-
-== {es} version 8.16.4
-
-=== Bug Fixes
-
-* Increase the upper limits for the Boost.JSON SAX parser. (See {ml-pull}2809[#2809].)
-
-== {es} version 8.16.0
-
-=== Enhancements
-
-* Allow the user to force a detector to shift time series state by a specific amount.
-  (See {ml-pull}2695[#2695].)
-
-=== Bug Fixes
-
-* Allow for pytorch_inference results to include zero-dimensional tensors.
-
-== {es} version 8.15.4
-
-=== Bug Fixes
-
-* Fix parameter initialization for large forecasting models. (See {ml-pull}2759[#2759].)
-
-== {es} version 8.15.2
-
-=== Enhancements
-
-* Update the Pytorch library to version 2.3.1. (See {ml-pull}2688[#2688].)
-
-=== Bug Fixes
-
-* Allow for pytorch_inference results to include zero-dimensional tensors.
-
-== {es} version 8.15.1
-
-== {es} version 8.15.0
-
-=== Enhancements
-
-* Log 'No statistics at.. ' message as a warning. (See {ml-pull}2684[#2684].)
-
-=== Bug Fixes
-
-* Fix "stack use after scope" memory error. (See {ml-pull}2673[#2673].)
-* Handle any exception thrown by inference. (See {ml-pull}2680[#2680].)
-
-== {es} version 8.14.1
-
-=== Bug Fixes
-
-* Handle any exception thrown by inference. (See {ml-pull}2680[#2680].)
-
-== {es} version 8.14.1
-
-=== Enhancements
-
-* Improve memory allocation management for JSON processing to reduce memory usage.
-  (See {ml-pull}2679[#2679].)
-
-== {es} version 8.14.0
-
-=== Bug Fixes
-
-* Remove ineffective optimizations for duplicate strings. (See {ml-pull}2652[#2652], issue: {ml-issue}2130[#2130].)
-* Use custom Boost.JSON resource allocator. (See {ml-pull}2674[#2674].)
-
-== {es} version 8.13.0
-
-=== Enhancements
-
-* Use Boost.JSON for JSON processing. (See {ml-pull}2614[#2614].)
-* Upgrade Pytorch to version 2.1.2. (See {ml-pull}2588[#2588].)
-* Upgrade zlib to version 1.2.13 on Windows. (See {ml-pull}2588[#2588].)
-* Better handling of number of allocations in pytorch_inference in the case that
-  hardware_concurrency fails. We were previously forcing maximum number of allocations
-  to be one in this case, we now allow what is requested. (See {ml-pull}2607[#2607].)
-* Upgrade MKL to version 2024.0 on Linux x86_64. (See {ml-pull}2619[#2619].)
-
-== {es} version 8.12.0
-
-=== Enhancements
-
-* Upgrade Boost libraries to version 1.83. (See {ml-pull}2560[#2560].)
-
-=== Bug Fixes
-
-* Ensure the estimated latitude is within the allowed range (See {ml-pull}2586[#2586].)
-* Remove dependency on the IPEX library (See {ml-pull}2605[#2605] and {ml-pull}2606[#2606].)
-
-== {es} version 8.11.2
-
-=== Enhancements
-
-* Improve forecasting for time series with step changes. (See {ml-pull}2591[#2591],
-  issue: {ml-issue}2466[#2466]).
-
-== {es} version 8.11.0
-
-=== Enhancements
-
-* Add support for PyTorch models quantized with Intel Extension for PyTorch. This feature is _only_ available on `linux_x86_64`. (See {ml-pull}2547[#2547]).
-
-== {es} version 8.10.3
-
-=== Bug Fixes
-* Fix for lost inference requests when writing to the cache times out leading to processing to stall on the Elasticsearch side. (See {ml-pull}2576[#2576].)
-
-== {es} version 8.9.0
-
-=== Enhancements
-
-* Improved compliance with memory limitations. (See {ml-pull}2469[#2469].)
-* Improve detection of time shifts, for example for day light saving. (See {ml-pull}2479[#2479].)
-* Improve detection of calendar cyclic components with long bucket lengths. (See {ml-pull}2493[#2493].)
-
-=== Bug Fixes
-* Prevent high memory usage by evaluating batch inference singularly. (See {ml-pull}2538[#2538].)
-* Catch exceptions thrown during inference and report as errors. (See {ml-pull}2542[#2542].)
-
-== {es} version 8.8.0
-
-=== Enhancements
-
-* Anomaly score explanation for rare detector. (See {ml-pull}2449[#2449].)
-
-== {es} version 8.7.0
-
-=== Enhancements
-
-* Add identification of multimodal distribution to anomaly explanations. (See {ml-pull}2440[#2440].)
-* Upgrade PyTorch to version 1.13.1. (See {ml-pull}2430[#2430].)
-* Remove the PyTorch inference work queue as now handled in Elasticsearch
-
-== {es} version 8.6.0
-
-=== Bug Fixes
-
-* Fix for 'No statistics' error message. (See {ml-pull}2410[#2410].)
-* Fix for 'No counts available' error message. (See {ml-pull}2414[#2414].)
-* Improve performance of closing files before spawning. (See {ml-pull}2424[#2424].)
-
-== {es} version 8.5.0
-
-=== Enhancements
-
-* Compute outlier feature influence via the Gateaux derivative to improve attribution
-  for high dimension vectors. (See {ml-pull}2256[#2256].)
-* Improve classification and regression model train runtimes for data sets with many
-  numeric features. (See {ml-pull}2380[#2380], {ml-pull}2388[#2388], {ml-pull}2390[#2390]
-  and {ml-pull}2401[#2401].)
-* Increase the limit on the maximum number of classes to 100 for training classification
-  models. (See {ml-pull}2395[#2395] issue: {ml-issue}2246[#2246].)
-
-== {es} version 8.4.2
-
-=== Bug Fixes
-
-* Do not retain categorization tokens when existing category matches. (See {ml-pull}2398[#2398].)
-
-== {es} version 8.4.0
-
-=== Enhancements
-
-* Fairer application of size penalty for model selection for training classification
-  and regression models. (See {ml-pull}2291[#2291].)
-* Accelerate training for data frame analytics by skipping fine parameter tuning if it 
-  is unnecessary. (See {ml-pull}2298[#2298].)
-* Address some causes of high runtimes training regression and classification models
-  on large data sets with many features. (See {ml-pull}2332[#2332].)
-* Add caching for PyTorch inference. (See {ml-pull}2305[#2305].)
-* Improve accuracy of anomaly detection median estimation. (See {ml-pull}2367[#2367],
-  issue: {ml-issue}2364[#2364].)
-
-=== Bug Fixes
-
-* Fix potential cause of classification and regression job failures. (See {ml-pull}2385[#2385].)
-
-== {es} version 8.3.0
-
-=== Enhancements
-
-* Upgrade PyTorch to version 1.11. (See {ml-pull}2233[#2233], {ml-pull}2235[#2235]
-  and {ml-pull}2238[#2238].)
-* Upgrade zlib to version 1.2.12 on Windows. (See {ml-pull}2253[#2253].)
-* Upgrade libxml2 to version 2.9.14 on Linux and Windows. (See {ml-pull}2287[#2287].)
-* Improve time series model stability and anomaly scoring consistency for data
-  for which many buckets are empty. (See {ml-pull}2267[#2267].)
-* Address root cause for actuals equals typical equals zero anomalies. (See {ml-pull}2270[#2270].)
-* Better handling of outliers in update immediately after detecting changes in time
-  series. (See {ml-pull}2280[#2280].)
-* Improve normalization of anomaly detection results for short bucket lengths. This
-  corrects bias which could cause our scoring to be too low for these jobs. (See,
-  {ml-pull}2285[#2285], issue: {ml-issue}2276[#2276].)
-
-=== Bug Fixes
-
-* Correct logic for restart from failover fine tuning hyperparameters for training
-  classification and regression models. (See {ml-pull}2251[#2251].)
-* Fix possible source of "x = NaN, distribution = class boost::math::normal_distribution<..."
-  log errors training classification and regression models. (See {ml-pull}2249[#2249].)
-* Fix some bugs affecting decision to stop optimising hyperparameters for training
-  classification and regression models. (See {ml-pull}2259[#2259].)
-* Fix cause of "Must provide points at which to evaluate function" log error training
-  classification and regression models. (See {ml-pull}2268[#2268].)
-* Fix a source of "Discarding sample = nan, weights = ..." log errors for time series
-  anomaly detection. (See {ml-pull}2286[#2286].)
-
-== {es} version 8.2.2
-
-=== Enhancements
-
-* Make ML native processes work with glibc 2.35 (required for Ubuntu 22.04). (See
-  {ml-pull}2272[#2272].)
-
-=== Bug Fixes
-
-* Adjacency weighting fixes in categorization. (See {ml-pull}2277[#2277].)
-
-== {es} version 8.2.1
-
-=== Bug Fixes
-
-* Fix edge case which could cause the model bounds to blow up after detecting seasonality.
-  (See {ml-pull}2261[#2261].)
-
-== {es} version 8.2.0
-
-=== Enhancements
-
-* Better handle small shifts of the seasonal patterns in time series data.
-  (See {ml-pull}2202[#2202].)
-* Limit the maximum size of classification and regression models training
-  produces so they can always be deployed for inference inside the Elastic
-  Stack. (See {ml-pull}2205[#2205].)
-* Support user defined example weights when training classification and
-  regression models. (See {ml-pull}2222[#2222].)
-* Reduce worst case bucket processing time for anomaly detection. (See {ml-pull}2225[#2225].)
-* Improve handling of low cardinality features for training classification
-  and regression models. (See {ml-pull}2229[#2229].)
-* Improve handling of extremely large outliers in time series modelling.
-  (See {ml-pull}2230[#2230].)
-* Improve detection and modeling of time series' calendar cyclic features.
-  (See {ml-pull}2236[#2236] and {ml-pull}2243[#2243].)
-* Compress quantiles state. (See {ml-pull}2252[#2252].)
-
-=== Bug Fixes
-
-* Fix possible source of "Discarding sample = -nan(ind), weight = 1, variance scale = 1"
-  log errors training classification and regression models. (See {ml-pull}2226[#2226].)
-* Fix error message for failure to create reverse search. (See {ml-pull}2247[#2247].)
-
-== {es} version 8.1.0
-
-=== Enhancements
-
-* Improve skip_model_update rule behaviour (See {ml-pull}2096[#2096].)
-* Upgrade Boost libraries to version 1.77. (See {ml-pull}2095[#2095].)
-* Upgrade RapidJSON to 31st October 2021 version. (See {ml-pull}2106[#2106].)
-* Upgrade Eigen library to version 3.4.0. (See {ml-pull}2137[#2137].)
-* Prevent over-subscription of threads in pytorch_inference. (See {ml-pull}2141[#2141].)
-
-=== Bug Fixes
-
-* Fix a bug in the tuning of the hyperparameters when training regression
-  classification models. (See {ml-pull}2128[#2128].)
-* Improve training stability for regression and classification models
-  (See {ml-pull}2144[#2144], {ml-pull}2147[#2147] and {ml-pull}2150[#2150].)
-* Avoid edge cases in the classification weights calculation to maximize
-  minimum recall which could lead to only a single class being predicted.
-  (See {ml-pull}2194[#2194].)
-* Address cause of "[CStatisticalTests.cc@102] Test statistic is nan"
-  log errors. (See {ml-pull}2196[#2196].)
-* Address possible causes of "x = NaN, distribution = N5boost4math23students_t_distribution"
-  log errors. (See {ml-pull}2197[#2197].)
-* Fix bug restoring data gatherer state for time of day and week anomaly detection
-  functions. This could lead to "No queue item for time " and "Time is out of range.
-  Returning earliest bucket index" log errors. (See {ml-pull}2213[#2213].)
-
-== {es} version 8.0.0-rc1
-
-=== Bug Fixes
-
-* Set model state compatibility version to 8.0.0. (See {ml-pull}2139[#2139].)
-
-== {es} version 8.0.0-beta1
-
-=== Enhancements
-
-* The Linux build platform for the {ml} C++ code is now CentOS 7 running gcc 10.3. (See
-  {ml-pull}2028[#2028].)
-
-== {es} version 8.0.0-alpha1
-
-=== Enhancements
-
-* The Windows build platform for the {ml} C++ code now uses Visual Studio 2019. (See
-  {ml-pull}1352[#1352].)
-* The macOS build platform for the {ml} C++ code is now Mojave running Xcode 11.3.1,
-  or Ubuntu 20.04 running clang 8 for cross compilation. (See {ml-pull}1429[#1429].)
-* The Linux build platform for the {ml} C++ code is now CentOS 7 running gcc 9.3. (See
-  {ml-pull}1170[#1170].)
-* Added a new application for evaluating PyTorch models. The app depends on LibTorch
-  - the C++ front end to PyTorch - and performs inference on models stored in the
-  TorchScript format. (See {ml-pull}1902[#1902].)
-
-
-== {es} version 7.17.0
-
-=== Bug Fixes
-
-* Avoid transient poor time series modelling after detecting new seasonal components.
-  This can affect cases where we have fast and slow repeats in the data, for example
-  30 mins and 1 day, and the job uses a short bucket length. The outcome can be transient
-  poor predictions and model bounds, and sometimes false positive anomalies. (See
-  {ml-pull}2167[#2167].)
-
-== {es} version 7.16.0
-
-=== Enhancements
-
-* Speed up training of regression and classification models. (See {ml-pull}2024[#2024].)
-* Improve concurrency for training regression and classification models. (See
-  {ml-pull}2031[#2031].)
-* Improve aspects of implementation of skip_model_update rule (See {ml-pull}2053[#2053].)
-* Make sure instrumentation captures the best hyperparameters we found for training
-  classification and regression models. (See {ml-pull}2057{#2057}.)
-
-=== Bug Fixes
-
-* Correct ANOVA for Gaussian Process we fit to the loss surface. This affects early stopping.
-  Previously, we would always stop early whether it was approproate or not. It also improves
-  the estimates of hyperparameter importances. (See {ml-pull}2073[#2073].)
-* Fix numerical instability in hyperparameter optimisation for training regression and
-  classification models. (See {ml-pull}2078[#2078].)
-* Fix numerical stability issues in time series modelling. (See {ml-pull}2083[#[2083]].)
-
-== {es} version 7.15.2
-
-=== Bug Fixes
-
-* Fix cancellation of named pipe connection on Linux if the remote end does not connect
-  within the configured timeout period. (See {ml-pull}2102[#2102].)
-
-== {es} version 7.15.0
-
-=== Enhancements
-
-* Speed up training of regression and classification models on very large data sets.
-  (See {ml-pull}1941[#1941].)
-* Improve regression and classification training accuracy for small data sets.
-  (See {ml-pull}1960[#1960].)
-* Prune models for split fields (by, partition) that haven't seen data updates for
-  a given period of time. (See {ml-pull}1962[#1962].)
-
-=== Bug Fixes
-
-* Fix potential "process stopped unexpectedly: Fatal error" for training regression
-  and classification models. (See {ml-pull}1997[#1997], issue {ml-pull}1956[#1956].)
-
-== {es} version 7.14.0
-
-=== Enhancements
-
-* Give higher weight to multiple adjacent dictionary words when performing categorization. (See
-  {ml-pull}1903[#1903].)
-
-=== Bug Fixes
-
-* Make atomic operations safer for aarch64. (See {ml-pull}1893[#1893].)
-* Ensure bucket `event_count` is calculated for jobs with 1 second bucket spans.
-(See {ml-pull}1908[#1908].)
-
-== {es} version 7.13.0
-
-=== Enhancements
-
-* Speed up training of regression and classification model training for data sets
-  with many features. (See {ml-pull}1746[#1746].)
-* Avoid overfitting in final training by scaling regularizers to account for the
-  difference in the number of training examples. This results in a better match
-  between train and test error for classification and regression and often slightly
-  improved test errors. (See {ml-pull}1755[#1755].)
-* Adjust the syscall filter to allow mremap and avoid spurious audit logging.
-  (See {ml-pull}1819[#1819].)
-
-=== Bug Fixes
-
-* Ensure the same hyperparameters are chosen if classification or regression training
-  is stopped and restarted, for example, if the node fails. (See {ml-pull}1848[#1848].)
-* Fail gracefully if insufficient data are supplied for classification or regression
-  training. (See {ml-pull}1855[#1855].)
-* Fail gracefully on encountering unexpected state in restore from snapshot for anomaly
-  detection. (See {ml-pull}1872[#1872].)
-* Use appropriate memory ordering flags for aarch64 with string store to avoid excessive
-  string duplication. (See {ml-pull}1888[#1888].)
-
-== {es} version 7.12.2
-
-=== Bug Fixes
-
-* Add missing hyperparamter to the model metadata. (See {ml-pull}1867[#1867].)
-
-== {es} version 7.12.1
-
-=== Enhancements
-
-* Make ML native processes work with glibc 2.33 on x86_64. (See {ml-pull}1828[#1828].)
-
-== {es} version 7.12.0
-
-=== Enhancements
-
-* Fix edge case which could cause spurious anomalies early in the learning process
-  if the time series has non-diurnal seasonality. (See {ml-pull}1634[#1634].)
-* Compute importance of hyperparameters optimized in the fine parameter tuning step.
-  (See {ml-pull}1627[#1627].)
-* Early stopping for the fine parameter tuning step  of classification and regression
-  model training. (See {ml-pull}1676[#1676].)
-* Correct upgrade for pre-6.3 state for lat_long anomaly anomaly detectors. (See
-  {ml-pull}1681[#1681].)
-* Per tree feature bag to speed up training of regression and classification models
-  and improve scalability for large numbers of features. (See {ml-pull}1733[#1733].)
-
-=== Bug Fixes
-
-* Fix a source of instability in time series modeling for anomaly detection. This has
-  been observed to cause spurious anomalies for a partition which no longer receives
-  any data. (See {ml-pull}1675[#1675].)
-* Ensure that we stop modeling seasonality for data which flatlines. This is important
-  for count and sum detectors which treat empty buckets as zero. We could see spurious
-  anomalies in realtime detection after a partition no longer received data any data
-  as a result. (See {ml-pull}1654[#1654].)
-
-== {es} version 7.11.0
-
-=== Enhancements
-
-* During regression and classification training prefer smaller models if performance is
-  similar (See {ml-pull}1516[#1516].)
-* Add a response mechanism for commands sent to the native controller. (See
-  {ml-pull}1520[#1520], {es-pull}63542[#63542], issue: {es-issue}62823[#62823].)
-* Speed up anomaly detection for seasonal data. This is particularly effective for jobs
-  using longer bucket lengths. (See {ml-pull}1549[#1549].)
-* Fix an edge case which could cause typical and model plot bounds to blow up to around
-  max double. (See {ml-pull}1551[#1551].)
-* Estimate upper bound of potential gains before splitting a decision tree node to avoid
-  unnecessary computation. (See {ml-pull}1537[#1537].)
-* Improvements to time series modeling particularly in relation to adaption to change.
-  (See {ml-pull})1614[#1614].)
-* Warn and error log throttling. (See {ml-pull}1615[#1615].)
-* Soften the effect of fluctuations in anomaly detection job memory usage on node
-  assignment and add `assignment_memory_basis` to `model_size_stats`.
-  (See {ml-pull}1623[#1623], {es-pull}65561[#65561], issue: {es-issue}63163[#63163].)
-
-=== Bug Fixes
-
-* Fix potential cause for log errors from CXMeansOnline1d. (See {ml-pull}1586[#1586].)
-* Fix scaling of some hyperparameter for Bayesian optimization. (See {ml-pull}1612[#1612].)
-* Fix missing state in persist and restore for anomaly detection. This caused suboptimal
-  modelling after a job was closed and reopened or failed over to a different node.
-  (See {ml-pull}1668[#1668].)
-
-== {es} version 7.10.1
-
-=== Bug Fixes
-
-* Fix a bug where the peak_model_bytes value of the model_size_stats object was not
-  restored from the anomaly detector job snapshots. (See {ml-pull}1572[#1572].)
-
-== {es} version 7.10.0
-
-=== Enhancements
-
-* Calculate total feature importance to store with model metadata. (See {ml-pull}1387[#1387].)
-* Change outlier detection feature_influence format to array with nested objects. (See {ml-pull}1475[#1475], {es-pull}62068[#62068].)
-* Add timeouts to named pipe connections. (See {ml-pull}1514[#1514], {es-pull}62993[#62993], issue: {ml-issue}1504[#1504].)
-
-=== Bug Fixes
-
-* Fix progress on resume after final training has completed for classification and regression.
-  We previously showed progress stuck at zero for final training. (See {ml-pull}1443[#1443].)
-* Avoid potential "Failed to compute quantile" and "No values added to quantile sketch" log errors
-  training regression and classification models if there are features with mostly missing values.
-  (See {ml-pull}1500[#1500].)
-* Correct the anomaly detection job model state `min_version`. (See {ml-pull}1546[#1546].)
-
-== {es} version 7.9.2
-
-=== Bug Fixes
-
-* Fix reporting of peak memory usage in memory stats for data frame analytics. (See {ml-pull}1468[#1468].)
-* Fix reporting of peak memory usage in model size stats for anomaly detection. (See {ml-pull}1484[#1484].)
-
-== {es} version 7.9.0
-
-=== New Features
-
-* Report significant changes to anomaly detection models in annotations of the results.
-  (See {ml-pull}1247[#1247], {pull}56342[#56342], {pull}56417[#56417], {pull}57144[#57144], {pull}57278[#57278], {pull}57539[#57539].)
-
-=== Enhancements
-
-* Add support for larger forecasts in memory via max_model_memory setting.
-  (See {ml-pull}1238[#1238] and {pull}57254[#57254].)
-* Don't lose precision when saving model state. (See {ml-pull}1274[#1274].)
-* Parallelize the feature importance calculation for classification and regression
-  over trees. (See {ml-pull}1277[#1277].)
-* Add an option to do categorization independently for each partition.
-  (See {ml-pull}1293[#1293], {ml-pull}1318[#1318], {ml-pull}1356[#1356] and {pull}57683[#57683].)
-* Memory usage is reported during job initialization. (See {ml-pull}1294[#1294].)
-* More realistic memory estimation for classification and regression means that these
-  analyses will require lower memory limits than before (See {ml-pull}1298[#1298].)
-* Checkpoint state to allow efficient failover during coarse parameter search
-  for classification and regression. (See {ml-pull}1300[#1300].)
-* Improve data access patterns to speed up classification and regression.
-  (See {ml-pull}1312[#1312].)
-* Performance improvements for classification and regression, particularly running
-  multithreaded. (See {ml-pull}1317[#1317].)
-* Improve runtime and memory usage training deep trees for classification and
-  regression. (See {ml-pull}1340[#1340].)
-* Improvement in handling large inference model definitions. (See {ml-pull}1349[#1349].)
-* Add a peak_model_bytes field to model_size_stats. (See {ml-pull}1389[#1389].)
-
-=== Bug Fixes
-
-* Fix numerical issues leading to blow up of the model plot bounds. (See {ml-pull}1268[#1268].)
-* Fix causes for inverted forecast confidence interval bounds. (See {ml-pull}1369[#1369],
-  issue: {ml-issue}1357[#1357].)
-* Restrict growth of max matching string length for categories. (See {ml-pull}1406[#1406].)
-
-== {es} version 7.8.1
-
-=== Bug Fixes
-
-* Better interrupt handling during named pipe connection. (See {ml-pull}1311[#1311].)
-* Trap potential cause of SIGFPE. (See {ml-pull}1351[#1351], issue: {ml-issue}1348[#1348].)
-* Correct inference model definition for MSLE regression models. (See {ml-pull}1375[#1375].)
-* Fix cause of SIGSEGV of classification and regression. (See {ml-pull}1379[#1379].)
-* Fix restoration of change detectors after seasonality change. (See {ml-pull}1391[#1391].)
-* Fix potential SIGSEGV when forecasting. (See {ml-pull}1402[#1402], issue: {ml-issue}1401[#1401].)
-
-== {es} version 7.8.0
-
-=== Enhancements
-
-* Speed up anomaly detection for the lat_long function. (See {ml-pull}1102[#1102].)
-* Reduce CPU scheduling priority of native analysis processes to favor the ES JVM
-  when CPU is constrained. This change is only implemented for Linux and macOS, not
-  for Windows. (See {ml-pull}1109[#1109].)
-* Take `training_percent` into account when estimating memory usage for classification and regression.
-  (See {ml-pull}1111[#1111].)
-* Support maximize minimum recall when assigning class labels for multiclass classification.
-  (See {ml-pull}1113[#1113].)
-* Improve robustness of anomaly detection to bad input data. (See {ml-pull}1114[#1114].)
-* Adds new `num_matches` and `preferred_to_categories` fields to category output.
-  (See {ml-pull}1062[#1062])
-* Adds mean squared logarithmic error (MSLE) for regression. (See {ml-pull}1101[#1101].)
-* Adds pseudo-Huber loss for regression. (See {ml-pull}1168[#1168].)
-* Reduce peak memory usage and memory estimates for classification and regression.
-  (See {ml-pull}1125[#1125].)
-* Reduce variability of classification and regression results across our target operating systems.
-  (See {ml-pull}1127[#1127].)
-* Switched data frame analytics model memory estimates from kilobytes to megabytes.
-  (See {ml-pull}1126[#1126], issue: {issue}54506[#54506].)
-* Added a {ml} native code build for Linux on AArch64. (See {ml-pull}1132[#1132] and
-  {ml-pull}1135[#1135].)
-* Improve data frame analysis runtime by optimising memory alignment for intrinsic
-  operations. (See {ml-pull}1142[#1142].)
-* Fix spurious anomalies for count and sum functions after no data are received for long
-  periods of time. (See {ml-pull}1158[#1158].)
-* Improve false positive rates from periodicity test for time series anomaly detection.
-  (See {ml-pull}1177[#1177].)
-* Break progress reporting of data frame analyses into multiple phases. (See {ml-pull}1179[#1179].)
-* Really centre the data before training for classification and regression begins. This
-  means we can choose more optimal smoothing bias and should reduce the number of trees.
-  (See {ml-pull}1192[#1192].)
-
-=== Bug Fixes
-
-* Trap and fail if insufficient features are supplied to data frame analyses. This
-  caused classification and regression getting stuck at zero progress analyzing.
-  (See {ml-pull}1160[#1160], issue: {issue}55593[#55593].)
-* Make categorization respect the `model_memory_limit`. (See {ml-pull}1167[#1167],
-  issue: {ml-issue}1130[#1130].)
-* Respect user overrides for `max_trees` for classification and regression. (See
-  {ml-pull}1185[#1185].)
-* Reset memory status from `soft_limit` to `ok` when pruning is no longer required.
-  (See {ml-pull}1193[#1193], issue: {ml-issue}1131[#1131].)
-* Fix restore from training state for classification and regression. (See
-  {ml-pull}1197[#1197].)
-* Improve the initialization of seasonal components for anomaly detection. (See
-  {ml-pull}1201[#1201], issue: {ml-issue}1178[#1178].)
-
-== {es} version 7.7.1
-
-=== Bug Fixes
-
-* Fixed background persistence of categorizer state (See {ml-pull}1137[#1137],
-  issue: {ml-issue}1136[#1136].)
-* Fix classification job failures when number of classes in configuration differs
-  from the number of classes present in the training data. (See {ml-pull}1144[#1144].)
-* Fix underlying cause for "Failed to calculate splitting significance" log errors.
-  (See {ml-pull}1157[#1157].)
-* Fix possible root cause for "Bad variance scale nan" log errors. (See {ml-pull}1225[#1225].)
-* Change data frame analytics instrumentation timestamp resolution to milliseconds. (See
-  {ml-pull}1237[#1237].)
-* Fix "autodetect process stopped unexpectedly: Fatal error: 'terminate called after
-  throwing an instance of 'std::bad_function_call'". (See {ml-pull}1246[#1246],
-  issue: {ml-issue}1245[#1245].)
-
-== {es} version 7.7.0
-
-=== New Features
-
-* Add instrumentation to report statistics related to data frame analytics jobs, i.e.
-progress, memory usage, etc. (See {ml-pull}906[#906].)
-* Multiclass classification. (See {ml-pull}1037[#1037].)
-
-=== Enhancements
-
-* Improve computational performance of the feature importance computation. (See {ml-pull}1005[1005].)
-* Improve initialization of learn rate for better and more stable results in regression
-and classification. (See {ml-pull}948[#948].)
-* Add number of processed training samples to the definition of decision tree nodes.
-(See {ml-pull}991[#991].)
-* Add new model_size_stats fields to instrument categorization.  (See {ml-pull}948[#948]
-and {pull}51879[#51879], issue: {issue}50794[#50749].)
-* Improve upfront memory estimation for all data frame analyses, which were higher than
-necessary. This will improve the allocation of data frame analyses to cluster nodes.
-(See {ml-pull}1003[#1003].)
-* Upgrade the compiler used on Linux from gcc 7.3 to gcc 7.5, and the binutils used in
-the build from version 2.20 to 2.34.  (See {ml-pull}1013[#1013].)
-* Add instrumentation of the peak memory consumption for data frame analytics jobs.
-(See {ml-pull}1022[#1022].)
-* Remove all memory overheads for computing tree SHAP values. (See {ml-pull}1023[#1023].)
-* Distinguish between empty and missing categorical fields in classification and regression
-model training. (See {ml-pull}1034[#1034].)
-* Add instrumentation information for supervised learning data frame analytics jobs.
-(See {ml-pull}1031[#1031].)
-* Add instrumentation information for outlier detection data frame analytics jobs.
-* Write out feature importance for multi-class models. (See {ml-pull}1071[#1071])
-* Enable system call filtering to the native process used with data frame analytics.
-(See {ml-pull}1098[#1098])
-
-=== Bug Fixes
-
-* Use largest ordered subset of categorization tokens for category reverse search regex.
-(See {ml-pull}970[#970], issue: {ml-issue}949[#949].)
-* Account for the data frame's memory when estimating the peak memory used by classification
-and regression model training. (See {ml-pull}996[#996].)
-* Rename classification and regression parameter maximum_number_trees to max_trees.
-(See {ml-pull}1047[#1047].)
-
-== {es} version 7.6.2
-
-=== Bug Fixes
-
-* Fix a bug in the calculation of the minimum loss leaf values for classification.
-(See {ml-pull}1032[#1032].)
-
-== {es} version 7.6.0
-
-=== New Features
-
-* Add feature importance values to classification and regression results (using tree
-SHapley Additive exPlanation, or SHAP). (See {ml-pull}857[#857].)
-
-=== Enhancements
-
-* Improve performance of boosted tree training for both classification and regression.
-(See {ml-pull}775[#775].)
-* Reduce the peak memory used by boosted tree training and fix an overcounting bug
-estimating maximum memory usage. (See {ml-pull}781[#781].)
-* Stratified fractional cross validation for regression. (See {ml-pull}784[#784].)
-* Added `geo_point` supported output for `lat_long` function records. (See {ml-pull}809[#809]
-and {pull}47050[#47050].)
-* Use a random bag of the data to compute the loss function derivatives for each new
-tree which is trained for both regression and classification. (See {ml-pull}811[#811].)
-* Emit `prediction_probability` field alongside prediction field in ml results.
-(See {ml-pull}818[#818].)
-* Reduce memory usage of {ml} native processes on Windows. (See {ml-pull}844[#844].)
-* Reduce runtime of classification and regression. (See {ml-pull}863[#863].)
-* Stop early training a classification and regression forest when the validation error
-is no longer decreasing. (See {ml-pull}875[#875].)
-* Emit `prediction_field_name` in ml results using the type provided as
-`prediction_field_type` parameter. (See {ml-pull}877[#877].)
-* Improve performance updating quantile estimates. (See {ml-pull}881[#881].)
-* Migrate to use Bayesian Optimisation for initial hyperparameter value line searches and
-stop early if the expected improvement is too small. (See {ml-pull}903[#903].)
-* Stop cross-validation early if the predicted test loss has a small chance of being
-smaller than for the best parameter values found so far. (See {ml-pull}915[#915].)
-* Optimize decision threshold for classification to maximize minimum class recall.
-(See {ml-pull}926[#926].)
-* Include categorization memory usage in the `model_bytes` field in `model_size_stats`,
-so that it is taken into account in node assignment decisions. (See {ml-pull}927[#927],
-issue: {ml-issue}724[#724].)
-
-=== Bug Fixes
-* Fixes potential memory corruption when determining seasonality. (See {ml-pull}852[#852].)
-* Prevent prediction_field_name clashing with other fields in ml results.
-(See {ml-pull}861[#861].)
-* Include out-of-order as well as in-order terms in categorization reverse searches.
-(See {ml-pull}950[#950], issue: {ml-issue}949[#949].)
-
-== {es} version 7.5.2
-
-=== Bug Fixes
-* Fixes potential memory corruption or inconsistent state when background persisting
-categorizer state. (See {ml-pull}921[#921].)
-
-== {es} version 7.5.0
-
-=== Enhancements
-
-* Improve performance and concurrency training boosted tree regression models.
-For large data sets this change was observed to give a 10% to 20% decrease in
-train time. (See {ml-pull}622[#622].)
-* Upgrade Boost libraries to version 1.71. (See {ml-pull}638[#638].)
-* Improve initialisation of boosted tree training. This generally enables us to
-find lower loss models faster. (See {ml-pull}686[#686].)
-* Include a smooth tree depth based penalty to regularized objective function for
-boosted tree training. Hard depth based regularization is often the strategy of
-choice to prevent over fitting for XGBoost. By smoothing we can make better tradeoffs.
-Also, the parameters of the penalty function are more suited to optimising with our
-Bayesian optimisation based hyperparameter search. (See {ml-pull}698[#698].)
-* Binomial logistic regression targeting cross entropy. (See {ml-pull}713[#713].)
-* Improvements to count and sum anomaly detection for sparse data. This primarily
-aims to improve handling of data which are predictably present: detecting when they
-are unexpectedly missing. (See {ml-pull}721[#721].)
-* Trap numeric errors causing bad hyperparameter search initialisation and repeated
-errors to be logged during boosted tree training. (See {ml-pull}732[#732].)
-
-=== Bug Fixes
-
-* Restore from checkpoint could damage seasonality modeling. For example, it could
-cause seasonal components to be overwritten in error. (See {ml-pull}821[#821].)
-
-== {es} version 7.4.1
-
-=== Enhancements
-
-* The {ml} native processes are now arranged in a .app directory structure on
-  macOS, to allow for notarization on macOS Catalina. (See {ml-pull}593[#593].)
-
-=== Bug Fixes
-
-* A reference to a temporary variable was causing forecast model restoration to fail.
-The bug exhibited itself on MacOS builds with versions of clangd > 10.0.0. (See {ml-pull}688[#688].)
-
-== {es} version 7.4.0
-
-=== Bug Fixes
-
-* Rename outlier detection method values knn and tnn to distance_kth_nn and distance_knn
-respectively to match the API. (See {ml-pull}598[#598].)
-* Fix occasional (non-deterministic) reinitialisation of modelling for the lat_long
-function. (See {ml-pull}641[#641].)
-
-== {es} version 7.3.1
-
-=== Bug Fixes
-
-* Only trap the case that more rows are supplied to outlier detection than expected.
-Previously, if rows were excluded from the data frame after supplying the row count
-in the configuration then we detected the inconsistency and failed outlier detection.
-However, this legitimately happens in the case where the field values are non-numeric or
-array valued. (See {ml-pull}569[#569].)
-
-== {es} version 7.3.0
-
-=== Enhancements
-
-* Upgrade to a newer version of the Apache Portable Runtime library. (See {ml-pull}495[#495].)
-* Improve stability of modelling around change points. (See {ml-pull}496[#496].)
-
-=== Bug Fixes
-
-* Reduce false positives associated with the multi-bucket feature. (See {ml-pull}491[#491].)
-* Reduce false positives for sum and count functions on sparse data. (See {ml-pull}492[#492].)
-
-== {es} version 7.2.1
-
-=== Bug Fixes
-
-* Fix an edge case causing spurious anomalies (false positives) if the variance in the count of events
-changed significantly throughout the period of a seasonal quantity. (See {ml-pull}489[#489].)
-
-== {es} version 7.2.0
-
-=== Enhancements
-
-* Remove hard limit for maximum forecast interval and limit based on the time interval of data added
-to the model. (See {ml-pull}214[#214].)
-
-* Use hardened compiler options to build 3rd party libraries. (See {ml-pull}453[#453].)
-
-* Only select more complex trend models for forecasting if there is evidence that they are needed.
-(See {ml-pull}463[#463].)
-
-* Improve residual model selection. (See {ml-pull}468[#468].)
-
-* Stop linking to libcrypt on Linux. (See {ml-pull}480[#480].)
-
-* Improvements to hard_limit audit message. (See {ml-pull}486[#486].)
-
-=== Bug Fixes
-
-* Handle NaNs when detrending seasonal components. {ml-pull}408[#408]
-
-== {es} version 7.0.0-alpha2
-
-=== Bug Fixes
-
-* Fixes CPoissonMeanConjugate sampling error. {ml-pull}335[#335]
-//NOTE: Remove from final 7.0.0 release notes if already in 6.x
-
-* Ensure statics are persisted in a consistent manner {ml-pull}360[#360]
-
-== {es} version 7.0.0-alpha1
-
-== {es} version 6.8.4
-
-=== Bug Fixes
-
-* A reference to a temporary variable was causing forecast model restoration to fail.
-The bug exhibited itself on MacOS builds with versions of clangd > 10.0.0. (See {ml-pull}688[#688].)
-
-== {es} version 6.8.2
-
-=== Bug Fixes
-
-* Don't write model size stats when job is closed without any input {ml-pull}512[#512] (issue: {ml-issue}394[#394])
-* Don't persist model state at the end of lookback if the lookback did not generate any input {ml-pull}521[#521] (issue: {ml-issue}519[#519])
-
-== {es} version 6.7.2
-
-=== Enhancements
-
-* Adjust seccomp filter to allow the "time" system call {ml-pull}459[#459]
-
-== {es} version 6.7.0
-
-=== Bug Fixes
-
-* Improve autodetect logic for persistence. {ml-pull}437[#437]
-
-== {es} version 6.6.2
-
-=== Enhancements
-
-* Adjust seccomp filter for Fedora 29. {ml-pull}354[#354]
-
-=== Bug Fixes
-
-* Fixes an issue where interim results would be calculated after advancing time into an empty bucket. {ml-pull}416[#416]