From 05704f7a15f2a2d98ae6c7d38f6c2a7b7fce888c Mon Sep 17 00:00:00 2001 From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com> Date: Thu, 26 Feb 2026 19:53:24 +0100 Subject: [PATCH 01/11] Fix autodetect process crash from oversized field values by truncating at 256 characters --- docs/CHANGELOG.asciidoc | 4 ++ include/model/CFieldValueTruncator.h | 62 ++++++++++++++++++ lib/api/CAnomalyJob.cc | 13 +++- lib/api/CDataProcessor.cc | 10 ++- lib/api/unittest/CAnomalyJobTest.cc | 50 +++++++++++++++ lib/model/CBucketGatherer.cc | 4 ++ lib/model/CDynamicStringIdRegistry.cc | 5 +- lib/model/CEventRateBucketGatherer.cc | 2 + lib/model/CGathererTools.cc | 3 + .../unittest/CDynamicStringIdRegistryTest.cc | 31 +++++++++ .../unittest/CFieldValueTruncatorTest.cc | 64 +++++++++++++++++++ lib/model/unittest/CMakeLists.txt | 1 + 12 files changed, 246 insertions(+), 3 deletions(-) create mode 100644 include/model/CFieldValueTruncator.h create mode 100644 lib/model/unittest/CFieldValueTruncatorTest.cc diff --git a/docs/CHANGELOG.asciidoc b/docs/CHANGELOG.asciidoc index fa2d532256..ff1b4c731e 100644 --- a/docs/CHANGELOG.asciidoc +++ b/docs/CHANGELOG.asciidoc @@ -30,6 +30,10 @@ == {es} version 9.4.0 +=== Bug Fixes + +* Truncate oversized field values to prevent autodetect process crash. (See {ml-issue}2796[#2796].) + === Enhancements * Better handling of invalid JSON state documents (See {ml-pull}[]#2895].) diff --git a/include/model/CFieldValueTruncator.h b/include/model/CFieldValueTruncator.h new file mode 100644 index 0000000000..f56c2b7719 --- /dev/null +++ b/include/model/CFieldValueTruncator.h @@ -0,0 +1,62 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0 and the following additional limitation. 
Functionality enabled by the + * files subject to the Elastic License 2.0 may only be used in production when + * invoked by an Elasticsearch process with a license key installed that permits + * use of machine learning features. You may not use this file except in + * compliance with the Elastic License 2.0 and the foregoing additional + * limitation. + */ +#ifndef INCLUDED_ml_model_CFieldValueTruncator_h +#define INCLUDED_ml_model_CFieldValueTruncator_h + +#include + +#include + +namespace ml { +namespace model { + +//! \brief Truncates field values to prevent memory amplification. +//! +//! DESCRIPTION:\n +//! Field values (by, over, partition, influencer) are term fields +//! in the anomaly detection domain. They are categorical identifiers, +//! not free text. Their length must be bounded to prevent excessive +//! memory consumption that could cause the autodetect process to crash. +//! +//! IMPLEMENTATION DECISIONS:\n +//! The limit of 256 characters aligns with Elasticsearch's +//! ignore_above default for keyword fields. This is sufficient for +//! meaningful anomaly detection field values while preventing memory +//! amplification from extremely long strings (e.g., 77K+ characters) +//! that have been observed to crash the autodetect process. +class MODEL_EXPORT CFieldValueTruncator { +public: + //! Maximum length for analysis term fields (by, over, partition, influencer). + //! Values longer than this are truncated to prevent excessive memory usage. + static constexpr std::size_t MAX_FIELD_VALUE_LENGTH = 256; + + //! In-place truncation of a field value. + //! \return true if truncation occurred, false if value was within limit. + static bool truncate(std::string& value) { + if (value.size() <= MAX_FIELD_VALUE_LENGTH) { + return false; + } + value.resize(MAX_FIELD_VALUE_LENGTH); + return true; + } + + //! Returns a truncated copy of the field value. Original unchanged. 
+ static std::string truncated(const std::string& value) { + if (value.size() <= MAX_FIELD_VALUE_LENGTH) { + return value; + } + return value.substr(0, MAX_FIELD_VALUE_LENGTH); + } +}; +} +} + +#endif // INCLUDED_ml_model_CFieldValueTruncator_h diff --git a/lib/api/CAnomalyJob.cc b/lib/api/CAnomalyJob.cc index d7321cd2a1..8d30e35564 100644 --- a/lib/api/CAnomalyJob.cc +++ b/lib/api/CAnomalyJob.cc @@ -28,6 +28,7 @@ #include #include +#include #include #include #include @@ -1706,8 +1707,18 @@ void CAnomalyJob::addRecord(const TAnomalyDetectorPtr& detector, model::CAnomalyDetector::TStrCPtrVec fieldValues; const TStrVec& fieldNames = detector->fieldsOfInterest(); fieldValues.reserve(fieldNames.size()); + TStrVec truncatedCopies; for (const auto& fieldName : fieldNames) { - fieldValues.push_back(fieldValue(fieldName, dataRowFields)); + const std::string* value = fieldValue(fieldName, dataRowFields); + if (value != nullptr && value->size() > model::CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH) { + truncatedCopies.push_back(model::CFieldValueTruncator::truncated(*value)); + fieldValues.push_back(&truncatedCopies.back()); + LOG_WARN(<< "Field '" << fieldName << "' value exceeds " + << model::CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH + << " characters and has been truncated"); + } else { + fieldValues.push_back(value); + } } detector->addRecord(time, fieldValues); diff --git a/lib/api/CDataProcessor.cc b/lib/api/CDataProcessor.cc index 93db8c4751..7638094f5e 100644 --- a/lib/api/CDataProcessor.cc +++ b/lib/api/CDataProcessor.cc @@ -15,6 +15,8 @@ #include #include +#include + namespace ml { namespace api { @@ -49,7 +51,13 @@ std::string CDataProcessor::debugPrintRecord(const TStrStrUMap& dataRowFields) { fieldValues.push_back(','); } fieldNames.append(rowIter->first); - fieldValues.append(rowIter->second); + const auto& val = rowIter->second; + if (val.size() > model::CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH) { + fieldValues.append(val, 0, 
model::CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH); + fieldValues.append("..."); + } else { + fieldValues.append(val); + } } result << fieldNames << core_t::LINE_ENDING << fieldValues; diff --git a/lib/api/unittest/CAnomalyJobTest.cc b/lib/api/unittest/CAnomalyJobTest.cc index d5384327ef..5cb277b966 100644 --- a/lib/api/unittest/CAnomalyJobTest.cc +++ b/lib/api/unittest/CAnomalyJobTest.cc @@ -1205,4 +1205,54 @@ BOOST_AUTO_TEST_CASE(testHierarchicalResultsNormalizerShouldIncreaseMemoryUsage) resourceMonitor.forceRefreshAll(); BOOST_TEST_REQUIRE(resourceMonitor.totalMemory() < memoryUsageBeforeUnregister); } + +BOOST_AUTO_TEST_CASE(testOversizedFieldValuesTruncated) { + model::CLimits limits; + api::CAnomalyJobConfig jobConfig = CTestAnomalyJob::makeSimpleJobConfig( + "count", "", "by_field", "", "", {"influencer_field"}); + + model::CAnomalyDetectorModelConfig modelConfig = + model::CAnomalyDetectorModelConfig::defaultConfig(BUCKET_SIZE); + std::stringstream outputStrm; + core::CJsonOutputStreamWrapper wrappedOutputStream(outputStrm); + + CTestAnomalyJob job("job", limits, jobConfig, modelConfig, wrappedOutputStream); + + std::string const oversizedValue(77000, 'x'); + CTestAnomalyJob::TStrStrUMap dataRows{ + {"time", "1000"}, {"by_field", oversizedValue}, {"influencer_field", oversizedValue}}; + + BOOST_TEST_REQUIRE(job.handleRecord(dataRows)); + BOOST_REQUIRE_EQUAL(uint64_t(1), job.numRecordsHandled()); +} + +BOOST_AUTO_TEST_CASE(testNormalFieldValuesNotTruncated) { + model::CLimits limits; + api::CAnomalyJobConfig jobConfig = CTestAnomalyJob::makeSimpleJobConfig( + "count", "", "by_field", "", "", {"influencer_field"}); + + model::CAnomalyDetectorModelConfig modelConfig = + model::CAnomalyDetectorModelConfig::defaultConfig(BUCKET_SIZE); + std::stringstream outputStrm; + core::CJsonOutputStreamWrapper wrappedOutputStream(outputStrm); + + CTestAnomalyJob job("job", limits, jobConfig, modelConfig, wrappedOutputStream); + + std::string const 
normalValue("normal_value"); + CTestAnomalyJob::TStrStrUMap dataRows{ + {"time", "1000"}, {"by_field", normalValue}, {"influencer_field", normalValue}}; + + BOOST_TEST_REQUIRE(job.handleRecord(dataRows)); + BOOST_REQUIRE_EQUAL(uint64_t(1), job.numRecordsHandled()); +} + +BOOST_AUTO_TEST_CASE(testDebugPrintRecordTruncatesLongValues) { + api::CDataProcessor::TStrStrUMap record; + record["field1"] = std::string(1000, 'x'); + record["field2"] = "short"; + std::string result = api::CDataProcessor::debugPrintRecord(record); + BOOST_TEST_REQUIRE(result.find("...") != std::string::npos); + BOOST_TEST_REQUIRE(result.size() < 1500); +} + BOOST_AUTO_TEST_SUITE_END() diff --git a/lib/model/CBucketGatherer.cc b/lib/model/CBucketGatherer.cc index cdffd8d238..520561e7be 100644 --- a/lib/model/CBucketGatherer.cc +++ b/lib/model/CBucketGatherer.cc @@ -23,6 +23,7 @@ #include #include +#include #include #include @@ -116,6 +117,9 @@ bool restoreInfluencerPersonAttributeCounts(core::CStateRestoreTraverser& traver RESTORE_BUILT_IN(PERSON_UID_TAG, person) RESTORE_BUILT_IN(ATTRIBUTE_UID_TAG, attribute) RESTORE_NO_ERROR(INFLUENCER_TAG, influence = traverser.value()) + if (name == INFLUENCER_TAG) { + CFieldValueTruncator::truncate(influence); + } if (name == COUNT_TAG) { if (core::CStringUtils::stringToType(traverser.value(), count) == false) { LOG_ERROR(<< "Failed to restore COUNT_TAG, got " << traverser.value()); diff --git a/lib/model/CDynamicStringIdRegistry.cc b/lib/model/CDynamicStringIdRegistry.cc index 9974e8e15b..c293f664d4 100644 --- a/lib/model/CDynamicStringIdRegistry.cc +++ b/lib/model/CDynamicStringIdRegistry.cc @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -251,7 +252,9 @@ bool CDynamicStringIdRegistry::acceptRestoreTraverser(core::CStateRestoreTravers do { const std::string& name = traverser.name(); if (name == NAMES_TAG) { - m_Names.emplace_back(traverser.value()); + std::string value = traverser.value(); + CFieldValueTruncator::truncate(value); + 
m_Names.emplace_back(std::move(value)); } else if (name == FREE_NAMES_TAG) { if (!core::CPersistUtils::restore(FREE_NAMES_TAG, m_FreeUids, traverser)) { return false; diff --git a/lib/model/CEventRateBucketGatherer.cc b/lib/model/CEventRateBucketGatherer.cc index a01ddc9cdd..600719089e 100644 --- a/lib/model/CEventRateBucketGatherer.cc +++ b/lib/model/CEventRateBucketGatherer.cc @@ -27,6 +27,7 @@ #include #include +#include #include #include @@ -684,6 +685,7 @@ bool restoreInfluencerUniqueStrings(core::CStateRestoreTraverser& traverser, const std::string& name = traverser.name(); if (name == DICTIONARY_WORD_TAG) { key = traverser.value(); + CFieldValueTruncator::truncate(key); } else if (name == UNIQUE_WORD_TAG) { CUniqueStringFeatureData::TWord value; if (value.fromDelimited(traverser.value()) == false) { diff --git a/lib/model/CGathererTools.cc b/lib/model/CGathererTools.cc index 378e0ddd2a..2c9e48414c 100644 --- a/lib/model/CGathererTools.cc +++ b/lib/model/CGathererTools.cc @@ -23,6 +23,8 @@ #include #include +#include + #include namespace ml { @@ -89,6 +91,7 @@ struct SInfluencerSumSerializer { const std::string& name = traverser.name(); if (name == SUM_MAP_KEY_TAG) { key = traverser.value(); + CFieldValueTruncator::truncate(key); } else if (name == SUM_MAP_VALUE_TAG) { if (core::CStringUtils::stringToType(traverser.value(), map[key]) == false) { LOG_ERROR(<< "Invalid sum in " << traverser.value()); diff --git a/lib/model/unittest/CDynamicStringIdRegistryTest.cc b/lib/model/unittest/CDynamicStringIdRegistryTest.cc index 1d4aa16988..60cea2b1da 100644 --- a/lib/model/unittest/CDynamicStringIdRegistryTest.cc +++ b/lib/model/unittest/CDynamicStringIdRegistryTest.cc @@ -15,6 +15,7 @@ #include #include +#include #include #include @@ -109,4 +110,34 @@ BOOST_AUTO_TEST_CASE(testPersist) { BOOST_REQUIRE_EQUAL(restoredJson.str(), origJson.str()); } +BOOST_AUTO_TEST_CASE(testRestoreTruncatesOversizedNames) { + CResourceMonitor resourceMonitor; + CDynamicStringIdRegistry 
registry("person", counter_t::E_TSADNumberNewPeople, + counter_t::E_TSADNumberNewPeopleNotAllowed, + counter_t::E_TSADNumberNewPeopleRecycled); + + bool addedPerson = false; + std::string shortName("foo"); + std::string oversizedName(77000, 'x'); + registry.addName(shortName, 0, resourceMonitor, addedPerson); + registry.addName(oversizedName, 0, resourceMonitor, addedPerson); + + std::ostringstream origJson; + core::CJsonStatePersistInserter::persist( + origJson, std::bind_front(&CDynamicStringIdRegistry::acceptPersistInserter, ®istry)); + + std::istringstream is("{\"topLevel\" : " + origJson.str() + "}"); + core::CJsonStateRestoreTraverser traverser(is); + CDynamicStringIdRegistry restoredRegistry("person", counter_t::E_TSADNumberNewPeople, + counter_t::E_TSADNumberNewPeopleNotAllowed, + counter_t::E_TSADNumberNewPeopleRecycled); + traverser.traverseSubLevel(std::bind_front( + &CDynamicStringIdRegistry::acceptRestoreTraverser, &restoredRegistry)); + + BOOST_REQUIRE_EQUAL(2, restoredRegistry.numberNames()); + BOOST_REQUIRE_EQUAL(shortName, restoredRegistry.name(0, "")); + BOOST_REQUIRE_EQUAL(CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH, + restoredRegistry.name(1, "").size()); +} + BOOST_AUTO_TEST_SUITE_END() diff --git a/lib/model/unittest/CFieldValueTruncatorTest.cc b/lib/model/unittest/CFieldValueTruncatorTest.cc new file mode 100644 index 0000000000..fe5033d496 --- /dev/null +++ b/lib/model/unittest/CFieldValueTruncatorTest.cc @@ -0,0 +1,64 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0 and the following additional limitation. Functionality enabled by the + * files subject to the Elastic License 2.0 may only be used in production when + * invoked by an Elasticsearch process with a license key installed that permits + * use of machine learning features. 
You may not use this file except in + * compliance with the Elastic License 2.0 and the foregoing additional + * limitation. + */ + +#include + +#include + +BOOST_AUTO_TEST_SUITE(CFieldValueTruncatorTest) + +using namespace ml; +using namespace model; + +BOOST_AUTO_TEST_CASE(testShortValueUnchanged) { + std::string value("short"); + BOOST_REQUIRE_EQUAL(false, CFieldValueTruncator::truncate(value)); + BOOST_REQUIRE_EQUAL("short", value); +} + +BOOST_AUTO_TEST_CASE(testExactLimitUnchanged) { + std::string value(CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH, 'x'); + BOOST_REQUIRE_EQUAL(false, CFieldValueTruncator::truncate(value)); + BOOST_REQUIRE_EQUAL(CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH, value.size()); +} + +BOOST_AUTO_TEST_CASE(testOversizedValueTruncated) { + std::string value(1000, 'x'); + BOOST_REQUIRE_EQUAL(true, CFieldValueTruncator::truncate(value)); + BOOST_REQUIRE_EQUAL(CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH, value.size()); +} + +BOOST_AUTO_TEST_CASE(testEmptyValueUnchanged) { + std::string value; + BOOST_REQUIRE_EQUAL(false, CFieldValueTruncator::truncate(value)); + BOOST_REQUIRE_EQUAL(0, value.size()); +} + +BOOST_AUTO_TEST_CASE(testConstOverloadReturnsNewString) { + const std::string longValue(1000, 'x'); + std::string result = CFieldValueTruncator::truncated(longValue); + BOOST_REQUIRE_EQUAL(CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH, result.size()); + BOOST_REQUIRE_EQUAL(1000, longValue.size()); +} + +BOOST_AUTO_TEST_CASE(testConstOverloadShortValueReturnsSame) { + const std::string shortValue("short"); + std::string result = CFieldValueTruncator::truncated(shortValue); + BOOST_REQUIRE_EQUAL("short", result); +} + +BOOST_AUTO_TEST_CASE(testVeryLargeValueTruncated) { + std::string value(77000, 'y'); + BOOST_REQUIRE_EQUAL(true, CFieldValueTruncator::truncate(value)); + BOOST_REQUIRE_EQUAL(CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH, value.size()); +} + +BOOST_AUTO_TEST_SUITE_END() diff --git a/lib/model/unittest/CMakeLists.txt 
b/lib/model/unittest/CMakeLists.txt index 8e6d6dcf48..e8f64ac6b5 100644 --- a/lib/model/unittest/CMakeLists.txt +++ b/lib/model/unittest/CMakeLists.txt @@ -22,6 +22,7 @@ set (SRCS CDetectionRuleTest.cc CDetectorEqualizerTest.cc CDynamicStringIdRegistryTest.cc + CFieldValueTruncatorTest.cc CEventRateAnomalyDetectorTest.cc CEventRateDataGathererTest.cc CEventRateModelTest.cc From 4c6e66e6c53e72e2a6cdf4d194fb811c79a76424 Mon Sep 17 00:00:00 2001 From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com> Date: Thu, 26 Feb 2026 20:26:49 +0100 Subject: [PATCH 02/11] refactor --- docs/CHANGELOG.asciidoc | 2 +- include/model/CFieldValueTruncator.h | 12 ++++++++++-- lib/api/CAnomalyJob.cc | 2 +- lib/api/CDataProcessor.cc | 6 +++--- lib/api/unittest/CAnomalyJobTest.cc | 5 +++-- lib/model/unittest/CFieldValueTruncatorTest.cc | 11 +++++++++++ 6 files changed, 29 insertions(+), 9 deletions(-) diff --git a/docs/CHANGELOG.asciidoc b/docs/CHANGELOG.asciidoc index ff1b4c731e..4de997996f 100644 --- a/docs/CHANGELOG.asciidoc +++ b/docs/CHANGELOG.asciidoc @@ -32,7 +32,7 @@ === Bug Fixes -* Truncate oversized field values to prevent autodetect process crash. (See {ml-issue}2796[#2796].) +* Truncate oversized field values to prevent autodetect process crash. (See {ml-pull}2929[#2929].) === Enhancements diff --git a/include/model/CFieldValueTruncator.h b/include/model/CFieldValueTruncator.h index f56c2b7719..6171ed83b7 100644 --- a/include/model/CFieldValueTruncator.h +++ b/include/model/CFieldValueTruncator.h @@ -38,10 +38,17 @@ class MODEL_EXPORT CFieldValueTruncator { //! Values longer than this are truncated to prevent excessive memory usage. static constexpr std::size_t MAX_FIELD_VALUE_LENGTH = 256; + //! Check if a field value needs truncation. + //! This avoids creating copies when checking if truncation is necessary. + //! \return true if the value exceeds MAX_FIELD_VALUE_LENGTH. 
+ static bool needsTruncation(const std::string& value) { + return value.size() > MAX_FIELD_VALUE_LENGTH; + } + //! In-place truncation of a field value. //! \return true if truncation occurred, false if value was within limit. static bool truncate(std::string& value) { - if (value.size() <= MAX_FIELD_VALUE_LENGTH) { + if (!needsTruncation(value)) { return false; } value.resize(MAX_FIELD_VALUE_LENGTH); @@ -49,8 +56,9 @@ class MODEL_EXPORT CFieldValueTruncator { } //! Returns a truncated copy of the field value. Original unchanged. + //! Use needsTruncation() first if you want to avoid copying. static std::string truncated(const std::string& value) { - if (value.size() <= MAX_FIELD_VALUE_LENGTH) { + if (!needsTruncation(value)) { return value; } return value.substr(0, MAX_FIELD_VALUE_LENGTH); diff --git a/lib/api/CAnomalyJob.cc b/lib/api/CAnomalyJob.cc index 8d30e35564..6e2951f597 100644 --- a/lib/api/CAnomalyJob.cc +++ b/lib/api/CAnomalyJob.cc @@ -1710,7 +1710,7 @@ void CAnomalyJob::addRecord(const TAnomalyDetectorPtr& detector, TStrVec truncatedCopies; for (const auto& fieldName : fieldNames) { const std::string* value = fieldValue(fieldName, dataRowFields); - if (value != nullptr && value->size() > model::CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH) { + if (value != nullptr && model::CFieldValueTruncator::needsTruncation(*value)) { truncatedCopies.push_back(model::CFieldValueTruncator::truncated(*value)); fieldValues.push_back(&truncatedCopies.back()); LOG_WARN(<< "Field '" << fieldName << "' value exceeds " diff --git a/lib/api/CDataProcessor.cc b/lib/api/CDataProcessor.cc index 7638094f5e..61b792417b 100644 --- a/lib/api/CDataProcessor.cc +++ b/lib/api/CDataProcessor.cc @@ -51,9 +51,9 @@ std::string CDataProcessor::debugPrintRecord(const TStrStrUMap& dataRowFields) { fieldValues.push_back(','); } fieldNames.append(rowIter->first); - const auto& val = rowIter->second; - if (val.size() > model::CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH) { - 
fieldValues.append(val, 0, model::CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH); + const std::string& val = rowIter->second; + if (model::CFieldValueTruncator::needsTruncation(val)) { + fieldValues.append(model::CFieldValueTruncator::truncated(val)); fieldValues.append("..."); } else { fieldValues.append(val); diff --git a/lib/api/unittest/CAnomalyJobTest.cc b/lib/api/unittest/CAnomalyJobTest.cc index 5cb277b966..681a316641 100644 --- a/lib/api/unittest/CAnomalyJobTest.cc +++ b/lib/api/unittest/CAnomalyJobTest.cc @@ -1219,8 +1219,9 @@ BOOST_AUTO_TEST_CASE(testOversizedFieldValuesTruncated) { CTestAnomalyJob job("job", limits, jobConfig, modelConfig, wrappedOutputStream); std::string const oversizedValue(77000, 'x'); - CTestAnomalyJob::TStrStrUMap dataRows{ - {"time", "1000"}, {"by_field", oversizedValue}, {"influencer_field", oversizedValue}}; + CTestAnomalyJob::TStrStrUMap dataRows{{"time", "1000"}, + {"by_field", oversizedValue}, + {"influencer_field", oversizedValue}}; BOOST_TEST_REQUIRE(job.handleRecord(dataRows)); BOOST_REQUIRE_EQUAL(uint64_t(1), job.numRecordsHandled()); diff --git a/lib/model/unittest/CFieldValueTruncatorTest.cc b/lib/model/unittest/CFieldValueTruncatorTest.cc index fe5033d496..447ab7250d 100644 --- a/lib/model/unittest/CFieldValueTruncatorTest.cc +++ b/lib/model/unittest/CFieldValueTruncatorTest.cc @@ -61,4 +61,15 @@ BOOST_AUTO_TEST_CASE(testVeryLargeValueTruncated) { BOOST_REQUIRE_EQUAL(CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH, value.size()); } +BOOST_AUTO_TEST_CASE(testNeedsTruncation) { + BOOST_REQUIRE_EQUAL(false, CFieldValueTruncator::needsTruncation("short")); + BOOST_REQUIRE_EQUAL(false, CFieldValueTruncator::needsTruncation("")); + BOOST_REQUIRE_EQUAL(false, CFieldValueTruncator::needsTruncation(std::string( + CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH, 'x'))); + BOOST_REQUIRE_EQUAL(true, CFieldValueTruncator::needsTruncation(std::string( + CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH + 1, 'x'))); + BOOST_REQUIRE_EQUAL( + 
true, CFieldValueTruncator::needsTruncation(std::string(77000, 'x'))); +} + BOOST_AUTO_TEST_SUITE_END() From 310f228c603930836853d96d6409444c9e312cef Mon Sep 17 00:00:00 2001 From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com> Date: Fri, 27 Feb 2026 16:23:38 +0100 Subject: [PATCH 03/11] implement hash suffix --- include/api/CAnomalyJob.h | 12 ++ include/model/CFieldValueTruncator.h | 133 ++++++++++++++---- lib/api/CAnomalyJob.cc | 35 +++-- lib/api/CDataProcessor.cc | 2 +- .../unittest/CFieldValueTruncatorTest.cc | 109 +++++++++++++- 5 files changed, 251 insertions(+), 40 deletions(-) diff --git a/include/api/CAnomalyJob.h b/include/api/CAnomalyJob.h index 279a597936..4167c649a7 100644 --- a/include/api/CAnomalyJob.h +++ b/include/api/CAnomalyJob.h @@ -392,6 +392,18 @@ class API_EXPORT CAnomalyJob : public CDataProcessor { core_t::TTime time, const TStrStrUMap& dataRowFields); + //! Prepare field values with truncation handling. + //! Extracts field values from \p dataRowFields, truncates oversized values, + //! and populates \p fieldValues with pointers to either original or truncated values. + //! \param fieldNames The names of fields to extract + //! \param dataRowFields The data row containing field values + //! \param fieldValues Output vector of pointers to field values + //! \param truncatedCopies Storage for truncated copies (must remain valid while fieldValues is used) + static void prepareTruncatedFieldValues(const TStrVec& fieldNames, + const TStrStrUMap& dataRowFields, + model::CAnomalyDetector::TStrCPtrVec& fieldValues, + TStrVec& truncatedCopies); + //! Parses a control message requesting that model state be persisted. //! Extracts optional arguments to be used for persistence. 
static bool parsePersistControlMessageArgs(const std::string& controlMessageArgs, diff --git a/include/model/CFieldValueTruncator.h b/include/model/CFieldValueTruncator.h index 6171ed83b7..f62c1f6d26 100644 --- a/include/model/CFieldValueTruncator.h +++ b/include/model/CFieldValueTruncator.h @@ -11,57 +11,138 @@ #ifndef INCLUDED_ml_model_CFieldValueTruncator_h #define INCLUDED_ml_model_CFieldValueTruncator_h +#include + #include +#include #include namespace ml { namespace model { -//! \brief Truncates field values to prevent memory amplification. +//! \brief Enforces term field length constraints with collision prevention. +//! +//! In the anomaly detection domain, term fields (by, over, partition, influencer) +//! are categorical identifiers that must satisfy two invariants: +//! 1. **Bounded Length** - Prevent memory amplification and OOM crashes +//! 2. **Unique Identity** - Distinct field values must remain distinguishable +//! +//! Values exceeding MAX_FIELD_VALUE_LENGTH (256 chars) are transformed using +//! collision-safe truncation: +//! - Retain PREFIX_LENGTH (240) characters of original value +//! - Append HASH_SEPARATOR ('$') +//! - Append HASH_HEX_DIGITS (15) character hex hash of complete original value //! -//! DESCRIPTION:\n -//! Field values (by, over, partition, influencer) are term fields -//! in the anomaly detection domain. They are categorical identifiers, -//! not free text. Their length must be bounded to prevent excessive -//! memory consumption that could cause the autodetect process to crash. +//! Format: "$" +//! Example: "very_long_field_value_that_exceeds_limit_and_continues_for_thousands_of_chars_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx$a1b2c3d4e5f6789" //! -//! IMPLEMENTATION DECISIONS:\n -//! The limit of 256 characters aligns with Elasticsearch's -//! ignore_above default for keyword fields. 
This is sufficient for -//! meaningful anomaly detection field values while preventing memory -//! amplification from extremely long strings (e.g., 77K+ characters) -//! that have been observed to crash the autodetect process. +//! The 256-character limit aligns with Elasticsearch's ignore_above default +//! for keyword fields. The hash suffix ensures data integrity while maintaining +//! human readability (first 240 characters visible) and compatibility with +//! prefix-based filtering. Collision probability is ~1 in 10^18 (effectively zero). class MODEL_EXPORT CFieldValueTruncator { public: - //! Maximum length for analysis term fields (by, over, partition, influencer). - //! Values longer than this are truncated to prevent excessive memory usage. + //! Domain constraint: Maximum length for term fields in anomaly detection. + //! Aligned with Elasticsearch's ignore_above default for keyword fields. static constexpr std::size_t MAX_FIELD_VALUE_LENGTH = 256; - //! Check if a field value needs truncation. - //! This avoids creating copies when checking if truncation is necessary. - //! \return true if the value exceeds MAX_FIELD_VALUE_LENGTH. + //! Collision prevention format components + static constexpr char HASH_SEPARATOR = '$'; + static constexpr std::size_t HASH_HEX_DIGITS = 15; // 15 hex chars for uint64_t + static constexpr std::size_t HASH_SUFFIX_LENGTH = + 1 /* separator */ + HASH_HEX_DIGITS; // 16 total + + //! 
Content prefix length (readable portion after truncation) + static constexpr std::size_t PREFIX_LENGTH = + MAX_FIELD_VALUE_LENGTH - HASH_SUFFIX_LENGTH; // 240 + + // Domain invariants (enforced at compile-time) + static_assert(PREFIX_LENGTH + HASH_SUFFIX_LENGTH == MAX_FIELD_VALUE_LENGTH, + "Term field format invariant: prefix + suffix = total length"); + static_assert(PREFIX_LENGTH >= 200, + "Readable prefix must be substantial for human comprehension"); + static_assert(HASH_HEX_DIGITS * 4 <= 64, + "Hash hex digits must fit in 64-bit hash output"); + + //! Check if a term field value exceeds the domain constraint. + //! \return true if the value requires length enforcement static bool needsTruncation(const std::string& value) { return value.size() > MAX_FIELD_VALUE_LENGTH; } - //! In-place truncation of a field value. - //! \return true if truncation occurred, false if value was within limit. + //! Enforce term field length constraint in-place. + //! Applies collision-safe truncation for values exceeding the limit. + //! \param[in,out] value Field value to constrain + //! \return true if truncation was applied, false if already within limit static bool truncate(std::string& value) { - if (!needsTruncation(value)) { + if (needsTruncation(value) == false) { return false; } - value.resize(MAX_FIELD_VALUE_LENGTH); + + std::string originalValue = std::move(value); + value.assign(originalValue, 0, PREFIX_LENGTH); + appendCollisionPreventionSuffix(originalValue, value); + return true; } - //! Returns a truncated copy of the field value. Original unchanged. - //! Use needsTruncation() first if you want to avoid copying. + //! Enforce term field length constraint, returning constrained copy. + //! Original value unchanged. For performance, call needsTruncation() first + //! to avoid copying when constraint is already satisfied. + //! \param value Original field value + //! 
\return Copy with length constraint enforced static std::string truncated(const std::string& value) { - if (!needsTruncation(value)) { - return value; + if (needsTruncation(value) == false) { + return value; // RVO applies + } + + std::string result; + result.reserve(MAX_FIELD_VALUE_LENGTH); + result.assign(value, 0, PREFIX_LENGTH); + appendCollisionPreventionSuffix(value, result); + + return result; + } + +private: + //! \brief Hash encoding for collision prevention. + //! + //! Encapsulates the technical details of hash computation and formatting. + //! Separated from domain logic for clarity and testability. + struct HashEncoding { + //! Compute collision-resistant identity hash. + //! Uses safeMurmurHash64 (endian-neutral) for state persistence safety. + static std::uint64_t compute(const std::string& value) { + return core::CHashing::safeMurmurHash64( + value.data(), static_cast(value.size()), + 0); // Fixed seed for determinism } - return value.substr(0, MAX_FIELD_VALUE_LENGTH); + + //! Format 64-bit hash as zero-padded lowercase hex string. + //! \param hash The hash value to format + //! \param[out] buffer Must be at least HASH_HEX_DIGITS + 1 bytes + //! \return Pointer to null-terminated hex string in buffer + static const char* toHex(std::uint64_t hash, char* buffer) { + // %015llx produces 15-char zero-padded lowercase hex + std::snprintf(buffer, HASH_HEX_DIGITS + 1, "%015llx", + static_cast(hash)); + return buffer; + } + }; + + //! Append collision-prevention suffix: separator + hash. + //! \param originalValue Complete untruncated value for hash computation + //! 
\param[in,out] prefix Truncated prefix to which suffix is appended + static void appendCollisionPreventionSuffix(const std::string& originalValue, + std::string& prefix) { + std::uint64_t identityHash = HashEncoding::compute(originalValue); + + prefix.reserve(MAX_FIELD_VALUE_LENGTH); + prefix.push_back(HASH_SEPARATOR); + + char hashHexBuffer[HASH_HEX_DIGITS + 1]; + prefix.append(HashEncoding::toHex(identityHash, hashHexBuffer)); } }; } diff --git a/lib/api/CAnomalyJob.cc b/lib/api/CAnomalyJob.cc index 6e2951f597..e607bd9a3e 100644 --- a/lib/api/CAnomalyJob.cc +++ b/lib/api/CAnomalyJob.cc @@ -1701,25 +1701,42 @@ const std::string* CAnomalyJob::fieldValue(const std::string& fieldName, return !fieldName.empty() && fieldValue.empty() ? nullptr : &fieldValue; } -void CAnomalyJob::addRecord(const TAnomalyDetectorPtr& detector, - core_t::TTime time, - const TStrStrUMap& dataRowFields) { - model::CAnomalyDetector::TStrCPtrVec fieldValues; - const TStrVec& fieldNames = detector->fieldsOfInterest(); +void CAnomalyJob::prepareTruncatedFieldValues( + const TStrVec& fieldNames, + const TStrStrUMap& dataRowFields, + model::CAnomalyDetector::TStrCPtrVec& fieldValues, + TStrVec& truncatedCopies) { + fieldValues.reserve(fieldNames.size()); - TStrVec truncatedCopies; + truncatedCopies.reserve(fieldNames.size()); + for (const auto& fieldName : fieldNames) { const std::string* value = fieldValue(fieldName, dataRowFields); if (value != nullptr && model::CFieldValueTruncator::needsTruncation(*value)) { truncatedCopies.push_back(model::CFieldValueTruncator::truncated(*value)); fieldValues.push_back(&truncatedCopies.back()); - LOG_WARN(<< "Field '" << fieldName << "' value exceeds " - << model::CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH - << " characters and has been truncated"); + + std::string escapedFieldName = fieldName; + core::CStringUtils::escape('\\', "\n\r\t", escapedFieldName); + LOG_WARN(<< "Field '" << escapedFieldName + << "' value (length=" << value->size() + << ", 
prefix='" << value->substr(0, std::min(50, value->size())) + << "...') exceeds " << model::CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH + << " characters and has been truncated with collision-safe hash suffix"); } else { fieldValues.push_back(value); } } +} + +void CAnomalyJob::addRecord(const TAnomalyDetectorPtr& detector, + core_t::TTime time, + const TStrStrUMap& dataRowFields) { + model::CAnomalyDetector::TStrCPtrVec fieldValues; + TStrVec truncatedCopies; + const TStrVec& fieldNames = detector->fieldsOfInterest(); + + prepareTruncatedFieldValues(fieldNames, dataRowFields, fieldValues, truncatedCopies); detector->addRecord(time, fieldValues); } diff --git a/lib/api/CDataProcessor.cc b/lib/api/CDataProcessor.cc index 61b792417b..3ffdb4915e 100644 --- a/lib/api/CDataProcessor.cc +++ b/lib/api/CDataProcessor.cc @@ -53,7 +53,7 @@ std::string CDataProcessor::debugPrintRecord(const TStrStrUMap& dataRowFields) { fieldNames.append(rowIter->first); const std::string& val = rowIter->second; if (model::CFieldValueTruncator::needsTruncation(val)) { - fieldValues.append(model::CFieldValueTruncator::truncated(val)); + fieldValues.append(val.substr(0, model::CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH)); fieldValues.append("..."); } else { fieldValues.append(val); diff --git a/lib/model/unittest/CFieldValueTruncatorTest.cc b/lib/model/unittest/CFieldValueTruncatorTest.cc index 447ab7250d..65c3777da3 100644 --- a/lib/model/unittest/CFieldValueTruncatorTest.cc +++ b/lib/model/unittest/CFieldValueTruncatorTest.cc @@ -18,19 +18,23 @@ BOOST_AUTO_TEST_SUITE(CFieldValueTruncatorTest) using namespace ml; using namespace model; +// ============================================================================ +// Constraint Enforcement Behavior +// ============================================================================ + BOOST_AUTO_TEST_CASE(testShortValueUnchanged) { std::string value("short"); BOOST_REQUIRE_EQUAL(false, CFieldValueTruncator::truncate(value)); 
BOOST_REQUIRE_EQUAL("short", value); } -BOOST_AUTO_TEST_CASE(testExactLimitUnchanged) { +BOOST_AUTO_TEST_CASE(testValueAtExactLimitUnchanged) { std::string value(CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH, 'x'); BOOST_REQUIRE_EQUAL(false, CFieldValueTruncator::truncate(value)); BOOST_REQUIRE_EQUAL(CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH, value.size()); } -BOOST_AUTO_TEST_CASE(testOversizedValueTruncated) { +BOOST_AUTO_TEST_CASE(testOversizedValueEnforcedTo256Chars) { std::string value(1000, 'x'); BOOST_REQUIRE_EQUAL(true, CFieldValueTruncator::truncate(value)); BOOST_REQUIRE_EQUAL(CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH, value.size()); @@ -42,7 +46,7 @@ BOOST_AUTO_TEST_CASE(testEmptyValueUnchanged) { BOOST_REQUIRE_EQUAL(0, value.size()); } -BOOST_AUTO_TEST_CASE(testConstOverloadReturnsNewString) { +BOOST_AUTO_TEST_CASE(testConstOverloadPreservesOriginal) { const std::string longValue(1000, 'x'); std::string result = CFieldValueTruncator::truncated(longValue); BOOST_REQUIRE_EQUAL(CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH, result.size()); @@ -55,7 +59,7 @@ BOOST_AUTO_TEST_CASE(testConstOverloadShortValueReturnsSame) { BOOST_REQUIRE_EQUAL("short", result); } -BOOST_AUTO_TEST_CASE(testVeryLargeValueTruncated) { +BOOST_AUTO_TEST_CASE(testVeryLargeValueFromIssue2796) { std::string value(77000, 'y'); BOOST_REQUIRE_EQUAL(true, CFieldValueTruncator::truncate(value)); BOOST_REQUIRE_EQUAL(CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH, value.size()); @@ -72,4 +76,101 @@ BOOST_AUTO_TEST_CASE(testNeedsTruncation) { true, CFieldValueTruncator::needsTruncation(std::string(77000, 'x'))); } +// ============================================================================ +// Hash Suffix Format Validation +// ============================================================================ + +BOOST_AUTO_TEST_CASE(testTruncatedValueHasCorrectFormat) { + std::string value(1000, 'x'); + std::string result = CFieldValueTruncator::truncated(value); + + // Format: 240 prefix + '$' 
+ 15 hex chars = 256 total + BOOST_REQUIRE_EQUAL(256, result.size()); + BOOST_REQUIRE_EQUAL(CFieldValueTruncator::HASH_SEPARATOR, result[240]); + + // Prefix should match original + BOOST_REQUIRE_EQUAL(0, result.compare(0, 240, value, 0, 240)); + + // Hash portion should be lowercase hex digits + for (std::size_t i = 241; i < 256; ++i) { + BOOST_REQUIRE(std::isxdigit(result[i])); + BOOST_REQUIRE((result[i] >= '0' && result[i] <= '9') || + (result[i] >= 'a' && result[i] <= 'f')); + } +} + +BOOST_AUTO_TEST_CASE(testInPlaceTruncationPreservesFormat) { + std::string value(1000, 'z'); + bool wasTruncated = CFieldValueTruncator::truncate(value); + + BOOST_REQUIRE_EQUAL(true, wasTruncated); + BOOST_REQUIRE_EQUAL(256, value.size()); + BOOST_REQUIRE_EQUAL(CFieldValueTruncator::HASH_SEPARATOR, value[240]); + + // Verify hash portion is valid hex + for (std::size_t i = 241; i < 256; ++i) { + BOOST_REQUIRE(std::isxdigit(value[i])); + } +} + +// ============================================================================ +// Collision Prevention (Data Integrity) +// ============================================================================ + +BOOST_AUTO_TEST_CASE(testDistinctValuesProduceDistinctResults) { + std::string prefix(240, 'x'); + std::string value1 = prefix + std::string(1000, 'A'); + std::string value2 = prefix + std::string(1000, 'B'); + + std::string truncated1 = CFieldValueTruncator::truncated(value1); + std::string truncated2 = CFieldValueTruncator::truncated(value2); + + // Same prefix + BOOST_REQUIRE_EQUAL(truncated1.substr(0, 241), truncated2.substr(0, 241)); + + // But different hash suffixes prevent collision + BOOST_REQUIRE_NE(truncated1.substr(241), truncated2.substr(241)); + BOOST_REQUIRE_NE(truncated1, truncated2); +} + +BOOST_AUTO_TEST_CASE(testCollisionsPreventedByHashSuffix) { + // Two values differing only after position 256 (original collision case) + std::string value1(300, 'x'); + value1.replace(280, 20, "AAAAAAAAAAAAAAAAAAAA"); + + std::string 
value2(300, 'x'); + value2.replace(280, 20, "BBBBBBBBBBBBBBBBBBBB"); + + std::string truncated1 = CFieldValueTruncator::truncated(value1); + std::string truncated2 = CFieldValueTruncator::truncated(value2); + + // Must be distinct despite identical first 240 chars + BOOST_REQUIRE_NE(truncated1, truncated2); + BOOST_REQUIRE_EQUAL(CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH, truncated1.size()); + BOOST_REQUIRE_EQUAL(CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH, truncated2.size()); +} + +BOOST_AUTO_TEST_CASE(testDeterministicHashing) { + std::string value(1000, 'y'); + std::string result1 = CFieldValueTruncator::truncated(value); + std::string result2 = CFieldValueTruncator::truncated(value); + + BOOST_REQUIRE_EQUAL(result1, result2); +} + +BOOST_AUTO_TEST_CASE(testVeryLongValueWithDistinctEnding) { + // Simulate the 77K influencer case from issue #2796 + std::string value1(77000, 'x'); + value1.replace(76990, 10, "VARIANT_A"); + + std::string value2(77000, 'x'); + value2.replace(76990, 10, "VARIANT_B"); + + std::string truncated1 = CFieldValueTruncator::truncated(value1); + std::string truncated2 = CFieldValueTruncator::truncated(value2); + + // Must be distinct despite identical first 240 chars + BOOST_REQUIRE_NE(truncated1, truncated2); +} + BOOST_AUTO_TEST_SUITE_END() From 50379b457975e225ca82bed2af7699e9c4d37911 Mon Sep 17 00:00:00 2001 From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com> Date: Thu, 19 Mar 2026 10:24:04 +0100 Subject: [PATCH 04/11] Change hash suffix to 16 characters --- include/api/CAnomalyJob.h | 6 ++-- include/model/CFieldValueTruncator.h | 34 +++++++++---------- lib/api/CAnomalyJob.cc | 13 ++++--- .../unittest/CFieldValueTruncatorTest.cc | 26 +++++++------- 4 files changed, 38 insertions(+), 41 deletions(-) diff --git a/include/api/CAnomalyJob.h b/include/api/CAnomalyJob.h index 4167c649a7..fd1cc4bc88 100644 --- a/include/api/CAnomalyJob.h +++ b/include/api/CAnomalyJob.h @@ -400,9 +400,9 @@ class API_EXPORT CAnomalyJob : 
public CDataProcessor { //! \param fieldValues Output vector of pointers to field values //! \param truncatedCopies Storage for truncated copies (must remain valid while fieldValues is used) static void prepareTruncatedFieldValues(const TStrVec& fieldNames, - const TStrStrUMap& dataRowFields, - model::CAnomalyDetector::TStrCPtrVec& fieldValues, - TStrVec& truncatedCopies); + const TStrStrUMap& dataRowFields, + model::CAnomalyDetector::TStrCPtrVec& fieldValues, + TStrVec& truncatedCopies); //! Parses a control message requesting that model state be persisted. //! Extracts optional arguments to be used for persistence. diff --git a/include/model/CFieldValueTruncator.h b/include/model/CFieldValueTruncator.h index f62c1f6d26..b044f1adfc 100644 --- a/include/model/CFieldValueTruncator.h +++ b/include/model/CFieldValueTruncator.h @@ -30,12 +30,12 @@ namespace model { //! //! Values exceeding MAX_FIELD_VALUE_LENGTH (256 chars) are transformed using //! collision-safe truncation: -//! - Retain PREFIX_LENGTH (240) characters of original value +//! - Retain PREFIX_LENGTH (239) characters of original value //! - Append HASH_SEPARATOR ('$') -//! - Append HASH_HEX_DIGITS (15) character hex hash of complete original value +//! - Append HASH_HEX_DIGITS (16) character hex hash of complete original value //! -//! Format: "$" -//! Example: "very_long_field_value_that_exceeds_limit_and_continues_for_thousands_of_chars_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx$a1b2c3d4e5f6789" +//! Format: "$" +//! Example: "very_long_field_value_that_exceeds_limit_and_continues_for_thousands_of_chars_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx$a1b2c3d4e5f67890" //! //! The 256-character limit aligns with Elasticsearch's ignore_above default //! for keyword fields. 
The hash suffix ensures data integrity while maintaining @@ -49,21 +49,19 @@ class MODEL_EXPORT CFieldValueTruncator { //! Collision prevention format components static constexpr char HASH_SEPARATOR = '$'; - static constexpr std::size_t HASH_HEX_DIGITS = 15; // 15 hex chars for uint64_t - static constexpr std::size_t HASH_SUFFIX_LENGTH = - 1 /* separator */ + HASH_HEX_DIGITS; // 16 total + static constexpr std::size_t HASH_HEX_DIGITS = 16; // 16 hex chars = full 64-bit hash + static constexpr std::size_t HASH_SUFFIX_LENGTH = 1 /* separator */ + HASH_HEX_DIGITS; // 17 total //! Content prefix length (readable portion after truncation) - static constexpr std::size_t PREFIX_LENGTH = - MAX_FIELD_VALUE_LENGTH - HASH_SUFFIX_LENGTH; // 240 + static constexpr std::size_t PREFIX_LENGTH = MAX_FIELD_VALUE_LENGTH - HASH_SUFFIX_LENGTH; // 239 // Domain invariants (enforced at compile-time) static_assert(PREFIX_LENGTH + HASH_SUFFIX_LENGTH == MAX_FIELD_VALUE_LENGTH, "Term field format invariant: prefix + suffix = total length"); static_assert(PREFIX_LENGTH >= 200, "Readable prefix must be substantial for human comprehension"); - static_assert(HASH_HEX_DIGITS * 4 <= 64, - "Hash hex digits must fit in 64-bit hash output"); + static_assert(HASH_HEX_DIGITS * 4 == 64, + "Hash hex digits must represent full 64-bit hash output"); //! Check if a term field value exceeds the domain constraint. //! \return true if the value requires length enforcement @@ -114,9 +112,9 @@ class MODEL_EXPORT CFieldValueTruncator { //! Compute collision-resistant identity hash. //! Uses safeMurmurHash64 (endian-neutral) for state persistence safety. static std::uint64_t compute(const std::string& value) { - return core::CHashing::safeMurmurHash64( - value.data(), static_cast(value.size()), - 0); // Fixed seed for determinism + return core::CHashing::safeMurmurHash64(value.data(), + static_cast(value.size()), + 0); // Fixed seed for determinism } //! Format 64-bit hash as zero-padded lowercase hex string. 
@@ -124,9 +122,9 @@ class MODEL_EXPORT CFieldValueTruncator { //! \param[out] buffer Must be at least HASH_HEX_DIGITS + 1 bytes //! \return Pointer to null-terminated hex string in buffer static const char* toHex(std::uint64_t hash, char* buffer) { - // %015llx produces 15-char zero-padded lowercase hex - std::snprintf(buffer, HASH_HEX_DIGITS + 1, "%015llx", - static_cast(hash)); + // %016llx produces 16-char zero-padded lowercase hex (full 64 bits) + std::snprintf(buffer, HASH_HEX_DIGITS + 1, "%016llx", + static_cast(hash)); return buffer; } }; @@ -135,7 +133,7 @@ class MODEL_EXPORT CFieldValueTruncator { //! \param originalValue Complete untruncated value for hash computation //! \param[in,out] prefix Truncated prefix to which suffix is appended static void appendCollisionPreventionSuffix(const std::string& originalValue, - std::string& prefix) { + std::string& prefix) { std::uint64_t identityHash = HashEncoding::compute(originalValue); prefix.reserve(MAX_FIELD_VALUE_LENGTH); diff --git a/lib/api/CAnomalyJob.cc b/lib/api/CAnomalyJob.cc index e607bd9a3e..af3256e037 100644 --- a/lib/api/CAnomalyJob.cc +++ b/lib/api/CAnomalyJob.cc @@ -1701,11 +1701,10 @@ const std::string* CAnomalyJob::fieldValue(const std::string& fieldName, return !fieldName.empty() && fieldValue.empty() ? 
nullptr : &fieldValue; } -void CAnomalyJob::prepareTruncatedFieldValues( - const TStrVec& fieldNames, - const TStrStrUMap& dataRowFields, - model::CAnomalyDetector::TStrCPtrVec& fieldValues, - TStrVec& truncatedCopies) { +void CAnomalyJob::prepareTruncatedFieldValues(const TStrVec& fieldNames, + const TStrStrUMap& dataRowFields, + model::CAnomalyDetector::TStrCPtrVec& fieldValues, + TStrVec& truncatedCopies) { fieldValues.reserve(fieldNames.size()); truncatedCopies.reserve(fieldNames.size()); @@ -1719,8 +1718,8 @@ void CAnomalyJob::prepareTruncatedFieldValues( std::string escapedFieldName = fieldName; core::CStringUtils::escape('\\', "\n\r\t", escapedFieldName); LOG_WARN(<< "Field '" << escapedFieldName - << "' value (length=" << value->size() - << ", prefix='" << value->substr(0, std::min(50, value->size())) + << "' value (length=" << value->size() << ", prefix='" + << value->substr(0, std::min(50, value->size())) << "...') exceeds " << model::CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH << " characters and has been truncated with collision-safe hash suffix"); } else { diff --git a/lib/model/unittest/CFieldValueTruncatorTest.cc b/lib/model/unittest/CFieldValueTruncatorTest.cc index 65c3777da3..6cc61174e2 100644 --- a/lib/model/unittest/CFieldValueTruncatorTest.cc +++ b/lib/model/unittest/CFieldValueTruncatorTest.cc @@ -84,18 +84,18 @@ BOOST_AUTO_TEST_CASE(testTruncatedValueHasCorrectFormat) { std::string value(1000, 'x'); std::string result = CFieldValueTruncator::truncated(value); - // Format: 240 prefix + '$' + 15 hex chars = 256 total + // Format: 239 prefix + '$' + 16 hex chars = 256 total BOOST_REQUIRE_EQUAL(256, result.size()); - BOOST_REQUIRE_EQUAL(CFieldValueTruncator::HASH_SEPARATOR, result[240]); + BOOST_REQUIRE_EQUAL(CFieldValueTruncator::HASH_SEPARATOR, result[239]); // Prefix should match original - BOOST_REQUIRE_EQUAL(0, result.compare(0, 240, value, 0, 240)); + BOOST_REQUIRE_EQUAL(0, result.compare(0, 239, value, 0, 239)); // Hash portion should be 
lowercase hex digits - for (std::size_t i = 241; i < 256; ++i) { + for (std::size_t i = 240; i < 256; ++i) { BOOST_REQUIRE(std::isxdigit(result[i])); BOOST_REQUIRE((result[i] >= '0' && result[i] <= '9') || - (result[i] >= 'a' && result[i] <= 'f')); + (result[i] >= 'a' && result[i] <= 'f')); } } @@ -105,10 +105,10 @@ BOOST_AUTO_TEST_CASE(testInPlaceTruncationPreservesFormat) { BOOST_REQUIRE_EQUAL(true, wasTruncated); BOOST_REQUIRE_EQUAL(256, value.size()); - BOOST_REQUIRE_EQUAL(CFieldValueTruncator::HASH_SEPARATOR, value[240]); + BOOST_REQUIRE_EQUAL(CFieldValueTruncator::HASH_SEPARATOR, value[239]); // Verify hash portion is valid hex - for (std::size_t i = 241; i < 256; ++i) { + for (std::size_t i = 240; i < 256; ++i) { BOOST_REQUIRE(std::isxdigit(value[i])); } } @@ -118,18 +118,18 @@ BOOST_AUTO_TEST_CASE(testInPlaceTruncationPreservesFormat) { // ============================================================================ BOOST_AUTO_TEST_CASE(testDistinctValuesProduceDistinctResults) { - std::string prefix(240, 'x'); + std::string prefix(239, 'x'); std::string value1 = prefix + std::string(1000, 'A'); std::string value2 = prefix + std::string(1000, 'B'); std::string truncated1 = CFieldValueTruncator::truncated(value1); std::string truncated2 = CFieldValueTruncator::truncated(value2); - // Same prefix - BOOST_REQUIRE_EQUAL(truncated1.substr(0, 241), truncated2.substr(0, 241)); + // Same prefix (239 chars + separator) + BOOST_REQUIRE_EQUAL(truncated1.substr(0, 240), truncated2.substr(0, 240)); // But different hash suffixes prevent collision - BOOST_REQUIRE_NE(truncated1.substr(241), truncated2.substr(241)); + BOOST_REQUIRE_NE(truncated1.substr(240), truncated2.substr(240)); BOOST_REQUIRE_NE(truncated1, truncated2); } @@ -144,7 +144,7 @@ BOOST_AUTO_TEST_CASE(testCollisionsPreventedByHashSuffix) { std::string truncated1 = CFieldValueTruncator::truncated(value1); std::string truncated2 = CFieldValueTruncator::truncated(value2); - // Must be distinct despite identical 
first 240 chars + // Must be distinct despite identical first 239 chars BOOST_REQUIRE_NE(truncated1, truncated2); BOOST_REQUIRE_EQUAL(CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH, truncated1.size()); BOOST_REQUIRE_EQUAL(CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH, truncated2.size()); @@ -169,7 +169,7 @@ BOOST_AUTO_TEST_CASE(testVeryLongValueWithDistinctEnding) { std::string truncated1 = CFieldValueTruncator::truncated(value1); std::string truncated2 = CFieldValueTruncator::truncated(value2); - // Must be distinct despite identical first 240 chars + // Must be distinct despite identical first 239 chars BOOST_REQUIRE_NE(truncated1, truncated2); } From dd955e5a82bb0fb11f9d9d7720f5f7fef177b1f0 Mon Sep 17 00:00:00 2001 From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com> Date: Thu, 19 Mar 2026 10:56:16 +0100 Subject: [PATCH 05/11] Add tests --- docs/CHANGELOG.asciidoc | 2 +- lib/api/CAnomalyJob.cc | 1 + lib/model/CBucketGatherer.cc | 7 +- .../unittest/CEventRateDataGathererTest.cc | 75 ++++++++++++++++++ lib/model/unittest/CMetricDataGathererTest.cc | 77 +++++++++++++++++++ 5 files changed, 157 insertions(+), 5 deletions(-) diff --git a/docs/CHANGELOG.asciidoc b/docs/CHANGELOG.asciidoc index 8da062cb0b..5c2485db47 100644 --- a/docs/CHANGELOG.asciidoc +++ b/docs/CHANGELOG.asciidoc @@ -32,7 +32,7 @@ === Bug Fixes -* Truncate oversized field values to prevent autodetect process crash. (See {ml-pull}2929[#2929].) +* Truncate oversized field values to prevent autodetect process crash. (See {ml-pull}2929[#2929], {es-pull}143180[#143180], issue: {ml-issue}2796[#2796].) * Report RSS in bytes instead of pages. (See {ml-pull}2917[#2917].) 
=== Enhancements diff --git a/lib/api/CAnomalyJob.cc b/lib/api/CAnomalyJob.cc index af3256e037..8082f6c08f 100644 --- a/lib/api/CAnomalyJob.cc +++ b/lib/api/CAnomalyJob.cc @@ -1707,6 +1707,7 @@ void CAnomalyJob::prepareTruncatedFieldValues(const TStrVec& fieldNames, TStrVec& truncatedCopies) { fieldValues.reserve(fieldNames.size()); + // Reserve ensures no reallocation invalidates pointers stored in fieldValues. truncatedCopies.reserve(fieldNames.size()); for (const auto& fieldName : fieldNames) { diff --git a/lib/model/CBucketGatherer.cc b/lib/model/CBucketGatherer.cc index 520561e7be..70e9f95114 100644 --- a/lib/model/CBucketGatherer.cc +++ b/lib/model/CBucketGatherer.cc @@ -116,10 +116,9 @@ bool restoreInfluencerPersonAttributeCounts(core::CStateRestoreTraverser& traver const std::string& name = traverser.name(); RESTORE_BUILT_IN(PERSON_UID_TAG, person) RESTORE_BUILT_IN(ATTRIBUTE_UID_TAG, attribute) - RESTORE_NO_ERROR(INFLUENCER_TAG, influence = traverser.value()) - if (name == INFLUENCER_TAG) { - CFieldValueTruncator::truncate(influence); - } + RESTORE_NO_ERROR(INFLUENCER_TAG, influence = traverser.value(); + CFieldValueTruncator::truncate(influence)) + if (name == COUNT_TAG) { if (core::CStringUtils::stringToType(traverser.value(), count) == false) { LOG_ERROR(<< "Failed to restore COUNT_TAG, got " << traverser.value()); diff --git a/lib/model/unittest/CEventRateDataGathererTest.cc b/lib/model/unittest/CEventRateDataGathererTest.cc index 859d0e204d..b540b38245 100644 --- a/lib/model/unittest/CEventRateDataGathererTest.cc +++ b/lib/model/unittest/CEventRateDataGathererTest.cc @@ -1891,4 +1891,79 @@ BOOST_FIXTURE_TEST_CASE(testDiurnalFeatures, CDiurnalTestFixture) { } } +BOOST_FIXTURE_TEST_CASE(testRestoreTruncatesOversizedInfluencerValues, CTestFixture) { + // Verify that oversized influencer field values persisted in old state + // are truncated on restore. 
This exercises truncation in: + // - CBucketGatherer::restoreInfluencerPersonAttributeCounts (Finding 7) + // - CEventRateBucketGatherer::restoreInfluencerUniqueStrings (Finding 8) + + constexpr core_t::TTime startTime = 0; + constexpr core_t::TTime bucketLength = 600; + SModelParams params(bucketLength); + params.s_LatencyBuckets = 2; + + TFeatureVec features; + features.push_back(model_t::E_IndividualUniqueCountByBucketAndPerson); + TStrVec influencerFieldNames{"IF1"}; + + CDataGatherer gatherer = + CDataGathererBuilder(model_t::E_EventRate, features, params, key, startTime) + .personFieldName("P") + .valueFieldName("V") + .influenceFieldNames(influencerFieldNames) + .build(); + + BOOST_REQUIRE_EQUAL(0, addPerson(gatherer, m_ResourceMonitor, "p", "v", 1)); + + // Add arrivals with an oversized influencer value (bypasses CAnomalyJob input truncation). + std::string const oversizedInfluencer(500, 'x'); + addArrival(gatherer, m_ResourceMonitor, startTime + 1, "p", "val1", oversizedInfluencer); + addArrival(gatherer, m_ResourceMonitor, startTime + 2, "p", "val2", oversizedInfluencer); + + // Persist — the JSON will contain the oversized influencer value. + std::ostringstream origJson; + core::CJsonStatePersistInserter::persist( + origJson, [&gatherer](core::CJsonStatePersistInserter& inserter) { + gatherer.acceptPersistInserter(inserter); + }); + + // Sanity check: the persisted JSON contains the full oversized value. + BOOST_TEST_REQUIRE(origJson.str().find(oversizedInfluencer) != std::string::npos); + + // Restore from persisted JSON — truncation should apply. 
+ std::istringstream origJsonStrm{"{\"topLevel\" : " + origJson.str() + "}"}; + core::CJsonStateRestoreTraverser traverser(origJsonStrm); + + CBucketGatherer::SBucketGathererInitData bucketGathererInitData{ + EMPTY_STRING, EMPTY_STRING, EMPTY_STRING, EMPTY_STRING, {}, 0, 0}; + CDataGatherer restoredGatherer(model_t::E_EventRate, model_t::E_None, + params, EMPTY_STRING, key, + bucketGathererInitData, traverser); + + // Persist restored gatherer — should NOT contain the oversized value. + std::ostringstream restoredJson; + core::CJsonStatePersistInserter::persist( + restoredJson, [&restoredGatherer](core::CJsonStatePersistInserter& inserter) { + restoredGatherer.acceptPersistInserter(inserter); + }); + + // The full 500-char string must no longer appear (it was truncated to 256). + BOOST_TEST_REQUIRE(restoredJson.str().find(oversizedInfluencer) == std::string::npos); + + // Verify idempotency: restore again and persist — should be identical. + std::istringstream restoredJsonStrm{"{\"topLevel\" : " + restoredJson.str() + "}"}; + core::CJsonStateRestoreTraverser traverser2(restoredJsonStrm); + CDataGatherer restoredGatherer2(model_t::E_EventRate, model_t::E_None, + params, EMPTY_STRING, key, + bucketGathererInitData, traverser2); + + std::ostringstream restoredJson2; + core::CJsonStatePersistInserter::persist( + restoredJson2, [&restoredGatherer2](core::CJsonStatePersistInserter& inserter) { + restoredGatherer2.acceptPersistInserter(inserter); + }); + + BOOST_REQUIRE_EQUAL(restoredJson.str(), restoredJson2.str()); +} + BOOST_AUTO_TEST_SUITE_END() diff --git a/lib/model/unittest/CMetricDataGathererTest.cc b/lib/model/unittest/CMetricDataGathererTest.cc index 76feaf0546..0941e95827 100644 --- a/lib/model/unittest/CMetricDataGathererTest.cc +++ b/lib/model/unittest/CMetricDataGathererTest.cc @@ -1843,4 +1843,81 @@ BOOST_FIXTURE_TEST_CASE(testVarp, CTestFixture) { } } +BOOST_FIXTURE_TEST_CASE(testRestoreTruncatesOversizedInfluencerSums, CTestFixture) { + // Verify that 
oversized influencer keys in CGathererTools::SInfluencerSumSerializer + // are truncated on restore. This exercises truncation in the metric sum gatherer's + // influencer bucket sum restore path. + + constexpr core_t::TTime startTime = 0; + constexpr core_t::TTime bucketLength = 600; + SModelParams params(bucketLength); + params.s_LatencyBuckets = 2; + params.s_SampleCountFactor = 1; + params.s_SampleQueueGrowthFactor = 0.1; + + TFeatureVec features; + features.push_back(model_t::E_IndividualSumByBucketAndPerson); + TStrVec const influencerNames{"i1"}; + + CDataGatherer gatherer = + CDataGathererBuilder(model_t::E_Metric, features, params, KEY, startTime) + .influenceFieldNames(influencerNames) + .sampleCountOverride(2U) + .build(); + + BOOST_REQUIRE_EQUAL(0, addPerson("p", gatherer, m_ResourceMonitor, 1)); + + // Add arrivals with an oversized influencer value (bypasses CAnomalyJob input truncation). + std::string const oversizedInfluencer(500, 'y'); + addArrival(gatherer, m_ResourceMonitor, startTime + 1, "p", 1.0, + oversizedInfluencer, ""); + addArrival(gatherer, m_ResourceMonitor, startTime + 2, "p", 2.0, + oversizedInfluencer, ""); + + // Persist — the JSON will contain the oversized influencer value. + std::ostringstream origJson; + core::CJsonStatePersistInserter::persist( + origJson, [&gatherer](core::CJsonStatePersistInserter& inserter) { + gatherer.acceptPersistInserter(inserter); + }); + + // Sanity check: the persisted JSON contains the full oversized value. + BOOST_TEST_REQUIRE(origJson.str().find(oversizedInfluencer) != std::string::npos); + + // Restore from persisted JSON — truncation should apply. 
+ std::istringstream origJsonStrm{"{\"topLevel\" : " + origJson.str() + "}"}; + core::CJsonStateRestoreTraverser traverser(origJsonStrm); + + CBucketGatherer::SBucketGathererInitData bucketGathererInitData{ + EMPTY_STRING, EMPTY_STRING, EMPTY_STRING, EMPTY_STRING, {}, 0, 0}; + CDataGatherer restoredGatherer(model_t::E_Metric, model_t::E_None, + params, EMPTY_STRING, KEY, + bucketGathererInitData, traverser); + + // Persist restored gatherer — should NOT contain the oversized value. + std::ostringstream restoredJson; + core::CJsonStatePersistInserter::persist( + restoredJson, [&restoredGatherer](core::CJsonStatePersistInserter& inserter) { + restoredGatherer.acceptPersistInserter(inserter); + }); + + // The full 500-char string must no longer appear (it was truncated to 256). + BOOST_TEST_REQUIRE(restoredJson.str().find(oversizedInfluencer) == std::string::npos); + + // Verify idempotency: restore again and persist — should be identical. + std::istringstream restoredJsonStrm{"{\"topLevel\" : " + restoredJson.str() + "}"}; + core::CJsonStateRestoreTraverser traverser2(restoredJsonStrm); + CDataGatherer restoredGatherer2(model_t::E_Metric, model_t::E_None, + params, EMPTY_STRING, KEY, + bucketGathererInitData, traverser2); + + std::ostringstream restoredJson2; + core::CJsonStatePersistInserter::persist( + restoredJson2, [&restoredGatherer2](core::CJsonStatePersistInserter& inserter) { + restoredGatherer2.acceptPersistInserter(inserter); + }); + + BOOST_REQUIRE_EQUAL(restoredJson.str(), restoredJson2.str()); +} + BOOST_AUTO_TEST_SUITE_END() From 3f6471f63c0dd665039830ef676b73f5936c3f35 Mon Sep 17 00:00:00 2001 From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com> Date: Thu, 19 Mar 2026 11:08:24 +0100 Subject: [PATCH 06/11] formatting --- .../unittest/CEventRateDataGathererTest.cc | 22 +++++++++---------- lib/model/unittest/CMetricDataGathererTest.cc | 16 +++++--------- 2 files changed, 16 insertions(+), 22 deletions(-) diff --git 
a/lib/model/unittest/CEventRateDataGathererTest.cc b/lib/model/unittest/CEventRateDataGathererTest.cc index b540b38245..70e6c55bea 100644 --- a/lib/model/unittest/CEventRateDataGathererTest.cc +++ b/lib/model/unittest/CEventRateDataGathererTest.cc @@ -1906,12 +1906,12 @@ BOOST_FIXTURE_TEST_CASE(testRestoreTruncatesOversizedInfluencerValues, CTestFixt features.push_back(model_t::E_IndividualUniqueCountByBucketAndPerson); TStrVec influencerFieldNames{"IF1"}; - CDataGatherer gatherer = - CDataGathererBuilder(model_t::E_EventRate, features, params, key, startTime) - .personFieldName("P") - .valueFieldName("V") - .influenceFieldNames(influencerFieldNames) - .build(); + CDataGatherer gatherer = CDataGathererBuilder(model_t::E_EventRate, features, + params, key, startTime) + .personFieldName("P") + .valueFieldName("V") + .influenceFieldNames(influencerFieldNames) + .build(); BOOST_REQUIRE_EQUAL(0, addPerson(gatherer, m_ResourceMonitor, "p", "v", 1)); @@ -1936,9 +1936,8 @@ BOOST_FIXTURE_TEST_CASE(testRestoreTruncatesOversizedInfluencerValues, CTestFixt CBucketGatherer::SBucketGathererInitData bucketGathererInitData{ EMPTY_STRING, EMPTY_STRING, EMPTY_STRING, EMPTY_STRING, {}, 0, 0}; - CDataGatherer restoredGatherer(model_t::E_EventRate, model_t::E_None, - params, EMPTY_STRING, key, - bucketGathererInitData, traverser); + CDataGatherer restoredGatherer(model_t::E_EventRate, model_t::E_None, params, + EMPTY_STRING, key, bucketGathererInitData, traverser); // Persist restored gatherer — should NOT contain the oversized value. std::ostringstream restoredJson; @@ -1953,9 +1952,8 @@ BOOST_FIXTURE_TEST_CASE(testRestoreTruncatesOversizedInfluencerValues, CTestFixt // Verify idempotency: restore again and persist — should be identical. 
std::istringstream restoredJsonStrm{"{\"topLevel\" : " + restoredJson.str() + "}"}; core::CJsonStateRestoreTraverser traverser2(restoredJsonStrm); - CDataGatherer restoredGatherer2(model_t::E_EventRate, model_t::E_None, - params, EMPTY_STRING, key, - bucketGathererInitData, traverser2); + CDataGatherer restoredGatherer2(model_t::E_EventRate, model_t::E_None, params, + EMPTY_STRING, key, bucketGathererInitData, traverser2); std::ostringstream restoredJson2; core::CJsonStatePersistInserter::persist( diff --git a/lib/model/unittest/CMetricDataGathererTest.cc b/lib/model/unittest/CMetricDataGathererTest.cc index 0941e95827..b3085727c7 100644 --- a/lib/model/unittest/CMetricDataGathererTest.cc +++ b/lib/model/unittest/CMetricDataGathererTest.cc @@ -1869,10 +1869,8 @@ BOOST_FIXTURE_TEST_CASE(testRestoreTruncatesOversizedInfluencerSums, CTestFixtur // Add arrivals with an oversized influencer value (bypasses CAnomalyJob input truncation). std::string const oversizedInfluencer(500, 'y'); - addArrival(gatherer, m_ResourceMonitor, startTime + 1, "p", 1.0, - oversizedInfluencer, ""); - addArrival(gatherer, m_ResourceMonitor, startTime + 2, "p", 2.0, - oversizedInfluencer, ""); + addArrival(gatherer, m_ResourceMonitor, startTime + 1, "p", 1.0, oversizedInfluencer, ""); + addArrival(gatherer, m_ResourceMonitor, startTime + 2, "p", 2.0, oversizedInfluencer, ""); // Persist — the JSON will contain the oversized influencer value. 
std::ostringstream origJson; @@ -1890,9 +1888,8 @@ BOOST_FIXTURE_TEST_CASE(testRestoreTruncatesOversizedInfluencerSums, CTestFixtur CBucketGatherer::SBucketGathererInitData bucketGathererInitData{ EMPTY_STRING, EMPTY_STRING, EMPTY_STRING, EMPTY_STRING, {}, 0, 0}; - CDataGatherer restoredGatherer(model_t::E_Metric, model_t::E_None, - params, EMPTY_STRING, KEY, - bucketGathererInitData, traverser); + CDataGatherer restoredGatherer(model_t::E_Metric, model_t::E_None, params, EMPTY_STRING, + KEY, bucketGathererInitData, traverser); // Persist restored gatherer — should NOT contain the oversized value. std::ostringstream restoredJson; @@ -1907,9 +1904,8 @@ BOOST_FIXTURE_TEST_CASE(testRestoreTruncatesOversizedInfluencerSums, CTestFixtur // Verify idempotency: restore again and persist — should be identical. std::istringstream restoredJsonStrm{"{\"topLevel\" : " + restoredJson.str() + "}"}; core::CJsonStateRestoreTraverser traverser2(restoredJsonStrm); - CDataGatherer restoredGatherer2(model_t::E_Metric, model_t::E_None, - params, EMPTY_STRING, KEY, - bucketGathererInitData, traverser2); + CDataGatherer restoredGatherer2(model_t::E_Metric, model_t::E_None, params, EMPTY_STRING, + KEY, bucketGathererInitData, traverser2); std::ostringstream restoredJson2; core::CJsonStatePersistInserter::persist( From ac73b637bdf677b50342063abfa54b1554166711 Mon Sep 17 00:00:00 2001 From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com> Date: Thu, 19 Mar 2026 12:18:16 +0100 Subject: [PATCH 07/11] clean up --- include/model/CFieldValueTruncator.h | 12 ++++------ .../unittest/CDynamicStringIdRegistryTest.cc | 2 +- .../unittest/CFieldValueTruncatorTest.cc | 24 +------------------ 3 files changed, 7 insertions(+), 31 deletions(-) diff --git a/include/model/CFieldValueTruncator.h b/include/model/CFieldValueTruncator.h index b044f1adfc..c6df93a803 100644 --- a/include/model/CFieldValueTruncator.h +++ b/include/model/CFieldValueTruncator.h @@ -39,12 +39,11 @@ namespace model { //! 
//! The 256-character limit aligns with Elasticsearch's ignore_above default //! for keyword fields. The hash suffix ensures data integrity while maintaining -//! human readability (first 240 characters visible) and compatibility with -//! prefix-based filtering. Collision probability is ~1 in 10^18 (effectively zero). +//! human readability (first 239 characters visible) and compatibility with +//! prefix-based filtering. Collision probability is ~1 in 10^19 (effectively zero). class MODEL_EXPORT CFieldValueTruncator { public: - //! Domain constraint: Maximum length for term fields in anomaly detection. - //! Aligned with Elasticsearch's ignore_above default for keyword fields. + //! Maximum length for term fields in anomaly detection. static constexpr std::size_t MAX_FIELD_VALUE_LENGTH = 256; //! Collision prevention format components @@ -86,13 +85,12 @@ class MODEL_EXPORT CFieldValueTruncator { } //! Enforce term field length constraint, returning constrained copy. - //! Original value unchanged. For performance, call needsTruncation() first - //! to avoid copying when constraint is already satisfied. + //! Original value unchanged. //! \param value Original field value //! 
\return Copy with length constraint enforced static std::string truncated(const std::string& value) { if (needsTruncation(value) == false) { - return value; // RVO applies + return value; } std::string result; diff --git a/lib/model/unittest/CDynamicStringIdRegistryTest.cc b/lib/model/unittest/CDynamicStringIdRegistryTest.cc index 60cea2b1da..b060340dde 100644 --- a/lib/model/unittest/CDynamicStringIdRegistryTest.cc +++ b/lib/model/unittest/CDynamicStringIdRegistryTest.cc @@ -118,7 +118,7 @@ BOOST_AUTO_TEST_CASE(testRestoreTruncatesOversizedNames) { bool addedPerson = false; std::string shortName("foo"); - std::string oversizedName(77000, 'x'); + std::string oversizedName(1000, 'x'); registry.addName(shortName, 0, resourceMonitor, addedPerson); registry.addName(oversizedName, 0, resourceMonitor, addedPerson); diff --git a/lib/model/unittest/CFieldValueTruncatorTest.cc b/lib/model/unittest/CFieldValueTruncatorTest.cc index 6cc61174e2..b17a33b7ce 100644 --- a/lib/model/unittest/CFieldValueTruncatorTest.cc +++ b/lib/model/unittest/CFieldValueTruncatorTest.cc @@ -59,12 +59,6 @@ BOOST_AUTO_TEST_CASE(testConstOverloadShortValueReturnsSame) { BOOST_REQUIRE_EQUAL("short", result); } -BOOST_AUTO_TEST_CASE(testVeryLargeValueFromIssue2796) { - std::string value(77000, 'y'); - BOOST_REQUIRE_EQUAL(true, CFieldValueTruncator::truncate(value)); - BOOST_REQUIRE_EQUAL(CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH, value.size()); -} - BOOST_AUTO_TEST_CASE(testNeedsTruncation) { BOOST_REQUIRE_EQUAL(false, CFieldValueTruncator::needsTruncation("short")); BOOST_REQUIRE_EQUAL(false, CFieldValueTruncator::needsTruncation("")); @@ -72,8 +66,7 @@ BOOST_AUTO_TEST_CASE(testNeedsTruncation) { CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH, 'x'))); BOOST_REQUIRE_EQUAL(true, CFieldValueTruncator::needsTruncation(std::string( CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH + 1, 'x'))); - BOOST_REQUIRE_EQUAL( - true, CFieldValueTruncator::needsTruncation(std::string(77000, 'x'))); + 
BOOST_REQUIRE_EQUAL(true, CFieldValueTruncator::needsTruncation(std::string(1000, 'x'))); } // ============================================================================ @@ -158,19 +151,4 @@ BOOST_AUTO_TEST_CASE(testDeterministicHashing) { BOOST_REQUIRE_EQUAL(result1, result2); } -BOOST_AUTO_TEST_CASE(testVeryLongValueWithDistinctEnding) { - // Simulate the 77K influencer case from issue #2796 - std::string value1(77000, 'x'); - value1.replace(76990, 10, "VARIANT_A"); - - std::string value2(77000, 'x'); - value2.replace(76990, 10, "VARIANT_B"); - - std::string truncated1 = CFieldValueTruncator::truncated(value1); - std::string truncated2 = CFieldValueTruncator::truncated(value2); - - // Must be distinct despite identical first 239 chars - BOOST_REQUIRE_NE(truncated1, truncated2); -} - BOOST_AUTO_TEST_SUITE_END() From 222697199aa69e2f220cb75c82549802659187c1 Mon Sep 17 00:00:00 2001 From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com> Date: Thu, 19 Mar 2026 12:32:09 +0100 Subject: [PATCH 08/11] fix unit test --- lib/model/unittest/CMetricDataGathererTest.cc | 25 +++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/lib/model/unittest/CMetricDataGathererTest.cc b/lib/model/unittest/CMetricDataGathererTest.cc index b3085727c7..57bdc9838e 100644 --- a/lib/model/unittest/CMetricDataGathererTest.cc +++ b/lib/model/unittest/CMetricDataGathererTest.cc @@ -111,6 +111,24 @@ void addArrival(CDataGatherer& gatherer, gatherer.addArrival(fieldValues, eventData, resourceMonitor); } +void addArrival(CDataGatherer& gatherer, + CResourceMonitor& resourceMonitor, + core_t::TTime time, + const std::string& person, + double value, + const std::string& influencer) { + CDataGatherer::TStrCPtrVec fieldValues; + fieldValues.push_back(&person); + fieldValues.push_back(influencer.empty() ? 
nullptr : &influencer); + std::string const valueAsString(core::CStringUtils::typeToString(value)); + fieldValues.push_back(&valueAsString); + + CEventData eventData; + eventData.time(time); + + gatherer.addArrival(fieldValues, eventData, resourceMonitor); +} + void addArrival(CDataGatherer& gatherer, CResourceMonitor& resourceMonitor, core_t::TTime time, @@ -1869,8 +1887,11 @@ BOOST_FIXTURE_TEST_CASE(testRestoreTruncatesOversizedInfluencerSums, CTestFixtur // Add arrivals with an oversized influencer value (bypasses CAnomalyJob input truncation). std::string const oversizedInfluencer(500, 'y'); - addArrival(gatherer, m_ResourceMonitor, startTime + 1, "p", 1.0, oversizedInfluencer, ""); - addArrival(gatherer, m_ResourceMonitor, startTime + 2, "p", 2.0, oversizedInfluencer, ""); + addArrival(gatherer, m_ResourceMonitor, startTime + 1, "p", 1.0, oversizedInfluencer); + addArrival(gatherer, m_ResourceMonitor, startTime + 2, "p", 2.0, oversizedInfluencer); + + // Advance past the first bucket so influencer sums are flushed to the persistable queue. + gatherer.timeNow(startTime + bucketLength); // Persist — the JSON will contain the oversized influencer value. 
std::ostringstream origJson; From 72c3d6b89aaf393cf6402c47ac98af5e92bdea5a Mon Sep 17 00:00:00 2001 From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com> Date: Thu, 19 Mar 2026 14:37:49 +0100 Subject: [PATCH 09/11] review comments --- include/core/CLoggerThrottler.h | 3 +- include/core/LogMacros.h | 18 +++++++ include/model/CFieldValueTruncator.h | 7 +-- lib/api/CAnomalyJob.cc | 11 +++-- lib/api/CDataProcessor.cc | 3 +- lib/api/unittest/CAnomalyJobTest.cc | 47 +++++++++++++++++-- .../unittest/CEventRateDataGathererTest.cc | 5 ++ lib/model/unittest/CMetricDataGathererTest.cc | 5 ++ 8 files changed, 83 insertions(+), 16 deletions(-) diff --git a/include/core/CLoggerThrottler.h b/include/core/CLoggerThrottler.h index c6a4505128..58b89c4d8b 100644 --- a/include/core/CLoggerThrottler.h +++ b/include/core/CLoggerThrottler.h @@ -30,7 +30,8 @@ namespace core { //! This is thread safe but uses a very simple strategy: all accesses to a single //! hash map are sychronised. We assume that log throttling is only applied to //! messages which normally occur infrequently; for example, this is only currently -//! applied to WARN and ERROR level logging (see LogMacros.h). So there will be +//! applied to WARN, ERROR, and throttled INFO (LOG_INFO_THROTTLED) logging +//! (see LogMacros.h). So there will be //! little contention. Furthermore, the overhead of locking and unlocking the mutex //! should be neglible compared to the work done if the log line were actually //! emitted. 
So this should actually give a significant performance improvement diff --git a/include/core/LogMacros.h b/include/core/LogMacros.h index 0c84a88d21..abb96afa21 100644 --- a/include/core/LogMacros.h +++ b/include/core/LogMacros.h @@ -83,6 +83,24 @@ BOOST_LOG_STREAM_SEV(ml::core::CLogger::instance().logger(), ml::core::CLogger::E_Info) \ LOG_LOCATION_INFO \ message +#ifdef LOG_INFO_THROTTLED +#undef LOG_INFO_THROTTLED +#endif +#define LOG_INFO_THROTTLED(message) \ + do { \ + std::size_t countOfInfoMessages; \ + bool skipInfoMessage; \ + std::tie(countOfInfoMessages, skipInfoMessage) = \ + ml::core::CLogger::instance().throttler().skip(__FILE__, __LINE__); \ + if (skipInfoMessage == false) { \ + BOOST_LOG_STREAM_SEV(ml::core::CLogger::instance().logger(), \ + ml::core::CLogger::E_Info) \ + LOG_LOCATION_INFO \ + message << (countOfInfoMessages > 1 \ + ? " | repeated [" + std::to_string(countOfInfoMessages) + "]" \ + : ""); \ + } \ + } while (0) #ifdef LOG_WARN #undef LOG_WARN #endif diff --git a/include/model/CFieldValueTruncator.h b/include/model/CFieldValueTruncator.h index c6df93a803..6c0b94aa60 100644 --- a/include/model/CFieldValueTruncator.h +++ b/include/model/CFieldValueTruncator.h @@ -35,12 +35,12 @@ namespace model { //! - Append HASH_HEX_DIGITS (16) character hex hash of complete original value //! //! Format: "$" -//! Example: "very_long_field_value_that_exceeds_limit_and_continues_for_thousands_of_chars_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx$a1b2c3d4e5f67890" +//! Example: "very_long_field_value_that_exceeds_limit_(...)$a1b2c3d4e5f67890" //! //! The 256-character limit aligns with Elasticsearch's ignore_above default //! for keyword fields. The hash suffix ensures data integrity while maintaining //! human readability (first 239 characters visible) and compatibility with -//! prefix-based filtering. 
Collision probability is ~1 in 10^19 (effectively zero). +//! prefix-based filtering. class MODEL_EXPORT CFieldValueTruncator { public: //! Maximum length for term fields in anomaly detection. @@ -59,8 +59,6 @@ class MODEL_EXPORT CFieldValueTruncator { "Term field format invariant: prefix + suffix = total length"); static_assert(PREFIX_LENGTH >= 200, "Readable prefix must be substantial for human comprehension"); - static_assert(HASH_HEX_DIGITS * 4 == 64, - "Hash hex digits must represent full 64-bit hash output"); //! Check if a term field value exceeds the domain constraint. //! \return true if the value requires length enforcement @@ -85,7 +83,6 @@ class MODEL_EXPORT CFieldValueTruncator { } //! Enforce term field length constraint, returning constrained copy. - //! Original value unchanged. //! \param value Original field value //! \return Copy with length constraint enforced static std::string truncated(const std::string& value) { diff --git a/lib/api/CAnomalyJob.cc b/lib/api/CAnomalyJob.cc index 8082f6c08f..374becaf37 100644 --- a/lib/api/CAnomalyJob.cc +++ b/lib/api/CAnomalyJob.cc @@ -1718,11 +1718,12 @@ void CAnomalyJob::prepareTruncatedFieldValues(const TStrVec& fieldNames, std::string escapedFieldName = fieldName; core::CStringUtils::escape('\\', "\n\r\t", escapedFieldName); - LOG_WARN(<< "Field '" << escapedFieldName - << "' value (length=" << value->size() << ", prefix='" - << value->substr(0, std::min(50, value->size())) - << "...') exceeds " << model::CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH - << " characters and has been truncated with collision-safe hash suffix"); + LOG_INFO_THROTTLED( + << "Field '" << escapedFieldName + << "' value (length=" << value->size() << ", prefix='" + << value->substr(0, std::min(50, value->size())) + << "...') exceeds " << model::CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH + << " characters and has been truncated with collision-safe hash suffix"); } else { fieldValues.push_back(value); } diff --git 
a/lib/api/CDataProcessor.cc b/lib/api/CDataProcessor.cc index 3ffdb4915e..cb796e19a1 100644 --- a/lib/api/CDataProcessor.cc +++ b/lib/api/CDataProcessor.cc @@ -53,8 +53,7 @@ std::string CDataProcessor::debugPrintRecord(const TStrStrUMap& dataRowFields) { fieldNames.append(rowIter->first); const std::string& val = rowIter->second; if (model::CFieldValueTruncator::needsTruncation(val)) { - fieldValues.append(val.substr(0, model::CFieldValueTruncator::MAX_FIELD_VALUE_LENGTH)); - fieldValues.append("..."); + fieldValues.append(model::CFieldValueTruncator::truncated(val)); } else { fieldValues.append(val); } diff --git a/lib/api/unittest/CAnomalyJobTest.cc b/lib/api/unittest/CAnomalyJobTest.cc index 681a316641..3e9a9da8f3 100644 --- a/lib/api/unittest/CAnomalyJobTest.cc +++ b/lib/api/unittest/CAnomalyJobTest.cc @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -1207,6 +1208,10 @@ BOOST_AUTO_TEST_CASE(testHierarchicalResultsNormalizerShouldIncreaseMemoryUsage) } BOOST_AUTO_TEST_CASE(testOversizedFieldValuesTruncated) { + // Verify that addRecord (via prepareTruncatedFieldValues) truncates oversized + // by/influencer values before they enter the model. We assert on persisted + // state because that reflects what the detector stored; if addRecord did + // not truncate, the full value would appear here. 
model::CLimits limits; api::CAnomalyJobConfig jobConfig = CTestAnomalyJob::makeSimpleJobConfig( "count", "", "by_field", "", "", {"influencer_field"}); @@ -1218,13 +1223,32 @@ BOOST_AUTO_TEST_CASE(testOversizedFieldValuesTruncated) { CTestAnomalyJob job("job", limits, jobConfig, modelConfig, wrappedOutputStream); - std::string const oversizedValue(77000, 'x'); + std::string const oversizedValue(1000, 'x'); CTestAnomalyJob::TStrStrUMap dataRows{{"time", "1000"}, {"by_field", oversizedValue}, {"influencer_field", oversizedValue}}; BOOST_TEST_REQUIRE(job.handleRecord(dataRows)); BOOST_REQUIRE_EQUAL(uint64_t(1), job.numRecordsHandled()); + + // Advance past bucket boundary so results are output and state can be persisted. + CTestAnomalyJob::TStrStrUMap advanceRows{{"time", "5000"}, + {"by_field", oversizedValue}, + {"influencer_field", oversizedValue}}; + BOOST_TEST_REQUIRE(job.handleRecord(advanceRows)); + BOOST_REQUIRE_EQUAL(uint64_t(2), job.numRecordsHandled()); + + std::ostringstream* strm{nullptr}; + api::CSingleStreamDataAdder::TOStreamP ptr{strm = new std::ostringstream()}; + api::CSingleStreamDataAdder persister{ptr}; + BOOST_TEST_REQUIRE(job.persistStateInForeground(persister, "")); + std::string const persistedState{strm->str()}; + + // Full oversized value must not be in state (addRecord truncated before store). + BOOST_TEST_REQUIRE(persistedState.find(oversizedValue) == std::string::npos); + // Persisted state must contain the truncated form produced by input truncation. 
+ std::string const expectedTruncated = model::CFieldValueTruncator::truncated(oversizedValue); + BOOST_TEST_REQUIRE(persistedState.find(expectedTruncated) != std::string::npos); } BOOST_AUTO_TEST_CASE(testNormalFieldValuesNotTruncated) { @@ -1245,6 +1269,20 @@ BOOST_AUTO_TEST_CASE(testNormalFieldValuesNotTruncated) { BOOST_TEST_REQUIRE(job.handleRecord(dataRows)); BOOST_REQUIRE_EQUAL(uint64_t(1), job.numRecordsHandled()); + + // Advance past bucket boundary so results are output and state can be persisted. + CTestAnomalyJob::TStrStrUMap advanceRows{ + {"time", "5000"}, {"by_field", normalValue}, {"influencer_field", normalValue}}; + BOOST_TEST_REQUIRE(job.handleRecord(advanceRows)); + BOOST_REQUIRE_EQUAL(uint64_t(2), job.numRecordsHandled()); + + std::ostringstream* strm{nullptr}; + api::CSingleStreamDataAdder::TOStreamP ptr{strm = new std::ostringstream()}; + api::CSingleStreamDataAdder persister{ptr}; + BOOST_TEST_REQUIRE(job.persistStateInForeground(persister, "")); + std::string const persistedState{strm->str()}; + + BOOST_TEST_REQUIRE(persistedState.find(normalValue) != std::string::npos); } BOOST_AUTO_TEST_CASE(testDebugPrintRecordTruncatesLongValues) { @@ -1252,8 +1290,11 @@ BOOST_AUTO_TEST_CASE(testDebugPrintRecordTruncatesLongValues) { record["field1"] = std::string(1000, 'x'); record["field2"] = "short"; std::string result = api::CDataProcessor::debugPrintRecord(record); - BOOST_TEST_REQUIRE(result.find("...") != std::string::npos); - BOOST_TEST_REQUIRE(result.size() < 1500); + // truncated() produces prefix + '$' + 16 hex chars; full 1000-char value not present + BOOST_TEST_REQUIRE(result.find(std::string(1000, 'x')) == std::string::npos); + BOOST_TEST_REQUIRE(result.find(model::CFieldValueTruncator::HASH_SEPARATOR) != + std::string::npos); + BOOST_TEST_REQUIRE(result.size() < 500); } BOOST_AUTO_TEST_SUITE_END() diff --git a/lib/model/unittest/CEventRateDataGathererTest.cc b/lib/model/unittest/CEventRateDataGathererTest.cc index 70e6c55bea..2a231a6cb9 
100644 --- a/lib/model/unittest/CEventRateDataGathererTest.cc +++ b/lib/model/unittest/CEventRateDataGathererTest.cc @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -1948,6 +1949,10 @@ BOOST_FIXTURE_TEST_CASE(testRestoreTruncatesOversizedInfluencerValues, CTestFixt // The full 500-char string must no longer appear (it was truncated to 256). BOOST_TEST_REQUIRE(restoredJson.str().find(oversizedInfluencer) == std::string::npos); + // Restore-path truncation must produce the same format as CFieldValueTruncator::truncated. + std::string const expectedTruncated = + model::CFieldValueTruncator::truncated(oversizedInfluencer); + BOOST_TEST_REQUIRE(restoredJson.str().find(expectedTruncated) != std::string::npos); // Verify idempotency: restore again and persist — should be identical. std::istringstream restoredJsonStrm{"{\"topLevel\" : " + restoredJson.str() + "}"}; diff --git a/lib/model/unittest/CMetricDataGathererTest.cc b/lib/model/unittest/CMetricDataGathererTest.cc index 57bdc9838e..0413897891 100644 --- a/lib/model/unittest/CMetricDataGathererTest.cc +++ b/lib/model/unittest/CMetricDataGathererTest.cc @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -1921,6 +1922,10 @@ BOOST_FIXTURE_TEST_CASE(testRestoreTruncatesOversizedInfluencerSums, CTestFixtur // The full 500-char string must no longer appear (it was truncated to 256). BOOST_TEST_REQUIRE(restoredJson.str().find(oversizedInfluencer) == std::string::npos); + // Restore-path truncation must produce the same format as CFieldValueTruncator::truncated. + std::string const expectedTruncated = + model::CFieldValueTruncator::truncated(oversizedInfluencer); + BOOST_TEST_REQUIRE(restoredJson.str().find(expectedTruncated) != std::string::npos); // Verify idempotency: restore again and persist — should be identical. 
std::istringstream restoredJsonStrm{"{\"topLevel\" : " + restoredJson.str() + "}"}; From 691e8cffccebd9690999383871533779f2349d48 Mon Sep 17 00:00:00 2001 From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com> Date: Thu, 19 Mar 2026 14:43:46 +0100 Subject: [PATCH 10/11] formatting --- include/core/LogMacros.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/include/core/LogMacros.h b/include/core/LogMacros.h index abb96afa21..a66a6f5dda 100644 --- a/include/core/LogMacros.h +++ b/include/core/LogMacros.h @@ -86,20 +86,20 @@ #ifdef LOG_INFO_THROTTLED #undef LOG_INFO_THROTTLED #endif -#define LOG_INFO_THROTTLED(message) \ - do { \ - std::size_t countOfInfoMessages; \ - bool skipInfoMessage; \ - std::tie(countOfInfoMessages, skipInfoMessage) = \ - ml::core::CLogger::instance().throttler().skip(__FILE__, __LINE__); \ - if (skipInfoMessage == false) { \ - BOOST_LOG_STREAM_SEV(ml::core::CLogger::instance().logger(), \ - ml::core::CLogger::E_Info) \ - LOG_LOCATION_INFO \ - message << (countOfInfoMessages > 1 \ - ? " | repeated [" + std::to_string(countOfInfoMessages) + "]" \ - : ""); \ - } \ +#define LOG_INFO_THROTTLED(message) \ + do { \ + std::size_t countOfInfoMessages; \ + bool skipInfoMessage; \ + std::tie(countOfInfoMessages, skipInfoMessage) = \ + ml::core::CLogger::instance().throttler().skip(__FILE__, __LINE__); \ + if (skipInfoMessage == false) { \ + BOOST_LOG_STREAM_SEV(ml::core::CLogger::instance().logger(), \ + ml::core::CLogger::E_Info) \ + LOG_LOCATION_INFO \ + message << (countOfInfoMessages > 1 \ + ? 
" | repeated [" + std::to_string(countOfInfoMessages) + "]" \ + : ""); \ + } \ } while (0) #ifdef LOG_WARN #undef LOG_WARN From d905ec02a3e3ad54627ca94fbc156d9508b78e33 Mon Sep 17 00:00:00 2001 From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com> Date: Mon, 22 Jun 2026 09:46:57 +0200 Subject: [PATCH 11/11] Delete docs/CHANGELOG.asciidoc --- docs/CHANGELOG.asciidoc | 949 ---------------------------------------- 1 file changed, 949 deletions(-) delete mode 100644 docs/CHANGELOG.asciidoc diff --git a/docs/CHANGELOG.asciidoc b/docs/CHANGELOG.asciidoc deleted file mode 100644 index 5c2485db47..0000000000 --- a/docs/CHANGELOG.asciidoc +++ /dev/null @@ -1,949 +0,0 @@ -// Use these for links to issue and pulls. Note issues and pulls redirect one to -// each other on Github, so don't worry too much on using the right prefix. -//:issue: https://github.com/elastic/elasticsearch/issues/ -//:ml-issue: https://github.com/elastic/ml-cpp/issues/ -//:pull: https://github.com/elastic/elasticsearch/pull/ -//:ml-pull: https://github.com/elastic/ml-cpp/pull/ - -= Elasticsearch Release Notes - -// -// To add a release, copy and paste the following text, uncomment the relevant -// sections, and add a link to the new section in the list of releases at the -// top of the page. Note that release subheads must be floated and sections -// cannot be empty. -// TEMPLATE: - -// == {es} version n.n.n - -//=== Breaking Changes - -//=== Deprecations - -//=== New Features - -//=== Enhancements - -//=== Bug Fixes - -//=== Regressions - -== {es} version 9.4.0 - -=== Bug Fixes - -* Truncate oversized field values to prevent autodetect process crash. (See {ml-pull}2929[#2929], {es-pull}143180[#143180], issue: {ml-issue}2796[#2796].) -* Report RSS in bytes instead of pages. (See {ml-pull}2917[#2917].) - -=== Enhancements - -* Better handling of invalid JSON state documents (See {ml-pull}[]#2895].) 
-* Better error handling regarding quantiles state documents (See {ml-pull}[#2894]) - -== {es} version 9.3.0 - -=== Enhancements - -* Downgrade log severity for a batch of recoverable errors. (See {ml-pull}[#2889].) - -== {es} version 9.2.0 - -=== Enhancements - -* Update the PyTorch library to version 2.7.1. (See {ml-pull}2863[#2863].) -* Report the actual memory usage of the autodetect process. (See {ml-pull}2846[#2846]) -* Improve adherence to memory limits for the bucket gatherer. (See {ml-pull}2848[#2848].) - -== {es} version 9.1.0 - -=== Enhancements - -* Track memory used in the hierarchical results normalizer. (See {ml-pull}2831[#2831].) - -=== Bug Fixes - -== {es} version 9.0.0 - -=== Enhancements - -* Update Linux build images to Rocky Linux 8 with gcc 13.3. (See {ml-pull}2773[#2773].) - -== {es} version 8.19.0 - -=== Enhancements - -* Better messaging regarding OOM process termination. (See {ml-pull}2841[#2841].) - -== {es} version 8.18.0 - -=== Enhancements - -* Update the PyTorch library to version 2.5.1. (See {ml-pull}2783[#2798], {ml-pull}2799[#2799].) -* Upgrade Boost libraries to version 1.86. (See {ml-pull}2780[#2780], {ml-pull}2779[#2779].) - -== {es} version 8.17.7 - -=== Enhancements -* Restrict file system access for PyTorch models (See {ml-pull}2851[#2851].) - -== {es} version 8.16.6 - -=== Bug Fixes - -* Correct handling of config updates. (See {ml-pull}2821[#2821].) - -== {es} version 8.16.4 - -=== Bug Fixes - -* Increase the upper limits for the Boost.JSON SAX parser. (See {ml-pull}2809[#2809].) - -== {es} version 8.16.0 - -=== Enhancements - -* Allow the user to force a detector to shift time series state by a specific amount. - (See {ml-pull}2695[#2695].) - -=== Bug Fixes - -* Allow for pytorch_inference results to include zero-dimensional tensors. - -== {es} version 8.15.4 - -=== Bug Fixes - -* Fix parameter initialization for large forecasting models. (See {ml-pull}2759[#2759].) 
- -== {es} version 8.15.2 - -=== Enhancements - -* Update the Pytorch library to version 2.3.1. (See {ml-pull}2688[#2688].) - -=== Bug Fixes - -* Allow for pytorch_inference results to include zero-dimensional tensors. - -== {es} version 8.15.1 - -== {es} version 8.15.0 - -=== Enhancements - -* Log 'No statistics at.. ' message as a warning. (See {ml-pull}2684[#2684].) - -=== Bug Fixes - -* Fix "stack use after scope" memory error. (See {ml-pull}2673[#2673].) -* Handle any exception thrown by inference. (See {ml-pull}2680[#2680].) - -== {es} version 8.14.1 - -=== Bug Fixes - -* Handle any exception thrown by inference. (See {ml-pull}2680[#2680].) - -== {es} version 8.14.1 - -=== Enhancements - -* Improve memory allocation management for JSON processing to reduce memory usage. - (See {ml-pull}2679[#2679].) - -== {es} version 8.14.0 - -=== Bug Fixes - -* Remove ineffective optimizations for duplicate strings. (See {ml-pull}2652[#2652], issue: {ml-issue}2130[#2130].) -* Use custom Boost.JSON resource allocator. (See {ml-pull}2674[#2674].) - -== {es} version 8.13.0 - -=== Enhancements - -* Use Boost.JSON for JSON processing. (See {ml-pull}2614[#2614].) -* Upgrade Pytorch to version 2.1.2. (See {ml-pull}2588[#2588].) -* Upgrade zlib to version 1.2.13 on Windows. (See {ml-pull}2588[#2588].) -* Better handling of number of allocations in pytorch_inference in the case that - hardware_concurrency fails. We were previously forcing maximum number of allocations - to be one in this case, we now allow what is requested. (See {ml-pull}2607[#2607].) -* Upgrade MKL to version 2024.0 on Linux x86_64. (See {ml-pull}2619[#2619].) - -== {es} version 8.12.0 - -=== Enhancements - -* Upgrade Boost libraries to version 1.83. (See {ml-pull}2560[#2560].) - -=== Bug Fixes - -* Ensure the estimated latitude is within the allowed range (See {ml-pull}2586[#2586].) -* Remove dependency on the IPEX library (See {ml-pull}2605[#2605] and {ml-pull}2606[#2606].) 
- -== {es} version 8.11.2 - -=== Enhancements - -* Improve forecasting for time series with step changes. (See {ml-pull}2591[#2591], - issue: {ml-issue}2466[#2466]). - -== {es} version 8.11.0 - -=== Enhancements - -* Add support for PyTorch models quantized with Intel Extension for PyTorch. This feature is _only_ available on `linux_x86_64`. (See {ml-pull}2547[#2547]). - -== {es} version 8.10.3 - -=== Bug Fixes -* Fix for lost inference requests when writing to the cache times out leading to processing to stall on the Elasticsearch side. (See {ml-pull}2576[#2576].) - -== {es} version 8.9.0 - -=== Enhancements - -* Improved compliance with memory limitations. (See {ml-pull}2469[#2469].) -* Improve detection of time shifts, for example for day light saving. (See {ml-pull}2479[#2479].) -* Improve detection of calendar cyclic components with long bucket lengths. (See {ml-pull}2493[#2493].) - -=== Bug Fixes -* Prevent high memory usage by evaluating batch inference singularly. (See {ml-pull}2538[#2538].) -* Catch exceptions thrown during inference and report as errors. (See {ml-pull}2542[#2542].) - -== {es} version 8.8.0 - -=== Enhancements - -* Anomaly score explanation for rare detector. (See {ml-pull}2449[#2449].) - -== {es} version 8.7.0 - -=== Enhancements - -* Add identification of multimodal distribution to anomaly explanations. (See {ml-pull}2440[#2440].) -* Upgrade PyTorch to version 1.13.1. (See {ml-pull}2430[#2430].) -* Remove the PyTorch inference work queue as now handled in Elasticsearch - -== {es} version 8.6.0 - -=== Bug Fixes - -* Fix for 'No statistics' error message. (See {ml-pull}2410[#2410].) -* Fix for 'No counts available' error message. (See {ml-pull}2414[#2414].) -* Improve performance of closing files before spawning. (See {ml-pull}2424[#2424].) - -== {es} version 8.5.0 - -=== Enhancements - -* Compute outlier feature influence via the Gateaux derivative to improve attribution - for high dimension vectors. (See {ml-pull}2256[#2256].) 
-* Improve classification and regression model train runtimes for data sets with many - numeric features. (See {ml-pull}2380[#2380], {ml-pull}2388[#2388], {ml-pull}2390[#2390] - and {ml-pull}2401[#2401].) -* Increase the limit on the maximum number of classes to 100 for training classification - models. (See {ml-pull}2395[#2395] issue: {ml-issue}2246[#2246].) - -== {es} version 8.4.2 - -=== Bug Fixes - -* Do not retain categorization tokens when existing category matches. (See {ml-pull}2398[#2398].) - -== {es} version 8.4.0 - -=== Enhancements - -* Fairer application of size penalty for model selection for training classification - and regression models. (See {ml-pull}2291[#2291].) -* Accelerate training for data frame analytics by skipping fine parameter tuning if it - is unnecessary. (See {ml-pull}2298[#2298].) -* Address some causes of high runtimes training regression and classification models - on large data sets with many features. (See {ml-pull}2332[#2332].) -* Add caching for PyTorch inference. (See {ml-pull}2305[#2305].) -* Improve accuracy of anomaly detection median estimation. (See {ml-pull}2367[#2367], - issue: {ml-issue}2364[#2364].) - -=== Bug Fixes - -* Fix potential cause of classification and regression job failures. (See {ml-pull}2385[#2385].) - -== {es} version 8.3.0 - -=== Enhancements - -* Upgrade PyTorch to version 1.11. (See {ml-pull}2233[#2233], {ml-pull}2235[#2235] - and {ml-pull}2238[#2238].) -* Upgrade zlib to version 1.2.12 on Windows. (See {ml-pull}2253[#2253].) -* Upgrade libxml2 to version 2.9.14 on Linux and Windows. (See {ml-pull}2287[#2287].) -* Improve time series model stability and anomaly scoring consistency for data - for which many buckets are empty. (See {ml-pull}2267[#2267].) -* Address root cause for actuals equals typical equals zero anomalies. (See {ml-pull}2270[#2270].) -* Better handling of outliers in update immediately after detecting changes in time - series. (See {ml-pull}2280[#2280].) 
-* Improve normalization of anomaly detection results for short bucket lengths. This - corrects bias which could cause our scoring to be too low for these jobs. (See, - {ml-pull}2285[#2285], issue: {ml-issue}2276[#2276].) - -=== Bug Fixes - -* Correct logic for restart from failover fine tuning hyperparameters for training - classification and regression models. (See {ml-pull}2251[#2251].) -* Fix possible source of "x = NaN, distribution = class boost::math::normal_distribution<..." - log errors training classification and regression models. (See {ml-pull}2249[#2249].) -* Fix some bugs affecting decision to stop optimising hyperparameters for training - classification and regression models. (See {ml-pull}2259[#2259].) -* Fix cause of "Must provide points at which to evaluate function" log error training - classification and regression models. (See {ml-pull}2268[#2268].) -* Fix a source of "Discarding sample = nan, weights = ..." log errors for time series - anomaly detection. (See {ml-pull}2286[#2286].) - -== {es} version 8.2.2 - -=== Enhancements - -* Make ML native processes work with glibc 2.35 (required for Ubuntu 22.04). (See - {ml-pull}2272[#2272].) - -=== Bug Fixes - -* Adjacency weighting fixes in categorization. (See {ml-pull}2277[#2277].) - -== {es} version 8.2.1 - -=== Bug Fixes - -* Fix edge case which could cause the model bounds to blow up after detecting seasonality. - (See {ml-pull}2261[#2261].) - -== {es} version 8.2.0 - -=== Enhancements - -* Better handle small shifts of the seasonal patterns in time series data. - (See {ml-pull}2202[#2202].) -* Limit the maximum size of classification and regression models training - produces so they can always be deployed for inference inside the Elastic - Stack. (See {ml-pull}2205[#2205].) -* Support user defined example weights when training classification and - regression models. (See {ml-pull}2222[#2222].) -* Reduce worst case bucket processing time for anomaly detection. (See {ml-pull}2225[#2225].) 
-* Improve handling of low cardinality features for training classification - and regression models. (See {ml-pull}2229[#2229].) -* Improve handling of extremely large outliers in time series modelling. - (See {ml-pull}2230[#2230].) -* Improve detection and modeling of time series' calendar cyclic features. - (See {ml-pull}2236[#2236] and {ml-pull}2243[#2243].) -* Compress quantiles state. (See {ml-pull}2252[#2252].) - -=== Bug Fixes - -* Fix possible source of "Discarding sample = -nan(ind), weight = 1, variance scale = 1" - log errors training classification and regression models. (See {ml-pull}2226[#2226].) -* Fix error message for failure to create reverse search. (See {ml-pull}2247[#2247].) - -== {es} version 8.1.0 - -=== Enhancements - -* Improve skip_model_update rule behaviour (See {ml-pull}2096[#2096].) -* Upgrade Boost libraries to version 1.77. (See {ml-pull}2095[#2095].) -* Upgrade RapidJSON to 31st October 2021 version. (See {ml-pull}2106[#2106].) -* Upgrade Eigen library to version 3.4.0. (See {ml-pull}2137[#2137].) -* Prevent over-subscription of threads in pytorch_inference. (See {ml-pull}2141[#2141].) - -=== Bug Fixes - -* Fix a bug in the tuning of the hyperparameters when training regression - classification models. (See {ml-pull}2128[#2128].) -* Improve training stability for regression and classification models - (See {ml-pull}2144[#2144], {ml-pull}2147[#2147] and {ml-pull}2150[#2150].) -* Avoid edge cases in the classification weights calculation to maximize - minimum recall which could lead to only a single class being predicted. - (See {ml-pull}2194[#2194].) -* Address cause of "[CStatisticalTests.cc@102] Test statistic is nan" - log errors. (See {ml-pull}2196[#2196].) -* Address possible causes of "x = NaN, distribution = N5boost4math23students_t_distribution" - log errors. (See {ml-pull}2197[#2197].) -* Fix bug restoring data gatherer state for time of day and week anomaly detection - functions. 
This could lead to "No queue item for time " and "Time is out of range. - Returning earliest bucket index" log errors. (See {ml-pull}2213[#2213].) - -== {es} version 8.0.0-rc1 - -=== Bug Fixes - -* Set model state compatibility version to 8.0.0. (See {ml-pull}2139[#2139].) - -== {es} version 8.0.0-beta1 - -=== Enhancements - -* The Linux build platform for the {ml} C++ code is now CentOS 7 running gcc 10.3. (See - {ml-pull}2028[#2028].) - -== {es} version 8.0.0-alpha1 - -=== Enhancements - -* The Windows build platform for the {ml} C++ code now uses Visual Studio 2019. (See - {ml-pull}1352[#1352].) -* The macOS build platform for the {ml} C++ code is now Mojave running Xcode 11.3.1, - or Ubuntu 20.04 running clang 8 for cross compilation. (See {ml-pull}1429[#1429].) -* The Linux build platform for the {ml} C++ code is now CentOS 7 running gcc 9.3. (See - {ml-pull}1170[#1170].) -* Added a new application for evaluating PyTorch models. The app depends on LibTorch - - the C++ front end to PyTorch - and performs inference on models stored in the - TorchScript format. (See {ml-pull}1902[#1902].) - - -== {es} version 7.17.0 - -=== Bug Fixes - -* Avoid transient poor time series modelling after detecting new seasonal components. - This can affect cases where we have fast and slow repeats in the data, for example - 30 mins and 1 day, and the job uses a short bucket length. The outcome can be transient - poor predictions and model bounds, and sometimes false positive anomalies. (See - {ml-pull}2167[#2167].) - -== {es} version 7.16.0 - -=== Enhancements - -* Speed up training of regression and classification models. (See {ml-pull}2024[#2024].) -* Improve concurrency for training regression and classification models. (See - {ml-pull}2031[#2031].) -* Improve aspects of implementation of skip_model_update rule (See {ml-pull}2053[#2053].) -* Make sure instrumentation captures the best hyperparameters we found for training - classification and regression models. 
(See {ml-pull}2057[#2057].) - -=== Bug Fixes - -* Correct ANOVA for Gaussian Process we fit to the loss surface. This affects early stopping. - Previously, we would always stop early whether it was appropriate or not. It also improves - the estimates of hyperparameter importances. (See {ml-pull}2073[#2073].) -* Fix numerical instability in hyperparameter optimisation for training regression and - classification models. (See {ml-pull}2078[#2078].) -* Fix numerical stability issues in time series modelling. (See {ml-pull}2083[#2083].) - -== {es} version 7.15.2 - -=== Bug Fixes - -* Fix cancellation of named pipe connection on Linux if the remote end does not connect - within the configured timeout period. (See {ml-pull}2102[#2102].) - -== {es} version 7.15.0 - -=== Enhancements - -* Speed up training of regression and classification models on very large data sets. - (See {ml-pull}1941[#1941].) -* Improve regression and classification training accuracy for small data sets. - (See {ml-pull}1960[#1960].) -* Prune models for split fields (by, partition) that haven't seen data updates for - a given period of time. (See {ml-pull}1962[#1962].) - -=== Bug Fixes - -* Fix potential "process stopped unexpectedly: Fatal error" for training regression - and classification models. (See {ml-pull}1997[#1997], issue {ml-pull}1956[#1956].) - -== {es} version 7.14.0 - -=== Enhancements - -* Give higher weight to multiple adjacent dictionary words when performing categorization. (See - {ml-pull}1903[#1903].) - -=== Bug Fixes - -* Make atomic operations safer for aarch64. (See {ml-pull}1893[#1893].) -* Ensure bucket `event_count` is calculated for jobs with 1 second bucket spans. -(See {ml-pull}1908[#1908].) - -== {es} version 7.13.0 - -=== Enhancements - -* Speed up training of regression and classification models for data sets - with many features. (See {ml-pull}1746[#1746].)
-* Avoid overfitting in final training by scaling regularizers to account for the - difference in the number of training examples. This results in a better match - between train and test error for classification and regression and often slightly - improved test errors. (See {ml-pull}1755[#1755].) -* Adjust the syscall filter to allow mremap and avoid spurious audit logging. - (See {ml-pull}1819[#1819].) - -=== Bug Fixes - -* Ensure the same hyperparameters are chosen if classification or regression training - is stopped and restarted, for example, if the node fails. (See {ml-pull}1848[#1848].) -* Fail gracefully if insufficient data are supplied for classification or regression - training. (See {ml-pull}1855[#1855].) -* Fail gracefully on encountering unexpected state in restore from snapshot for anomaly - detection. (See {ml-pull}1872[#1872].) -* Use appropriate memory ordering flags for aarch64 with string store to avoid excessive - string duplication. (See {ml-pull}1888[#1888].) - -== {es} version 7.12.2 - -=== Bug Fixes - -* Add missing hyperparameter to the model metadata. (See {ml-pull}1867[#1867].) - -== {es} version 7.12.1 - -=== Enhancements - -* Make ML native processes work with glibc 2.33 on x86_64. (See {ml-pull}1828[#1828].) - -== {es} version 7.12.0 - -=== Enhancements - -* Fix edge case which could cause spurious anomalies early in the learning process - if the time series has non-diurnal seasonality. (See {ml-pull}1634[#1634].) -* Compute importance of hyperparameters optimized in the fine parameter tuning step. - (See {ml-pull}1627[#1627].) -* Early stopping for the fine parameter tuning step of classification and regression - model training. (See {ml-pull}1676[#1676].) -* Correct upgrade for pre-6.3 state for lat_long anomaly detectors. (See - {ml-pull}1681[#1681].) -* Per tree feature bag to speed up training of regression and classification models - and improve scalability for large numbers of features. (See {ml-pull}1733[#1733].)
- -=== Bug Fixes - -* Fix a source of instability in time series modeling for anomaly detection. This has - been observed to cause spurious anomalies for a partition which no longer receives - any data. (See {ml-pull}1675[#1675].) -* Ensure that we stop modeling seasonality for data which flatlines. This is important - for count and sum detectors which treat empty buckets as zero. We could see spurious - anomalies in realtime detection after a partition no longer received any data - as a result. (See {ml-pull}1654[#1654].) - -== {es} version 7.11.0 - -=== Enhancements - -* During regression and classification training prefer smaller models if performance is - similar (See {ml-pull}1516[#1516].) -* Add a response mechanism for commands sent to the native controller. (See - {ml-pull}1520[#1520], {es-pull}63542[#63542], issue: {es-issue}62823[#62823].) -* Speed up anomaly detection for seasonal data. This is particularly effective for jobs - using longer bucket lengths. (See {ml-pull}1549[#1549].) -* Fix an edge case which could cause typical and model plot bounds to blow up to around - max double. (See {ml-pull}1551[#1551].) -* Estimate upper bound of potential gains before splitting a decision tree node to avoid - unnecessary computation. (See {ml-pull}1537[#1537].) -* Improvements to time series modeling particularly in relation to adaption to change. - (See {ml-pull}1614[#1614].) -* Warn and error log throttling. (See {ml-pull}1615[#1615].) -* Soften the effect of fluctuations in anomaly detection job memory usage on node - assignment and add `assignment_memory_basis` to `model_size_stats`. - (See {ml-pull}1623[#1623], {es-pull}65561[#65561], issue: {es-issue}63163[#63163].) - -=== Bug Fixes - -* Fix potential cause for log errors from CXMeansOnline1d. (See {ml-pull}1586[#1586].) -* Fix scaling of some hyperparameter for Bayesian optimization. (See {ml-pull}1612[#1612].) -* Fix missing state in persist and restore for anomaly detection.
This caused suboptimal - modelling after a job was closed and reopened or failed over to a different node. - (See {ml-pull}1668[#1668].) - -== {es} version 7.10.1 - -=== Bug Fixes - -* Fix a bug where the peak_model_bytes value of the model_size_stats object was not - restored from the anomaly detector job snapshots. (See {ml-pull}1572[#1572].) - -== {es} version 7.10.0 - -=== Enhancements - -* Calculate total feature importance to store with model metadata. (See {ml-pull}1387[#1387].) -* Change outlier detection feature_influence format to array with nested objects. (See {ml-pull}1475[#1475], {es-pull}62068[#62068].) -* Add timeouts to named pipe connections. (See {ml-pull}1514[#1514], {es-pull}62993[#62993], issue: {ml-issue}1504[#1504].) - -=== Bug Fixes - -* Fix progress on resume after final training has completed for classification and regression. - We previously showed progress stuck at zero for final training. (See {ml-pull}1443[#1443].) -* Avoid potential "Failed to compute quantile" and "No values added to quantile sketch" log errors - training regression and classification models if there are features with mostly missing values. - (See {ml-pull}1500[#1500].) -* Correct the anomaly detection job model state `min_version`. (See {ml-pull}1546[#1546].) - -== {es} version 7.9.2 - -=== Bug Fixes - -* Fix reporting of peak memory usage in memory stats for data frame analytics. (See {ml-pull}1468[#1468].) -* Fix reporting of peak memory usage in model size stats for anomaly detection. (See {ml-pull}1484[#1484].) - -== {es} version 7.9.0 - -=== New Features - -* Report significant changes to anomaly detection models in annotations of the results. - (See {ml-pull}1247[#1247], {pull}56342[#56342], {pull}56417[#56417], {pull}57144[#57144], {pull}57278[#57278], {pull}57539[#57539].) - -=== Enhancements - -* Add support for larger forecasts in memory via max_model_memory setting. - (See {ml-pull}1238[#1238] and {pull}57254[#57254].) 
-* Don't lose precision when saving model state. (See {ml-pull}1274[#1274].) -* Parallelize the feature importance calculation for classification and regression - over trees. (See {ml-pull}1277[#1277].) -* Add an option to do categorization independently for each partition. - (See {ml-pull}1293[#1293], {ml-pull}1318[#1318], {ml-pull}1356[#1356] and {pull}57683[#57683].) -* Memory usage is reported during job initialization. (See {ml-pull}1294[#1294].) -* More realistic memory estimation for classification and regression means that these - analyses will require lower memory limits than before (See {ml-pull}1298[#1298].) -* Checkpoint state to allow efficient failover during coarse parameter search - for classification and regression. (See {ml-pull}1300[#1300].) -* Improve data access patterns to speed up classification and regression. - (See {ml-pull}1312[#1312].) -* Performance improvements for classification and regression, particularly running - multithreaded. (See {ml-pull}1317[#1317].) -* Improve runtime and memory usage training deep trees for classification and - regression. (See {ml-pull}1340[#1340].) -* Improvement in handling large inference model definitions. (See {ml-pull}1349[#1349].) -* Add a peak_model_bytes field to model_size_stats. (See {ml-pull}1389[#1389].) - -=== Bug Fixes - -* Fix numerical issues leading to blow up of the model plot bounds. (See {ml-pull}1268[#1268].) -* Fix causes for inverted forecast confidence interval bounds. (See {ml-pull}1369[#1369], - issue: {ml-issue}1357[#1357].) -* Restrict growth of max matching string length for categories. (See {ml-pull}1406[#1406].) - -== {es} version 7.8.1 - -=== Bug Fixes - -* Better interrupt handling during named pipe connection. (See {ml-pull}1311[#1311].) -* Trap potential cause of SIGFPE. (See {ml-pull}1351[#1351], issue: {ml-issue}1348[#1348].) -* Correct inference model definition for MSLE regression models. (See {ml-pull}1375[#1375].) 
-* Fix cause of SIGSEGV of classification and regression. (See {ml-pull}1379[#1379].) -* Fix restoration of change detectors after seasonality change. (See {ml-pull}1391[#1391].) -* Fix potential SIGSEGV when forecasting. (See {ml-pull}1402[#1402], issue: {ml-issue}1401[#1401].) - -== {es} version 7.8.0 - -=== Enhancements - -* Speed up anomaly detection for the lat_long function. (See {ml-pull}1102[#1102].) -* Reduce CPU scheduling priority of native analysis processes to favor the ES JVM - when CPU is constrained. This change is only implemented for Linux and macOS, not - for Windows. (See {ml-pull}1109[#1109].) -* Take `training_percent` into account when estimating memory usage for classification and regression. - (See {ml-pull}1111[#1111].) -* Support maximize minimum recall when assigning class labels for multiclass classification. - (See {ml-pull}1113[#1113].) -* Improve robustness of anomaly detection to bad input data. (See {ml-pull}1114[#1114].) -* Adds new `num_matches` and `preferred_to_categories` fields to category output. - (See {ml-pull}1062[#1062]) -* Adds mean squared logarithmic error (MSLE) for regression. (See {ml-pull}1101[#1101].) -* Adds pseudo-Huber loss for regression. (See {ml-pull}1168[#1168].) -* Reduce peak memory usage and memory estimates for classification and regression. - (See {ml-pull}1125[#1125].) -* Reduce variability of classification and regression results across our target operating systems. - (See {ml-pull}1127[#1127].) -* Switched data frame analytics model memory estimates from kilobytes to megabytes. - (See {ml-pull}1126[#1126], issue: {issue}54506[#54506].) -* Added a {ml} native code build for Linux on AArch64. (See {ml-pull}1132[#1132] and - {ml-pull}1135[#1135].) -* Improve data frame analysis runtime by optimising memory alignment for intrinsic - operations. (See {ml-pull}1142[#1142].) -* Fix spurious anomalies for count and sum functions after no data are received for long - periods of time. 
(See {ml-pull}1158[#1158].) -* Improve false positive rates from periodicity test for time series anomaly detection. - (See {ml-pull}1177[#1177].) -* Break progress reporting of data frame analyses into multiple phases. (See {ml-pull}1179[#1179].) -* Really centre the data before training for classification and regression begins. This - means we can choose more optimal smoothing bias and should reduce the number of trees. - (See {ml-pull}1192[#1192].) - -=== Bug Fixes - -* Trap and fail if insufficient features are supplied to data frame analyses. This - caused classification and regression getting stuck at zero progress analyzing. - (See {ml-pull}1160[#1160], issue: {issue}55593[#55593].) -* Make categorization respect the `model_memory_limit`. (See {ml-pull}1167[#1167], - issue: {ml-issue}1130[#1130].) -* Respect user overrides for `max_trees` for classification and regression. (See - {ml-pull}1185[#1185].) -* Reset memory status from `soft_limit` to `ok` when pruning is no longer required. - (See {ml-pull}1193[#1193], issue: {ml-issue}1131[#1131].) -* Fix restore from training state for classification and regression. (See - {ml-pull}1197[#1197].) -* Improve the initialization of seasonal components for anomaly detection. (See - {ml-pull}1201[#1201], issue: {ml-issue}1178[#1178].) - -== {es} version 7.7.1 - -=== Bug Fixes - -* Fixed background persistence of categorizer state (See {ml-pull}1137[#1137], - issue: {ml-issue}1136[#1136].) -* Fix classification job failures when number of classes in configuration differs - from the number of classes present in the training data. (See {ml-pull}1144[#1144].) -* Fix underlying cause for "Failed to calculate splitting significance" log errors. - (See {ml-pull}1157[#1157].) -* Fix possible root cause for "Bad variance scale nan" log errors. (See {ml-pull}1225[#1225].) -* Change data frame analytics instrumentation timestamp resolution to milliseconds. (See - {ml-pull}1237[#1237].) 
-* Fix "autodetect process stopped unexpectedly: Fatal error: 'terminate called after - throwing an instance of 'std::bad_function_call'". (See {ml-pull}1246[#1246], - issue: {ml-issue}1245[#1245].) - -== {es} version 7.7.0 - -=== New Features - -* Add instrumentation to report statistics related to data frame analytics jobs, i.e. -progress, memory usage, etc. (See {ml-pull}906[#906].) -* Multiclass classification. (See {ml-pull}1037[#1037].) - -=== Enhancements - -* Improve computational performance of the feature importance computation. (See {ml-pull}1005[#1005].) -* Improve initialization of learn rate for better and more stable results in regression -and classification. (See {ml-pull}948[#948].) -* Add number of processed training samples to the definition of decision tree nodes. -(See {ml-pull}991[#991].) -* Add new model_size_stats fields to instrument categorization. (See {ml-pull}948[#948] -and {pull}51879[#51879], issue: {issue}50794[#50749].) -* Improve upfront memory estimation for all data frame analyses, which were higher than -necessary. This will improve the allocation of data frame analyses to cluster nodes. -(See {ml-pull}1003[#1003].) -* Upgrade the compiler used on Linux from gcc 7.3 to gcc 7.5, and the binutils used in -the build from version 2.20 to 2.34. (See {ml-pull}1013[#1013].) -* Add instrumentation of the peak memory consumption for data frame analytics jobs. -(See {ml-pull}1022[#1022].) -* Remove all memory overheads for computing tree SHAP values. (See {ml-pull}1023[#1023].) -* Distinguish between empty and missing categorical fields in classification and regression -model training. (See {ml-pull}1034[#1034].) -* Add instrumentation information for supervised learning data frame analytics jobs. -(See {ml-pull}1031[#1031].) -* Add instrumentation information for outlier detection data frame analytics jobs. -* Write out feature importance for multi-class models.
(See {ml-pull}1071[#1071]) -* Enable system call filtering to the native process used with data frame analytics. -(See {ml-pull}1098[#1098]) - -=== Bug Fixes - -* Use largest ordered subset of categorization tokens for category reverse search regex. -(See {ml-pull}970[#970], issue: {ml-issue}949[#949].) -* Account for the data frame's memory when estimating the peak memory used by classification -and regression model training. (See {ml-pull}996[#996].) -* Rename classification and regression parameter maximum_number_trees to max_trees. -(See {ml-pull}1047[#1047].) - -== {es} version 7.6.2 - -=== Bug Fixes - -* Fix a bug in the calculation of the minimum loss leaf values for classification. -(See {ml-pull}1032[#1032].) - -== {es} version 7.6.0 - -=== New Features - -* Add feature importance values to classification and regression results (using tree -SHapley Additive exPlanation, or SHAP). (See {ml-pull}857[#857].) - -=== Enhancements - -* Improve performance of boosted tree training for both classification and regression. -(See {ml-pull}775[#775].) -* Reduce the peak memory used by boosted tree training and fix an overcounting bug -estimating maximum memory usage. (See {ml-pull}781[#781].) -* Stratified fractional cross validation for regression. (See {ml-pull}784[#784].) -* Added `geo_point` supported output for `lat_long` function records. (See {ml-pull}809[#809] -and {pull}47050[#47050].) -* Use a random bag of the data to compute the loss function derivatives for each new -tree which is trained for both regression and classification. (See {ml-pull}811[#811].) -* Emit `prediction_probability` field alongside prediction field in ml results. -(See {ml-pull}818[#818].) -* Reduce memory usage of {ml} native processes on Windows. (See {ml-pull}844[#844].) -* Reduce runtime of classification and regression. (See {ml-pull}863[#863].) -* Stop early training a classification and regression forest when the validation error -is no longer decreasing. 
(See {ml-pull}875[#875].) -* Emit `prediction_field_name` in ml results using the type provided as -`prediction_field_type` parameter. (See {ml-pull}877[#877].) -* Improve performance updating quantile estimates. (See {ml-pull}881[#881].) -* Migrate to use Bayesian Optimisation for initial hyperparameter value line searches and -stop early if the expected improvement is too small. (See {ml-pull}903[#903].) -* Stop cross-validation early if the predicted test loss has a small chance of being -smaller than for the best parameter values found so far. (See {ml-pull}915[#915].) -* Optimize decision threshold for classification to maximize minimum class recall. -(See {ml-pull}926[#926].) -* Include categorization memory usage in the `model_bytes` field in `model_size_stats`, -so that it is taken into account in node assignment decisions. (See {ml-pull}927[#927], -issue: {ml-issue}724[#724].) - -=== Bug Fixes -* Fixes potential memory corruption when determining seasonality. (See {ml-pull}852[#852].) -* Prevent prediction_field_name clashing with other fields in ml results. -(See {ml-pull}861[#861].) -* Include out-of-order as well as in-order terms in categorization reverse searches. -(See {ml-pull}950[#950], issue: {ml-issue}949[#949].) - -== {es} version 7.5.2 - -=== Bug Fixes -* Fixes potential memory corruption or inconsistent state when background persisting -categorizer state. (See {ml-pull}921[#921].) - -== {es} version 7.5.0 - -=== Enhancements - -* Improve performance and concurrency training boosted tree regression models. -For large data sets this change was observed to give a 10% to 20% decrease in -train time. (See {ml-pull}622[#622].) -* Upgrade Boost libraries to version 1.71. (See {ml-pull}638[#638].) -* Improve initialisation of boosted tree training. This generally enables us to -find lower loss models faster. (See {ml-pull}686[#686].) -* Include a smooth tree depth based penalty to regularized objective function for -boosted tree training. 
Hard depth based regularization is often the strategy of -choice to prevent over fitting for XGBoost. By smoothing we can make better tradeoffs. -Also, the parameters of the penalty function are more suited to optimising with our -Bayesian optimisation based hyperparameter search. (See {ml-pull}698[#698].) -* Binomial logistic regression targeting cross entropy. (See {ml-pull}713[#713].) -* Improvements to count and sum anomaly detection for sparse data. This primarily -aims to improve handling of data which are predictably present: detecting when they -are unexpectedly missing. (See {ml-pull}721[#721].) -* Trap numeric errors causing bad hyperparameter search initialisation and repeated -errors to be logged during boosted tree training. (See {ml-pull}732[#732].) - -=== Bug Fixes - -* Restore from checkpoint could damage seasonality modeling. For example, it could -cause seasonal components to be overwritten in error. (See {ml-pull}821[#821].) - -== {es} version 7.4.1 - -=== Enhancements - -* The {ml} native processes are now arranged in a .app directory structure on - macOS, to allow for notarization on macOS Catalina. (See {ml-pull}593[#593].) - -=== Bug Fixes - -* A reference to a temporary variable was causing forecast model restoration to fail. -The bug exhibited itself on MacOS builds with versions of clangd > 10.0.0. (See {ml-pull}688[#688].) - -== {es} version 7.4.0 - -=== Bug Fixes - -* Rename outlier detection method values knn and tnn to distance_kth_nn and distance_knn -respectively to match the API. (See {ml-pull}598[#598].) -* Fix occasional (non-deterministic) reinitialisation of modelling for the lat_long -function. (See {ml-pull}641[#641].) - -== {es} version 7.3.1 - -=== Bug Fixes - -* Only trap the case that more rows are supplied to outlier detection than expected. -Previously, if rows were excluded from the data frame after supplying the row count -in the configuration then we detected the inconsistency and failed outlier detection.
-However, this legitimately happens in case where the field values are non-numeric or -array valued. (See {ml-pull}569[#569].) - -== {es} version 7.3.0 - -=== Enhancements - -* Upgrade to a newer version of the Apache Portable Runtime library. (See {ml-pull}495[#495].) -* Improve stability of modelling around change points. (See {ml-pull}496[#496].) - -=== Bug Fixes - -* Reduce false positives associated with the multi-bucket feature. (See {ml-pull}491[#491].) -* Reduce false positives for sum and count functions on sparse data. (See {ml-pull}492[#492].) - -== {es} version 7.2.1 - -=== Bug Fixes - -* Fix an edge case causing spurious anomalies (false positives) if the variance in the count of events -changed significantly throughout the period of a seasonal quantity. (See {ml-pull}489[#489].) - -== {es} version 7.2.0 - -=== Enhancements - -* Remove hard limit for maximum forecast interval and limit based on the time interval of data added -to the model. (See {ml-pull}214[#214].) - -* Use hardened compiler options to build 3rd party libraries. (See {ml-pull}453[#453].) - -* Only select more complex trend models for forecasting if there is evidence that they are needed. -(See {ml-pull}463[#463].) - -* Improve residual model selection. (See {ml-pull}468[#468].) - -* Stop linking to libcrypt on Linux. (See {ml-pull}480[#480].) - -* Improvements to hard_limit audit message. (See {ml-pull}486[#486].) - -=== Bug Fixes - -* Handle NaNs when detrending seasonal components. {ml-pull}408[#408] - -== {es} version 7.0.0-alpha2 - -=== Bug Fixes - -* Fixes CPoissonMeanConjugate sampling error. {ml-pull}335[#335] -//NOTE: Remove from final 7.0.0 release notes if already in 6.x - -* Ensure statics are persisted in a consistent manner {ml-pull}360[#360] - -== {es} version 7.0.0-alpha1 - -== {es} version 6.8.4 - -=== Bug Fixes - -* A reference to a temporary variable was causing forecast model restoration to fail. 
-The bug exhibited itself on MacOS builds with versions of clangd > 10.0.0. (See {ml-pull}688[#688].) - -== {es} version 6.8.2 - -=== Bug Fixes - -* Don't write model size stats when job is closed without any input {ml-pull}512[#512] (issue: {ml-issue}394[#394]) -* Don't persist model state at the end of lookback if the lookback did not generate any input {ml-pull}521[#521] (issue: {ml-issue}519[#519]) - -== {es} version 6.7.2 - -=== Enhancements - -* Adjust seccomp filter to allow the "time" system call {ml-pull}459[#459] - -== {es} version 6.7.0 - -=== Bug Fixes - -* Improve autodetect logic for persistence. {ml-pull}437[#437] - -== {es} version 6.6.2 - -=== Enhancements - -* Adjust seccomp filter for Fedora 29. {ml-pull}354[#354] - -=== Bug Fixes - -* Fixes an issue where interim results would be calculated after advancing time into an empty bucket. {ml-pull}416[#416]