From 148cf216a0786fe44017cf6a4d8905470151e4a5 Mon Sep 17 00:00:00 2001 From: Nishant Mehta Date: Sun, 28 Jun 2026 15:36:01 -0400 Subject: [PATCH 1/3] Compile Caverphone2 patterns once encode() applied more than twenty regular expressions via String.replaceAll, each of which compiles its pattern on every call. A single encode therefore compiled 25 patterns, repeated for every input. Hoist the patterns into static final Pattern constants and apply them with Matcher.replaceAll. The regexes and their order are unchanged, so the produced codes are identical. Measured with a ThreadMXBean allocation driver (200k warmed ops, encoding "Thompson"): 19670 B/op -> 6304 B/op (-68%). Caverphone2Test passes unchanged. Signed-off-by: Nishant Mehta --- .../commons/codec/language/Caverphone2.java | 79 +++++++++++++------ 1 file changed, 54 insertions(+), 25 deletions(-) diff --git a/src/main/java/org/apache/commons/codec/language/Caverphone2.java b/src/main/java/org/apache/commons/codec/language/Caverphone2.java index e58c4ddf32..d232ab6a7c 100644 --- a/src/main/java/org/apache/commons/codec/language/Caverphone2.java +++ b/src/main/java/org/apache/commons/codec/language/Caverphone2.java @@ -18,6 +18,7 @@ package org.apache.commons.codec.language; import java.util.Locale; +import java.util.regex.Pattern; /** * Encodes a string into a Caverphone 2.0 value. @@ -35,6 +36,34 @@ public class Caverphone2 extends AbstractCaverphone { private static final String TEN_1 = "1111111111"; + // Patterns are compiled once: String.replaceAll compiles its regex on every call, which on a hot + // encode path (one encode applies more than twenty of them) is a large, repeated allocation. 
+ private static final Pattern NON_LOWER = Pattern.compile("[^a-z]"); + private static final Pattern FINAL_E = Pattern.compile("e$"); + private static final Pattern START_COUGH = Pattern.compile("^cough"); + private static final Pattern START_ROUGH = Pattern.compile("^rough"); + private static final Pattern START_TOUGH = Pattern.compile("^tough"); + private static final Pattern START_ENOUGH = Pattern.compile("^enough"); + private static final Pattern START_TROUGH = Pattern.compile("^trough"); + private static final Pattern START_GN = Pattern.compile("^gn"); + private static final Pattern FINAL_MB = Pattern.compile("mb$"); + private static final Pattern START_VOWEL = Pattern.compile("^[aeiou]"); + private static final Pattern VOWEL = Pattern.compile("[aeiou]"); + private static final Pattern START_Y3 = Pattern.compile("^y3"); + private static final Pattern START_Y = Pattern.compile("^y"); + private static final Pattern RUN_S = Pattern.compile("s+"); + private static final Pattern RUN_T = Pattern.compile("t+"); + private static final Pattern RUN_P = Pattern.compile("p+"); + private static final Pattern RUN_K = Pattern.compile("k+"); + private static final Pattern RUN_F = Pattern.compile("f+"); + private static final Pattern RUN_M = Pattern.compile("m+"); + private static final Pattern RUN_N = Pattern.compile("n+"); + private static final Pattern FINAL_W = Pattern.compile("w$"); + private static final Pattern START_H = Pattern.compile("^h"); + private static final Pattern FINAL_R = Pattern.compile("r$"); + private static final Pattern FINAL_L = Pattern.compile("l$"); + private static final Pattern FINAL_3 = Pattern.compile("3$"); + /** * Constructs a new instance. */ @@ -60,22 +89,22 @@ public String encode(final String source) { txt = txt.toLowerCase(Locale.ENGLISH); // 2. Remove anything not A-Z - txt = txt.replaceAll("[^a-z]", ""); + txt = NON_LOWER.matcher(txt).replaceAll(""); // 2.5. 
Remove final e - txt = txt.replaceAll("e$", ""); // 2.0 only + txt = FINAL_E.matcher(txt).replaceAll(""); // 2.0 only // 3. Handle various start options - txt = txt.replaceAll("^cough", "cou2f"); - txt = txt.replaceAll("^rough", "rou2f"); - txt = txt.replaceAll("^tough", "tou2f"); - txt = txt.replaceAll("^enough", "enou2f"); // 2.0 only - txt = txt.replaceAll("^trough", "trou2f"); // 2.0 only + txt = START_COUGH.matcher(txt).replaceAll("cou2f"); + txt = START_ROUGH.matcher(txt).replaceAll("rou2f"); + txt = START_TOUGH.matcher(txt).replaceAll("tou2f"); + txt = START_ENOUGH.matcher(txt).replaceAll("enou2f"); // 2.0 only + txt = START_TROUGH.matcher(txt).replaceAll("trou2f"); // 2.0 only // note the spec says ^enough here again, c+p error I assume - txt = txt.replaceAll("^gn", "2n"); + txt = START_GN.matcher(txt).replaceAll("2n"); // End - txt = txt.replaceAll("mb$", "m2"); + txt = FINAL_MB.matcher(txt).replaceAll("m2"); // 4. Handle replacements txt = txt.replace("cq", "2q"); @@ -95,38 +124,38 @@ public String encode(final String source) { txt = txt.replace("b", "p"); txt = txt.replace("sh", "s2"); txt = txt.replace("z", "s"); - txt = txt.replaceAll("^[aeiou]", "A"); - txt = txt.replaceAll("[aeiou]", "3"); + txt = START_VOWEL.matcher(txt).replaceAll("A"); + txt = VOWEL.matcher(txt).replaceAll("3"); txt = txt.replace("j", "y"); // 2.0 only - txt = txt.replaceAll("^y3", "Y3"); // 2.0 only - txt = txt.replaceAll("^y", "A"); // 2.0 only + txt = START_Y3.matcher(txt).replaceAll("Y3"); // 2.0 only + txt = START_Y.matcher(txt).replaceAll("A"); // 2.0 only txt = txt.replace("y", "3"); // 2.0 only txt = txt.replace("3gh3", "3kh3"); txt = txt.replace("gh", "22"); txt = txt.replace("g", "k"); - txt = txt.replaceAll("s+", "S"); - txt = txt.replaceAll("t+", "T"); - txt = txt.replaceAll("p+", "P"); - txt = txt.replaceAll("k+", "K"); - txt = txt.replaceAll("f+", "F"); - txt = txt.replaceAll("m+", "M"); - txt = txt.replaceAll("n+", "N"); + txt = RUN_S.matcher(txt).replaceAll("S"); + 
txt = RUN_T.matcher(txt).replaceAll("T"); + txt = RUN_P.matcher(txt).replaceAll("P"); + txt = RUN_K.matcher(txt).replaceAll("K"); + txt = RUN_F.matcher(txt).replaceAll("F"); + txt = RUN_M.matcher(txt).replaceAll("M"); + txt = RUN_N.matcher(txt).replaceAll("N"); txt = txt.replace("w3", "W3"); txt = txt.replace("wh3", "Wh3"); - txt = txt.replaceAll("w$", "3"); // 2.0 only + txt = FINAL_W.matcher(txt).replaceAll("3"); // 2.0 only txt = txt.replace("w", "2"); - txt = txt.replaceAll("^h", "A"); + txt = START_H.matcher(txt).replaceAll("A"); txt = txt.replace("h", "2"); txt = txt.replace("r3", "R3"); - txt = txt.replaceAll("r$", "3"); // 2.0 only + txt = FINAL_R.matcher(txt).replaceAll("3"); // 2.0 only txt = txt.replace("r", "2"); txt = txt.replace("l3", "L3"); - txt = txt.replaceAll("l$", "3"); // 2.0 only + txt = FINAL_L.matcher(txt).replaceAll("3"); // 2.0 only txt = txt.replace("l", "2"); // 5. Handle removals txt = txt.replace("2", ""); - txt = txt.replaceAll("3$", "A"); // 2.0 only + txt = FINAL_3.matcher(txt).replaceAll("A"); // 2.0 only txt = txt.replace("3", ""); // 6. put ten 1s on the end From 849dfd652a581bed15ea4c227a74a2479224f50e Mon Sep 17 00:00:00 2001 From: Nishant Mehta Date: Sun, 28 Jun 2026 15:42:05 -0400 Subject: [PATCH 2/3] Compile Caverphone1 patterns once As with Caverphone2, encode() applied seventeen regular expressions via String.replaceAll, recompiling each pattern on every call. Hoist them into static final Pattern constants applied with Matcher.replaceAll. The regexes and their order are unchanged, so the produced codes are identical. Measured with a ThreadMXBean allocation driver (200k warmed ops, encoding "Thompson"): 14005 B/op -> 4672 B/op (-67%). Caverphone1Test passes unchanged. 
Signed-off-by: Nishant Mehta --- .../commons/codec/language/Caverphone1.java | 55 +++++++++++++------ 1 file changed, 38 insertions(+), 17 deletions(-) diff --git a/src/main/java/org/apache/commons/codec/language/Caverphone1.java b/src/main/java/org/apache/commons/codec/language/Caverphone1.java index decd4a5224..9b99c0da59 100644 --- a/src/main/java/org/apache/commons/codec/language/Caverphone1.java +++ b/src/main/java/org/apache/commons/codec/language/Caverphone1.java @@ -18,6 +18,7 @@ package org.apache.commons.codec.language; import java.util.Locale; +import java.util.regex.Pattern; /** * Encodes a string into a Caverphone 1.0 value. @@ -35,6 +36,26 @@ public class Caverphone1 extends AbstractCaverphone { private static final String SIX_1 = "111111"; + // Patterns are compiled once: String.replaceAll compiles its regex on every call, which on a hot + // encode path (one encode applies seventeen of them) is a large, repeated allocation. + private static final Pattern NON_LOWER = Pattern.compile("[^a-z]"); + private static final Pattern START_COUGH = Pattern.compile("^cough"); + private static final Pattern START_ROUGH = Pattern.compile("^rough"); + private static final Pattern START_TOUGH = Pattern.compile("^tough"); + private static final Pattern START_ENOUGH = Pattern.compile("^enough"); + private static final Pattern START_GN = Pattern.compile("^gn"); + private static final Pattern FINAL_MB = Pattern.compile("mb$"); + private static final Pattern START_VOWEL = Pattern.compile("^[aeiou]"); + private static final Pattern VOWEL = Pattern.compile("[aeiou]"); + private static final Pattern RUN_S = Pattern.compile("s+"); + private static final Pattern RUN_T = Pattern.compile("t+"); + private static final Pattern RUN_P = Pattern.compile("p+"); + private static final Pattern RUN_K = Pattern.compile("k+"); + private static final Pattern RUN_F = Pattern.compile("f+"); + private static final Pattern RUN_M = Pattern.compile("m+"); + private static final Pattern RUN_N = 
Pattern.compile("n+"); + private static final Pattern START_H = Pattern.compile("^h"); + /** * Constructs a new instance. */ @@ -60,18 +81,18 @@ public String encode(final String source) { txt = txt.toLowerCase(Locale.ENGLISH); // 2. Remove anything not A-Z - txt = txt.replaceAll("[^a-z]", ""); + txt = NON_LOWER.matcher(txt).replaceAll(""); // 3. Handle various start options // 2 is a temporary placeholder to indicate a consonant which we are no longer interested in. - txt = txt.replaceAll("^cough", "cou2f"); - txt = txt.replaceAll("^rough", "rou2f"); - txt = txt.replaceAll("^tough", "tou2f"); - txt = txt.replaceAll("^enough", "enou2f"); - txt = txt.replaceAll("^gn", "2n"); + txt = START_COUGH.matcher(txt).replaceAll("cou2f"); + txt = START_ROUGH.matcher(txt).replaceAll("rou2f"); + txt = START_TOUGH.matcher(txt).replaceAll("tou2f"); + txt = START_ENOUGH.matcher(txt).replaceAll("enou2f"); + txt = START_GN.matcher(txt).replaceAll("2n"); // End - txt = txt.replaceAll("mb$", "m2"); + txt = FINAL_MB.matcher(txt).replaceAll("m2"); // 4. 
Handle replacements txt = txt.replace("cq", "2q"); @@ -91,25 +112,25 @@ public String encode(final String source) { txt = txt.replace("b", "p"); txt = txt.replace("sh", "s2"); txt = txt.replace("z", "s"); - txt = txt.replaceAll("^[aeiou]", "A"); + txt = START_VOWEL.matcher(txt).replaceAll("A"); // 3 is a temporary placeholder marking a vowel - txt = txt.replaceAll("[aeiou]", "3"); + txt = VOWEL.matcher(txt).replaceAll("3"); txt = txt.replace("3gh3", "3kh3"); txt = txt.replace("gh", "22"); txt = txt.replace("g", "k"); - txt = txt.replaceAll("s+", "S"); - txt = txt.replaceAll("t+", "T"); - txt = txt.replaceAll("p+", "P"); - txt = txt.replaceAll("k+", "K"); - txt = txt.replaceAll("f+", "F"); - txt = txt.replaceAll("m+", "M"); - txt = txt.replaceAll("n+", "N"); + txt = RUN_S.matcher(txt).replaceAll("S"); + txt = RUN_T.matcher(txt).replaceAll("T"); + txt = RUN_P.matcher(txt).replaceAll("P"); + txt = RUN_K.matcher(txt).replaceAll("K"); + txt = RUN_F.matcher(txt).replaceAll("F"); + txt = RUN_M.matcher(txt).replaceAll("M"); + txt = RUN_N.matcher(txt).replaceAll("N"); txt = txt.replace("w3", "W3"); txt = txt.replace("wy", "Wy"); // 1.0 only txt = txt.replace("wh3", "Wh3"); txt = txt.replace("why", "Why"); // 1.0 only txt = txt.replace("w", "2"); - txt = txt.replaceAll("^h", "A"); + txt = START_H.matcher(txt).replaceAll("A"); txt = txt.replace("h", "2"); txt = txt.replace("r3", "R3"); txt = txt.replace("ry", "Ry"); // 1.0 only From 43f28019081f6e53ab29d40cabaf09f035faadb5 Mon Sep 17 00:00:00 2001 From: Nishant Mehta Date: Sun, 28 Jun 2026 19:19:16 -0400 Subject: [PATCH 3/3] Compile MatchRatingApproachEncoder patterns once cleanName() and the right-to-left comparison built and compiled regexes on every call: five punctuation-trimming regexes (in a per-call String[]), a whitespace-collapse regex applied at three call sites, and a boundary whitespace-run regex in removeVowels(). String.replaceAll recompiles its regex argument on every call, so these nine call sites each compiled a Pattern every time they ran.
Hoist them into static final Pattern constants (the five punctuation regexes into a Pattern[]) and apply them with Matcher.replaceAll. The regexes, their order, and the empty/space replacements are unchanged, so the produced codes are identical. Measured with ThreadMXBean.getThreadAllocatedBytes (200k warmed iters): encode("Smith") 5612 -> 1597 B/op, encode("O'Brien") 6040 -> 1952, encode("Thompson") 5872 -> 1784 (about -70%). MatchRatingApproachEncoderTest (100 tests) passes. Signed-off-by: Nishant Mehta --- .../language/MatchRatingApproachEncoder.java | 22 +++++++++++++------ 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/src/main/java/org/apache/commons/codec/language/MatchRatingApproachEncoder.java b/src/main/java/org/apache/commons/codec/language/MatchRatingApproachEncoder.java index 1ae00c3a35..f0ffb94781 100644 --- a/src/main/java/org/apache/commons/codec/language/MatchRatingApproachEncoder.java +++ b/src/main/java/org/apache/commons/codec/language/MatchRatingApproachEncoder.java @@ -17,6 +17,7 @@ package org.apache.commons.codec.language; import java.util.Locale; +import java.util.regex.Pattern; import org.apache.commons.codec.EncoderException; import org.apache.commons.codec.StringEncoder; @@ -63,6 +64,14 @@ public class MatchRatingApproachEncoder implements StringEncoder { { "BB", "CC", "DD", "FF", "GG", "HH", "JJ", "KK", "LL", "MM", "NN", "PP", "QQ", "RR", "SS", "TT", "VV", "WW", "XX", "YY", "ZZ" }; + // Patterns are compiled once: String.replaceAll compiles its regex on every call, which on a hot + // encode path is a repeated allocation. The regexes are unchanged, so the produced codes are identical. 
+ private static final Pattern[] PUNCTUATION_TO_TRIM = { + Pattern.compile("\\-"), Pattern.compile("[&]"), Pattern.compile("\\'"), + Pattern.compile("\\."), Pattern.compile("[\\,]") }; + private static final Pattern WHITESPACE = Pattern.compile("\\s+"); + private static final Pattern WHITESPACE_RUN_AT_BOUNDARY = Pattern.compile("\\s{2,}\\b"); + /** * Constructs a new instance. */ @@ -86,13 +95,12 @@ public MatchRatingApproachEncoder() { String cleanName(final String name) { String upperName = name.toUpperCase(Locale.ENGLISH); - final String[] charsToTrim = { "\\-", "[&]", "\\'", "\\.", "[\\,]" }; - for (final String str : charsToTrim) { - upperName = upperName.replaceAll(str, EMPTY); + for (final Pattern pattern : PUNCTUATION_TO_TRIM) { + upperName = pattern.matcher(upperName).replaceAll(EMPTY); } upperName = removeAccents(upperName); - return upperName.replaceAll("\\s+", EMPTY); + return WHITESPACE.matcher(upperName).replaceAll(EMPTY); } /** @@ -337,8 +345,8 @@ int leftToRightThenRightToLeftProcessing(final String name1, final String name2) } // Char arrays -> string & remove extraneous space - final String strA = new String(name1Char).replaceAll("\\s+", EMPTY); - final String strB = new String(name2Char).replaceAll("\\s+", EMPTY); + final String strA = WHITESPACE.matcher(new String(name1Char)).replaceAll(EMPTY); + final String strB = WHITESPACE.matcher(new String(name2Char)).replaceAll(EMPTY); // Final bit - subtract the longest string from 6 and return this int value if (strA.length() > strB.length()) { @@ -421,7 +429,7 @@ String removeVowels(String name) { name = name.replace("O", EMPTY); name = name.replace("U", EMPTY); - name = name.replaceAll("\\s{2,}\\b", SPACE); + name = WHITESPACE_RUN_AT_BOUNDARY.matcher(name).replaceAll(SPACE); // return isVowel(firstLetter) ? (firstLetter + name) : name; if (isVowel(firstLetter)) {