apache · nishantmehta · Jun 28, 2026 · Jun 28, 2026 · Jun 28, 2026
diff --git a/src/main/java/org/apache/commons/codec/language/Caverphone1.java b/src/main/java/org/apache/commons/codec/language/Caverphone1.java
@@ -18,6 +18,7 @@
 package org.apache.commons.codec.language;
 
 import java.util.Locale;
+import java.util.regex.Pattern;
 
 /**
  * Encodes a string into a Caverphone 1.0 value.
@@ -35,6 +36,26 @@ public class Caverphone1 extends AbstractCaverphone {
 
     private static final String SIX_1 = "111111";
 
+    // Compiled once and shared (Pattern is immutable and safe for concurrent use). String.replaceAll recompiles
+    // its regex on every call; on a hot path each encode would otherwise compile and allocate seventeen of them.
+    private static final Pattern NON_LOWER = Pattern.compile("[^a-z]");
+    private static final Pattern START_COUGH = Pattern.compile("^cough");
+    private static final Pattern START_ROUGH = Pattern.compile("^rough");
+    private static final Pattern START_TOUGH = Pattern.compile("^tough");
+    private static final Pattern START_ENOUGH = Pattern.compile("^enough");
+    private static final Pattern START_GN = Pattern.compile("^gn");
+    private static final Pattern FINAL_MB = Pattern.compile("mb$");
+    private static final Pattern START_VOWEL = Pattern.compile("^[aeiou]");
+    private static final Pattern VOWEL = Pattern.compile("[aeiou]");
+    private static final Pattern RUN_S = Pattern.compile("s+");
+    private static final Pattern RUN_T = Pattern.compile("t+");
+    private static final Pattern RUN_P = Pattern.compile("p+");
+    private static final Pattern RUN_K = Pattern.compile("k+");
+    private static final Pattern RUN_F = Pattern.compile("f+");
+    private static final Pattern RUN_M = Pattern.compile("m+");
+    private static final Pattern RUN_N = Pattern.compile("n+");
+    private static final Pattern START_H = Pattern.compile("^h");
+
     /**
      * Constructs a new instance.
      */
@@ -60,18 +81,18 @@ public String encode(final String source) {
         txt = txt.toLowerCase(Locale.ENGLISH);
 
         // 2. Remove anything not A-Z
-        txt = txt.replaceAll("[^a-z]", "");
+        txt = NON_LOWER.matcher(txt).replaceAll("");
 
         // 3. Handle various start options
         // 2 is a temporary placeholder to indicate a consonant which we are no longer interested in.
-        txt = txt.replaceAll("^cough", "cou2f");
-        txt = txt.replaceAll("^rough", "rou2f");
-        txt = txt.replaceAll("^tough", "tou2f");
-        txt = txt.replaceAll("^enough", "enou2f");
-        txt = txt.replaceAll("^gn", "2n");
+        txt = START_COUGH.matcher(txt).replaceAll("cou2f");
+        txt = START_ROUGH.matcher(txt).replaceAll("rou2f");
+        txt = START_TOUGH.matcher(txt).replaceAll("tou2f");
+        txt = START_ENOUGH.matcher(txt).replaceAll("enou2f");
+        txt = START_GN.matcher(txt).replaceAll("2n");
 
         // End
-        txt = txt.replaceAll("mb$", "m2");
+        txt = FINAL_MB.matcher(txt).replaceAll("m2");
 
         // 4. Handle replacements
         txt = txt.replace("cq", "2q");
@@ -91,25 +112,25 @@ public String encode(final String source) {
         txt = txt.replace("b", "p");
         txt = txt.replace("sh", "s2");
         txt = txt.replace("z", "s");
-        txt = txt.replaceAll("^[aeiou]", "A");
+        txt = START_VOWEL.matcher(txt).replaceAll("A");
         // 3 is a temporary placeholder marking a vowel
-        txt = txt.replaceAll("[aeiou]", "3");
+        txt = VOWEL.matcher(txt).replaceAll("3");
         txt = txt.replace("3gh3", "3kh3");
         txt = txt.replace("gh", "22");
         txt = txt.replace("g", "k");
-        txt = txt.replaceAll("s+", "S");
-        txt = txt.replaceAll("t+", "T");
-        txt = txt.replaceAll("p+", "P");
-        txt = txt.replaceAll("k+", "K");
-        txt = txt.replaceAll("f+", "F");
-        txt = txt.replaceAll("m+", "M");
-        txt = txt.replaceAll("n+", "N");
+        txt = RUN_S.matcher(txt).replaceAll("S");
+        txt = RUN_T.matcher(txt).replaceAll("T");
+        txt = RUN_P.matcher(txt).replaceAll("P");
+        txt = RUN_K.matcher(txt).replaceAll("K");
+        txt = RUN_F.matcher(txt).replaceAll("F");
+        txt = RUN_M.matcher(txt).replaceAll("M");
+        txt = RUN_N.matcher(txt).replaceAll("N");
         txt = txt.replace("w3", "W3");
         txt = txt.replace("wy", "Wy"); // 1.0 only
         txt = txt.replace("wh3", "Wh3");
         txt = txt.replace("why", "Why"); // 1.0 only
         txt = txt.replace("w", "2");
-        txt = txt.replaceAll("^h", "A");
+        txt = START_H.matcher(txt).replaceAll("A");
         txt = txt.replace("h", "2");
         txt = txt.replace("r3", "R3");
         txt = txt.replace("ry", "Ry"); // 1.0 only

diff --git a/src/main/java/org/apache/commons/codec/language/Caverphone2.java b/src/main/java/org/apache/commons/codec/language/Caverphone2.java
@@ -18,6 +18,7 @@
 package org.apache.commons.codec.language;
 
 import java.util.Locale;
+import java.util.regex.Pattern;
 
 /**
  * Encodes a string into a Caverphone 2.0 value.
@@ -35,6 +36,34 @@ public class Caverphone2 extends AbstractCaverphone {
 
     private static final String TEN_1 = "1111111111";
 
+    // Compiled once and shared (Pattern is immutable and safe for concurrent use). String.replaceAll recompiles
+    // its regex on every call; on a hot path each encode would otherwise compile and allocate more than twenty of them.
+    private static final Pattern NON_LOWER = Pattern.compile("[^a-z]");
+    private static final Pattern FINAL_E = Pattern.compile("e$");
+    private static final Pattern START_COUGH = Pattern.compile("^cough");
+    private static final Pattern START_ROUGH = Pattern.compile("^rough");
+    private static final Pattern START_TOUGH = Pattern.compile("^tough");
+    private static final Pattern START_ENOUGH = Pattern.compile("^enough");
+    private static final Pattern START_TROUGH = Pattern.compile("^trough");
+    private static final Pattern START_GN = Pattern.compile("^gn");
+    private static final Pattern FINAL_MB = Pattern.compile("mb$");
+    private static final Pattern START_VOWEL = Pattern.compile("^[aeiou]");
+    private static final Pattern VOWEL = Pattern.compile("[aeiou]");
+    private static final Pattern START_Y3 = Pattern.compile("^y3");
+    private static final Pattern START_Y = Pattern.compile("^y");
+    private static final Pattern RUN_S = Pattern.compile("s+");
+    private static final Pattern RUN_T = Pattern.compile("t+");
+    private static final Pattern RUN_P = Pattern.compile("p+");
+    private static final Pattern RUN_K = Pattern.compile("k+");
+    private static final Pattern RUN_F = Pattern.compile("f+");
+    private static final Pattern RUN_M = Pattern.compile("m+");
+    private static final Pattern RUN_N = Pattern.compile("n+");
+    private static final Pattern FINAL_W = Pattern.compile("w$");
+    private static final Pattern START_H = Pattern.compile("^h");
+    private static final Pattern FINAL_R = Pattern.compile("r$");
+    private static final Pattern FINAL_L = Pattern.compile("l$");
+    private static final Pattern FINAL_3 = Pattern.compile("3$");
+
     /**
      * Constructs a new instance.
      */
@@ -60,22 +89,22 @@ public String encode(final String source) {
         txt = txt.toLowerCase(Locale.ENGLISH);
 
         // 2. Remove anything not A-Z
-        txt = txt.replaceAll("[^a-z]", "");
+        txt = NON_LOWER.matcher(txt).replaceAll("");
 
         // 2.5. Remove final e
-        txt = txt.replaceAll("e$", ""); // 2.0 only
+        txt = FINAL_E.matcher(txt).replaceAll(""); // 2.0 only
 
         // 3. Handle various start options
-        txt = txt.replaceAll("^cough", "cou2f");
-        txt = txt.replaceAll("^rough", "rou2f");
-        txt = txt.replaceAll("^tough", "tou2f");
-        txt = txt.replaceAll("^enough", "enou2f"); // 2.0 only
-        txt = txt.replaceAll("^trough", "trou2f"); // 2.0 only
+        txt = START_COUGH.matcher(txt).replaceAll("cou2f");
+        txt = START_ROUGH.matcher(txt).replaceAll("rou2f");
+        txt = START_TOUGH.matcher(txt).replaceAll("tou2f");
+        txt = START_ENOUGH.matcher(txt).replaceAll("enou2f"); // 2.0 only
+        txt = START_TROUGH.matcher(txt).replaceAll("trou2f"); // 2.0 only
                                                    // note the spec says ^enough here again, c+p error I assume
-        txt = txt.replaceAll("^gn", "2n");
+        txt = START_GN.matcher(txt).replaceAll("2n");
 
         // End
-        txt = txt.replaceAll("mb$", "m2");
+        txt = FINAL_MB.matcher(txt).replaceAll("m2");
 
         // 4. Handle replacements
         txt = txt.replace("cq", "2q");
@@ -95,38 +124,38 @@ public String encode(final String source) {
         txt = txt.replace("b", "p");
         txt = txt.replace("sh", "s2");
         txt = txt.replace("z", "s");
-        txt = txt.replaceAll("^[aeiou]", "A");
-        txt = txt.replaceAll("[aeiou]", "3");
+        txt = START_VOWEL.matcher(txt).replaceAll("A");
+        txt = VOWEL.matcher(txt).replaceAll("3");
         txt = txt.replace("j", "y"); // 2.0 only
-        txt = txt.replaceAll("^y3", "Y3"); // 2.0 only
-        txt = txt.replaceAll("^y", "A"); // 2.0 only
+        txt = START_Y3.matcher(txt).replaceAll("Y3"); // 2.0 only
+        txt = START_Y.matcher(txt).replaceAll("A"); // 2.0 only
         txt = txt.replace("y", "3"); // 2.0 only
         txt = txt.replace("3gh3", "3kh3");
         txt = txt.replace("gh", "22");
         txt = txt.replace("g", "k");
-        txt = txt.replaceAll("s+", "S");
-        txt = txt.replaceAll("t+", "T");
-        txt = txt.replaceAll("p+", "P");
-        txt = txt.replaceAll("k+", "K");
-        txt = txt.replaceAll("f+", "F");
-        txt = txt.replaceAll("m+", "M");
-        txt = txt.replaceAll("n+", "N");
+        txt = RUN_S.matcher(txt).replaceAll("S");
+        txt = RUN_T.matcher(txt).replaceAll("T");
+        txt = RUN_P.matcher(txt).replaceAll("P");
+        txt = RUN_K.matcher(txt).replaceAll("K");
+        txt = RUN_F.matcher(txt).replaceAll("F");
+        txt = RUN_M.matcher(txt).replaceAll("M");
+        txt = RUN_N.matcher(txt).replaceAll("N");
         txt = txt.replace("w3", "W3");
         txt = txt.replace("wh3", "Wh3");
-        txt = txt.replaceAll("w$", "3"); // 2.0 only
+        txt = FINAL_W.matcher(txt).replaceAll("3"); // 2.0 only
         txt = txt.replace("w", "2");
-        txt = txt.replaceAll("^h", "A");
+        txt = START_H.matcher(txt).replaceAll("A");
         txt = txt.replace("h", "2");
         txt = txt.replace("r3", "R3");
-        txt = txt.replaceAll("r$", "3"); // 2.0 only
+        txt = FINAL_R.matcher(txt).replaceAll("3"); // 2.0 only
         txt = txt.replace("r", "2");
         txt = txt.replace("l3", "L3");
-        txt = txt.replaceAll("l$", "3"); // 2.0 only
+        txt = FINAL_L.matcher(txt).replaceAll("3"); // 2.0 only
         txt = txt.replace("l", "2");
 
         // 5. Handle removals
         txt = txt.replace("2", "");
-        txt = txt.replaceAll("3$", "A"); // 2.0 only
+        txt = FINAL_3.matcher(txt).replaceAll("A"); // 2.0 only
         txt = txt.replace("3", "");
 
         // 6. put ten 1s on the end

diff --git a/src/main/java/org/apache/commons/codec/language/MatchRatingApproachEncoder.java b/src/main/java/org/apache/commons/codec/language/MatchRatingApproachEncoder.java
@@ -17,6 +17,7 @@
 package org.apache.commons.codec.language;
 
 import java.util.Locale;
+import java.util.regex.Pattern;
 
 import org.apache.commons.codec.EncoderException;
 import org.apache.commons.codec.StringEncoder;
@@ -63,6 +64,14 @@ public class MatchRatingApproachEncoder implements StringEncoder {
             { "BB", "CC", "DD", "FF", "GG", "HH", "JJ", "KK", "LL", "MM", "NN", "PP", "QQ", "RR", "SS",
                    "TT", "VV", "WW", "XX", "YY", "ZZ" };
 
+    // Compiled once and shared (Pattern is immutable and safe for concurrent use). String.replaceAll recompiles its
+    // regex on every call, wasted work on a hot encode path; Matcher.replaceAll on these yields the identical result.
+    private static final Pattern[] PUNCTUATION_TO_TRIM = {
+            Pattern.compile("\\-"), Pattern.compile("[&]"), Pattern.compile("\\'"),
+            Pattern.compile("\\."), Pattern.compile("[\\,]") };
+    private static final Pattern WHITESPACE = Pattern.compile("\\s+");
+    private static final Pattern WHITESPACE_RUN_AT_BOUNDARY = Pattern.compile("\\s{2,}\\b");
+
     /**
      * Constructs a new instance.
      */
@@ -86,13 +95,12 @@ public MatchRatingApproachEncoder() {
     String cleanName(final String name) {
         String upperName = name.toUpperCase(Locale.ENGLISH);
 
-        final String[] charsToTrim = { "\\-", "[&]", "\\'", "\\.", "[\\,]" };
-        for (final String str : charsToTrim) {
-            upperName = upperName.replaceAll(str, EMPTY);
+        for (final Pattern pattern : PUNCTUATION_TO_TRIM) {
+            upperName = pattern.matcher(upperName).replaceAll(EMPTY);
         }
 
         upperName = removeAccents(upperName);
-        return upperName.replaceAll("\\s+", EMPTY);
+        return WHITESPACE.matcher(upperName).replaceAll(EMPTY);
     }
 
     /**
@@ -337,8 +345,8 @@ int leftToRightThenRightToLeftProcessing(final String name1, final String name2)
         }
 
         // Char arrays -> string & remove extraneous space
-        final String strA = new String(name1Char).replaceAll("\\s+", EMPTY);
-        final String strB = new String(name2Char).replaceAll("\\s+", EMPTY);
+        final String strA = WHITESPACE.matcher(new String(name1Char)).replaceAll(EMPTY);
+        final String strB = WHITESPACE.matcher(new String(name2Char)).replaceAll(EMPTY);
 
         // Final bit - subtract the longest string from 6 and return this int value
         if (strA.length() > strB.length()) {
@@ -421,7 +429,7 @@ String removeVowels(String name) {
         name = name.replace("O", EMPTY);
         name = name.replace("U", EMPTY);
 
-        name = name.replaceAll("\\s{2,}\\b", SPACE);
+        name = WHITESPACE_RUN_AT_BOUNDARY.matcher(name).replaceAll(SPACE);
 
         // return isVowel(firstLetter) ? (firstLetter + name) : name;
         if (isVowel(firstLetter)) {