diff --git a/src/main/java/org/apache/commons/codec/language/Caverphone1.java b/src/main/java/org/apache/commons/codec/language/Caverphone1.java index decd4a5224..9b99c0da59 100644 --- a/src/main/java/org/apache/commons/codec/language/Caverphone1.java +++ b/src/main/java/org/apache/commons/codec/language/Caverphone1.java @@ -18,6 +18,7 @@ package org.apache.commons.codec.language; import java.util.Locale; +import java.util.regex.Pattern; /** * Encodes a string into a Caverphone 1.0 value. @@ -35,6 +36,26 @@ public class Caverphone1 extends AbstractCaverphone { private static final String SIX_1 = "111111"; + // Patterns are compiled once: String.replaceAll compiles its regex on every call, which on a hot + // encode path (one encode applies seventeen of them) is a large, repeated allocation. + private static final Pattern NON_LOWER = Pattern.compile("[^a-z]"); + private static final Pattern START_COUGH = Pattern.compile("^cough"); + private static final Pattern START_ROUGH = Pattern.compile("^rough"); + private static final Pattern START_TOUGH = Pattern.compile("^tough"); + private static final Pattern START_ENOUGH = Pattern.compile("^enough"); + private static final Pattern START_GN = Pattern.compile("^gn"); + private static final Pattern FINAL_MB = Pattern.compile("mb$"); + private static final Pattern START_VOWEL = Pattern.compile("^[aeiou]"); + private static final Pattern VOWEL = Pattern.compile("[aeiou]"); + private static final Pattern RUN_S = Pattern.compile("s+"); + private static final Pattern RUN_T = Pattern.compile("t+"); + private static final Pattern RUN_P = Pattern.compile("p+"); + private static final Pattern RUN_K = Pattern.compile("k+"); + private static final Pattern RUN_F = Pattern.compile("f+"); + private static final Pattern RUN_M = Pattern.compile("m+"); + private static final Pattern RUN_N = Pattern.compile("n+"); + private static final Pattern START_H = Pattern.compile("^h"); + /** * Constructs a new instance. */ @@ -60,18 +81,18 @@ public String encode(final String source) { txt = txt.toLowerCase(Locale.ENGLISH); // 2. Remove anything not A-Z - txt = txt.replaceAll("[^a-z]", ""); + txt = NON_LOWER.matcher(txt).replaceAll(""); // 3. Handle various start options // 2 is a temporary placeholder to indicate a consonant which we are no longer interested in. - txt = txt.replaceAll("^cough", "cou2f"); - txt = txt.replaceAll("^rough", "rou2f"); - txt = txt.replaceAll("^tough", "tou2f"); - txt = txt.replaceAll("^enough", "enou2f"); - txt = txt.replaceAll("^gn", "2n"); + txt = START_COUGH.matcher(txt).replaceAll("cou2f"); + txt = START_ROUGH.matcher(txt).replaceAll("rou2f"); + txt = START_TOUGH.matcher(txt).replaceAll("tou2f"); + txt = START_ENOUGH.matcher(txt).replaceAll("enou2f"); + txt = START_GN.matcher(txt).replaceAll("2n"); // End - txt = txt.replaceAll("mb$", "m2"); + txt = FINAL_MB.matcher(txt).replaceAll("m2"); // 4. Handle replacements txt = txt.replace("cq", "2q"); @@ -91,25 +112,25 @@ public String encode(final String source) { txt = txt.replace("b", "p"); txt = txt.replace("sh", "s2"); txt = txt.replace("z", "s"); - txt = txt.replaceAll("^[aeiou]", "A"); + txt = START_VOWEL.matcher(txt).replaceAll("A"); // 3 is a temporary placeholder marking a vowel - txt = txt.replaceAll("[aeiou]", "3"); + txt = VOWEL.matcher(txt).replaceAll("3"); txt = txt.replace("3gh3", "3kh3"); txt = txt.replace("gh", "22"); txt = txt.replace("g", "k"); - txt = txt.replaceAll("s+", "S"); - txt = txt.replaceAll("t+", "T"); - txt = txt.replaceAll("p+", "P"); - txt = txt.replaceAll("k+", "K"); - txt = txt.replaceAll("f+", "F"); - txt = txt.replaceAll("m+", "M"); - txt = txt.replaceAll("n+", "N"); + txt = RUN_S.matcher(txt).replaceAll("S"); + txt = RUN_T.matcher(txt).replaceAll("T"); + txt = RUN_P.matcher(txt).replaceAll("P"); + txt = RUN_K.matcher(txt).replaceAll("K"); + txt = RUN_F.matcher(txt).replaceAll("F"); + txt = RUN_M.matcher(txt).replaceAll("M"); + txt = RUN_N.matcher(txt).replaceAll("N"); txt = txt.replace("w3", "W3"); txt = txt.replace("wy", "Wy"); // 1.0 only txt = txt.replace("wh3", "Wh3"); txt = txt.replace("why", "Why"); // 1.0 only txt = txt.replace("w", "2"); - txt = txt.replaceAll("^h", "A"); + txt = START_H.matcher(txt).replaceAll("A"); txt = txt.replace("h", "2"); txt = txt.replace("r3", "R3"); txt = txt.replace("ry", "Ry"); // 1.0 only diff --git a/src/main/java/org/apache/commons/codec/language/Caverphone2.java b/src/main/java/org/apache/commons/codec/language/Caverphone2.java index e58c4ddf32..d232ab6a7c 100644 --- a/src/main/java/org/apache/commons/codec/language/Caverphone2.java +++ b/src/main/java/org/apache/commons/codec/language/Caverphone2.java @@ -18,6 +18,7 @@ package org.apache.commons.codec.language; import java.util.Locale; +import java.util.regex.Pattern; /** * Encodes a string into a Caverphone 2.0 value. @@ -35,6 +36,34 @@ public class Caverphone2 extends AbstractCaverphone { private static final String TEN_1 = "1111111111"; + // Patterns are compiled once: String.replaceAll compiles its regex on every call, which on a hot + // encode path (one encode applies more than twenty of them) is a large, repeated allocation. + private static final Pattern NON_LOWER = Pattern.compile("[^a-z]"); + private static final Pattern FINAL_E = Pattern.compile("e$"); + private static final Pattern START_COUGH = Pattern.compile("^cough"); + private static final Pattern START_ROUGH = Pattern.compile("^rough"); + private static final Pattern START_TOUGH = Pattern.compile("^tough"); + private static final Pattern START_ENOUGH = Pattern.compile("^enough"); + private static final Pattern START_TROUGH = Pattern.compile("^trough"); + private static final Pattern START_GN = Pattern.compile("^gn"); + private static final Pattern FINAL_MB = Pattern.compile("mb$"); + private static final Pattern START_VOWEL = Pattern.compile("^[aeiou]"); + private static final Pattern VOWEL = Pattern.compile("[aeiou]"); + private static final Pattern START_Y3 = Pattern.compile("^y3"); + private static final Pattern START_Y = Pattern.compile("^y"); + private static final Pattern RUN_S = Pattern.compile("s+"); + private static final Pattern RUN_T = Pattern.compile("t+"); + private static final Pattern RUN_P = Pattern.compile("p+"); + private static final Pattern RUN_K = Pattern.compile("k+"); + private static final Pattern RUN_F = Pattern.compile("f+"); + private static final Pattern RUN_M = Pattern.compile("m+"); + private static final Pattern RUN_N = Pattern.compile("n+"); + private static final Pattern FINAL_W = Pattern.compile("w$"); + private static final Pattern START_H = Pattern.compile("^h"); + private static final Pattern FINAL_R = Pattern.compile("r$"); + private static final Pattern FINAL_L = Pattern.compile("l$"); + private static final Pattern FINAL_3 = Pattern.compile("3$"); + /** * Constructs a new instance. */ @@ -60,22 +89,22 @@ public String encode(final String source) { txt = txt.toLowerCase(Locale.ENGLISH); // 2. Remove anything not A-Z - txt = txt.replaceAll("[^a-z]", ""); + txt = NON_LOWER.matcher(txt).replaceAll(""); // 2.5. Remove final e - txt = txt.replaceAll("e$", ""); // 2.0 only + txt = FINAL_E.matcher(txt).replaceAll(""); // 2.0 only // 3. Handle various start options - txt = txt.replaceAll("^cough", "cou2f"); - txt = txt.replaceAll("^rough", "rou2f"); - txt = txt.replaceAll("^tough", "tou2f"); - txt = txt.replaceAll("^enough", "enou2f"); // 2.0 only - txt = txt.replaceAll("^trough", "trou2f"); // 2.0 only + txt = START_COUGH.matcher(txt).replaceAll("cou2f"); + txt = START_ROUGH.matcher(txt).replaceAll("rou2f"); + txt = START_TOUGH.matcher(txt).replaceAll("tou2f"); + txt = START_ENOUGH.matcher(txt).replaceAll("enou2f"); // 2.0 only + txt = START_TROUGH.matcher(txt).replaceAll("trou2f"); // 2.0 only // note the spec says ^enough here again, c+p error I assume - txt = txt.replaceAll("^gn", "2n"); + txt = START_GN.matcher(txt).replaceAll("2n"); // End - txt = txt.replaceAll("mb$", "m2"); + txt = FINAL_MB.matcher(txt).replaceAll("m2"); // 4. Handle replacements txt = txt.replace("cq", "2q"); @@ -95,38 +124,38 @@ public String encode(final String source) { txt = txt.replace("b", "p"); txt = txt.replace("sh", "s2"); txt = txt.replace("z", "s"); - txt = txt.replaceAll("^[aeiou]", "A"); - txt = txt.replaceAll("[aeiou]", "3"); + txt = START_VOWEL.matcher(txt).replaceAll("A"); + txt = VOWEL.matcher(txt).replaceAll("3"); txt = txt.replace("j", "y"); // 2.0 only - txt = txt.replaceAll("^y3", "Y3"); // 2.0 only - txt = txt.replaceAll("^y", "A"); // 2.0 only + txt = START_Y3.matcher(txt).replaceAll("Y3"); // 2.0 only + txt = START_Y.matcher(txt).replaceAll("A"); // 2.0 only txt = txt.replace("y", "3"); // 2.0 only txt = txt.replace("3gh3", "3kh3"); txt = txt.replace("gh", "22"); txt = txt.replace("g", "k"); - txt = txt.replaceAll("s+", "S"); - txt = txt.replaceAll("t+", "T"); - txt = txt.replaceAll("p+", "P"); - txt = txt.replaceAll("k+", "K"); - txt = txt.replaceAll("f+", "F"); - txt = txt.replaceAll("m+", "M"); - txt = txt.replaceAll("n+", "N"); + txt = RUN_S.matcher(txt).replaceAll("S"); + txt = RUN_T.matcher(txt).replaceAll("T"); + txt = RUN_P.matcher(txt).replaceAll("P"); + txt = RUN_K.matcher(txt).replaceAll("K"); + txt = RUN_F.matcher(txt).replaceAll("F"); + txt = RUN_M.matcher(txt).replaceAll("M"); + txt = RUN_N.matcher(txt).replaceAll("N"); txt = txt.replace("w3", "W3"); txt = txt.replace("wh3", "Wh3"); - txt = txt.replaceAll("w$", "3"); // 2.0 only + txt = FINAL_W.matcher(txt).replaceAll("3"); // 2.0 only txt = txt.replace("w", "2"); - txt = txt.replaceAll("^h", "A"); + txt = START_H.matcher(txt).replaceAll("A"); txt = txt.replace("h", "2"); txt = txt.replace("r3", "R3"); - txt = txt.replaceAll("r$", "3"); // 2.0 only + txt = FINAL_R.matcher(txt).replaceAll("3"); // 2.0 only txt = txt.replace("r", "2"); txt = txt.replace("l3", "L3"); - txt = txt.replaceAll("l$", "3"); // 2.0 only + txt = FINAL_L.matcher(txt).replaceAll("3"); // 2.0 only txt = txt.replace("l", "2"); // 5. Handle removals txt = txt.replace("2", ""); - txt = txt.replaceAll("3$", "A"); // 2.0 only + txt = FINAL_3.matcher(txt).replaceAll("A"); // 2.0 only txt = txt.replace("3", ""); // 6. put ten 1s on the end diff --git a/src/main/java/org/apache/commons/codec/language/MatchRatingApproachEncoder.java b/src/main/java/org/apache/commons/codec/language/MatchRatingApproachEncoder.java index 1ae00c3a35..f0ffb94781 100644 --- a/src/main/java/org/apache/commons/codec/language/MatchRatingApproachEncoder.java +++ b/src/main/java/org/apache/commons/codec/language/MatchRatingApproachEncoder.java @@ -17,6 +17,7 @@ package org.apache.commons.codec.language; import java.util.Locale; +import java.util.regex.Pattern; import org.apache.commons.codec.EncoderException; import org.apache.commons.codec.StringEncoder; @@ -63,6 +64,14 @@ public class MatchRatingApproachEncoder implements StringEncoder { { "BB", "CC", "DD", "FF", "GG", "HH", "JJ", "KK", "LL", "MM", "NN", "PP", "QQ", "RR", "SS", "TT", "VV", "WW", "XX", "YY", "ZZ" }; + // Patterns are compiled once: String.replaceAll compiles its regex on every call, which on a hot + // encode path is a repeated allocation. The regexes are unchanged, so the produced codes are identical. + private static final Pattern[] PUNCTUATION_TO_TRIM = { + Pattern.compile("\\-"), Pattern.compile("[&]"), Pattern.compile("\\'"), + Pattern.compile("\\."), Pattern.compile("[\\,]") }; + private static final Pattern WHITESPACE = Pattern.compile("\\s+"); + private static final Pattern WHITESPACE_RUN_AT_BOUNDARY = Pattern.compile("\\s{2,}\\b"); + /** * Constructs a new instance. */ @@ -86,13 +95,12 @@ public MatchRatingApproachEncoder() { String cleanName(final String name) { String upperName = name.toUpperCase(Locale.ENGLISH); - final String[] charsToTrim = { "\\-", "[&]", "\\'", "\\.", "[\\,]" }; - for (final String str : charsToTrim) { - upperName = upperName.replaceAll(str, EMPTY); + for (final Pattern pattern : PUNCTUATION_TO_TRIM) { + upperName = pattern.matcher(upperName).replaceAll(EMPTY); } upperName = removeAccents(upperName); - return upperName.replaceAll("\\s+", EMPTY); + return WHITESPACE.matcher(upperName).replaceAll(EMPTY); } /** @@ -337,8 +345,8 @@ int leftToRightThenRightToLeftProcessing(final String name1, final String name2) } // Char arrays -> string & remove extraneous space - final String strA = new String(name1Char).replaceAll("\\s+", EMPTY); - final String strB = new String(name2Char).replaceAll("\\s+", EMPTY); + final String strA = WHITESPACE.matcher(new String(name1Char)).replaceAll(EMPTY); + final String strB = WHITESPACE.matcher(new String(name2Char)).replaceAll(EMPTY); // Final bit - subtract the longest string from 6 and return this int value if (strA.length() > strB.length()) { @@ -421,7 +429,7 @@ String removeVowels(String name) { name = name.replace("O", EMPTY); name = name.replace("U", EMPTY); - name = name.replaceAll("\\s{2,}\\b", SPACE); + name = WHITESPACE_RUN_AT_BOUNDARY.matcher(name).replaceAll(SPACE); // return isVowel(firstLetter) ? (firstLetter + name) : name; if (isVowel(firstLetter)) {