Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 38 additions & 17 deletions src/main/java/org/apache/commons/codec/language/Caverphone1.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
package org.apache.commons.codec.language;

import java.util.Locale;
import java.util.regex.Pattern;

/**
* Encodes a string into a Caverphone 1.0 value.
Expand All @@ -35,6 +36,26 @@ public class Caverphone1 extends AbstractCaverphone {

private static final String SIX_1 = "111111";

// Patterns are compiled once: String.replaceAll recompiles its regex on every call, which on a hot
// encode path (one encode applies seventeen of them) means repeated compilation work and allocation.
private static final Pattern NON_LOWER = Pattern.compile("[^a-z]");
private static final Pattern START_COUGH = Pattern.compile("^cough");
private static final Pattern START_ROUGH = Pattern.compile("^rough");
private static final Pattern START_TOUGH = Pattern.compile("^tough");
private static final Pattern START_ENOUGH = Pattern.compile("^enough");
private static final Pattern START_GN = Pattern.compile("^gn");
private static final Pattern FINAL_MB = Pattern.compile("mb$");
private static final Pattern START_VOWEL = Pattern.compile("^[aeiou]");
private static final Pattern VOWEL = Pattern.compile("[aeiou]");
private static final Pattern RUN_S = Pattern.compile("s+");
private static final Pattern RUN_T = Pattern.compile("t+");
private static final Pattern RUN_P = Pattern.compile("p+");
private static final Pattern RUN_K = Pattern.compile("k+");
private static final Pattern RUN_F = Pattern.compile("f+");
private static final Pattern RUN_M = Pattern.compile("m+");
private static final Pattern RUN_N = Pattern.compile("n+");
private static final Pattern START_H = Pattern.compile("^h");

/**
* Constructs a new instance.
*/
Expand All @@ -60,18 +81,18 @@ public String encode(final String source) {
txt = txt.toLowerCase(Locale.ENGLISH);

// 2. Remove anything not a-z (the text was lower-cased in the previous step)
txt = txt.replaceAll("[^a-z]", "");
txt = NON_LOWER.matcher(txt).replaceAll("");

// 3. Handle various start options
// 2 is a temporary placeholder to indicate a consonant which we are no longer interested in.
txt = txt.replaceAll("^cough", "cou2f");
txt = txt.replaceAll("^rough", "rou2f");
txt = txt.replaceAll("^tough", "tou2f");
txt = txt.replaceAll("^enough", "enou2f");
txt = txt.replaceAll("^gn", "2n");
txt = START_COUGH.matcher(txt).replaceAll("cou2f");
txt = START_ROUGH.matcher(txt).replaceAll("rou2f");
txt = START_TOUGH.matcher(txt).replaceAll("tou2f");
txt = START_ENOUGH.matcher(txt).replaceAll("enou2f");
txt = START_GN.matcher(txt).replaceAll("2n");

// End
txt = txt.replaceAll("mb$", "m2");
txt = FINAL_MB.matcher(txt).replaceAll("m2");

// 4. Handle replacements
txt = txt.replace("cq", "2q");
Expand All @@ -91,25 +112,25 @@ public String encode(final String source) {
txt = txt.replace("b", "p");
txt = txt.replace("sh", "s2");
txt = txt.replace("z", "s");
txt = txt.replaceAll("^[aeiou]", "A");
txt = START_VOWEL.matcher(txt).replaceAll("A");
// 3 is a temporary placeholder marking a vowel
txt = txt.replaceAll("[aeiou]", "3");
txt = VOWEL.matcher(txt).replaceAll("3");
txt = txt.replace("3gh3", "3kh3");
txt = txt.replace("gh", "22");
txt = txt.replace("g", "k");
txt = txt.replaceAll("s+", "S");
txt = txt.replaceAll("t+", "T");
txt = txt.replaceAll("p+", "P");
txt = txt.replaceAll("k+", "K");
txt = txt.replaceAll("f+", "F");
txt = txt.replaceAll("m+", "M");
txt = txt.replaceAll("n+", "N");
txt = RUN_S.matcher(txt).replaceAll("S");
txt = RUN_T.matcher(txt).replaceAll("T");
txt = RUN_P.matcher(txt).replaceAll("P");
txt = RUN_K.matcher(txt).replaceAll("K");
txt = RUN_F.matcher(txt).replaceAll("F");
txt = RUN_M.matcher(txt).replaceAll("M");
txt = RUN_N.matcher(txt).replaceAll("N");
txt = txt.replace("w3", "W3");
txt = txt.replace("wy", "Wy"); // 1.0 only
txt = txt.replace("wh3", "Wh3");
txt = txt.replace("why", "Why"); // 1.0 only
txt = txt.replace("w", "2");
txt = txt.replaceAll("^h", "A");
txt = START_H.matcher(txt).replaceAll("A");
txt = txt.replace("h", "2");
txt = txt.replace("r3", "R3");
txt = txt.replace("ry", "Ry"); // 1.0 only
Expand Down
79 changes: 54 additions & 25 deletions src/main/java/org/apache/commons/codec/language/Caverphone2.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
package org.apache.commons.codec.language;

import java.util.Locale;
import java.util.regex.Pattern;

/**
* Encodes a string into a Caverphone 2.0 value.
Expand All @@ -35,6 +36,34 @@ public class Caverphone2 extends AbstractCaverphone {

private static final String TEN_1 = "1111111111";

// Patterns are compiled once: String.replaceAll recompiles its regex on every call, which on a hot
// encode path (one encode applies more than twenty of them) means repeated compilation work and allocation.
private static final Pattern NON_LOWER = Pattern.compile("[^a-z]");
private static final Pattern FINAL_E = Pattern.compile("e$");
private static final Pattern START_COUGH = Pattern.compile("^cough");
private static final Pattern START_ROUGH = Pattern.compile("^rough");
private static final Pattern START_TOUGH = Pattern.compile("^tough");
private static final Pattern START_ENOUGH = Pattern.compile("^enough");
private static final Pattern START_TROUGH = Pattern.compile("^trough");
private static final Pattern START_GN = Pattern.compile("^gn");
private static final Pattern FINAL_MB = Pattern.compile("mb$");
private static final Pattern START_VOWEL = Pattern.compile("^[aeiou]");
private static final Pattern VOWEL = Pattern.compile("[aeiou]");
private static final Pattern START_Y3 = Pattern.compile("^y3");
private static final Pattern START_Y = Pattern.compile("^y");
private static final Pattern RUN_S = Pattern.compile("s+");
private static final Pattern RUN_T = Pattern.compile("t+");
private static final Pattern RUN_P = Pattern.compile("p+");
private static final Pattern RUN_K = Pattern.compile("k+");
private static final Pattern RUN_F = Pattern.compile("f+");
private static final Pattern RUN_M = Pattern.compile("m+");
private static final Pattern RUN_N = Pattern.compile("n+");
private static final Pattern FINAL_W = Pattern.compile("w$");
private static final Pattern START_H = Pattern.compile("^h");
private static final Pattern FINAL_R = Pattern.compile("r$");
private static final Pattern FINAL_L = Pattern.compile("l$");
private static final Pattern FINAL_3 = Pattern.compile("3$");

/**
* Constructs a new instance.
*/
Expand All @@ -60,22 +89,22 @@ public String encode(final String source) {
txt = txt.toLowerCase(Locale.ENGLISH);

// 2. Remove anything not a-z (the text was lower-cased in the previous step)
txt = txt.replaceAll("[^a-z]", "");
txt = NON_LOWER.matcher(txt).replaceAll("");

// 2.5. Remove final e
txt = txt.replaceAll("e$", ""); // 2.0 only
txt = FINAL_E.matcher(txt).replaceAll(""); // 2.0 only

// 3. Handle various start options
txt = txt.replaceAll("^cough", "cou2f");
txt = txt.replaceAll("^rough", "rou2f");
txt = txt.replaceAll("^tough", "tou2f");
txt = txt.replaceAll("^enough", "enou2f"); // 2.0 only
txt = txt.replaceAll("^trough", "trou2f"); // 2.0 only
txt = START_COUGH.matcher(txt).replaceAll("cou2f");
txt = START_ROUGH.matcher(txt).replaceAll("rou2f");
txt = START_TOUGH.matcher(txt).replaceAll("tou2f");
txt = START_ENOUGH.matcher(txt).replaceAll("enou2f"); // 2.0 only
txt = START_TROUGH.matcher(txt).replaceAll("trou2f"); // 2.0 only
// Note: the spec repeats ^enough at this point, presumably a copy-and-paste error.
txt = txt.replaceAll("^gn", "2n");
txt = START_GN.matcher(txt).replaceAll("2n");

// End
txt = txt.replaceAll("mb$", "m2");
txt = FINAL_MB.matcher(txt).replaceAll("m2");

// 4. Handle replacements
txt = txt.replace("cq", "2q");
Expand All @@ -95,38 +124,38 @@ public String encode(final String source) {
txt = txt.replace("b", "p");
txt = txt.replace("sh", "s2");
txt = txt.replace("z", "s");
txt = txt.replaceAll("^[aeiou]", "A");
txt = txt.replaceAll("[aeiou]", "3");
txt = START_VOWEL.matcher(txt).replaceAll("A");
txt = VOWEL.matcher(txt).replaceAll("3");
txt = txt.replace("j", "y"); // 2.0 only
txt = txt.replaceAll("^y3", "Y3"); // 2.0 only
txt = txt.replaceAll("^y", "A"); // 2.0 only
txt = START_Y3.matcher(txt).replaceAll("Y3"); // 2.0 only
txt = START_Y.matcher(txt).replaceAll("A"); // 2.0 only
txt = txt.replace("y", "3"); // 2.0 only
txt = txt.replace("3gh3", "3kh3");
txt = txt.replace("gh", "22");
txt = txt.replace("g", "k");
txt = txt.replaceAll("s+", "S");
txt = txt.replaceAll("t+", "T");
txt = txt.replaceAll("p+", "P");
txt = txt.replaceAll("k+", "K");
txt = txt.replaceAll("f+", "F");
txt = txt.replaceAll("m+", "M");
txt = txt.replaceAll("n+", "N");
txt = RUN_S.matcher(txt).replaceAll("S");
txt = RUN_T.matcher(txt).replaceAll("T");
txt = RUN_P.matcher(txt).replaceAll("P");
txt = RUN_K.matcher(txt).replaceAll("K");
txt = RUN_F.matcher(txt).replaceAll("F");
txt = RUN_M.matcher(txt).replaceAll("M");
txt = RUN_N.matcher(txt).replaceAll("N");
txt = txt.replace("w3", "W3");
txt = txt.replace("wh3", "Wh3");
txt = txt.replaceAll("w$", "3"); // 2.0 only
txt = FINAL_W.matcher(txt).replaceAll("3"); // 2.0 only
txt = txt.replace("w", "2");
txt = txt.replaceAll("^h", "A");
txt = START_H.matcher(txt).replaceAll("A");
txt = txt.replace("h", "2");
txt = txt.replace("r3", "R3");
txt = txt.replaceAll("r$", "3"); // 2.0 only
txt = FINAL_R.matcher(txt).replaceAll("3"); // 2.0 only
txt = txt.replace("r", "2");
txt = txt.replace("l3", "L3");
txt = txt.replaceAll("l$", "3"); // 2.0 only
txt = FINAL_L.matcher(txt).replaceAll("3"); // 2.0 only
txt = txt.replace("l", "2");

// 5. Handle removals
txt = txt.replace("2", "");
txt = txt.replaceAll("3$", "A"); // 2.0 only
txt = FINAL_3.matcher(txt).replaceAll("A"); // 2.0 only
txt = txt.replace("3", "");

// 6. put ten 1s on the end
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
package org.apache.commons.codec.language;

import java.util.Locale;
import java.util.regex.Pattern;

import org.apache.commons.codec.EncoderException;
import org.apache.commons.codec.StringEncoder;
Expand Down Expand Up @@ -63,6 +64,14 @@ public class MatchRatingApproachEncoder implements StringEncoder {
{ "BB", "CC", "DD", "FF", "GG", "HH", "JJ", "KK", "LL", "MM", "NN", "PP", "QQ", "RR", "SS",
"TT", "VV", "WW", "XX", "YY", "ZZ" };

// Patterns are compiled once: String.replaceAll recompiles its regex on every call, which on a hot
// encode path means repeated compilation work and allocation. The regexes are unchanged, so the produced codes are identical.
private static final Pattern[] PUNCTUATION_TO_TRIM = {
Pattern.compile("\\-"), Pattern.compile("[&]"), Pattern.compile("\\'"),
Pattern.compile("\\."), Pattern.compile("[\\,]") };
private static final Pattern WHITESPACE = Pattern.compile("\\s+");
private static final Pattern WHITESPACE_RUN_AT_BOUNDARY = Pattern.compile("\\s{2,}\\b");

/**
* Constructs a new instance.
*/
Expand All @@ -86,13 +95,12 @@ public MatchRatingApproachEncoder() {
String cleanName(final String name) {
// Upper-case with an explicit locale so the result does not depend on the JVM default locale.
String upperName = name.toUpperCase(Locale.ENGLISH);

// Drop hyphens, ampersands, apostrophes, periods and commas. Every match is replaced with EMPTY
// (presumably the empty string; the constant is declared outside this view).
final String[] charsToTrim = { "\\-", "[&]", "\\'", "\\.", "[\\,]" };
for (final String str : charsToTrim) {
upperName = upperName.replaceAll(str, EMPTY);
for (final Pattern pattern : PUNCTUATION_TO_TRIM) {
upperName = pattern.matcher(upperName).replaceAll(EMPTY);
}

// removeAccents is declared outside this view; by its name it presumably strips diacritics - confirm.
upperName = removeAccents(upperName);
// Finally replace every run of whitespace with EMPTY.
return upperName.replaceAll("\\s+", EMPTY);
return WHITESPACE.matcher(upperName).replaceAll(EMPTY);
}

/**
Expand Down Expand Up @@ -337,8 +345,8 @@ int leftToRightThenRightToLeftProcessing(final String name1, final String name2)
}

// Char arrays -> string & remove extraneous space
final String strA = new String(name1Char).replaceAll("\\s+", EMPTY);
final String strB = new String(name2Char).replaceAll("\\s+", EMPTY);
final String strA = WHITESPACE.matcher(new String(name1Char)).replaceAll(EMPTY);
final String strB = WHITESPACE.matcher(new String(name2Char)).replaceAll(EMPTY);

// Final bit - subtract the longest string from 6 and return this int value
if (strA.length() > strB.length()) {
Expand Down Expand Up @@ -421,7 +429,7 @@ String removeVowels(String name) {
name = name.replace("O", EMPTY);
name = name.replace("U", EMPTY);

name = name.replaceAll("\\s{2,}\\b", SPACE);
name = WHITESPACE_RUN_AT_BOUNDARY.matcher(name).replaceAll(SPACE);

// return isVowel(firstLetter) ? (firstLetter + name) : name;
if (isVowel(firstLetter)) {
Expand Down