Apache Commons Codec Source Code

Apache Commons Codec library provides implementations of common encoders and decoders such as Base64, Hex, Phonetic and URLs.

Apache Commons Codec Source Code files are provided in both binary packge (commons-codec-3.15-bin.zip) and source package (commons-codec-3.15-src.zip). You can download them at Apache Commons Codec Website.

Apache Commons Codec Source Code has no dependencies. You can compile it to generate your own version of Apache Commons Codec JAR file.

The source code of Apache Commons Codec 1.15 is provided below:

✍: FYIcenter

org/apache/commons/codec/language/Nysiis.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.codec.language;

import java.util.regex.Pattern;

import org.apache.commons.codec.EncoderException;
import org.apache.commons.codec.StringEncoder;

/**
 * Encodes a string into a NYSIIS value. NYSIIS is an encoding used to relate similar names, but can also be used as a
 * general purpose scheme to find word with similar phonemes.
 * <p>
 * NYSIIS features an accuracy increase of 2.7% over the traditional Soundex algorithm.
 * <p>
 * Algorithm description:
 * <pre>
 * 1. Transcode first characters of name
 *   1a. MAC -&gt;   MCC
 *   1b. KN  -&gt;   NN
 *   1c. K   -&gt;   C
 *   1d. PH  -&gt;   FF
 *   1e. PF  -&gt;   FF
 *   1f. SCH -&gt;   SSS
 * 2. Transcode last characters of name
 *   2a. EE, IE          -&gt;   Y
 *   2b. DT,RT,RD,NT,ND  -&gt;   D
 * 3. First character of key = first character of name
 * 4. Transcode remaining characters by following these rules, incrementing by one character each time
 *   4a. EV  -&gt;   AF  else A,E,I,O,U -&gt; A
 *   4b. Q   -&gt;   G
 *   4c. Z   -&gt;   S
 *   4d. M   -&gt;   N
 *   4e. KN  -&gt;   N   else K -&gt; C
 *   4f. SCH -&gt;   SSS
 *   4g. PH  -&gt;   FF
 *   4h. H   -&gt;   If previous or next is nonvowel, previous
 *   4i. W   -&gt;   If previous is vowel, previous
 *   4j. Add current to key if current != last key character
 * 5. If last character is S, remove it
 * 6. If last characters are AY, replace with Y
 * 7. If last character is A, remove it
 * 8. Collapse all strings of repeated characters
 * 9. Add original first character of name as first character of key
 * </pre>
 * <p>
 * This class is immutable and thread-safe.
 *
 * @see <a href="http://en.wikipedia.org/wiki/NYSIIS">NYSIIS on Wikipedia</a>
 * @see <a href="http://www.dropby.com/NYSIIS.html">NYSIIS on dropby.com</a>
 * @see Soundex
 * @since 1.7
 */
public class Nysiis implements StringEncoder {

    private static final char[] CHARS_A   = new char[] { 'A' };
    private static final char[] CHARS_AF  = new char[] { 'A', 'F' };
    private static final char[] CHARS_C   = new char[] { 'C' };
    private static final char[] CHARS_FF  = new char[] { 'F', 'F' };
    private static final char[] CHARS_G   = new char[] { 'G' };
    private static final char[] CHARS_N   = new char[] { 'N' };
    private static final char[] CHARS_NN  = new char[] { 'N', 'N' };
    private static final char[] CHARS_S   = new char[] { 'S' };
    private static final char[] CHARS_SSS = new char[] { 'S', 'S', 'S' };

    private static final Pattern PAT_MAC    = Pattern.compile("^MAC");
    private static final Pattern PAT_KN     = Pattern.compile("^KN");
    private static final Pattern PAT_K      = Pattern.compile("^K");
    private static final Pattern PAT_PH_PF  = Pattern.compile("^(PH|PF)");
    private static final Pattern PAT_SCH    = Pattern.compile("^SCH");
    private static final Pattern PAT_EE_IE  = Pattern.compile("(EE|IE)$");
    private static final Pattern PAT_DT_ETC = Pattern.compile("(DT|RT|RD|NT|ND)$");

    private static final char SPACE = ' ';
    private static final int TRUE_LENGTH = 6;

    /**
     * Tests if the given character is a vowel.
     *
     * @param c
     *            the character to test
     * @return {@code true} if the character is a vowel, {@code false} otherwise
     */
    private static boolean isVowel(final char c) {
        return c == 'A' || c == 'E' || c == 'I' || c == 'O' || c == 'U';
    }

    /**
     * Transcodes the remaining parts of the String. The method operates on a sliding window, looking at 4 characters at
     * a time: [i-1, i, i+1, i+2].
     *
     * @param prev
     *            the previous character
     * @param curr
     *            the current character
     * @param next
     *            the next character
     * @param aNext
     *            the after next character
     * @return a transcoded array of characters, starting from the current position
     */
    private static char[] transcodeRemaining(final char prev, final char curr, final char next, final char aNext) {
        // 1. EV -> AF
        if (curr == 'E' && next == 'V') {
            return CHARS_AF;
        }

        // A, E, I, O, U -> A
        if (isVowel(curr)) {
            return CHARS_A;
        }

        // 2. Q -> G, Z -> S, M -> N
        if (curr == 'Q') {
            return CHARS_G;
        } else if (curr == 'Z') {
            return CHARS_S;
        } else if (curr == 'M') {
            return CHARS_N;
        }

        // 3. KN -> NN else K -> C
        if (curr == 'K') {
            if (next == 'N') {
                return CHARS_NN;
            }
            return CHARS_C;
        }

        // 4. SCH -> SSS
        if (curr == 'S' && next == 'C' && aNext == 'H') {
            return CHARS_SSS;
        }

        // PH -> FF
        if (curr == 'P' && next == 'H') {
            return CHARS_FF;
        }

        // 5. H -> If previous or next is a non vowel, previous.
        if (curr == 'H' && (!isVowel(prev) || !isVowel(next))) {
            return new char[] { prev };
        }

        // 6. W -> If previous is vowel, previous.
        if (curr == 'W' && isVowel(prev)) {
            return new char[] { prev };
        }

        return new char[] { curr };
    }

    /** Indicates the strict mode. */
    private final boolean strict;

    /**
     * Creates an instance of the {@link Nysiis} encoder with strict mode (original form),
     * i.e. encoded strings have a maximum length of 6.
     */
    public Nysiis() {
        this(true);
    }

    /**
     * Create an instance of the {@link Nysiis} encoder with the specified strict mode:
     *
     * <ul>
     *  <li>{@code true}: encoded strings have a maximum length of 6</li>
     *  <li>{@code false}: encoded strings may have arbitrary length</li>
     * </ul>
     *
     * @param strict
     *            the strict mode
     */
    public Nysiis(final boolean strict) {
        this.strict = strict;
    }

    /**
     * Encodes an Object using the NYSIIS algorithm. This method is provided in order to satisfy the requirements of the
     * Encoder interface, and will throw an {@link EncoderException} if the supplied object is not of type
     * {@link String}.
     *
     * @param obj
     *            Object to encode
     * @return An object (or a {@link String}) containing the NYSIIS code which corresponds to the given String.
     * @throws EncoderException
     *            if the parameter supplied is not of a {@link String}
     * @throws IllegalArgumentException
     *            if a character is not mapped
     */
    @Override
    public Object encode(final Object obj) throws EncoderException {
        if (!(obj instanceof String)) {
            throw new EncoderException("Parameter supplied to Nysiis encode is not of type java.lang.String");
        }
        return this.nysiis((String) obj);
    }

    /**
     * Encodes a String using the NYSIIS algorithm.
     *
     * @param str
     *            A String object to encode
     * @return A Nysiis code corresponding to the String supplied
     * @throws IllegalArgumentException
     *            if a character is not mapped
     */
    @Override
    public String encode(final String str) {
        return this.nysiis(str);
    }

    /**
     * Indicates the strict mode for this {@link Nysiis} encoder.
     *
     * @return {@code true} if the encoder is configured for strict mode, {@code false} otherwise
     */
    public boolean isStrict() {
        return this.strict;
    }

    /**
     * Retrieves the NYSIIS code for a given String object.
     *
     * @param str
     *            String to encode using the NYSIIS algorithm
     * @return A NYSIIS code for the String supplied
     */
    public String nysiis(String str) {
        if (str == null) {
            return null;
        }

        // Use the same clean rules as Soundex
        str = SoundexUtils.clean(str);

        if (str.length() == 0) {
            return str;
        }

        // Translate first characters of name:
        // MAC -> MCC, KN -> NN, K -> C, PH | PF -> FF, SCH -> SSS
        str = PAT_MAC.matcher(str).replaceFirst("MCC");
        str = PAT_KN.matcher(str).replaceFirst("NN");
        str = PAT_K.matcher(str).replaceFirst("C");
        str = PAT_PH_PF.matcher(str).replaceFirst("FF");
        str = PAT_SCH.matcher(str).replaceFirst("SSS");

        // Translate last characters of name:
        // EE -> Y, IE -> Y, DT | RT | RD | NT | ND -> D
        str = PAT_EE_IE.matcher(str).replaceFirst("Y");
        str = PAT_DT_ETC.matcher(str).replaceFirst("D");

        // First character of key = first character of name.
        final StringBuilder key = new StringBuilder(str.length());
        key.append(str.charAt(0));

        // Transcode remaining characters, incrementing by one character each time
        final char[] chars = str.toCharArray();
        final int len = chars.length;

        for (int i = 1; i < len; i++) {
            final char next = i < len - 1 ? chars[i + 1] : SPACE;
            final char aNext = i < len - 2 ? chars[i + 2] : SPACE;
            final char[] transcoded = transcodeRemaining(chars[i - 1], chars[i], next, aNext);
            System.arraycopy(transcoded, 0, chars, i, transcoded.length);

            // only append the current char to the key if it is different from the last one
            if (chars[i] != chars[i - 1]) {
                key.append(chars[i]);
            }
        }

        if (key.length() > 1) {
            char lastChar = key.charAt(key.length() - 1);

            // If last character is S, remove it.
            if (lastChar == 'S') {
                key.deleteCharAt(key.length() - 1);
                lastChar = key.charAt(key.length() - 1);
            }

            if (key.length() > 2) {
                final char last2Char = key.charAt(key.length() - 2);
                // If last characters are AY, replace with Y.
                if (last2Char == 'A' && lastChar == 'Y') {
                    key.deleteCharAt(key.length() - 2);
                }
            }

            // If last character is A, remove it.
            if (lastChar == 'A') {
                key.deleteCharAt(key.length() - 1);
            }
        }

        final String string = key.toString();
        return this.isStrict() ? string.substring(0, Math.min(TRUE_LENGTH, string.length())) : string;
    }

}

org/apache/commons/codec/language/Nysiis.java

 

Or download all of them as a single archive file:

File name: commons-codec-1.15-sources.jar
File size: 364441 bytes
Release date: 2020-09-01
Download 

 

Download and Install commons-codec-1.15-bin.zip

What Is commons-codec.jar

Downloading and Reviewing commons-codec.jar

⇑⇑ FAQ for Apache Commons Codec JAR Library

2020-06-28, 105537👍, 1💬