// PinyinHelper.java
package com.wugian.ping;

import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * Converts Chinese characters to pinyin.
 *
 * <p>Single characters are looked up in {@code PINYIN_TABLE}. Multi-character words are
 * looked up in {@code MUTIL_PINYIN_TABLE}; the keys of that table are kept sorted in
 * {@code dict} and indexed by {@code DOUBLE_ARRAY_TRIE} so that word prefixes of the
 * remaining input can be found while scanning.
 *
 * <p>NOTE(review): the dictionaries are shared mutable static state and nothing in this
 * class synchronizes access to them — confirm callers do not add dictionaries while
 * conversions are running on other threads.
 *
 * @author stuxuhai (dczxxuhai@gmail.com)
 */
public final class PinyinHelper {
    /** Sorted keys of {@link #MUTIL_PINYIN_TABLE}; trie search results are indices into this list. */
    private static final List<String> dict = new ArrayList<String>();
    /** Maps a single character to its comma-separated, tone-marked pinyin readings. */
    private static final Map<String, String> PINYIN_TABLE = PinyinResource.getPinyinResource();
    /** Maps a multi-character word to the comma-separated pinyin of its syllables. */
    private static final Map<String, String> MUTIL_PINYIN_TABLE = PinyinResource.getMutilPinyinResource();
    private static final DoubleArrayTrie DOUBLE_ARRAY_TRIE = new DoubleArrayTrie();
    private static final String PINYIN_SEPARATOR = ","; // pinyin separator used inside the tables
    /** Plain vowels; index {@code k} corresponds to the four marked vowels at {@code 4k..4k+3}. */
    private static final String ALL_UNMARKED_VOWEL = "aeiouv";
    /** Tone-marked vowels, four tones per vowel, in the same vowel order as {@link #ALL_UNMARKED_VOWEL}. */
    private static final String ALL_MARKED_VOWEL = "āáǎàēéěèīíǐìōóǒòūúǔùǖǘǚǜ";

    static {
        loadWordsIntoTrie();
    }

    private PinyinHelper() {
    }

    /**
     * Fills {@code dict} with the sorted keys of {@code MUTIL_PINYIN_TABLE} and builds the trie
     * from it. Callers that reload must clear {@code dict} and the trie first.
     */
    private static void loadWordsIntoTrie() {
        dict.addAll(MUTIL_PINYIN_TABLE.keySet());
        Collections.sort(dict);
        DOUBLE_ARRAY_TRIE.build(dict);
    }

    /**
     * Converts comma-separated tone-marked pinyin into tone-number form (for example
     * {@code "hǎo"} becomes {@code "hao3"}).
     *
     * <p>{@code ü} is written as {@code v}. A syllable containing no tone-marked vowel is
     * treated as neutral tone and gets the suffix {@code 5}. Characters that are neither
     * plain letters nor one of the known tone-marked vowels are left untouched.
     *
     * @param pinyinArrayString comma-separated tone-marked pinyin
     * @return one tone-number syllable per input syllable
     */
    private static String[] convertWithToneNumber(String pinyinArrayString) {
        String[] pinyinArray = pinyinArrayString.split(PINYIN_SEPARATOR);
        for (int i = pinyinArray.length - 1; i >= 0; i--) {
            String originalPinyin = pinyinArray[i].replace("ü", "v"); // write ü as v
            String converted = null;

            // Scan from the end for the first tone-marked vowel.
            for (int j = originalPinyin.length() - 1; j >= 0; j--) {
                char originalChar = originalPinyin.charAt(j);
                int indexInAllMarked = ALL_MARKED_VOWEL.indexOf(originalChar);
                if (indexInAllMarked < 0) {
                    // Not a tone-marked vowel: must not be mapped to a vowel/tone.
                    continue;
                }
                int toneNumber = indexInAllMarked % 4 + 1;
                char replaceChar = ALL_UNMARKED_VOWEL.charAt(indexInAllMarked / 4);
                converted = originalPinyin.replace(String.valueOf(originalChar), String.valueOf(replaceChar))
                        + toneNumber;
                break;
            }

            // No tone-marked vowel means neutral tone, written as 5.
            pinyinArray[i] = (converted != null) ? converted : originalPinyin + "5";
        }
        return pinyinArray;
    }

    /**
     * Strips tone marks from comma-separated pinyin and writes {@code ü} as {@code v}.
     *
     * @param pinyinArrayString comma-separated tone-marked pinyin
     * @return one unmarked syllable per input syllable
     */
    private static String[] convertWithoutTone(String pinyinArrayString) {
        String unmarked = pinyinArrayString;
        for (int i = ALL_MARKED_VOWEL.length() - 1; i >= 0; i--) {
            char originalChar = ALL_MARKED_VOWEL.charAt(i);
            char replaceChar = ALL_UNMARKED_VOWEL.charAt(i / 4);
            unmarked = unmarked.replace(String.valueOf(originalChar), String.valueOf(replaceChar));
        }
        return unmarked.replace("ü", "v").split(PINYIN_SEPARATOR);
    }

    /**
     * Splits comma-separated tone-marked pinyin and renders each syllable in the requested format.
     *
     * @param pinyinString comma-separated tone-marked pinyin
     * @param pinyinFormat output format
     * @return the formatted syllables, or an empty array for a format not handled here
     */
    private static String[] formatPinyin(String pinyinString, PinyinFormat pinyinFormat) {
        if (pinyinFormat == PinyinFormat.WITH_TONE_MARK) {
            return pinyinString.split(PINYIN_SEPARATOR);
        } else if (pinyinFormat == PinyinFormat.WITH_TONE_NUMBER) {
            return convertWithToneNumber(pinyinString);
        } else if (pinyinFormat == PinyinFormat.WITHOUT_TONE) {
            return convertWithoutTone(pinyinString);
        }
        return new String[0];
    }

    /**
     * Returns the distinct pinyin readings of a single character in the given format.
     *
     * @param c            the character to convert
     * @param pinyinFormat output format
     * @return the readings in table order without duplicates; an empty array (never
     *         {@code null}) when the character has no entry in the pinyin table
     */
    public static String[] convertToPinyinArray(char c, PinyinFormat pinyinFormat) {
        String pinyin = PINYIN_TABLE.get(String.valueOf(c));
        if ((pinyin != null) && (!"null".equals(pinyin))) {
            // Formatting can collapse readings that differ only by tone, so de-duplicate
            // while keeping the original order.
            Set<String> set = new LinkedHashSet<String>();
            Collections.addAll(set, formatPinyin(pinyin, pinyinFormat));
            return set.toArray(new String[set.size()]);
        }
        return new String[0];
    }

    /**
     * Returns the distinct tone-marked pinyin readings of a single character.
     *
     * @param c the character to convert
     * @return the readings, or an empty array when the character has no table entry
     */
    public static String[] convertToPinyinArray(char c) {
        return convertToPinyinArray(c, PinyinFormat.WITH_TONE_MARK);
    }

    /**
     * Converts a string to pinyin, joining the converted elements with {@code separator}.
     *
     * <p>The input is first passed through {@code ChineseHelper.convertToSimplifiedChinese}.
     * At each position a word from the multi-pinyin dictionary is preferred (the last entry
     * returned by the trie's common-prefix search is used); otherwise a Chinese character is
     * replaced by its first reading and any other character is copied unchanged.
     *
     * @param str          the text to convert
     * @param separator    text inserted between consecutive output elements
     * @param pinyinFormat output format
     * @return the converted string
     * @throws PinyinException if a Chinese character has no pinyin in the table
     */
    public static String convertToPinyinString(String str, String separator, PinyinFormat pinyinFormat)
            throws PinyinException {
        str = ChineseHelper.convertToSimplifiedChinese(str);
        StringBuilder sb = new StringBuilder();
        int i = 0;
        int strLen = str.length();
        while (i < strLen) {
            List<Integer> commonPrefixList = DOUBLE_ARRAY_TRIE.commonPrefixSearch(str.substring(i));
            if (commonPrefixList.isEmpty()) {
                char c = str.charAt(i);
                if (ChineseHelper.isChinese(c)) {
                    String[] pinyinArray = convertToPinyinArray(c, pinyinFormat);
                    if (pinyinArray.length == 0) {
                        throw new PinyinException("Can't convert to pinyin: " + c);
                    }
                    sb.append(pinyinArray[0]);
                } else {
                    sb.append(c);
                }
                i++;
            } else {
                String words = dict.get(commonPrefixList.get(commonPrefixList.size() - 1));
                String[] pinyinArray = formatPinyin(MUTIL_PINYIN_TABLE.get(words), pinyinFormat);
                for (int j = 0, l = pinyinArray.length; j < l; j++) {
                    sb.append(pinyinArray[j]);
                    if (j < l - 1) {
                        sb.append(separator);
                    }
                }
                i += words.length();
            }

            if (i < strLen) {
                sb.append(separator);
            }
        }
        return sb.toString();
    }

    /**
     * Converts a string to tone-marked pinyin, joining the elements with {@code separator}.
     *
     * @param str       the text to convert
     * @param separator text inserted between consecutive output elements
     * @return the converted string
     * @throws PinyinException if a Chinese character has no pinyin in the table
     */
    public static String convertToPinyinString(String str, String separator) throws PinyinException {
        return convertToPinyinString(str, separator, PinyinFormat.WITH_TONE_MARK);
    }

    /**
     * Tells whether a character has more than one distinct tone-marked reading.
     *
     * @param c the character to check
     * @return {@code true} if the pinyin table lists at least two distinct readings
     */
    public static boolean hasMultiPinyin(char c) {
        return convertToPinyinArray(c).length > 1;
    }

    /**
     * Returns the string with every run of Chinese characters replaced by the first letters
     * of its pinyin syllables; all other characters are copied unchanged.
     *
     * @param str the text to abbreviate
     * @return the abbreviated string
     * @throws PinyinException if a Chinese character has no pinyin in the table
     */
    public static String getShortPinyin(String str) throws PinyinException {
        String separator = "#";
        int len = str.length();
        StringBuilder shortPinyin = new StringBuilder(len);
        StringBuilder hanziRun = new StringBuilder();

        int i = 0;
        while (i < len) {
            char c = str.charAt(i);
            if (!ChineseHelper.isChinese(c)) {
                shortPinyin.append(c);
                i++;
                continue;
            }

            // Collect the whole run of consecutive Chinese characters so that
            // dictionary words inside it can be recognised.
            hanziRun.setLength(0);
            while (i < len && ChineseHelper.isChinese(str.charAt(i))) {
                hanziRun.append(str.charAt(i));
                i++;
            }

            String hanziPinyin = convertToPinyinString(hanziRun.toString(), separator, PinyinFormat.WITH_TONE_NUMBER);
            for (String syllable : hanziPinyin.split(separator)) {
                // Appending (instead of writing into a fixed-size array) keeps this safe
                // even if a dictionary word yields a syllable count that differs from its
                // character count.
                if (syllable.length() > 0) {
                    shortPinyin.append(syllable.charAt(0));
                }
            }
        }
        return shortPinyin.toString();
    }

    /**
     * Merges the entries of a user dictionary file into the single-character pinyin table.
     *
     * @param path path of the dictionary file
     * @throws FileNotFoundException if the file cannot be opened
     */
    public static void addPinyinDict(String path) throws FileNotFoundException {
        PINYIN_TABLE.putAll(PinyinResource.getResource(PinyinResource.newFileReader(path)));
    }

    /**
     * Merges the entries of a user dictionary file into the multi-character word table and
     * rebuilds the word list and trie from the merged table.
     *
     * @param path path of the dictionary file
     * @throws FileNotFoundException if the file cannot be opened
     */
    public static void addMutilPinyinDict(String path) throws FileNotFoundException {
        MUTIL_PINYIN_TABLE.putAll(PinyinResource.getResource(PinyinResource.newFileReader(path)));
        dict.clear();
        DOUBLE_ARRAY_TRIE.clear();
        loadWordsIntoTrie();
    }

    /**
     * Same as {@link #convertToAllShortPinyinString(String, PinyinFormat)} with
     * {@code PinyinFormat.WITH_TONE_NUMBER}, but does not propagate conversion failures.
     *
     * @param str the text to abbreviate
     * @return the comma-terminated list of abbreviations, or {@code null} if a
     *         {@code PinyinException} occurred (its stack trace is printed)
     */
    public static String getAllShortPinyin(String str) {
        String hanziPinyin = null;
        try {
            hanziPinyin = convertToAllShortPinyinString(str, PinyinFormat.WITH_TONE_NUMBER);
        } catch (PinyinException e) {
            // Existing contract: report the failure and signal it to the caller with null.
            e.printStackTrace();
        }

        return hanziPinyin;
    }

    /**
     * Builds every first-letter abbreviation of the input that results from choosing, for
     * each polyphonic character, any of its readings.
     *
     * <p>Dictionary words contribute the first letter of each of their syllables and are not
     * expanded. A standalone Chinese character contributes the first letter of its first
     * reading, and is expanded over the distinct first letters of its other readings.
     * Non-Chinese characters are copied unchanged.
     *
     * @param str          the text to abbreviate
     * @param pinyinFormat format used when looking up readings
     * @return the distinct abbreviations, each followed by {@code ","}; the abbreviation
     *         using the first reading everywhere comes first; an empty string when the
     *         abbreviation itself is empty
     * @throws PinyinException if a Chinese character has no pinyin in the table
     */
    public static String convertToAllShortPinyinString(String str, PinyinFormat pinyinFormat)
            throws PinyinException {
        str = ChineseHelper.convertToSimplifiedChinese(str);
        StringBuilder sb = new StringBuilder();
        // Parallel lists: offset in sb of a polyphonic character, and its distinct initials
        // (element 0 is the initial already written to sb).
        List<Integer> multiPositions = new ArrayList<Integer>();
        List<String[]> multiInitials = new ArrayList<String[]>();

        int i = 0;
        int strLen = str.length();
        while (i < strLen) {
            List<Integer> commonPrefixList = DOUBLE_ARRAY_TRIE.commonPrefixSearch(str.substring(i));
            if (commonPrefixList.isEmpty()) {
                char c = str.charAt(i);
                if (ChineseHelper.isChinese(c)) {
                    String[] pinyinArray = convertToPinyinArray(c, pinyinFormat);
                    if (pinyinArray.length == 0) {
                        throw new PinyinException("Can't convert to pinyin: " + c);
                    }
                    // Different readings often share an initial; keeping only distinct
                    // initials avoids generating duplicate candidates later.
                    Set<String> initials = new LinkedHashSet<String>();
                    for (String pinyin : pinyinArray) {
                        initials.add(String.valueOf(pinyin.charAt(0)));
                    }
                    if (initials.size() > 1) {
                        // Record the offset in the output, not in the input, so it stays
                        // valid regardless of how many letters earlier elements produced.
                        multiPositions.add(sb.length());
                        multiInitials.add(initials.toArray(new String[initials.size()]));
                    }
                    sb.append(pinyinArray[0].charAt(0));
                } else {
                    sb.append(c);
                }
                i++;
            } else {
                String words = dict.get(commonPrefixList.get(commonPrefixList.size() - 1));
                String[] pinyinArray = formatPinyin(MUTIL_PINYIN_TABLE.get(words), pinyinFormat);
                for (String syllable : pinyinArray) {
                    sb.append(syllable.charAt(0));
                }
                i += words.length();
            }
        }

        if (sb.length() == 0) {
            return "";
        }

        // Expand position by position. Every existing candidate still carries the first
        // initial at the current position, and the alternatives are distinct, so all
        // generated candidates are new and no de-duplication pass is needed.
        List<String> candidates = new ArrayList<String>();
        candidates.add(sb.toString());
        for (int p = 0; p < multiPositions.size(); p++) {
            int position = multiPositions.get(p);
            String[] initials = multiInitials.get(p);
            List<String> expanded = new ArrayList<String>();
            for (int k = 1; k < initials.length; k++) {
                for (String candidate : candidates) {
                    expanded.add(replace(candidate, position, initials[k]));
                }
            }
            candidates.addAll(expanded);
        }

        StringBuilder result = new StringBuilder();
        for (String candidate : candidates) {
            result.append(candidate).append(",");
        }
        return result.toString();
    }

    /**
     * Returns {@code str} with the character at {@code index} replaced by the first character
     * of {@code replace}.
     *
     * @param str     the source string; returned as is when {@code null}
     * @param index   position to overwrite; when out of range {@code str} is returned unchanged
     * @param replace text whose first character is written at {@code index}
     * @return the modified copy, or {@code str} itself when nothing was replaced
     */
    private static String replace(String str, int index, String replace) {
        if (str == null || index < 0 || index >= str.length()) {
            return str;
        }
        char[] chars = str.toCharArray();
        chars[index] = replace.charAt(0);
        return String.valueOf(chars);
    }
}