有时候我们的数据量很少,但又需要全文检索。如果用 ES 的话就太重了,还要引入新的技术栈。当然,MySQL 高版本已经支持全文检索,
但是我们在低版本的情况下该怎么做呢?
工具类
@Slf4j public class HanLPUtil { // 分词需要跳过的片段:标点,语气,助词,动词等 // https://github.com/hankcs/HanLP/blob/1.x/data/dictionary/other/TagPKU98.csv private final static String[] SKIP_SEGMENT_NATURE = {"w", "y", "u", "v"}; private final static String[] SPECIA_KEYWORDS = {"'", "%", "_", "\\", "$", "^", "*", "(", "+", "?", "["}; private final static String[] REPLACE_KEYWORDS = {"\\\'", "\\%", "\\_", "\\\\\\\\", "\\$", "\\^", "\\*", "\\(", "\\+", "\\?", "\\["}; public static String segment(String input, String split) { return HanLPUtil.segment(input, split, 0); } public static String segment(String input, String split, int mode) { if (StringUtil.isEmptyOrNullString(input)) { return input; // null skipped. } input = StringUtil.trim(input.replace(split,"")); final List<Term> termList = IndexTokenizer.segment(input); // 跳过标点,语气词,助词,动词等 final List<Term> filteredTermList = termList.stream()// .filter(e -> !needSkip(e)).collect(Collectors.toList()); final List<String> filteredWords = new ArrayList<>(); // 全文也加入分词结果 filteredWords.add(input); // 两两拼接(跳过单个字符) if (mode == 0) { for (int i = 0; i < filteredTermList.size(); i++) { final Term current = filteredTermList.get(i); final Term prev = (i - 1 >= 0) ? 
filteredTermList.get(i - 1) : null; if (null == prev) { continue; } String word = StringUtil.trim(prev.word + current.word); if (StringUtil.isBlank(word) || StringUtil.equals(split, word) || word.length() < 1) { continue; } filteredWords.add(word); } } // 直接过滤后分词结果(跳过单个字符) else if (mode == 1) { for (Term term : filteredTermList) { String word = StringUtil.trim(term.word); if (StringUtil.isBlank(word) || StringUtil.equals(split, word) || word.length() < 1) { continue; } filteredWords.add(word); } } if (log.isDebugEnabled()) { log.info("#1103 HanLPUtil.segment() input={}, mode={}, result={}", input, mode, filteredWords); } // 最终结果以分隔符拼接 return StringUtil.join(filteredWords, split); } private static boolean needSkip(Term term) { if (null == term || null == term.word) return true; for (String nature : SKIP_SEGMENT_NATURE) { if (term.nature.startsWith(nature)) return true; } return false; } /** * SQL的LIKE/REGEXP查询语句中,有一些特殊的字符,需要转换后才能搜索到结果: * ':用于包裹搜索条件,需转为\'; * %:用于代替任意数目的任意字符,需转换为\%; * _:用于代替一个任意字符,需转换为\_; * \:转义符号,需转换为\\\\。 * .... */ public static String filterSpecia(String keyword) { if (StringUtil.isEmptyOrNullString(keyword)) return keyword; final StringBuilder result = new StringBuilder(); for (char keywordChar : keyword.toCharArray()) { boolean isReplace = false; for (int i = 0; i < SPECIA_KEYWORDS.length; i++) { if ((keywordChar + "").equals(SPECIA_KEYWORDS[i])) { result.append(REPLACE_KEYWORDS[i]); isReplace = true; break; } } if (!isReplace) { result.append(keywordChar); } } return result.toString(); } }
1.先转义
// 特殊字符转义
searchKey = HanLPUtil.filterSpecia(searchKey);
2.分词条件
HanLPUtil.segment(filterSpeciaWithSearchKey, "|", 1)
3.搜索条件
a.body REGEXP '还有一些特殊字符|一些|特殊|字符' OR body like '%有一些特殊字符一些特殊字符'
标签:word,String,final,StringUtil,return,搜索,mysql,input,分词 From: https://www.cnblogs.com/LQBlog/p/17125936.html