java根据中文包含词和中文排除词,匹配文章中的命中的词组规则-创新互联

文章:"小学生刚刚步入校园生活,还在适应期,虽然坐在教室,但是思绪不知道飘到哪里去了,这也是他们想象力如此丰富的原因";

创新互联为企业级客户提供一站式互联网+设计服务,主要包括成都网站设计、网站建设、APP软件开发、小程序制作、宣传片制作、LOGO设计等,帮助客户快速提升营销能力和企业形象。创新互联各部门都有经验丰富的团队,可以确保每一个作品的质量和创作周期;同时每年都有很多新员工加入,为我们带来大量新的创意。

+代表与,|代表或,整个规则必须用()括起来

当排除词的规则命中其中一个时,整篇文章视为不匹配,返回false

包含词规则:"(小学生|(思绪|想象力+适应期))"

排除词规则:"(小x生)"

源码:WordMatcher.java

源码中用到的 BoolParser 类(BoolParser.java)见我以前发布的文章《java 求字符串形式bool表达式的值》。

import org.apache.commons.lang3.StringUtils;

import java.util.*;
import java.util.function.Function;


public class WordMatcher {
    private final ListincludeExprList;
    private final ListexcludeExprList;

    private final MapwordMapId = new HashMap<>();
    private final MapidMapWord = new HashMap<>();
    private final Tree tree = new Tree();
    private SethitExpr = null;

    public WordMatcher(String includeExpr, String excludeExpr) {
        if (StringUtils.isBlank(includeExpr)) throw new RuntimeException("包含词不能为空");

        Function>function = (expr) ->{
            ListexprList = new ArrayList<>();
            StringBuilder newExpr = new StringBuilder();
            StringBuilder word = new StringBuilder();
            int level = 0;
            for (char aChar : expr.toCharArray()) {
                switch (aChar) {
                    case ' ':
                        continue;
                    case '(':
                        level++;
                        if (level == 1) continue;
                        if (level >1) newExpr.append(aChar);
                        continue;
                    case ')':
                        level--;
                        break;
                    case '|':
                        break;
                    case '+':
                        aChar = '&';
                        break;
                    default:
                        word.append(aChar);
                        continue;

                }

                String s = word.toString();
                if (s.isEmpty()) {
                    if( level == 0) break;
                    throw new RuntimeException("表达式语法错误:" + expr);
                }
                tree.insert(s);
                char id;
                if (!wordMapId.containsKey(s)) {
                    char c = (char) (wordMapId.size() + 256);
                    wordMapId.put(s, c);
                    idMapWord.put(c, s);
                    id = c;
                } else {
                    id = wordMapId.get(s);
                }
                word = new StringBuilder();
                newExpr.append(id);
                if (level >1 || (level == 1 && aChar != '|')) newExpr.append(aChar);
                if (level == 1&& aChar == '|') {
                    exprList.add(newExpr.toString());
                    newExpr = new StringBuilder();
                }
            }

            if (level != 0 || word.length() != 0) throw new RuntimeException("表达式语法错误:" + expr);
            if (newExpr.length() != 0) {
                exprList.add(newExpr.toString());
            }
            return exprList;
        };
        this.includeExprList = function.apply(includeExpr);
        if (includeExprList.isEmpty()) throw new RuntimeException("包含词表达式不能为空");

        if (StringUtils.isBlank(excludeExpr)) this.excludeExprList = new ArrayList<>();
        else this.excludeExprList = function.apply(excludeExpr);
    }

    public boolean match(String content) {
        Objects.requireNonNull(content);
        if (content.isEmpty()) throw new RuntimeException("empty string");
        SetexistWords = new HashSet<>();
        Listbuilders = new ArrayList<>();
        for (char c : content.toCharArray()) {
            {
                for (StringBuilder builder : builders) {
                    builder.append(c);
                }
                String str = c + "";
                boolean b = tree.containsTheWord(str);
                if (b) {
                    builders.add(new StringBuilder(str));
                }
                if (tree.existTheWord(str)) {
                    existWords.add(str);
                }
            }
            builders.removeIf(builder ->{
                if (builder.length() == 1) return false;
                String str = builder.toString();
                boolean b = tree.containsTheWord(str);
                if (!b) return true;
                boolean exist = tree.existTheWord(str);
                if (exist) {
                    existWords.add(str);
                }
                return false;
            });
        }
        for (String exclude : excludeExprList) {
            for (String word : wordMapId.keySet()) {
                String id = wordMapId.get(word) + "";
                if (existWords.contains(word)) {
                    exclude = exclude.replaceAll(id + "", "T");
                } else {
                    exclude = exclude.replaceAll(id + "", "F");
                }
            }
            if (BoolParser.parse(exclude)) {
                hitExpr = new HashSet<>();
                return false;
            }
        }
        SetexprSet = new HashSet<>();
        boolean result = false;
        for (String include : includeExprList) {
            String includeTemp = include;
            for (String word : wordMapId.keySet()) {
                String id = wordMapId.get(word) + "";
                if (existWords.contains(word)) {
                    include = include.replaceAll(id + "", "T");
                } else {
                    include = include.replaceAll(id + "", "F");
                }
            }
            boolean parse = BoolParser.parse(include);
            if (parse) {
                for (Character id : idMapWord.keySet()) {
                    String word = idMapWord.get(id);
                    includeTemp = includeTemp.replaceAll(id + "", word);
                }
                exprSet.add(includeTemp.replaceAll("&", "+"));
            }
            if (!result) result = BoolParser.parse(include);
        }
        hitExpr = exprSet;
        return result;
    }

    public SethitExpr() {
        if (hitExpr == null) throw new RuntimeException("请先匹配文章");
        return hitExpr;
    }
      //字典树
    private static class Tree {
        private final Mapnodes = new HashMap<>();

        public Tree() {
        }

        public void insert(String word) {
            Objects.requireNonNull(word);
            if (word.isEmpty()) return;
            char[] chars = word.toCharArray();
            Node head = nodes.computeIfAbsent(chars[0], Node::new);
            for (int i = 1; i< chars.length; i++) {
                char aChar = chars[i];
                head = head.putChild(aChar);
            }
        }

        public boolean containsTheWord(String word) {
            Objects.requireNonNull(word);
            if (word.isEmpty()) throw new RuntimeException("empty string");
            char[] chars = word.toCharArray();
            if (!nodes.containsKey(chars[0])) return false;
            Node node = nodes.get(chars[0]);
            for (int i = 1; i< chars.length; i++) {
                char aChar = chars[i];
                Node child = node.getChild(aChar);
                if (child == null) return false;
                node = child;
            }
            return true;
        }

        public boolean existTheWord(String word) {
            Objects.requireNonNull(word);
            if (word.isEmpty()) throw new RuntimeException("empty string");
            char[] chars = word.toCharArray();
            if (!nodes.containsKey(chars[0])) return false;
            Node node = nodes.get(chars[0]);
            for (int i = 1; i< chars.length; i++) {
                char aChar = chars[i];
                Node child = node.getChild(aChar);
                if (child == null) return false;
                node = child;
            }
            return node.isEnd();
        }
    }

    private static class Node {
        protected final char value;
        private final MapchildNodes = new HashMap<>();

        public Node(char value) {
            this.value = value;
        }

        public Node putChild(char value) {
            return childNodes.computeIfAbsent(value, Node::new);
        }

        public Node getChild(char value) {
            return childNodes.get(value);
        }

        public boolean containsNode(char value) {
            return childNodes.containsKey(value);
        }

        public boolean isEnd() {
            return childNodes.isEmpty();
        }
    }
}

使用示例1

public static void main(String[] args) {
        WordMatcher matcher = new WordMatcher("(小学生|(思绪|想象力+适应a))", "(小x生)");
        boolean result = matcher.match("小学生刚刚步入校园生活,还在适应期,虽然坐在教室,但是思绪不知道飘到哪里去了,这也是他们想象力如此丰富的原因");
        // The exclude word "小x生" does not occur in the article, so the exclude rule does not
        // fire; the include alternative "小学生" does occur, so the article matches.
        System.out.println(result);//true
        // The hit set contains 小学生. (思绪|想象力+适应a) is reported as well if BoolParser
        // evaluates '+' (AND) before '|' (OR) -- TODO confirm against BoolParser.
        System.out.println(matcher.hitExpr());
    }

使用示例2

public static void main(String[] args) {
    // Example 2: a blank exclude rule means nothing is excluded.
    String article = "小学生刚刚步入校园生活,还在适应期,虽然坐在教室,但是思绪不知道飘到哪里去了,这也是他们想象力如此丰富的原因";
    WordMatcher wordMatcher = new WordMatcher("(小学生|(思绪|想象力+适应期))", "");

    boolean matched = wordMatcher.match(article);
    System.out.println(matched); // true

    // Both top-level alternatives are satisfied (set iteration order may vary):
    // [(思绪|想象力+适应期), 小学生]
    System.out.println(wordMatcher.hitExpr());
}

你是否还在寻找稳定的海外服务器提供商?创新互联www.cdcxhl.cn海外机房具备T级流量清洗系统配攻击溯源,准确流量调度确保服务器高可用性,企业级服务器适合批量采购,新人活动首月15元起,快前往官网查看详情吧


分享名称:java根据中文包含词和中文排除词,匹配文章中的命中的词组规则-创新互联
本文URL:http://scyanting.com/article/igdgd.html