{
// Private constructor: instances are only obtained via getInstance() / the
// SingleFactory holder. The argument is the executor's display name
// ("dat double-array algorithm desensitization implementation").
private DatFilterExecutor() {
super("dat 双数组算法脱敏实现");
}
// Initialization-on-demand holder idiom: INSTANCE is created lazily and
// thread-safely when the JVM first loads SingleFactory.
private static class SingleFactory {
private static final DatFilterExecutor INSTANCE = new DatFilterExecutor();
}
/**
 * Returns the process-wide singleton executor.
 *
 * <p>The redundant {@code final} modifier was removed: {@code static} methods
 * are not subject to overriding, so it added nothing.
 *
 * @return the shared {@link DatFilterExecutor} instance
 */
public static DatFilterExecutor getInstance() {
    return SingleFactory.INSTANCE;
}
@Override
protected DatCacheNode getCacheNodes() {
// Fresh, empty cache container; it is populated later through put().
return new DatCacheNode();
}
/**
 * Adds one sensitive word to the cache.
 *
 * <p>The word set keeps every complete word; the char set keeps every character
 * occurring in any word and is used by {@code processor} as a fast pre-filter.
 *
 * @param word candidate word; blank input and words shorter than two characters
 *             (after trimming) are rejected
 * @return {@code true} if the word was stored, {@code false} if rejected
 */
@Override
protected boolean put(String word) throws RuntimeException {
    if (StringUtils.isBlank(word)) {
        return false;
    }
    word = StringUtils.trim(word);
    if (word.length() < 2) {
        // Single characters are never treated as sensitive words.
        return false;
    }
    cacheNodes.getWords().add(word);
    // Iterate with the primitive char to avoid boxing every element up front;
    // add(...) still boxes once when inserting into the Character set.
    for (char ch : word.toCharArray()) {
        cacheNodes.getChars().add(ch);
    }
    return true;
}
/**
 * Scans {@code content} for cached sensitive words, reporting each hit to
 * {@code callback}.
 *
 * @param partMatch when {@code true}, scanning resumes after a matched word
 *                  instead of continuing to extend/overlap it
 * @param content   text to scan; blank or single-character input is ignored
 * @param callback  receives each match; returning {@code true} stops the scan
 * @return {@code true} if the callback short-circuited the scan
 */
@Override
protected boolean processor(boolean partMatch, String content, Callback callback) throws RuntimeException {
    if (StringUtils.isBlank(content)) {
        return false;
    }
    content = StringUtils.trim(content);
    if (content.length() < 2) {
        return false;
    }
    for (int i = 0; i < content.length(); i++) {
        // Pre-filter: skip characters that occur in no sensitive word.
        if (!cacheNodes.getChars().contains(content.charAt(i))) {
            continue;
        }
        int j = i + 1;
        while (j < content.length()) {
            // Stop extending the candidate as soon as a non-dictionary char appears.
            if (!cacheNodes.getChars().contains(content.charAt(j))) {
                break;
            }
            String word = content.substring(i, j + 1);
            if (cacheNodes.getWords().contains(word)) {
                if (callback.call(word)) {
                    return true;
                }
                if (partMatch) {
                    // BUGFIX: the previous code did "i += word.length()" WITHOUT
                    // leaving the inner loop, so later substring(i, j + 1) calls
                    // used mismatched indices, and the outer i++ then skipped one
                    // extra character. Resume right after the matched word: the
                    // outer for-loop increment moves i to j + 1.
                    i = j;
                    break;
                }
            }
            j++;
        }
    }
    return false;
}
/** Demo entry point: loads a small dictionary and prints filtering results. */
public static void main(String[] args) {
    DatFilterExecutor executor = DatFilterExecutor.getInstance();
    executor.init();
    String[] dictionary = {
            "中国人", "中国男人", "中国人民", "人民", "中间", "女人",
            "一举", "一举成名", "一举成名走四方", "成名", "走四方"
    };
    for (String word : dictionary) {
        executor.put(word);
    }
    String content = "我们中国人都是好人,在他们中间有男人和女人。中国男人很惨,中国人民长期被压迫。";
    System.out.println(executor.getWords(true, content));
    System.out.println(executor.getWords(false, content));
    System.out.println(executor.filter(false, content, '*'));
    System.out.println(executor.highlight(false, content));
    content = "一举成名走四方的是什么";
    System.out.println(executor.getWords(true, content));
    System.out.println(executor.getWords(false, content));
    System.out.println(executor.filter(false, content, '*'));
    System.out.println(executor.highlight(false, content));
}
}
================================================
FILE: src/main/java/com/cnblogs/hoojo/sensitivewords/filter/dat/exectuor/DoubleArrayTrie2.java
================================================
package com.cnblogs.hoojo.sensitivewords.filter.dat.exectuor;
/**
* DoubleArrayTrie: Java implementation of Darts (Double-ARray Trie System)
*
*
* Copyright(C) 2001-2007 Taku Kudo <taku@chasen.org>
* Copyright(C) 2009 MURAWAKI Yugo <murawaki@nlp.kuee.kyoto-u.ac.jp>
* Copyright(C) 2012 KOMIYA Atsushi <komiya.atsushi@gmail.com>
*
*
*
* The contents of this file may be used under the terms of either of the GNU
* Lesser General Public License Version 2.1 or later (the "LGPL"), or the BSD
* License (the "BSD").
*
*/
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import com.google.common.collect.Lists;
public class DoubleArrayTrie2 {

    /** Buffer size used when reading a persisted trie from disk. */
    private static final int BUF_SIZE = 16384;
    /** Bytes per persisted unit: one int for base plus one int for check. */
    private static final int UNIT_SIZE = 8;

    /** Transient tree node used only while building the double array. */
    private static class Node {
        int code;   // char value + 1; 0 marks "end of key"
        int depth;  // node depth == length of the prefix it represents
        int left;   // first index (in the sorted key list) covered by this node
        int right;  // one past the last covered key index
    }

    private int[] check;
    private int[] base;
    /** Marks base offsets already claimed during construction; null outside build. */
    private boolean[] used;
    private int size;
    private int allocSize;
    private List<String> key;
    private int keySize;
    private int[] length;
    private int[] value;
    private int progress;
    private int nextCheckPos;

    /** 0 on success; -3 when keys were not sorted, -2 when a value clashed. */
    int error_;

    /** Grows base/check/used to {@code newSize}, preserving existing content. */
    private int resize(int newSize) {
        int[] newBase = new int[newSize];
        int[] newCheck = new int[newSize];
        boolean[] newUsed = new boolean[newSize];
        if (allocSize > 0) {
            System.arraycopy(base, 0, newBase, 0, allocSize);
            System.arraycopy(check, 0, newCheck, 0, allocSize);
            // BUGFIX: the original copied the NEW array onto itself
            // (arraycopy(used2, 0, used2, ...)), silently dropping every
            // "offset already claimed" flag on a mid-build resize. Copy from
            // the live array when it exists (build()/put() null it when done).
            if (used != null) {
                System.arraycopy(used, 0, newUsed, 0, allocSize);
            }
        }
        base = newBase;
        check = newCheck;
        used = newUsed;
        return allocSize = newSize;
    }

    /**
     * Collects the distinct child nodes of {@code parent} into {@code siblings}.
     *
     * @return number of siblings found; 0 on error (unsorted keys set error_ = -3)
     */
    private int fetch(Node parent, List<Node> siblings) {
        if (error_ < 0)
            return 0;
        int prev = 0;
        for (int i = parent.left; i < parent.right; i++) {
            if ((length != null ? length[i] : key.get(i).length()) < parent.depth)
                continue;
            String tmp = key.get(i);
            int cur = 0;
            if ((length != null ? length[i] : tmp.length()) != parent.depth)
                cur = (int) tmp.charAt(parent.depth) + 1;
            if (prev > cur) {
                // Keys must be lexicographically sorted for the build to work.
                error_ = -3;
                return 0;
            }
            if (cur != prev || siblings.size() == 0) {
                Node tmpNode = new Node();
                tmpNode.depth = parent.depth + 1;
                tmpNode.code = cur;
                tmpNode.left = i;
                if (siblings.size() != 0)
                    siblings.get(siblings.size() - 1).right = i;
                siblings.add(tmpNode);
            }
            prev = cur;
        }
        if (siblings.size() != 0)
            siblings.get(siblings.size() - 1).right = parent.right;
        return siblings.size();
    }

    /**
     * Finds a free base offset for {@code siblings}, writes their check/base
     * cells, and recurses into each child's sub-tree.
     *
     * @return the chosen base offset
     */
    private int insert(List<Node> siblings) {
        if (error_ < 0)
            return 0;
        int begin = 0;
        int pos = ((siblings.get(0).code + 1 > nextCheckPos) ? siblings.get(0).code + 1 : nextCheckPos) - 1;
        int nonzeroNum = 0;
        int first = 0;
        if (allocSize <= pos)
            resize(pos + 1);
        outer: while (true) {
            pos++;
            if (allocSize <= pos)
                resize(pos + 1);
            if (check[pos] != 0) {
                nonzeroNum++;
                continue;
            } else if (first == 0) {
                nextCheckPos = pos;
                first = 1;
            }
            begin = pos - siblings.get(0).code;
            if (allocSize <= (begin + siblings.get(siblings.size() - 1).code)) {
                // progress can be zero; grow by at least 5%.
                double l = (1.05 > 1.0 * keySize / (progress + 1)) ? 1.05 : 1.0 * keySize / (progress + 1);
                resize((int) (allocSize * l));
            }
            if (used[begin])
                continue;
            for (int i = 1; i < siblings.size(); i++)
                if (check[begin + siblings.get(i).code] != 0)
                    continue outer;
            break;
        }
        // Heuristic: if the scanned region between nextCheckPos and pos was
        // >= 95% occupied, start the next search at pos instead of re-scanning.
        if (1.0 * nonzeroNum / (pos - nextCheckPos + 1) >= 0.95)
            nextCheckPos = pos;
        used[begin] = true;
        size = (size > begin + siblings.get(siblings.size() - 1).code + 1) ? size
                : begin + siblings.get(siblings.size() - 1).code + 1;
        for (int i = 0; i < siblings.size(); i++)
            check[begin + siblings.get(i).code] = begin;
        for (int i = 0; i < siblings.size(); i++) {
            List<Node> newSiblings = new ArrayList<>();
            if (fetch(siblings.get(i), newSiblings) == 0) {
                // Leaf: store the value (or the key index) negated so searches
                // can recognize terminals via base < 0.
                base[begin + siblings.get(i).code] = (value != null) ? (-value[siblings.get(i).left] - 1)
                        : (-siblings.get(i).left - 1);
                if (value != null && (-value[siblings.get(i).left] - 1) >= 0) {
                    error_ = -2;
                    return 0;
                }
                progress++;
            } else {
                base[begin + siblings.get(i).code] = insert(newSiblings);
            }
        }
        return begin;
    }

    public DoubleArrayTrie2() {
        check = null;
        base = null;
        used = null;
        size = 0;
        allocSize = 0;
        error_ = 0;
    }

    /** Releases the trie arrays. */
    void clear() {
        check = null;
        base = null;
        used = null;
        allocSize = 0;
        size = 0;
    }

    public int getUnitSize() {
        return UNIT_SIZE;
    }

    public int getSize() {
        return size;
    }

    public int getTotalSize() {
        return size * UNIT_SIZE;
    }

    /** @return number of occupied (check != 0) cells, for diagnostics */
    public int getNonzeroSize() {
        int result = 0;
        for (int i = 0; i < size; i++)
            if (check[i] != 0)
                result++;
        return result;
    }

    /**
     * Builds the trie from a lexicographically sorted key list.
     *
     * @return 0 on success, a negative error code otherwise
     */
    public int build(List<String> key) {
        return build(key, null, null, key.size());
    }

    public int build(List<String> _key, int[] _length, int[] _value, int _keySize) {
        // BUGFIX: the null check must run before _key.size() dereferences _key.
        if (_key == null || _keySize > _key.size())
            return 0;
        key = _key;
        length = _length;
        keySize = _keySize;
        value = _value;
        progress = 0;
        resize(65536 * 32);
        base[0] = 1;
        nextCheckPos = 0;
        Node rootNode = new Node();
        rootNode.left = 0;
        rootNode.right = keySize;
        rootNode.depth = 0;
        List<Node> siblings = new ArrayList<>();
        fetch(rootNode, siblings);
        insert(siblings);
        used = null;
        key = null;
        return error_;
    }

    /**
     * Adds a single key and rebuilds.
     *
     * <p>NOTE(review): {@code key} is reset to null at the end, so consecutive
     * calls each rebuild from only the newly added key, writing over the
     * previous arrays. This matches the original behavior but looks suspect —
     * verify against callers before relying on incremental inserts.
     */
    public int put(String _key, int[] _length, int[] _value) {
        if (_key == null)
            return 0;
        if (key == null) {
            key = new ArrayList<>();
        }
        key.add(_key);
        keySize = key.size();
        length = _length;
        value = _value;
        progress = 0;
        resize(65536 * 32);
        base[0] = 1;
        nextCheckPos = 0;
        Node rootNode = new Node();
        rootNode.left = 0;
        rootNode.right = keySize;
        rootNode.depth = 0;
        List<Node> siblings = new ArrayList<>();
        fetch(rootNode, siblings);
        insert(siblings);
        used = null;
        key = null;
        return error_;
    }

    /** Loads a trie previously written by {@link #save(String)}. */
    public void open(String fileName) throws IOException {
        File file = new File(fileName);
        size = (int) file.length() / UNIT_SIZE;
        check = new int[size];
        base = new int[size];
        try (DataInputStream is = new DataInputStream(
                new BufferedInputStream(new FileInputStream(file), BUF_SIZE))) {
            for (int i = 0; i < size; i++) {
                base[i] = is.readInt();
                check[i] = is.readInt();
            }
        }
    }

    /** Persists base/check pairs so the trie can be reloaded without rebuilding. */
    public void save(String fileName) throws IOException {
        try (DataOutputStream out = new DataOutputStream(
                new BufferedOutputStream(new FileOutputStream(fileName)))) {
            for (int i = 0; i < size; i++) {
                out.writeInt(base[i]);
                out.writeInt(check[i]);
            }
        }
    }

    /** @return index of {@code key} in the build list, or -1 when absent */
    public int exactMatchSearch(String key) {
        return exactMatchSearch(key, 0, 0, 0);
    }

    public int exactMatchSearch(String key, int pos, int len, int nodePos) {
        if (len <= 0)
            len = key.length();
        if (nodePos <= 0)
            nodePos = 0;
        int result = -1;
        char[] keyChars = key.toCharArray();
        int b = base[nodePos];
        int p;
        for (int i = pos; i < len; i++) {
            p = b + (int) (keyChars[i]) + 1;
            if (b == check[p])
                b = base[p];
            else
                return result;
        }
        // A terminal is encoded as a negative base at the end-of-key cell.
        p = b;
        int n = base[p];
        if (b == check[p] && n < 0) {
            result = -n - 1;
        }
        return result;
    }

    /** @return indices of every stored key that is a prefix of {@code key} */
    public List<Integer> commonPrefixSearch(String key) {
        return commonPrefixSearch(key, 0, 0, 0);
    }

    public List<Integer> commonPrefixSearch(String key, int pos, int len, int nodePos) {
        if (len <= 0)
            len = key.length();
        if (nodePos <= 0)
            nodePos = 0;
        List<Integer> result = new ArrayList<>();
        char[] keyChars = key.toCharArray();
        int b = base[nodePos];
        int n;
        int p;
        for (int i = pos; i < len; i++) {
            // Record a hit if the prefix consumed so far is itself a stored key.
            p = b;
            n = base[p];
            if (b == check[p] && n < 0) {
                result.add(-n - 1);
            }
            p = b + (int) (keyChars[i]) + 1;
            if (b == check[p])
                b = base[p];
            else
                return result;
        }
        p = b;
        n = base[p];
        if (b == check[p] && n < 0) {
            result.add(-n - 1);
        }
        return result;
    }

    /** Debug dump of every base/check pair to stderr. */
    public void dump() {
        for (int i = 0; i < size; i++) {
            System.err.println("i: " + i + " [" + base[i] + ", " + check[i] + "]");
        }
    }

    public static void main(String[] args) {
        DoubleArrayTrie2 dat = new DoubleArrayTrie2();
        List<String> list = new ArrayList<>();
        list.add("成名");
        list.add("走四方");
        Collections.sort(list);
        dat.build(list);
        dat.put("一举", null, null);
        dat.put("一举成名", null, null);
        String content = "一举成名走四方的是什么";
        List<Integer> rect = dat.commonPrefixSearch(content);
        System.out.println();
        for (int index : rect) {
            System.out.println("前缀 " + list.get(index) + " matched");
        }
        // Check whether the key exactly matches some term in the dictionary.
        int index = dat.exactMatchSearch("成");
        if (index >= 0) {
            System.out.println(content + " match " + list.get(index));
        } else {
            System.out.println(content + " not match any term");
        }
    }
}
================================================
FILE: src/main/java/com/cnblogs/hoojo/sensitivewords/filter/dfa/DfaFilter.java
================================================
package com.cnblogs.hoojo.sensitivewords.filter.dfa;
import com.cnblogs.hoojo.sensitivewords.filter.AbstractFilter;
import com.cnblogs.hoojo.sensitivewords.filter.dfa.executor.DfaFilterExecutor;
/**
* DFA 算法实现敏感词脱敏过滤
*
* @author hoojo
* @createDate 2018年2月2日 下午4:23:20
* @file DFASWFilter.java
* @package com.cnblogs.hoojo.sensitivewords.support.dfa
* @project fengkong-service-provider
* @blog http://hoojo.cnblogs.com
* @email hoojo_@126.com
* @version 1.0
*/
public class DfaFilter extends AbstractFilter {
// Wires the shared DFA executor singleton into the generic filter facade.
public DfaFilter() {
super(DfaFilterExecutor.getInstance());
}
}
================================================
FILE: src/main/java/com/cnblogs/hoojo/sensitivewords/filter/dfa/executor/DfaFilterExecutor.java
================================================
package com.cnblogs.hoojo.sensitivewords.filter.dfa.executor;
import java.util.HashMap;
import org.apache.commons.lang.StringUtils;
import com.cnblogs.hoojo.sensitivewords.filter.AbstractFilterExecutor;
import com.google.common.collect.Maps;
/**
* DFA 脱敏算法实现支持类
*
* @author hoojo
* @createDate 2018年2月9日 上午10:34:42
* @file DfaFilterExecutor.java
* @project fengkong-service-provider
* @blog http://hoojo.cnblogs.com
* @email hoojo_@126.com
* @version 1.0
*/
public final class DfaFilterExecutor extends AbstractFilterExecutor> {
private static class SingleFactory {
private static final DfaFilterExecutor INSTANCE = new DfaFilterExecutor();
}
public static final DfaFilterExecutor getInstance() {
return SingleFactory.INSTANCE;
}
private DfaFilterExecutor() {
super("DFA 脱敏算法实现支持类");
}
@Override
protected boolean put(String word) throws RuntimeException {
if (StringUtils.isBlank(word)) {
return false;
}
word = StringUtils.trim(word);
if (word.length() < 2) {
return false;
}
Character fisrtChar = word.charAt(0);
DfaNode node = cacheNodes.get(fisrtChar);
if (node == null) {
node = new DfaNode(fisrtChar);
cacheNodes.put(fisrtChar, node);
}
for (int i = 1; i < word.length(); i++) {
Character nextChar = word.charAt(i);
DfaNode nextNode = null;
if (!node.isLeaf()) {
nextNode = node.getChilds().get(nextChar);
}
if (nextNode == null) {
nextNode = new DfaNode(nextChar);
}
node.addChild(nextNode);
node = nextNode;
if (i == word.length() - 1) {
node.setWord(true);
}
}
return true;
}
@Override
protected boolean processor(boolean partMatch, String content, Callback callback) throws RuntimeException {
if (StringUtils.isBlank(content)) {
return false;
}
content = StringUtils.trim(content);
if (content.length() < 2) {
return false;
}
for (int index = 0; index < content.length(); index++) {
char fisrtChar = content.charAt(index);
DfaNode node = cacheNodes.get(fisrtChar);
if (node == null || node.isLeaf()) {
continue;
}
int charCount = 1;
for (int i = index + 1; i < content.length(); i++) {
char wordChar = content.charAt(i);
node = node.getChilds().get(wordChar);
if (node != null) {
charCount++;
} else {
break;
}
if (partMatch && node.isWord()) {
if (callback.call(StringUtils.substring(content, index, index + charCount))) {
return true;
}
break;
} else if (node.isWord()) {
if (callback.call(StringUtils.substring(content, index, index + charCount))) {
return true;
}
}
if (node.isLeaf()) {
break;
}
}
if (partMatch) {
index += charCount;
}
}
return false;
}
@Override
protected HashMap getCacheNodes() {
return Maps.newHashMap();
}
public static void main(String[] args) {
DfaFilterExecutor.getInstance().init();
DfaFilterExecutor.getInstance().put("中国人");
DfaFilterExecutor.getInstance().put("中国男人");
DfaFilterExecutor.getInstance().put("中国人民");
DfaFilterExecutor.getInstance().put("人民");
DfaFilterExecutor.getInstance().put("中间");
DfaFilterExecutor.getInstance().put("女人");
DfaFilterExecutor.getInstance().put("一举");
DfaFilterExecutor.getInstance().put("一举成名");
DfaFilterExecutor.getInstance().put("一举成名走四方");
DfaFilterExecutor.getInstance().put("成名");
DfaFilterExecutor.getInstance().put("走四方");
String content = "我们中国人都是好人,在他们中间有男人和女人。中国男人很惨,中国人民长期被压迫。";
System.out.println(DfaFilterExecutor.getInstance().contains(true, content));
System.out.println(DfaFilterExecutor.getInstance().getWords(true, content));
System.out.println(DfaFilterExecutor.getInstance().getWords(false, content));
System.out.println(DfaFilterExecutor.getInstance().filter(false, content, '*'));
System.out.println(DfaFilterExecutor.getInstance().highlight(true, content));
System.out.println(DfaFilterExecutor.getInstance().highlight(false, content));
content = "一举成名走四方的是什么";
System.out.println(DfaFilterExecutor.getInstance().getWords(true, content));
System.out.println(DfaFilterExecutor.getInstance().getWords(false, content));
System.out.println(DfaFilterExecutor.getInstance().filter(false, content, '*'));
System.out.println(DfaFilterExecutor.getInstance().highlight(true, content));
System.out.println(DfaFilterExecutor.getInstance().highlight(false, content));
}
}
================================================
FILE: src/main/java/com/cnblogs/hoojo/sensitivewords/filter/dfa/executor/DfaNode.java
================================================
package com.cnblogs.hoojo.sensitivewords.filter.dfa.executor;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;

import com.google.common.collect.Maps;
/**
* dfa多叉树模型
* @author hoojo
* @createDate 2018年2月8日 下午8:23:27
* @file DfaNode.java
* @project fengkong-service-provider
* @blog http://hoojo.cnblogs.com
* @email hoojo_@126.com
* @version 1.0
*/
public class DfaNode {

    /** The character this node represents. */
    private char _char;
    /** Optional back-reference to the parent node (not maintained by addChild). */
    private DfaNode parent;
    /** True when the path from the root to this node spells a complete word. */
    private boolean word;
    /** Children keyed by their character; null until the first child is added. */
    private Map<Character, DfaNode> childs;

    public DfaNode() {
    }

    public DfaNode(char _char) {
        this._char = _char;
    }

    public boolean isWord() {
        return word;
    }

    public void setWord(boolean word) {
        this.word = word;
    }

    public boolean isLeaf() {
        return childs == null || childs.isEmpty();
    }

    public char getChar() {
        return _char;
    }

    public void setChar(char _char) {
        this._char = _char;
    }

    /** Adds (or replaces) the child registered under {@code child.getChar()}. */
    public void addChild(DfaNode child) {
        if (this.childs == null) {
            // Created lazily so leaf nodes stay lightweight.
            this.childs = new HashMap<>();
        }
        this.childs.put(child.getChar(), child);
    }

    public void removeChild(DfaNode child) {
        if (this.childs != null) {
            this.childs.remove(child.getChar());
        }
    }

    public DfaNode getParent() {
        return parent;
    }

    public void setParent(DfaNode parent) {
        this.parent = parent;
    }

    /** @return the child map, or {@code null} when this node is a leaf */
    public Map<Character, DfaNode> getChilds() {
        return this.childs;
    }

    public void setChilds(Map<Character, DfaNode> childs) {
        this.childs = childs;
    }

    /** Depth-first dump of the subtree rooted at {@code node}, one char per line. */
    public void print(DfaNode node) {
        System.out.println(node.getChar());
        if (node.getChilds() != null) {
            // Iterate values directly instead of keySet() + get() round trips.
            for (DfaNode child : node.getChilds().values()) {
                print(child);
            }
        }
    }

    public static void main(String[] args) {
        DfaNode node = new DfaNode('中');
        DfaNode g = new DfaNode('国');
        g.addChild(new DfaNode('人'));
        DfaNode n = new DfaNode('男');
        n.addChild(new DfaNode('人'));
        g.addChild(n);
        node.addChild(g);
        node.addChild(new DfaNode('间'));
        node.print(node);
    }
}
================================================
FILE: src/main/java/com/cnblogs/hoojo/sensitivewords/filter/simhash/SimHashFilter.java
================================================
package com.cnblogs.hoojo.sensitivewords.filter.simhash;
import com.cnblogs.hoojo.sensitivewords.filter.AbstractFilter;
import com.cnblogs.hoojo.sensitivewords.filter.simhash.executor.SimHashFilterExecutor;
/**
* simhash 算法
*
* @author hoojo
* @createDate 2018年3月23日 下午5:55:49
* @file SimHashFilter.java
* @package com.cnblogs.hoojo.sensitivewords.filter.simhash.executor
* @project fengkong-service-provider
* @blog http://hoojo.cnblogs.com
* @email hoojo_@126.com
* @version 1.0
*/
public class SimHashFilter extends AbstractFilter {
// Wires the shared simhash executor singleton into the generic filter facade.
public SimHashFilter() {
super(SimHashFilterExecutor.getInstance());
}
}
================================================
FILE: src/main/java/com/cnblogs/hoojo/sensitivewords/filter/simhash/executor/FNVHashUtils.java
================================================
package com.cnblogs.hoojo.sensitivewords.filter.simhash.executor;
import java.math.BigInteger;
/**
* http://blog.csdn.net/hustfoxy/article/details/23687239
* https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
*
* @author hoojo
* @createDate 2018年3月22日 下午6:48:41
* @file FNVHashUtils.java
* @package com.cnblogs.hoojo.sensitivewords.filter.bucket.executor
* @project fengkong-service-provider
* @blog http://hoojo.cnblogs.com
* @email hoojo_@126.com
* @version 1.0
*/
public abstract class FNVHashUtils {

    /** Width of the produced hash in bits. Made {@code final}: the mask below is
     * derived once at class initialization, so mutating this afterwards would
     * silently have no effect. */
    public static final int HASH_BITS = 64;

    /** FNV 64-bit offset basis (the initial hash value), 0xcbf29ce484222325. */
    public static final BigInteger FNV_64_OFFSET_BASIS = new BigInteger("14695981039346656037");

    /** The 64-bit FNV prime used for mixing, 0x100000001b3. */
    public static final BigInteger FNV_64_PRIME = new BigInteger("1099511628211");

    /** Mask of HASH_BITS one-bits (2^64 - 1) used to truncate the hash to 64 bits.
     * (The original comment described this as "one octet of data", which it is
     * not; the name is kept only for source compatibility.) */
    public static final BigInteger FNV_64_OCTET_OF_DATA = BigInteger.ONE.shiftLeft(HASH_BITS).subtract(BigInteger.ONE);

    /**
     * FNV-1 hash: for each char, multiply by the prime, then XOR in the char.
     *
     * @param text input text; each UTF-16 code unit is folded in as one value
     * @return the 64-bit FNV-1 hash as a non-negative BigInteger
     */
    public static BigInteger hash64(String text) {
        BigInteger hash = FNV_64_OFFSET_BASIS;
        int len = text.length();
        for (int i = 0; i < len; i++) {
            hash = hash.multiply(FNV_64_PRIME);
            hash = hash.xor(BigInteger.valueOf(text.charAt(i)));
        }
        // Truncate to 64 bits (BigInteger arithmetic does not wrap).
        hash = hash.and(FNV_64_OCTET_OF_DATA);
        return hash;
    }

    /**
     * FNV-1a hash: XOR first, then multiply — generally better dispersion than
     * FNV-1 for short inputs.
     *
     * @param text input text; each UTF-16 code unit is folded in as one value
     * @return the 64-bit FNV-1a hash as a non-negative BigInteger
     */
    public static BigInteger hash64_(String text) {
        BigInteger hash = FNV_64_OFFSET_BASIS;
        int len = text.length();
        for (int i = 0; i < len; i++) {
            hash = hash.xor(BigInteger.valueOf(text.charAt(i)));
            hash = hash.multiply(FNV_64_PRIME);
        }
        hash = hash.and(FNV_64_OCTET_OF_DATA);
        return hash;
    }
}
================================================
FILE: src/main/java/com/cnblogs/hoojo/sensitivewords/filter/simhash/executor/Murmur3.java
================================================
package com.cnblogs.hoojo.sensitivewords.filter.simhash.executor;
/**
* Copyright 2014 Prasanth Jayachandran
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Murmur3 32 and 128 bit variants.
* 32-bit Java port of https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp#94
* 128-bit Java port of https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp#255
*/
public class Murmur3 {
    // Constants for 32 bit variant
    private static final int C1_32 = 0xcc9e2d51;
    private static final int C2_32 = 0x1b873593;
    private static final int R1_32 = 15;
    private static final int R2_32 = 13;
    private static final int M_32 = 5;
    private static final int N_32 = 0xe6546b64;

    // Constants for 128 bit variant
    private static final long C1 = 0x87c37b91114253d5L;
    private static final long C2 = 0x4cf5ad432745937fL;
    private static final int R1 = 31;
    private static final int R2 = 27;
    private static final int R3 = 33;
    private static final int M = 5;
    private static final int N1 = 0x52dce729;
    private static final int N2 = 0x38495ab5;

    private static final int DEFAULT_SEED = 0;

    /**
     * Murmur3 32-bit variant.
     *
     * @param data - input byte array
     * @return - hashcode
     */
    public static int hash32(byte[] data) {
        return hash32(data, data.length, DEFAULT_SEED);
    }

    /**
     * Murmur3 32-bit variant.
     *
     * @param data - input byte array
     * @param length - length of array
     * @param seed - seed. (default 0)
     * @return - hashcode
     */
    public static int hash32(byte[] data, int length, int seed) {
        int hash = seed;
        final int nblocks = length >> 2;

        // body: consume 4-byte little-endian blocks
        for (int i = 0; i < nblocks; i++) {
            int i_4 = i << 2;
            int k = (data[i_4] & 0xff)
                    | ((data[i_4 + 1] & 0xff) << 8)
                    | ((data[i_4 + 2] & 0xff) << 16)
                    | ((data[i_4 + 3] & 0xff) << 24);

            // mix functions
            k *= C1_32;
            k = Integer.rotateLeft(k, R1_32);
            k *= C2_32;
            hash ^= k;
            hash = Integer.rotateLeft(hash, R2_32) * M_32 + N_32;
        }

        // tail: remaining 1-3 bytes.
        // BUGFIX: the tail bytes must be masked with 0xff before shifting — the
        // canonical MurmurHash3 x86_32 treats them as unsigned. Without the
        // masks, sign extension of negative bytes produced results diverging
        // from the reference implementation.
        int idx = nblocks << 2;
        int k1 = 0;
        switch (length - idx) {
            case 3:
                k1 ^= (data[idx + 2] & 0xff) << 16;
                // fall through
            case 2:
                k1 ^= (data[idx + 1] & 0xff) << 8;
                // fall through
            case 1:
                k1 ^= (data[idx] & 0xff);

                // mix functions
                k1 *= C1_32;
                k1 = Integer.rotateLeft(k1, R1_32);
                k1 *= C2_32;
                hash ^= k1;
        }

        // finalization (avalanche)
        hash ^= length;
        hash ^= (hash >>> 16);
        hash *= 0x85ebca6b;
        hash ^= (hash >>> 13);
        hash *= 0xc2b2ae35;
        hash ^= (hash >>> 16);

        return hash;
    }

    /**
     * Murmur3 64-bit variant. This is essentially MSB 8 bytes of Murmur3 128-bit variant.
     *
     * @param data - input byte array
     * @return - hashcode
     */
    public static long hash64(byte[] data) {
        return hash64(data, data.length, DEFAULT_SEED);
    }

    /**
     * Murmur3 64-bit variant. This is essentially MSB 8 bytes of Murmur3 128-bit variant.
     *
     * @param data - input byte array
     * @param length - length of array
     * @param seed - seed. (default is 0)
     * @return - hashcode
     */
    public static long hash64(byte[] data, int length, int seed) {
        long hash = seed;
        final int nblocks = length >> 3;

        // body: consume 8-byte little-endian blocks
        for (int i = 0; i < nblocks; i++) {
            final int i8 = i << 3;
            long k = ((long) data[i8] & 0xff)
                    | (((long) data[i8 + 1] & 0xff) << 8)
                    | (((long) data[i8 + 2] & 0xff) << 16)
                    | (((long) data[i8 + 3] & 0xff) << 24)
                    | (((long) data[i8 + 4] & 0xff) << 32)
                    | (((long) data[i8 + 5] & 0xff) << 40)
                    | (((long) data[i8 + 6] & 0xff) << 48)
                    | (((long) data[i8 + 7] & 0xff) << 56);

            // mix functions
            k *= C1;
            k = Long.rotateLeft(k, R1);
            k *= C2;
            hash ^= k;
            hash = Long.rotateLeft(hash, R2) * M + N1;
        }

        // tail: remaining 1-7 bytes (already correctly masked here)
        long k1 = 0;
        int tailStart = nblocks << 3;
        switch (length - tailStart) {
            case 7:
                k1 ^= ((long) data[tailStart + 6] & 0xff) << 48;
                // fall through
            case 6:
                k1 ^= ((long) data[tailStart + 5] & 0xff) << 40;
                // fall through
            case 5:
                k1 ^= ((long) data[tailStart + 4] & 0xff) << 32;
                // fall through
            case 4:
                k1 ^= ((long) data[tailStart + 3] & 0xff) << 24;
                // fall through
            case 3:
                k1 ^= ((long) data[tailStart + 2] & 0xff) << 16;
                // fall through
            case 2:
                k1 ^= ((long) data[tailStart + 1] & 0xff) << 8;
                // fall through
            case 1:
                k1 ^= ((long) data[tailStart] & 0xff);
                k1 *= C1;
                k1 = Long.rotateLeft(k1, R1);
                k1 *= C2;
                hash ^= k1;
        }

        // finalization
        hash ^= length;
        hash = fmix64(hash);

        return hash;
    }

    /**
     * Murmur3 128-bit variant.
     *
     * @param data - input byte array
     * @return - hashcode (2 longs)
     */
    public static long[] hash128(byte[] data) {
        return hash128(data, data.length, DEFAULT_SEED);
    }

    /**
     * Murmur3 128-bit variant.
     *
     * @param data - input byte array
     * @param length - length of array
     * @param seed - seed. (default is 0)
     * @return - hashcode (2 longs)
     */
    public static long[] hash128(byte[] data, int length, int seed) {
        long h1 = seed;
        long h2 = seed;
        final int nblocks = length >> 4;

        // body: consume 16-byte blocks as two little-endian longs
        for (int i = 0; i < nblocks; i++) {
            final int i16 = i << 4;
            long k1 = ((long) data[i16] & 0xff)
                    | (((long) data[i16 + 1] & 0xff) << 8)
                    | (((long) data[i16 + 2] & 0xff) << 16)
                    | (((long) data[i16 + 3] & 0xff) << 24)
                    | (((long) data[i16 + 4] & 0xff) << 32)
                    | (((long) data[i16 + 5] & 0xff) << 40)
                    | (((long) data[i16 + 6] & 0xff) << 48)
                    | (((long) data[i16 + 7] & 0xff) << 56);

            long k2 = ((long) data[i16 + 8] & 0xff)
                    | (((long) data[i16 + 9] & 0xff) << 8)
                    | (((long) data[i16 + 10] & 0xff) << 16)
                    | (((long) data[i16 + 11] & 0xff) << 24)
                    | (((long) data[i16 + 12] & 0xff) << 32)
                    | (((long) data[i16 + 13] & 0xff) << 40)
                    | (((long) data[i16 + 14] & 0xff) << 48)
                    | (((long) data[i16 + 15] & 0xff) << 56);

            // mix functions for k1
            k1 *= C1;
            k1 = Long.rotateLeft(k1, R1);
            k1 *= C2;
            h1 ^= k1;
            h1 = Long.rotateLeft(h1, R2);
            h1 += h2;
            h1 = h1 * M + N1;

            // mix functions for k2
            k2 *= C2;
            k2 = Long.rotateLeft(k2, R3);
            k2 *= C1;
            h2 ^= k2;
            h2 = Long.rotateLeft(h2, R1);
            h2 += h1;
            h2 = h2 * M + N2;
        }

        // tail: remaining 1-15 bytes
        long k1 = 0;
        long k2 = 0;
        int tailStart = nblocks << 4;
        switch (length - tailStart) {
            case 15:
                k2 ^= (long) (data[tailStart + 14] & 0xff) << 48;
                // fall through
            case 14:
                k2 ^= (long) (data[tailStart + 13] & 0xff) << 40;
                // fall through
            case 13:
                k2 ^= (long) (data[tailStart + 12] & 0xff) << 32;
                // fall through
            case 12:
                k2 ^= (long) (data[tailStart + 11] & 0xff) << 24;
                // fall through
            case 11:
                k2 ^= (long) (data[tailStart + 10] & 0xff) << 16;
                // fall through
            case 10:
                k2 ^= (long) (data[tailStart + 9] & 0xff) << 8;
                // fall through
            case 9:
                k2 ^= (long) (data[tailStart + 8] & 0xff);
                k2 *= C2;
                k2 = Long.rotateLeft(k2, R3);
                k2 *= C1;
                h2 ^= k2;
                // fall through
            case 8:
                k1 ^= (long) (data[tailStart + 7] & 0xff) << 56;
                // fall through
            case 7:
                k1 ^= (long) (data[tailStart + 6] & 0xff) << 48;
                // fall through
            case 6:
                k1 ^= (long) (data[tailStart + 5] & 0xff) << 40;
                // fall through
            case 5:
                k1 ^= (long) (data[tailStart + 4] & 0xff) << 32;
                // fall through
            case 4:
                k1 ^= (long) (data[tailStart + 3] & 0xff) << 24;
                // fall through
            case 3:
                k1 ^= (long) (data[tailStart + 2] & 0xff) << 16;
                // fall through
            case 2:
                k1 ^= (long) (data[tailStart + 1] & 0xff) << 8;
                // fall through
            case 1:
                k1 ^= (long) (data[tailStart] & 0xff);
                k1 *= C1;
                k1 = Long.rotateLeft(k1, R1);
                k1 *= C2;
                h1 ^= k1;
        }

        // finalization
        h1 ^= length;
        h2 ^= length;

        h1 += h2;
        h2 += h1;

        h1 = fmix64(h1);
        h2 = fmix64(h2);

        h1 += h2;
        h2 += h1;

        return new long[]{h1, h2};
    }

    /** Final avalanche mix for 64-bit lanes. */
    private static long fmix64(long h) {
        h ^= (h >>> 33);
        h *= 0xff51afd7ed558ccdL;
        h ^= (h >>> 33);
        h *= 0xc4ceb9fe1a85ec53L;
        h ^= (h >>> 33);
        return h;
    }
}
================================================
FILE: src/main/java/com/cnblogs/hoojo/sensitivewords/filter/simhash/executor/SimHashFilterExecutor.java
================================================
package com.cnblogs.hoojo.sensitivewords.filter.simhash.executor;
import java.util.Map;
import java.util.Set;
import org.apache.commons.lang.StringUtils;
import com.cnblogs.hoojo.sensitivewords.filter.AbstractFilterExecutor;
import com.google.common.collect.Maps;
/**
* google simhash 算法实现脱敏过滤
*
* 由于simhash是对大文本进行比较,并且比较的是在支持分词的基础上对分词对象进行比较,进而确定相识度。
* 故 在脱敏方面支持不是很友好,在大文本情况下,效率低下。
* 改变情况,需要分词库支持。
*
* @author hoojo
* @createDate 2018年3月22日 上午11:07:47
* @file SimHashFilterExecutor.java
* @package com.cnblogs.hoojo.sensitivewords.filter.simhash.executor
* @project fengkong-service-provider
* @blog http://hoojo.cnblogs.com
* @email hoojo_@126.com
* @version 1.0
*/
public final class SimHashFilterExecutor extends AbstractFilterExecutor