SensitiveHelper.php 7.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296
  1. <?php
  2. /**
  3. * 敏感词类库.
  4. * User: wanghui
  5. * Date: 17/3/9
  6. * Time: 上午9:11
  7. */
  8. namespace addons\blog\library;
  9. class SensitiveHelper
  10. {
  11. /**
  12. * 待检测语句长度
  13. *
  14. * @var int
  15. */
  16. protected $contentLength = 0;
  17. /**
  18. * 敏感词单例
  19. *
  20. * @var object|null
  21. */
  22. private static $_instance = null;
  23. /**
  24. * 铭感词库树
  25. *
  26. * @var HashMap|null
  27. */
  28. protected $wordTree = null;
  29. /**
  30. * 存放待检测语句铭感词
  31. *
  32. * @var array|null
  33. */
  34. protected static $badWordList = null;
  35. /**
  36. * 获取单例
  37. *
  38. * @return self
  39. */
  40. public static function init()
  41. {
  42. if (!self::$_instance instanceof self) {
  43. self::$_instance = new self();
  44. }
  45. return self::$_instance;
  46. }
  47. /**
  48. * 构建铭感词树【文件模式】
  49. *
  50. * @param string $filepath
  51. * @return $this
  52. * @throws \Exception
  53. */
  54. public function setTreeByFile($filepath = '')
  55. {
  56. if (!file_exists($filepath)) {
  57. throw new \Exception('词库文件不存在');
  58. }
  59. // 词库树初始化
  60. $this->wordTree = new HashMap();
  61. foreach ($this->yieldToReadFile($filepath) as $word) {
  62. $this->buildWordToTree(trim($word));
  63. }
  64. return $this;
  65. }
  66. /**
  67. * 构建铭感词树【数组模式】
  68. *
  69. * @param null $sensitiveWords
  70. * @return $this
  71. * @throws \Exception
  72. */
  73. public function setTree($sensitiveWords = null)
  74. {
  75. if (empty($sensitiveWords)) {
  76. throw new \Exception('词库不能为空');
  77. }
  78. $this->wordTree = new HashMap();
  79. foreach ($sensitiveWords as $word) {
  80. $this->buildWordToTree($word);
  81. }
  82. return $this;
  83. }
  84. /**
  85. * 检测文字中的敏感词
  86. *
  87. * @param string $content 待检测内容
  88. * @param int $matchType 匹配类型 [默认为最小匹配规则]
  89. * @param int $wordNum 需要获取的敏感词数量 [默认获取全部]
  90. * @return array
  91. */
  92. public function getBadWord($content, $matchType = 1, $wordNum = 0)
  93. {
  94. $this->contentLength = mb_strlen($content, 'utf-8');
  95. $badWordList = array();
  96. for ($length = 0; $length < $this->contentLength; $length++) {
  97. $matchFlag = 0;
  98. $flag = false;
  99. $tempMap = $this->wordTree;
  100. for ($i = $length; $i < $this->contentLength; $i++) {
  101. $keyChar = mb_substr($content, $i, 1, 'utf-8');
  102. // 获取指定节点树
  103. $nowMap = $tempMap->get($keyChar);
  104. // 不存在节点树,直接返回
  105. if (empty($nowMap)) {
  106. break;
  107. }
  108. // 存在,则判断是否为最后一个
  109. $tempMap = $nowMap;
  110. // 找到相应key,偏移量+1
  111. $matchFlag++;
  112. // 如果为最后一个匹配规则,结束循环,返回匹配标识数
  113. if (false === $nowMap->get('ending')) {
  114. continue;
  115. }
  116. $flag = true;
  117. // 最小规则,直接退出
  118. if (1 === $matchType) {
  119. break;
  120. }
  121. }
  122. if (!$flag) {
  123. $matchFlag = 0;
  124. }
  125. // 找到相应key
  126. if ($matchFlag <= 0) {
  127. continue;
  128. }
  129. $badWordList[] = mb_substr($content, $length, $matchFlag, 'utf-8');
  130. // 有返回数量限制
  131. if ($wordNum > 0 && count($badWordList) == $wordNum) {
  132. return $badWordList;
  133. }
  134. // 需匹配内容标志位往后移
  135. $length = $length + $matchFlag - 1;
  136. }
  137. return $badWordList;
  138. }
  139. /**
  140. * 替换敏感字字符
  141. *
  142. * @param $content
  143. * @param $replaceChar
  144. * @param string $sTag
  145. * @param string $eTag
  146. * @param int $matchType
  147. * @return mixed
  148. */
  149. public function replace($content, $replaceChar = '', $sTag = '', $eTag = '', $matchType = 1)
  150. {
  151. if (empty($content)) {
  152. throw new \Exception('请填写检测的内容');
  153. }
  154. if (empty(self::$badWordList)) {
  155. $badWordList = $this->getBadWord($content, $matchType);
  156. } else {
  157. $badWordList = self::$badWordList;
  158. }
  159. // 未检测到敏感词,直接返回
  160. if (empty($badWordList)) {
  161. return $content;
  162. }
  163. foreach ($badWordList as $badWord) {
  164. if ($sTag || $eTag) {
  165. $replaceChar = $sTag . $badWord . $eTag;
  166. }
  167. $content = str_replace($badWord, $replaceChar, $content);
  168. }
  169. return $content;
  170. }
  171. /**
  172. * 被检测内容是否合法,合法返回true,非法返回false
  173. * @param $content
  174. * @return bool
  175. */
  176. public function islegal($content)
  177. {
  178. $this->contentLength = mb_strlen($content, 'utf-8');
  179. for ($length = 0; $length < $this->contentLength; $length++) {
  180. $matchFlag = 0;
  181. $tempMap = $this->wordTree;
  182. for ($i = $length; $i < $this->contentLength; $i++) {
  183. $keyChar = mb_substr($content, $i, 1, 'utf-8');
  184. // 获取指定节点树
  185. $nowMap = $tempMap->get($keyChar);
  186. // 不存在节点树,直接返回
  187. if (empty($nowMap)) {
  188. break;
  189. }
  190. // 找到相应key,偏移量+1
  191. $tempMap = $nowMap;
  192. $matchFlag++;
  193. // 如果为最后一个匹配规则,结束循环,返回匹配标识数
  194. if (false === $nowMap->get('ending')) {
  195. continue;
  196. }
  197. return false;
  198. }
  199. // 找到相应key
  200. if ($matchFlag <= 0) {
  201. continue;
  202. }
  203. // 需匹配内容标志位往后移
  204. $length = $length + $matchFlag - 1;
  205. }
  206. return true;
  207. }
  208. protected function yieldToReadFile($filepath)
  209. {
  210. $fp = fopen($filepath, 'r');
  211. while (!feof($fp)) {
  212. yield fgets($fp);
  213. }
  214. fclose($fp);
  215. }
  216. // 将单个敏感词构建成树结构
  217. protected function buildWordToTree($word = '')
  218. {
  219. if ('' === $word) {
  220. return;
  221. }
  222. $tree = $this->wordTree;
  223. $wordLength = mb_strlen($word, 'utf-8');
  224. for ($i = 0; $i < $wordLength; $i++) {
  225. $keyChar = mb_substr($word, $i, 1, 'utf-8');
  226. // 获取子节点树结构
  227. $tempTree = $tree->get($keyChar);
  228. if ($tempTree) {
  229. $tree = $tempTree;
  230. } else {
  231. // 设置标志位
  232. $newTree = new HashMap();
  233. $newTree->put('ending', false);
  234. // 添加到集合
  235. $tree->put($keyChar, $newTree);
  236. $tree = $newTree;
  237. }
  238. // 到达最后一个节点
  239. if ($i == $wordLength - 1) {
  240. $tree->put('ending', true);
  241. }
  242. }
  243. return;
  244. }
  245. }