PHP敏感詞過濾
/**
 * Banned-word (sensitive word) filter.
 *
 * The banned-word list is read from Model_BlackWord_BlackWord in pages of
 * consecutive ids. Each page is turned into a character trie, cached through
 * Rds for 300 seconds, and the text is scanned against every page in turn.
 *
 * Original author's note on speed: about 0.05 seconds per article.
 *
 * NOTE(review): the text is stripped of everything except ASCII letters,
 * digits and CJK ideographs U+4E00..U+9FA5 before matching, but the stored
 * words are not normalised the same way, and matching is case-sensitive.
 * A stored word containing spaces or punctuation can therefore never match.
 * Confirm whether that is intended.
 *
 * @author liuxu
 */
class Logic_BlackWord
{
    const APP_FORUM = 1;
    const APP_BLOG = 2;
    const APP_VOTE = 3;

    /**
     * Find every banned word contained in a text.
     *
     * @param string $txt Raw text; HTML tags are stripped before matching.
     * @return array Map of word type => list of matched words (strings).
     *               Empty when nothing matched or no words are configured.
     */
    public function getHitList($txt)
    {
        $hitList = array();

        // Filter against the banned words one page (id range) at a time.
        $max = $this->getMax();
        if ($max) {
            $size = 1000;
            $last = ceil($max / $size);
            for ($page = 1; $page <= $last; $page++) {
                $result = $this->getHitListByPage($txt, $page, $size);
                if ($result) {
                    // array_replace, not array_merge: hit words are the array
                    // keys, and PHP stores a numeric word such as "110" as an
                    // integer key. array_merge would renumber those keys from
                    // 0 and the matched word would be lost.
                    $hitList = array_replace($hitList, $result);
                }
            }
        }

        // Regroup "word => type" into "type => list of words".
        $hitList2 = array();
        foreach ($hitList as $hit => $type) {
            // Cast back to string: numeric words come out of the key as ints.
            $hitList2[$type][] = (string) $hit;
        }

        return $hitList2;
    }

    /**
     * Get the highest banned-word id, used to work out how many pages exist.
     *
     * The value is cached under 'blackWord_max' for 300 seconds. A cache miss
     * is detected by get() returning exactly false.
     *
     * @return mixed The cached value, or the MAX(id) column from the model,
     *               or 0 when the model returns no row.
     */
    private function getMax()
    {
        $redis = Rds::factory();
        $memKey = 'blackWord_max';
        $max = $redis->get($memKey);
        if ($max === false) {
            $max = 0;
            $blackWord = new Model_BlackWord_BlackWord();
            $para = array('field' => "MAX(id) AS max");
            $result = $blackWord->search($para);
            if (isset($result[0]['max'])) {
                $max = $result[0]['max'];
            }
            $redis->setex($memKey, 300, $max);
        }

        return $max;
    }

    /**
     * Find the banned words of one page that occur in the text.
     *
     * @param string $txt  Raw text to scan.
     * @param int    $page 1-based page number.
     * @param int    $size Number of ids covered by a page.
     * @return array Map of matched word => word type.
     */
    private function getHitListByPage($txt, $page = 1, $size = 1000)
    {
        $hitList = array();

        // Load the trie for this page of banned words.
        $wordTree = $this->getWordTreeByPage($page, $size);
        if (!is_array($wordTree) || !$wordTree) {
            return $hitList;
        }

        // Keep only ASCII letters, digits and CJK ideographs.
        $txt = strip_tags($txt);
        $txt = preg_replace('/[^a-zA-Z0-9\x{4e00}-\x{9fa5}]/iu', '', $txt);
        if ($txt === null || $txt === '') {
            // null means preg_replace failed (e.g. invalid UTF-8 input).
            return $hitList;
        }

        // Split into characters once instead of calling mb_substr($txt, $i, ...)
        // for every position, which rescans the string from the start each time.
        $chars = preg_split('//u', $txt, -1, PREG_SPLIT_NO_EMPTY);
        if (!$chars) {
            return $hitList;
        }

        $len = count($chars);
        for ($i = 0; $i < $len; $i++) {
            if (!isset($wordTree[$chars[$i]])) {
                continue;
            }
            // Some word starts with this character: match a window of at most
            // 50 characters against the trie.
            $window = implode('', array_slice($chars, $i, 50));
            $result = $this->getHitListByTree($window, $wordTree);
            foreach ($result as $hit => $type) {
                $hitList[$hit] = $type;
            }
        }

        return $hitList;
    }

    /**
     * Walk the trie along the start of $txt and collect every banned word
     * that is a prefix of it.
     *
     * @param string $txt      Text whose leading characters are matched.
     * @param array  $wordTree Trie from getWordTreeByPage(); passed by
     *                         reference but not modified here.
     * @return array Map of matched word => word type.
     */
    private function getHitListByTree($txt, &$wordTree)
    {
        $len = mb_strlen($txt, 'UTF-8');
        $node = $wordTree;
        $hit = '';
        $hitList = array();

        for ($i = 0; $i < $len; $i++) {
            $char = mb_substr($txt, $i, 1, 'UTF-8');
            if (!isset($node[$char])) {
                break;
            }
            $hit .= $char;
            $node = $node[$char];
            if (isset($node['type'])) {
                // A word ends at this node: record the match.
                $hitList[$hit] = $node['type'];
            }
        }

        return $hitList;
    }

    /**
     * Build (or fetch from cache) the trie for one page of banned words.
     *
     * A page covers the enabled words (status=1) with
     * ($page-1)*$size < id <= $page*$size. Each trie node is an array keyed by
     * the next character; the node where a word ends also carries a 'type'
     * key holding that word's type.
     *
     * NOTE(review): the array is handed to setex() as-is; this assumes the Rds
     * wrapper serialises values and get() returns the array again - confirm.
     *
     * @param int $page 1-based page number.
     * @param int $size Number of ids covered by a page.
     * @return array The trie; empty when the page has no usable words.
     */
    private function getWordTreeByPage($page = 1, $size = 1000)
    {
        $redis = Rds::factory();
        $memKey = 'blackWordtree' . $page . '_' . $size;
        $wordTree = $redis->get($memKey);
        if ($wordTree === false) {
            $wordTree = array();
            $blackWord = new Model_BlackWord_BlackWord();
            $start = ($page - 1) * $size;
            $end = $start + $size;
            // Cast to int so only integers reach the hand-built WHERE fragment.
            $para = array(
                'where' => "status=1 AND id>" . (int) $start . " AND id<=" . (int) $end,
            );
            $result = $blackWord->search($para);
            if ($result) {
                foreach ($result as $value) {
                    if (!$value['word']) {
                        continue;
                    }
                    // Split the word into single UTF-8 characters.
                    $chars = preg_split('/(?<!^)(?!$)/u', $value['word']);
                    if ($chars === false) {
                        // Not valid UTF-8. Skip it rather than writing 'type'
                        // onto the root node.
                        continue;
                    }
                    $point = &$wordTree;
                    foreach ($chars as $char) {
                        $point = &$point[$char];
                    }
                    $point['type'] = $value['type'];
                    // Drop the reference so it cannot alias the trie later on.
                    unset($point);
                }
            }
            $redis->setex($memKey, 300, $wordTree);
        }

        return $wordTree;
    }
}
</pre>