PHP敏感詞過濾

cp5m 10年前發布 | 2K 次閱讀 PHP

 
/**
 * Banned-word filter.
 *
 * Banned words are loaded in id-range batches, each batch is turned into a
 * character tree (one nested array level per character), and the text is
 * scanned against every batch tree in turn.
 *
 * Original author's performance note: about 0.05 seconds per article.
 *
 * @author liuxu
 */
class Logic_BlackWord
{
    const APP_FORUM = 1;
    const APP_BLOG = 2;
    const APP_VOTE = 3;

    /**
     * Find the banned words contained in a text, grouped by word type.
     *
     * @param string $txt Text to check; HTML tags are stripped before matching.
     * @return array Map of type => list of matched banned words (as strings).
     *               Empty array when nothing matches.
     */
    public function getHitList($txt)
    {
        $hitList = array();

        // Check the text against the banned words one batch at a time.
        $max = $this->getMax();
        if($max)
        {
            $size = 1000;
            $last = ceil($max/$size);
            for($page=1;$page<=$last;$page++)
            {
                $result = $this->getHitListByPage($txt,$page,$size);
                if($result)
                {
                    // Copy key by key instead of array_merge(): the keys are the
                    // matched words, and PHP stores digit-only keys such as "110"
                    // as integers, which array_merge() would renumber from 0 and
                    // thereby lose the word itself.
                    foreach($result as $hit=>$type)
                    {
                        $hitList[$hit] = $type;
                    }
                }
            }
        }

        // Regroup word => type into type => list of words.
        $hitList2 = array();
        foreach($hitList as $hit=>$type)
        {
            // Digit-only words come back as integer keys; return them as strings.
            $hitList2[$type][] = (string)$hit;
        }

        return $hitList2;
    }

    /**
     * Get the highest banned-word id, used to decide how many batches to scan.
     *
     * The value is read from the 'blackWord_max' cache entry; when the cache
     * returns false it is recomputed with a MAX(id) query and written back
     * through setex() with 300 as the expiry argument.
     * NOTE(review): Rds::factory() presumably returns a Redis client - confirm.
     *
     * @return mixed The cached value, or the queried maximum id (0 when the
     *               query returns no 'max' column).
     */
    private function getMax()
    {
        $redis = Rds::factory();
        $memKey = 'blackWord_max';
        $max = $redis->get($memKey);
        if($max===false)
        {
            $max = 0;
            $blackWord = new Model_BlackWord_BlackWord();
            $para = array();
            $para['field'] = "MAX(id) AS max";
            $result = $blackWord->search($para);
            if(isset($result[0]['max'])) $max = $result[0]['max'];

            $redis->setex($memKey,300,$max);
        }

        return $max;
    }

    /**
     * Find the banned words of one batch that occur in the text.
     *
     * @param string $txt  Text to check.
     * @param int    $page 1-based batch number.
     * @param int    $size Number of ids covered by one batch.
     * @return array Map of matched word => type.
     */
    private function getHitListByPage($txt,$page=1,$size=1000)
    {
        $hitList = array();

        // Load the word tree for this batch.
        $wordTree = $this->getWordTreeByPage($page,$size);

        // Keep only ASCII letters, digits and characters in U+4E00..U+9FA5 so
        // that markup, punctuation and spacing cannot split a banned word.
        $txt = strip_tags($txt);
        // preg_replace() returns null on a PCRE error (e.g. invalid UTF-8);
        // the cast turns that into an empty string, i.e. no hits.
        $txt = (string)preg_replace('/[^a-zA-Z0-9\x{4e00}-\x{9fa5}]/iu','',$txt);

        $len = mb_strlen($txt,'UTF-8');
        for($i=0;$i<$len;$i++)
        {
            $char = mb_substr($txt,$i,1,'UTF-8');
            // Only walk the tree when a banned word can start with this character.
            if(isset($wordTree[$char]))
            {
                // Match against a window of at most 50 characters starting here.
                $result = $this->getHitListByTree(mb_substr($txt,$i,50,'UTF-8'),$wordTree);
                if($result)
                {
                    foreach($result as $hit=>$type)
                    {
                        $hitList[$hit] = $type;
                    }
                }
            }
        }

        return $hitList;
    }

    /**
     * Collect every banned word that is a prefix of the given text.
     *
     * Walks the tree one character at a time from the root and records a hit
     * at each node that carries a 'type' entry; stops at the first character
     * that has no child node.
     *
     * @param string $txt      Text whose beginning is matched against the tree.
     * @param array  $wordTree Word tree built by getWordTreeByPage(); taken by
     *                         reference and only read here.
     * @return array Map of matched word => type.
     */
    private function getHitListByTree($txt,&$wordTree)
    {
        $len = mb_strlen($txt,'UTF-8');
        $point = & $wordTree;
        $hit = '';
        $hitList = array();
        for($i=0;$i<$len;$i++)
        {
            $char = mb_substr($txt,$i,1,'UTF-8');
            if(isset($point[$char]))
            {
                $hit .= $char;
                $point = & $point[$char];

                if(isset($point['type'])) // a complete banned word ends here
                {
                    $hitList[$hit] = $point['type'];
                }
            }
            else
            {
                break;
            }
        }

        return $hitList;
    }

    /**
     * Get the word tree for one batch of banned words.
     *
     * The tree is read from the 'blackWordtree{page}_{size}' cache entry; when
     * the cache returns false it is rebuilt from the rows with status=1 whose
     * id lies in (start, end] and written back through setex() with 300 as the
     * expiry argument.
     * NOTE(review): storing an array through setex() assumes the Rds client
     * serializes values - confirm.
     *
     * @param int $page 1-based batch number.
     * @param int $size Number of ids covered by one batch.
     * @return array Nested array keyed by character; a node where a word ends
     *               carries a 'type' entry.
     */
    private function getWordTreeByPage($page=1,$size=1000)
    {
        $redis = Rds::factory();
        $memKey = 'blackWordtree'.$page.'_'.$size;
        $wordTree = $redis->get($memKey);
        if($wordTree===false)
        {
            $wordTree = array();
            $blackWord = new Model_BlackWord_BlackWord();
            $start = ($page-1)*$size;
            $end = $start + $size;
            $para = array();
            $para['where'] = "status=1 AND id>".$start." AND id<=".$end;
            $result = $blackWord->search($para);
            if($result)
            {
                foreach($result as $value)
                {
                    if($value['word'])
                    {
                        // Split the word into its individual UTF-8 characters.
                        $value['word'] = preg_split('/(?<!^)(?!$)/u',$value['word']);
                        $point = & $wordTree;
                        foreach($value['word'] as $char)
                        {
                            // Descend one level, creating the node if needed.
                            $point = & $point[$char];
                        }

                        // Mark the end of a word with its type.
                        $point['type'] = $value['type'];
                    }
                }
                // Drop the reference into the tree before the tree is stored.
                unset($point);
            }

            $redis->setex($memKey,300,$wordTree);
        }

        return $wordTree;
    }

}

</pre>

 本文由用戶 cp5m 自行上傳分享,僅供網友學習交流。所有權歸原作者,若您的權利被侵害,請聯繫管理員。
 轉載本站原創文章,請註明出處,並保留原始鏈接、圖片水印。
 本站是一個以用戶分享為主的開源技術平臺,歡迎各類分享!