應用curl擴展抓取網頁

ccpp 9年前發布 | 1K 次閱讀 PHP curl

    <?php
namespace Think;
header("Content-Type: text/html;charset=utf-8");
/**
 * Minimal page scraper built on the cURL extension.
 *
 * The constructor downloads the page at the given URL into $data; the other
 * methods cut pieces out of that downloaded markup, either with a regular
 * expression (regmatch) or by locating literal start/end markers (result).
 */
class Mycurl
{
    /** @var mixed cURL handle returned by curl_init(), or null once released. */
    public $ch = null;

    /** @var mixed Response body as returned by curl_exec() (false when the transfer failed). */
    public $data = null;

    /**
     * Download the page at $url and keep its body in $data.
     *
     * @param string $url Address of the page to fetch.
     */
    public function __construct($url)
    {
        $this->ch = curl_init($url);
        if ($this->ch === false) {
            // No handle to configure; mark the download as failed the same
            // way curl_exec() would.
            $this->data = false;
            return;
        }

        // Do not include the response headers in the returned body.
        curl_setopt($this->ch, CURLOPT_HEADER, false);
        // Make curl_exec() return the body instead of echoing it.
        curl_setopt($this->ch, CURLOPT_RETURNTRANSFER, true);
        $this->data = curl_exec($this->ch);
    }

    /**
     * Release the cURL handle.
     */
    public function __destruct()
    {
        // curl_close() only matters for the resource-based handles used
        // before PHP 8; object handles are freed when the reference is dropped.
        if (is_resource($this->ch)) {
            curl_close($this->ch);
        }
        $this->ch = null;
    }

    /**
     * Extract the article body with a regular expression.
     *
     * Matches lazily up to the first closing div tag, so a body that itself
     * contains nested div elements is cut at the first inner closing tag.
     *
     * @return string|null Inner markup of the article_content div, or null
     *                     when the page was not downloaded or has no such div.
     */
    public function regmatch()
    {
        if (!is_string($this->data)) {
            return null;
        }

        $reg = '/<div\sid="article_content"\sclass="article_content">(.*?)<\/div>/si';
        if (preg_match($reg, $this->data, $out) !== 1) {
            return null;
        }

        return $out[1];
    }

    /**
     * Extract the text between two literal markers (case-insensitive).
     *
     * The returned string starts WITH $pos1 and stops just before $pos2;
     * $pos2 is searched for only after the end of $pos1.
     *
     * @param string $pos1 Start marker.
     * @param string $pos2 End marker.
     *
     * @return string The slice, or '' when the page was not downloaded or
     *                either marker is missing.
     */
    public function result($pos1, $pos2)
    {
        if (!is_string($this->data)) {
            return '';
        }

        $flag1 = stripos($this->data, $pos1);
        if ($flag1 === false) {
            return '';
        }

        // Start looking after the start marker so an earlier occurrence of
        // the end marker cannot produce a negative length.
        $flag2 = stripos($this->data, $pos2, $flag1 + strlen($pos1));
        if ($flag2 === false) {
            return '';
        }

        return (string) substr($this->data, $flag1, $flag2 - $flag1);
    }

    /**
     * Build the article record scraped from the downloaded page.
     *
     * @return array Map with the keys title, content, atime, author, sort
     *               and summary.
     */
    public function exec()
    {
        $titleMarker = '<title>';

        $data = array();

        // result() keeps the start marker, so strip it from the title.
        $title = $this->result($titleMarker, '-盧松松博客</title>');
        $data['title'] = (string) substr($title, strlen($titleMarker));

        $content = $this->result('<dd class="post-info">', '<center>');
        // Turn site-relative upload paths into absolute URLs...
        $content = str_ireplace("/upload/", "http://lusongsong.com/upload/", $content);
        // ...then undo the doubled host on URLs that were already absolute...
        $content = str_ireplace("http://lusongsong.comhttp://lusongsong.com", "http://lusongsong.com", $content);
        // ...and the host that got inserted after a "blog" path segment.
        $content = str_ireplace("bloghttp://lusongsong.com", "blog", $content);
        $data['content'] = $content;

        $data['atime'] = time();
        $data['author'] = 'Internet';
        $data['sort'] = '精彩博文';
        // Plain-text excerpt, at most 180 bytes, never ending inside a
        // multi-byte character.
        $data['summary'] = $this->truncateUtf8(strip_tags($content), 180);

        return $data;
    }

    /**
     * Cut $text to at most $maxBytes bytes without splitting a UTF-8 character.
     *
     * @param string $text     Text to shorten.
     * @param int    $maxBytes Upper bound on the byte length of the result.
     *
     * @return string The shortened text; a plain byte cut when the text is
     *                not valid UTF-8 to begin with.
     */
    private function truncateUtf8($text, $maxBytes)
    {
        $text = (string) $text;
        if (strlen($text) <= $maxBytes) {
            return $text;
        }

        $cut = (string) substr($text, 0, $maxBytes);
        // A UTF-8 character is at most 4 bytes long, so at most 3 trailing
        // bytes can belong to a character that the cut split in two.
        for ($drop = 0; $drop <= 3 && $drop <= strlen($cut); $drop++) {
            $candidate = $drop === 0 ? $cut : (string) substr($cut, 0, -$drop);
            if (preg_match('//u', $candidate) === 1) {
                return $candidate;
            }
        }

        return $cut;
    }
}

// Test driver: fetch a run of consecutively numbered article pages and store
// each scraped record through the 'article' model.
$url = 'http://lusongsong.com/reed/';
$num = 100;    // number of articles to fetch
$start = 350;  // id of the first article to fetch

// NOTE(review): M() is not defined in this file; presumably the framework's
// model helper -- confirm it is loaded before this script runs.
$Art = M('article');

for ($i = $start; $i < $start + $num; $i++)
{
    $posurl = $url.$i.'.html';

    $curl = new Mycurl($posurl);
    $data = $curl->exec();
    $curl = null;  // drop the scraper so its cURL handle is released right away

    $data['oldlink'] = $posurl;

    // Skip the site's 404 page. Compare against false explicitly: the marker
    // can sit at offset 0 of the title, which a truthiness test would treat
    // as "not found".
    if (strpos((string) $data['title'], "出現404錯誤頁面了") !== false)
    {
        continue;
    }

    // Nothing was extracted (download failed or the page layout differs);
    // do not store an empty article.
    if ((string) $data['content'] === '')
    {
        continue;
    }

    $Art->add($data);
}

// NOTE(review): $this only exists here if this file is included from inside
// an object method (e.g. a controller action); run as a standalone script,
// this line raises an error -- confirm how the file is loaded.
$this->success("執行完成!","index");



?>  </pre> 


 本文由用戶 ccpp 自行上傳分享,僅供網友學習交流。所有權歸原作者,若您的權利被侵害,請聯繫管理員。
 轉載本站原創文章,請註明出處,並保留原始鏈接、圖片水印。
 本站是一個以用戶分享為主的開源技術平臺,歡迎各類分享!