應用curl擴展抓取網頁
<?php
namespace Think;
// Send a Content-Type response header declaring the output as UTF-8 encoded HTML.
header("Content-Type: text/html;charset=utf-8");
/**
 * Downloads one web page with the cURL extension and extracts article fields
 * from the returned markup.
 *
 * The page is fetched in the constructor. The raw body (or false when the
 * transfer failed) is stored in $data, and every extraction method reads from it.
 */
class Mycurl
{
    /** @var mixed cURL handle; false when curl_init() failed, null once closed */
    public $ch = null;

    /** @var mixed Raw response body as a string, or false when the transfer failed */
    public $data = null;

    /**
     * Fetch $url and keep the response body in $data.
     *
     * @param string $url Address of the page to download.
     */
    public function __construct($url)
    {
        $this->ch = curl_init($url);
        if (!$this->ch) {
            // No handle to configure or execute; report the failure the same
            // way a failed transfer is reported.
            $this->data = false;
            return;
        }

        // Do not include the response headers in the returned body.
        curl_setopt($this->ch, CURLOPT_HEADER, false);
        // Make curl_exec() return the body as a string instead of printing it.
        curl_setopt($this->ch, CURLOPT_RETURNTRANSFER, true);

        $this->data = curl_exec($this->ch);
    }

    /**
     * Release the cURL handle, if one was ever created.
     */
    public function __destruct()
    {
        if ($this->ch) {
            curl_close($this->ch);
            $this->ch = null;
        }
    }

    /**
     * Regex-based extraction of the article body.
     *
     * Captures everything between the opening
     * <div id="article_content" class="article_content"> tag and the first
     * following </div>. The original pattern used a character class where a
     * "stop at </div>" was intended, so it could not match real markup; a lazy
     * quantifier expresses that intent. (The original also assigned a <title>
     * pattern first and overwrote it immediately; that dead assignment is gone.)
     *
     * @return string|null Captured content, or null when nothing matched.
     */
    public function regmatch()
    {
        $reg = '/<div\sid="article_content"\sclass="article_content">(.*?)<\/div>/si';

        if (!is_string($this->data) || preg_match($reg, $this->data, $out) !== 1) {
            return null;
        }

        return $out[1];
    }

    /**
     * String-based extraction: return the part of the page that starts at
     * $pos1 and ends just before $pos2 (case-insensitive search).
     *
     * The start marker itself is INCLUDED in the result and the end marker is
     * excluded; exec() relies on that when it strips the leading "<title>".
     * The end marker is searched for only after the start marker, so an
     * earlier occurrence of $pos2 cannot produce a negative length.
     *
     * @param string $pos1 Start marker.
     * @param string $pos2 End marker.
     * @return string Extracted fragment, or '' when the page is unavailable or
     *                either marker is missing.
     */
    public function result($pos1, $pos2)
    {
        $page = $this->data;
        if (!is_string($page) || $pos1 === '' || $pos2 === '') {
            return '';
        }

        $flag1 = stripos($page, $pos1);
        if ($flag1 === false) {
            return '';
        }

        $flag2 = stripos($page, $pos2, $flag1 + strlen($pos1));
        if ($flag2 === false) {
            return '';
        }

        return substr($page, $flag1, $flag2 - $flag1);
    }

    /**
     * Build the article record from the fetched page.
     *
     * NOTE(review): the marker literals below must match the target page
     * byte-for-byte (same encoding and same Chinese script variant) - confirm
     * against the live site.
     *
     * @return array Record with the keys title, content, atime, author, sort
     *               and summary. 'oldlink' is not set here; the caller adds it.
     */
    public function exec()
    {
        $data = array();

        // result() keeps the start marker, so drop the leading "<title>".
        $openTag = '<title>';
        $rawTitle = $this->result($openTag, '-盧松松博客</title>');
        $data['title'] = (string) substr($rawTitle, strlen($openTag));

        $content = $this->result('<dd class="post-info">', '<center>');
        // Turn site-relative image paths into absolute URLs.
        $content = str_ireplace("/upload/", "http://lusongsong.com/upload/", $content);
        // Undo the doubled host on URLs that were already absolute.
        $content = str_ireplace("http://lusongsong.comhttp://lusongsong.com", "http://lusongsong.com", $content);
        // Undo the host that the first replacement injected after "blog".
        $content = str_ireplace("bloghttp://lusongsong.com", "blog", $content);
        $data['content'] = $content;

        $data['atime'] = time();
        $data['author'] = 'Internet';
        $data['sort'] = '精彩博文';

        // Summary: at most 180 bytes of tag-free text. mb_strcut() cuts on a
        // character boundary so a multibyte character is never split in half
        // (assumes the page is UTF-8 - TODO confirm); plain substr() is the
        // fallback when mbstring is not installed.
        $plain = strip_tags($data['content']);
        $data['summary'] = function_exists('mb_strcut')
            ? mb_strcut($plain, 0, 180, 'UTF-8')
            : substr($plain, 0, 180);

        return $data;
    }
}

// Test driver: scrape a run of consecutively numbered articles and store them.
$url = 'http://lusongsong.com/reed/';
$num = 100;   // number of articles to fetch
$start = 350; // first article number

$Art = M('article');

for ($i = $start; $i < $start + $num; $i++) {
    $posurl = $url . $i . '.html';

    $curl = new Mycurl($posurl);
    $data = $curl->exec();
    $curl = null; // release the cURL handle before moving on

    $data['oldlink'] = $posurl;

    // Skip pages that could not be fetched or parsed, and the site's 404 page.
    // strpos() returns 0 when the marker starts the title, so the result has
    // to be compared with false instead of being used as a boolean.
    if ($data['title'] === '' || strpos($data['title'], "出現404錯誤頁面了") !== false) {
        continue;
    }

    $Art->add($data);
}

// NOTE(review): $this only exists here when this file is included from inside
// an object method (presumably a controller action) - confirm how it is loaded.
$this->success("執行完成!","index");
?> </pre>
本文由用戶 ccpp 自行上傳分享,僅供網友學習交流。所有權歸原作者,若您的權利被侵害,請聯繫管理員。
轉載本站原創文章,請註明出處,並保留原始鏈接、圖片水印。
本站是一個以用戶分享為主的開源技術平臺,歡迎各類分享!