PHP爬蟲_電影ftp下載地址

jopen 9年前發布 | 5K 次閱讀 PHP

建表語句:CREATE TABLE dy2018_url (id int(9) NOT NULL AUTO_INCREMENT, url varchar(2000) NOT NULL, status tinyint(2) NOT NULL, PRIMARY KEY(id));

代碼:

    <?php
declare(ticks = 1); // pcntl delivers pending signals to PHP-level handlers on ticks

    // Process-wide bookkeeping shared through `global`:
    //  - $finish_count is compared against count($crawlers_pid) by run_dy2018()
    //  - $crawlers_pid is filled by run_dy2018() and read by signal_handler()
    $finish_count = 0;
    $crawlers_pid = array();

// Both termination signals are routed to the same handler.
pcntl_signal(SIGTERM, 'signal_handler');
pcntl_signal(SIGQUIT, 'signal_handler');

    /**
     * Signal handler: on SIGQUIT or SIGTERM, forward SIGTERM to every crawler
     * process recorded in the global $crawlers_pid, print a notice and terminate
     * the current process. Any other signal number is ignored.
     *
     * @param int $signal Signal number delivered by pcntl.
     * @return void Does not return for SIGQUIT/SIGTERM because it calls exit().
     */
    function signal_handler($signal)
    {
        global $crawlers_pid;

        // Guard clause: only the two termination signals are acted upon.
        if ($signal != SIGQUIT && $signal != SIGTERM) {
            return;
        }

        foreach ($crawlers_pid as $pid) {
            posix_kill($pid, SIGTERM);
        }

        // The former `global $con;` was removed: no global $con is ever assigned
        // in this script, so the statement had no effect.
        echo "---------- crawl task exit ----------";
        exit();
    }

    /**
     * Fetch the content behind $url with a plain GET (via file_get_contents).
     *
     * @param string $url Location to read.
     * @return string|false The content, or false when file_get_contents() fails.
     */
    function get_page_content($url)
    {
        return file_get_contents($url);
    }

    /**
     * Fetch the content behind $url with an HTTP POST request.
     *
     * Fixes over the previous version: the encoded body is now stored in $data
     * (it used to be written to $arr while the undefined $data was sent), and the
     * two request headers are separated by CRLF instead of being glued together
     * with a stray trailing quote.
     *
     * @param string $url Target URL.
     * @param array  $arr Form fields, sent as application/x-www-form-urlencoded.
     * @return string|false Response body, or false when file_get_contents() fails.
     */
    function get_page_content_by_post($url, $arr)
    {
        $data = http_build_query($arr);

        // Each header line must be terminated by CRLF in the stream context.
        $headers = "Content-Type: application/x-www-form-urlencoded\r\n"
                 . 'Content-Length: ' . strlen($data) . "\r\n";

        $opts = array(
            'http' => array(
                'method'  => 'POST',
                'header'  => $headers,
                'content' => $data,
            ),
        );
        $context = stream_context_create($opts);

        return file_get_contents($url, false, $context);
    }

    /**
     * Main flow of the dy2018 crawl: fork one child process per entry URL, let
     * each child crawl its URL into MySQL, and make the parent wait until every
     * child has exited.
     *
     * Fixes over the previous version:
     *  - a child now exits after crawl_process(); before, it fell back into the
     *    fork loop and spawned further children of its own;
     *  - completion is detected by reaping children in the parent; before,
     *    $finish_count was incremented inside the child, which the parent never
     *    sees, so the parent waited forever;
     *  - the parent no longer calls mysql_close() on a connection it never opened.
     *
     * Updates the globals $crawlers_pid (child PIDs, read by signal_handler) and
     * $finish_count (number of children reaped so far). Never returns: every
     * path ends in exit().
     *
     * NOTE(review): the mysql_* extension was removed in PHP 7; migrating to
     * mysqli/PDO has to be done together with crawl_process(), which relies on
     * the implicit connection opened here.
     */
    function run_dy2018()
    {
        global $crawlers_pid;
        global $finish_count;

        $crawl_urls = array(
            "http://www.dy2018.com/html/tv/hytv/",
            "http://www.dy2018.com/html/tv/hepai/",
            "http://www.dy2018.com/html/tv/gangtai/",
            "http://www.dy2018.com/html/tv/oumeitv/",
            "http://www.dy2018.com/html/tv/rihantv/",
            "http://www.dy2018.com/html/tv/tvzz/",
        );
        // Numbered categories http://www.dy2018.com/0/ ... /20/, in the same order as before.
        for ($n = 0; $n <= 20; $n++) {
            $crawl_urls[] = "http://www.dy2018.com/".$n."/";
        }

        foreach ($crawl_urls as $i => $url) {
            $pid = pcntl_fork();
            if ($pid == -1) {
                echo "system error. check it now!";
                exit();
            }
            if ($pid > 0) {
                // Parent: remember the child and go on forking.
                $crawlers_pid[$i] = $pid;
                continue;
            }

            // ---- child process from here on ----
            // Drop the inherited sibling PIDs so that a SIGTERM delivered to this
            // child does not make signal_handler() kill its siblings.
            $crawlers_pid = array();

            // NOTE(review): credentials and database name are hard-coded.
            $con = mysql_connect("localhost", "root", "123456");
            if (!$con) {
                die('Could not connect: '.mysql_error());
            }
            mysql_select_db("mysql", $con);
            crawl_process($url);
            mysql_close($con);
            exit(0); // must not fall back into the fork loop
        }

        // Poll with WNOHANG + sleep instead of a blocking wait, so the tick-based
        // signal handlers keep getting a chance to run in the parent.
        while ($finish_count < count($crawlers_pid)) {
            $reaped = pcntl_waitpid(-1, $status, WNOHANG);
            if ($reaped > 0) {
                $finish_count++;
                continue;
            }
            if ($reaped == -1) {
                break; // no children left to wait for
            }
            sleep(1);
        }

        echo "---------- crawl task finish ----------";
        exit();
    }

    /**
     * Crawl one entry URL: walk its listing pages (index.html, index_2.html, ...)
     * until a page can no longer be fetched, open every detail page linked from a
     * listing page, extract the ftp:// links found there and insert them into the
     * dy2018_url table.
     *
     * Fixes over the previous version:
     *  - the dots in both patterns are escaped, so they match a literal ".";
     *  - "rmvb" is tried before "rm" and the extension must end at a word
     *    boundary, so ".rmvb" links are no longer cut off at ".rm";
     *  - links are escaped before being placed into the SQL statement;
     *  - detail links repeated on one listing page are fetched once;
     *  - a detail page that cannot be fetched is skipped;
     *  - failed inserts are logged instead of being ignored;
     *  - the unused UTF-8 conversion of the listing page was removed.
     *
     * Relies on the MySQL connection opened by the caller (implicit link of the
     * mysql_* extension).
     *
     * @param string $url Entry URL ending in "/".
     * @return void
     */
    function crawl_process($url)
    {
        echo "start handle url:".$url;

        $info_url_pattern = '/\/i\/\d+\.html/';
        $ftp_url_pattern = '/ftp:\/\/.*?\.(swf|avi|flv|mpg|rmvb|rm|mov|wav|asf|3gp|mkv)\b/i';

        for ($page_idx = 1; ; $page_idx++) {
            $page_url = get_page_index_url($url, $page_idx);
            // echo rather than printf: the URL must not be used as a format string.
            echo "start crawl url:".$page_url."\n";

            $page_content = get_page_content($page_url);
            if (!is_valid_page($page_content)) {
                break; // first listing page that cannot be fetched ends the crawl
            }

            $matches_urls = array();
            preg_match_all($info_url_pattern, $page_content, $matches_urls);

            foreach (array_unique($matches_urls[0]) as $detail_path) {
                $detail_page_content = get_page_content('http://www.dy2018.com'.$detail_path);
                if ($detail_page_content === false) {
                    sleep(1); // keep the same pacing between requests
                    continue;
                }

                // NOTE(review): assumes the site serves GBK-encoded pages — confirm.
                $detail_page_content = mb_convert_encoding($detail_page_content, "UTF-8", "GBK");

                $ftp_urls = array();
                preg_match_all($ftp_url_pattern, $detail_page_content, $ftp_urls);

                foreach (array_unique($ftp_urls[0]) as $ftp_link) {
                    // The link comes from a remote page: escape it before building SQL.
                    $safe_link = mysql_real_escape_string($ftp_link);
                    $ok = mysql_query("insert into dy2018_url (url, status) values('$safe_link','0')");
                    if (!$ok) {
                        error_log('insert into dy2018_url failed: '.mysql_error());
                    }
                }
                sleep(1);
            }
        }
    }

    /**
     * Build the listing-page URL for a page number.
     *
     * Page 1 maps to "<url>index.html", any page greater than 1 maps to
     * "<url>index_<idx>.html", and every other value returns $url unchanged.
     *
     * @param string $url Base URL the file name is appended to.
     * @param int    $idx Page number.
     * @return string
     */
    function get_page_index_url($url, $idx)
    {
        if ($idx == 1) {
            return $url.'index.html';
        }

        if ($idx > 1) {
            return $url.'index_'.$idx.'.html';
        }

        return $url;
    }

    /**
     * Decide from the fetched content whether a page is usable.
     *
     * @param mixed $content Fetched page content (or false from a failed fetch).
     * @return bool True when $content is truthy, false otherwise.
     */
    function is_valid_page($content)
    {
        return (bool) $content;
    }
    // Script entry point. run_dy2018() only ever leaves through exit(), so the
    // former mysql_close() after it could never run (and this process opens no
    // connection of its own); the stray "</pre>" after the closing tag was
    // leftover HTML from the page the script was copied from.
    run_dy2018();
?>


 本文由用戶 jopen 自行上傳分享,僅供網友學習交流。所有權歸原作者,若您的權利被侵害,請聯繫管理員。
 轉載本站原創文章,請註明出處,並保留原始鏈接、圖片水印。
 本站是一個以用戶分享為主的開源技術平臺,歡迎各類分享!