基于curl实现登录页面数据采集（百度搜藏版）

2014年11月17日 01:29:02

作者： Zjmainstay

本文以百度搜藏为例，提供一种基于curl采集登录页数据的方法。

1. 获取cURL命令

这个可以借助Firebug的控制台得到，如图（一）所示：

2. 解析cURL命令
cURL具有一定的格式，我们这里需要得到两个数据：

（1）请求链接
它可以基于正则"#curl '([^']*?)'#is"解析得到，唯一，使用preg_match匹配。

（2）请求头HTTPHEADER
它可以基于正则"#-H '([^']*?)'#is"得到，一个或多个，基于preg_match_all匹配。
3. 通过cURL采集数据

本demo基于php多线程采集实现，此时只需要给curl采集请求添加：

（1）压缩参数

CURLOPT_ENCODING="gzip"

（2）HTTP头参数

CURLOPT_HTTPHEADER=cURL解析头结果
4. 程序核心源码（curlBaiduSoucang.php）

<?php
header("Content-type: text/html; charset=utf-8"); 
require_once 'CurlMulti.php';
require_once 'MyCurl.php';
/**
 * Scrapes Baidu Soucang pages using the request URL and HTTP headers taken
 * from a cURL command saved in ./curlText.txt (next to this script).
 *
 * Each fetched page is converted from GBK to UTF-8 and stored as
 * ./soucang/soucang-<page>.txt.
 *
 * @author Zjmainstay
 * @website http://www.zjmainstay.cn
 * @year 2014
 * @usage
 *   CLI:     php curlBaiduSoucang.php 1 20        // pages 1 to 20
 *   Browser: http://localhost/curlmulti/curlBaiduSoucang.php?start=1&end=20
 */
class curlBaiduSoucang extends MyCurl {
    /**
     * Request URL extracted from the saved cURL command; run() rewrites the
     * number following "pn=" for every page.
     */
    public $soucangUrl;

    /**
     * Separator appended to progress output: "\n" on the CLI, "<br />" otherwise.
     * Assigned by run().
     */
    public $lineBreak;

    /**
     * Reads ./curlText.txt, extracts the request URL and the -H headers from
     * the cURL command, and configures the curl options used for every request.
     *
     * Terminates the script via exit() when the command file is missing or the
     * URL cannot be matched. Missing headers only produce a warning message.
     */
    function __construct() {
        parent::__construct();

        $curlTextFile = dirname(__FILE__) . '/curlText.txt';
        if(!file_exists($curlTextFile)) {
            exit('curl 命令文件（./curlText.txt）不存在。');
        }

        // A failed read yields false; the cast turns it into '' so the URL
        // check below reports the problem.
        $curlContent = trim((string)file_get_contents($curlTextFile));

        // The URL is the first single-quoted token following "curl".
        if(!preg_match("#curl '([^']*?)'#is", $curlContent, $match)) {
            exit('请确认curl命令是否正确，无法匹配链接地址。');
        }
        $this->soucangUrl = $match[1];

        // Ask curl to handle gzip-compressed responses.
        $this->curl->opt[CURLOPT_ENCODING] = 'gzip';

        // Every -H '...' argument becomes one HTTP request header.
        if(preg_match_all("#-H '([^']*?)'#is", $curlContent, $matches)) {
            $this->curl->opt[CURLOPT_HTTPHEADER] = $matches[1];
        } else {
            echo "请确认curl命令是否正确，无法匹配HTTP HEADER信息，可能导致采集失败 \n";
        }
    }

    /**
     * Queues one request per page in the requested range and starts the
     * transfer.
     *
     * The range comes from the first two command line arguments on the CLI
     * (matching the documented usage) or from the "start" / "end" query
     * parameters otherwise. Missing or non-positive values fall back to 1, and
     * an end page below the start page is raised to the start page so at least
     * one page is always fetched.
     */
    function run() {
        if(PHP_SAPI === 'cli') {
            $this->lineBreak = "\n";
            $cliArgs  = isset($_SERVER['argv']) ? $_SERVER['argv'] : array();
            // argv[0] is the script name, so the page range is argv[1] and argv[2].
            $startArg = isset($cliArgs[1]) ? $cliArgs[1] : 0;
            $endArg   = isset($cliArgs[2]) ? $cliArgs[2] : 0;
        } else {
            $this->lineBreak = "<br />";
            $startArg = isset($_GET['start']) ? $_GET['start'] : 0;
            $endArg   = isset($_GET['end']) ? $_GET['end'] : 0;
        }

        $startPage = max(1, (int)$startArg);
        $endPage   = max($startPage, (int)$endArg);

        $process = array($this, 'parse');
        for($page = $startPage; $page <= $endPage; $page++) {
            $this->curl->add(array(
                // Swap the page number that follows "pn=" in the captured URL.
                'url'  => preg_replace('#(?<=pn=)\d+#i', $page, $this->soucangUrl),
                'args' => array(
                    'page' => $page,
                ),
            ), $process);
        }
        $this->curl->start();
    }

    /**
     * Callback for a finished request: stores the page content and prints a
     * progress line.
     *
     * @param array $res   Result of the request; 'info' and 'content' are read here.
     * @param array $param Arguments passed to add(); 'page' is the page number.
     */
    function parse($res, $param) {
        $page = $param['page'];

        // httpError() is inherited; a truthy result is treated as a failed request.
        if($this->httpError($res['info'])) {
            echo "Page: {$page} (failed)" . $this->lineBreak;
            return;
        }

        // Create the output directory on first use instead of failing every write.
        $outputDir = dirname(__FILE__) . '/soucang';
        if(!is_dir($outputDir) && !mkdir($outputDir, 0777, true) && !is_dir($outputDir)) {
            echo "Page: {$page} (failed)" . $this->lineBreak;
            return;
        }

        // The response is converted from GBK; bytes that cannot be converted are dropped.
        $content = iconv('gbk', 'utf-8//IGNORE', $res['content']);
        $filename = $outputDir . "/soucang-{$page}.txt";
        if($content === false || file_put_contents($filename, trim($content)) === false) {
            echo "Page: {$page} (failed)" . $this->lineBreak;
            return;
        }

        echo "Page: {$page} (ok)" . $this->lineBreak;
    }

    /**
     * Delegates to the parent's handler and then terminates the output line.
     *
     * @param mixed $info Passed through unchanged to the parent implementation.
     */
    function cbCurlInfo($info) {
        parent::cbCurlInfo($info);
        echo $this->lineBreak;
    }
}
// Script entry point: report every error on screen and lift the execution
// time limit, then build the scraper and run it.
error_reporting(E_ALL);
ini_set('display_errors', 'on');
ini_set('max_execution_time', 0);

$curlObj = new curlBaiduSoucang();
$curlObj->run();