CURL


/**
 * cURL example: performs a single HTTP request, dumps the transfer info and
 * returns the raw response.
 *
 * @param array $query Request options:
 * [
 *  'url'      => 'www.baidu.com',   // target URL (required)
 *  'timeout'  => 30,                // timeout in seconds, used for both the connect phase and the whole transfer (default 30)
 *  'headers'  => [],                // request headers: a list of "Name: value" lines, or a name => value map
 *  'postData' => '',                // request body: a raw string, or an array that gets URL-encoded
 *  'proxy'    => '127.0.0.1:8888',  // proxy address
 * ]
 * @return string|false The raw response (response headers followed by the body),
 *                      or false when the transfer failed.
 */
function curlSetting($query)
{
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $query['url']);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); // return the response instead of printing it
    curl_setopt($ch, CURLOPT_HEADER, true);         // include the response headers in the returned string

    // Timeout: take the caller's value when given, otherwise 30 seconds.
    // (The previous `isset(...) ?: 30` evaluated to `true`, i.e. a 1 second timeout.)
    $timeout = isset($query['timeout']) ? $query['timeout'] : 30;
    curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); // follow redirects
    curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024);

    // No CURLOPT_SSL_VERIFYHOST / CURLOPT_SSL_VERIFYPEER override is applied here,
    // so TLS verification stays at the library defaults.

    // Request headers. A plain list is passed through unchanged; a name => value
    // map is flattened into "Name: value" lines, which is the form cURL expects.
    // NOTE(review): isAssocArray() is not defined in this snippet — it must be
    // provided elsewhere in the project.
    if (isset($query['headers'])) {
        if (!isAssocArray($query['headers'])) {
            curl_setopt($ch, CURLOPT_HTTPHEADER, $query['headers']);
        } else {
            $headers = [];
            foreach ($query['headers'] as $k => $v) {
                $headers[] = "{$k}: {$v}";
            }
            curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
        }
    }

    // POST body. isset() is already false for null, so no separate null check is needed.
    if (isset($query['postData'])) {
        curl_setopt($ch, CURLOPT_POST, true);
        if (is_string($query['postData'])) {
            curl_setopt($ch, CURLOPT_POSTFIELDS, $query['postData']);
        } elseif (is_array($query['postData'])) {
            $queryString = http_build_query($query['postData']);
            // http_build_query() emits hobbies[0]=swimming&hobbies[1]=football;
            // rewrite the numeric indexes to the hobbies[]=...&hobbies[]=... form.
            $queryString = preg_replace('/%5B\d+%5D/simU', '%5B%5D', $queryString);
            curl_setopt($ch, CURLOPT_POSTFIELDS, $queryString);
        }
    }

    // Proxy
    if (!empty($query['proxy'])) {
        curl_setopt($ch, CURLOPT_PROXY, $query['proxy']);
    }

    $response = curl_exec($ch);
    $curlInfo = curl_getinfo($ch);
    curl_close($ch); // release the handle; the info array was copied out above
    var_dump($curlInfo);

    return $response;
}

curlMulti

// Concurrent fetching of several URLs through one shared curl_multi handle.
class CurlMulti
{
    // Shared multi handle; created on first use by getMh() and reused afterwards.
    protected static $mh = null;

    /**
     * Runs all tasks concurrently and returns the successful responses.
     *
     * Each task is an array with the keys:
     *  - 'url'       (required) target URL
     *  - 'headers'   (optional) list of "Name: value" header lines
     *  - 'post_data' (optional) request body; when present and not null the request is sent as POST
     *  - 'proxy'     (optional) proxy address
     *  - 'timeout'   (optional) timeout in seconds, default 30
     *  - 'callback'  (optional) callable that receives the response body; its return value
     *                replaces the body in the result
     *
     * Results are appended in completion order. Transfers that do not finish with
     * CURLE_OK are omitted from the result.
     *
     * @param array $tasks
     * @return array List of ['code' => ..., 'header' => ..., 'body' => ..., 'curlInfo' => ...]
     */
    public static function send(array $tasks)
    {
        if (empty($tasks)) {
            return [];
        }

        $mh = self::getMh();

        // Register every task and remember which handle belongs to which task key,
        // so a finished transfer can be mapped back to its task reliably (the
        // effective URL can differ from the task URL after a redirect, and two
        // tasks may share a URL).
        $handles = [];
        foreach ($tasks as $key => $task) {
            $ch = self::taskSetting(
                $task['url'],
                isset($task['headers']) ? $task['headers'] : [],
                isset($task['post_data']) ? $task['post_data'] : null,
                isset($task['proxy']) ? $task['proxy'] : '',
                isset($task['timeout']) ? $task['timeout'] : 30
            );
            curl_multi_add_handle($mh, $ch);
            $handles[$key] = $ch;
        }

        $result = [];
        do {
            // Older libcurl versions ask to be called again immediately.
            do {
                $status = curl_multi_exec($mh, $running);
            } while ($status === CURLM_CALL_MULTI_PERFORM);

            // Drain the whole message queue: several transfers can finish during
            // a single curl_multi_exec() call.
            while (($info = curl_multi_info_read($mh)) !== false) {
                $ch  = $info['handle'];
                $key = array_search($ch, $handles, true);

                if ($key !== false && $info['result'] === CURLE_OK) {
                    $result[] = self::buildResult($ch, $tasks[$key]);
                }

                // Detach from the multi handle first, then close the easy handle.
                curl_multi_remove_handle($mh, $ch);
                curl_close($ch);
                if ($key !== false) {
                    unset($handles[$key]);
                }
            }

            // Wait for socket activity instead of spinning at 100% CPU.
            if ($running > 0 && $status === CURLM_OK) {
                if (curl_multi_select($mh, 1.0) === -1) {
                    // select can fail immediately on some systems; back off briefly.
                    usleep(1000);
                }
            }
        } while ($running > 0 && $status === CURLM_OK);

        // If the loop stopped on a multi error, do not leave handles attached to
        // the shared multi handle where they would leak into the next send() call.
        foreach ($handles as $ch) {
            curl_multi_remove_handle($mh, $ch);
            curl_close($ch);
        }

        return $result;
    }

    /**
     * Builds the result entry for one finished transfer.
     *
     * The response is split at the header size reported by cURL; the task's
     * callback, if any, is applied to the body exactly once.
     *
     * @param resource|object $ch   Finished easy handle (still attached to the multi handle).
     * @param array           $task The task this handle was created from.
     * @return array ['code' => ..., 'header' => ..., 'body' => ..., 'curlInfo' => ...]
     */
    protected static function buildResult($ch, array $task)
    {
        $curlInfo   = curl_getinfo($ch);
        $response   = (string) curl_multi_getcontent($ch);
        $headerSize = (int) $curlInfo['header_size'];

        // Casts guard against substr() returning false on old PHP versions when
        // the offset equals the string length (empty body).
        $header = (string) substr($response, 0, $headerSize);
        $body   = (string) substr($response, $headerSize);

        if (!empty($task['callback'])) {
            $body = call_user_func($task['callback'], $body);
        }

        return [
            'code'     => $curlInfo['http_code'],
            'header'   => $header,
            'body'     => $body,
            'curlInfo' => $curlInfo,
        ];
    }

    /**
     * Creates and configures an easy handle for one task.
     *
     * @param string            $url      Target URL.
     * @param array             $headers  List of "Name: value" header lines; skipped when empty.
     * @param string|array|null $postData Request body; any non-null value turns the request into a POST.
     * @param string            $proxy    Proxy address; skipped when empty.
     * @param int               $timeout  Timeout in seconds, used for both the connect phase and the whole transfer.
     * @return resource|object The configured easy handle.
     */
    protected static function taskSetting($url, $headers = [], $postData = null, $proxy = '', $timeout = 30)
    {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); // keep the response for curl_multi_getcontent()
        curl_setopt($ch, CURLOPT_HEADER, true);         // include response headers in the content
        curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); // follow redirects
        if (!empty($headers)) {
            curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
        }
        if (!is_null($postData)) {
            curl_setopt($ch, CURLOPT_POST, true);
            curl_setopt($ch, CURLOPT_POSTFIELDS, $postData);
        }
        if (!empty($proxy)) {
            curl_setopt($ch, CURLOPT_PROXY, $proxy);
        }
        return $ch;
    }

    /**
     * Returns the shared multi handle, creating it on first use.
     *
     * @return resource|object
     */
    protected static function getMh()
    {
        if (is_null(self::$mh)) {
            self::$mh = curl_multi_init();
        }
        return self::$mh;
    }
}

标签: none

添加新评论