[php]php를 이용한 웹크롤링

의존성

composer.json

{
    "require": {
        "guzzlehttp/guzzle": "^7.0",
        "symfony/dom-crawler": "^5.3",
        "symfony/css-selector": "^5.3"
    }
}

요청과 응답페이지 요소탐색

use GuzzleHttp\Client;
use Symfony\Component\DomCrawler\Crawler;

$url = 'http://ujsstudio.com';
$client = new Client();
$res = $client->get($url);
$res = $res->getBody();
$html = (string)$res; // 문자열로 형변환
// dom 필터링
$crawler = new Crawler($html);
$nodeValues = $crawler->filter("#primary a")->each(function(Crawler $node, $i){
    return $node->attr('href');
});

페이지네이션 처리

use GuzzleHttp\Client;
use GuzzleHttp\Exception\RequestException;
use GuzzleHttp\Pool;
use GuzzleHttp\Psr7\Request;
use GuzzleHttp\Psr7\Response;
use Symfony\Component\DomCrawler\Crawler;

$result = [];
$client = new Client();

// 페이지 조회 익명함수(제너레이터) 생성
$requests = function ($total) use ($client) {
    $uri = 'http://ujsstudio.com/page';
    for ($i = 0; $i < $total; $i++) {
        yield function() use ($client, $uri, &$i) {
            $uri = $uri.'/'.$i+1;
            return $client->getAsync($uri);
        };
    }
};

$pool = new Pool($client, $requests(10), [
        'concurrency' => 5,
        'fulfilled' => function (Response $response, $index) use (&$result) {
            $res = $response->getBody();
            $html = (string)$res;

            // 요소탐색
            $crawler = new Crawler($html);
            $nodeValues = $crawler->filter("#primary a")->each(function(Crawler $node, $i){
                return $node->attr('href');
            });

            $result[] = $nodeValues;
        },
        'rejected' => function (RequestException $reason, $index) {
            // this is delivered each failed request
        },
]);

$promise = $pool->promise();
$promise->wait();