PHP code example of ddliu / spider

1. Go to this page and download the library: Download ddliu/spider library. Choose the download type "require".

2. Extract the ZIP file and open the index.php.

3. Add this code to the index.php.
    
        
<?php
// Load Composer's autoloader. Anchoring the path to __DIR__ makes the include
// independent of the current working directory and the include_path setting.
require_once __DIR__ . '/vendor/autoload.php';

/* Start to develop here. Best regards https://php-download.com/ */

    

ddliu / spider example snippets


use ddliu\spider\Spider;
use ddliu\spider\Pipe\NormalizeUrlPipe;
use ddliu\spider\Pipe\RequestPipe;
use ddliu\spider\Pipe\DomCrawlerPipe;

// Build a spider from a chain of pipes, seed it with one URL, run it and report.
(new Spider())
    ->pipe(new NormalizeUrlPipe())
    ->pipe(new RequestPipe())
    ->pipe(new DomCrawlerPipe())
    // Custom closure pipe: hand the href of every <a> element to $task->fork().
    ->pipe(function ($spider, $task) {
        // NOTE(review): $task['$dom'] is presumably populated by DomCrawlerPipe above — confirm.
        $task['$dom']->filter('a')->each(function ($a) use ($task) {
            $href = $a->attr('href');
            $task->fork($href);
        }); // terminator was missing here; without it the closure body does not parse
    })
    // the entry task
    ->addTask('http://example.com')
    ->run()
    ->report();

// Shape of a closure pipe: it takes the spider and a task as its two parameters,
// matching the closure passed to ->pipe() in the example above.
function($spider, $task) {}

use ddliu\spider\Pipe\BasePipe;

/**
 * Example of a custom pipe written as a class extending BasePipe
 * instead of as a closure.
 */
class MyPipe extends BasePipe
{
    /**
     * Takes the same ($spider, $task) pair as a closure pipe.
     *
     * NOTE(review): presumably invoked by the spider once per task — confirm against BasePipe.
     *
     * @param mixed $spider NOTE(review): presumably the Spider instance — confirm.
     * @param mixed $task   NOTE(review): presumably the task being processed — confirm.
     */
    public function run($spider, $task)
    {
        // process the task...
    }
}

// Pipe constructed with no arguments.
new NormalizeUrlPipe();

// Request pipe built with an explicit options array rather than no arguments.
new RequestPipe([
    'useragent' => 'myspider',
    'timeout' => 10, // NOTE(review): unit is presumably seconds — confirm
]);

// Wrap a RequestPipe in a FileCachePipe.
// NOTE(review): no `use` statement for FileCachePipe is visible in these snippets;
// presumably it lives in ddliu\spider\Pipe — confirm before copying.
$requestPipe = new RequestPipe();
$cacheForReqPipe = new FileCachePipe($requestPipe, [
    // NOTE(review): presumably the task field used as the cache key — confirm.
    'input' => 'url',
    // NOTE(review): presumably the task field whose value gets cached — confirm.
    'output' => 'content',
    // Placeholder path; presumably the directory where cache files are stored.
    'root' => '/path/to/cache/root',
]);

// Wrap a RequestPipe in a RetryPipe.
// NOTE(review): no `use` statement for RetryPipe is visible in these snippets;
// presumably it lives in ddliu\spider\Pipe — confirm before copying.
$requestPipe = new RequestPipe();
$retryForReqPipe = new RetryPipe($requestPipe, [
    // NOTE(review): presumably the maximum number of attempts for the wrapped pipe — confirm.
    'count' => 10,
]);

// NOTE(review): 'seconds' is presumably the reporting interval (600 s = 10 min) — confirm.
new ReportPipe([
    'seconds' => 600,
]);