PHP code example of baqend / spider

1. Go to the download page and download the baqend/spider library, choosing the download type "require".

2. Extract the ZIP file and open the index.php.

3. Add this code to the index.php.
    
        
<?php
// Load the bundled autoloader. Anchoring the path to __DIR__ resolves it
// relative to this file, independent of the current working directory and
// of whatever include_path happens to be configured.
require_once __DIR__ . '/vendor/autoload.php';

/* Start to develop here. Best regards https://php-download.com/ */

    

baqend / spider example snippets



use Baqend\Component\Spider\Processor;
use Baqend\Component\Spider\Queue\BreadthQueue;
use Baqend\Component\Spider\Spider;
use Baqend\Component\Spider\UrlHandler\BlacklistUrlHandler;

// Use the breadth-first queue
$queue = new BreadthQueue();

// Placeholder: provide your own DownloaderInterface implementation here
$downloader /* your downloader implementation */;

// The URL handler; this one is the provided blacklist URL handler
$urlHandler = new BlacklistUrlHandler(['**.php']);

// The two URL prefixes shared by the rewrite, replace, and store processors
$sourceOrigin = 'https://example.org';
$archiveOrigin = 'https://example.com/archive';

// Assemble the processors, which will be executed one after another
// More details on the processors below!
$processor = new Processor\Processor();

$urlRewriter = new Processor\UrlRewriteProcessor($sourceOrigin, $archiveOrigin);
$processor->addProcessor($urlRewriter);

$cssProcessor = new Processor\CssProcessor();
$processor->addProcessor($cssProcessor);

// The HTML processor is constructed with the CSS processor instance
$htmlProcessor = new Processor\HtmlProcessor($cssProcessor);
$processor->addProcessor($htmlProcessor);

$replacer = new Processor\ReplaceProcessor($sourceOrigin, $archiveOrigin);
$processor->addProcessor($replacer);

$store = new Processor\StoreProcessor($archiveOrigin, '/tmp/output');
$processor->addProcessor($store);

// Wire everything into the spider instance
$spider = new Spider($queue, $downloader, $urlHandler, $processor);

// Enqueue the start URLs
$startUrls = [
    'https://example.org/index.html',
    'https://example.org/news/other-landingpage.html',
];
foreach ($startUrls as $startUrl) {
    $spider->queue($startUrl);
}

// Execute the crawling
$spider->crawl();


use Baqend\Component\Spider\Processor\Processor;

$processor = new Processor();

// Register the processors in the order they should be called
foreach ([$firstProcessor, $secondProcessor, $thirdProcessor] as $step) {
    $processor->addProcessor($step);
}

// `process` is now called on $firstProcessor, then $secondProcessor, and lastly $thirdProcessor:
$processor->process($asset, $queue);


use Baqend\Component\Spider\Processor\ReplaceProcessor;

$search = 'Hello World';
$replacement = 'Hallo Welt';
$processor = new ReplaceProcessor($search, $replacement);

// Every occurrence of "Hello World" in the asset
// gets replaced with "Hallo Welt":
$processor->process($asset, $queue);


use Baqend\Component\Spider\UrlHandler\BlacklistUrlHandler;

// Patterns for URLs that should not be handled
$blacklist = [
    // Nothing from other.org over HTTPS
    'https://other.org/**',
    // Nothing from other.org over HTTP or HTTPS
    'http{,s}://other.org/**',
    // No image files
    '**.{png,gif,jpg,jpeg}',
];

$urlHandler = new BlacklistUrlHandler($blacklist);