1. Go to the download page for the baqend/spider library and download it, choosing the download type "require".
2. Extract the ZIP file and open index.php.
3. Add the following code to index.php:
<?php
require_once('vendor/autoload.php');
/* Start to develop here. Best regards https://php-download.com/ */
baqend/spider example snippets
use Baqend\Component\Spider\Processor;
use Baqend\Component\Spider\Queue\BreadthQueue;
use Baqend\Component\Spider\Spider;
use Baqend\Component\Spider\UrlHandler\BlacklistUrlHandler;
// Example: wire a queue, downloader, URL handler and processor chain into a
// Spider, enqueue two start URLs, and run the crawl.

// Use the breadth-first queue
$queue = new BreadthQueue();
// Implement the DownloaderInterface
// NOTE(review): the next line is a placeholder expression, not an assignment.
// $downloader must be set to your own DownloaderInterface implementation
// before it is passed to the Spider constructor below.
$downloader /* your downloader implementation */;
// Create a URL handler, e.g. the provided blacklist URL handler
// (presumably '**.php' excludes every URL ending in ".php" - confirm against
// the handler's glob syntax)
$urlHandler = new BlacklistUrlHandler(['**.php']);
// Create some processors which will be executed one after another
// More details on the processors below!
$processor = new Processor\Processor();
$processor->addProcessor(new Processor\UrlRewriteProcessor('https://example.org', 'https://example.com/archive'));
// The CSS processor is kept in $cssProcessor because the HtmlProcessor on the
// next line receives it as its constructor argument.
$processor->addProcessor($cssProcessor = new Processor\CssProcessor());
$processor->addProcessor(new Processor\HtmlProcessor($cssProcessor));
$processor->addProcessor(new Processor\ReplaceProcessor('https://example.org', 'https://example.com/archive'));
$processor->addProcessor(new Processor\StoreProcessor('https://example.com/archive', '/tmp/output'));
// Create the spider instance
$spider = new Spider($queue, $downloader, $urlHandler, $processor);
// Enqueue some URLs
$spider->queue('https://example.org/index.html');
$spider->queue('https://example.org/news/other-landingpage.html');
// Execute the crawling
$spider->crawl();
use Baqend\Component\Spider\Processor\Processor;
// Example: combine several processors into one composite Processor.
// NOTE(review): $firstProcessor, $secondProcessor, $thirdProcessor and $asset
// are placeholders that this snippet does not define; supply your own.
$processor = new Processor();
$processor->addProcessor($firstProcessor);
$processor->addProcessor($secondProcessor);
$processor->addProcessor($thirdProcessor);
// This will call `process` on $firstProcessor, $secondProcessor, and finally on $thirdProcessor:
$processor->process($asset, $queue);
use Baqend\Component\Spider\Processor\ReplaceProcessor;
// Example: a processor that substitutes one string for another in an asset.
// NOTE(review): $asset is a placeholder that this snippet does not define.
$processor = new ReplaceProcessor('Hello World', 'Hallo Welt');
// This will replace all occurrences of
// "Hello World" in the asset with "Hallo Welt":
$processor->process($asset, $queue);
use Baqend\Component\Spider\UrlHandler\BlacklistUrlHandler;
// Example: glob-style URL patterns that the blacklist URL handler should skip.
// The first two entries overlap (the second also covers HTTPS); they are
// listed together to show both the plain and the brace-alternative form.
$blacklist = [
'https://other.org/**', // Don't handle anything from other.org over HTTPS
'http{,s}://other.org/**', // Don't handle anything from other.org over HTTP or HTTPS
'**.{png,gif,jpg,jpeg}', // Don't handle any image files
];
// Pass the pattern list to the handler's constructor.
$urlHandler = new BlacklistUrlHandler($blacklist);
Loading, please wait ...
Before you can download the PHP files, the dependencies must be resolved. This can take a few minutes. Please be patient.