1. Go to this page and download the spatie/crawler library, choosing the download type require.
2. Extract the ZIP file and open index.php.
3. Add this code to index.php:
<?php
require_once 'vendor/autoload.php';
// Start developing here.
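
If you already use Composer, you can skip the manual download and install the package from a terminal instead; the require_once of vendor/autoload.php above stays the same:

composer require spatie/crawler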

spatie/crawler example snippets:
use Spatie\Crawler\Crawler;

Crawler::create()
    ->setCrawlObserver(<class that extends \Spatie\Crawler\CrawlObservers\CrawlObserver>)
    ->startCrawling($url);
namespace Spatie\Crawler\CrawlObservers;

use GuzzleHttp\Exception\RequestException;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\UriInterface;

abstract class CrawlObserver
{
    /**
     * Called when the crawler will crawl the url.
     */
    public function willCrawl(UriInterface $url, ?string $linkText): void
    {
    }

    /**
     * Called when the crawler has crawled the given url successfully.
     */
    abstract public function crawled(
        UriInterface $url,
        ResponseInterface $response,
        ?UriInterface $foundOnUrl = null,
        ?string $linkText = null,
    ): void;

    /**
     * Called when the crawler had a problem crawling the given url.
     */
    abstract public function crawlFailed(
        UriInterface $url,
        RequestException $requestException,
        ?UriInterface $foundOnUrl = null,
        ?string $linkText = null,
    ): void;

    /**
     * Called when the crawl has ended.
     */
    public function finishedCrawling(): void
    {
    }
}
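
For reference, a minimal concrete observer could look like the sketch below; the LoggingCrawlObserver class name and the echo-based reporting are illustrative, not part of the package:

use GuzzleHttp\Exception\RequestException;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\UriInterface;
use Spatie\Crawler\CrawlObservers\CrawlObserver;

// Hypothetical observer that prints the result of every crawl.
class LoggingCrawlObserver extends CrawlObserver
{
    public function crawled(
        UriInterface $url,
        ResponseInterface $response,
        ?UriInterface $foundOnUrl = null,
        ?string $linkText = null,
    ): void {
        echo "Crawled: {$url} ({$response->getStatusCode()})" . PHP_EOL;
    }

    public function crawlFailed(
        UriInterface $url,
        RequestException $requestException,
        ?UriInterface $foundOnUrl = null,
        ?string $linkText = null,
    ): void {
        echo "Failed: {$url} ({$requestException->getMessage()})" . PHP_EOL;
    }
}

An instance can then be passed to setCrawlObserver(new LoggingCrawlObserver()).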

Crawler::create()
    ->setCrawlObservers([
        <class that extends \Spatie\Crawler\CrawlObservers\CrawlObserver>,
        <class that extends \Spatie\Crawler\CrawlObservers\CrawlObserver>,
        ...
    ])
    ->startCrawling($url);

Crawler::create()
    ->addCrawlObserver(<class that extends \Spatie\Crawler\CrawlObservers\CrawlObserver>)
    ->addCrawlObserver(<class that extends \Spatie\Crawler\CrawlObservers\CrawlObserver>)
    ->addCrawlObserver(<class that extends \Spatie\Crawler\CrawlObservers\CrawlObserver>)
    ->startCrawling($url);

Crawler::create()
    ->setConcurrency(1) // now all URLs will be crawled one by one
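
In context, the concurrency setting slots into the usual chain; a short sketch, reusing the hypothetical LoggingCrawlObserver from above:

use Spatie\Crawler\Crawler;

Crawler::create()
    ->setConcurrency(1) // one request at a time
    ->setCrawlObserver(new LoggingCrawlObserver()) // hypothetical observer sketched earlier
    ->startCrawling('https://example.com');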

$queue = <your selection/implementation of a queue>;

// Crawls 5 URLs and ends.
Crawler::create()
    ->setCrawlQueue($queue)
    ->setTotalCrawlLimit(5)
    ->startCrawling($url);

// Doesn't crawl further as the total limit is reached.
Crawler::create()
    ->setCrawlQueue($queue)
    ->setTotalCrawlLimit(5)
    ->startCrawling($url);
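
If you don't want to write a queue yourself, the package ships with an in-memory implementation that you can pass in explicitly; a minimal sketch, assuming the Spatie\Crawler\CrawlQueues\ArrayCrawlQueue class from recent package versions:

use Spatie\Crawler\Crawler;
use Spatie\Crawler\CrawlQueues\ArrayCrawlQueue;

// In-memory queue shipped with the package.
$queue = new ArrayCrawlQueue();

Crawler::create()
    ->setCrawlQueue($queue)
    ->setTotalCrawlLimit(5)
    ->startCrawling($url);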

$queue = <your selection/implementation of a queue>;

// Crawls 5 URLs and ends.
Crawler::create()
    ->setCrawlQueue($queue)
    ->setCurrentCrawlLimit(5)
    ->startCrawling($url);

// Crawls the next 5 URLs and ends.
Crawler::create()
    ->setCrawlQueue($queue)
    ->setCurrentCrawlLimit(5)
    ->startCrawling($url);

$queue = <your selection/implementation of a queue>;

// Crawls 5 URLs and ends.
Crawler::create()
    ->setCrawlQueue($queue)
    ->setTotalCrawlLimit(10)
    ->setCurrentCrawlLimit(5)
    ->startCrawling($url);

// Crawls the next 5 URLs and ends.
Crawler::create()
    ->setCrawlQueue($queue)
    ->setTotalCrawlLimit(10)
    ->setCurrentCrawlLimit(5)
    ->startCrawling($url);

// Doesn't crawl further as the total limit is reached.
Crawler::create()
    ->setCrawlQueue($queue)
    ->setTotalCrawlLimit(10)
    ->setCurrentCrawlLimit(5)
    ->startCrawling($url);

// Create a queue using your queue-driver.
$queue = <your selection/implementation of a queue>;

// Crawl the first set of URLs.
Crawler::create()
    ->setCrawlQueue($queue)
    ->setCurrentCrawlLimit(10)
    ->startCrawling($url);

// Serialize and store your queue.
$serializedQueue = serialize($queue);

// Unserialize the queue.
$queue = unserialize($serializedQueue);

// Crawls the next set of URLs.
Crawler::create()
    ->setCrawlQueue($queue)
    ->setCurrentCrawlLimit(10)
    ->startCrawling($url);

// Serialize and store your queue again.
$serializedQueue = serialize($queue);
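
How you store the serialized queue between runs is up to you; here is a minimal sketch using a local file, where the queue.dat filename and the file-based approach are illustrative, not part of the package:

use Spatie\Crawler\Crawler;

// Hypothetical file-based persistence; any storage backend works.
$queueFile = __DIR__ . '/queue.dat';

// Restore the queue from a previous run, or start fresh.
$queue = file_exists($queueFile)
    ? unserialize(file_get_contents($queueFile))
    : <your selection/implementation of a queue>;

Crawler::create()
    ->setCrawlQueue($queue)
    ->setCurrentCrawlLimit(10)
    ->startCrawling($url);

// Persist the queue for the next run.
file_put_contents($queueFile, serialize($queue));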

Crawler::create()
    ->setMaximumDepth(2)

// Let's use a 3 MB maximum.
Crawler::create()
    ->setMaximumResponseSize(1024 * 1024 * 3)

Crawler::create()
    ->setDelayBetweenRequests(150) // After every page crawled, the crawler will wait for 150ms
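
Putting several of these options together, a complete configuration might look like the sketch below; the URL and the LoggingCrawlObserver class are illustrative:

use Spatie\Crawler\Crawler;

Crawler::create()
    ->setCrawlObserver(new LoggingCrawlObserver()) // hypothetical observer sketched earlier
    ->setConcurrency(2)
    ->setMaximumDepth(2)
    ->setMaximumResponseSize(1024 * 1024 * 3) // 3 MB
    ->setDelayBetweenRequests(150) // 150 ms between requests
    ->startCrawling('https://example.com');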