PHP code examples for spatie/crawler

1. Install the library with Composer: run composer require spatie/crawler in your project directory.

2. Create an index.php file in that directory.

3. Add this code to index.php.

<?php

require_once 'vendor/autoload.php';

/* Start to develop here. */

spatie/crawler example snippets


use Spatie\Crawler\Crawler;

Crawler::create()
    ->setCrawlObserver(<class that extends \Spatie\Crawler\CrawlObservers\CrawlObserver>)
    ->startCrawling($url);

namespace Spatie\Crawler\CrawlObservers;

use GuzzleHttp\Exception\RequestException;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\UriInterface;

abstract class CrawlObserver
{
    /*
     * Called when the crawler will crawl the url.
     */
    public function willCrawl(UriInterface $url, ?string $linkText): void
    {
    }

    /*
     * Called when the crawler has crawled the given url successfully.
     */
    abstract public function crawled(
        UriInterface $url,
        ResponseInterface $response,
        ?UriInterface $foundOnUrl = null,
        ?string $linkText = null,
    ): void;

    /*
     * Called when the crawler had a problem crawling the given url.
     */
    abstract public function crawlFailed(
        UriInterface $url,
        RequestException $requestException,
        ?UriInterface $foundOnUrl = null,
        ?string $linkText = null,
    ): void;

    /**
     * Called when the crawl has ended.
     */
    public function finishedCrawling(): void
    {
    }
}
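
As an illustration, a minimal concrete observer could look like the sketch below; the class name LoggingCrawlObserver and the echo-based logging are placeholders, not part of the package.

use GuzzleHttp\Exception\RequestException;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\UriInterface;
use Spatie\Crawler\Crawler;
use Spatie\Crawler\CrawlObservers\CrawlObserver;

class LoggingCrawlObserver extends CrawlObserver
{
    /*
     * Log every successfully crawled url with its status code.
     */
    public function crawled(
        UriInterface $url,
        ResponseInterface $response,
        ?UriInterface $foundOnUrl = null,
        ?string $linkText = null,
    ): void {
        echo $response->getStatusCode() . ' - ' . (string) $url . PHP_EOL;
    }

    /*
     * Log every url that could not be crawled.
     */
    public function crawlFailed(
        UriInterface $url,
        RequestException $requestException,
        ?UriInterface $foundOnUrl = null,
        ?string $linkText = null,
    ): void {
        echo 'Failed: ' . (string) $url . ' - ' . $requestException->getMessage() . PHP_EOL;
    }
}

Crawler::create()
    ->setCrawlObserver(new LoggingCrawlObserver())
    ->startCrawling($url);

The same kind of instance can also be passed to setCrawlObservers or addCrawlObserver, as shown below.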

Crawler::create()
    ->setCrawlObservers([
        <class that extends \Spatie\Crawler\CrawlObservers\CrawlObserver>,
        <class that extends \Spatie\Crawler\CrawlObservers\CrawlObserver>,
        ...
     ])
    ->startCrawling($url);

Crawler::create()
    ->addCrawlObserver(<class that extends \Spatie\Crawler\CrawlObservers\CrawlObserver>)
    ->addCrawlObserver(<class that extends \Spatie\Crawler\CrawlObservers\CrawlObserver>)
    ->addCrawlObserver(<class that extends \Spatie\Crawler\CrawlObservers\CrawlObserver>)
    ->startCrawling($url);

Crawler::create()
    ->executeJavaScript()
    ...

Crawler::create()
    ->setBrowsershot($browsershot)
    ->executeJavaScript()
    ...

/*
 * Determine if the given url should be crawled.
 */
public function shouldCrawl(UriInterface $url): bool;
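
This shouldCrawl method belongs to a crawl profile. A minimal custom profile might look like the sketch below; the OnlyMyDomain class name is illustrative, and the setCrawlProfile call is an assumption based on the package's CrawlProfiles namespace.

use Psr\Http\Message\UriInterface;
use Spatie\Crawler\Crawler;
use Spatie\Crawler\CrawlProfiles\CrawlProfile;

class OnlyMyDomain extends CrawlProfile
{
    public function shouldCrawl(UriInterface $url): bool
    {
        // Only follow links that stay on this host.
        return $url->getHost() === 'example.com';
    }
}

Crawler::create()
    ->setCrawlProfile(new OnlyMyDomain())
    ->startCrawling('https://example.com');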

Crawler::create()
    ->setUrlParserClass(<class that implements \Spatie\Crawler\UrlParsers\UrlParser>::class)
    ...

Crawler::create()
    ->setUrlParserClass(SitemapUrlParser::class)
    ...

Crawler::create()
    ->ignoreRobots()
    ...

Crawler::create()
    ->acceptNofollowLinks()
    ...

Crawler::create()
    ->setUserAgent('my-agent')

Crawler::create()
    ->setConcurrency(1) // now all urls will be crawled one by one

$queue = <your selection/implementation of a queue>;

// Crawls 5 URLs and ends.
Crawler::create()
    ->setCrawlQueue($queue)
    ->setTotalCrawlLimit(5)
    ->startCrawling($url);

// Doesn't crawl further as the total limit is reached.
Crawler::create()
    ->setCrawlQueue($queue)
    ->setTotalCrawlLimit(5)
    ->startCrawling($url);

$queue = <your selection/implementation of a queue>;

// Crawls 5 URLs and ends.
Crawler::create()
    ->setCrawlQueue($queue)
    ->setCurrentCrawlLimit(5)
    ->startCrawling($url);

// Crawls the next 5 URLs and ends.
Crawler::create()
    ->setCrawlQueue($queue)
    ->setCurrentCrawlLimit(5)
    ->startCrawling($url);

$queue = <your selection/implementation of a queue>;

// Crawls 5 URLs and ends.
Crawler::create()
    ->setCrawlQueue($queue)
    ->setTotalCrawlLimit(10)
    ->setCurrentCrawlLimit(5)
    ->startCrawling($url);

// Crawls the next 5 URLs and ends.
Crawler::create()
    ->setCrawlQueue($queue)
    ->setTotalCrawlLimit(10)
    ->setCurrentCrawlLimit(5)
    ->startCrawling($url);

// Doesn't crawl further as the total limit is reached.
Crawler::create()
    ->setCrawlQueue($queue)
    ->setTotalCrawlLimit(10)
    ->setCurrentCrawlLimit(5)
    ->startCrawling($url);

// Create a queue using your queue-driver.
$queue = <your selection/implementation of a queue>;

// Crawl the first set of URLs
Crawler::create()
    ->setCrawlQueue($queue)
    ->setCurrentCrawlLimit(10)
    ->startCrawling($url);

// Serialize and store your queue
$serializedQueue = serialize($queue);

// Unserialize queue
$queue = unserialize($serializedQueue);

// Crawls the next set of URLs
Crawler::create()
    ->setCrawlQueue($queue)
    ->setCurrentCrawlLimit(10)
    ->startCrawling($url);

// Serialize and store your queue
$serializedQueue = serialize($queue);

Crawler::create()
    ->setMaximumDepth(2)

// let's use a 3 MB maximum.
Crawler::create()
    ->setMaximumResponseSize(1024 * 1024 * 3)

Crawler::create()
    ->setDelayBetweenRequests(150) // After every page crawled, the crawler will wait for 150ms

Crawler::create()
    ->setParseableMimeTypes(['text/html', 'text/plain'])

Crawler::create()
    ->setCrawlQueue(<implementation of \Spatie\Crawler\CrawlQueues\CrawlQueue>)
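
For reference, the sketch below passes the package's in-memory queue explicitly; treat the ArrayCrawlQueue class name as an assumption and check the package's CrawlQueues namespace for the queue implementations that ship with it.

use Spatie\Crawler\Crawler;
use Spatie\Crawler\CrawlQueues\ArrayCrawlQueue;

Crawler::create()
    ->setCrawlQueue(new ArrayCrawlQueue())
    ->startCrawling($url);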

Crawler::create()
    ->setDefaultScheme('https')