PHP code example of terminal42 / escargot

1. Go to the download page and download the terminal42/escargot library. Choose the download type "require".

2. Extract the ZIP file and open the index.php.

3. Add this code to the index.php.
    
        
<?php
// Load Composer's autoloader relative to this file, so the include does not
// depend on the current working directory or the configured include_path.
require_once __DIR__ . '/vendor/autoload.php';

/* Start to develop here. Best regards https://php-download.com/ */

    

terminal42 / escargot example snippets




use Nyholm\Psr7\Uri;
use Terminal42\Escargot\BaseUriCollection;
use Terminal42\Escargot\Escargot;
use Terminal42\Escargot\Queue\InMemoryQueue;

// Queue instance handed to the crawler.
$queue = new InMemoryQueue();

// Collection of base URIs, seeded with a single entry.
$baseUris = new BaseUriCollection();
$baseUris->add(new Uri('https://www.terminal42.ch'));

// Build the crawler instance from the base URIs and the queue.
$escargot = Escargot::create($baseUris, $queue);



use Symfony\Component\HttpClient\CurlHttpClient;
use Terminal42\Escargot\Escargot;
use Terminal42\Escargot\Queue\InMemoryQueue;

// Re-create a crawler instance from an existing job ID.
// $jobId is not defined in this snippet; it has to be provided by the caller.
// NOTE(review): a freshly created InMemoryQueue presumably does not know any
// earlier job, so resuming likely needs a persistent queue - confirm.
$queue = new InMemoryQueue();
        
$escargot = Escargot::createFromJobId($jobId, $queue);



// Run the crawl on the previously created instance.
$escargot->crawl();



// Register a custom subscriber. MySubscriber is a placeholder for your own
// class; it is not defined in this snippet.
$escargot->addSubscriber(new MySubscriber());



use Terminal42\Escargot\Subscriber\HtmlCrawlerSubscriber;
use Terminal42\Escargot\Subscriber\RobotsSubscriber;

// Register the two subscribers imported above on the crawler instance.
$escargot->addSubscriber(new RobotsSubscriber());
$escargot->addSubscriber(new HtmlCrawlerSubscriber());



use Symfony\Contracts\HttpClient\ChunkInterface;
use Symfony\Contracts\HttpClient\ResponseInterface;
use Terminal42\Escargot\CrawlUri;
use Terminal42\Escargot\EscargotAwareInterface;
use Terminal42\Escargot\EscargotAwareTrait;
use Terminal42\Escargot\Subscriber\HtmlCrawlerSubscriber;
use Terminal42\Escargot\Subscriber\RobotsSubscriber;
use Terminal42\Escargot\Subscriber\SubscriberInterface;
use Terminal42\Escargot\Subscriber\Util;

/**
 * Example subscriber that only follows links an HTML crawler is allowed to
 * follow and that point to a host of the configured base URI collection.
 */
class MyWebCrawler implements SubscriberInterface, EscargotAwareInterface
{
    use EscargotAwareTrait;

    /**
     * Decides whether the given URI should be requested.
     *
     * @param CrawlUri $crawlUri The URI that is about to be requested.
     *
     * @return string SubscriberInterface::DECISION_NEGATIVE if any of the checks
     *                below rejects the URI, SubscriberInterface::DECISION_POSITIVE otherwise.
     */
    public function shouldRequest(CrawlUri $crawlUri): string
    {
        // Check the original crawlUri to see if that one contained nofollow information
        if (null !== $crawlUri->getFoundOn() && ($originalCrawlUri = $this->getEscargot()->getCrawlUri($crawlUri->getFoundOn()))) {
            if ($originalCrawlUri->hasTag(RobotsSubscriber::TAG_NOFOLLOW)) {
                return SubscriberInterface::DECISION_NEGATIVE;
            }
        }

        // Skip links that were disallowed by the robots.txt
        if ($crawlUri->hasTag(RobotsSubscriber::TAG_DISALLOWED_ROBOTS_TXT)) {
            return SubscriberInterface::DECISION_NEGATIVE;
        }

        // Skip rel="nofollow" links
        if ($crawlUri->hasTag(HtmlCrawlerSubscriber::TAG_REL_NOFOLLOW)) {
            return SubscriberInterface::DECISION_NEGATIVE;
        }

        // Hint: All of the above are typical for HTML crawlers, so there's a helper for you
        // to simplify this:
        if (!Util::isAllowedToFollow($crawlUri, $this->getEscargot())) {
            return SubscriberInterface::DECISION_NEGATIVE;
        }

        // Skip the links that have the "type" attribute set and it's not text/html
        if ($crawlUri->hasTag(HtmlCrawlerSubscriber::TAG_NO_TEXT_HTML_TYPE)) {
            return SubscriberInterface::DECISION_NEGATIVE;
        }

        // Skip links that do not belong to our BaseUriCollection.
        // Uses the same getEscargot() accessor as the checks above.
        if (!$this->getEscargot()->getBaseUris()->containsHost($crawlUri->getUri()->getHost())) {
            return SubscriberInterface::DECISION_NEGATIVE;
        }

        return SubscriberInterface::DECISION_POSITIVE;
    }

    /**
     * Decides whether the response body is needed.
     *
     * @param CrawlUri          $crawlUri The URI that was requested.
     * @param ResponseInterface $response The response received for it.
     * @param ChunkInterface    $chunk    The current response chunk (not inspected here).
     *
     * @return string SubscriberInterface::DECISION_POSITIVE only for responses with
     *                status code 200 and content type text/html,
     *                SubscriberInterface::DECISION_NEGATIVE otherwise.
     */
    public function needsContent(CrawlUri $crawlUri, ResponseInterface $response, ChunkInterface $chunk): string
    {
        $isHtmlSuccess = 200 === $response->getStatusCode() && Util::isOfContentType($response, 'text/html');

        return $isHtmlSuccess ? SubscriberInterface::DECISION_POSITIVE : SubscriberInterface::DECISION_NEGATIVE;
    }

    /**
     * Placeholder for processing the received data; intentionally empty in this example.
     *
     * @param CrawlUri          $crawlUri The URI that was requested.
     * @param ResponseInterface $response The response received for it.
     * @param ChunkInterface    $chunk    The chunk passed to this callback.
     */
    public function onLastChunk(CrawlUri $crawlUri, ResponseInterface $response, ChunkInterface $chunk): void
    {
        // Do something with the data
    }
}



use Terminal42\Escargot\CrawlUri;
use Terminal42\Escargot\Subscriber\SubscriberInterface;
use Psr\Log\LoggerAwareInterface;
use Psr\Log\LoggerAwareTrait;
use Psr\Log\LogLevel;

/**
 * Example subscriber that writes a debug message through an injected logger.
 *
 * This snippet shows shouldRequest() only; any other methods SubscriberInterface
 * declares are not part of the example.
 */
class MyWebCrawler implements SubscriberInterface, LoggerAwareInterface
{
    use LoggerAwareTrait;

    /**
     * Logs a debug message (if a logger has been set) and abstains from the decision.
     *
     * @param CrawlUri $crawlUri The URI that is about to be requested.
     *
     * @return string SubscriberInterface::DECISION_ABSTAIN, as this example only logs.
     */
    public function shouldRequest(CrawlUri $crawlUri): string
    {
        // The logger is optional, so guard against it not being set.
        if (null !== $this->logger) {
            $this->logger->log(LogLevel::DEBUG, 'My log message');
        }

        // The declared string return type requires an explicit decision;
        // falling off the end of the method would raise a TypeError.
        return SubscriberInterface::DECISION_ABSTAIN;
    }
}



use Terminal42\Escargot\CrawlUri;
use Terminal42\Escargot\SubscriberLoggerTrait;
use Terminal42\Escargot\Subscriber\SubscriberInterface;
use Psr\Log\LoggerAwareInterface;
use Psr\Log\LoggerAwareTrait;
use Psr\Log\LogLevel;

/**
 * Example subscriber that logs through the logWithCrawlUri() helper made
 * available by SubscriberLoggerTrait.
 *
 * This snippet shows shouldRequest() only; any other methods SubscriberInterface
 * declares are not part of the example.
 */
class MyWebCrawler implements SubscriberInterface, LoggerAwareInterface
{
    use LoggerAwareTrait;
    use SubscriberLoggerTrait;

    /**
     * Logs a debug message for the given URI and abstains from the decision.
     *
     * @param CrawlUri $crawlUri The URI that is about to be requested.
     *
     * @return string SubscriberInterface::DECISION_ABSTAIN, as this example only logs.
     */
    public function shouldRequest(CrawlUri $crawlUri): string
    {
        // No need to check for $this->logger being null, this is handled by the trait
        $this->logWithCrawlUri($crawlUri, LogLevel::DEBUG, 'My log message');

        // The declared string return type requires an explicit decision;
        // falling off the end of the method would raise a TypeError.
        return SubscriberInterface::DECISION_ABSTAIN;
    }
}