1. Go to this page and download the terminal42/escargot library, choosing the download type "require" (alternatively, install it with Composer as noted right after the starter code below).
2. Extract the ZIP file and open index.php.
3. Add the following code to index.php:
<?php
require_once('vendor/autoload.php');
/* Start developing here. */
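
If you prefer to let Composer resolve and install the dependency for you (assuming Composer is available on your system), the same package can be pulled in with a single command instead of downloading the ZIP:

composer require terminal42/escargot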
terminal42 / escargot example snippets

use Nyholm\Psr7\Uri;
use Terminal42\Escargot\BaseUriCollection;
use Terminal42\Escargot\Escargot;
use Terminal42\Escargot\Queue\InMemoryQueue;

// Create a new crawl job from a collection of base URIs and an in-memory queue
$baseUris = new BaseUriCollection();
$baseUris->add(new Uri('https://www.terminal42.ch'));

$queue = new InMemoryQueue();

$escargot = Escargot::create($baseUris, $queue);

use Terminal42\Escargot\Escargot;
use Terminal42\Escargot\Queue\InMemoryQueue;

// Resume an existing job instead of creating a new one; $jobId is assumed to hold
// the ID of a job stored from a previous run
$queue = new InMemoryQueue();

$escargot = Escargot::createFromJobId($jobId, $queue);

// Register your own subscribers (see the examples below) before starting the crawl
$escargot->addSubscriber(new MySubscriber());

// Start the crawl process
$escargot->crawl();

use Terminal42\Escargot\Subscriber\HtmlCrawlerSubscriber;
use Terminal42\Escargot\Subscriber\RobotsSubscriber;

// Built-in subscribers: RobotsSubscriber handles robots.txt information and
// HtmlCrawlerSubscriber extracts and tags links found in HTML responses
$escargot->addSubscriber(new RobotsSubscriber());
$escargot->addSubscriber(new HtmlCrawlerSubscriber());
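
Putting the pieces above together, a minimal end-to-end run could look like the following sketch. The base URI is a placeholder; everything else only uses calls shown in the snippets above.

use Nyholm\Psr7\Uri;
use Terminal42\Escargot\BaseUriCollection;
use Terminal42\Escargot\Escargot;
use Terminal42\Escargot\Queue\InMemoryQueue;
use Terminal42\Escargot\Subscriber\HtmlCrawlerSubscriber;
use Terminal42\Escargot\Subscriber\RobotsSubscriber;

$baseUris = new BaseUriCollection();
$baseUris->add(new Uri('https://www.example.com')); // placeholder: your own site

$escargot = Escargot::create($baseUris, new InMemoryQueue());
$escargot->addSubscriber(new RobotsSubscriber());
$escargot->addSubscriber(new HtmlCrawlerSubscriber());

$escargot->crawl();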

// Example: a custom subscriber that decides which URIs to request and processes the responses
use Symfony\Contracts\HttpClient\ChunkInterface;
use Symfony\Contracts\HttpClient\ResponseInterface;
use Terminal42\Escargot\CrawlUri;
use Terminal42\Escargot\EscargotAwareInterface;
use Terminal42\Escargot\EscargotAwareTrait;
use Terminal42\Escargot\Subscriber\HtmlCrawlerSubscriber;
use Terminal42\Escargot\Subscriber\RobotsSubscriber;
use Terminal42\Escargot\Subscriber\SubscriberInterface;
use Terminal42\Escargot\Subscriber\Util;

class MyWebCrawler implements SubscriberInterface, EscargotAwareInterface
{
    use EscargotAwareTrait;

    public function shouldRequest(CrawlUri $crawlUri): string
    {
        // Check the URI this one was found on to see if it carried nofollow information
        if (null !== $crawlUri->getFoundOn() && ($originalCrawlUri = $this->getEscargot()->getCrawlUri($crawlUri->getFoundOn()))) {
            if ($originalCrawlUri->hasTag(RobotsSubscriber::TAG_NOFOLLOW)) {
                return SubscriberInterface::DECISION_NEGATIVE;
            }
        }

        // Skip links that were disallowed by the robots.txt
        if ($crawlUri->hasTag(RobotsSubscriber::TAG_DISALLOWED_ROBOTS_TXT)) {
            return SubscriberInterface::DECISION_NEGATIVE;
        }

        // Skip rel="nofollow" links
        if ($crawlUri->hasTag(HtmlCrawlerSubscriber::TAG_REL_NOFOLLOW)) {
            return SubscriberInterface::DECISION_NEGATIVE;
        }

        // Hint: all of the above checks are typical for HTML crawlers, so there's a helper
        // that combines them:
        if (!Util::isAllowedToFollow($crawlUri, $this->getEscargot())) {
            return SubscriberInterface::DECISION_NEGATIVE;
        }

        // Skip links whose "type" attribute is set to something other than text/html
        if ($crawlUri->hasTag(HtmlCrawlerSubscriber::TAG_NO_TEXT_HTML_TYPE)) {
            return SubscriberInterface::DECISION_NEGATIVE;
        }

        // Skip links that do not belong to our BaseUriCollection
        if (!$this->getEscargot()->getBaseUris()->containsHost($crawlUri->getUri()->getHost())) {
            return SubscriberInterface::DECISION_NEGATIVE;
        }

        return SubscriberInterface::DECISION_POSITIVE;
    }

    public function needsContent(CrawlUri $crawlUri, ResponseInterface $response, ChunkInterface $chunk): string
    {
        // Only ask for the response body of successful text/html responses
        return 200 === $response->getStatusCode() && Util::isOfContentType($response, 'text/html')
            ? SubscriberInterface::DECISION_POSITIVE
            : SubscriberInterface::DECISION_NEGATIVE;
    }

    public function onLastChunk(CrawlUri $crawlUri, ResponseInterface $response, ChunkInterface $chunk): void
    {
        // Do something with the data
    }
}
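
To use this subscriber, register it on the Escargot instance like the built-in ones. A short sketch, assuming the imports and the $escargot instance from the snippets above; RobotsSubscriber and HtmlCrawlerSubscriber are registered as well, since the tags MyWebCrawler checks for come from those two subscribers:

$escargot->addSubscriber(new RobotsSubscriber());
$escargot->addSubscriber(new HtmlCrawlerSubscriber());
$escargot->addSubscriber(new MyWebCrawler());

$escargot->crawl();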

// Example: logging from a subscriber via the PSR-3 LoggerAwareInterface
use Terminal42\Escargot\CrawlUri;
use Terminal42\Escargot\Subscriber\SubscriberInterface;
use Psr\Log\LoggerAwareInterface;
use Psr\Log\LoggerAwareTrait;
use Psr\Log\LogLevel;

class MyWebCrawler implements SubscriberInterface, LoggerAwareInterface
{
    use LoggerAwareTrait;

    public function shouldRequest(CrawlUri $crawlUri): string
    {
        if (null !== $this->logger) {
            $this->logger->log(LogLevel::DEBUG, 'My log message');
        }

        // ...decide as in the previous example
        return SubscriberInterface::DECISION_POSITIVE;
    }
}

// The same idea using SubscriberLoggerTrait, which wraps the null check for you
use Terminal42\Escargot\CrawlUri;
use Terminal42\Escargot\SubscriberLoggerTrait;
use Terminal42\Escargot\Subscriber\SubscriberInterface;
use Psr\Log\LoggerAwareInterface;
use Psr\Log\LoggerAwareTrait;
use Psr\Log\LogLevel;

class MyWebCrawler implements SubscriberInterface, LoggerAwareInterface
{
    use LoggerAwareTrait;
    use SubscriberLoggerTrait;

    public function shouldRequest(CrawlUri $crawlUri): string
    {
        // No need to check $this->logger for null, the trait handles that
        $this->logWithCrawlUri($crawlUri, LogLevel::DEBUG, 'My log message');

        // ...decide as in the previous example
        return SubscriberInterface::DECISION_POSITIVE;
    }
}
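
Both variants receive their logger through the setLogger() method provided by Psr\Log\LoggerAwareTrait. A minimal sketch that wires one up manually, using the psr/log NullLogger as a stand-in for whatever PSR-3 logger you actually use:

use Psr\Log\NullLogger;

$subscriber = new MyWebCrawler();
$subscriber->setLogger(new NullLogger()); // swap in any PSR-3 logger

$escargot->addSubscriber($subscriber);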