1. Go to this page and download the library: Download scrapy/scrapy library. Choose the download type you require.
2. Extract the ZIP file and open the index.php.
3. Add this code to the index.php.
<?php
require_once('vendor/autoload.php');
/* Start to develop here. Best regards https://php-download.com/ */
scrapy / scrapy example snippets
use Scrapy\Builders\ScrapyBuilder;
// Build a scraper for a single URL, run it and keep the result in $html.
$html = ScrapyBuilder::make()
    ->url('https://www.some-url.com')
    ->build()
    ->scrape();
use Scrapy\Parsers\Parser;
use Scrapy\Crawlers\Crawly;
/**
 * Example parser that records the text of the selected `h1` element.
 */
class ImageParser extends Parser
{
    /**
     * Stores the string value of the `h1` selection under the `hello` key.
     *
     * @param Crawly $crawly Crawler the `h1` selection is made on.
     * @param array  $output Output array handed in by the caller.
     *
     * @return array The output array with the `hello` entry set.
     */
    public function process(Crawly $crawly, array $output): array
    {
        $heading = $crawly->filter('h1')->string();
        $output['hello'] = $heading;

        return $output;
    }
}
use Scrapy\Builders\ScrapyBuilder;
// Register a parser by its class name.
ScrapyBuilder::make()
    ->parser(ImageParser::class);

// Register an already constructed parser object.
ScrapyBuilder::make()
    ->parser(new ImageParser());

// Register several parsers in one call; class names and instances can be mixed.
ScrapyBuilder::make()
    ->parsers([ImageParser::class, new ImageParser()]);
use Scrapy\Crawlers\Crawly;
use Scrapy\Builders\ScrapyBuilder;
// A closure can be registered as a parser as well.
ScrapyBuilder::make()
    ->parser(function (Crawly $crawly, array $output) {
        // Record how many `li` elements the selection matched.
        $itemCount = $crawly->filter('li')->count();
        $output['count'] = $itemCount;

        return $output;
    });
// count(): number of nodes matched by the selection.
$crawly = new Crawly('<ul><li>1</li><li>2</li></ul>');
$crawly->filter('li')->count(); // 2
// int(): content of the selection as an integer.
$crawly = new Crawly('<span>123</span>');
$crawly->filter('span')->int(); // 123
// Use the given default if the selection is not numeric
$crawly = new Crawly('');
$crawly->filter('span')->int(55); // 55
// float(): content of the selection as a float.
$crawly = new Crawly('<span>18.5</span>');
$crawly->filter('span')->float(); // 18.5
// Use the given default if the selection is not numeric
$crawly = new Crawly('');
$crawly->filter('span')->float(22.4); // 22.4
// string(): text content of the selection.
$crawly = new Crawly('<span>Hello World!</span>');
$crawly->filter('span')->string(); // 'Hello World!'
// Use the given default in case an exception arises
$crawly = new Crawly('');
$crawly->filter('non-existing-selection')->string('Hello'); // 'Hello'
// html(): markup of the selection including the selected element itself.
$crawly = new Crawly('<span>Hello World!</span>');
$crawly->filter('span')->html(); // <span>Hello World!</span>
// Use the given default in case an exception arises
$crawly = new Crawly('');
$crawly->filter('non-existing-selection')->html('<div>Hi</div>'); // <div>Hi</div>
// innerHtml(): content inside the selected element.
$crawly = new Crawly('<span>Hello World!</span>');
$crawly->filter('span')->innerHtml(); // 'Hello World!'
// Use the given default to handle exceptional cases
$crawly = new Crawly('');
$crawly->filter('non-existing-selection')->innerHtml('<div>Hi</div>'); // 'Hi'
// exists(): whether the selection matched anything.
$crawly = new Crawly('<span>Hello World!</span>');
$crawly->filter('span')->exists(); // true
$crawly = new Crawly('');
$crawly->filter('non-existing-selection')->exists(); // false
$crawly->filter('non-existing-selection')->exists(true); // a ScrapeException is raised instead of false being returned
// map(): run a callback for every matched node and collect the returned values.
$crawly = new Crawly('<ul><li> Hello </li><li> World </li></ul>');
$crawly->filter('li')->map(function (Crawly $crawly, int $index) {
    $text = $crawly->trim()->string();

    return $text . ' - ' . $index;
}); // ['Hello - 0', 'World - 1']

// Passing a second argument limits how many nodes are mapped.
$crawly->filter('li')->map(function (Crawly $crawly, int $index) {
    $text = $crawly->trim()->string();

    return $text . ' - ' . $index;
}, 1); // ['Hello - 0']
// node(): access the underlying DOM node of the selection.
$crawly = new Crawly('<ul><li>1</li></ul>');
// Keep the result in its own variable so $crawly still refers to the Crawly
// instance and can be filtered again afterwards.
$node = $crawly->filter('li')->node(); // DOMNode representing '<li>1</li>' is returned
use Scrapy\Builders\ScrapyBuilder;
use Scrapy\Readers\UrlReader;
use Scrapy\Readers\FileReader;
// Supply a UrlReader that points at a URL.
ScrapyBuilder::make()
    ->reader(new UrlReader('https://www.some-url.com'));

// Supply a FileReader that points at a file path.
ScrapyBuilder::make()
    ->reader(new FileReader('path-to-file.html'));
use Scrapy\Readers\IReader;
/**
 * Example reader that returns a fixed HTML snippet.
 */
class CustomReader implements IReader
{
    /**
     * @return string The hard-coded markup.
     */
    public function read(): string
    {
        $markup = '<h1>Hello World!</h1>';

        return $markup;
    }
}
use Scrapy\Agents\IUserAgent;
use Scrapy\Readers\UrlReader;
/**
 * Example user agent that hands out URL readers with a custom configuration.
 */
class UserAgent implements IUserAgent
{
    /**
     * Creates a UrlReader for the given URL and applies the header configuration to it.
     *
     * @param string $url URL passed to the UrlReader constructor.
     *
     * @return UrlReader The configured reader.
     */
    public function reader(string $url): UrlReader
    {
        $urlReader = new UrlReader($url);

        // '...' is a placeholder for the actual header values.
        $config = ['headers' => ['...']];
        $urlReader->setConfig($config);

        return $urlReader;
    }
}
// Register the custom user agent on the builder.
ScrapyBuilder::make()
    ->agent(new UserAgent());
use Scrapy\Readers\UrlReader;
use Scrapy\Agents\GoogleAgent;
use Scrapy\Builders\ScrapyBuilder;
// When a reader is set explicitly, Youtube will be read without GoogleAgent
// and Facebook will be ignored.
ScrapyBuilder::make()
    ->url('https://www.facebook.com')
    ->agent(new GoogleAgent())
    ->reader(new UrlReader('https://www.youtube.com'));
use Scrapy\Builders\ScrapyBuilder;
use Scrapy\Exceptions\ScrapeException;
// Scraping can fail with a ScrapeException, so give $html a defined value up
// front; otherwise it would be undefined after a failed attempt.
$html = null;

try {
    $html = ScrapyBuilder::make()
        ->url('https://www.invalid-url.com')
        ->build()
        ->scrape();
} catch (ScrapeException $e) {
    // Report the failure instead of swallowing it silently; $html stays null.
    error_log('Scraping failed: ' . $e->getMessage());
}
Loading, please wait ...
Before you can download the PHP files, the dependencies must be resolved. This can take a few minutes. Please be patient.