PHP code example of scrapy / scrapy

1. Go to this page and download the scrapy/scrapy library, choosing the download type you require.

2. Extract the ZIP file and open the index.php.

3. Add this code to the index.php.

    
        
<?php
require_once('vendor/autoload.php');

/* Start to develop here. Best regards https://php-download.com/ */

scrapy / scrapy example snippets


    use Scrapy\Builders\ScrapyBuilder;

    // Configure a scraper for the target URL, then run it; the result ends up in $html.
    $scraper = ScrapyBuilder::make()
        ->url('https://www.some-url.com')
        ->build();

    $html = $scraper->scrape();


    use Scrapy\Parsers\Parser;
    use Scrapy\Crawlers\Crawly;

    /**
     * Example parser: stores the string value of the `h1` selection
     * in the output array under the 'hello' key.
     */
    class ImageParser extends Parser
    {
        /**
         * @param Crawly $crawly Crawler the `h1` selection is made on.
         * @param array  $output Output array handed in by the caller.
         *
         * @return array The given output with the 'hello' entry set.
         */
        public function process(Crawly $crawly, array $output): array
        {
            $heading = $crawly->filter('h1')->string();
            $output['hello'] = $heading;

            return $output;
        }
    }


    use Scrapy\Builders\ScrapyBuilder;

    // Register a parser by its class name
    ScrapyBuilder::make()->parser(ImageParser::class);

    // Register an already constructed parser object
    ScrapyBuilder::make()->parser(new ImageParser());

    // Register several parsers in one call; class names and instances can be mixed
    $parsers = [ImageParser::class, new ImageParser()];
    ScrapyBuilder::make()->parsers($parsers);


    use Scrapy\Crawlers\Crawly;
    use Scrapy\Builders\ScrapyBuilder;

    // Inline parser: a closure that records the count of the `li` selection.
    $countListItems = function (Crawly $crawly, array $output) {
        $listItemCount = $crawly->filter('li')->count();
        $output['count'] = $listItemCount;

        return $output;
    };

    ScrapyBuilder::make()->parser($countListItems);


    use Scrapy\Crawlers\Crawly;
    use Scrapy\Builders\ScrapyBuilder;

    // Parameters registered through params() are read inside the parser via param()/has().
    ScrapyBuilder::make()
        ->params(['foo' => 'bar'])
        ->parser(function (Crawly $crawly, array $output) {
            // NOTE(review): `$this` is used inside the closure, so the library presumably
            // binds it to an object exposing param()/has() — confirm against the library.
            $output['foo'] = $this->param('foo'); // 'bar'
            $output['baz'] = $this->has('baz');   // false
            $output['bar'] = $this->param('baz'); // null

            // $output is received by value, so it has to be returned (as the other
            // parser examples do); otherwise the closure yields null and the
            // entries set above are lost.
            return $output;
        });


    use Scrapy\Crawlers\Crawly;

    // A Crawly instance is created directly from an HTML string.
    $markupCrawly = new Crawly('<ul><li>Hello World!</li></ul>');
    $plainTextCrawly = new Crawly('Hello World!');

    $markupCrawly->html();    // '<ul><li>Hello World!</li></ul>'
    $plainTextCrawly->html(); // '<body>Hello World!</body>'


    // filter(): select by a selector string
    $crawly = new Crawly('<ul><li>Hello World!</li></ul>');
    $item = $crawly->filter('li');

    $item->html(); // <li>Hello World!</li>


    // first(): the first element of the selection
    $crawly = new Crawly('<ul><li>Hello</li><li>World!</li></ul>');
    $items = $crawly->filter('li');

    $items->first()->html(); // <li>Hello</li>


    // nth(): element at the given index; index 1 is the second element here
    $crawly = new Crawly('<ul><li>Hello</li><li>World!</li></ul>');
    $items = $crawly->filter('li');

    $items->nth(1)->html(); // <li>World!</li>


    // raw(): chained between first() and html()
    $crawly = new Crawly('<ul><li>Hello</li><li>World!</li></ul>');
    $firstItem = $crawly->filter('li')->first();

    $firstItem->raw()->html(); // Hello


    // trim(): the surrounding whitespace is gone from the string result
    $crawly = new Crawly('<div><span>    Hello!     </span></div>');
    $span = $crawly->filter('span');

    $span->trim()->string(); // 'Hello!'


    // pluck() with a single attribute name: one value per selected element
    $crawly = new Crawly('<ul><li attr="1">1</li><li attr="2">2</li></ul>');
    $attributeNames = ['attr'];
    $crawly->filter('li')->pluck($attributeNames); // ["1","2"]

    // pluck() with several attribute names: one list of values per selected element
    $crawly = new Crawly('<img width="200" height="300"></img><img width="400" height="500"></img>');
    $attributeNames = ['width', 'height'];
    $crawly->filter('img')->pluck($attributeNames); // [ ["200", "300"], ["400", "500"] ]


    // count(): number of selected elements
    $crawly = new Crawly('<ul><li>1</li><li>2</li></ul>');
    $items = $crawly->filter('li');

    $items->count(); // 2


    // int(): the selection read as an integer
    $crawly = new Crawly('<span>123</span>');
    $span = $crawly->filter('span');
    $span->int(); // 123

    // Use default if selection is not numeric
    $crawly = new Crawly('');
    $span = $crawly->filter('span');
    $span->int(55); // 55


    // float(): the selection read as a float
    $crawly = new Crawly('<span>18.5</span>');
    $span = $crawly->filter('span');
    $span->float(); // 18.5

    // Use default if selection is not numeric
    $crawly = new Crawly('');
    $span = $crawly->filter('span');
    $span->float(22.4); // 22.4


    // string(): the selection as a string
    $crawly = new Crawly('<span>Hello World!</span>');
    $span = $crawly->filter('span');
    $span->string(); // 'Hello World!'

    // Use default in case exception arises
    $crawly = new Crawly('');
    $missing = $crawly->filter('non-existing-selection');
    $missing->string('Hello'); // 'Hello'


    // html(): the selection including its own tag
    $crawly = new Crawly('<span>Hello World!</span>');
    $span = $crawly->filter('span');
    $span->html(); // <span>Hello World!</span>

    // Use default in case exception arises
    $crawly = new Crawly('');
    $missing = $crawly->filter('non-existing-selection');
    $missing->html('<div>Hi</div>'); // <div>Hi</div>


    // innerHtml(): the selection without its own tag
    $crawly = new Crawly('<span>Hello World!</span>');
    $span = $crawly->filter('span');
    $span->innerHtml(); // 'Hello World!'

    // Use default to handle exceptional cases
    $crawly = new Crawly('');
    $missing = $crawly->filter('non-existing-selection');
    $missing->innerHtml('<div>Hi</div>'); // 'Hi'


    // exists(): whether the selector matched anything
    $crawly = new Crawly('<span>Hello World!</span>');
    $span = $crawly->filter('span');
    $span->exists(); // true

    // Same check on an empty document, without and with the `true` argument
    $crawly = new Crawly('');
    $selector = 'non-existing-selection';
    $crawly->filter($selector)->exists();     // false
    $crawly->filter($selector)->exists(true); // new ScrapeException(...)


    $crawly = new Crawly('<ul><li>1</li></ul>');

    // Do not assign the result of html() back to $crawly: that would replace the
    // Crawly object with the extracted markup and break the reset() call below.
    $crawly->filter('li')->html(); // <li>1</li>

    // reset(): html() reports the whole document again
    $crawly->reset()->html(); // <ul><li>1</li></ul>


    $crawly = new Crawly('<ul><li>    Hello  </li><li>  World  </li></ul>');

    // Callback applied to every selected element: trimmed text followed by the element's index.
    $labelWithIndex = function (Crawly $crawly, int $index) {
        $text = $crawly->trim()->string();

        return $text . ' - ' . $index;
    };

    $crawly->filter('li')->map($labelWithIndex); // ['Hello - 0', 'World - 1']

    // limit the map function
    $crawly->filter('li')->map($labelWithIndex, 1); // ['Hello - 0']


    $crawly = new Crawly('<ul><li>1</li></ul>');

    // Store the node under its own name instead of overwriting the Crawly object,
    // so $crawly remains usable afterwards.
    $node = $crawly->filter('li')->node(); // DOMNode representing '<li>1</li>' is returned


    use Scrapy\Builders\ScrapyBuilder;
    use Scrapy\Readers\UrlReader;
    use Scrapy\Readers\FileReader;

    // Reader constructed from a URL
    $urlReader = new UrlReader('https://www.some-url.com');
    ScrapyBuilder::make()->reader($urlReader);

    // Reader constructed from a file path
    $fileReader = new FileReader('path-to-file.html');
    ScrapyBuilder::make()->reader($fileReader);


    use Scrapy\Readers\IReader;

    /**
     * Example reader that always returns the same hard-coded HTML.
     */
    class CustomReader implements IReader
    {
        /**
         * @return string A fixed `<h1>` snippet.
         */
        public function read(): string
        {
            $markup = '<h1>Hello World!</h1>';

            return $markup;
        }
    }


    // Pass a CustomReader instance to the builder's reader() method.
    $reader = new CustomReader();
    ScrapyBuilder::make()->reader($reader);


    // Googlebot
    ScrapyBuilder::make()->agent(new GoogleAgent());

    // Googlebot
    ScrapyBuilder::make()->agent(new GoogleChromeAgent(81, 0, 4043, 0));

    // Bing
    ScrapyBuilder::make()->agent(new BingUserAgent());

    // Yahoo
    ScrapyBuilder::make()->agent(new YahooUserAgent());

    // Duck
    ScrapyBuilder::make()->agent(new DuckUserAgent());


    use Scrapy\Agents\IUserAgent;
    use Scrapy\Readers\UrlReader;

    /**
     * Example custom agent: builds a UrlReader for the requested URL and
     * applies a custom configuration to it.
     */
    class UserAgent implements IUserAgent
    {
        /**
         * @param string $url URL the reader is created for.
         *
         * @return UrlReader Reader with the 'headers' configuration applied.
         */
        public function reader(string $url): UrlReader
        {
            // '...' is a placeholder for the header values to send.
            $config = ['headers' => ['...']];

            $urlReader = new UrlReader($url);
            $urlReader->setConfig($config);

            return $urlReader;
        }
    }


    // Pass a UserAgent instance to the builder's agent() method.
    $agent = new UserAgent();
    ScrapyBuilder::make()->agent($agent);


    use Scrapy\Readers\UrlReader;
    use Scrapy\Agents\GoogleAgent;
    use Scrapy\Builders\ScrapyBuilder;

    // Youtube will be read without GoogleAgent, Facebook will be ignored.
    $explicitReader = new UrlReader('https://www.youtube.com');

    ScrapyBuilder::make()
        ->url('https://www.facebook.com')
        ->agent(new GoogleAgent())
        ->reader($explicitReader);


        use Scrapy\Builders\ScrapyBuilder;
        use Scrapy\Exceptions\ScrapeException;
    
        // Build and run the scraper inside a try block; ScrapeException is caught below.
        try {
            $scraper = ScrapyBuilder::make()
                ->url('https://www.invalid-url.com')
                ->build();

            $html = $scraper->scrape();
        } catch (ScrapeException $e) {
            // Left empty in this example: handle the failed scrape here.
        }