1. Go to this page and download the library: Download unique/scraper library. Choose the download type "require".
2. Extract the ZIP file and open index.php.
3. Add this code to index.php:
<?php
require_once('vendor/autoload.php');
/* Start to develop here. Best regards https://php-download.com/ */
unique/scraper example snippets
class SiteItem implements \unique\scraper\interfaces\SiteItemInterface {

    protected $id;
    protected $url;
    protected $title;

    // @todo: implement setters and getters for $id, $url and $title (see the sketch below).
}
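If you want a starting point for those accessors, here is a minimal sketch. The exact method names that SiteItemInterface requires are an assumption here, so check the interface in vendor/unique/scraper before relying on them:

// Hypothetical accessors for the SiteItem above; adjust the names to whatever
// SiteItemInterface actually declares in your installed version of the package.
public function setId( $id ) { $this->id = $id; }
public function getId() { return $this->id; }

public function setUrl( string $url ) { $this->url = $url; }
public function getUrl(): string { return $this->url; }

public function setTitle( string $title ) { $this->title = $title; }
public function getTitle(): string { return $this->title; }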
class ItemListDownloader extends \unique\scraper\AbstractItemListDownloader {

    protected function getNumberOfItemsInPage( \Symfony\Component\DomCrawler\Crawler $doc ): ?int {
        // Or we could implement some logic of checking the website for the actual number.
        return 20;
    }

    protected function hasNextPage( \Symfony\Component\DomCrawler\Crawler $doc, int $current_page_num ): bool {
        // We could implement some logic of checking the page's paginator,
        // or we can just return true and let the scraper go through all of the listing
        // pages until it finds one that has no items in it. It will then stop automatically.
        return true;
    }

    function getListUrl( ?int $page_num ): string {
        return 'https://some.website.here/?page_num=' . $page_num;
    }

    function getTotalItems( \Symfony\Component\DomCrawler\Crawler $doc ): ?int {
        // If possible, we could return the total number of items (across all of the listing pages).
        return null;
    }

    function getItems( \Symfony\Component\DomCrawler\Crawler $doc ): iterable {
        // We define a selector where each matched node is a single ad.
        // The scraper will iterate over these items and process all of them.
        // It doesn't need to be an <a> tag; you define your own logic of how to get
        // to the actual item page.
        return $doc->filter( 'a.ad-item' );
    }

    function getItemUrl( \DOMElement $item ): ?string {
        // Here, $item is one of the nodes from the getItems() method;
        // we analyze it and return the url for scraping the item itself.
        return $item->getAttribute( 'href' );
    }

    function getItemId( string $url, \DOMElement $item ): string {
        // We return some string by which we can uniquely identify the ad.
        // This can later be used to skip ads that we already have in the DB, for example.
        return $item->getAttribute( 'data-id' );
    }

    function getItemDownloader( string $url, string $id ): ?\unique\scraper\AbstractItemDownloader {
        return new ItemDownloader( 'https://some.website.here/' . $url, $id, $this, new SiteItem() );
    }
}
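The two hard-coded returns above (getNumberOfItemsInPage() and hasNextPage()) can be replaced with real checks against the page. A rough sketch, reusing the same 'a.ad-item' selector and assuming a hypothetical '.pagination .next' link in the paginator:

protected function getNumberOfItemsInPage( \Symfony\Component\DomCrawler\Crawler $doc ): ?int {
    // Count the ad nodes that are actually present in this listing page.
    return $doc->filter( 'a.ad-item' )->count();
}

protected function hasNextPage( \Symfony\Component\DomCrawler\Crawler $doc, int $current_page_num ): bool {
    // Stop as soon as the paginator no longer offers a "next" link.
    return $doc->filter( '.pagination .next' )->count() > 0;
}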
class ItemDownloader extends \unique\scraper\AbstractItemDownloader {

    protected function assignItemData( \Symfony\Component\DomCrawler\Crawler $doc ) {
        // We set all the attributes we need for our custom SiteItem object,
        // which can be accessed by the $this->item attribute.
        $this->item->setTitle( $doc->filter( 'h1' )->text() );
    }
}
Alternatively, if the item pages serve JSON instead of HTML, extend AbstractJsonItemDownloader (define only one of the two ItemDownloader classes, not both):

class ItemDownloader extends \unique\scraper\AbstractJsonItemDownloader {

    protected function assignItemData( array $json ) {
        // We set all the attributes we need for our custom SiteItem object,
        // which can be accessed by the $this->item attribute.
        $this->item->setTitle( $json['title'] );
    }
}
class ScraperController implements \unique\scraper\interfaces\ConsoleInterface {

    // @todo implement stdOut() and stdErr() methods for logging (see the sketch below).

    public function actionRun() {
        $transport = new \GuzzleHttp\Client();
        $log_container = new LogContainerConsole( $this );
        $downloader = new ItemListDownloader( SiteItem::class, $transport, $log_container );

        $downloader->on( \unique\scraper\AbstractItemListDownloader::EVENT_ON_ITEM_END, function ( \unique\scraper\events\ItemEndEvent $event ) {
            if ( $event->site_item ) {
                // save() is not part of the SiteItem shown above - it assumes your
                // item class knows how to persist itself (e.g. an ActiveRecord model).
                $event->site_item->save();
            }
        } );

        $downloader->scrape();
    }
}
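A minimal way to satisfy the @todo above, assuming ConsoleInterface only expects the two methods to take a string and write it out (check the interface in the package for the exact signatures):

// Hypothetical implementations of the logging methods required by ConsoleInterface.
public function stdOut( string $text ) {
    // Plain console output for progress messages.
    echo $text;
}

public function stdErr( string $text ) {
    // Route errors to STDERR so they can be separated from normal output.
    fwrite( STDERR, $text );
}

With these in place, running the scraper is just a matter of instantiating the controller and calling the action, for example: ( new ScraperController() )->actionRun();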