PHP code example of coderden / page-parser

1. Go to the download page and download the coderden/page-parser library. Choose the download type "require".

2. Extract the ZIP file and open index.php.

3. Add this code to index.php:
    
        
<?php
require_once('vendor/autoload.php');

/* Start to develop here. Best regards https://php-download.com/ */

    

coderden / page-parser example snippets


use CoderDen\PageParser\PageParser;

// Build a parser without any options, then fetch the page to work on.
$parser = new PageParser();
$parser->loadPage('https://example.com');

// Print the title of the loaded page.
echo $parser->getTitle();

// Collect the links of the loaded page.
$links = $parser->getAllLinks();

// Extract one record per matching product container; each field is described
// by an XPath expression written relative to that container (leading ".//").
$products = $parser->extractByXPath(
    '//div[@class="product"]',
    [
        'name' => './/h3/text()',
        'price' => './/span[@class="price"]/text()',
        'url' => './/a/@href',
    ]
);

// Parser configured with a 30 (presumably seconds) timeout and a custom User-Agent.
$parser = new PageParser([
    'timeout' => 30,
    'headers' => ['User-Agent' => 'MyBot/1.0'],
]);

// Fetch the page that the calls below operate on.
$parser->loadPage('https://example.com');

// XPath extraction: fields are given as expressions relative to each <article>.
$data = $parser->extractByXPath(
    '//article',
    [
        'title' => './/h2/text()',
        'content' => './/p/text()',
    ]
);

// CSS-selector extraction: request the "href" attribute and the "_text" field
// of every matching anchor.
$links = $parser->extractByCss(
    'a.article-link',
    ['href', '_text']
);

// Report when the page contains a pagination element.
if ($parser->exists('.pagination')) {
    echo 'Pagination found!';
}

// Number of elements matching the "img" selector.
$imageCount = $parser->count('img');

use CoderDen\PageParser\ParserHelper;

// One-call shortcuts: each static helper takes the URL directly, so no
// PageParser instance has to be created by the caller.

// Links of the page.
$links = ParserHelper::extractLinks('https://example.com');

// Title of the page.
$title = ParserHelper::getTitle('https://example.com');

// Structured extraction: URL, container XPath, then the per-field XPath map.
$productFields = [
    'name' => './/h3/text()',
    'price' => './/span[@class="price"]/text()',
];
$products = ParserHelper::extract(
    'https://example.com/products',
    '//div[@class="product-item"]',
    $productFields
);

// Availability check for a URL.
if (ParserHelper::checkUrl('https://example.com')) {
    echo 'URL is accessible';
}

// Load the page whose metadata is inspected below.
$parser = new PageParser();
$parser->loadPage('https://example.com');

// Meta tags of the page.
$metaTags = $parser->getMetaTags();

// Canonical URL of the page.
$canonical = $parser->getCanonicalUrl();

// Character set of the page.
$charset = $parser->getCharset();

// Open Graph data: both values are read from the "content" attribute of the
// matching <meta property="og:..."> element, so share one small reader.
$readContent = static fn (string $selector) => $parser->getAttribute($selector, 'content');
$ogTitle = $readContent('meta[property="og:title"]');
$ogImage = $readContent('meta[property="og:image"]');

// Load a page from a sub-path of the site.
$parser = new PageParser();
$parser->loadPage('https://example.com/blog');

// Anchors that carry an href attribute. Per the library's documentation the
// returned links are resolved to absolute URLs.
$links = $parser->extractLinksByXPath(
    '//a[@href]'
);

// Images that carry a src attribute. Per the library's documentation relative
// image paths are likewise returned as absolute URLs.
$images = $parser->extractImagesByXPath(
    '//img[@src]'
);

// Load the page that is searched with regular expressions below.
$parser = new PageParser();
$parser->loadPage('https://example.com');

// Search for email addresses
$emails = $parser->searchByRegex('/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/');

// Search for phone numbers.
// A candidate may start with "+" and/or "(", but its first and last significant
// characters must be digits. Without that anchor the character class alone
// (digits, whitespace, "-", "(", ")") would also match runs of seven or more
// spaces, newlines or dashes, which are very common in HTML source.
// The minimum length is still 7 characters: digit + 5 or more + digit.
$phones = $parser->searchByRegex('/\+?\(?\d[\d\s\-()]{5,}\d/');

// Custom request headers sent by the parser.
$headers = [
    'User-Agent' => 'MyCrawler/1.0',
    'Accept' => 'text/html,application/xhtml+xml',
    'Accept-Language' => 'en-US,en;q=0.9',
    'Referer' => 'https://google.com',
];

// Complete option set handed to the parser.
$options = [
    // HTTP client options
    'timeout' => 30,
    'connect_timeout' => 10,
    // SSL verification
    'verify' => true,
    'allow_redirects' => true,

    // Custom headers (defined above)
    'headers' => $headers,

    // Proxy support
    'proxy' => 'http://proxy.example.com:8080',

    // Authentication: user name followed by password
    'auth' => ['username', 'password'],

    // Cookies
    'cookies' => true,
];

$parser = new PageParser($options);

use CoderDen\PageParser\PageParser;

// Catch clauses are tried in order, so the narrower \RuntimeException
// (reported here as a page-load failure) is listed before the general
// \Exception fallback.
try {
    $parser = new PageParser();
    $parser->loadPage('https://example.com');

    // Your parsing logic here
} catch (\RuntimeException $loadError) {
    echo "Failed to load page: ", $loadError->getMessage();
} catch (\Exception $error) {
    echo "General error: ", $error->getMessage();
}

// Load the product listing page.
$parser = new PageParser();
$parser->loadPage('https://example.com/products');

// Fields to read from every product container, as XPath expressions relative
// to that container.
$productFields = [
    'name' => './/h3/text()',
    'price' => './/span[@class="price"]/text()',
    'sku' => './/span[@class="sku"]/text()',
    'image' => './/img/@src',
    'url' => './/a/@href',
];

// contains(@class, "product") matches any element whose class attribute
// contains the substring "product".
$products = $parser->extractByXPath('//div[contains(@class, "product")]', $productFields);

// Print a short report per product, separated by a "---" line.
foreach ($products as $product) {
    echo "Product: {$product['name']}\n",
        "Price: {$product['price']}\n",
        "Image: {$product['image']}\n",
        "---\n";
}

// Extract the parts of an article; every field is an XPath expression relative
// to the <article> element.
$articleData = ParserHelper::extract(
    'https://example.com/article',
    '//article',
    [
        'title' => './/h1/text()',
        'author' => './/span[@class="author"]/text()',
        'date' => './/time/@datetime',
        'content' => './/div[@class="content"]//p//text()',
        'tags' => './/a[@rel="tag"]//text()',
    ]
);

// Process article content.
// $content is always defined after this point: it falls back to an empty
// string when no article was found or the article has no content, instead of
// being left unset for the code that follows.
$rawContent = $articleData[0]['content'] ?? '';

// The content may come back as several text nodes; join them with one node
// per line. A single value is used as-is.
$content = is_array($rawContent)
    ? implode("\n", $rawContent)
    : $rawContent;

// Pages to process.
$urls = [
    'https://example.com/page1',
    'https://example.com/page2',
    'https://example.com/page3',
];

// Map each URL to its first //h1 result, to 'No title' when nothing matched,
// or to an error message when extraction failed.
$allData = [];
foreach ($urls as $url) {
    try {
        $data = ParserHelper::extract($url, '//h1', ['_text']);
        $allData[$url] = $data[0] ?? 'No title';
    } catch (\Exception $e) {
        // Deliberately best-effort: one failing URL must not abort the batch.
        $allData[$url] = "Error: " . $e->getMessage();
    }
}

// Save results.
// - JSON_THROW_ON_ERROR: fail loudly instead of getting `false` back and
//   silently writing an empty file.
// - JSON_INVALID_UTF8_SUBSTITUTE: scraped text or error messages with broken
//   encoding must not make the whole export fail.
// - JSON_UNESCAPED_SLASHES / JSON_UNESCAPED_UNICODE: keep URL keys and
//   non-ASCII titles readable in the file (it decodes to the same data).
$json = json_encode(
    $allData,
    JSON_PRETTY_PRINT
        | JSON_UNESCAPED_SLASHES
        | JSON_UNESCAPED_UNICODE
        | JSON_INVALID_UTF8_SUBSTITUTE
        | JSON_THROW_ON_ERROR
);

// file_put_contents() returns false on failure; surface that instead of
// ignoring it.
if (file_put_contents('results.json', $json) === false) {
    throw new \RuntimeException('Unable to write results.json');
}

/**
 * Captures a point-in-time snapshot of a page and of one group of its elements.
 *
 * Despite the method name, this class does not compare anything itself: it
 * returns the current state, and the caller is expected to diff successive
 * snapshots to detect changes.
 */
class WebsiteMonitor
{
    /** Parser used to load and query the monitored pages. */
    private PageParser $parser;

    /**
     * @param PageParser|null $parser Parser to use, e.g. one configured with custom
     *                                timeouts, headers or a proxy. When omitted, a
     *                                parser without options is created.
     */
    public function __construct(?PageParser $parser = null)
    {
        $this->parser = $parser ?? new PageParser();
    }

    /**
     * Loads the page and reports its current state.
     *
     * @param string $url             Page to load.
     * @param string $elementSelector Selector of the elements to watch; passed
     *                                unchanged to the parser's count() and exists().
     *
     * @return array Snapshot with the keys "title", "element_count",
     *               "element_exists" and "status_code" (all as returned by the
     *               parser) plus "timestamp", the capture time formatted as
     *               "Y-m-d H:i:s" in PHP's default timezone.
     */
    public function checkForChanges(string $url, string $elementSelector): array
    {
        // Every value below is read from the page loaded here.
        $this->parser->loadPage($url);

        return [
            'title' => $this->parser->getTitle(),
            'element_count' => $this->parser->count($elementSelector),
            'element_exists' => $this->parser->exists($elementSelector),
            'status_code' => $this->parser->getStatusCode(),
            'timestamp' => date('Y-m-d H:i:s'),
        ];
    }
}

// Take a snapshot of the ".news-item" elements on the example site.
$monitor = new WebsiteMonitor();
$changes = $monitor->checkForChanges(
    'https://example.com',
    '.news-item'
);