PHP code examples for cloudstudio / laravel-html-crawler

1. Go to the download page for cloudstudio/laravel-html-crawler and download the library. Choose the download type "require".

2. Extract the ZIP file and open index.php.

3. Add this code to index.php.
    
        
<?php
require_once('vendor/autoload.php');

/* Start to develop here. Best regards https://php-download.com/ */

    

cloudstudio / laravel-html-crawler example snippets


use CloudStudio\HtmlCrawler\Facades\HtmlCrawler;

$html = '<div><p>Hello <strong>World</strong></p></div>';
$cleanHtml = HtmlCrawler::fromHtml($html)->clean();

// Expected output: "Hello World"

use CloudStudio\HtmlCrawler\Facades\HtmlCrawler;

$html = '<div><p>Hello <a href="#">World</a></p></div>';
$cleanHtml = HtmlCrawler::fromHtml($html)
    ->setAllowedTags(['p', 'a'])
    ->clean();

// Expected output: '<p>Hello <a href="#">World</a></p>'

use CloudStudio\HtmlCrawler\Facades\HtmlCrawler;

$html = '<div><p>Hello <a href="#">World</a></p></div>';
$cleanHtml = HtmlCrawler::fromHtml($html)
    ->keepParagraphs()   // Preserves <p> tags
    ->keepLinks()        // Preserves <a> tags
    ->clean();

// Expected output: '<p>Hello <a href="#">World</a></p>'

use CloudStudio\HtmlCrawler\Facades\HtmlCrawler;

$html = '<div><script>alert("x")</script><p>Test</p></div>';
$cleanHtml = HtmlCrawler::fromHtml($html)->clean();

// Expected output: "Test"

use CloudStudio\HtmlCrawler\Facades\HtmlCrawler;

$html = '<div><script>alert("x")</script><p>Test</p></div>';
$cleanHtml = HtmlCrawler::fromHtml($html)
    ->keepScripts()
    ->clean();

// Expected output: '<script>alert("x")</script><p>Test</p>'

use CloudStudio\HtmlCrawler\Facades\HtmlCrawler;

$html = '<div><style>.text { color: red; }</style><p>Styled text</p></div>';
$cleanHtml = HtmlCrawler::fromHtml($html)
    ->keepCss()
    ->clean();

// Expected output: '<style>.text { color: red; }</style><p>Styled text</p>'

use CloudStudio\HtmlCrawler\Facades\HtmlCrawler;

$html = '<div><span class="remove">Remove me</span><p>Keep me</p></div>';
$pattern = '/<span class="remove">.*?<\/span>/';
$cleanHtml = HtmlCrawler::fromHtml($html)
    ->useCustomPattern($pattern)
    ->clean();

// Expected output: '<p>Keep me</p>'

use CloudStudio\HtmlCrawler\Facades\HtmlCrawler;

$html = '<h1>Title</h1><p>Paragraph text</p>';
$markdown = HtmlCrawler::fromHtml($html)
    ->withMarkdown()
    ->clean();


use CloudStudio\HtmlCrawler\Facades\HtmlCrawler;

$html = "Line 1\nLine 2";
$cleanHtml = HtmlCrawler::fromHtml($html)
    ->preserveNewlines(false)  // Set to false to replace newlines with spaces
    ->clean();

// Expected output: "Line 1 Line 2"

use CloudStudio\HtmlCrawler\Facades\HtmlCrawler;

$cleanHtml = HtmlCrawler::fromUrl('https://example.com')
    ->clean();

// Output: the cleaned HTML content retrieved from the URL.

return [
    'preserve_newlines'   => true,
    'allowed_tags'        => [],
    'convert_to_markdown' => false,
    'remove_scripts'      => true,
    'remove_styles'       => true,
];
Publish the package's publishable files with Artisan (run in a shell):
php artisan vendor:publish --provider="CloudStudio\HtmlCrawler\HtmlCrawlerServiceProvider"