PHP code example for the webcrawlerapi/sdk library

1. Go to the download page and download the webcrawlerapi/sdk library, choosing the "require" download type.

2. Extract the ZIP file and open index.php.

3. Add this code to the index.php.
    
        
<?php
// Pull in Composer's autoloader so the SDK classes resolve automatically.
require_once 'vendor/autoload.php';

/* Start to develop here. Best regards https://php-download.com/ */

    

webcrawlerapi/sdk example snippets


use WebCrawlerAPI\WebCrawlerAPI;

// Create an API client authenticated with your personal API key.
$client = new WebCrawlerAPI('your_api_key');

// Blocking crawl: polls the job until it completes (or maxPolls is exhausted).
$job = $client->crawl(
    url: 'https://example.com',
    scrapeType: 'markdown',
    itemsLimit: 10,
    webhookUrl: 'https://yourserver.com/webhook',
    allowSubdomains: false,
    maxPolls: 100,  // Optional: upper bound on status checks
);
echo 'Job completed with status: ' . $job->status . "\n";

// Walk every crawled page and inspect its scraped content.
foreach ($job->jobItems as $page) {
    echo 'Page title: ' . $page->title . "\n";
    echo 'Original URL: ' . $page->originalUrl . "\n";
    echo 'Item status: ' . $page->status . "\n";

    // getContent() returns the body for the job's scrape_type,
    // or null while the item has not reached "done" status.
    $body = $page->getContent();
    if ($body) {
        echo 'Content length: ' . strlen($body) . "\n";
        echo 'Content preview: ' . substr($body, 0, 200) . "...\n";
    } else {
        echo "Content not available or item not done\n";
    }
}

// Every item also links back to its parent job object.
foreach ($job->jobItems as $page) {
    echo 'Item URL: ' . $page->originalUrl . "\n";
    echo 'Parent job status: ' . $page->job->status . "\n";
    echo 'Parent job URL: ' . $page->job->url . "\n";
}

// Non-blocking variant: queue the crawl and return immediately.
$asyncResponse = $client->crawlAsync(
    url: 'https://example.com',
    scrapeType: 'markdown',
    itemsLimit: 10,
    webhookUrl: 'https://yourserver.com/webhook',
    allowSubdomains: false,
);

// The response carries the identifier of the queued job.
$jobId = $asyncResponse->id;
echo 'Crawling job started with ID: ' . $jobId . "\n";

// Fetch the job later to check progress and collect results.
$job = $client->getJob($jobId);
echo 'Job status: ' . $job->status . "\n";

// Inspect top-level job metadata.
echo 'Crawled URL: ' . $job->url . "\n";
echo 'Created at: ' . $job->createdAt->format('Y-m-d H:i:s') . "\n";
echo 'Number of items: ' . count($job->jobItems) . "\n";

// A running job can be cancelled by its ID.
$cancelResponse = $client->cancelJob($jobId);
echo 'Cancellation response: ' . json_encode($cancelResponse) . "\n";