PHP download

PHP code example of content-extract / content-processor

1. Go to this page and download the library: Download content-extract/content-processor library. Choose the download type you require.

2. Extract the ZIP file and open the index.php.

3. Add this code to the index.php.

    
        
<?php
// Resolve the Composer autoloader relative to this file so the include does
// not depend on the current working directory or include_path.
require_once __DIR__ . '/vendor/autoload.php';

/* Start to develop here. Best regards https://php-download.com/ */

content-extract / content-processor example snippets


// Quick start: configure the pipeline fluently and run it over a directory.
// $schema must already be defined (e.g. an ArraySchema instance).
$result = ContentProcessor::make()
    ->withSchema($schema)
    ->withExtractor(new PdfTextExtractor())
    ->withStructurer(new SchemaAwareStructurer())
    ->fromDirectory('/documents')
    ->processFinal();  // Returns FinalResult with clean API


use ContentProcessor\Schemas\ArraySchema;

// Declare the fields to extract; each top-level key configures one field.
$schema = new ArraySchema([
    'name' => [
        'type' => 'string',
        'required' => false,
        // NOTE(review): these aliases read like they belong to an "experience"
        // field rather than 'name' — confirm against the upstream documentation.
        'aliases' => ['years of experience', 'experience'],
    ],
]);


use ContentProcessor\Core\ContentProcessor;
use ContentProcessor\Extractors\PdfTextExtractor;
use ContentProcessor\Structurers\SchemaAwareStructurer;

// Build the pipeline: schema, extractor and structurer are supplied up front,
// then the input directory is chosen and processFinal() yields the result object.
$result = ContentProcessor::make()
    ->withSchema($schema)
    ->withExtractor(new PdfTextExtractor())
    ->withStructurer(new SchemaAwareStructurer())
    // Second argument looks like a filename glob limiting input to PDFs — confirm.
    ->fromDirectory('/path/to/documents', '*.pdf')
    ->processFinal();


// Report per-document failures. hasErrors() is used rather than
// !isSuccessful(): isSuccessful() is already true when at least one document
// succeeded, so it would hide the errors of a partially failed batch.
if ($result->hasErrors()) {
    echo "Some documents failed:\n";
    foreach ($result->errors() as $error) {
        echo "  - " . $error->getMessage() . "\n";
    }
}

// Process successful data: each entry exposes the source under 'document'
// and the structured fields under 'data'.
foreach ($result->data() as $item) {
    echo "Processed: " . $item['document'] . "\n";
    // $item['data'] contains the structured data
    var_dump($item['data']);
}

// Inspect quality warnings: each warning names the affected field and
// carries a message, both of which are printed here.
if ($result->hasWarnings()) {
    foreach ($result->warnings() as $warning) {
        echo "⚠️ Field '{$warning->getField()}': {$warning->getMessage()}\n";
    }
}

// Export to JSON: toJSONPretty() returns the formatted string, toJSON() the compact one.
echo $result->toJSONPretty();


/**
 * Contract for components that extract content from a source.
 */
interface ExtractorInterface {
    /**
     * Extract content from the given source.
     *
     * @param string $source Source identifier — presumably a file path; confirm against implementations.
     * @return array Extracted content; the array shape is not specified by this signature.
     */
    public function extract(string $source): array;
    /**
     * Report whether this extractor supports the given source.
     *
     * @param string $source Same kind of identifier as passed to extract().
     */
    public function canHandle(string $source): bool;
    /** Return the extractor's name. */
    public function getName(): string;
}


/**
 * Contract for components that turn extracted content into structured data
 * according to a schema.
 */
interface StructurerInterface {
    /**
     * Structure the given content against a schema.
     *
     * @param array $content Content to structure — presumably the array returned by an extractor; confirm.
     * @param SchemaInterface $schema Schema guiding the structuring.
     * @return array Structured result; the array shape is not specified by this signature.
     */
    public function structure(array $content, SchemaInterface $schema): array;
    /** Return the structurer's name. */
    public function getName(): string;
}


/**
 * Contract for schema objects that describe and validate structured data.
 */
interface SchemaInterface {
    /** Return the schema definition as an array. */
    public function getDefinition(): array;
    /**
     * Validate the given data against this schema.
     *
     * @param array $data Data to validate.
     * @return array Validation outcome — presumably a list of problems found; confirm against implementations.
     */
    public function validate(array $data): array;
    /** Return the schema's name. */
    public function getName(): string;
}


// Tune processing behaviour through option flags.
$processor->withOptions([
    'skip_invalid' => true,    // true: skip documents that fail validation
    'preserve_empty' => false, // false: presumably drops empty fields from the result (true would preserve them) — confirm
]);


use ContentProcessor\Extractors\CompositePdfExtractor;

// CompositePdfExtractor (listed as v1.5.0+ in the source tree) automatically
// tries digital text extraction first, then OCR if needed.
$result = ContentProcessor::make()
    ->withSchema($schema)
    ->withExtractor(new CompositePdfExtractor())  // Tries PDF text first, then OCR
    ->withStructurer(new SchemaAwareStructurer())
    ->fromDirectory('/documents')
    ->processFinal();


// API summary. The `...` below stands for the builder calls (withSchema(),
// withExtractor(), withStructurer(), fromDirectory()); it is not literal PHP.
$result = ContentProcessor::make()->...->processFinal();

// Access data
$result->data();           // Array of successful documents
$result->errors();         // Array of normalized errors
$result->warnings();       // Array of semantic warnings
$result->summary();        // Summary with statistics

// Status checks
$result->isSuccessful();   // bool - true when at least 1 document succeeded
$result->isPerfect();      // bool - true when there are no errors and no warnings
$result->hasErrors();      // bool
$result->hasWarnings();    // bool

// Filtering
$result->errorsByType('validation');
$result->warningsByField('email');
$result->warningsByCategory('missing_value');

// Serialization
$result->toArray();        // array
$result->toJSON();         // string (compact)
$result->toJSONPretty();   // string (formatted)
$result->fullResults();    // array (complete audit trail)


src/
├── Contracts/              # Interfaces defining the contract
│   ├── ExtractorInterface.php
│   ├── StructurerInterface.php
│   └── SchemaInterface.php
├── Core/                   # Main classes
│   └── ContentProcessor.php
├── Extractors/             # Extractor implementations
│   ├── PdfTextExtractor.php
│   ├── TextFileExtractor.php
│   ├── PdfOcrExtractor.php (v1.5.0+)
│   └── CompositePdfExtractor.php (v1.5.0+)
├── Schemas/                # Schema implementations
│   └── ArraySchema.php
├── Structurers/            # Structurer implementations
│   ├── SimpleLineStructurer.php
│   ├── RuleBasedStructurer.php
│   └── SchemaAwareStructurer.php
├── Utils/                  # Utilities
│   ├── TextNormalizer.php
│   └── TextSegmenter.php
└── Models/                 # Domain models
    ├── Warning.php
    ├── Error.php
    └── FinalResult.php

examples/
├── example_basic.php
├── example_semantic_structuring.php
└── sample_cv_*.txt

Run the examples (bash):
cd examples
php example_basic.php
php example_semantic_structuring.php