PHP code example of content-extract / content-processor
2. Go to this page and download the library: Download content-extract/content-processor library. Choose the download type you require.
2. Extract the ZIP file and open the index.php.
3. Add this code to the index.php.
<?php
// Load Composer's autoloader relative to this file, so the include does not
// depend on the current working directory or the configured include_path.
// Assumes vendor/ sits next to index.php, as in the extracted ZIP — confirm.
require_once __DIR__ . '/vendor/autoload.php';

/* Start to develop here. Best regards https://php-download.com/ */
content-extract / content-processor example snippets
// Quick start: configure the pipeline first, then run it.
// $schema is expected to be defined beforehand.
$pipeline = ContentProcessor::make()
    ->withSchema($schema)
    ->withExtractor(new PdfTextExtractor())
    ->withStructurer(new SchemaAwareStructurer())
    ->fromDirectory('/documents');

// Returns FinalResult with clean API
$result = $pipeline->processFinal();
use ContentProcessor\Schemas\ArraySchema;
// Schema definition passed to the processor via withSchema().
// NOTE(review): the aliases below read like they belong to an "experience"
// field rather than 'name' — confirm against the intended schema.
$schema = new ArraySchema([
    'name' => [
        'type' => 'string',
        // Key was previously misspelled as 'equired'.
        'required' => false,
        'aliases' => ['years of experience', 'experience'],
    ],
]);
use ContentProcessor\Core\ContentProcessor;
use ContentProcessor\Extractors\PdfTextExtractor;
use ContentProcessor\Structurers\SchemaAwareStructurer;
// Basic usage: configure the pipeline for the documents matching '*.pdf'
// under the given directory, then run it.
$pipeline = ContentProcessor::make()
    ->withSchema($schema)
    ->withExtractor(new PdfTextExtractor())
    ->withStructurer(new SchemaAwareStructurer())
    ->fromDirectory('/path/to/documents', '*.pdf');

$result = $pipeline->processFinal();
// Check status.
// hasErrors() is used rather than !isSuccessful(): isSuccessful() is
// documented as "at least 1 successful", so negating it would only report
// failures when every document failed and would hide partial failures.
if ($result->hasErrors()) {
    echo "Some documents failed:\n";
    foreach ($result->errors() as $error) {
        // One line per error.
        echo " - " . $error->getMessage() . "\n";
    }
}
// Walk over every successfully processed document.
foreach ($result->data() as $entry) {
    echo "Processed: {$entry['document']}\n";

    // The structured data lives under the 'data' key.
    var_dump($entry['data']);
}
// Print quality warnings, one line per warning, when any are present.
if ($result->hasWarnings()) {
    foreach ($result->warnings() as $notice) {
        $field = $notice->getField();
        $message = $notice->getMessage();

        echo "⚠️ Field '{$field}': {$message}\n";
    }
}
// Emit the formatted JSON representation of the result.
$json = $result->toJSONPretty();
echo $json;
/**
 * Contract for extractors plugged into the processor via withExtractor().
 */
interface ExtractorInterface
{
    /**
     * Extract content from the given source.
     *
     * @param string $source Source identifier (presumably a file path — confirm).
     *
     * @return array Extracted content; the exact shape is not defined here.
     */
    public function extract(string $source): array;

    /**
     * Tell whether this extractor can handle the given source.
     */
    public function canHandle(string $source): bool;

    /**
     * Name of this extractor.
     */
    public function getName(): string;
}
/**
 * Contract for structurers plugged into the processor via withStructurer().
 */
interface StructurerInterface
{
    /**
     * Structure content according to a schema.
     *
     * @param array           $content Content to structure (presumably extractor output — confirm).
     * @param SchemaInterface $schema  Schema to structure the content against.
     *
     * @return array Structured data; the exact shape is not defined here.
     */
    public function structure(array $content, SchemaInterface $schema): array;

    /**
     * Name of this structurer.
     */
    public function getName(): string;
}
/**
 * Contract for schemas plugged into the processor via withSchema().
 */
interface SchemaInterface
{
    /**
     * Return the schema definition as an array.
     */
    public function getDefinition(): array;

    /**
     * Validate data against this schema.
     *
     * @param array $data Data to validate.
     *
     * @return array Validation outcome; the exact shape is not defined here.
     */
    public function validate(array $data): array;

    /**
     * Name of this schema.
     */
    public function getName(): string;
}
// Processing options, collected first and then passed to the processor.
$options = [
    // Skip documents that fail validation
    'skip_invalid' => true,
    // Preserve empty fields in result
    'preserve_empty' => false,
];

$processor->withOptions($options);
use ContentProcessor\Extractors\CompositePdfExtractor;
// CompositePdfExtractor automatically tries digital (PDF text) extraction
// first, then falls back to OCR if needed.
$compositeExtractor = new CompositePdfExtractor();

$result = ContentProcessor::make()
    ->withSchema($schema)
    ->withExtractor($compositeExtractor)
    ->withStructurer(new SchemaAwareStructurer())
    ->fromDirectory('/documents')
    ->processFinal();
// Overview of the FinalResult API returned by processFinal().
// The original placeholder chain (`->...->`) was not valid PHP; the builder
// configuration is elided with a comment instead.
$result = ContentProcessor::make()
    // ... configure schema, extractor, structurer and source here ...
    ->processFinal();

// Access data
$result->data();      // Array of successful documents
$result->errors();    // Array of normalized errors
$result->warnings();  // Array of semantic warnings
$result->summary();   // Summary with statistics

// Status checks
$result->isSuccessful();  // bool - At least 1 successful?
$result->isPerfect();     // bool - No errors or warnings?
$result->hasErrors();     // bool
$result->hasWarnings();   // bool

// Filtering
$result->errorsByType('validation');
$result->warningsByField('email');
$result->warningsByCategory('missing_value');

// Serialization
$result->toArray();       // array
$result->toJSON();        // string (compact)
$result->toJSONPretty();  // string (formatted)
$result->fullResults();   // array (complete audit trail)