PHP code example of helgesverre / extractor

1. Go to this page and download the library: Download helgesverre/extractor library. Choose the download type require.

2. Extract the ZIP file and open the index.php.

3. Add this code to the index.php.
    
        
<?php
require_once('vendor/autoload.php');

/* Start to develop here. Best regards https://php-download.com/ */

    

helgesverre / extractor example snippets




use HelgeSverre\Extractor\Facades\Extractor;
use HelgeSverre\Extractor\Facades\Text;
use Illuminate\Support\Facades\Storage;

$image = Storage::get("restaurant_menu.png")

// Extract text from images
$textFromImage = Text::textract($image);

// Extract structured data from plain text
$menu = Extractor::fields($textFromImage,
    fields: [
        'restaurantName',
        'phoneNumber',
        'dishes' => [
            'name' => 'name of the dish',
            'description' => 'description of the dish',
            'price' => 'price of the dish as a number',
        ],
    ],
    model: "gpt-3.5-turbo-1106",
    maxTokens: 4000,
);

use HelgeSverre\Extractor\Facades\Text;

$textPlainText = Text::text(file_get_contents('./data.txt'));
$textPdf = Text::pdf(file_get_contents('./data.pdf'));
$textImageOcr = Text::textract(file_get_contents('./data.jpg'));
$textPdfOcr = Text::textractUsingS3Upload(file_get_contents('./data.pdf'));
$textWord = Text::word(file_get_contents('./data.doc'));
$textWeb = Text::web('https://example.com');
$textHtml = Text::html(file_get_contents('./data.html'));

$sample = Text::pdf(file_get_contents(__DIR__.'/../samples/helge-cv.pdf'));

$data = Extractor::fields($sample,
    fields: [
        'name' => 'the name of the candidate',
        'email',
        'certifications' => 'list of certifications, if any',
        'workHistory' => [
            'companyName',
            'from' => 'Y-m-d if available, Year only if not, null if missing',
            'to' => 'Y-m-d if available, Year only if not, null if missing',
            'text',
        ],
    ],
    model: Engine::GPT_3_TURBO_1106,
);

use HelgeSverre\Extractor\Text\ImageContent;

$imagePath = __DIR__ . '/../samples/sample-image.jpg';
$imageContent = ImageContent::file($imagePath);

use HelgeSverre\Extractor\Text\ImageContent;

$rawImageData = file_get_contents(__DIR__ . '/../samples/sample-image.jpg');
$imageContent = ImageContent::raw($rawImageData);

use HelgeSverre\Extractor\Text\ImageContent;

$imageUrl = 'https://example.com/sample-image.jpg';
$imageContent = ImageContent::url($imageUrl);

use HelgeSverre\Extractor\Facades\Extractor;
use HelgeSverre\Extractor\Text\ImageContent;

$imageContent = ImageContent::file(__DIR__ . '/../samples/product-catalog.jpg');

$data = Extractor::fields(
    $imageContent,
    fields: [
        'productName',
        'price',
        'description',
    ],
    model: Engine::GPT_4_VISION,
);



namespace App\Extractors;

use HelgeSverre\Extractor\Extraction\Extractor;use HelgeSverre\Extractor\Text\TextContent;

class JobPostingExtractor extends Extractor
{
    public function prompt(string|TextContent $input): string
    {
        $outputKey = $this->expectedOutputKey();

        return "Extract the following fields from the job posting below:"
            . "\n- jobTitle: The title or designation of the job."
            . "\n- companyName: The name of the company or organization posting the job."
            . "\n- location: The geographical location or workplace where the job is based."
            . "\n- jobType: The nature of employment (e.g., Full-time, Part-time, Contract)."
            . "\n- description: A brief summary or detailed description of the job."
            . "\n- applicationDeadline: The closing date for applications, if specified."
            . "\n\nThe output should be a JSON object under the key '{$outputKey}'."
            . "\n\nINPUT STARTS HERE\n\n$input\n\nOUTPUT IN JSON:\n";
    }

    public function expectedOutputKey(): string
    {
        return 'extractedData';
    }
}

use HelgeSverre\Extractor\Extractor;

Extractor::extend("job-posting", fn() => new JobPostingExtractor());

use HelgeSverre\Extractor\Facades\Text;
use HelgeSverre\Extractor\Extractor;

$jobPostingContent = Text::web("https://www.finn.no/job/fulltime/ad.html?finnkode=329443482");

$extractedData = Extractor::extract('job-posting', $jobPostingContent);
// Or you can specify the class-string instead
// ex: Extractor::extract(JobPostingExtractor::class, $jobPostingContent);

// $extractedData now contains structured information from the job posting



namespace App\Extractors;

use HelgeSverre\Extractor\Extraction\Concerns\HasValidation;
use HelgeSverre\Extractor\Extraction\Extractor;

class JobPostingExtractor extends Extractor
{
    use HasValidation;

    public function rules(): array
    {
        return [
            'jobTitle' => ['ed', 'date']
        ];
    }
}



namespace App\Extractors;

use DateTime;
use App\Extractors\JobPostingDto;
use HelgeSverre\Extractor\Extraction\Concerns\HasDto;
use HelgeSverre\Extractor\Extraction\Extractor;
use Spatie\LaravelData\Data;

class JobPostingDto extends Data
{
    public function __construct(
        public string $jobTitle,
        public string $companyName,
        public string $location,
        public string $jobType,
        public int|float $salary,
        public string $description,
        public DateTime $applicationDeadline
    ) {
    }
}

class JobPostingExtractor extends Extractor
{
    use HasDto;

    public function dataClass(): string
    {
        return JobPostingDto::class;
    }

    public function isCollection(): bool
    {
        return false; 
    }
}

'textract' => [
    'driver' => 's3',
    'key' => env('TEXTRACT_KEY'),
    'secret' => env('TEXTRACT_SECRET'),
    'region' => env('TEXTRACT_REGION'),
    'bucket' => env('TEXTRACT_BUCKET'),
],

return [
    "textract_disk" => env("TEXTRACT_DISK")
];

// Delete the file from the S3 bucket
TextractUsingS3Upload::cleanupFileUsing(function (string $filePath) {
    Storage::disk('textract')->delete($filePath);
}
bash
php artisan vendor:publish --tag="extractor-config"
shell
php artisan vendor:publish --provider="OpenAI\Laravel\ServiceProvider"